c#抓取网页内容,基本解决乱码问题;适配各类网页

private string GetHtmlCode(string url)
        {
            string htmlCode;
            HttpWebRequest webRequest = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
            webRequest.Timeout = 30000;
            webRequest.Method = "GET";
            webRequest.UserAgent = "Mozilla/4.0";
            webRequest.Headers.Add("Accept-Encoding", "gzip, deflate");
            HttpWebResponse webResponse = (System.Net.HttpWebResponse)webRequest.GetResponse();
            if (webResponse.ContentEncoding.ToLower() == "gzip")//如果使用了GZip则先解压
            {
                using (System.IO.Stream streamReceive = webResponse.GetResponseStream())
                {
                    using (var zipStream =
                        new System.IO.Compression.GZipStream(streamReceive, System.IO.Compression.CompressionMode.Decompress))
                    {
                        Encoding enc = GetEncoding(url);
                        using (StreamReader sr = new System.IO.StreamReader(zipStream, enc))
                        {
                            htmlCode = sr.ReadToEnd();
                        }
                    }
                }
            }
            else
            {
                using (System.IO.Stream streamReceive = webResponse.GetResponseStream())
                {
                    Encoding enc = GetEncoding(url);
                    using (System.IO.StreamReader sr = new System.IO.StreamReader(streamReceive, enc))
                    {
                        htmlCode = sr.ReadToEnd();
                    }
                }
            }

            return htmlCode;
        }
        public Encoding GetEncoding(string strurl)
        {
            string urlToCrawl = strurl;
            //generate http request
            if (urlToCrawl != null && urlToCrawl != "")
            {
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(urlToCrawl);
                //use GET method to get url's html
                req.Method = "GET";
                req.Accept = "*/*";
                req.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
                req.ContentType = "text/xml";
                //use request to get response
                HttpWebResponse resp = (HttpWebResponse)req.GetResponse();
                Encoding enc;
                try
                {
                    if (resp.CharacterSet != "ISO-8859-1")
                        enc = Encoding.GetEncoding(resp.CharacterSet);
                    else
                        enc = Encoding.UTF8;
                }
                catch
                {
                    // *** Invalid encoding passed
                    enc = Encoding.UTF8;
                }
                string sHTML = string.Empty;
                using (StreamReader read = new StreamReader(resp.GetResponseStream(), enc))
                {
                    sHTML = read.ReadToEnd();
                    Match charSetMatch = Regex.Match(sHTML, "charset=(?[a-zA-Z0-9\\-]+)", RegexOptions.IgnoreCase);
                    string sChartSet = charSetMatch.Groups["code"].Value;
                    //if it's not utf-8,we should redecode the html.
                    if (!string.IsNullOrEmpty(sChartSet) && !sChartSet.Equals("utf-8", StringComparison.OrdinalIgnoreCase))
                    {
                        enc = Encoding.GetEncoding(sChartSet);
                    }
                }
                return enc;
            }
            return Encoding.Default;
        }

 使用C#抓取网页时遇到乱码问题,找了各种办法都没有妥善解决的,发现存在gzip压缩的问题;于是乎,在参考CSDN上两位达人的帖子以后,我把代码进行了修正,基本妥善解决页面代码错误问题;欢迎大家使用上面的代码尝试;

 

以下为参考贴:

http://blog.csdn.net/wsc449/article/details/7280646

http://bbs.csdn.net/topics/320213776

 

你可能感兴趣的