Using the following code, I can download the HTML of a file from the internet:
WebClient wc = new WebClient();
// ....
string downloadedFile = wc.DownloadS
Try this
string downloadedFile = wc.DownloadString("");
I always remove the trailing slash, and so far it has worked like a charm, but it could also be a hazard.
Here's a download wrapper class which supports gzip and deflate compression and checks the encoding header and the HTML meta tags in order to decode the content correctly.
Instantiate the class and call GetPage():
/// <summary>
/// Downloads a single page over HTTP and returns it as a decoded string.
/// Advertises gzip/deflate support, decompresses the body accordingly, and picks the
/// text encoding from (in order of application) the <see cref="Encoding"/> default,
/// the charset announced in the HTTP response, and a charset found in an HTML
/// <c>&lt;meta&gt;</c> tag.
/// </summary>
public class HttpDownloader
{
    // Extracts the charset parameter from a Content-Type header value.
    private static readonly Regex HeaderCharsetRegex = new Regex(
        @";\s*charset\s*=\s*(?<charset>.*)",
        RegexOptions.IgnoreCase);

    // Finds a charset declaration inside a <meta ...> tag, with or without double quotes.
    private static readonly Regex MetaCharsetRegex = new Regex(
        @"<meta\s+.*?charset\s*=\s*""?(?<charset>[A-Za-z0-9_-]+)""?",
        RegexOptions.Singleline | RegexOptions.IgnoreCase);

    private readonly string _referer;
    private readonly string _userAgent;

    /// <summary>
    /// Encoding used to decode the body. Starts as ISO-8859-1 and is replaced by the
    /// charset announced in the HTTP response when that charset is recognised.
    /// </summary>
    public Encoding Encoding { get; set; }

    /// <summary>Headers of the most recent response; set by <see cref="GetPage"/>.</summary>
    public WebHeaderCollection Headers { get; set; }

    /// <summary>
    /// Address to download. After <see cref="GetPage"/> it holds the URI that actually
    /// answered the request (<c>HttpWebResponse.ResponseUri</c>).
    /// </summary>
    public Uri Url { get; set; }

    /// <summary>Creates a downloader for <paramref name="url"/>.</summary>
    /// <param name="url">Address of the page; passed to the <see cref="Uri"/> constructor, which validates it.</param>
    /// <param name="referer">Value for the Referer header; skipped when null or empty.</param>
    /// <param name="userAgent">Value for the User-Agent header; skipped when null or empty.</param>
    public HttpDownloader(string url, string referer, string userAgent)
    {
        Encoding = Encoding.GetEncoding("ISO-8859-1");
        Url = new Uri(url); // verify the uri
        _userAgent = userAgent;
        _referer = referer;
    }

    /// <summary>
    /// Requests <see cref="Url"/>, records the response headers and final URI, and
    /// returns the decoded, trimmed body.
    /// </summary>
    /// <returns>The page content as a string.</returns>
    public string GetPage()
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        if (!string.IsNullOrEmpty(_referer))
        {
            request.Referer = _referer;
        }
        if (!string.IsNullOrEmpty(_userAgent))
        {
            request.UserAgent = _userAgent;
        }
        request.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip,deflate");

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            Headers = response.Headers;
            Url = response.ResponseUri;
            return ProcessContent(response);
        }
    }

    /// <summary>
    /// Buffers the (decompressed) response body in memory and decodes it to text.
    /// </summary>
    private string ProcessContent(HttpWebResponse response)
    {
        // Apply the charset from the HTTP response before the first decode; without this
        // call the header is never consulted and only the default/meta charset is used.
        SetEncodingFromHeader(response);

        using (MemoryStream memStream = new MemoryStream())
        {
            // Disposing the wrapper also disposes the underlying network stream.
            using (Stream body = WrapDecompression(response.GetResponseStream(), response.ContentEncoding))
            {
                body.CopyTo(memStream);
            }

            memStream.Position = 0;
            using (StreamReader reader = new StreamReader(memStream, Encoding))
            {
                string html = reader.ReadToEnd().Trim();
                // Must run while the reader (and therefore memStream) is still open,
                // because a differing meta charset re-reads the buffered bytes.
                return CheckMetaCharSetAndReEncode(memStream, html);
            }
        }
    }

    /// <summary>
    /// Wraps <paramref name="raw"/> in a decompressing stream when the Content-Encoding
    /// value mentions gzip or deflate; otherwise returns it unchanged.
    /// </summary>
    private static Stream WrapDecompression(Stream raw, string contentEncoding)
    {
        string encodingName = contentEncoding ?? string.Empty;

        // Ordinal, case-insensitive search: a culture-sensitive ToLower() can
        // fail to match "GZIP" under some locales (e.g. Turkish dotless i).
        if (encodingName.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return new GZipStream(raw, CompressionMode.Decompress);
        }
        if (encodingName.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return new DeflateStream(raw, CompressionMode.Decompress);
        }
        return raw;
    }

    /// <summary>
    /// Sets <see cref="Encoding"/> from the response's CharacterSet, or, when that is
    /// empty, from a <c>charset=</c> parameter parsed out of the Content-Type value.
    /// Leaves <see cref="Encoding"/> untouched when no charset is found or it is unknown.
    /// </summary>
    private void SetEncodingFromHeader(HttpWebResponse response)
    {
        string charset = null;
        if (string.IsNullOrEmpty(response.CharacterSet))
        {
            Match m = HeaderCharsetRegex.Match(response.ContentType ?? string.Empty);
            if (m.Success)
            {
                charset = m.Groups["charset"].Value.Trim().Trim(new[] { '\'', '"' });
            }
        }
        else
        {
            charset = response.CharacterSet;
        }

        if (!string.IsNullOrEmpty(charset))
        {
            try
            {
                Encoding = Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                // Unrecognised charset name: deliberately keep the current encoding.
            }
        }
    }

    /// <summary>
    /// Looks for a charset in an HTML meta tag and, when it names an encoding other than
    /// <see cref="Encoding"/>, decodes the buffered bytes again with that encoding.
    /// </summary>
    /// <param name="memStream">Seekable stream holding the raw body bytes; stays open and owned by the caller.</param>
    /// <param name="html">The body as decoded with <see cref="Encoding"/>.</param>
    /// <returns>The re-decoded, trimmed text, or <paramref name="html"/> unchanged.</returns>
    private string CheckMetaCharSetAndReEncode(Stream memStream, string html)
    {
        Match m = MetaCharsetRegex.Match(html);
        if (!m.Success)
        {
            return html;
        }

        string charset = m.Groups["charset"].Value.ToLowerInvariant();
        // NOTE(review): "unicode"/"utf-16" are decoded as UTF-8 - presumably because a
        // meta tag that was readable after a byte-oriented decode cannot really be
        // UTF-16; confirm with the original author's intent.
        if ((charset == "unicode") || (charset == "utf-16"))
        {
            charset = "utf-8";
        }

        try
        {
            Encoding metaEncoding = Encoding.GetEncoding(charset);
            if (Encoding != metaEncoding)
            {
                memStream.Position = 0L;
                // Not disposed on purpose: disposing the reader would close memStream,
                // which belongs to the caller.
                StreamReader recodeReader = new StreamReader(memStream, metaEncoding);
                html = recodeReader.ReadToEnd().Trim();
            }
        }
        catch (ArgumentException)
        {
            // Unrecognised charset name in the meta tag: keep the first decode.
        }

        return html;
    }
}
Since I am not allowed to comment (insufficient reputation), I'll have to post an additional answer. I am using Mikael's great class routinely, but I encountered a practical problem with the regex that tries to find the charset meta-info. This
Match m = new Regex(@"<meta\s+.*?charset\s*=\s*(?<charset>[A-Za-z0-9_-]+)", RegexOptions.Singleline | RegexOptions.IgnoreCase).Match(html);
fails on this
<meta charset="UTF-8"/>
whereas this
Match m = new Regex(@"<meta\s+.*?charset\s*=\s*""?(?<charset>[A-Za-z0-9_-]+)""?", RegexOptions.Singleline | RegexOptions.IgnoreCase).Match(html);
does not.
Thanks, Mikael.