Why is there a limit in the concurrent number of downloads?
I am trying to make my own simple web crawler. I want to download files with specific extensions from a URL. I wrote the following code:
private void button1_Click(object sender, RoutedEventArgs e)
{
    if (bw.IsBusy) return;
    // note: this re-subscribes bw_DoWork on every click
    bw.DoWork += new DoWorkEventHandler(bw_DoWork);
    bw.RunWorkerAsync(new string[] { URL.Text, SavePath.Text, Filter.Text });
}
//--------------------------------------------------------------------------------------------
void bw_DoWork(object sender, DoWorkEventArgs e)
{
    try
    {
        ThreadPool.SetMaxThreads(4, 4);
        string[] strs = e.Argument as string[];
        // matches <a ... href="..."> ... </a> tags
        Regex reg = new Regex("<a(\\s*[^>]*?){0,1}\\s*href\\s*\\=\\s*\\\"([^>]*?)\\\"\\s*[^>]*>(.*?)</a>",
            RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase);
        int i = 0;
        string domainS = strs[0];
        string Extensions = strs[2];
        string OutDir = strs[1];
        var domain = new Uri(domainS);
        string[] Filters = Extensions.Split(new char[] { ';', ',', ' ' }, StringSplitOptions.RemoveEmptyEntries);
        string outPath = System.IO.Path.Combine(OutDir, string.Format("File_{0}.html", i));
        WebClient webClient = new WebClient();
        string str = webClient.DownloadString(domainS);
        str = str.Replace("\r\n", " ").Replace('\n', ' ');
        MatchCollection mc = reg.Matches(str);
        int NumOfThreads = mc.Count;
        Parallel.ForEach(mc.Cast<Match>(), new ParallelOptions { MaxDegreeOfParallelism = 2, },
            mat =>
            {
                string val = mat.Groups[2].Value;
                var link = new Uri(domain, val);
                foreach (string ext in Filters)
                    if (val.EndsWith("." + ext))
                    {
                        Download((object)new object[] { OutDir, link });
                        break;
                    }
            });
        // thrown deliberately so the catch block reports completion via ReportException
        throw new Exception("Finished !");
    }
    catch (System.Exception ex)
    {
        ReportException(ex);
    }
}
//--------------------------------------------------------------------------------------------
private static void Download(object o)
{
    try
    {
        object[] objs = o as object[];
        Uri link = (Uri)objs[1];
        string outPath = System.IO.Path.Combine((string)objs[0], System.IO.Path.GetFileName(link.ToString()));
        if (!File.Exists(outPath))
        {
            //WebClient webClient = new WebClient();
            //webClient.DownloadFile(link, outPath);
            DownloadFile(link.ToString(), outPath);
        }
    }
    catch (System.Exception ex)
    {
        ReportException(ex);
    }
}
//--------------------------------------------------------------------------------------------
private static bool DownloadFile(string url, string filePath)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);
        request.UserAgent = "Web Crawler";
        request.Timeout = 40000;
        // note: the response and its stream are never disposed here,
        // so each call keeps its connection open
        WebResponse response = request.GetResponse();
        Stream stream = response.GetResponseStream();
        using (FileStream fs = new FileStream(filePath, FileMode.CreateNew))
        {
            const int siz = 1000;
            byte[] bytes = new byte[siz];
            for (; ; )
            {
                int count = stream.Read(bytes, 0, siz);
                fs.Write(bytes, 0, count);
                if (count == 0) break;
            }
            fs.Flush();
            fs.Close();
        }
    }
    catch (System.Exception ex)
    {
        ReportException(ex);
        return false;
    }
    return true;
}
The problem is that while it works fine with 2 parallel downloads:
new ParallelOptions { MaxDegreeOfParallelism = 2, }
...it does not work for a greater degree of parallelism, such as:
new ParallelOptions { MaxDegreeOfParallelism = 5, }
...and I get connection timeout exceptions.
At first I thought it was because of WebClient:
//WebClient webClient = new WebClient();
//webClient.DownloadFile(link, outPath);
...but when I replaced it with the DownloadFile function that uses HttpWebRequest instead, I still got the error.
I have tested it on many web pages and nothing changed. I have also verified with the Chrome extension "Download Master" that these web servers do allow multiple parallel downloads. Does anyone have an idea why I get timeouts when trying to download many files in parallel?
You need to assign ServicePointManager.DefaultConnectionLimit. The default number of concurrent connections to the same host is 2. See also the related SO post on using web.config connectionManagement.
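For example (a minimal sketch; the value 10 is an arbitrary choice, not a recommendation), the limit can be raised once before any requests are made:

// Raise the per-host connection limit (the default is 2).
// This must run before the first request to a given host,
// since existing ServicePoints keep their old limit.
ServicePointManager.DefaultConnectionLimit = 10;

The same limit can also be configured in web.config through the connectionManagement section, e.g.:

<configuration>
  <system.net>
    <connectionManagement>
      <!-- address="*" applies to all hosts; maxconnection="10" is an arbitrary example -->
      <add address="*" maxconnection="10" />
    </connectionManagement>
  </system.net>
</configuration>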
As far as I know, IIS will limit the total number of connections in and out, but that number should be on the order of 10^3, not 5.
Are you testing against the same URL each time? I know that many web servers limit the number of simultaneous connections from a single client. For example: have you tried downloading 10 copies of http://www.google.com as a test? If so, you may want to try testing with a list of different websites instead.
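One quick way to check this (a hypothetical sketch; the URLs, the limit of 10, and the degree of parallelism of 5 are all placeholders) is to run the same degree of parallelism against distinct hosts with the per-host limit raised:

using System;
using System.Net;
using System.Threading.Tasks;

class ParallelDownloadTest
{
    static void Main()
    {
        // Placeholder list of distinct hosts; substitute real URLs to test.
        string[] urls =
        {
            "http://www.example.com/",
            "http://www.example.org/",
            "http://www.example.net/",
        };

        // Without this, at most 2 concurrent connections per host are used.
        ServicePointManager.DefaultConnectionLimit = 10;

        Parallel.ForEach(urls, new ParallelOptions { MaxDegreeOfParallelism = 5 }, url =>
        {
            using (var client = new WebClient())
            {
                string page = client.DownloadString(url);
                Console.WriteLine("{0}: {1} chars", url, page.Length);
            }
        });
    }
}

If five parallel downloads of the same host time out while the same parallelism across different hosts succeeds, the per-host connection limit is the likely culprit.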