[英]How to cancel Task await after a timeout period
我正在使用此方法以編程方式實例化 Web 瀏覽器,導航到 url 並在文檔完成后返回結果。
如果文檔加載時間超過 5 秒,我如何能夠停止Task
並讓GetFinalUrl()
返回null
?
我已經看到許多使用TaskFactory
示例,但我無法將其應用於此代碼。
private Uri GetFinalUrl(PortalMerchant portalMerchant)
{
SetBrowserFeatureControl();
Uri finalUri = null;
if (string.IsNullOrEmpty(portalMerchant.Url))
{
return null;
}
Uri trackingUrl = new Uri(portalMerchant.Url);
var task = MessageLoopWorker.Run(DoWorkAsync, trackingUrl);
task.Wait();
if (!String.IsNullOrEmpty(task.Result.ToString()))
{
return new Uri(task.Result.ToString());
}
else
{
throw new Exception("Parsing Failed");
}
}
// by Noseratio - http://stackoverflow.com/users/1768303/noseratio
static async Task<object> DoWorkAsync(object[] args)
{
_threadCount++;
Console.WriteLine("Thread count:" + _threadCount);
Uri retVal = null;
var wb = new WebBrowser();
wb.ScriptErrorsSuppressed = true;
TaskCompletionSource<bool> tcs = null;
WebBrowserDocumentCompletedEventHandler documentCompletedHandler = (s, e) => tcs.TrySetResult(true);
foreach (var url in args)
{
tcs = new TaskCompletionSource<bool>();
wb.DocumentCompleted += documentCompletedHandler;
try
{
wb.Navigate(url.ToString());
await tcs.Task;
}
finally
{
wb.DocumentCompleted -= documentCompletedHandler;
}
retVal = wb.Url;
wb.Dispose();
return retVal;
}
return null;
}
public static class MessageLoopWorker
{
#region Public static methods
public static async Task<object> Run(Func<object[], Task<object>> worker, params object[] args)
{
var tcs = new TaskCompletionSource<object>();
var thread = new Thread(() =>
{
EventHandler idleHandler = null;
idleHandler = async (s, e) =>
{
// handle Application.Idle just once
Application.Idle -= idleHandler;
// return to the message loop
await Task.Yield();
// and continue asynchronously
// propogate the result or exception
try
{
var result = await worker(args);
tcs.SetResult(result);
}
catch (Exception ex)
{
tcs.SetException(ex);
}
// signal to exit the message loop
// Application.Run will exit at this point
Application.ExitThread();
};
// handle Application.Idle just once
// to make sure we're inside the message loop
// and SynchronizationContext has been correctly installed
Application.Idle += idleHandler;
Application.Run();
});
// set STA model for the new thread
thread.SetApartmentState(ApartmentState.STA);
// start the thread and await for the task
thread.Start();
try
{
return await tcs.Task;
}
finally
{
thread.Join();
}
}
#endregion
}
更新:在最新版本的WebBrowser
為基礎的控制台網站刮板可以在Github上找到。
更新:為多個並行下載添加一個WebBrowser
對象池。
您是否有任何機會在控制台應用程序中執行此操作的示例? 此外,我不認為 webBrowser 可以是一個類變量,因為我正在為每個並行運行整個事物,迭代數千個 URL
下面是一個或多或少通用的 ** 基於WebBrowser
的網絡爬蟲** 的實現,它作為控制台應用程序工作。 它整合了我之前與WebBrowser
相關的一些工作,包括問題中引用的代碼:
幾點:
可重用MessageLoopApartment
類用於啟動和運行帶有自己的消息泵的 WinForms STA 線程。 它可以從控制台應用程序中使用,如下所示。 此類公開了一個 TPL 任務調度程序 ( FromCurrentSynchronizationContext
) 和一組Task.Factory.StartNew
包裝器以使用此任務調度程序。
這使得async/await
成為在單獨的 STA 線程上運行WebBrowser
導航任務的絕佳工具。 這樣,在該線程上創建、導航和銷毀WebBrowser
對象。 雖然, MessageLoopApartment
並沒有專門綁定到WebBrowser
。
使用Browser Feature Control啟用 HTML5 渲染非常重要,否則默認情況下WebBrowser
對象將在 IE7 仿真模式下運行。 這就是SetFeatureBrowserEmulation
下面所做的。
並非總是可以確定網頁何時以 100% 的概率完成呈現。 一些頁面非常復雜並且使用持續的 AJAX 更新。 然而,我們可以通過首先處理DocumentCompleted
事件,然后輪詢頁面的當前 HTML 快照以進行更改並檢查WebBrowser.IsBusy
屬性來非常接近。 這就是NavigateAsync
在下面所做的。
超時邏輯存在於上述之上,以防頁面呈現永無止境(注意CancellationTokenSource
和CreateLinkedTokenSource
)。
using Microsoft.Win32;
using System;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
namespace Console_22239357
{
class Program
{
// by Noseratio - https://stackoverflow.com/a/22262976/1768303
// main logic
static async Task ScrapeSitesAsync(string[] urls, CancellationToken token)
{
using (var apartment = new MessageLoopApartment())
{
// create WebBrowser inside MessageLoopApartment
var webBrowser = apartment.Invoke(() => new WebBrowser());
try
{
foreach (var url in urls)
{
Console.WriteLine("URL:\n" + url);
// cancel in 30s or when the main token is signalled
var navigationCts = CancellationTokenSource.CreateLinkedTokenSource(token);
navigationCts.CancelAfter((int)TimeSpan.FromSeconds(30).TotalMilliseconds);
var navigationToken = navigationCts.Token;
// run the navigation task inside MessageLoopApartment
string html = await apartment.Run(() =>
webBrowser.NavigateAsync(url, navigationToken), navigationToken);
Console.WriteLine("HTML:\n" + html);
}
}
finally
{
// dispose of WebBrowser inside MessageLoopApartment
apartment.Invoke(() => webBrowser.Dispose());
}
}
}
// entry point
static void Main(string[] args)
{
try
{
WebBrowserExt.SetFeatureBrowserEmulation(); // enable HTML5
var cts = new CancellationTokenSource((int)TimeSpan.FromMinutes(3).TotalMilliseconds);
var task = ScrapeSitesAsync(
new[] { "http://example.com", "http://example.org", "http://example.net" },
cts.Token);
task.Wait();
Console.WriteLine("Press Enter to exit...");
Console.ReadLine();
}
catch (Exception ex)
{
while (ex is AggregateException && ex.InnerException != null)
ex = ex.InnerException;
Console.WriteLine(ex.Message);
Environment.Exit(-1);
}
}
}
/// <summary>
/// WebBrowserExt - WebBrowser extensions
/// by Noseratio - https://stackoverflow.com/a/22262976/1768303
/// </summary>
public static class WebBrowserExt
{
const int POLL_DELAY = 500;
// navigate and download
public static async Task<string> NavigateAsync(this WebBrowser webBrowser, string url, CancellationToken token)
{
// navigate and await DocumentCompleted
var tcs = new TaskCompletionSource<bool>();
WebBrowserDocumentCompletedEventHandler handler = (s, arg) =>
tcs.TrySetResult(true);
using (token.Register(() => tcs.TrySetCanceled(), useSynchronizationContext: true))
{
webBrowser.DocumentCompleted += handler;
try
{
webBrowser.Navigate(url);
await tcs.Task; // wait for DocumentCompleted
}
finally
{
webBrowser.DocumentCompleted -= handler;
}
}
// get the root element
var documentElement = webBrowser.Document.GetElementsByTagName("html")[0];
// poll the current HTML for changes asynchronosly
var html = documentElement.OuterHtml;
while (true)
{
// wait asynchronously, this will throw if cancellation requested
await Task.Delay(POLL_DELAY, token);
// continue polling if the WebBrowser is still busy
if (webBrowser.IsBusy)
continue;
var htmlNow = documentElement.OuterHtml;
if (html == htmlNow)
break; // no changes detected, end the poll loop
html = htmlNow;
}
// consider the page fully rendered
token.ThrowIfCancellationRequested();
return html;
}
// enable HTML5 (assuming we're running IE10+)
// more info: https://stackoverflow.com/a/18333982/1768303
public static void SetFeatureBrowserEmulation()
{
if (System.ComponentModel.LicenseManager.UsageMode != System.ComponentModel.LicenseUsageMode.Runtime)
return;
var appName = System.IO.Path.GetFileName(System.Diagnostics.Process.GetCurrentProcess().MainModule.FileName);
Registry.SetValue(@"HKEY_CURRENT_USER\Software\Microsoft\Internet Explorer\Main\FeatureControl\FEATURE_BROWSER_EMULATION",
appName, 10000, RegistryValueKind.DWord);
}
}
/// <summary>
/// MessageLoopApartment
/// STA thread with message pump for serial execution of tasks
/// by Noseratio - https://stackoverflow.com/a/22262976/1768303
/// </summary>
public class MessageLoopApartment : IDisposable
{
Thread _thread; // the STA thread
TaskScheduler _taskScheduler; // the STA thread's task scheduler
public TaskScheduler TaskScheduler { get { return _taskScheduler; } }
/// <summary>MessageLoopApartment constructor</summary>
public MessageLoopApartment()
{
var tcs = new TaskCompletionSource<TaskScheduler>();
// start an STA thread and gets a task scheduler
_thread = new Thread(startArg =>
{
EventHandler idleHandler = null;
idleHandler = (s, e) =>
{
// handle Application.Idle just once
Application.Idle -= idleHandler;
// return the task scheduler
tcs.SetResult(TaskScheduler.FromCurrentSynchronizationContext());
};
// handle Application.Idle just once
// to make sure we're inside the message loop
// and SynchronizationContext has been correctly installed
Application.Idle += idleHandler;
Application.Run();
});
_thread.SetApartmentState(ApartmentState.STA);
_thread.IsBackground = true;
_thread.Start();
_taskScheduler = tcs.Task.Result;
}
/// <summary>shutdown the STA thread</summary>
public void Dispose()
{
if (_taskScheduler != null)
{
var taskScheduler = _taskScheduler;
_taskScheduler = null;
// execute Application.ExitThread() on the STA thread
Task.Factory.StartNew(
() => Application.ExitThread(),
CancellationToken.None,
TaskCreationOptions.None,
taskScheduler).Wait();
_thread.Join();
_thread = null;
}
}
/// <summary>Task.Factory.StartNew wrappers</summary>
public void Invoke(Action action)
{
Task.Factory.StartNew(action,
CancellationToken.None, TaskCreationOptions.None, _taskScheduler).Wait();
}
public TResult Invoke<TResult>(Func<TResult> action)
{
return Task.Factory.StartNew(action,
CancellationToken.None, TaskCreationOptions.None, _taskScheduler).Result;
}
public Task Run(Action action, CancellationToken token)
{
return Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler);
}
public Task<TResult> Run<TResult>(Func<TResult> action, CancellationToken token)
{
return Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler);
}
public Task Run(Func<Task> action, CancellationToken token)
{
return Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler).Unwrap();
}
public Task<TResult> Run<TResult>(Func<Task<TResult>> action, CancellationToken token)
{
return Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler).Unwrap();
}
}
}
我懷疑在另一個線程上運行處理循環效果不佳,因為WebBrowser
是一個承載 ActiveX 控件的 UI 組件。
當您通過 EAP 包裝器編寫TAP 時,我建議使用擴展方法來保持代碼整潔:
public static Task<string> NavigateAsync(this WebBrowser @this, string url)
{
var tcs = new TaskCompletionSource<string>();
WebBrowserDocumentCompletedEventHandler subscription = null;
subscription = (_, args) =>
{
@this.DocumentCompleted -= subscription;
tcs.TrySetResult(args.Url.ToString());
};
@this.DocumentCompleted += subscription;
@this.Navigate(url);
return tcs.Task;
}
現在您的代碼可以輕松應用超時:
async Task<string> GetUrlAsync(string url)
{
using (var wb = new WebBrowser())
{
var navigate = wb.NavigateAsync(url);
var timeout = Task.Delay(TimeSpan.FromSeconds(5));
var completed = await Task.WhenAny(navigate, timeout);
if (completed == navigate)
return await navigate;
return null;
}
}
可以這樣消費:
private async Task<Uri> GetFinalUrlAsync(PortalMerchant portalMerchant)
{
SetBrowserFeatureControl();
if (string.IsNullOrEmpty(portalMerchant.Url))
return null;
var result = await GetUrlAsync(portalMerchant.Url);
if (!String.IsNullOrEmpty(result))
return new Uri(result);
throw new Exception("Parsing Failed");
}
我正在嘗試從 Noseratio 的解決方案中受益,並遵循 Stephen Cleary 的建議。
這是我更新的代碼,包含在來自斯蒂芬的代碼中,來自 Noseratio 的關於 AJAX 技巧的代碼。
第一部分:Stephen 建議的Task NavigateAsync
public static Task<string> NavigateAsync(this WebBrowser @this, string url)
{
var tcs = new TaskCompletionSource<string>();
WebBrowserDocumentCompletedEventHandler subscription = null;
subscription = (_, args) =>
{
@this.DocumentCompleted -= subscription;
tcs.TrySetResult(args.Url.ToString());
};
@this.DocumentCompleted += subscription;
@this.Navigate(url);
return tcs.Task;
}
第二部分:一個新的Task NavAjaxAsync
來運行 AJAX 的提示(基於 Noseratio 的代碼)
public static async Task<string> NavAjaxAsync(this WebBrowser @this)
{
// get the root element
var documentElement = @this.Document.GetElementsByTagName("html")[0];
// poll the current HTML for changes asynchronosly
var html = documentElement.OuterHtml;
while (true)
{
// wait asynchronously
await Task.Delay(POLL_DELAY);
// continue polling if the WebBrowser is still busy
if (webBrowser.IsBusy)
continue;
var htmlNow = documentElement.OuterHtml;
if (html == htmlNow)
break; // no changes detected, end the poll loop
html = htmlNow;
}
return @this.Document.Url.ToString();
}
第三部分:一個新的Task NavAndAjaxAsync
來獲取導航和 AJAX
public static async Task NavAndAjaxAsync(this WebBrowser @this, string url)
{
await @this.NavigateAsync(url);
await @this.NavAjaxAsync();
}
第四部分也是最后一部分:Stephen 更新的Task GetUrlAsync
和 Noseratio 的 AJAX 代碼
async Task<string> GetUrlAsync(string url)
{
using (var wb = new WebBrowser())
{
var navigate = wb.NavAndAjaxAsync(url);
var timeout = Task.Delay(TimeSpan.FromSeconds(5));
var completed = await Task.WhenAny(navigate, timeout);
if (completed == navigate)
return await navigate;
return null;
}
}
我想知道這是否是正確的方法。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.