Use threadpool to limit max number of threads - Attempted to read or write protected memory error

后端 未结 1 817
隐瞒了意图╮
隐瞒了意图╮ 2021-01-16 05:12

I am using some scraping code by Noseratio found here https://stackoverflow.com/a/22262976/3499115. He wrote it to scrape a list of URLs; I am using it, however, to ren

相关标签:
1条回答
  • 2021-01-16 05:53

    One solution is to use SemaphoreSlim to maintain a limited pool of WebBrowser objects that scrape web sites in parallel. It also makes sense to share a common message loop among all WebBrowser instances.

    Here is how it can be implemented, based on my console web scraper code you linked. The new part is the WebBrowserPool class (warning: only lightly tested):

    using Microsoft.Win32;
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Threading;
    using System.Threading.Tasks;
    using System.Windows.Forms;
    
    namespace AsyncWebBrowserScrapper
    {
        /// <summary>
        /// Console entry point demonstrating WebBrowserPool: scrapes a fixed list of URLs
        /// through a pool limited to two parallel WebBrowser instances.
        /// </summary>
        class Program
        {
            // by Noseratio - https://stackoverflow.com/a/23819021/1768303

            /// <summary>
            /// Test: web-scrap a list of URLs and print every URL followed by its HTML snapshot.
            /// </summary>
            /// <param name="urls">
            /// URLs to navigate to. They are used as dictionary keys, so a duplicate entry
            /// makes ToDictionary throw an ArgumentException.
            /// </param>
            /// <param name="token">Cancels all outstanding navigations when signalled.</param>
            static async Task ScrapSitesAsync(string[] urls, CancellationToken token)
            {
                using (var pool = new WebBrowserPool(maxParallel: 2, token: token))
                {
                    // cancel each site in 30s or when the main token is signalled
                    var timeout = (int)TimeSpan.FromSeconds(30).TotalMilliseconds;

                    // start every request up front; the pool throttles them internally
                    var results = urls.ToDictionary(
                        url => url, url => pool.ScrapSiteAsync(url, timeout));

                    // throws if any navigation failed or was cancelled, in which case nothing is printed
                    await Task.WhenAll(results.Values);

                    foreach (var entry in results)
                    {
                        Console.WriteLine("URL:\n" + entry.Key);

                        // all tasks have completed successfully at this point, so Result does not block
                        string html = entry.Value.Result;

                        Console.WriteLine("HTML:\n" + html);
                    }
                }
            }

            /// <summary>
            /// Entry point: runs the scraping test with an overall 3-minute time limit and
            /// exits with code -1 after printing the innermost exception message on failure.
            /// </summary>
            static void Main(string[] args)
            {
                try
                {
                    WebBrowserExt.SetFeatureBrowserEmulation(); // enable HTML5

                    // the timeout constructor allocates a timer, so the source must be disposed
                    using (var cts = new CancellationTokenSource((int)TimeSpan.FromMinutes(3).TotalMilliseconds))
                    {
                        var task = ScrapSitesAsync(
                            new[] { "http://example.com", "http://example.org", "http://example.net", "http://www.bing.com", "http://www.google.com" },
                            cts.Token);

                        // blocking is fine here: Main runs on a plain console thread without a synchronization context
                        task.Wait();
                    }

                    Console.WriteLine("Press Enter to exit...");
                    Console.ReadLine();
                }
                catch (Exception ex)
                {
                    // unwrap nested AggregateExceptions to report the root cause
                    while (ex is AggregateException && ex.InnerException != null)
                        ex = ex.InnerException;
                    Console.WriteLine(ex.Message);
                    Environment.Exit(-1);
                }
            }
        }
    
        /// <summary>
        /// WebBrowserPool - a pool of WebBrowser objects sharing the same message loop.
        /// The number of concurrent navigations is limited by a semaphore initialized with
        /// <c>maxParallel</c>; additional requests wait asynchronously for a free browser.
        /// </summary>
        public class WebBrowserPool : IDisposable
        {
            MessageLoopApartment _apartment; // a WinForms STA thread with message loop; null once disposed
            readonly SemaphoreSlim _semaphore; // regulate available browsers
            readonly Queue<WebBrowser> _browsers; // the pool of available browsers (touched on the STA thread only)
            readonly HashSet<Task> _pendingTasks; // keep track of pending tasks for proper cancellation
            readonly CancellationTokenSource _cts; // global cancellation (for Dispose)

            /// <summary>Creates the STA apartment and fills the pool with WebBrowser objects.</summary>
            /// <param name="maxParallel">Number of WebBrowser objects to create; must be at least 1.</param>
            /// <param name="token">External token that cancels every navigation started through this pool.</param>
            /// <exception cref="ArgumentOutOfRangeException">
            /// <paramref name="maxParallel"/> is less than 1 (derives from ArgumentException).
            /// </exception>
            public WebBrowserPool(int maxParallel, CancellationToken token)
            {
                if (maxParallel < 1)
                    throw new ArgumentOutOfRangeException("maxParallel", maxParallel, "At least one WebBrowser is required.");

                _cts = CancellationTokenSource.CreateLinkedTokenSource(token);
                _apartment = new MessageLoopApartment();
                _semaphore = new SemaphoreSlim(maxParallel);
                _browsers = new Queue<WebBrowser>();
                _pendingTasks = new HashSet<Task>();

                // init the pool of WebBrowser objects; they are created on the apartment's thread
                _apartment.Invoke(() =>
                {
                    for (var i = 0; i < maxParallel; i++)
                        _browsers.Enqueue(new WebBrowser());
                });
            }

            /// <summary>Navigate to a site and get a snapshot of its DOM HTML.</summary>
            /// <param name="url">Address to navigate to.</param>
            /// <param name="timeout">
            /// Milliseconds allowed for the navigation, counted from the moment a browser becomes
            /// available; <see cref="Timeout.Infinite"/> disables the limit.
            /// </param>
            /// <param name="token">Optional per-call cancellation token, combined with the pool-wide one.</param>
            /// <returns>The HTML returned by <c>WebBrowserExt.NavigateAsync</c>.</returns>
            /// <exception cref="OperationCanceledException">The timeout elapsed or a token was signalled.</exception>
            /// <exception cref="ObjectDisposedException">The pool has already been disposed.</exception>
            public async Task<string> ScrapSiteAsync(string url, int timeout, CancellationToken token = default(CancellationToken))
            {
                // the linked source registers callbacks on both parent tokens (and owns the CancelAfter timer),
                // so it has to be disposed once this navigation is over
                using (var navigationCts = CancellationTokenSource.CreateLinkedTokenSource(token, _cts.Token))
                {
                    var combinedToken = navigationCts.Token;

                    // we have a limited number of WebBrowser objects available, so await the semaphore
                    await _semaphore.WaitAsync(combinedToken);
                    try
                    {
                        // the timeout clock starts only after a browser slot has been obtained
                        if (timeout != Timeout.Infinite)
                            navigationCts.CancelAfter(timeout);

                        // run the main logic on the STA thread
                        return await _apartment.Run(async () =>
                        {
                            // acquire the 1st available WebBrowser from the pool
                            var webBrowser = _browsers.Dequeue();
                            try
                            {
                                var task = webBrowser.NavigateAsync(url, combinedToken);
                                _pendingTasks.Add(task); // register the pending task
                                try
                                {
                                    return await task;
                                }
                                finally
                                {
                                    // unregister the completed task
                                    _pendingTasks.Remove(task);
                                }
                            }
                            finally
                            {
                                // return the WebBrowser to the pool
                                _browsers.Enqueue(webBrowser);
                            }
                        }, combinedToken);
                    }
                    finally
                    {
                        _semaphore.Release();
                    }
                }
            }

            /// <summary>
            /// Cancels and waits for all pending navigations, disposes of the pooled WebBrowser objects
            /// and shuts down the STA thread. Calling it again after the first call is a no-op.
            /// </summary>
            public void Dispose()
            {
                // Dispose must tolerate repeated calls
                if (_apartment == null)
                    return;

                // cancel and wait for all pending tasks
                _cts.Cancel();
                var task = _apartment.Run(() => Task.WhenAll(_pendingTasks.ToArray()));
                try
                {
                    task.Wait();
                }
                catch
                {
                    // cancellation is the expected outcome here; anything else is a real failure
                    if (!task.IsCanceled)
                        throw;
                }

                // dispose of WebBrowser objects; this is queued ahead of the thread shutdown requested below
                _apartment.Run(() =>
                {
                    while (_browsers.Any())
                        _browsers.Dequeue().Dispose();
                });

                _apartment.Dispose();
                _apartment = null;

                // release the registration held on the external token
                _cts.Dispose();
            }
        }
    
        /// <summary>
        /// WebBrowserExt - extension helpers for WebBrowser
        /// by Noseratio - https://stackoverflow.com/a/22262976/1768303
        /// </summary>
        public static class WebBrowserExt
        {
            // delay, in milliseconds, between two consecutive HTML snapshots
            const int POLL_DELAY = 500;

            /// <summary>
            /// Navigates to <paramref name="url"/>, waits for DocumentCompleted and then polls the
            /// document's HTML until two consecutive snapshots are equal while the browser is not busy.
            /// </summary>
            /// <param name="webBrowser">The browser performing the navigation.</param>
            /// <param name="url">Address to navigate to.</param>
            /// <param name="token">Stops the navigation and cancels the returned task when signalled.</param>
            /// <returns>The outer HTML of the first "html" element once it has stopped changing.</returns>
            public static async Task<string> NavigateAsync(this WebBrowser webBrowser, string url, CancellationToken token)
            {
                var completion = new TaskCompletionSource<bool>();

                WebBrowserDocumentCompletedEventHandler onCompleted = (sender, e) =>
                {
                    completion.TrySetResult(true);
                };

                Action onCancelled = () =>
                {
                    webBrowser.Stop();
                    completion.TrySetCanceled();
                };

                // phase 1: navigate and await DocumentCompleted (or cancellation)
                using (token.Register(onCancelled, useSynchronizationContext: true))
                {
                    webBrowser.DocumentCompleted += onCompleted;
                    try
                    {
                        webBrowser.Navigate(url);
                        await completion.Task;
                    }
                    finally
                    {
                        webBrowser.DocumentCompleted -= onCompleted;
                    }
                }

                // phase 2: poll the root element's HTML asynchronously until it settles
                var root = webBrowser.Document.GetElementsByTagName("html")[0];
                var snapshot = root.OuterHtml;
                var settled = false;

                while (!settled)
                {
                    // asynchronous wait; throws if cancellation has been requested
                    await Task.Delay(POLL_DELAY, token);

                    // only compare snapshots when the WebBrowser is no longer busy
                    if (!webBrowser.IsBusy)
                    {
                        var latest = root.OuterHtml;
                        settled = snapshot == latest; // unchanged since the previous poll
                        snapshot = latest;
                    }
                }

                // the page is considered fully rendered
                token.ThrowIfCancellationRequested();
                return snapshot;
            }

            /// <summary>
            /// Enable HTML5 (assuming we're running IE10+) by writing the FEATURE_BROWSER_EMULATION
            /// value for the current executable under HKEY_CURRENT_USER; does nothing unless
            /// LicenseManager reports runtime usage mode.
            /// More info: https://stackoverflow.com/a/18333982/1768303
            /// </summary>
            public static void SetFeatureBrowserEmulation()
            {
                var isRuntime =
                    System.ComponentModel.LicenseManager.UsageMode == System.ComponentModel.LicenseUsageMode.Runtime;

                if (isRuntime)
                {
                    var exePath = System.Diagnostics.Process.GetCurrentProcess().MainModule.FileName;
                    var appName = System.IO.Path.GetFileName(exePath);

                    const string emulationKey =
                        @"HKEY_CURRENT_USER\Software\Microsoft\Internet Explorer\Main\FeatureControl\FEATURE_BROWSER_EMULATION";

                    // NOTE(review): 10000 presumably selects IE10 document mode - confirm against the FEATURE_BROWSER_EMULATION docs
                    Registry.SetValue(emulationKey, appName, 10000, RegistryValueKind.DWord);
                }
            }
        }
    
        /// <summary>
        /// MessageLoopApartment
        /// An STA thread running a WinForms message loop, used for serial execution of tasks
        /// by Noseratio - https://stackoverflow.com/a/22262976/1768303
        /// </summary>
        public class MessageLoopApartment : IDisposable
        {
            Thread _thread; // the STA thread; null after Dispose

            TaskScheduler _taskScheduler; // scheduler bound to the STA thread; null after Dispose

            /// <summary>The scheduler that queues work to the apartment's thread.</summary>
            public TaskScheduler TaskScheduler { get { return _taskScheduler; } }

            /// <summary>
            /// Starts the STA thread and blocks until its message loop is running and a task
            /// scheduler for that thread has been obtained.
            /// </summary>
            public MessageLoopApartment()
            {
                var schedulerReady = new TaskCompletionSource<TaskScheduler>();

                _thread = new Thread(() =>
                {
                    EventHandler onFirstIdle = null;

                    onFirstIdle = (sender, e) =>
                    {
                        // one-shot handler: detach before publishing the scheduler
                        Application.Idle -= onFirstIdle;
                        schedulerReady.SetResult(TaskScheduler.FromCurrentSynchronizationContext());
                    };

                    // the first Application.Idle proves we are inside the message loop
                    // and that the SynchronizationContext has been installed
                    Application.Idle += onFirstIdle;
                    Application.Run();
                });

                _thread.IsBackground = true;
                _thread.SetApartmentState(ApartmentState.STA);
                _thread.Start();

                // wait for the thread to hand over its scheduler
                _taskScheduler = schedulerReady.Task.Result;
            }

            /// <summary>Shuts down the STA thread; does nothing if already disposed.</summary>
            public void Dispose()
            {
                if (_taskScheduler == null)
                    return;

                var scheduler = _taskScheduler;
                _taskScheduler = null;

                // Application.ExitThread() has to be executed on the STA thread itself
                var exitRequest = Task.Factory.StartNew(
                    () => Application.ExitThread(),
                    CancellationToken.None,
                    TaskCreationOptions.None,
                    scheduler);
                exitRequest.Wait();

                _thread.Join();
                _thread = null;
            }

            /// <summary>Runs <paramref name="action"/> on the STA thread and blocks until it finishes.</summary>
            public void Invoke(Action action)
            {
                var work = Task.Factory.StartNew(
                    action, CancellationToken.None, TaskCreationOptions.None, _taskScheduler);
                work.Wait();
            }

            /// <summary>Runs <paramref name="action"/> on the STA thread and blocks for its result.</summary>
            public TResult Invoke<TResult>(Func<TResult> action)
            {
                var work = Task.Factory.StartNew(
                    action, CancellationToken.None, TaskCreationOptions.None, _taskScheduler);
                return work.Result;
            }

            /// <summary>Schedules <paramref name="action"/> on the STA thread without waiting for it.</summary>
            public Task Run(Action action, CancellationToken token = default(CancellationToken))
            {
                var factory = Task.Factory;
                return factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler);
            }

            /// <summary>Schedules a value-returning <paramref name="action"/> on the STA thread.</summary>
            public Task<TResult> Run<TResult>(Func<TResult> action, CancellationToken token = default(CancellationToken))
            {
                var factory = Task.Factory;
                return factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler);
            }

            /// <summary>
            /// Schedules an asynchronous <paramref name="action"/> on the STA thread; the returned
            /// task represents the task produced by the delegate rather than just its start.
            /// </summary>
            public Task Run(Func<Task> action, CancellationToken token = default(CancellationToken))
            {
                Task<Task> outer = Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler);
                return outer.Unwrap();
            }

            /// <summary>
            /// Schedules an asynchronous, value-returning <paramref name="action"/> on the STA thread;
            /// the returned task represents the task produced by the delegate.
            /// </summary>
            public Task<TResult> Run<TResult>(Func<Task<TResult>> action, CancellationToken token = default(CancellationToken))
            {
                Task<Task<TResult>> outer = Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler);
                return outer.Unwrap();
            }
        }
    }
    
    0 讨论(0)
提交回复
热议问题