How to download an image using Selenium (any version)?

前端 未结 13 1465
夕颜
夕颜 2020-11-29 04:11

I was wondering, how can one use selenium/webdriver to download an image for a page. Assuming that the user session is required to download the image hence having pure URL i

相关标签:
13条回答
  • 2020-11-29 04:43

    How to download to a file, taking URL from element text or attribute

    The complete extension code can be found here:

    https://github.com/gravity-api/gravity-core/blob/master/src/csharp/Gravity.Core/Gravity.Core/Extensions/WebElementExtensions.cs

    If you want to use this method without writing the code, use the NuGet https://www.nuget.org/packages/Gravity.Core/

    Install-Package Gravity.Core -Version 2020.7.5.3
    

    Usage

    using OpenQA.Selenium.Extensions;
     
    ...
     
    var driver = new ChromeDriver();

    // Download the resource whose address is stored in an element attribute.
    // Each result gets its own local: declaring "element" twice in one scope does not compile.
    var imageElement = driver.FindElement(By.XPath("//img[@id='my_img']")).DownloadResource(path: @"C:\images\cap_image_01.png", attribute: "src");

    // Download the resource whose address is the element's text.
    var textElement = driver.FindElement(By.XPath("//div[1]")).DownloadResource(path: @"C:\images\cap_image_01.png");
    

    It is recommended to use the NuGet package, since it contains many more tools and extensions for Selenium.

    For using without the NuGet (implement on your own)

    Extension Class

    using System.IO;
    using System.Net.Http;
    using System.Text.RegularExpressions;
     
    namespace Extensions
    {
        /// <summary>
        /// Fluent extensions that download the resource a web element points to.
        /// </summary>
        /// <remarks>
        /// <c>IWebElement</c> is not declared in this snippet; it must be brought into scope
        /// by the surrounding file (presumably with <c>using OpenQA.Selenium;</c>).
        /// </remarks>
        public static class WebElementExtensions
        {
            // A single shared client: creating one HttpClient per download wastes sockets.
            private static readonly HttpClient Client = new HttpClient();

            // Matches the trailing "name.ext" segment of an address, ignoring any query string.
            private static readonly Regex FileNamePattern =
                new Regex(@"[^/\\&\?]+\.\w{3,4}(?=([\?&].*$|$))");

            /// <summary>
            /// Downloads the resource whose address is the element's text.
            /// </summary>
            /// <param name="element">Element whose text holds the resource address.</param>
            /// <param name="path">Directory the downloaded file is written to.</param>
            /// <returns>The same element, so calls can be chained.</returns>
            public static IWebElement DownloadResource(this IWebElement element, string path)
            {
                return DoDownloadResource(element, path, "");
            }

            /// <summary>
            /// Downloads the resource whose address is stored in an element attribute.
            /// </summary>
            /// <param name="element">Element that carries the attribute.</param>
            /// <param name="path">Directory the downloaded file is written to.</param>
            /// <param name="attribute">
            /// Attribute holding the resource address; when null or empty the element's text is used.
            /// </param>
            /// <returns>The same element, so calls can be chained.</returns>
            public static IWebElement DownloadResource(this IWebElement element, string path, string attribute)
            {
                return DoDownloadResource(element, path, attribute);
            }

            // Resolves the address, fetches it and writes the body into the "path" directory.
            // A non-success HTTP status is treated as "nothing to do" and the element is returned as is.
            private static IWebElement DoDownloadResource(this IWebElement element, string path, string attribute)
            {
                // get resource address
                var resource = string.IsNullOrEmpty(attribute)
                    ? element.Text
                    : element.GetAttribute(attribute);

                // download resource; the response is disposed as soon as its content has been saved
                using (var response = Client.GetAsync(resource).GetAwaiter().GetResult())
                {
                    // exit condition
                    if (!response.IsSuccessStatusCode)
                    {
                        return element;
                    }

                    // create directories path
                    Directory.CreateDirectory(path);

                    // The file name comes from the address. GetFileName drops anything that
                    // Path.Combine could read as a root or directory part, so the file stays under "path".
                    var fileName = Path.GetFileName(FileNamePattern.Match(resource).Value);

                    // Path.Combine copes with a missing or present trailing separator on any platform.
                    var filePath = Path.Combine(path, fileName);

                    // write the file
                    var content = response.Content.ReadAsByteArrayAsync().GetAwaiter().GetResult();
                    File.WriteAllBytes(filePath, content);
                }

                // keep the fluent
                return element;
            }
        }
    }
    

    Usage

    using Extensions;
     
    ...
     
    var driver = new ChromeDriver();

    // DownloadResource passes "path" to Directory.CreateDirectory and takes the file name
    // from the resource address, so "path" must be the target directory, not a file name.

    // from element attribute
    var imageElement = driver.FindElement(By.XPath("//img[@id='my_img']")).DownloadResource(path: @"C:\images", attribute: "src");

    // from element text (a separate local: declaring "element" twice in one scope does not compile)
    var textElement = driver.FindElement(By.XPath("//div[1]")).DownloadResource(path: @"C:\images");
    
    0 讨论(0)
  • 2020-11-29 04:47

    Here is a JavaScript solution. It's a tad silly -- and I'm wary of hitting the source image's server with too many requests. Can someone tell me whether fetch() uses the browser's cache? I don't want to spam the source server.

    it appends a FileReader() to the window, fetches and converts the image to base64 and tags that string onto the window.

    the driver can then return that window variable.

    /**
     * Reads the first <img> under `.your-root-element` and returns it as a base64 data URL.
     *
     * The image is fetched inside the browser, converted with a FileReader and parked on
     * `window.profileImage`, which the driver then reads back.
     *
     * @param driver selenium-webdriver WebDriver instance pointed at the page.
     * @returns {Promise<string|null|undefined>} the data URL, or an empty value when anything
     *   failed (the error is logged, not rethrown).
     */
    export async function scrapePic(driver) {
      // Declared outside the try block so the final return can see it.
      let img64;
      try {
        console.log("waiting for that profile piccah")
        console.log(driver)

        const rootEl = await driver.findElement(By.css('.your-root-element'));
        const imgEl = await rootEl.findElement(By.css('img'));
        // The timeout is an argument of driver.wait(), not of until.elementIsVisible().
        await driver.wait(until.elementIsVisible(imgEl), 10000);
        console.log('profile piccah found')
        const img = await imgEl.getAttribute('src');

        // Attach the reader to the window and start the conversion. The result slot is
        // cleared first so a value left by an earlier call is never returned by mistake.
        await driver.executeScript(`
          window['profileImage'] = undefined;
          window.myFileReader = new FileReader();
          window.myFileReader.onloadend = function() {
            window['profileImage'] = this.result
          }
          fetch( arguments[0] ).then( res => res.blob() ).then( blob => window.myFileReader.readAsDataURL(blob) )
          `, img);

        // Poll for the result for up to 5 seconds instead of always sleeping the full 5 seconds.
        await driver.wait(async () => {
          img64 = await driver.executeScript(`return window.profileImage`);
          return Boolean(img64);
        }, 5000);
        console.log(img64)
      } catch (e) {
        console.log(e)
      }
      return img64;
    }
    
    0 讨论(0)
  • 2020-11-29 04:50

    For my use case there were cookies and other issues that made the other approaches here unsuitable.

    I ended up using an XMLHttpRequest to populate a FileReader (from "How to convert image into base64 string using javascript"), and then calling that using Selenium's ExecuteAsyncScript (as shown in "Selenium and asynchronous JavaScript calls"). This allowed me to get a Data URL, which was straightforward to parse.

    Here's my C# code for getting the Data URL:

    /// <summary>
    /// Loads an image inside the browser session and returns it as a Data URL.
    /// </summary>
    /// <param name="driver">Driver whose browser performs the request; it is cast to <c>IJavaScriptExecutor</c>.</param>
    /// <param name="imageUrl">Address of the image to load.</param>
    /// <returns>The Data URL produced by the FileReader, or null when the script result is not a string.</returns>
    public string ImageUrlToDataUrl(IWebDriver driver, string imageUrl)
    {
      // The URL is handed over as a script argument instead of being spliced into the
      // JavaScript source, so quotes or backslashes in it cannot break or alter the script.
      // ExecuteAsyncScript appends its completion callback as the last argument.
      const string script = @"
        var url = arguments[0];
        var done = arguments[arguments.length - 1];
        function toDataURL(url, callback) {
          var xhr = new XMLHttpRequest();
          xhr.onload = function() {
            var reader = new FileReader();
            reader.onloadend = function() {
              callback(reader.result);
            }
            reader.readAsDataURL(xhr.response);
          };
          xhr.open('GET', url);
          xhr.responseType = 'blob';
          xhr.send();
        }
        toDataURL(url, done);";

      var executor = (IJavaScriptExecutor) driver;
      var dataUrl = executor.ExecuteAsyncScript(script, imageUrl) as string;
      return dataUrl;
    }
    
    0 讨论(0)
  • 2020-11-29 04:52

    use selenium for getting the image src

    # Read the image address from the element's src attribute.
    elemImg.get_attribute('src')
    

    Then use your programming language to download it. For Python, check this answer: How to save an image locally using Python whose URL address I already know?

    0 讨论(0)
  • 2020-11-29 04:52

    Other solutions here don't work across all browsers, don't work across all websites, or both.

    This solution should be far more robust. It uses the browser to view the image, resizes the browser to fit the image size, takes a screenshot, and finally resizes the browser back to the original size.

    Python:

    def get_image(driver, img_url):
        '''Given an image's URL, return a screenshot of it as binary PNG data.

        The browser navigates to the image, the window is shrunk to the image
        size plus the window chrome, a screenshot is taken, and then the window
        size and the previous page are restored (even if a step in between fails).

        driver  -- WebDriver-like object providing get, execute_script,
                   set_window_size, get_screenshot_as_png and back.
        img_url -- address of the image to capture.
        '''
        # WebDriver navigates with get(); there is no get_url() method.
        driver.get(img_url)
        try:
            # Get the dimensions of the browser and image.
            orig_h = driver.execute_script("return window.outerHeight")
            orig_w = driver.execute_script("return window.outerWidth")
            margin_h = orig_h - driver.execute_script("return window.innerHeight")
            margin_w = orig_w - driver.execute_script("return window.innerWidth")
            new_h = driver.execute_script('return document.getElementsByTagName("img")[0].height')
            new_w = driver.execute_script('return document.getElementsByTagName("img")[0].width')

            # Arguments are passed separately so the message is only built when INFO is enabled.
            logging.info("Getting Image: orig %sX%s, marg %sX%s, img %sX%s - %s",
                         orig_w, orig_h, margin_w, margin_h, new_w, new_h, img_url)

            # Resize the browser window to the image plus the window chrome.
            driver.set_window_size(new_w + margin_w, new_h + margin_h)
            try:
                # Get the image by taking a screenshot of the page.
                img_val = driver.get_screenshot_as_png()
            finally:
                # Set the window size back to what it was.
                driver.set_window_size(orig_w, orig_h)
        finally:
            # Go back to where we started.
            driver.back()
        return img_val
    

    One disadvantage of this solution is that if the image is very small, the browser will not resize that small, and you may get a black border around it.

    0 讨论(0)
  • 2020-11-29 04:53

    I prefer doing something like this :

    1. Get the SRC attribute of the image.
    2. Use ImageIO.read to read the image onto a BufferedImage
    3. Save the BufferedImage using ImageIO.write function
    

    For example:

    // Read the image address from the element. Java string literals need double quotes;
    // 'src' would be an invalid character literal.
    String src = imgElement.getAttribute("src");
    // Download and decode the image. ImageIO.read returns null when no registered
    // reader can decode the data, so check the result in production code.
    BufferedImage bufferedImage = ImageIO.read(new URL(src));
    // Re-encode the decoded image as PNG in the working directory.
    File outputfile = new File("saved.png");
    ImageIO.write(bufferedImage, "png", outputfile);
    
    
    0 讨论(0)
提交回复
热议问题