Download a file to a specific path using Selenium WebDriver

前端 未结 3 1394
春和景丽
春和景丽 2021-01-15 13:54

I need to download a file to a given location on a non-local machine. This is the normal flow of the web browser for which I would do this:

  • Go to website
相关标签:
3条回答
  • 2021-01-15 14:45

    When you initialize your driver, be sure to set the download preferences.

    For Firefox:

    ##
    # if FF still shows the download dialog, make sure that the filetype is included below
    # filetype string options can be found in '~/.mozilla/$USER_PROFILE/mimeTypes.rdf'
    ##
    mime_types = ("application/pdf", "text/html")
    mime_type_list = ", ".join( mime_types )

    # Each (name, value) pair is applied to the profile in the order listed.
    download_preferences = (
        ( "browser.download.manager.showWhenStarting", False ),
        ( "browser.download.folderList", 2 ),
        ( "browser.download.useDownloadDir", True ),
        ( "browser.download.dir", self.driver_settings['download_folder'] ),
        # the same MIME list is used for both the "save to disk" and "open file" prompts
        ( "browser.helperApps.neverAsk.saveToDisk", mime_type_list ),
        ( "browser.helperApps.neverAsk.openFile", mime_type_list ),
    )
    for pref_name, pref_value in download_preferences:
        ff_prof.set_preference( pref_name, pref_value )
    

    For Chrome:

    # Bind the nested prefs mapping once, then set both download options on it.
    chrome_prefs = capabilities['chromeOptions']['prefs']
    chrome_prefs['download.prompt_for_download'] = False
    chrome_prefs['download.default_directory'] = self.driver_settings['download_folder']
    

    Forwarding the download:

    Below is the code I use to redirect the file from self.driver_settings['download_folder'] (set above) to where you actually want the file (to_path can be an existing folder or a file path). If you're on Linux, I'd suggest using tmpfs so that /tmp is held in RAM, and then setting self.driver_settings['download_folder'] to "/tmp/driver_downloads/". Note that the function below assumes that self.driver_settings['download_folder'] always begins as an empty folder (this is how it locates the file being downloaded, since it is the only one in the directory).

    def moveDriverDownload(self, to_path, allowable_extensions, allow_rename_if_exists=False, timeout_seconds=None):
        """
        Wait for the browser download to finish, then move it to ``to_path``.

        The download is located by scanning ``self.driver_settings['download_folder']``
        for a file whose name ends with one of ``allowable_extensions`` (or with
        ``.part``, which is always accepted as an in-progress download). The folder
        is expected to hold nothing but the file being downloaded.

        Parameters:
            to_path: destination; either an existing directory (the downloaded
                file keeps its name) or a full file path. Backslashes are
                converted to forward slashes.
            allowable_extensions: list, tuple or set of extensions, with or
                without the leading dot; matched case-insensitively.
            allow_rename_if_exists: when the destination already exists, append
                "-2", "-3", ... before the first dot of the file name instead of
                raising.
            timeout_seconds: overall budget shared by both waiting phases
                (defaults to 30).

        Returns:
            The file name (without directory) the download was moved to.

        Raises:
            DownloadTimeoutError: no matching file appeared within the timeout.
            Exception: ambiguous folder contents, unsupported platform, error
                output from lsof/handle, the file still being open when the
                timeout expires, or the destination existing while renaming is
                not allowed.
        """
        if timeout_seconds is None:
            timeout_seconds = 30
        wait_delta = timedelta(seconds=timeout_seconds)
        start_download_time = datetime.now()

        def hasTimedOut():
            return datetime.now() - start_download_time > wait_delta

        def toText(data):
            # communicate() returns bytes on Python 3; comparing bytes with ""
            # is never equal, so normalise everything to str before checking it.
            if isinstance(data, str):
                return data
            return data.decode("utf-8", "replace")

        assert isinstance(allowable_extensions, (list, tuple, set)), "instead of a list, found allowable_extensions type of '{}'".format(type(allowable_extensions))
        allowable_extensions = [elem.lower().strip() for elem in allowable_extensions]
        allowable_extensions = [elem if elem.startswith(".") else "." + elem for elem in allowable_extensions]

        if ".part" not in allowable_extensions:
            allowable_extensions.append(".part")
        extension_suffixes = tuple(allowable_extensions)

        download_folder = self.driver_settings['download_folder']

        def getFiles():
            # A folder that does not exist yet is treated as empty, so the wait
            # loop ends with the timeout error instead of a bare StopIteration.
            return next(os.walk(download_folder), (None, [], []))[2]

        # Phase 1: wait until a candidate file shows up in the download folder.
        while True:
            if hasTimedOut():
                allowable_extensions.remove(".part")
                raise DownloadTimeoutError( "timed out after {} seconds while waiting on file download with extension in {}".format(timeout_seconds, allowable_extensions) )

            time.sleep(0.5)

            # Extensions were lower-cased above, so lower-case the name as well;
            # otherwise "REPORT.PDF" would never match ".pdf".
            file_list = [elem for elem in getFiles() if elem.lower().endswith(extension_suffixes)]
            if file_list:
                break

        # Drop a trailing ".part" so "x.pdf" and "x.pdf.part" count as one download.
        file_list = [elem[:-len(".part")] if elem.lower().endswith(".part") else elem for elem in file_list]

        if len(file_list) > 1:
            if len(file_list) == 2:
                if file_list[0] != file_list[1]:
                    raise Exception( "file_list[0] != file_list[1] <==> {} != {}".format(file_list[0], file_list[1]) )
            else:
                raise Exception( "len(file_list) > 1. found {}".format(file_list) )

        # os.path.join works whether or not the folder setting ends with a separator.
        file_path = os.path.join(download_folder, file_list[0])

        # see if the file is still being downloaded by checking if it's open by any programs
        system_name = platform.system()
        if system_name == "Linux":
            # Run lsof without a shell and filter in Python: the path is never
            # parsed by a shell nor interpreted as a grep regular expression.
            # The substring test also catches the matching "<name>.part" file.
            command = ["lsof"]

            def openHandles(txt):
                return "\n".join(line for line in txt.splitlines() if file_path in line)

            def fileIsFinished(txt):
                return txt.strip() == ""
        elif system_name == "Windows":
            # 'handle' program must be in PATH
            # https://technet.microsoft.com/en-us/sysinternals/bb896655
            command = ["handle", file_path.replace("/", "\\")]

            def openHandles(txt):
                return txt

            def fileIsFinished(txt):
                return bool(re.search("(?i)No matching handles found", txt))
        else:
            raise Exception( "unrecognised platform.system() of '{}'".format(platform.system()) )

        # Phase 2: wait until no process holds the file open any more.
        while True:
            lsof_process = subprocess.Popen(command, stdout=subprocess.PIPE, stdin=subprocess.PIPE, stderr=subprocess.PIPE)
            lsof_result = tuple(toText(stream) for stream in lsof_process.communicate())

            if lsof_result[1].strip() != "":
                raise Exception( 'lsof_result[1].strip() != "". found {}'.format(lsof_result) )

            open_handles = openHandles(lsof_result[0])
            if fileIsFinished(open_handles):
                break

            if hasTimedOut():
                raise Exception( "timed out after {} seconds waiting for '{}' to be freed from writing. found lsof/handle of '{}'".format(timeout_seconds, file_path, open_handles) )

            time.sleep(0.5)

        to_path = to_path.replace("\\", "/")
        if os.path.isdir(to_path):
            if not to_path.endswith("/"):
                to_path += "/"

            to_path += file_list[0]

        if os.path.exists(to_path):
            if not allow_rename_if_exists:
                raise Exception( "{} already exists".format(to_path) )

            # Split once, then try "<stem>-2<suffix>", "<stem>-3<suffix>", ...
            # rpartition copes with a bare file name (no "/"), which previously
            # left the path unchanged on every pass and so looped forever.
            # NOTE: a name already ending in "-1" keeps it ("a-1.txt" -> "a-1-2.txt").
            head, separator, dest_name = to_path.rpartition("/")
            stem, dot, rest = dest_name.partition(".")
            i = 2
            while os.path.exists(to_path):
                to_path = "{}{}{}-{}{}{}".format(head, separator, stem, i, dot, rest)
                i += 1

        shutil.move(file_path, to_path)

        # rpartition (unlike rindex) does not raise when to_path has no "/".
        return to_path.rpartition("/")[2]
    
    0 讨论(0)
  • 2021-01-15 14:52

    Use selenium webdriver

    Use a Firefox profile to download your files. This profile skips Firefox's download dialog box. In the line:

       // the preference name is spelled "download", all lower case
       pro.setPreference("browser.download.folderList", 0);
    

    The value of browser.download.folderList can be set to either 0, 1, or 2. When set to 0, Firefox will save all files downloaded via the browser on the user's desktop. When set to 1, these downloads are stored in the Downloads folder. When set to 2, the location specified for the most recent download is utilized again.

    Firefox profile code that you need to implement :-

            // Build a profile that saves the download without showing Firefox's dialog.
            FirefoxProfile pro=new FirefoxProfile();
            // The preference name is spelled "download", all lower case; 0 saves to the desktop.
            pro.setPreference("browser.download.folderList", 0);
            // The value must be the file's MIME type, which for a zip archive is "application/zip".
            pro.setPreference("browser.helperApps.neverAsk.saveToDisk", "application/zip");
            WebDriver driver=new FirefoxDriver(pro);
            // Navigating straight to the file URL triggers the download.
            driver.get("http://selenium-release.storage.googleapis.com/2.47/selenium-java-2.47.1.zip");
    

    Hope it will help you :)

    0 讨论(0)
  • 2021-01-15 14:57

    You would have to examine the javascript on the website and understand how it works before you could override it to do something like that, but even then, browser security will always pop a dialog asking you to confirm the download. That leaves you with two options (as far as I can see):

    • Confirm the alert dialog
    • Determine the location of the file on the remote server, and use a GET to download the file

    I can't really help with the details on either, since I don't know python, but hopefully that helps...

    0 讨论(0)
提交回复
热议问题