How can I scrape a user's followers from the Instagram web interface?

暗喜 2020-12-29 15:10

Can anyone tell me how to access the underlying URL to view a given user's Instagram followers? I am able to do this with the Instagram API, but given the pending changes to the API, I would like to scrape this information from the web interface instead.

3 Answers
  • 2020-12-29 15:26

    I noticed that the previous answer no longer works, so I made an updated version based on it, which adds scrolling (to get all the users in the list, not just those loaded initially). In addition, it scrapes both followers and following. (You'll need to download chromedriver as well.)

    import time
    from selenium import webdriver as wd
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    # The account you want to check
    account = ""
    
    # Chrome executable
    chrome_binary = r"chrome.exe"   # Add your path here
    
    
    def login(driver):
        username = ""   # Your username
        password = ""   # Your password
    
        # Load page
        driver.get("https://www.instagram.com/accounts/login/")
    
        # Login
        driver.find_element_by_xpath("//div/input[@name='username']").send_keys(username)
        driver.find_element_by_xpath("//div/input[@name='password']").send_keys(password)
        driver.find_element_by_xpath("//span/button").click()
    
        # Wait for the login page to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.LINK_TEXT, "See All")))
    
    
    def scrape_followers(driver, account):
        # Load account page
        driver.get("https://www.instagram.com/{0}/".format(account))
    
        # Click the 'Follower(s)' link
        driver.find_element_by_partial_link_text("follower").click()
    
        # Wait for the followers modal to load
        xpath = "/html/body/div[4]/div/div/div[2]/div/div[2]"
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, xpath)))
    
        SCROLL_PAUSE = 0.5  # Pause to allow loading of content
        driver.execute_script("followersbox = document.getElementsByClassName('_gs38e')[0];")
        last_height = driver.execute_script("return followersbox.scrollHeight;")
    
        # We need to scroll the followers modal to ensure that all followers are loaded
        while True:
            driver.execute_script("followersbox.scrollTo(0, followersbox.scrollHeight);")
    
            # Wait for page to load
            time.sleep(SCROLL_PAUSE)
    
            # Calculate new scrollHeight and compare with the previous
            new_height = driver.execute_script("return followersbox.scrollHeight;")
            if new_height == last_height:
                break
            last_height = new_height
    
        # Finally, scrape the followers
        xpath = "/html/body/div[4]/div/div/div[2]/div/div[2]/ul/li"
        followers_elems = driver.find_elements_by_xpath(xpath)
    
        followers_temp = [e.text for e in followers_elems]  # List of followers (username, full name, follow text)
        followers = []  # List of followers (usernames only)
    
        # Go through each entry in the list, append the username to the followers list
        for i in followers_temp:
            username, sep, name = i.partition('\n')
            followers.append(username)
    
        print("______________________________________")
        print("FOLLOWERS")
    
        return followers
    
    def scrape_following(driver, account):
        # Load account page
        driver.get("https://www.instagram.com/{0}/".format(account))
    
        # Click the 'Following' link
        driver.find_element_by_partial_link_text("following").click()
    
        # Wait for the following modal to load
        xpath = "/html/body/div[4]/div/div/div[2]/div/div[2]"
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, xpath)))
    
        SCROLL_PAUSE = 0.5  # Pause to allow loading of content
        driver.execute_script("followingbox = document.getElementsByClassName('_gs38e')[0];")
        last_height = driver.execute_script("return followingbox.scrollHeight;")
    
        # We need to scroll the following modal to ensure that all following are loaded
        while True:
            driver.execute_script("followingbox.scrollTo(0, followingbox.scrollHeight);")
    
            # Wait for page to load
            time.sleep(SCROLL_PAUSE)
    
            # Calculate new scrollHeight and compare with the previous
            new_height = driver.execute_script("return followingbox.scrollHeight;")
            if new_height == last_height:
                break
            last_height = new_height
    
        # Finally, scrape the following
        xpath = "/html/body/div[4]/div/div/div[2]/div/div[2]/ul/li"
        following_elems = driver.find_elements_by_xpath(xpath)
    
        following_temp = [e.text for e in following_elems]  # List of following (username, full name, follow text)
        following = []  # List of following (usernames only)
    
        # Go through each entry in the list, append the username to the following list
        for i in following_temp:
            username, sep, name = i.partition('\n')
            following.append(username)
    
        print("\n______________________________________")
        print("FOLLOWING")
        return following
    
    
    if __name__ == "__main__":
        options = wd.ChromeOptions()
        options.binary_location = chrome_binary # chrome.exe
        driver_binary = r"chromedriver.exe"
        driver = wd.Chrome(executable_path=driver_binary, options=options)
        try:
            login(driver)
            followers = scrape_followers(driver, account)
            print(followers)
            following = scrape_following(driver, account)
            print(following)
        finally:
            driver.quit()
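
    Since scrape_followers and scrape_following differ only in the link they click and the label they print, you could fold them into a single helper. Here's a hedged sketch (the name scrape_list is mine), reusing the imports and the same scroll loop as above:

    def scrape_list(driver, account, link_text):
        """link_text is 'follower' or 'following'; returns usernames only."""
        driver.get("https://www.instagram.com/{0}/".format(account))
        driver.find_element_by_partial_link_text(link_text).click()

        # Wait for the modal to load
        xpath = "/html/body/div[4]/div/div/div[2]/div/div[2]"
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, xpath)))

        # Scroll until the modal's height stops growing, as above
        driver.execute_script("box = document.getElementsByClassName('_gs38e')[0];")
        last_height = driver.execute_script("return box.scrollHeight;")
        while True:
            driver.execute_script("box.scrollTo(0, box.scrollHeight);")
            time.sleep(0.5)
            new_height = driver.execute_script("return box.scrollHeight;")
            if new_height == last_height:
                break
            last_height = new_height

        # Each entry's text is 'username\nfull name\n...'; keep the username only
        elems = driver.find_elements_by_xpath(xpath + "/ul/li")
        return [e.text.partition('\n')[0] for e in elems]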
    
  • 2020-12-29 15:35

    Update: March 2020

    This is just Levi's answer with small updates in a few places, because as it was it didn't quit the driver successfully. By default this version also fetches all of the followers; as everyone else has said, it's not intended for accounts with a large number of followers.

    import itertools
    
    from explicit import waiter, XPATH
    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    from time import sleep
    
    def login(driver):
        username = ""  # <username here>
        password = ""  # <password here>
    
        # Load page
        driver.get("https://www.instagram.com/accounts/login/")
        sleep(3)
        # Login
        driver.find_element_by_name("username").send_keys(username)
        driver.find_element_by_name("password").send_keys(password)
        submit = driver.find_element_by_tag_name('form')
        submit.submit()
    
        # Wait for the user dashboard page to load
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.LINK_TEXT, "See All")))
    
    
    def scrape_followers(driver, account):
        # Load account page
        driver.get("https://www.instagram.com/{0}/".format(account))
    
        # Click the 'Follower(s)' link
        sleep(2)
        driver.find_element_by_partial_link_text("follower").click()
    
        # Wait for the followers modal to load
        waiter.find_element(driver, "//div[@role='dialog']", by=XPATH)
        # Strip thousands separators so counts like '1,234' don't break int()
        allfoll = int(driver.find_element_by_xpath("//li[2]/a/span").text.replace(',', ''))
        # At this point a Followers modal pops open. If you immediately scroll to the bottom,
        # you hit a stopping point and a "See All Suggestions" link. If you fiddle with the
        # modal by scrolling up and down, you can force it to load additional followers for
        # that person.
    
        # Now the modal will begin loading followers every time you scroll to the bottom.
        # Keep scrolling in a loop until you've hit the desired number of followers.
        # In this instance, I'm using a generator to return followers one-by-one
        follower_css = "ul div li:nth-child({}) a.notranslate"  # Taking advantage of CSS's nth-child functionality
        for group in itertools.count(start=1, step=12):
            for follower_index in range(group, group + 12):
                if follower_index > allfoll:
                    return  # end the generator (raising StopIteration here is a RuntimeError under PEP 479)
                yield waiter.find_element(driver, follower_css.format(follower_index)).text
    
            # Instagram loads followers 12 at a time. Find the last follower element
            # and scroll it into view, forcing instagram to load another 12
            # Even though we just found this elem in the previous for loop, there can
            # potentially be a large amount of time between that call and this one,
            # and the element might have gone stale. Let's just re-acquire it to avoid
            # that.
            last_follower = waiter.find_element(driver, follower_css.format(group+11))
            driver.execute_script("arguments[0].scrollIntoView();", last_follower)
    
    
    if __name__ == "__main__":
        account = ""  # <account to check>
        driver = webdriver.Firefox(executable_path="./geckodriver")
        try:
            login(driver)
            print('Followers of the "{}" account'.format(account))
            for count, follower in enumerate(scrape_followers(driver, account=account), 1):
                print("\t{:>3}: {}".format(count, follower))
        finally:
            driver.quit()
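
    Because scrape_followers is a generator, you can also cap or stream its output without modifying the function itself; a hedged usage sketch:

    import itertools  # already imported at the top of this script

    # Take only the first 100 followers instead of walking the whole list
    first_100 = list(itertools.islice(scrape_followers(driver, account=account), 100))

    # Or stream usernames straight to a file as they arrive
    with open("followers.txt", "w") as fh:
        for follower in scrape_followers(driver, account=account):
            fh.write(follower + "\n")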
    
  • 2020-12-29 15:42

    EDIT: Dec 2018 Update:

    Things have changed in Insta land since this was posted. Here is an updated script that is a bit more Pythonic and makes better use of XPath/CSS selectors.

    Note that to use this updated script, you must install the explicit package (pip install explicit), or convert each line with waiter to a pure selenium explicit wait.
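
    For reference, a waiter call corresponds to a plain selenium explicit wait roughly like this (a hedged sketch; I'm assuming presence is the condition you need, and explicit's internal defaults may differ):

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    # waiter.find_element(driver, "//div[@role='dialog']", by=XPATH) becomes:
    modal = WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.XPATH, "//div[@role='dialog']")))

    # waiter.find_write(driver, xpath, text, by=XPATH) becomes roughly:
    field = WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.XPATH, "//div/input[@name='username']")))
    field.send_keys("your_username")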

    import itertools
    
    from explicit import waiter, XPATH
    from selenium import webdriver
    
    
    def login(driver):
        username = ""  # <username here>
        password = ""  # <password here>
    
        # Load page
        driver.get("https://www.instagram.com/accounts/login/")
    
        # Login
        waiter.find_write(driver, "//div/input[@name='username']", username, by=XPATH)
        waiter.find_write(driver, "//div/input[@name='password']", password, by=XPATH)
        waiter.find_element(driver, "//div/button[@type='submit']", by=XPATH).click()
    
        # Wait for the user dashboard page to load
        waiter.find_element(driver, "//a/span[@aria-label='Find People']", by=XPATH)
    
    
    def scrape_followers(driver, account):
        # Load account page
        driver.get("https://www.instagram.com/{0}/".format(account))
    
        # Click the 'Follower(s)' link
        # (use the account name rather than a hardcoded href, so this works for any account)
        waiter.find_element(driver, "//a[@href='/{0}/followers/']".format(account), by=XPATH).click()
    
        # Wait for the followers modal to load
        waiter.find_element(driver, "//div[@role='dialog']", by=XPATH)
    
        # At this point a Followers modal pops open. If you immediately scroll to the bottom,
        # you hit a stopping point and a "See All Suggestions" link. If you fiddle with the
        # modal by scrolling up and down, you can force it to load additional followers for
        # that person.
    
        # Now the modal will begin loading followers every time you scroll to the bottom.
        # Keep scrolling in a loop until you've hit the desired number of followers.
        # In this instance, I'm using a generator to return followers one-by-one
        follower_css = "ul div li:nth-child({}) a.notranslate"  # Taking advantage of CSS's nth-child functionality
        for group in itertools.count(start=1, step=12):
            for follower_index in range(group, group + 12):
                yield waiter.find_element(driver, follower_css.format(follower_index)).text
    
            # Instagram loads followers 12 at a time. Find the last follower element
            # and scroll it into view, forcing instagram to load another 12
            # Even though we just found this elem in the previous for loop, there can
            # potentially be a large amount of time between that call and this one,
            # and the element might have gone stale. Let's just re-acquire it to avoid
            # that
            last_follower = waiter.find_element(driver, follower_css.format(follower_index))
            driver.execute_script("arguments[0].scrollIntoView();", last_follower)
    
    
    if __name__ == "__main__":
        account = 'instagram'
        driver = webdriver.Chrome()
        try:
            login(driver)
            # Print the first 75 followers for the "instagram" account
            print('Followers of the "{}" account'.format(account))
            for count, follower in enumerate(scrape_followers(driver, account=account), 1):
                print("\t{:>3}: {}".format(count, follower))
                if count >= 75:
                    break
        finally:
            driver.quit()
    

    I did a quick benchmark to show how quickly performance degrades the more followers you attempt to scrape this way; the per-follower cost grows roughly linearly, so the total time grows roughly quadratically:

    $ python example.py
    Followers of the "instagram" account
    Found    100 followers in 11 seconds
    Found    200 followers in 19 seconds
    Found    300 followers in 29 seconds
    Found    400 followers in 47 seconds
    Found    500 followers in 71 seconds
    Found    600 followers in 106 seconds
    Found    700 followers in 157 seconds
    Found    800 followers in 213 seconds
    Found    900 followers in 284 seconds
    Found   1000 followers in 375 seconds
    

    Original post: Your question is a little confusing. For instance, I'm not really sure what "from which I can scrape and paginate through all iterations" actually means. What are you currently using to scrape and paginate?

    Regardless, instagram.com/instagram/media/ is not the same type of endpoint as instagram.com/instagram/followers. The media endpoint appears to be a REST API, configured to return an easily parseable JSON object.
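
    For illustration, consuming that kind of JSON endpoint takes only a few lines with requests (a hedged sketch; I'm assuming the endpoint still responds as described, and Instagram may well have changed or removed it since):

    import requests

    # Fetch the media endpoint and parse the JSON payload it returns
    resp = requests.get("https://www.instagram.com/instagram/media/")
    resp.raise_for_status()
    data = resp.json()
    print(list(data.keys()))  # inspect the structure before relying on it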

    The followers endpoint isn't really a RESTful endpoint from what I can tell. Rather, Instagram loads that information into the page via AJAX (using React?) after you click the Followers button. I don't think you will be able to get that information without using something like Selenium, which can load/render the JavaScript that displays the followers to the user.

    This example code will work:

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    
    def login(driver):
        username = ""  # <username here>
        password = ""  # <password here>
    
        # Load page
        driver.get("https://www.instagram.com/accounts/login/")
    
        # Login
        driver.find_element_by_xpath("//div/input[@name='username']").send_keys(username)
        driver.find_element_by_xpath("//div/input[@name='password']").send_keys(password)
        driver.find_element_by_xpath("//span/button").click()
    
        # Wait for the login page to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.LINK_TEXT, "See All")))
    
    
    def scrape_followers(driver, account):
        # Load account page
        driver.get("https://www.instagram.com/{0}/".format(account))
    
        # Click the 'Follower(s)' link
        driver.find_element_by_partial_link_text("follower").click()
    
        # Wait for the followers modal to load
        xpath = "//div[@style='position: relative; z-index: 1;']/div/div[2]/div/div[1]"
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, xpath)))
    
        # You'll need to figure out some scrolling magic here. Something that can
        # scroll to the bottom of the followers modal, and know when its reached
        # the bottom. This is pretty impractical for people with a lot of followers
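
        # One hedged sketch of that scrolling (my addition, untested against
        # current markup): scroll the last loaded entry into view until the
        # count of loaded entries stops growing.
        import time  # local import, just for this sketch
        li_xpath = "//div[@style='position: relative; z-index: 1;']//ul/li"
        prev_count = -1
        while True:
            items = driver.find_elements_by_xpath(li_xpath)
            if not items or len(items) == prev_count:
                break
            prev_count = len(items)
            driver.execute_script("arguments[0].scrollIntoView();", items[-1])
            time.sleep(1)  # crude pause to let the next batch of followers load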
    
        # Finally, scrape the followers
        xpath = "//div[@style='position: relative; z-index: 1;']//ul/li/div/div/div/div/a"
        followers_elems = driver.find_elements_by_xpath(xpath)
    
        return [e.text for e in followers_elems]
    
    
    if __name__ == "__main__":
        driver = webdriver.Chrome()
        try:
            login(driver)
            followers = scrape_followers(driver, "instagram")
            print(followers)
        finally:
            driver.quit()
    

    This approach is problematic for a number of reasons, chief among them being how slow it is relative to the API.
