Logging in to LinkedIn with python requests sessions

前端 未结 5 1267
傲寒
傲寒 2020-12-02 19:57

I'm trying to log into LinkedIn using Python requests:

import sys
import requests
from BeautifulSoup import BeautifulSoup


payload={
    'session-key' :


        
相关标签:
5条回答
  • 2020-12-02 20:03

    The OP's solution worked for me with only a very slight modification.

    Change 'session-key' to 'session_key' and change 'session-password' to 'session_password'.

    Other than that, the code is good as it stands.

    0 讨论(0)
  • 2020-12-02 20:16

    2019 Version.

    Slightly revised version working that takes into account the new structure of the page to find the connection cookie and adds the trk parameter.

    import requests
    from bs4 import BeautifulSoup
    
    email = ""
    password = ""
    
    client = requests.Session()
    
    HOMEPAGE_URL = 'https://www.linkedin.com'
    LOGIN_URL = 'https://www.linkedin.com/uas/login-submit'
    
    html = client.get(HOMEPAGE_URL).content
    soup = BeautifulSoup(html, "html.parser")
    csrf = soup.find('input', {'name': 'loginCsrfParam'}).get('value')
    
    login_information = {
        'session_key': email,
        'session_password': password,
        'loginCsrfParam': csrf,
        'trk': 'guest_homepage-basic_sign-in-submit'
    }
    
    client.post(LOGIN_URL, data=login_information)
    
    response = client.get('')
    
    0 讨论(0)
  • 2020-12-02 20:17

    I modified a web-scraping template I use for most of my Python-based scraping needs to fit your needs. Verified it worked with my own login info.

    The way it works is by mimic-ing a browser and maintaining a cookieJar that stores your user session. Got it to work with BeautifulSoup for you as well.

    Note: This is a Python2 version. I added a working Python3 example further below by request.

    import cookielib
    import os
    import urllib
    import urllib2
    import re
    import string
    from BeautifulSoup import BeautifulSoup
    
    username = "user@email.com"
    password = "password"
    
    cookie_filename = "parser.cookies.txt"
    
    class LinkedInParser(object):
    
        def __init__(self, login, password):
            """ Start up... """
            self.login = login
            self.password = password
    
            # Simulate browser with cookies enabled
            self.cj = cookielib.MozillaCookieJar(cookie_filename)
            if os.access(cookie_filename, os.F_OK):
                self.cj.load()
            self.opener = urllib2.build_opener(
                urllib2.HTTPRedirectHandler(),
                urllib2.HTTPHandler(debuglevel=0),
                urllib2.HTTPSHandler(debuglevel=0),
                urllib2.HTTPCookieProcessor(self.cj)
            )
            self.opener.addheaders = [
                ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
                               'Windows NT 5.2; .NET CLR 1.1.4322)'))
            ]
    
            # Login
            self.loginPage()
    
            title = self.loadTitle()
            print title
    
            self.cj.save()
    
    
        def loadPage(self, url, data=None):
            """
            Utility function to load HTML from URLs for us with hack to continue despite 404
            """
            # We'll print the url in case of infinite loop
            # print "Loading URL: %s" % url
            try:
                if data is not None:
                    response = self.opener.open(url, data)
                else:
                    response = self.opener.open(url)
                return ''.join(response.readlines())
            except:
                # If URL doesn't load for ANY reason, try again...
                # Quick and dirty solution for 404 returns because of network problems
                # However, this could infinite loop if there's an actual problem
                return self.loadPage(url, data)
    
        def loginPage(self):
            """
            Handle login. This should populate our cookie jar.
            """
            html = self.loadPage("https://www.linkedin.com/")
            soup = BeautifulSoup(html)
            csrf = soup.find(id="loginCsrfParam-login")['value']
    
            login_data = urllib.urlencode({
                'session_key': self.login,
                'session_password': self.password,
                'loginCsrfParam': csrf,
            })
    
            html = self.loadPage("https://www.linkedin.com/uas/login-submit", login_data)
            return
    
        def loadTitle(self):
            html = self.loadPage("https://www.linkedin.com/feed/")
            soup = BeautifulSoup(html)
            return soup.find("title")
    
    parser = LinkedInParser(username, password)
    

    Update June 19, 2014: Added parsing for CSRF token from homepage for use in updated login process.

    Update July 23, 2015: Adding a Python 3 example here. Basically requires substituting library locations and removing deprecated methods. It's not perfectly formatted or anything, but it functions. Sorry for the rush job. In the end the principals and steps are identical.

    import http.cookiejar as cookielib
    import os
    import urllib
    import re
    import string
    from bs4 import BeautifulSoup
    
    username = "user@email.com"
    password = "password"
    
    cookie_filename = "parser.cookies.txt"
    
    class LinkedInParser(object):
    
        def __init__(self, login, password):
            """ Start up... """
            self.login = login
            self.password = password
    
            # Simulate browser with cookies enabled
            self.cj = cookielib.MozillaCookieJar(cookie_filename)
            if os.access(cookie_filename, os.F_OK):
                self.cj.load()
            self.opener = urllib.request.build_opener(
                urllib.request.HTTPRedirectHandler(),
                urllib.request.HTTPHandler(debuglevel=0),
                urllib.request.HTTPSHandler(debuglevel=0),
                urllib.request.HTTPCookieProcessor(self.cj)
            )
            self.opener.addheaders = [
                ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
                               'Windows NT 5.2; .NET CLR 1.1.4322)'))
            ]
    
            # Login
            self.loginPage()
    
            title = self.loadTitle()
            print(title)
    
            self.cj.save()
    
    
        def loadPage(self, url, data=None):
            """
            Utility function to load HTML from URLs for us with hack to continue despite 404
            """
            # We'll print the url in case of infinite loop
            # print "Loading URL: %s" % url
            try:
                if data is not None:
                    response = self.opener.open(url, data)
                else:
                    response = self.opener.open(url)
                return ''.join([str(l) for l in response.readlines()])
            except Exception as e:
                # If URL doesn't load for ANY reason, try again...
                # Quick and dirty solution for 404 returns because of network problems
                # However, this could infinite loop if there's an actual problem
                return self.loadPage(url, data)
    
        def loadSoup(self, url, data=None):
            """
            Combine loading of URL, HTML, and parsing with BeautifulSoup
            """
            html = self.loadPage(url, data)
            soup = BeautifulSoup(html, "html5lib")
            return soup
    
        def loginPage(self):
            """
            Handle login. This should populate our cookie jar.
            """
            soup = self.loadSoup("https://www.linkedin.com/")
            csrf = soup.find(id="loginCsrfParam-login")['value']
            login_data = urllib.parse.urlencode({
                'session_key': self.login,
                'session_password': self.password,
                'loginCsrfParam': csrf,
            }).encode('utf8')
    
            self.loadPage("https://www.linkedin.com/uas/login-submit", login_data)
            return
    
        def loadTitle(self):
            soup = self.loadSoup("https://www.linkedin.com/feed/")
            return soup.find("title")
    
    parser = LinkedInParser(username, password)
    
    0 讨论(0)
  • 2020-12-02 20:19

    This is a much simpler version.

    import requests
    from bs4 import BeautifulSoup
    
    client = requests.Session()
    
    HOMEPAGE_URL = 'https://www.linkedin.com'
    LOGIN_URL = 'https://www.linkedin.com/uas/login-submit'
    
    html = client.get(HOMEPAGE_URL).content
    soup = BeautifulSoup(html, "html.parser")
    csrf = soup.find(id="loginCsrfParam-login")['value']
    
    login_information = {
        'session_key':'Login',
        'session_password':'Password',
        'loginCsrfParam': csrf,
    }
    
    client.post(LOGIN_URL, data=login_information)
    
    client.get('Any_Linkedin_URL')
    
    0 讨论(0)
  • 2020-12-02 20:24

    2020 version of @garromark's accepted solution:

    import http.cookiejar as cookielib
    import os
    import urllib
    import re
    import string
    from bs4 import BeautifulSoup
    
    username = ""
    password = ""
    
    cookie_filename = "parser.cookies.txt"
    
    
    class LinkedInParser(object):
    
        def __init__(self, login, password):
            """ Start up... """
            self.login = login
            self.password = password
    
            # Simulate browser with cookies enabled
            self.cj = cookielib.MozillaCookieJar(cookie_filename)
            if os.access(cookie_filename, os.F_OK):
                self.cj.load()
            self.opener = urllib.request.build_opener(
                urllib.request.HTTPRedirectHandler(),
                urllib.request.HTTPHandler(debuglevel=0),
                urllib.request.HTTPSHandler(debuglevel=0),
                urllib.request.HTTPCookieProcessor(self.cj)
            )
            self.opener.addheaders = [
                ('User-agent', 'Mozilla/5.0')
            ]
    
            # Login
            self.loginPage()
    
            title = self.loadTitle()
            print(title)
    
            # self.cj.save()
    
        def loadPage(self, url, data=None):
            """
            Utility function to load HTML from URLs for us with hack to continue despite 404
            """
            # We'll print the url in case of infinite loop
            # print "Loading URL: %s" % url
            try:
                if data is not None:
                    response = self.opener.open(url, data)
                else:
                    response = self.opener.open(url)
                content = ''.join([str(l) for l in response.readlines()])
                print("Page loaded: %s \n Content: %s \n" % (url, content))
                return content
            except Exception as e:
                # If URL doesn't load for ANY reason, try again...
                # Quick and dirty solution for 404 returns because of network problems
                # However, this could infinite loop if there's an actual problem
                print("Exception on %s load: %s" % (url, e))
                # return self.loadPage(url, data)
    
        def loadSoup(self, url, data=None):
            """
            Combine loading of URL, HTML, and parsing with BeautifulSoup
            """
            html = self.loadPage(url, data)
            soup = BeautifulSoup(html, "html5lib")
            return soup
    
        def loginPage(self):
            """
            Handle login. This should populate our cookie jar.
            """
            soup = self.loadSoup("https://www.linkedin.com/login")
            loginCsrfParam = soup.find("input", {"name": "loginCsrfParam"})['value']
            csrfToken = soup.find("input", {"name": "csrfToken"})['value']
            sIdString = soup.find("input", {"name": "sIdString"})['value']
            print("loginCsrfParam: %s" % loginCsrfParam)
            print("csrfToken: %s" % csrfToken)
            print("sIdString: %s" % sIdString)
            login_data = urllib.parse.urlencode({
                'session_key': self.login,
                'session_password': self.password,
                'loginCsrfParam': loginCsrfParam,
                'csrfToken': csrfToken,
                'sIdString': sIdString
            }).encode('utf8')
    
            self.loadPage("https://www.linkedin.com/checkpoint/lg/login-submit", login_data)
    
        def loadTitle(self):
            soup = self.loadSoup("https://www.linkedin.com/feed/")
            return soup.find("title")
    
    
    parser = LinkedInParser(username, password)
    
    
    0 讨论(0)
提交回复
热议问题