Getting links from YouTube search results

后端 未结 1 1318
忘了有多久
忘了有多久 2021-01-22 08:01

I am trying to get the links of the videos that appear in the search results for a particular query on YouTube. I am using the BeautifulSoup and requests libraries for Python, and here is what I did:

相关标签:
1条回答
  • 2021-01-22 08:44

    Someone on a different Stack Exchange site created pretty much exactly what you're after; the only difference is that it exports to CSV rather than to Excel.

    Unfortunately SO doesn't let me paste possible duplicate answers from different SE sites.

    #!/usr/bin/python
    # http://docs.python-requests.org/en/latest/user/quickstart/
    # http://www.crummy.com/software/BeautifulSoup/bs4/doc/
    
    import csv
    import re
    import requests
    import time
    from bs4 import BeautifulSoup
    
    # scrapes the title 
    def getTitle():
        """Write the page's channel title(s) to the CSV row and echo them.

        Reads the module-level ``soup`` (the currently parsed page) and
        appends to the module-level output file ``f``. Each title found is
        written followed by a comma.
        """
        headings = soup.find_all("h1", "branded-page-header-title")
        for heading in headings:
            # Commas are removed because the output file uses them as the
            # field separator; newlines are flattened to spaces.
            # The value is deliberately kept as ``str``: encoding it to bytes
            # and then calling ``str()`` would write the ``b'...'`` repr on
            # Python 3 instead of the title itself.
            # NOTE(review): non-ASCII titles now reach ``f`` as text, so the
            # file should be opened with a UTF-8 encoding.
            name = heading.text.strip().replace('\n', ' ').replace(',', '')
            f.write(name + ',')
            print(f'\t\t{name}')
    
    # scrapes the subscriber and view count
    def getStats():
        """Write each "about" statistic of the current page to the CSV row.

        Reads the module-level ``soup`` and appends to the module-level file
        ``f``. For every matching ``<li>`` the text of its ``<b>`` child is
        written as the value (followed by a comma), and the text node after
        the ``<b>`` is used as the label in the console output.
        """
        stats = soup.find_all("li", "about-stat ")  # trailing space is required.
        for stat in stats:
            # Commas (e.g. thousands separators) are stripped so they do not
            # split the CSV field.
            value = stat.b.text.strip().replace(',', '')
            name = stat.b.next_sibling.strip().replace(',', '')
            f.write(value + ',')
            # Format inside the call: ``print(...) % (...)`` applies ``%`` to
            # the ``None`` returned by print() and raises TypeError.
            print(f'\t\t{name} = {value}')
    
    # scrapes the description
    def getDescription():
        """Write the channel description(s) of the current page to the CSV row.

        Reads the module-level ``soup`` and appends to the module-level file
        ``f``. Each description is written followed by a comma; nothing is
        printed to the console.
        """
        blocks = soup.find_all("div", "about-description")
        for block in blocks:
            # Newlines become spaces and commas are dropped so the free-form
            # description stays inside a single CSV field.
            # Kept as ``str``: encoding to bytes and wrapping in ``str()``
            # would write the ``b'...'`` repr on Python 3.
            # NOTE(review): non-ASCII text now reaches ``f`` as text, so the
            # file should be opened with a UTF-8 encoding.
            description = block.text.strip().replace('\n', ' ').replace(',', '')
            f.write(description + ',')
    
    # scrapes all the external links 
    def getLinks():
        """Write every external channel link of the current page to the CSV row.

        Looks up the matching anchors in the module-level ``soup``; each
        ``href`` is appended to the module-level file ``f`` followed by a
        comma and echoed to the console.
        """
        # The trailing space in the class string is required.
        anchors = soup.find_all("a", "about-channel-link ")
        for href in (anchor.get('href') for anchor in anchors):
            f.write(href + ',')
            print(f'\t\t{href}')
    
    # scrapes the related channels
    def getRelated():
        """Record the related channels listed on the current page.

        For every ``<h3 class="yt-lockup-title">`` in the module-level
        ``soup``, each descendant whose ``href`` matches "user" is turned into
        an absolute YouTube URL, written on its own line to the module-level
        file ``rCSV``, and echoed to the console together with its link text.
        """
        user_href = re.compile("user")
        for heading in soup.find_all("h3", "yt-lockup-title"):
            for link in heading.find_all(href=user_href):
                url = 'https://www.youtube.com' + link.get('href')
                rCSV.write(url + '\n')
                print(f'\t\t{link.text}, {url}')
    
    # ---- configuration -------------------------------------------------
    base = "https://www.youtube.com/results?search_query="
    q = ['search+query+here']      # queries, already '+'-joined for the URL
    page = "&page="
    features = "html.parser"       # parser passed explicitly to BeautifulSoup
    pagesToScrape = 20             # result pages fetched per query
    requestTimeout = 30            # seconds; without it a stalled request hangs forever

    # A set gives O(1) "already scraped?" checks.
    visited = set()
    # Compiled once instead of on every search-result entry.
    userHref = re.compile("user")

    # ``f``, ``rCSV`` and ``soup`` must remain module-level names: the get*()
    # helpers above read them as globals. The ``with`` block guarantees both
    # files are closed, including when a request or parse step raises.
    with open("youtube-scrape-data.csv", "w+", encoding="utf-8") as f, \
            open("related-channels.csv", "w+", encoding="utf-8") as rCSV:
        for query in q:
            for count in range(1, pagesToScrape + 1):
                scrapeURL = base + str(query) + page + str(count)
                print(f'Scraping {scrapeURL} \n')
                r = requests.get(scrapeURL, timeout=requestTimeout)
                soup = BeautifulSoup(r.text, features)
                users = soup.find_all("div", "yt-lockup-byline")
                for each in users:
                    for i in each.find_all(href=userHref):
                        url = 'https://www.youtube.com' + i.get('href') + '/about'
                        if url in visited:
                            print(f'\t{url} has already been scraped\n\n')
                            continue
                        # Rebinding ``soup`` to the channel's "about" page is
                        # what the helper functions below operate on; the
                        # ``users`` list from the results page is already
                        # materialised, so iteration is unaffected.
                        r = requests.get(url, timeout=requestTimeout)
                        soup = BeautifulSoup(r.text, features)
                        f.write(url + ',')
                        print(f'\t{url}')
                        getTitle()
                        getStats()
                        getDescription()
                        getLinks()
                        getRelated()
                        f.write('\n')
                        print('\n')
                        visited.add(url)
                        time.sleep(3)  # pause between channel requests
                time.sleep(3)  # pause between result pages
                print('\n')
            print('\n')
    

    Source: https://codereview.stackexchange.com/questions/92001/youtube-search-result-scraper

    0 讨论(0)
提交回复
热议问题