My script is not going to the next page when scraping

前端 未结 1 1655
故里飘歌
故里飘歌 2021-01-28 23:06

I wrote some code for web scraping; everything is OK except the next-page activity. When I run my code to scrape data from the website, it only scrapes the first page and does not move forward to sc

相关标签:
1条回答
  • 2021-01-28 23:53

    I started trying to find out why it wasn't loading the next page correctly, but before I found the answer I found another way to get the data you are looking for. On the page there is an option to change how many results you want to return. I changed this to 10000, and now all items from the collection load on one page.

    If this isn't what you want, and you still want to solve the page-change problem instead, let me know and I'll have another look.

    cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/display/10000/order/nosort/ad/asc

    I tested loading the index page but have not tested the detailed pages. I didn't want to download the entire collection.

    Here are some changes I made and some recommendations.

    1. Removed the need for looping index pages. Page now returns all items.
    2. Be specific about which exception you are catching. In this case it's AttributeError
    3. Add a user-agent to your request, many websites will block requests that do not have it

    Good luck!

    import requests
    from bs4 import BeautifulSoup
    #import pandas as pd
    #import pandas as pd
    import csv
    
    def get_page(url):
        """Download *url* and return it parsed with BeautifulSoup.

        The request carries a browser-like User-Agent header.

        Returns:
            The parsed ``BeautifulSoup`` document, or ``None`` when the server
            answers with a non-OK status (the status code is printed).
        """
        response = requests.get(url, headers={'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.37"})
        if not response.ok:
            print('server responded:', response.status_code)
            # Return explicitly: falling through used to hit an unbound
            # local variable and crash with UnboundLocalError.
            return None
        return BeautifulSoup(response.text, 'html.parser')
    
    def get_detail_page(soup):
        """Collect the metadata fields of one item detail page into a dict.

        Each field is looked up on ``soup``. When a lookup runs into a missing
        element (``AttributeError``), the field falls back to its
        ``"Empty ..."`` placeholder. Every value is whitespace-stripped.

        Returns:
            A dict with the keys 'Title', 'Collection', 'Author', 'Abstract',
            'Keywords', 'Publishers', 'Date_original', 'Date_digital',
            'Format', 'Release-st', 'Library' and 'Date_created'.
        """
        def cell(cell_id):
            # Text of the <td> element carrying the given id.
            return soup.find('td', id=cell_id).text

        # (output key, lookup, placeholder used when the element is missing)
        fields = (
            ('Title',
             lambda: soup.find('h1', class_="cdm_style", id=False).text,
             'Empty Title'),
            ('Collection',
             lambda: soup.find('td', id="metadata_collec").find('a').text,
             "Empty Collection"),
            ('Author', lambda: cell("metadata_creato"), "Empty Author"),
            ('Abstract', lambda: cell("metadata_descri"), "Empty Abstract"),
            ('Keywords', lambda: cell("metadata_keywor"), "Empty Keywords"),
            ('Publishers', lambda: cell("metadata_publis"), "Empty Publishers"),
            # NOTE(review): 'Date_original' is read from the "metadata_contri"
            # cell -- confirm against the page markup that this id is intended.
            ('Date_original', lambda: cell("metadata_contri"), "Empty Date original"),
            ('Date_digital', lambda: cell("metadata_date"), "Empty Date digital"),
            ('Format', lambda: cell("metadata_source"), "Empty Format"),
            ('Release-st', lambda: cell("metadata_rights"), "Empty Realease Statement"),
            ('Library', lambda: cell("metadata_librar"), "Empty Library"),
            ('Date_created', lambda: cell("metadata_dmcreated"), "Empty date Created"),
        )

        record = {}
        for key, lookup, placeholder in fields:
            # Only the lookup is guarded; find() returning None surfaces as
            # AttributeError when .text / .find is accessed on it.
            try:
                value = lookup()
            except AttributeError:
                value = placeholder
            record[key] = value.strip()
        return record
    
    def get_index_data(soup):
        """Return the absolute detail-page URLs listed on an index page.

        Args:
            soup: Parsed index page exposing ``find_all``; ``None`` (or any
                object without ``find_all``) is tolerated.

        Returns:
            A list of URLs built from the ``href`` of every
            ``<a class="body_link_11">`` that carries an ``item_id``
            attribute. The list is empty when ``soup`` cannot be searched.
        """
        try:
            anchors = soup.find_all('a', class_="body_link_11")
        except AttributeError:
            # Previously this path left the result list undefined and the
            # final return raised UnboundLocalError.
            return []

        base_url = "http://cgsc.cdmhost.com"
        detail_urls = []
        for anchor in anchors:
            attrs = getattr(anchor, 'attrs', None) or {}
            href = attrs.get('href')
            # All titles with valid links will have an item_id; also require
            # an href so we never emit "<base_url>None".
            if attrs.get('item_id') and href:
                detail_urls.append("{}{}".format(base_url, href))
        return detail_urls
    
    def write_csv(data,url):
        """Append one scraped record, followed by its URL, to '11_to_55.csv'.

        Args:
            data: Dict holding the keys 'Title', 'Collection', 'Author',
                'Abstract', 'Keywords', 'Publishers', 'Date_original',
                'Date_digital', 'Format', 'Release-st', 'Library' and
                'Date_created'.
            url: Address the record was scraped from; written as the last
                column.

        Raises:
            KeyError: If ``data`` lacks one of the expected keys.
        """
        columns = (
            'Title', 'Collection', 'Author', 'Abstract', 'Keywords',
            'Publishers', 'Date_original', 'Date_digital', 'Format',
            'Release-st', 'Library', 'Date_created',
        )
        # Build the row first so a missing key fails before the file is touched.
        row = [data[column] for column in columns]
        row.append(url)
        # newline='' is required by the csv module (otherwise blank rows appear
        # on Windows); utf-8 keeps non-ASCII metadata from failing to encode.
        with open('11_to_55.csv', 'a', newline='', encoding='utf-8') as csvfile:
            csv.writer(csvfile).writerow(row)
    
    def main():
        """Fetch the collection index page and print the detail-page URLs.

        The index URL asks for 10000 results per page, so a single request is
        made for the index instead of looping over paginated index pages.
        """
        main_url = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/display/10000/order/nosort/ad/asc"
        index_soup = get_page(main_url)
        products = get_index_data(index_soup)
        print(products)
        # Downloading and saving every detail page is intentionally disabled.
        # To enable it:
        #     for product in products:
        #         data1 = get_detail_page(get_page(product))
        #         write_csv(data1, product)
    
    
    # Run the scraper only when this file is executed as a script, not when
    # it is imported as a module.
    if __name__ == '__main__':
        main()
    
    0 讨论(0)
提交回复
热议问题