Scraping Pricing off a search Bar - site link changed

喜夏-厌秋 提交于 2021-01-29 12:29:16

问题


With the help of some experts here I was able to build a scraper that works fine.

The essential line of code is really:

data = {"partOptionFilter": {"PartNumber": PN.iloc[i, 0], "AlternativeOemId": "17155"}}
r = requests.post('https://www.partssource.com/catalog/Service', json=data).json()

However, the site recently changed their link from partsfinder.com to partssource.com, and the code no longer seems to work.

Just wondering if there's a trick I can use on my original code to have it working again.

Any thoughts are appreciated, thanks!

import requests
import pandas as pd


# Read the part numbers to query from the input spreadsheet.
df = pd.read_excel(r'C:\Users\212677036\Documents\Part Number Input.xlsx')
PN = pd.DataFrame(df, columns=['Product code'])

# OEM ids to query for each part number, in output order.
# (These were previously six copy-pasted request/append stanzas.)
OEM_IDS = ["17155", "17475", "16880", "47221", "17045", "17055"]

# One price list per OEM id, parallel to OEM_IDS.
price_lists = [[] for _ in OEM_IDS]

for i in range(PN.shape[0]):
    part_number = PN.iloc[i, 0]
    for oem_id, prices in zip(OEM_IDS, price_lists):
        payload = {"partOptionFilter": {"PartNumber": part_number,
                                        "AlternativeOemId": oem_id}}
        # NOTE(review): this endpoint stopped working after the site moved
        # from partsfinder.com to partssource.com — see the answer below.
        r = requests.post('https://www.partssource.com/catalog/Service',
                          json=payload).json()
        # First option's price for this part/OEM combination.
        prices.append(r['Data']['PartOptions'][0]['YourPrice'])

# One DataFrame per OEM id, stacked vertically as in the original script.
list_of_dataframes = [pd.DataFrame(prices) for prices in price_lists]

pd.concat(list_of_dataframes).to_csv(r'C:\Users\212677036\Documents\output25.csv')

回答1:


After using DevTools in Firefox/Chrome I created this code.

Page use different url, send different data, get result with different keys.

You would have to use DevTools to observe more requests from browser to server to recognize how to use more params in data

import requests

search_query = "mobile"

# Request body captured with browser DevTools for the search endpoint.
# The facets/urlParams lists can carry OEM filters, e.g.
#   facets:    [{"name": "OEM", "value": "GE%20Healthcare"}]
#   urlParams: [{"name": "OEM", "value": "GE Healthcare"}]
payload = {
    "facets": [],
    "facilityId": 38451,
    "id_ins": "a2a3d332-73a7-4194-ad87-fe7412388916",
    "limit": 15,
    "query": search_query,
    "referer": "/catalog/Service",
    "start": 0,
    "urlParams": [],
}

response = requests.post('https://prodasf-vip.partsfinder.com/Orion/CatalogService/api/v1/search', json=payload)
data = response.json()

# Price of the first option of the first product in the result set.
print(data['products'][0]['options'][0]['price'])

EDIT (2020.09.01)

If you have many queries, use a for-loop to run the same code several times with a different query each time. When you get the data for one query, use a for-loop to collect all prices from data['products'].

EDIT (2020.09.06)

I added variable start and limit in get_data() and later I run it in loop for start in range(0, limit*10, limit) to get 10 pages (every with 100 elements)

import requests
# import pprint  # to format data on screen `pprint.pprint()

# --- fucntions ---

def get_data(query, start=0, limit=15): # <-- new (2020.09.06)
    """Fetch one page of search results from the partsfinder search API.

    query -- free-text search string sent to the server
    start -- offset of the first result to return (pagination)
    limit -- number of results per page

    Returns the decoded JSON response as a dict.
    """
    # Body observed in DevTools; facets/urlParams can carry OEM filters,
    # e.g. [{"name": "OEM", "value": "GE%20Healthcare"}].
    request_body = {
        "facets": [],
        "facilityId": 38451,
        "id_ins": "a2a3d332-73a7-4194-ad87-fe7412388916",
        "limit": limit,
        "query": query,
        "referer": "/catalog/Service",
        "start": start,
        "urlParams": [],
    }

    response = requests.post(
        'https://prodasf-vip.partsfinder.com/Orion/CatalogService/api/v1/search',
        json=request_body,
    )
    return response.json()

def show_data(data):
    """Print a human-readable summary (title and prices) of search results.

    data -- decoded JSON response from get_data(); expects data['products'],
            each item with 'title' and a (possibly empty) 'options' list.
    """
    # Fix: the original started with an unguarded
    # print(data['products'][0]['options'][0]['price']) debug line, which
    # raised IndexError when the result set was empty or the first product
    # had no options — even though the loop below handles both cases.
    for item in data['products']:
        #pprint.pprint(item)
        print('title:', item['title'])

        if not item['options']:
            # Some products come back with no purchase options at all.
            print('price: unknown')
        else:
            for option in item['options']:
                print('price:', option['price'], '| vendor item number:', option['vendorItemNumber'])

        print('---')
    
def filter_data(data):
    """Return one list of prices per product, in result order.

    A product with no options yields an empty list (price unknown).
    """
    return [[option['price'] for option in item['options']]
            for item in data['products']]
    
# --- main ---

all_queries = ["mobile", 'GE Healthcare']

limit = 100  # results requested per page

for query in all_queries:

    # Pagination: fetch 10 pages of `limit` results each.
    for start in range(0, limit * 10, limit):

        print('\n--- QUERY:', query, 'start:', start, '---\n')

        # Fetch one page, extract prices, and show them.
        page = get_data(query, start, limit)
        print(filter_data(page))


来源:https://stackoverflow.com/questions/62437946/scraping-pricing-off-a-search-bar-site-link-changed

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!