问题
With the help of some experts here I was able to build a scraper that works fine.
The essential line of code is really:
data = {"partOptionFilter": {"PartNumber": PN.iloc[i, 0], "AlternativeOemId": "17155"}}
r = requests.post('https://www.partssource.com/catalog/Service', json=data).json()
However, the site recently changed their link from partsfinder.com to partssource.com, and the code no longer seems to work.
Just wondering if there's a trick I can use on my original code to have it working again.
Any thoughts are appreciated, thanks!
"""Scrape prices from partssource.com for a list of part numbers.

Reads part numbers from an Excel sheet, queries the catalog service once per
(part number, OEM id) pair, and writes every collected price to a CSV file.
"""
import requests
import pandas as pd

# The six OEM ids that were originally hard-coded in six copy-pasted
# request blocks (data/data1/.../data5).
OEM_IDS = ["17155", "17475", "16880", "47221", "17045", "17055"]

df = pd.read_excel(r'C:\Users\212677036\Documents\Part Number Input.xlsx')
PN = pd.DataFrame(df, columns=['Product code'])

# One price list per OEM id, filled in input-row order.
price_lists = {oem_id: [] for oem_id in OEM_IDS}

for i in range(PN.shape[0]):
    part_number = PN.iloc[i, 0]
    for oem_id in OEM_IDS:
        payload = {"partOptionFilter": {"PartNumber": part_number,
                                        "AlternativeOemId": oem_id}}
        r = requests.post('https://www.partssource.com/catalog/Service',
                          json=payload).json()
        # Price of the first offered option; raises KeyError/IndexError if
        # the response shape changes (same behavior as the original script).
        price_lists[oem_id].append(r['Data']['PartOptions'][0]['YourPrice'])

# Stack the six per-OEM frames vertically, matching the original output layout
# (all prices for OEM 17155 first, then 17475, and so on).
list_of_dataframes = [pd.DataFrame(price_lists[oem_id]) for oem_id in OEM_IDS]
pd.concat(list_of_dataframes).to_csv(r'C:\Users\212677036\Documents\output25.csv')
回答1:
After using DevTools in Firefox/Chrome I created this code.
The page uses a different URL, sends different data, and returns results with different keys.
You would have to use DevTools
to observe more requests from browser to server to recognize how to use more params in data
import requests

# Search term; change this to look up something else.
query = "mobile"

# Request body observed in DevTools. The empty "facets"/"urlParams" lists can
# carry filters, e.g. {"name": "OEM", "value": "GE%20Healthcare"} for facets
# or {"name": "OEM", "value": "GE Healthcare"} for urlParams.
payload = {
    "facets": [],
    "facilityId": 38451,
    "id_ins": "a2a3d332-73a7-4194-ad87-fe7412388916",
    "limit": 15,
    "query": query,
    "referer": "/catalog/Service",
    "start": 0,
    "urlParams": [],
}

response = requests.post(
    'https://prodasf-vip.partsfinder.com/Orion/CatalogService/api/v1/search',
    json=payload,
)
data = response.json()

# Drill-down path: data['products'] -> first product -> first option -> price.
print(data['products'][0]['options'][0]['price'])
EDIT (2020.09.01)
If you have many queries then use a for-loop to run the same code many times, each time with a different query. And when you get the data for one query, use a for-loop to collect all prices from data['products'].
EDIT (2020.09.06)
I added variable start
and limit
in get_data()
and later I run it in loop for start in range(0, limit*10, limit)
to get 10 pages (each with 100 elements)
import requests
# import pprint # to format data on screen `pprint.pprint()
# --- fucntions ---
def get_data(query, start=0, limit=15):  # <-- new (2020.09.06)
    """POST a search request to the catalog API and return the parsed JSON.

    `start` is the result offset and `limit` the page size, matching the
    service's pagination fields.
    """
    body = {
        # Filters left empty; DevTools shows their shape, e.g.
        # facets:    [{"name": "OEM", "value": "GE%20Healthcare"}]
        # urlParams: [{"name": "OEM", "value": "GE Healthcare"}]
        "facets": [],
        "facilityId": 38451,
        "id_ins": "a2a3d332-73a7-4194-ad87-fe7412388916",
        "limit": limit,      # <-- new (2020.09.06)
        "query": query,
        "referer": "/catalog/Service",
        "start": start,      # <-- new (2020.09.06)
        "urlParams": [],
    }
    response = requests.post(
        'https://prodasf-vip.partsfinder.com/Orion/CatalogService/api/v1/search',
        json=body,
    )
    return response.json()
def show_data(data):
    """Print every product's title and option prices from a search response."""
    products = data['products']
    # Debug peek at the very first price (fails if the first product
    # has no options — same as the original).
    print(products[0]['options'][0]['price'])
    for product in products:
        print('title:', product['title'])
        options = product['options']
        if options:
            for opt in options:
                print('price:', opt['price'], '| vendor item number:', opt['vendorItemNumber'])
        else:
            print('price: unknown')
        print('---')
def filter_data(data):
    """Return a list of per-product price lists.

    A product without options contributes an empty list (price unknown).
    """
    return [
        [option['price'] for option in item['options']]
        for item in data['products']
    ]
# --- main ---

all_queries = ["mobile", 'GE Healthcare']
limit = 100  # page size  <-- new (2020.09.06)

for query in all_queries:
    # Pagination: offsets 0, limit, 2*limit, ... — 10 pages per query.
    for start in range(0, limit*10, limit):  # <-- new (2020.09.06)
        print('\n--- QUERY:', query, 'start:', start, '---\n')
        page = get_data(query, start, limit)
        # show_data(page) would pretty-print each product instead.
        filtered = filter_data(page)
        print(filtered)
来源:https://stackoverflow.com/questions/62437946/scraping-pricing-off-a-search-bar-site-link-changed