问题
I am trying to get all the historical information of a particular stock from yahoo finance. I am new to python and web-scraping.
I want to download all the historical data into a CSV file. The problem is that the code downloads only the first 100 entries of any stock on the website. When any stock is viewed on the browser, we have to scroll to the bottom of the page for more table entries to load.
I think the same thing is happening when I download using the library. Some kind of optimization seems to be preventing the web page from downloading entirely. Try it out here (https://in.finance.yahoo.com/quote/TVSMOTOR.NS/history?period1=-19800&period2=1524236374&interval=1d&filter=history&frequency=1d). Is there a way to overcome this?
Here's my code:
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

# Yahoo Finance daily-history page for TVSMOTOR.NS.
# NOTE(review): the page lazy-loads further rows via JavaScript as the user
# scrolls, so a plain HTTP fetch only contains the first ~100 rows; rows
# below that are fetched by AJAX and are invisible to this script.
my_url = ('https://in.finance.yahoo.com/quote/TVSMOTOR.NS/history'
          '?period1=-19800&period2=1524236374&interval=1d'
          '&filter=history&frequency=1d')

page = uReq(my_url)
page_html = page.read()
page_data = soup(page_html, "html.parser")

# The quotes live in a single <table data-test="historical-prices">.
container = page_data.find_all("table", {"data-test": "historical-prices"})
rows = container[0].tbody.find_all("tr")

filename = "tvs.csv"
# Bug fix: use a context manager so the file is closed even on an exception.
with open(filename, "w") as f:
    # Bug fix: the original header omitted 'high' even though the value was
    # scraped, and every data line ended with a dangling comma (empty column).
    f.write("date, open, high, low, close, adjusted_close_price, vol \n")
    for row in rows:
        # Price rows have 7 plain cells; dividend/split rows use colspan
        # and are skipped by this filter.
        if len(row.find_all("td", {"colspan": ""})) == 7:
            col = row.find_all("td")
            date = col[0].span.text.strip()
            opend = col[1].span.text.strip().replace(",", "")
            if opend != 'null':  # Yahoo emits 'null' cells on non-trading days
                high = col[2].span.text.strip().replace(",", "")
                low = col[3].span.text.strip().replace(",", "")
                close = col[4].span.text.strip().replace(",", "")
                adjclose = col[5].span.text.strip().replace(",", "")
                vol = col[6].span.text.strip().replace(",", "")
                f.write(",".join([date, opend, high, low,
                                  close, adjclose, vol]) + "\n")
Thanks in Advance!
EDIT:
Okay, I found another piece of code that works well. But I have no idea how it works. Any help would be appreciated.
#!/usr/bin/env python
"""
get-yahoo-quotes.py: Script to download Yahoo historical quotes using the new cookie authenticated site.
Usage: get-yahoo-quotes SYMBOL
History
06-03-2017 : Created script
"""
__author__ = "Brad Luicas"
__copyright__ = "Copyright 2017, Brad Lucas"
__license__ = "MIT"
__version__ = "1.0.0"
__maintainer__ = "Brad Lucas"
__email__ = "brad@beaconhill.com"
__status__ = "Production"
import re
import sys
import time
import datetime
import requests
def split_crumb_store(v):
    """Extract the crumb token from a '"CrumbStore":{"crumb":"..."' line."""
    # The third colon-separated field holds the quoted crumb value;
    # stripping the surrounding double quotes leaves the bare token.
    fields = v.split(':')
    return fields[2].strip('"')
def find_crumb_store(lines):
    """Return the first line containing the CrumbStore fragment, else None.

    The target looks like:  ,"CrumbStore":{"crumb":"9q.A4D1c.b9
    """
    for line in lines:
        # Plain substring test — truth-equivalent to the original regex scan.
        if 'CrumbStore' in line:
            return line
    print("Did not find CrumbStore")
def get_cookie_value(r):
    """Return the 'B' session cookie from response *r* as a one-entry dict.

    This dict is later passed as the ``cookies=`` argument of the
    download request.
    """
    return {'B': r.cookies['B']}
def get_page_data(symbol):
    """Fetch the Yahoo quote page for *symbol*.

    Returns a ``(cookie, lines)`` tuple: the session cookie dict and the
    page body split into short lines for crumb searching.
    """
    url = "https://finance.yahoo.com/quote/%s/?p=%s" % (symbol, symbol)
    response = requests.get(url)
    cookie = get_cookie_value(response)
    # The crumb may contain escaped sequences such as \u002F ('/'); decoding
    # with unicode-escape restores the literal character.  Replacing '}' with
    # newlines chops the embedded JSON into short lines so the CrumbStore
    # entry can be found by substring search.
    body = response.content.decode('unicode-escape').strip().replace('}', '\n')
    return cookie, body.split('\n')
def get_cookie_crumb(symbol):
    """Return the (cookie, crumb) pair needed to authorize the download URL."""
    cookie, lines = get_page_data(symbol)
    store_line = find_crumb_store(lines)
    crumb = split_crumb_store(store_line)
    return cookie, crumb
def get_data(symbol, start_date, end_date, cookie, crumb):
    """Download daily quotes for *symbol* between the two epoch timestamps
    and save the CSV response as ``<symbol>.csv`` in the working directory.
    """
    filename = '%s.csv' % (symbol)
    url = "https://query1.finance.yahoo.com/v7/finance/download/%s?period1=%s&period2=%s&interval=1d&events=history&crumb=%s" % (symbol, start_date, end_date, crumb)
    response = requests.get(url, cookies=cookie)
    # Stream the body to disk in 1 KiB chunks rather than buffering it all.
    with open(filename, 'wb') as out:
        for chunk in response.iter_content(1024):
            out.write(chunk)
def get_now_epoch():
    """Return the current time as whole seconds since the Unix epoch."""
    # @see https://www.linuxquestions.org/questions/programming-9/python-datetime-to-epoch-4175520007/#post5244109
    now = time.time()
    return int(now)
def download_quotes(symbol):
    """Download the full daily history of *symbol* (epoch 0 .. now) to CSV."""
    # period1=0 asks for data from the beginning of Yahoo's records.
    start_date = 0
    end_date = get_now_epoch()
    # Authenticate first, then fetch the CSV with the cookie/crumb pair.
    cookie, crumb = get_cookie_crumb(symbol)
    get_data(symbol, start_date, end_date, cookie, crumb)
if __name__ == '__main__':
    # If we have at least one parameter, loop over all the parameters,
    # treating each one as a ticker symbol to download.
    if len(sys.argv) == 1:
        print("\nUsage: get-yahoo-quotes.py SYMBOL\n\n")
    else:
        for i in range(1, len(sys.argv)):
            symbol = sys.argv[i]
            print("--------------------------------------------------")
            print("Downloading %s to %s.csv" % (symbol, symbol))
            download_quotes(symbol)
            print("--------------------------------------------------")
回答1:
Initially only 100 results are downloaded to the browser. When you scroll to the bottom of the page, a JS event fires that triggers an AJAX request to fetch the next 50/100 entries in the background, which are then rendered in the browser. There is no way for your Python code to trigger that JS event or the resulting AJAX request, because Python does not execute JavaScript. So it's better to use https://intrinio.com/ or https://www.alphavantage.co
You may try yahoo-finance python package. https://pypi.org/project/yahoo-finance/
来源:https://stackoverflow.com/questions/49946597/python-web-scraping-on-large-html-webpages