I want to scrape the data from a page that shows a graph using highcharts.js,
and I have already parsed all the pages needed to reach the following page. However,
The data is in a script tag. You can get the script tag using bs4 and a regex. You could also extract the data using a regex, but I like using js2xml to parse JS functions into an XML tree:
from bs4 import BeautifulSoup
import requests
import re
import js2xml
# Fetch the weather-averages page and parse its HTML.
soup = BeautifulSoup(requests.get("http://www.worldweatheronline.com/brussels-weather-averages/be.aspx").content, "html.parser")
# The chart is configured in an inline <script>; locate it by the Highcharts
# constructor call (dot escaped so it only matches a literal ".").
script = soup.find("script", text=re.compile(r"Highcharts\.Chart")).text
# script = soup.find("script", text=re.compile("precipchartcontainer")).text if you want precipitation data
# Turn the JavaScript source into an XML tree that can be queried with XPath.
parsed = js2xml.parse(script)
# Function-call form of print works on both Python 2 and Python 3.
print(js2xml.pretty_print(parsed))
That gives you:
<identifier name="$"/>
<var name="chart"/>
<identifier name="$"/>
<identifier name="document"/>
<identifier name="ready"/>
<assign operator="=">
<identifier name="chart"/>
<identifier name="Highcharts"/>
<identifier name="Chart"/>
<property name="chart">
<property name="renderTo">
<property name="type">
<property name="credits">
<property name="enabled">
<property name="colors">
<property name="title">
<property name="text">
<string>Average Temperature (°c) Graph for Brussels</string>
<property name="xAxis">
<property name="categories">
<property name="labels">
<property name="rotation">
<number value="270"/>
<property name="y">
<number value="40"/>
<property name="yAxis">
<property name="title">
<property name="text">
<string>Temperature (°c)</string>
<property name="tooltip">
<property name="enabled">
<property name="plotOptions">
<property name="spline">
<property name="dataLabels">
<property name="enabled">
<property name="enableMouseTracking">
<property name="series">
<property name="name">
<string>Average High Temp (°c)</string>
<property name="color">
<property name="data">
<number value="6"/>
<number value="8"/>
<number value="11"/>
<number value="14"/>
<number value="19"/>
<number value="21"/>
<number value="23"/>
<number value="23"/>
<number value="19"/>
<number value="15"/>
<number value="9"/>
<number value="6"/>
<property name="name">
<string>Average Low Temp (°c)</string>
<property name="color">
<property name="data">
<number value="2"/>
<number value="2"/>
<number value="4"/>
<number value="6"/>
<number value="10"/>
<number value="12"/>
<number value="14"/>
<number value="14"/>
<number value="11"/>
<number value="8"/>
<number value="5"/>
<number value="2"/>
So to get all the data:
In [28]: from bs4 import BeautifulSoup
In [29]: import requests
In [30]: import re
In [31]: import js2xml
In [32]: from itertools import repeat
In [33]: from pprint import pprint as pp
In [34]: soup = BeautifulSoup(requests.get("http://www.worldweatheronline.com/brussels-weather-averages/be.aspx").content, "html.parser")
In [35]: script = soup.find("script", text=re.compile("Highcharts.Chart")).text
In [36]: parsed = js2xml.parse(script)
In [37]: data = [d.xpath(".//array/number/@value") for d in parsed.xpath("//property[@name='data']")]
In [38]: categories = parsed.xpath("//property[@name='categories']//string/text()")
In [39]: output = list(zip(repeat(categories), data))
In [40]: pp(output)
['6', '8', '11', '14', '19', '21', '23', '23', '19', '15', '9', '6']),
['2', '2', '4', '6', '10', '12', '14', '14', '11', '8', '5', '2'])]
As I said, you could just use a regex, but I find js2xml more reliable because stray whitespace and similar quirks won't break it.