问题
So I am trying to find a set of specific words ("shall", "may", "must", etc.) on each page and add up their occurrences. The code I used:
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re
def levelfour(main_url):
    """Fetch one page and print how often each tracked term occurs in it.

    The page at ``main_url`` is downloaded and its raw response body is
    searched, case-insensitively and on word boundaries, for "may not",
    "shall", "must", "prohibited" and "required". The five counts are
    printed on a single line, space-separated, in that order.

    Args:
        main_url: URL of the page to scan.
    """
    patterns = [
        re.compile(r"\bmay not\b", re.IGNORECASE),
        re.compile(r"\bshall\b", re.IGNORECASE),
        re.compile(r"\bmust\b", re.IGNORECASE),
        re.compile(r"\bprohibited\b", re.IGNORECASE),
        re.compile(r"\brequired\b", re.IGNORECASE),
    ]
    r = requests.get(main_url)
    # The counts are taken from the raw body, so no HTML parsing is done
    # here; a lookup of a specific element whose result is never used would
    # only add a failure mode on pages that lack that element.
    # NOTE(review): r.content is the undecoded body. Under Python 3 it is
    # bytes and these text patterns would need to be bytes patterns (or
    # r.text used instead) - confirm the target interpreter.
    counts = [len(pattern.findall(r.content)) for pattern in patterns]
    # A single formatted string prints identically under Python 2 and 3.
    print("%d %d %d %d %d" % tuple(counts))
########################################Sections##########################
def levelthree(item2_url):
    """Follow every "section" link found on the page at ``item2_url``.

    Only anchor tags are parsed. Each anchor whose href contains the
    substring "section" is prefixed with the site root and handed to
    ``levelfour``.
    """
    response = requests.get(item2_url)
    anchors = BeautifulSoup(response.content, "html.parser",
                            parse_only=SoupStrainer('a'))
    for anchor in anchors:
        if not anchor.has_attr('href'):
            continue
        target = anchor['href']
        if 'section' not in target:
            continue
        # hrefs are treated as site-relative paths.
        levelfour("http://law.justia.com" + target)
########################################Chapters##########################
def leveltwo(item_url):
    """Follow every "chapt" link found on the page at ``item_url``.

    Only anchor tags are parsed. Each anchor whose href contains the
    substring "chapt" is prefixed with the site root, passed to
    ``levelthree``, and then printed.
    """
    response = requests.get(item_url)
    anchors = BeautifulSoup(response.content, "html.parser",
                            parse_only=SoupStrainer('a'))
    for anchor in anchors:
        if not anchor.has_attr('href'):
            continue
        target = anchor['href']
        if 'chapt' not in target:
            continue
        chapter_url = "http://law.justia.com" + target
        levelthree(chapter_url)
        # NOTE(review): the print is assumed to run once per matching link,
        # after its sections were processed - the source's indentation was
        # lost, so confirm the intended placement.
        print(chapter_url)
######################################Titles###############################
def levelone(url):
    """Follow every "title-54" link found on the page at ``url``.

    Only anchor tags are parsed. Each anchor whose href contains the
    substring "title-54" is prefixed with the site root and handed to
    ``leveltwo``.
    """
    response = requests.get(url)
    anchors = BeautifulSoup(response.content, "html.parser",
                            parse_only=SoupStrainer('a'))
    for anchor in anchors:
        if not anchor.has_attr('href'):
            continue
        target = anchor['href']
        if 'title-54' not in target:
            continue
        leveltwo("http://law.justia.com" + target)
###########################################################################
# Root page the crawl starts from.
base_url = "http://law.justia.com/codes/idaho/2015/"

if __name__ == "__main__":
    # Crawl only when executed as a script, so that importing this module
    # does not trigger network requests as a side effect.
    levelone(base_url)
When I print out total, total1, total2, total3, total4, it gives zeros instead: [0, 0, 0, 0, 0]. My question: how can I correctly find and add up the occurrences of this set of words?
回答1:
Use m = re.findall(pattern, r.content).
That fixed the problem.
回答2:
Using a variable for each phrase is a mess. Try this:
from collections import Counter

# Occurrence tally, keyed by phrase.
counter = Counter()
# Lower-case the body once so the patterns need no IGNORECASE flag.
text = r.content.lower()
for phrase in ['may not', 'shall', 'must']:
    # re.escape keeps the phrase literal even if one is later added that
    # contains regex metacharacters.
    counter[phrase] += len(re.findall(r'\b%s\b' % re.escape(phrase), text))
来源:https://stackoverflow.com/questions/37313733/python-web-scraping-counting-the-occurrence-of-a-list-of-words-of-each-page