Python web scraping: counting the occurrences of a list of words on each page

Submitted by 眉间皱痕 on 2019-12-08 13:11:49

Question


So I am trying to find a set of specific words ("shall", "may", "must", etc.) on each page and add up their occurrences. The code I used:

import requests
from bs4 import BeautifulSoup, SoupStrainer
import re


def levelfour(main_url):

    pattern = re.compile(r"\bmay not\b", re.IGNORECASE)
    pattern1 = re.compile(r"\bshall\b", re.IGNORECASE)
    pattern2 = re.compile(r"\bmust\b", re.IGNORECASE)
    pattern3 = re.compile(r"\bprohibited\b", re.IGNORECASE)
    pattern4 = re.compile(r"\brequired\b", re.IGNORECASE)

    r = requests.get(main_url)
    soup = BeautifulSoup((r.content), "html.parser")
    results = soup.find('article', {'id': 'maincontent'})
    results = results.text.encode("utf-8", "ignore")

    total = 0
    total1 = 0
    total2 = 0
    total3 = 0
    total4 = 0

    m = re.findall(pattern, r.content)
    m1 = re.findall(pattern1, r.content)
    m2 = re.findall(pattern2, r.content)
    m3 = re.findall(pattern3, r.content)
    m4 = re.findall(pattern4, r.content)
    total += len(m)
    total1 += len(m1)
    total2 += len(m2)
    total3 += len(m3)
    total4 += len(m4)
    print total, total1, total2, total3, total4

########################################Sections##########################
def levelthree(item2_url):
    r = requests.get(item2_url)
    for sectionlinks in BeautifulSoup(r.content, "html.parser", parse_only=SoupStrainer('a')):
        if sectionlinks.has_attr('href'):
            if 'section' in sectionlinks['href']:
                href = "http://law.justia.com" + sectionlinks.get('href')
                levelfour(href)

########################################Chapters##########################
def leveltwo(item_url):
    r = requests.get(item_url)
    for sublinks in BeautifulSoup(r.content, "html.parser", parse_only=SoupStrainer('a')):
        if sublinks.has_attr('href'):
            if 'chapt' in sublinks['href']:
                chapterlinks = "http://law.justia.com" + sublinks.get('href')
                levelthree(chapterlinks)
                print (chapterlinks)

######################################Titles###############################
def levelone(url):
    r = requests.get(url)
    for links in BeautifulSoup(r.content, "html.parser", parse_only=SoupStrainer('a')):
        if links.has_attr('href'):
            if 'title-54' in links['href']:
                titlelinks = "http://law.justia.com" + links.get('href')
                # titlelinks = "\n" + str(titlelinks)
                leveltwo(titlelinks)
                # print (titlelinks)

###########################################################################
base_url = "http://law.justia.com/codes/idaho/2015/"
levelone(base_url)

When I print out total, total1, total2, total3, total4, I get zeros instead: [0, 0, 0, 0, 0]. My question: how can I appropriately find and add up the occurrences of this set of words?


Answer 1:


Using m = re.findall(pattern, r.content) fixed the problem.
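
For reference, a minimal sketch of that counting step in Python 3, where r.content is bytes and so is decoded before matching (the section URL below is only an illustrative placeholder, and the patterns mirror the ones compiled in the question):

import re
import requests

# Placeholder URL, used only for illustration.
url = "http://law.justia.com/codes/idaho/2015/"

patterns = {
    "may not": re.compile(r"\bmay not\b", re.IGNORECASE),
    "shall": re.compile(r"\bshall\b", re.IGNORECASE),
    "must": re.compile(r"\bmust\b", re.IGNORECASE),
    "prohibited": re.compile(r"\bprohibited\b", re.IGNORECASE),
    "required": re.compile(r"\brequired\b", re.IGNORECASE),
}

r = requests.get(url)
text = r.content.decode("utf-8", "ignore")  # findall needs str, not bytes, in Python 3

# Count matches per word on this one page.
counts = {word: len(pattern.findall(text)) for word, pattern in patterns.items()}
print(counts)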




Answer 2:


Using a variable for each phrase is a mess. Try this:

import re
from collections import Counter

counter = Counter()
text = r.content.lower()  # r is the requests response for the page, as in the question
for phrase in ['may not', 'shall', 'must']:
    counter[phrase] += len(re.findall(r'\b%s\b' % phrase, text))
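
A follow-up note on that choice: Counter objects can be added together, so the per-page counts from each section page can be rolled up into a grand total without a separate total variable per word. A small illustration (the sample counts here are made up):

from collections import Counter

# Made-up per-page results, standing in for the output of the snippet above.
page_counts = [Counter({'shall': 3, 'must': 1}), Counter({'shall': 2})]

grand_total = Counter()
for counts in page_counts:
    grand_total += counts        # Counter addition sums counts key by key

print(grand_total)               # Counter({'shall': 5, 'must': 1})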


Source: https://stackoverflow.com/questions/37313733/python-web-scraping-counting-the-occurrence-of-a-list-of-words-of-each-page
