How can I find all spans with a class of 'blue'
that contain text in the format:
04/18/13 7:29pm
which could therefore be:
import re
from bs4 import BeautifulSoup
# Sample document for the demonstration. The <span class="blue"> markup is
# reconstructed: without it find_all() below has nothing to match and no
# dates would be printed.
html_doc = """
<span class="blue">here is a lot of text that i don't need</span>
<span class="blue">this is the span i need because it contains 04/18/13 7:29pm</span>
<span class="blue">04/19/13 7:30pm</span>
<span class="blue">Posted on 04/20/13 10:31pm</span>
"""

# Parse the HTML. The parser is named explicitly so the result does not
# depend on which optional parsers happen to be installed.
soup = BeautifulSoup(html_doc, 'html.parser')

# Find all <span> elements carrying the CSS class "blue".
spans = soup.find_all('span', {'class': 'blue'})

# Reduce each element to its text content.
lines = [span.get_text() for span in spans]

# Date format: MM/DD/YY H:MMam|pm, e.g. "04/18/13 7:29pm".
# The meridiem class is [ap]; writing [a|p] would also accept a literal "|".
# Compiled once here rather than on every loop iteration.
date_pattern = re.compile(r'(\d{2}/\d{2}/\d{2} \d+:\d+[ap]m)')

# Collect the first date found in each line; lines without a date are skipped.
found_dates = []
for line in lines:
    m = date_pattern.search(line)
    if m:
        found_dates.append(m.group(1))

# Print the dates we collected, one per line.
for date in found_dates:
    print(date)
output:
04/18/13 7:29pm
04/19/13 7:30pm
04/20/13 10:31pm