How to list all strings that contain PA/ inside an HTML file using Beautiful Soup

问题

I have a program that converts PDFs into HTML, and I need to extend it so that, after converting, it searches for the PA/ tags and the characters in front of them and saves these tags and characters to a CSV file. I'm trying to do it, but I can't get it to work.

Here's the code so far:

import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
import csv
import pickle
def rename_files(directory=r"C:\PROJECT\pdfs"):
    """Remove every space from the names of the entries in *directory*.

    The entries are renamed in place using full paths, so the process's
    current working directory is never changed (and therefore cannot be
    left pointing at the wrong place if a rename fails).

    Args:
        directory: Folder whose entries are renamed. Defaults to the
            project's PDF folder.

    Raises:
        OSError: If the directory cannot be listed or a rename fails,
            for example because the space-free name already exists.
    """
    file_list = os.listdir(directory)
    print(file_list)
    for file_name in file_list:
        # str.replace works on both Python 2 and Python 3 strings, unlike
        # the two-argument translate(None, " ") form.
        new_name = file_name.replace(" ", "")
        if new_name == file_name:
            # Nothing to strip; skip the pointless rename.
            continue
        os.rename(os.path.join(directory, file_name),
                  os.path.join(directory, new_name))
# Strip spaces from the PDF names before anything else runs; presumably
# needed because the conversion command further down is built by plain
# string formatting -- TODO confirm.
rename_files()

def run(command):
    """Execute *command* as a child process and wait for it to finish.

    On Windows the command string is handed to ``subprocess.Popen``
    unchanged; on every other platform it is first tokenised with
    ``shlex.split``.

    Returns:
        A ``(succeeded, stdout, stderr)`` tuple, where ``succeeded`` is
        True only when the child's return code is 0 and the other two
        items are the captured output streams.
    """
    on_windows = platform.system() == 'Windows'
    argv = command if on_windows else shlex.split(command)
    child = subprocess.Popen(argv,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
    captured_out, captured_err = child.communicate()
    succeeded = child.returncode == 0
    return succeeded, captured_out, captured_err

# Change this to your PDF file base directory
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("%s is not a directory" % base_directory)
    # SystemExit is a builtin, so this works even without the site module.
    raise SystemExit(1)
# Change this to the location of your pdf2txt.py conversion script
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
    print("Could not find %s" % bin_path)
    raise SystemExit(1)
# Convert every PDF found anywhere below base_directory.
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # If this is not a PDF file
        if not file_name.endswith('.pdf'):
            # Skip it
            continue
        file_path = os.path.join(dir_path, file_name)
        # Convert your PDF to HTML here.
        # The output name is relative ("<pdf name>.html"), so the HTML lands
        # in the current working directory. The paths are quoted so that a
        # space in any of them does not split the argument.
        args = (bin_path, file_name, file_path)
        success, output, errors = run('python "%s" -o "%s.html" "%s"' % args)
        if not success:
            print("Could not convert %s to HTML" % file_path)
            print("%s" % errors)
htmls_path = 'C:\\PROJECT'
# Captures the text right after "PA/" and the next whitespace-separated
# token, e.g. "PA/00986/17 GTD" -> ('00986/17', 'GTD').
pa_pattern = re.compile(r"PA/(\S*)\s*(\S*)")
# Matches from every HTML file are accumulated here so that the CSV holds
# all of them, not only those of the last file visited.
all_matches = []
for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
    for file_name in file_name_list:
        if not file_name.endswith('.html'):
            continue
        # os.walk yields bare file names; join with dir_path so the file is
        # found regardless of the current working directory.
        with open(os.path.join(dir_path, file_name)) as markup:
            # Name the parser explicitly so results do not depend on which
            # parsers happen to be installed.
            soup = BeautifulSoup(markup.read(), 'html.parser')
        text = soup.get_text()
        match = pa_pattern.findall(text)
        print(match)
        all_matches.extend(match)
# The csv module wants a binary-mode file on Python 2 and a text-mode file
# opened with newline='' on Python 3; "str is bytes" is only true on Python 2.
if str is bytes:
    score_file = open('score.csv', 'wb')
else:
    score_file = open('score.csv', 'w', newline='')
with score_file as f:
    writer = csv.writer(f)
    # One CSV row per (code, suffix) tuple.
    writer.writerows(all_matches)

The HTML is too big, so here is a part of it that includes the PA/ entries and the text that I don't want:

<html>
    <title>Testing</title>
    <body>
        <div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:59px; top:34023px; width:84px; height:32px;"><span style="font-family: YFEHEP+Times-Bold; font-size:17px">JUST SOME TEXT THAT I DON'T WANT TO HAVE ON THE CSV FILE
        <br></span><span style="font-family: YFEHEP+Times-Roman; font-size:16px">PA/00986/17 GTD
        <br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:59px; top:34066px; width:84px; height:16px;"><span style="font-family: YFEHEP+Times-Roman; font-size:16px">PA/01008/17 GTD
        <br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:59px; top:34105px; width:84px; height:16px;"><span style="font-family: YFEHEP+Times-Roman; font-size:16px">PA/01095/17 GTD
    </body>
</html>

回答1:

Check Online Demo

    import re
    from bs4 import BeautifulSoup
    # Sample of the converted HTML: one unwanted heading followed by three
    # "PA/<number>/<year> GTD" entries.
    html_doc = """
    <html>
        <title>Testing</title>
        <body>
            <div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:59px; top:34023px; width:84px; height:32px;"><span style="font-family: YFEHEP+Times-Bold; font-size:17px">JUST SOME TEXT THAT I DON'T WANT TO HAVE ON THE CSV FILE
            <br></span><span style="font-family: YFEHEP+Times-Roman; font-size:16px">PA/00986/17 GTD
            <br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:59px; top:34066px; width:84px; height:16px;"><span style="font-family: YFEHEP+Times-Roman; font-size:16px">PA/01008/17 GTD
            <br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:59px; top:34105px; width:84px; height:16px;"><span style="font-family: YFEHEP+Times-Roman; font-size:16px">PA/01095/17 GTD
        </body>
    </html>
    """

    # Drop all markup and keep only the visible text.
    soup = BeautifulSoup(html_doc, 'html.parser')
    text = soup.get_text()

    # Raw string so that \S and \s reach the regex engine untouched.
    # Group 1 is whatever follows "PA/", group 2 the next token after it.
    match = re.findall(r"PA/(\S*)\s*(\S*)", text)
    # Expected: [('00986/17', 'GTD'), ('01008/17', 'GTD'), ('01095/17', 'GTD')]
    print(match)

For writing to CSV

import csv
# The csv module wants a binary-mode file on Python 2 and a text-mode file
# opened with newline='' on Python 3; "str is bytes" is only true on Python 2.
if str is bytes:
    out = open('ur file.csv', 'wb')
else:
    out = open('ur file.csv', 'w', newline='')
with out:
    csv_out = csv.writer(out)
    # Header row, then one row per (group 1, group 2) tuple found by findall.
    csv_out.writerow(['fist_col', 'second_col'])
    csv_out.writerows(match)

来源：https://stackoverflow.com/questions/43629600/how-to-list-all-strings-that-have-a-pa-inside-of-a-html-file-using-beautiful-so

标签

python

html

pdf

beautifulsoup

converters