How to extract text from pdf in Python 3.7

后端 未结 10 1180
后悔当初
后悔当初 2020-12-29 10:19

I am trying to extract text from a PDF file using Python. My main goal is to create a program that reads a bank statement and extracts its text to update an exce

10条回答
  •  囚心锁ツ
    2020-12-29 11:10

    Here is an alternative solution, tested on Windows 10 with Python 3.8

    Example test pdf: https://drive.google.com/file/d/1aUfQAlvq5hA9kz2c9CyJADiY3KpY3-Vn/view?usp=sharing

    #pip install pdfminer.six
    import io
    
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    
    
    def convert_pdf_to_txt(path, *, password="", maxpages=0, pagenos=None):
        """Extract the text of a PDF file and return it as one string.

        Each selected page is run through a pdfminer ``PDFPageInterpreter``
        whose output device is a ``TextConverter`` writing into an
        in-memory text buffer.

        :param path: path of the PDF file to read (opened in binary mode).
        :param password: password handed to ``PDFPage.get_pages``; the
            default empty string matches the previous hard-coded value.
        :param maxpages: page limit handed to ``PDFPage.get_pages``; the
            default ``0`` matches the previous hard-coded value
            (pdfminer treats it as "no limit" -- see the pdfminer docs).
        :param pagenos: iterable of page numbers to extract, handed to
            ``PDFPage.get_pages`` as a set (presumably zero-based --
            confirm against the pdfminer docs); ``None`` (default) passes
            an empty set, as before.
        :return: the text collected from all processed pages.
        """
        # Build a fresh set per call; never share a mutable default.
        selected_pages = set() if pagenos is None else set(pagenos)

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()

        with io.StringIO() as retstr:
            with TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=laparams) as device:
                with open(path, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)

                    for page in PDFPage.get_pages(fp,
                                                  selected_pages,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=True,
                                                  check_extractable=True):
                        interpreter.process_page(page)

                    # Read the buffer before the enclosing ``with`` closes it.
                    return retstr.getvalue()
    
    
    if __name__ == "__main__":
        # Demo entry point: extract the sample PDF's text and print it.
        extracted_text = convert_pdf_to_txt('C:\\Path\\To\\Test_PDF.pdf')
        print(extracted_text)
    

提交回复
热议问题