Working with tables in python-docx

前端 未结 3 482
逝去的感伤
逝去的感伤 2021-01-15 02:57

I have a small question about working with an opened docx file. This is part of my code:

doc = Document(self.fileName[0])

for paragraph in doc.paragraphs:
   s         


        
相关标签:
3条回答
  • 2021-01-15 03:15

    This code will work for this:

    ###Import all necessary packages
    from docx.opc.constants import RELATIONSHIP_TYPE as RT
    from docx import *
    from docx.text.paragraph import Paragraph
    from docx.text.paragraph import Run
    import xml.etree.ElementTree as ET
    from docx.document import Document as doctwo
    from docx.oxml.table import CT_Tbl
    from docx.oxml.text.paragraph import CT_P
    from docx.table import _Cell, Table
    from docx.text.paragraph import Paragraph
    from docx.shared import Pt
    from docxcompose.composer import Composer
    from docx import Document as Document_compose
    import pandas as pd
    from xml.etree import ElementTree
    from io import StringIO
    import io
    import csv
    import base64
    
    
    #Load the docx file into document object. You can input your own docx file in this step by changing the input path below:
    #NOTE: the path below is a machine-specific example; replace it with the path to your own .docx file.
    #The resulting `document` is read as a module-level name by read_docx_tables() and by the main loop further down.
    document = Document('/Users/karthick/Desktop/iclouddrive/Work/QA/microchip datasheets/22100F-converted-latest.docx')
    
    
    
    
    ##This function extracts the tables and paragraphs from the document object
    def iter_block_items(parent):
        """
        Generate the block-level children of *parent* in document order.

        *parent* is either a document object (its body is walked) or a table
        cell (its ``_tc`` element is walked). Every paragraph element is
        yielded wrapped as a Paragraph and every table element wrapped as a
        Table; any other child element is skipped.

        Raises ValueError when *parent* is neither a document nor a cell.
        """
        # Pick the XML container whose direct children are the block items.
        if isinstance(parent, _Cell):
            container = parent._tc
        elif isinstance(parent, doctwo):
            container = parent.element.body
        else:
            raise ValueError("something's not right")

        # (low-level element class, proxy class used to wrap it)
        wrappers = ((CT_P, Paragraph), (CT_Tbl, Table))
        for node in container.iterchildren():
            for elm_cls, proxy_cls in wrappers:
                if isinstance(node, elm_cls):
                    yield proxy_cls(node, parent)
                    break
    
    
    #This function extracts the table from the document object as a dataframe
    def read_docx_tables(tab_id=None, **kwargs):
        """
        Parse table(s) of the module-level ``document`` into pandas DataFrame(s).

        Each table is written to an in-memory CSV buffer (one CSV record per
        table row, one field per cell text) and read back with
        ``pd.read_csv()``; with that function's defaults the first table row
        therefore becomes the column header.

        Parameters:
            tab_id:     index into ``document.tables`` of the single table to
                        parse (counting from 0). When None, all tables are parsed.
            kwargs:     arguments to pass to ``pd.read_csv()``
        Return: a single DataFrame if tab_id is not None, otherwise a list of DataFrames
        Raises: IndexError if tab_id does not refer to an existing table
        """
        def read_docx_tab(tab, **kwargs):
            # Round-trip through CSV so pandas handles header/dtype inference.
            vf = io.StringIO()
            writer = csv.writer(vf)
            writer.writerows([cell.text for cell in row.cells] for row in tab.rows)
            vf.seek(0)
            return pd.read_csv(vf, **kwargs)

        if tab_id is None:
            return [read_docx_tab(tab, **kwargs) for tab in document.tables]

        # Only the table lookup is guarded: an IndexError raised while parsing
        # the table must not be reported as a missing tab_id.
        try:
            tab = document.tables[tab_id]
        except IndexError as err:
            raise IndexError(
                'Error: specified [tab_id]: {}  does not exist.'.format(tab_id)
            ) from err
        return read_docx_tab(tab, **kwargs)
    
    
    
    #The combined_df dataframe will store all the content in document order including images, tables and paragraphs.
    #If the content is an image or a table, it has to be referenced from image_df for images and table_list for tables using the corresponding image or table id that is stored in combined_df
    #And if the content is paragraph, the paragraph text will be stored in combined_df
    combined_df = pd.DataFrame(columns=['para_text','table_id','style'])
    table_mod = pd.DataFrame(columns=['string_value','table_id'])

    #The image_df will consist of base64 encoded image data of all the images in the document
    image_df = pd.DataFrame(columns=['image_index','image_rID','image_filename','image_base64_string'])

    #The table_list is a list consisting of all the tables in the document
    table_list=[]
    xml_list=[]

    i=0
    imagecounter = 0


    blockxmlstring = ''

    #DataFrame.append() was removed in pandas 2.0, so the one-row frames are collected in
    #plain lists and concatenated onto the (empty) frames above once, after the loop.
    combined_frames = []
    table_frames = []
    image_frames = []

    for block in iter_block_items(document):
        #Dispatch on the actual type instead of searching the object's repr for 'text'/'table'
        if isinstance(block, Paragraph):
            #Concatenated text of the bold runs; kept for inspection, not used further in this script
            runboldtext = ''.join(run.text for run in block.runs if run.bold)

            style = str(block.style.name)

            appendtxt = str(block.text).replace("\n","").replace("\r","")
            tabid = 'Novalue'
            #Lower-cased tokens of the paragraph; kept for inspection, not used further in this script
            paragraph_split = appendtxt.lower().split()

            for run in block.runs:
                xmlstr = str(run.element.xml)
                #Check if pic is there in the xml of the element. If yes, then extract the image data
                if 'pic:pic' not in xmlstr:
                    continue
                xml_list.append(xmlstr)
                #Collect the prefix -> URI namespace map declared in this run's xml so that the
                #prefixed paths below ('pic:', 'a:') can be resolved
                my_namespaces = dict(node for _, node in ElementTree.iterparse(StringIO(xmlstr), events=['start-ns']))
                root = ET.fromstring(xmlstr)
                for pic in root.findall('.//pic:pic', my_namespaces):
                    cNvPr_elem = pic.find("pic:nvPicPr/pic:cNvPr", my_namespaces)
                    name_attr = cNvPr_elem.get("name")
                    blip_elem = pic.find("pic:blipFill/a:blip", my_namespaces)
                    embed_attr = blip_elem.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")
                    #For a paragraph holding an image, the stored text is replaced by a reference string
                    appendtxt = str('Document_Imagefile/' + name_attr + '/' + embed_attr + '/' + str(imagecounter))
                    #The relationship id of the blip identifies the image part that holds the raw bytes
                    image_part = document.part.related_parts[embed_attr]
                    image_base64 = base64.b64encode(image_part._blob).decode()
                    image_frames.append(pd.DataFrame({'image_index':[imagecounter],'image_rID':[embed_attr],'image_filename':[name_attr],'image_base64_string':[image_base64]}))
                    style = 'Novalue'
                imagecounter = imagecounter + 1

        elif isinstance(block, Table):
            style = 'Novalue'
            appendtxt = str(block)
            tabid = i
            #i counts the tables seen so far in the document body and is used as the index into document.tables
            dfs = read_docx_tables(tab_id=i)
            table_frames.append(pd.DataFrame({'para_text':[appendtxt],'table_id':[i],'style':[style]}))
            table_list.append(dfs)
            i=i+1
        else:
            continue

        combined_frames.append(pd.DataFrame({'para_text':[appendtxt],'table_id':[tabid],'style':[style]}))

    #Single concatenation per frame; produces the same rows as the former chain of append() calls
    if combined_frames:
        combined_df = pd.concat([combined_df] + combined_frames, sort=False)
    if table_frames:
        table_mod = pd.concat([table_mod] + table_frames, sort=False)
    if image_frames:
        image_df = pd.concat([image_df] + image_frames, sort=False)

    combined_df = combined_df.reset_index(drop=True)
    image_df = image_df.reset_index(drop=True)
    

    You can refer to this link for a detailed explanation of how the code works:

    https://github.com/kmrambo/Python-docx-Reading-paragraphs-tables-and-images-in-document-order-

    0 讨论(0)
  • 2021-01-15 03:17

    This operation is not yet directly supported by the python-docx API. However you can find a workaround function for it here: https://github.com/python-openxml/python-docx/issues/40 and a little further information may be found by searching on 'python-docx iter block items'.

    The basic problem is that the Microsoft API for Word does not include a method for iterating block-level items in document order. Block-level items in Word are the paragraph and table objects. python-docx modeled the MS API as its starting point and so the Document.paragraphs and Document.tables properties were the first to be implemented. Document.iter_block_items() or perhaps just Document.block_items hasn't been implemented yet, although it's closer to the top of the enhancement list than many other features because it's frequently asked for.

    In the meantime, you'll need to implement a workaround function in your own code.

    0 讨论(0)
  • 2021-01-15 03:25

    Thanks for the reference to https://github.com/python-openxml/python-docx/issues/40, though I had to make some updates to the code found there, maybe due to changes over ~5 years, and ended up using:

    from docx.document import Document
    from docx.oxml.table import CT_Tbl
    from docx.oxml.text.paragraph import CT_P
    from docx.table import _Cell, Table
    from docx.text.paragraph import Paragraph
    
    def iter_block_items(parent):
        """
        Yield each paragraph and table child within *parent*, in document order.
        Each returned value is an instance of either Table or Paragraph. *parent*
        would most commonly be a reference to a main Document object, but
        also works for a _Cell object, which itself can contain paragraphs and tables.

        Raises ValueError if *parent* is neither a Document nor a _Cell.
        """
        # Get parent element
        if isinstance(parent, Document):
            parent_elm = parent.element.body
        elif isinstance(parent, _Cell):
            parent_elm = parent._tc
        else:
            raise ValueError("something's not right")
        # Get children in parent element; children that are neither a
        # paragraph nor a table element are skipped
        for child in parent_elm.iterchildren():
            if isinstance(child, CT_P):
                yield Paragraph(child, parent)
            elif isinstance(child, CT_Tbl):
                yield Table(child, parent)
    
    0 讨论(0)
提交回复
热议问题