Working with tables in python-docx

问题

I have a small question about working with an opened docx file. This is part of my code:

doc = Document(self.fileName[0])

# Emit every body paragraph first, one per line.
for para in doc.paragraphs:
    self.cursor.insertText(para.text + '\n')

# Then emit every table: a numbered heading line, followed by the cells of
# each row separated by tabs, one row per line, and a blank line at the end.
for table_number, table in enumerate(doc.tables, start=1):
    self.cursor.insertText('Таблица {0}\n'.format(table_number))
    for r in range(len(table.rows)):
        for c in range(len(table.columns)):
            cell = table.cell(r, c)
            self.cursor.insertText(cell.text + '\t')
        self.cursor.insertText('\n')
    self.cursor.insertText('\n')

The question is: can I find out where a table is physically placed in the original document? I need to display the paragraphs and tables in the same order as they appear in the document.

回答1:

This operation is not yet directly supported by the python-docx API. However you can find a workaround function for it here: https://github.com/python-openxml/python-docx/issues/40 and a little further information may be found by searching on 'python-docx iter block items'.

The basic problem is that the Microsoft API for Word does not include a method for iterating block-level items in document order. Block-level items in Word are the paragraph and table objects. python-docx modeled the MS API as its starting point and so the Document.paragraphs and Document.tables properties were the first to be implemented. Document.iter_block_items() or perhaps just Document.block_items hasn't been implemented yet, although it's closer to the top of the enhancement list than many other features because it's frequently asked for.

In the meantime, you'll need to implement a workaround function in your own code.

回答2:

Thanks for the reference to https://github.com/python-openxml/python-docx/issues/40. I had to make some updates to the code found there, possibly due to changes in the library over ~5 years, and ended up using:

from docx.document import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph

def iter_block_items(parent):
    """
    Yield each paragraph and table child within *parent*, in document order.
    Each returned value is an instance of either Table or Paragraph. *parent*
    would most commonly be a reference to a main Document object, but
    also works for a _Cell object, which itself can contain paragraphs and tables.

    Raises ValueError if *parent* is neither a Document nor a _Cell. Because
    this is a generator, the error surfaces when iteration starts, not when
    the function is called.
    """
    # Get parent element
    if isinstance(parent, Document):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        raise ValueError("something's not right")
    # Get children in parent element; children that are neither paragraph
    # nor table elements are skipped
    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)

回答3:

This code will work for this:

###Import all necessary packages
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from docx import *
from docx.text.paragraph import Paragraph
from docx.text.paragraph import Run
import xml.etree.ElementTree as ET
from docx.document import Document as doctwo
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
from docx.shared import Pt
from docxcompose.composer import Composer
from docx import Document as Document_compose
import pandas as pd
from xml.etree import ElementTree
from io import StringIO
import io
import csv
import base64


# Load the .docx file into a document object. Replace the path below with the
# location of your own file. The resulting module-level `document` is read
# directly by read_docx_tables() and by the extraction loop further down.
document = Document('/Users/karthick/Desktop/iclouddrive/Work/QA/microchip datasheets/22100F-converted-latest.docx')




##This function extracts the tables and paragraphs from the document object
def iter_block_items(parent):
    """
    Generate the block-level children of *parent* in document order.

    *parent* is either a document object (a `doctwo` instance) or a table
    `_Cell`; a cell can itself hold paragraphs and nested tables. Every
    yielded item is a `Paragraph` or a `Table`; child elements of any other
    kind are skipped.

    Raises ValueError when *parent* is neither a document nor a cell.
    """
    if isinstance(parent, _Cell):
        container = parent._tc
    elif isinstance(parent, doctwo):
        container = parent.element.body
    else:
        raise ValueError("something's not right")

    for node in container.iterchildren():
        if isinstance(node, CT_Tbl):
            yield Table(node, parent)
        elif isinstance(node, CT_P):
            yield Paragraph(node, parent)


# This function extracts table(s) from the module-level `document` object as DataFrame(s)
def read_docx_tables(tab_id=None, **kwargs):
    """
    Parse table(s) of the module-level ``document`` into pandas DataFrame(s).

    Parameters:
        tab_id:     parse a single table with the index: [tab_id] (counting from 0).
                    When [None] - return a list of DataFrames (parse all tables)
        kwargs:     arguments to pass to `pd.read_csv()` function
    Return: a single DataFrame if tab_id is not None or a list of DataFrames otherwise
    Raises: IndexError (re-raised after printing a message) when ``tab_id``
            does not refer to an existing table.
    """
    def read_docx_tab(tab, **kwargs):
        # Write the cell texts to an in-memory CSV and let pd.read_csv()
        # (with the caller's kwargs) turn it into a DataFrame.
        vf = io.StringIO()
        writer = csv.writer(vf)
        writer.writerows([cell.text for cell in row.cells] for row in tab.rows)
        vf.seek(0)
        return pd.read_csv(vf, **kwargs)

    if tab_id is None:
        return [read_docx_tab(tab, **kwargs) for tab in document.tables]

    # Only the table lookup is guarded, so an IndexError raised while parsing
    # is not misreported as a missing table id.
    try:
        tab = document.tables[tab_id]
    except IndexError:
        print('Error: specified [tab_id]: {}  does not exist.'.format(tab_id))
        raise
    return read_docx_tab(tab, **kwargs)



# combined_df stores the document content in document order, one row per
# paragraph or table:
#   * plain paragraph -> para_text holds the paragraph text, table_id is 'Novalue'
#   * picture         -> para_text holds
#                        'Document_Imagefile/<name>/<rId>/<image index>'; the
#                        image data itself is stored in image_df
#   * table           -> table_id is the position of the parsed table in table_list
combined_df = pd.DataFrame(columns=['para_text','table_id','style'])
table_mod = pd.DataFrame(columns=['string_value','table_id'])

# image_df holds the base64 encoded image data of all the pictures found in the document
image_df = pd.DataFrame(columns=['image_index','image_rID','image_filename','image_base64_string'])

# table_list collects the parsed tables; xml_list collects the raw XML of
# every run that contains picture markup
table_list=[]
xml_list=[]

i=0
imagecounter = 0


blockxmlstring = ''

# Rows are accumulated in plain lists and converted to DataFrames once, after
# the loop. DataFrame.append() was removed in pandas 2.0, and it copied the
# whole frame on every call.
combined_rows = []
table_rows = []
image_rows = []

for block in iter_block_items(document):
    # Dispatch on the actual type rather than on a substring of str(block),
    # which only worked by accident of the default object repr.
    if isinstance(block, Paragraph):
        # Text of all bold runs; not used further in the visible script.
        runboldtext = ''.join(run.text for run in block.runs if run.bold)

        style = str(block.style.name)

        appendtxt = str(block.text).replace("\n","").replace("\r","")
        tabid = 'Novalue'
        # Lower-cased tokens of the paragraph; not used further in the visible script.
        paragraph_split = appendtxt.lower().split()

        for run in block.runs:
            xmlstr = str(run.element.xml)
            # Only runs whose XML contains picture markup need to be parsed.
            if 'pic:pic' not in xmlstr:
                continue
            xml_list.append(xmlstr)
            # Collect the prefix -> URI namespace map declared in this run's XML
            # so the prefixed paths below can be resolved.
            my_namespaces = dict(node for _, node in ElementTree.iterparse(StringIO(xmlstr), events=['start-ns']))
            root = ET.fromstring(xmlstr)
            for pic in root.findall('.//pic:pic', my_namespaces):
                cNvPr_elem = pic.find("pic:nvPicPr/pic:cNvPr", my_namespaces)
                name_attr = cNvPr_elem.get("name")
                blip_elem = pic.find("pic:blipFill/a:blip", my_namespaces)
                embed_attr = blip_elem.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")
                # The paragraph row points at the picture instead of carrying text.
                appendtxt = str('Document_Imagefile/' + name_attr + '/' + embed_attr + '/' + str(imagecounter))
                image_part = document.part.related_parts[embed_attr]
                image_base64 = base64.b64encode(image_part._blob).decode()
                image_rows.append({'image_index': imagecounter, 'image_rID': embed_attr, 'image_filename': name_attr, 'image_base64_string': image_base64})
                style = 'Novalue'
            # NOTE(review): the counter advances once per run containing picture
            # markup, not once per picture (kept as in the original) - confirm
            # that is intended for runs holding several pictures.
            imagecounter = imagecounter + 1

    elif isinstance(block, Table):
        style = 'Novalue'
        appendtxt = str(block)
        tabid = i
        dfs = read_docx_tables(tab_id=i)
        table_rows.append({'para_text': appendtxt, 'table_id': i, 'style': style})
        table_list.append(dfs)
        i=i+1
    else:
        # iter_block_items() yields only Paragraph and Table objects
        continue

    combined_rows.append({'para_text': appendtxt, 'table_id': tabid, 'style': style})

# Building from row lists yields a fresh 0..n-1 index, so no reset_index() is needed.
combined_df = pd.DataFrame(combined_rows, columns=['para_text','table_id','style'])
image_df = pd.DataFrame(image_rows, columns=['image_index','image_rID','image_filename','image_base64_string'])
if table_rows:
    # Same column set the repeated append() calls produced ('string_value' stays
    # empty); the index is now 0..n-1 instead of repeated 0 labels.
    table_mod = pd.DataFrame(table_rows, columns=['string_value','table_id','para_text','style'])

You can refer to this link for a detailed explanation of how the code works:

https://github.com/kmrambo/Python-docx-Reading-paragraphs-tables-and-images-in-document-order-

来源：https://stackoverflow.com/questions/46388408/working-with-tables-in-python-docx

标签

python-docx