Extract individual field from table image to excel with OCR

前端 未结 3 916
醉酒成梦
醉酒成梦 2020-12-09 23:25

I have scanned images which have tables as shown in this image:

I am trying to extract each box separately and perform OCR but when I try to detect horizont

相关标签:
3条回答
  • 2020-12-09 23:57

    nanthancy's answer is also accurate. I used the following script to get each box and sort the boxes by row and column.

    Note: Most of this code is from a medium blog by Kanan Vyas here: https://medium.com/coinmonks/a-box-detection-algorithm-for-any-image-containing-boxes-756c15d7ed26

    #most of this code is taken from a blog post by Kanan Vyas here: 
    #https://medium.com/coinmonks/a-box-detection-algorithm-for-any-image-containing-boxes-756c15d7ed26
    
    
    import cv2
    import numpy as np
    
    # Load the scan as a single-channel grayscale image (flag 0 == cv2.IMREAD_GRAYSCALE).
    img = cv2.imread('images/scan2.jpg', 0)
    # cv2.imread does not raise when the file is missing or unreadable; it
    # returns None, which would otherwise only surface later as an obscure
    # error inside cv2.threshold. Fail early with a clear message instead.
    if img is None:
        raise FileNotFoundError("could not read image 'images/scan2.jpg'")
    
    # Helper to show a numpy image with cv2 and close the window on any key press.
    def imshow(img, label='default'):
        """Display ``img`` in a window named ``label`` until a key is pressed.

        :param img: image handed straight to ``cv2.imshow``
        :param label: window name; defaults to ``'default'``
        """
        cv2.imshow(label, img)
        # Wait for a key press before tearing the window down.
        cv2.waitKey(0)
        # Note: this closes every OpenCV window, not only the one opened above.
        cv2.destroyAllWindows()
    
    # Thresholding the image
    # NOTE(review): THRESH_OTSU is set, so the explicit 250 is presumably
    # ignored in favour of the automatically chosen threshold -- confirm
    # against the OpenCV docs.
    (thresh, img_bin) = cv2.threshold(img, 250, 255,cv2.THRESH_BINARY|cv2.THRESH_OTSU)
    # Invert the binary image (255 - pixel) so dark strokes become non-zero.
    img_bin = 255-img_bin 

    # Kernel length: 1/80th of the image width (shape[1]), using integer division.
    kernel_length = np.array(img).shape[1]//80

    # A vertical kernel of (1 X kernel_length), which will detect all the vertical lines from the image.
    verticle_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_length))
    # A horizontal kernel of (kernel_length X 1), which will help to detect all the horizontal lines from the image.
    hori_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_length, 1))
    # A kernel of (3 X 3) ones.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))

    # Morphological operation to detect vertical lines from an image:
    # erode, then dilate, with the same vertical kernel and iteration count.
    img_temp1 = cv2.erode(img_bin, verticle_kernel, iterations=3)
    verticle_lines_img = cv2.dilate(img_temp1, verticle_kernel, iterations=3)
    #cv2.imwrite("verticle_lines.jpg",verticle_lines_img)

    # Morphological operation to detect horizontal lines from an image:
    # the same erode/dilate pair, this time with the horizontal kernel.
    img_temp2 = cv2.erode(img_bin, hori_kernel, iterations=3)
    horizontal_lines_img = cv2.dilate(img_temp2, hori_kernel, iterations=3)
    #cv2.imwrite("horizontal_lines.jpg",horizontal_lines_img)

    # Weighting parameters, this will decide the quantity of an image to be added to make a new image.
    alpha = 0.5

    beta = 1.0 - alpha

    # cv2.addWeighted blends the two line images with the weights above to get
    # a third image as their weighted sum.
    img_final_bin = cv2.addWeighted(verticle_lines_img, alpha, horizontal_lines_img, beta, 0.0)

    # Invert the blended image (~) and erode it with the 3x3 kernel.
    img_final_bin = cv2.erode(~img_final_bin, kernel, iterations=2)

    # Binarize the result again; note this overwrites `thresh` from the first
    # threshold call above.
    (thresh, img_final_bin) = cv2.threshold(img_final_bin, 128,255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    # Write the final binary image to the current working directory.
    cv2.imwrite("img_final_bin.jpg",img_final_bin)
    
    
    # Find contours for image, which will detect all the boxes.
    # cv2.findContours returns (contours, hierarchy) on OpenCV 2.x/4.x but
    # (image, contours, hierarchy) on OpenCV 3.x. The last two items are the
    # same in every version, so slice them off instead of unpacking a fixed
    # number of values (which raises ValueError on 3.x).
    contours, hierarchy = cv2.findContours(img_final_bin, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2:]

    # The triple-quoted string below is disabled code kept for reference; it is
    # a bare string expression and has no effect at runtime.
    """ this section saves each extracted box as a seperate image.
    idx = 0
    for c in contours:
        # Returns the location and width,height for every contour
        x, y, w, h = cv2.boundingRect(c)
        #only selecting boxes within certain width height range
        if (w > 10 and h > 15 and h < 50):
            idx += 1
            new_img = img[y:y+h, x:x+w]
            #cv2.imwrite("kanan/1/"+ "{}-{}-{}-{}".format(x, y, w, h) + '.jpg', new_img)
    """
    
    # Collect the distinct y-coordinates of all boxes so they can be grouped row-wise.
    def getsety(boxes):
        """Return the unique y values (index 1 of each box) in ascending order.

        :param boxes: iterable of indexable boxes whose element 1 is the y-coordinate
        :return: sorted list of the distinct y-coordinates
        """
        return sorted({box[1] for box in boxes})
    
    # Group boxes whose y lies within a tolerance of the given y: if the image
    # is tilted, boxes of the same row can have slightly different y values.
    def sort_boxes(boxes, y, row_column, *, min_width=10, min_height=15,
                   max_height=50, y_tolerance=7):
        """Append the row of boxes found around ``y`` to ``row_column``.

        A box ``(x, y, w, h)`` belongs to the row when its width is greater
        than ``min_width``, its height lies strictly between ``min_height`` and
        ``max_height``, and its y-coordinate is within ``y_tolerance`` of ``y``
        (inclusive on both sides).

        :param boxes: iterable of ``(x, y, w, h)`` boxes
        :param y: reference y-coordinate of the row being collected
        :param row_column: list of rows collected so far; mutated in place
        :param min_width: exclusive lower bound on box width (default 10)
        :param min_height: exclusive lower bound on box height (default 15)
        :param max_height: exclusive upper bound on box height (default 50)
        :param y_tolerance: allowed distance of a box's y from ``y`` (default 7)
        :return: the same ``row_column`` list, with the row appended unless an
            equal row was already present
        """
        row = [
            box for box in boxes
            if box[2] > min_width
            and min_height < box[3] < max_height
            and y - y_tolerance <= box[1] <= y + y_tolerance
        ]

        # Neighbouring y values usually select the same boxes; skip duplicates.
        # An empty row is stored too (at most once); callers filter it out.
        if row not in row_column:
            row_column.append(row)
        return row_column
    
    # Order the boxes inside every row by their x-coordinate (column-wise).
    def sortrows(rc):
        """Return a new list of rows, each sorted by the first element of its boxes.

        :param rc: list of rows, each row being a list of indexable boxes
        :return: new list with every row sorted ascending by ``box[0]``; the
            input rows are left untouched
        """
        return [sorted(row, key=lambda box: box[0]) for row in rc]
    
    # Bounding rectangle (x, y, w, h) of every detected contour. The original
    # snippet used `boundingBoxes` without ever defining it (NameError).
    boundingBoxes = [cv2.boundingRect(c) for c in contours]

    # Build one row per distinct y-coordinate; sort_boxes skips duplicate rows.
    row_column = []
    for row_y in getsety(boundingBoxes):
        row_column = sort_boxes(boundingBoxes, row_y, row_column)
    # Drop the empty row produced when no box passes the size filter.
    row_column = [row for row in row_column if row != []]

    # Final list of rows (a list of lists of boxes), rows top to bottom and
    # boxes left to right within each row.
    row_column = sortrows(row_column)
    
    
    

    I wrote this in a Jupyter notebook and copy-pasted it here; if any errors come up, let me know.

    Thank you everyone for answers

    0 讨论(0)
  • 2020-12-09 23:58

    This is a function that uses tesseract-ocr for layout detection. You can try different RIL levels and PSM values. For more details, have a look here: https://github.com/sirfz/tesserocr

    import os
    import platform
    from typing import List, Tuple
    
    from tesserocr import PyTessBaseAPI, iterate_level, RIL
    
    # Pick the platform's default tessdata location; only Linux and Windows are handled.
    system = platform.system()
    if system not in ('Linux', 'Windows'):
        raise NotImplementedError
    tessdata_folder_default = (
        r'C:\Program Files (x86)\Tesseract-OCR\tessdata' if system == 'Windows' else ''
    )

    # The tesseract-specific TESSDATA_PREFIX environment variable takes
    # precedence over the platform default. This matters most on Windows,
    # where we don't know whether a 32-bit or 64-bit tesseract is installed.
    tessdata_folder = os.getenv('TESSDATA_PREFIX', tessdata_folder_default)
    
    
    def get_layout_boxes(input_image,  # PIL image object
                         level: RIL,
                         include_text: bool,
                         include_boxes: bool,
                         language: str,
                         psm: int,
                         tessdata_path='') -> List[Tuple]:
        """
        Run tesseract layout analysis on an image and collect its components.

        :param input_image: input PIL image
        :param level: page iterator level, please see "RIL" enum
        :param include_text: if True, each result carries the recognized text
        :param include_boxes: if True, each result carries the box coordinates
        :param language: language for OCR
        :param psm: page segmentation mode (PSM.AUTO is 3); this function has
            no default for it, so it must always be passed
        :param tessdata_path: the path to the tessdata folder; when empty, the
            module-level ``tessdata_folder`` is used
        :return: list of ``(box, text)`` tuples, e.g.
            ``[((x1, y1, x2, y2), text), ...]``; ``box`` is None when
            ``include_boxes`` is False and ``text`` is None when
            ``include_text`` is False. If a RuntimeError is raised while
            processing, a hint is printed and None is returned implicitly.
        """
        assert any((include_text, include_boxes)), (
            'Both include_text and include_boxes can not be False.')

        # Fall back to the module-level default when no explicit path is given.
        tessdata_path = tessdata_path or tessdata_folder

        try:
            with PyTessBaseAPI(path=tessdata_path, lang=language) as api:
                api.SetImage(input_image)

                api.SetPageSegMode(psm)
                api.Recognize()

                results = []
                for element in iterate_level(api.GetIterator(), level):
                    coords = element.BoundingBox(level)
                    # Elements without a bounding box are skipped entirely.
                    if coords is None:
                        continue
                    text = element.GetUTF8Text(level) if include_text else None
                    results.append((coords if include_boxes else None, text))
                return results
        except RuntimeError:
            print('Please specify correct path to tessdata.')
    
    0 讨论(0)
  • 2020-12-10 00:00

    You're on the right track. Here's a continuation of your approach with slight modifications. The idea is:

    1. Obtain binary image. Load image, convert to grayscale, and Otsu's threshold.

    2. Remove all character text contours. We create a rectangular kernel and perform opening to only keep the horizontal/vertical lines. This will effectively make the text into tiny noise so we find contours and filter using contour area to remove them.

    3. Repair horizontal/vertical lines and extract each ROI. We morph close to fix any broken lines and smooth the table. From here we find contours and sort the box field contours using imutils.contours.sort_contours() with the top-to-bottom parameter. Next we filter using contour area, then extract each ROI.


    Here's a visualization of each box field and the extracted ROI

    Code

    import cv2
    import numpy as np
    from imutils import contours
    
    # Load image, grayscale, Otsu's threshold
    image = cv2.imread('1.jpg')
    # cv2.imread returns None instead of raising when the file cannot be read;
    # fail here with a clear message rather than on image.copy() below.
    if image is None:
        raise FileNotFoundError("could not read image '1.jpg'")
    # Keep an untouched copy: `image` is drawn on later, ROIs are cut from `original`.
    original = image.copy()
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Inverted binary image; index [1] selects the thresholded image from the
    # (threshold value, image) pair that cv2.threshold returns.
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    
    # Remove text characters with morph open and contour filtering
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)
    cnts = cv2.findContours(opening, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns 2 or 3 values depending on the OpenCV version;
    # pick the contour list in either case.
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    for c in cnts:
        area = cv2.contourArea(c)
        # Contours with an area below 500 are treated as leftover text/noise
        # and painted over in black, in place, on `opening`.
        if area < 500:
            cv2.drawContours(opening, [c], -1, (0,0,0), -1)
    
    # Repair table lines, sort contours, and extract ROI
    # Morphological close on the cleaned image, then invert it (255 - pixel).
    close = 255 - cv2.morphologyEx(opening, cv2.MORPH_CLOSE, kernel, iterations=1)
    cnts = cv2.findContours(close, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    # Same version-agnostic selection of the contour list as above.
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    # Order the contours from the top of the image to the bottom.
    (cnts, _) = contours.sort_contours(cnts, method="top-to-bottom")
    for c in cnts:
        area = cv2.contourArea(c)
        # Only contours with an area below 25000 are kept -- presumably to skip
        # the outer table/page contour; TODO confirm the limit for other scans.
        if area < 25000:
            x,y,w,h = cv2.boundingRect(c)
            # Mark the field on `image` as a filled rectangle (thickness -1);
            # the ROI itself is cut from the untouched `original`.
            cv2.rectangle(image, (x, y), (x + w, y + h), (36,255,12), -1)
            ROI = original[y:y+h, x:x+w]

            # Visualization
            cv2.imshow('image', image)
            cv2.imshow('ROI', ROI)
            # Short 20 ms pause so each field is shown briefly in turn.
            cv2.waitKey(20)
    
    # Show the intermediate and final images, then wait for a key press
    # before the script ends.
    cv2.imshow('opening', opening)
    cv2.imshow('close', close)
    cv2.imshow('image', image)
    cv2.waitKey()
    
    0 讨论(0)
提交回复
热议问题