User Tools

Site Tools


pdf_to_txt.py

PDF to TXT

# -*- coding: utf-8 -*-
"""
Created on Sun Jan 20 00:59:23 2019

@author: Dikesh Faldu
"""

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import XMLConverter
from bs4 import BeautifulSoup as bsoup
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO
import pandas as pd
import numpy as np
import re


def pdf_to_xml_converter(pdf_file_path):
    """
        Convert a PDF file to pdfminer's XML representation.

        The conversion is attempted up to RETRY_THRESHOLD times and stops at the
        first successful attempt. Errors are printed and swallowed (best effort),
        so callers must be prepared for a None result.

        Args:
            pdf_file_path: path of the PDF file to read.

        Returns:
            The content of the output buffer as bytes (written by XMLConverter
            with the 'utf-8' codec), or None if every attempt failed.
    """
    RETRY_THRESHOLD = 3
    xml_str = None

    for _attempt in range(RETRY_THRESHOLD):
        try:
            # Context managers release the file handle and the buffer even when
            # pdfminer raises in the middle of a document.
            with open(pdf_file_path, 'rb') as fp, BytesIO() as retstr:
                rsrcmgr = PDFResourceManager()
                device = XMLConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
                try:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    pages = PDFPage.get_pages(
                        fp,
                        set(),
                        maxpages=0,
                        password="",
                        caching=True,
                        check_extractable=True,
                    )
                    for page in pages:
                        interpreter.process_page(page)
                finally:
                    # Close the device before reading the buffer so that anything
                    # the converter emits on close is part of the result.
                    # NOTE(review): pdfminer's XMLConverter writes its closing
                    # footer in close() -- confirm for the installed version.
                    device.close()
                xml_str = retstr.getvalue()
            # Success: do not repeat the (expensive) conversion.
            break
        except Exception as e:
            print(str(e))

    return xml_str

def _clean_word(text):
    """
        Collapse whitespace runs to one space and blank out words made only of
        underscores, two or more dashes, or asterisks.
    """
    text = re.sub(r'\s+', ' ', text)
    for pattern in (r'^_+$', r'^[\-]{2,}$', r'^\*+$'):
        text = re.sub(pattern, '', text)
    return text


def _words_from_textline(tl):
    """
        Split one <textline> element into words.

        Consecutive <text> children that carry attributes are concatenated into
        one word; a <text> child without attributes ends the current word.

        Returns a list of (x, y, w, h, text) tuples whose coordinates are the raw
        strings taken from the 'bbox' attributes (fields 0, 1, 2 and 3).
    """
    words = []
    comb = ""
    word_x = word_y = word_w = word_h = ""
    in_word = False

    for txt in tl.findAll('text'):
        if not txt.attrs:
            # An attribute-less <text> is a word separator. If no word is in
            # progress it is skipped; previously this case raised KeyError on
            # 'bbox' and aborted the whole parse.
            if in_word:
                words.append((word_x, word_y, word_w, word_h, comb))
                comb = ""
                in_word = False
            continue

        bbox = txt['bbox'].split(',')
        if not in_word:
            in_word = True
            # The left edge comes from the first character of the word only.
            word_x = bbox[0]
        comb += str(txt.get_text())
        # The remaining fields always reflect the most recent character.
        word_y, word_w, word_h = bbox[1], bbox[2], bbox[3]

    if in_word:
        # The line ended without a separator: the pending word is stored with
        # the bbox of the whole text line (behaviour kept from the original).
        # NOTE(review): using the word's own bbox may be more accurate -- confirm.
        line_bbox = tl['bbox'].split(',')
        words.append((line_bbox[0], line_bbox[1], line_bbox[2], line_bbox[3], comb))

    return words


def xml_parser_to_df(xml_str):
    """
        Parse the XML produced from a PDF into a DataFrame with one row per word.

        Args:
            xml_str: XML document (string or bytes) containing <page>, <textbox>,
                <textline> and <text> elements.

        Returns:
            DataFrame with columns x, y, w, h (floats taken from bbox fields
            0..3, with y and h mirrored per page, see below), page_number,
            output (the word text) plus whatever add_line_number adds.
            Parsing is best effort: on any error the message is printed and the
            frame built so far (an empty frame if nothing was finished) is
            returned.
    """
    df = pd.DataFrame()
    try:
        # loading xml string into bsoup
        soup = bsoup(xml_str, 'xml')
        page_frames = []

        for page in soup.findAll('page'):
            rows = [
                row
                for tb in page.findAll('textbox')
                for tl in tb.findAll('textline')
                for row in _words_from_textline(tl)
            ]
            if not rows:
                print("Empty Page or Error at page number : "+ str(page['id']))
                continue

            df_page = pd.DataFrame(rows, columns=['x', 'y', 'w', 'h', 'output'])
            df_page['output'] = [_clean_word(text) for text in df_page['output']]
            df_page['page_number'] = pd.to_numeric(page['id'])
            df_page[['x', 'y', 'w', 'h']] = df_page[['x', 'y', 'w', 'h']].astype(float)
            df_page = df_page[['x', 'y', 'w', 'h', 'page_number', 'output']]

            # Mirror the vertical axis inside [y_min, h_max] and swap the roles
            # of y and h, so larger original coordinates become smaller ones.
            # Presumably converts bottom-up PDF coordinates to top-down reading
            # order -- TODO confirm.
            y_min = df_page['y'].min()
            h_max = df_page['h'].max()
            original_y = df_page['y'].copy()
            df_page['y'] = h_max - df_page['h'] + y_min
            df_page['h'] = h_max - original_y + y_min

            page_frames.append(df_page)

        if not page_frames:
            # Nothing to post-process: no page produced any word.
            return df

        # DataFrame.append was removed in pandas 2.0; concatenate once instead.
        df = pd.concat(page_frames, ignore_index=True)

        # Drop words that are empty after trimming.
        df['output'] = df['output'].str.strip()
        df = df[df['output'] != ""].reset_index(drop=True)

        # calculate line number
        df = add_line_number(df)

    except Exception as e:
        print("Error in parser"+str(e))

    return df

def computing_median_height(dataframe):
    """
        Return the median of (h - y) over all rows, truncated to int.

        If the value cannot be computed (missing columns, empty frame, ...)
        the error is printed and the default height 15 is returned.
        The frame passed in is not modified.
    """
    default_height = 15  # default value fixed by the Quadtatyx team
    try:
        frame = dataframe.reset_index(drop=True)
        heights = (frame['h'] - frame['y']).tolist()
        return int(np.median(heights))
    except Exception as err:
        print(str(err))
        return default_height

def compute_median(row):
    """
        Return the midpoint between the row's h and y values, truncated to int.

        On any error the message is printed and None is returned.
    """
    try:
        midpoint = (row['h'] + row['y']) / 2
        return int(midpoint)
    except Exception as err:
        print(str(err))
        return None

def add_blank_line(row,median_char_height):
    """
        Prefix the row's output with a newline when its 'Mid_Point_diff' exceeds
        the median character height by more than 2.

        The row is modified in place and also returned.
    """
    gap_limit = median_char_height + 2
    if row['Mid_Point_diff'] > gap_limit:
        row['output'] = "\n" + str(row['output'])
    return row


def compute_line_number(pageDF):
    """
        Assign a line number to every word of one page.

        Words are sorted by (page_number, y, x). A new line starts whenever the
        vertical midpoint of a word is more than half the median character
        height below the midpoint of the previous word. Words whose midpoint
        gap exceeds the median height + 2 additionally get a leading newline
        in 'output' (see add_blank_line).

        Args:
            pageDF: frame holding the words of a single page, with columns
                x, y, h, page_number and output.

        Returns:
            A new frame sorted by (line_number, x) with a 'line_number' column
            (numbering starts at 1). The rows of the highest line number are
            removed; an empty input yields an empty frame with the extra column.
            NOTE(review): the last line is dropped unconditionally on the
            assumption that it is the printed page number, so a page with a
            single line loses all its rows -- confirm this is intended.
    """
    # Marker stored as the "gap" of every word that starts a new line.
    NEW_LINE_MARKER = 100000000

    if pageDF.empty:
        # Nothing to number; previously this raised IndexError on .iloc[0].
        pageDF = pageDF.copy()
        pageDF['line_number'] = pd.Series(dtype='int64')
        return pageDF

    # calculating median char height
    median_char_height = computing_median_height(pageDF)
    # Sorting by page number, y and x, then renumbering the rows
    pageDF = pageDF.sort_values(['page_number', 'y', 'x'], ascending=[True, True, True])
    pageDF = pageDF.reset_index(drop=True)
    # vertical midpoint of each word
    pageDF["median_word_y_coOrdinate"] = pageDF.apply(compute_median, axis=1)
    # gap between the midpoint of each word and that of the previous word
    mid_point_diff = pageDF['median_word_y_coOrdinate'] - pageDF['median_word_y_coOrdinate'].shift(1)
    # The very first word has no predecessor, so it always starts a line.
    # Assigned on the standalone Series: the former chained assignment
    # (pageDF[col].iloc[0] = ...) does not update the frame under pandas
    # Copy-on-Write.
    mid_point_diff.iloc[0] = NEW_LINE_MARKER
    # Raw gaps are kept for add_blank_line, integer gaps for line detection
    pageDF['Mid_Point_diff'] = mid_point_diff
    pageDF['diff_of_midian'] = mid_point_diff.astype(int)
    # every gap larger than half the median height marks the start of a line
    pageDF.loc[pageDF['diff_of_midian'] > int(median_char_height / 2), 'diff_of_midian'] = NEW_LINE_MARKER
    # running count of line starts == line number
    pageDF['line_number'] = (pageDF.diff_of_midian == NEW_LINE_MARKER).cumsum()
    pageDF = pageDF.sort_values(['line_number', 'x'], ascending=[True, True])

    pageDF = pageDF.apply(lambda row: add_blank_line(row, median_char_height), axis=1)

    # dropping helper columns
    del pageDF['median_word_y_coOrdinate'], pageDF['diff_of_midian'], pageDF['Mid_Point_diff']

    ## Removing the page number at the end of every page
    last_line = pageDF['line_number'].max()
    pageDF = pageDF[pageDF['line_number'] < last_line]

    return pageDF

def add_line_number(df):
    """
        Add a per-page 'line_number' column to the word frame.

        The frame is split by 'page_number' (ascending), each page is passed to
        compute_line_number, and the results are concatenated with their
        original index labels.

        Args:
            df: word frame containing a 'page_number' column.

        Returns:
            The concatenated per-page frames, or an empty DataFrame when df
            contains no rows.
    """
    page_frames = [
        compute_line_number(pageDF)
        for _, pageDF in df.groupby('page_number', sort=True)
    ]
    if not page_frames:
        # pd.concat raises on an empty list; keep the old "empty frame" result.
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(page_frames)

def combine_line(line_df):
    """
        Join the 'output' values of the given rows into one string,
        separated by single spaces.
    """
    words = line_df['output']
    return " ".join(word for word in words)


def convert_pdf_to_txt(pdf_file_path):
    """
        Extract the text of a PDF file as a list of lines.

        Args:
            pdf_file_path: path of the PDF file to convert.

        Returns:
            A list of strings, one per detected text line, ordered by page
            number and line number; the words of a line are ordered by x and
            joined with spaces. An empty list is returned when the PDF could
            not be converted or no text lines were found (previously this
            case crashed with a KeyError while sorting).
    """
    ## Convert PDF file to XML
    xml_str = pdf_to_xml_converter(pdf_file_path)
    if xml_str is None:
        # Conversion failed on every attempt; there is nothing to parse.
        return []

    ## Parse the XML and calculate the line number of every word in each page
    df = xml_parser_to_df(xml_str)
    if df.empty or 'line_number' not in df.columns:
        # The parser is best effort and may return a frame without line numbers.
        return []

    ## Sort values by page number, then line number and then x coordinate of the word
    df = df.sort_values(['page_number', 'line_number', 'x'], ascending=[True, True, True])

    ## Combine the words of each line; only the 'output' column is handed to
    ## combine_line so the grouping columns are not part of the applied frame.
    lines = df.groupby(["page_number", "line_number"])[['output']].apply(combine_line)

    ## make a list of each line
    return list(lines)

pdf_to_txt.py.txt · Last modified: 2022/03/04 10:31 (external edit)