pdf_to_txt.py
PDF to TXT
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 20 00:59:23 2019

@author: Dikesh Faldu

Convert a PDF file into a list of text lines.

The PDF is first rendered to XML with pdfminer, the XML is parsed into a
DataFrame holding one row per word, words are grouped into lines by their
vertical position, and finally the words of each line are joined.
"""
import re
from io import BytesIO

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bsoup
from pdfminer.converter import XMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage

# Fallback character height used when the median cannot be computed.
_DEFAULT_CHAR_HEIGHT = 15

# Value given to the "distance to previous word" of the first word of a page
# (it has no previous word), so that it always starts a new line.
_LINE_START_SENTINEL = 100000000

# Clean-up patterns applied, in this order, to every extracted word.
_WHITESPACE_RUN = re.compile(r'\s+')
_FILLER_PATTERNS = (
    re.compile(r'^_+$'),        # words made only of underscores
    re.compile(r'^[\-]{2,}$'),  # words made only of two or more dashes
    re.compile(r'^\*+$'),       # words made only of asterisks
)


def pdf_to_xml_converter(pdf_file_path):
    """
    Convert the PDF at ``pdf_file_path`` to XML using pdfminer.

    The conversion is attempted at most ``RETRY_THRESHOLD`` times; the error
    of every failed attempt is printed.

    Returns the XML as UTF-8 encoded bytes, or ``None`` when every attempt
    failed.
    """
    RETRY_THRESHOLD = 3
    for _attempt in range(RETRY_THRESHOLD):
        try:
            rsrcmgr = PDFResourceManager()
            # The context managers release the file and the buffer even when
            # the conversion raises.
            with open(pdf_file_path, 'rb') as fp, BytesIO() as retstr:
                device = XMLConverter(rsrcmgr, retstr, codec='utf-8',
                                      laparams=LAParams())
                try:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    pages = PDFPage.get_pages(fp, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=True)
                    for page in pages:
                        interpreter.process_page(page)
                    # Return on the first success instead of looping again.
                    # NOTE(review): as in the original code, the buffer is
                    # read before device.close(); anything the converter
                    # writes on close is presumably not part of the returned
                    # XML - confirm this is intended.
                    return retstr.getvalue()
                finally:
                    device.close()
        except Exception as e:
            # Best-effort: report the failure and retry.
            print(str(e))
    return None


def _clean_word(text):
    """Collapse whitespace runs and blank out pure filler words."""
    text = _WHITESPACE_RUN.sub(' ', text)
    for pattern in _FILLER_PATTERNS:
        text = pattern.sub('', text)
    return text


def _extract_page_words(page):
    """
    Collect the words of one ``<page>`` element.

    Characters (``<text>`` elements carrying attributes) are concatenated
    into a word until a ``<text>`` element without attributes is met, which
    closes the word.

    Returns five parallel lists ``(x, y, w, h, word)``. The coordinates are
    the raw comma separated components 0, 1, 2 and 3 of the ``bbox``
    attribute, kept as strings (presumably left, bottom, right and top -
    confirm against the pdfminer XML format).
    """
    xs, ys, ws, hs, words = [], [], [], [], []
    for textbox in page.find_all('textbox'):
        for textline in textbox.find_all('textline'):
            chars = []
            in_word = False
            word_x = word_y = word_w = word_h = ""
            for txt in textline.find_all('text'):
                if not txt.attrs:
                    # An attribute-less element closes the current word. When
                    # no word is open it is skipped (the original code raised
                    # KeyError here and aborted the whole parse).
                    if in_word:
                        xs.append(word_x)
                        ys.append(word_y)
                        ws.append(word_w)
                        hs.append(word_h)
                        words.append("".join(chars))
                        chars = []
                        in_word = False
                    continue
                coords = txt['bbox'].split(',')
                if not in_word:
                    in_word = True
                    # The first character fixes the word's x coordinate.
                    word_x = coords[0]
                chars.append(str(txt.get_text()))
                # The remaining coordinates follow the latest character.
                word_y, word_w, word_h = coords[1], coords[2], coords[3]
            if in_word:
                # The line ended while a word was still open: the word is
                # stored with the bounding box of the whole text line.
                # NOTE(review): this may misplace the last word of a line
                # holding several words - confirm whether the word's own
                # box was intended.
                line_coords = textline['bbox'].split(',')
                xs.append(line_coords[0])
                ys.append(line_coords[1])
                ws.append(line_coords[2])
                hs.append(line_coords[3])
                words.append("".join(chars))
    return xs, ys, ws, hs, words


def xml_parser_to_df(xml_str):
    """
    Parse the XML produced by ``pdf_to_xml_converter`` into a DataFrame.

    The result has one row per word with the columns ``x``, ``y``, ``w``,
    ``h``, ``page_number``, ``output`` (the word text) and ``line_number``.
    Within every page the ``y``/``h`` values are mirrored
    (``h_max - value + y_min``, with ``y`` and ``h`` swapped), and words that
    are empty after clean-up are dropped.

    Any error is printed and whatever was built so far is returned; when
    nothing could be parsed the result is an empty DataFrame.
    """
    df = pd.DataFrame()
    try:
        soup = bsoup(xml_str, 'xml')
        page_frames = []
        for page in soup.find_all('page'):
            x, y, w, h, word = _extract_page_words(page)
            if not word:
                print("Empty Page or Error at page number : " + str(page['id']))
                continue

            df_page = pd.DataFrame({"x": x, "y": y, "w": w, "h": h}).astype(float)
            df_page["page_number"] = pd.to_numeric(page['id'])
            df_page["output"] = [_clean_word(text) for text in word]

            # Mirror the vertical coordinates inside the page's own range.
            y_min = df_page['y'].min()
            h_max = df_page['h'].max()
            mirrored_y = h_max - df_page['h'] + y_min
            mirrored_h = h_max - df_page['y'] + y_min
            df_page['y'] = mirrored_y
            df_page['h'] = mirrored_h
            page_frames.append(df_page)

        if not page_frames:
            return df

        # DataFrame.append was removed in pandas 2.0; concatenate once.
        df = pd.concat(page_frames, ignore_index=True)
        df['output'] = df['output'].str.strip()
        df = df[df['output'] != ""].reset_index(drop=True)

        # calculate line number
        df = add_line_number(df)
    except Exception as e:
        # Top-level boundary of the parser: report and return what we have.
        print("Error in parser" + str(e))
    return df


def computing_median_height(dataframe):
    """
    Return the median word height (``h - y``) of ``dataframe`` as an int.

    Falls back to the default height of 15 when the frame is empty or the
    median cannot be computed (the error is printed in the latter case).
    """
    avg_height = _DEFAULT_CHAR_HEIGHT
    try:
        heights = (dataframe['h'] - dataframe['y']).to_numpy()
        if heights.size:
            avg_height = int(np.median(heights))
    except (KeyError, TypeError, ValueError) as ve:
        print(str(ve))
    return avg_height


def compute_median(row):
    """
    Return the integer mid-point of the ``h`` and ``y`` values of ``row``.

    On failure the error is printed and ``None`` is returned.
    """
    try:
        return int((row['h'] + row['y']) / 2)
    except Exception as ke:
        print(str(ke))
        return None


def add_blank_line(row, median_char_height):
    """
    Prefix ``row['output']`` with a newline when ``row['Mid_Point_diff']``
    exceeds ``median_char_height + 2``, then return the row.
    """
    if row['Mid_Point_diff'] > median_char_height + 2:
        row['output'] = "\n" + str(row['output'])
    return row


def compute_line_number(pageDF):
    """
    Assign a ``line_number`` to every word of a single page.

    Words are sorted by ``page_number``, ``y`` and ``x``. A word starts a new
    line when the vertical mid-point of its box lies more than half the
    median character height below the previous word's mid-point. Words whose
    distance exceeds the median height plus 2 additionally get a newline
    prefixed to ``output``. The first word always does both.

    The last line of the page is removed (the original code treats it as the
    page number), so a page with a single line yields no rows.

    Returns a new DataFrame sorted by ``line_number`` and ``x``; ``pageDF``
    itself is not modified.
    """
    if pageDF.empty:
        return pageDF.assign(line_number=pd.Series(dtype=int))

    # calculating median char height
    median_char_height = computing_median_height(pageDF)

    pageDF = pageDF.sort_values(['page_number', 'y', 'x'],
                                ascending=[True, True, True])
    pageDF = pageDF.reset_index(drop=True)

    # Vertical mid-point of every word and its distance to the previous one.
    mid_point = ((pageDF['h'] + pageDF['y']) / 2).astype(int)
    mid_point_diff = mid_point.diff()
    # The first word has no predecessor. Assigning on the standalone Series
    # avoids the chained assignment of the original code, which does not
    # reliably write through to the frame.
    mid_point_diff.iloc[0] = _LINE_START_SENTINEL

    starts_line = mid_point_diff.astype(int) > int(median_char_height / 2)
    starts_line.iloc[0] = True
    pageDF['line_number'] = starts_line.cumsum()

    # Large vertical gaps become a blank line in front of the word.
    needs_blank_line = mid_point_diff > median_char_height + 2
    pageDF.loc[needs_blank_line, 'output'] = (
        "\n" + pageDF.loc[needs_blank_line, 'output'].astype(str)
    )

    pageDF = pageDF.sort_values(['line_number', 'x'], ascending=[True, True])

    # Removing the page number at the end of every page
    last_line = pageDF['line_number'].max()
    return pageDF[pageDF['line_number'] < last_line]


def add_line_number(df):
    """
    Add line numbers to the words of every page of ``df``.

    Pages are processed independently, in ascending ``page_number`` order,
    by ``compute_line_number``. Returns an empty DataFrame when ``df`` holds
    no pages.
    """
    if 'page_number' not in df.columns:
        return pd.DataFrame()
    page_frames = [
        compute_line_number(df[df['page_number'] == page_num])
        for page_num in sorted(df['page_number'].unique())
    ]
    if not page_frames:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concatenate once.
    return pd.concat(page_frames)


def combine_line(line_df):
    """Join the ``output`` values of ``line_df`` with single spaces."""
    return " ".join(line_df['output'])


def convert_pdf_to_txt(pdf_file_path):
    """
    Convert the PDF at ``pdf_file_path`` to a list of text lines.

    Returns one string per detected line, ordered by page number and line
    number, or an empty list when nothing could be extracted. (The original
    code built this list but never returned it.)
    """
    ## Convert PDf file to XML
    xml_str = pdf_to_xml_converter(pdf_file_path)

    ## Parse the XMl through parser and then calculate the line number in each page
    df = xml_parser_to_df(xml_str)
    if df.empty or 'line_number' not in df.columns:
        return []

    ## Sort values by page number then line number and then x cordinate of the word
    df = df.sort_values(['page_number', 'line_number', 'x'],
                        ascending=[True, True, True])

    ## Combine the same line data; selecting only "output" keeps the grouping
    ## columns out of the frames handed to combine_line.
    lines = df.groupby(["page_number", "line_number"])[["output"]].apply(combine_line)

    ## make a list of the each line
    return list(lines)
pdf_to_txt.py.txt · Last modified: 2022/03/04 10:31 by 127.0.0.1