# -*- coding: utf-8 -*-
"""
Created on Sat Feb 9 15:00:51 2019

@author: Dikesh Faldu

Modified 2019-10-27 by JET to correct headers with contingent effective
dates and repeals
"""

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import XMLConverter
from bs4 import BeautifulSoup as bsoup
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO
import pandas as pd
import numpy as np
import re
import os
import urllib
import unidecode


def pdf_to_xml_converter(pdf_file_path):
    """
    Converts the PDF at pdf_file_path to pdfminer XML and returns the XML
    as a byte string. Returns None if conversion still fails after
    RETRY_THRESHOLD attempts.
    """
    RETRY_THRESHOLD = 3
    # xml_file_path = re.sub("\.pdf|\.PDF", ".xml", pdf_file_path)
    xml_str = None
    retry = 0
    while retry < RETRY_THRESHOLD:
        try:
            rsrcmgr = PDFResourceManager()
            retstr = BytesIO()
            codec = 'utf-8'
            laparams = LAParams()
            device = XMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
            fp = open(pdf_file_path, 'rb')
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            password = ""
            maxpages = 0
            caching = True
            pagenos = set()
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
            xml_str = retstr.getvalue()
            # Closing resources
            fp.close()
            device.close()
            retstr.close()
            # If xml file path is given then write to that
            break  # conversion succeeded, stop retrying
        except Exception as e:
            print(str(e))
            retry += 1
    return xml_str


def xml_parser_to_df(xml_str):
    """
    Accepts an XML string produced by pdf_to_xml_converter and returns a
    dataframe with one row per word (bounding box, page number, line number
    and text).
    """
    # print("xml_parser_to_df method start")
    df = pd.DataFrame()
    try:
        # Loading xml string into bsoup
        XmlSoupElement = bsoup(xml_str, 'xml')
        # Iterating through each page
        for page in XmlSoupElement.findAll('page'):
            x = []
            y = []
            w = []
            h = []
            word = []
            # Iterating through all textboxes in the page
            for tb in page.findAll('textbox'):
                if len(tb.findAll('textline')) == 0:
                    continue
                for tl in tb.findAll('textline'):
                    if len(tl.findAll('text')) == 0:
                        continue
                    comb = ""
                    word_begin = 0
                    word_x = ""
                    word_y = ""
                    word_h = ""
                    word_w = ""
                    flag = 0
                    for txt in tl.findAll('text'):
                        txt_attrs = dict(txt.attrs)
                        if (word_begin == 0):
                            # First character of a new word: start the word
                            # and record its bounding box
                            flag = 1
                            word_begin = 1
                            # comb += str(txt.get_text().encode('utf-8'))
                            comb += str(txt.get_text())
                            word_x = txt_attrs['bbox'].split(',')[0]
                            word_w = txt['bbox'].split(',')[2]
                            word_y = txt['bbox'].split(',')[1]
                            word_h = txt['bbox'].split(',')[3]
                        elif (word_begin == 1 and txt_attrs != {}):
                            # Character continues the current word: append it
                            # and extend the bounding box
                            # comb += str(txt.get_text().encode('utf-8'))
                            comb += str(txt.get_text())
                            word_w = txt['bbox'].split(',')[2]
                            word_y = txt['bbox'].split(',')[1]
                            word_h = txt['bbox'].split(',')[3]
                        elif (word_begin == 1 and txt_attrs == {}):
                            # Whitespace (no bbox attributes) ends the word
                            flag = 0
                            word_begin = 0
                            x.append(word_x)
                            y.append(word_y)
                            w.append(word_w)
                            h.append(word_h)
                            word.append(comb)
                            word_x = ""
                            word_y = ""
                            word_h = ""
                            word_w = ""
                            comb = ""
                    if (flag == 1):
                        # Flush the last word of the textline
                        x.append(tl['bbox'].split(',')[0])
                        y.append(tl['bbox'].split(',')[1])
                        w.append(tl['bbox'].split(',')[2])
                        h.append(tl['bbox'].split(',')[3])
                        word.append(comb)
            df_page = pd.DataFrame()
            df_page["x"] = x
            df_page["y"] = y
            df_page["w"] = w
            df_page["h"] = h
            df_page["page_number"] = page['id']
            word = [re.sub(r'\s+', ' ', words) for words in word]
            word = [re.sub(r'^_+$', '', words) for words in word]
            word = [re.sub(r'^[\-]{2,}$', '', words) for words in word]
            word = [re.sub(r'^\*+$', '', words) for words in word]
            df_page["output"] = word
            df_page[['x', 'y', 'w', 'h']] = df_page[['x', 'y', 'w', 'h']].astype(float)
            df_page[['page_number']] = df_page[['page_number']].apply(pd.to_numeric)
            try:
                # PDF coordinates have their origin at the bottom-left, so
                # flip y/h to read top-down
                y_min = min(df_page['y'])
                h_max = max(df_page['h'])
                df_page_temp = df_page.copy(deep=True)
                df_page.loc[:, 'y'] = h_max - df_page.loc[:, 'h'] + y_min
                df_page.loc[:, 'h'] = h_max - df_page_temp.loc[:, 'y'] + y_min
                df_page = df_page.reset_index(drop=True)
                df = df.append(df_page)
                df = df.reset_index(drop=True)
            except Exception:
                print("Empty page or error at page number: " + str(page['id']))
        df['output'] = list(df['output'].str.strip())
        # Dropping rows whose output is empty after cleaning
        rr = df[df['output'].str.contains("^$") == True]
        df = df.drop(df.index[list(rr.index.values)])
        df = df.reset_index(drop=True)
        # Calculate line number
        df = add_line_number(df)
    except Exception as e:
        print("Error in parser: " + str(e))
    return df
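# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original script): the PDF path
# and the column selection below are placeholder assumptions. It shows the
# intended pipeline of the two functions above: convert a PDF to a pdfminer
# XML byte string, then parse that string into a word-level dataframe.
#
#     xml_bytes = pdf_to_xml_converter("sample_statute.pdf")
#     if xml_bytes is not None:
#         words_df = xml_parser_to_df(xml_bytes)
#         print(words_df[['page_number', 'line_number', 'x', 'y', 'output']].head())
# ---------------------------------------------------------------------------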
def computing_median_height(dataframe):
    """
    Calculates the median character height of the words in the dataframe;
    15 is assumed as the default height if the calculation fails.
    """
    avg_height = 15  # this is the default value fixed by the Quadtatyx team
    try:
        dataframe = dataframe.reset_index(drop=True)
        dataframe['height'] = dataframe['h'] - dataframe['y']
        avg_height = int(np.median(dataframe.height.tolist()))
    except Exception as ve:
        print(str(ve))
    return avg_height


def compute_median(row):
    """
    Calculates the vertical midpoint of a word from its h and y coordinates.
    """
    try:
        return (int((row['h'] + row['y']) / 2))
    except Exception as ke:
        print(str(ke))


def add_blank_line(row, median_char_height):
    # Prepend a newline when the gap to the previous word's midpoint is
    # larger than one character height, i.e. a blank line separates them
    diff_of_midian = row['Mid_Point_diff']
    if (diff_of_midian > median_char_height + 2):
        row['output'] = "\n" + str(row['output'])
    return row


def compute_line_number(pageDF):
    """
    Calculates the line number for each word of the single-page dataframe
    given as an argument.
    """
    # Calculating the median character height
    median_char_height = computing_median_height(pageDF)
    # Sorting by page number, y and x
    pageDF = pageDF.sort_values(['page_number', 'y', 'x'], ascending=[True, True, True])
    # Resetting index
    pageDF = pageDF.reset_index(drop=True)
    # Calculating the vertical midpoint of each word
    pageDF["median_word_y_coOrdinate"] = pageDF.apply(compute_median, axis=1)
    # Calculating the difference between each word's midpoint and the
    # previous word's midpoint
    pageDF["diff_of_midian"] = pageDF['median_word_y_coOrdinate'] - pageDF['median_word_y_coOrdinate'].shift(1)
    # There is no previous word for the very first word, hence we add a big number
    pageDF.loc[0, "diff_of_midian"] = 100000000
    # Copy these values to another column for further use
    pageDF['Mid_Point_diff'] = pageDF["diff_of_midian"]
    # Changing to int
    pageDF['diff_of_midian'] = pageDF['diff_of_midian'].astype(int)
    # Putting a very large number wherever the difference exceeds half of the
    # median character height (i.e. the word starts a new line)
    pageDF.loc[pageDF['diff_of_midian'] > int(median_char_height / 2), 'diff_of_midian'] = 100000000
    # Incrementally numbering the lines
    pageDF['line_number'] = (pageDF.diff_of_midian == 100000000).cumsum()
    pageDF = pageDF.sort_values(['line_number', 'x'], ascending=[True, True])
    # Marking blank lines, then dropping the helper columns
    pageDF = pageDF.apply(lambda x: add_blank_line(x, median_char_height), axis=1)
    del pageDF['median_word_y_coOrdinate'], pageDF['diff_of_midian'], pageDF['Mid_Point_diff']
    # Removing the page number at the end of every page
    last_line = max(list(pageDF['line_number']))
    pageDF = pageDF[pageDF['line_number']