code

The code used to scrape the North Dakota Century Code (NDCC) PDF files from legis.nd.gov is included below.

Thanks to dikesh_faldu on fiverr.com for some timely development help along the way.

Please note that the code was not cleaned up for publication.
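
In rough outline, ndcc-scrape.py walks the chapter index pages on legis.nd.gov and pdf_to_txt.py converts each chapter PDF into DokuWiki text. For orientation, here is a minimal sketch of calling the converter on a single chapter by hand, assuming the dependencies (beautifulsoup4, pdfminer or pdfminer.six, pandas, numpy, unidecode) are installed; the URL and folder pair mirrors the example in the code's own comments:

import os
from pdf_to_txt import Extract_PDF

os.makedirs("39", exist_ok=True)  # Extract_PDF writes 39/39-08.pdf before creating 39/39-08/
Extract_PDF("https://www.legis.nd.gov/cencode/t39c08.pdf", "39/39-08")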

ndcc-scrape.py
# -*- coding: utf-8 -*-
"""
Created on Sat Feb  9 12:33:28 2019
 
@author: Dikesh Faldu
"""
 
import re, os
import urllib.request
from bs4 import BeautifulSoup
 
 
cwd = "."
os.chdir(cwd)
 
from pdf_to_txt import Extract_PDF
 
 
#Extract_PDF("","") #quick test
 
def extract_table(chapter_no,link):
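    """Scrape one title's chapter index table, extract each linked chapter
    PDF, and write a DokuWiki start.txt listing the chapters."""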
 
    soup = BeautifulSoup(urllib.request.urlopen(link).read(), "html.parser")
    table = soup.find("div",attrs={'id':'application'})
 
    if(not os.path.exists(chapter_no)):
        os.makedirs(chapter_no)
 
    chapter_name = table.find("caption").get_text().strip()
    final_list = []
    for tr in table.find_all("tr"):
        row_list = []
        for j,td in enumerate(tr.find_all("td")):
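            # keep column 0 (chapter number, with the PDF link) and column 2 (chapter name); skip column 1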
            if(j!=1):
                cell = td.get_text()
                cell = cell.strip()
                if(j==0):
                    a = td.find('a')
                    pdf_link = str(a.get('href'))
                    base_link = link.rsplit("/",1)[0]
                    full_pdf_link = base_link+"/"+pdf_link
                    Folder_path  = chapter_no+"/"+cell
                    Extract_PDF(full_pdf_link,Folder_path)
                    cell = cell.ljust(15)
 
                row_list.append(cell)
        if(row_list!=[]):
            final_list.append(row_list)
 
    file = open(chapter_no+"/start.txt", "w")
    Title = "==== Title "+str(chapter_no)+" "+str(chapter_name) + " ===="
    file.write(Title+"\n\n")
#   chapter ="Chapter".ljust(15)
#   Header = chapter+"\tChapter Name"
#   file.write(Header+"\n\n")
    for list1 in final_list:
#        temp_str = "\t".join(list1)
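        # emit a DokuWiki internal link: [[<title>:<chapter>:|<chapter>. <chapter name>]]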
        ttl_data=list1[0].strip().split('-')
        ttl_str=ttl_data[0]
        temp_str = "[["+ttl_str+":"+list1[0].strip()+":|"+list1[0].strip() + ". " + list1[1]+"]]\n"
        file.write(temp_str+"\n")
 
    file.close()
 
 
 
leglink=""
url = "https://www.legis.nd.gov/general-information/north-dakota-century-code"
 
soup = BeautifulSoup(urllib.request.urlopen(url).read(), "html.parser")
 
Header = soup.find("div",attrs={"id":"content"}).find("h1").get_text()
 
content = soup.find("div",attrs={"id":"block-system-main"}).get_text().strip()
 
content = Header+"\n\n"+re.sub("(\n)(\n)+","\n\n",content)
 
file = open("start.txt", "w")
file.write(content)
file.close()
 
links =[]
for a in soup.find_all("a"):
    c=str(a.get('href'))
    if(re.compile("cencode").search(c)):
#        print(c)
        links.append(c)
 
for link in links:   
#   link = "https://www.legis.nd.gov/cencode/t01.html"
   t = re.sub("t0|t","",link.rsplit("/",1)[1].split(".")[0].split("c")[0].lower())
   chapter_no = re.sub("-",".",t)
   chapternum=t.split('-')
   if(int(chapternum[0]) < 0):
       print("skipping " + t)
   elif(re.search("html",link)):
       extract_table(chapter_no,link)
   elif(re.search("pdf",link)):
       Extract_PDF(link,chapter_no)
   else:
       print("Invalid Link :- ",link)
pdf_to_txt.py
# -*- coding: utf-8 -*-
"""
Created on Sat Feb  9 15:00:51 2019
 
@author: Dikesh Faldu
Modified 2019-10-27 by JET to correct headers with contingent effective dates and repeals
"""
 
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import XMLConverter
from bs4 import BeautifulSoup as bsoup
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO
import pandas as pd
import numpy as np
import re
import os
import urllib.request
 
import unidecode
 
 
 
def pdf_to_xml_converter(pdf_file_path):
    """
        This method converts from pdf to xml, and returns xml string,
        if argument "xml_file_path" is not None then it will create and save xml file at given xml file path.
    """
    RETRY_THRESHOLD = 3
#    xml_file_path = re.sub("\.pdf|\.PDF",".xml",pdf_file_path)
    xml_str = None
 
    retry = 0
 
    while retry < RETRY_THRESHOLD:
        try:
            rsrcmgr = PDFResourceManager()
            retstr = BytesIO()
            codec = 'utf-8'
            laparams = LAParams()
            device = XMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
            fp = open(pdf_file_path, 'rb')
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            password = ""
            maxpages = 0
            caching = True
            pagenos=set()
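            # maxpages=0 with an empty pagenos set means: process every page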
 
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
                interpreter.process_page(page)
 
            xml_str = retstr.getvalue()
            # Closing resources
            fp.close()
            device.close()
            retstr.close()
            break  # success; no need to retry

        except Exception as e:
            print(str(e))

        retry += 1
 
    return xml_str
 
def xml_parser_to_df(xml_str):
    """
        This method accepts an XML string as an argument and returns a DataFrame after parsing it.
    """
#    print("xml_parser_to_df method start")
    df = pd.DataFrame()
    try:
        # loading the XML string into BeautifulSoup
        XmlSoupElement = bsoup(xml_str, 'xml')
        # Iterating through each page
        for page in XmlSoupElement.findAll('page'):
            x = []
            y = []
            w = []
            h = []
            word = []
            # iterating through all textboxes in the page
            for tb in page.findAll('textbox'):
                if len(tb.findAll('textline')) == 0:
                    continue
                for tl in tb.findAll('textline'):
                    if len(tl.findAll('text')) == 0:
                        continue
                    comb = ""
                    word_begin = 0
                    word_x = ""
                    word_y = ""
                    word_h = ""
                    word_w = ""
                    flag = 0
                    for txt in tl.findAll('text'):
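                        # pdfminer writes bbox="x0,y0,x1,y1"; whitespace glyphs carry no
                        # attributes, which is what delimits words in the branches below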
                        txt_attrs = dict(txt.attrs)
                        if (word_begin == 0):
                            flag = 1
                            word_begin = 1
#                            comb += str(txt.get_text().encode('utf-8'))
                            comb += str(txt.get_text())
                            word_x = txt_attrs['bbox'].split(',')[0]
                            word_w = txt['bbox'].split(',')[2]
                            word_y = txt['bbox'].split(',')[1]
                            word_h = txt['bbox'].split(',')[3]
                        elif (word_begin == 1 and txt_attrs != {}):
#                            comb += str(txt.get_text().encode('utf-8'))
                            comb += str(txt.get_text())
                            word_w = txt['bbox'].split(',')[2]
                            word_y = txt['bbox'].split(',')[1]
                            word_h = txt['bbox'].split(',')[3]
                        elif (word_begin == 1 and txt_attrs == {}):
                            flag = 0
                            word_begin = 0
                            x.append(word_x)
                            y.append(word_y)
                            w.append(word_w)
                            h.append(word_h)
 
                            word.append(comb)
 
                            word_x = ""
                            word_y = ""
                            word_h = ""
                            word_w = ""
                            comb = ""
                    if (flag == 1):
                        x.append(tl['bbox'].split(',')[0])
                        y.append(tl['bbox'].split(',')[1])
                        w.append(tl['bbox'].split(',')[2])
                        h.append(tl['bbox'].split(',')[3])
                        word.append(comb)
            df_page = pd.DataFrame()
 
            df_page["x"] = x
            df_page["y"] = y
            df_page["w"] = w
            df_page["h"] = h
            df_page["page_number"] = page['id']
 
            word = [re.sub(r'\s+', ' ', words) for words in word]
            word = [re.sub(r'^_+$', '', words) for words in word]
            word = [re.sub(r'^[\-]{2,}$', '', words) for words in word]
            word = [re.sub(r'^\*+$', '', words) for words in word]
 
            df_page["output"] = word
            df_page[['x', 'y', 'w', 'h']] = df_page[['x', 'y', 'w', 'h']].astype(float)
            df_page[['page_number']] = df_page[['page_number']].apply(pd.to_numeric)
 
            try:
                y_min = min(df_page['y'])
                h_max = max(df_page['h'])
                df_page_temp = df_page.copy(deep=True)
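                # flip pdfminer's bottom-up y axis so that rows sort top to bottom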
                df_page.loc[:, 'y'] = h_max - df_page.loc[:, 'h'] + y_min
                df_page.loc[:, 'h'] = h_max - df_page_temp.loc[:, 'y'] + y_min
                df_page = df_page.reset_index(drop=True)
                df = df.append(df_page)
                df = df.reset_index(drop=True)
            except:
                print("Empty Page or Error at page number : "+ str(page['id']))
 
        df['output'] = list(df['output'].str.strip())
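        # drop rows whose text became empty after stripping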
        rr = df[df['output'].str.contains("^$") == True]
        df = df.drop(df.index[list(rr.index.values)])
        df = df.reset_index(drop=True)
        # calculate line numbers
        df = add_line_number(df)
 
    except Exception as e:
        print("Error in parser"+str(e))
 
    return df
 
def computing_median_height(dataframe):
    """
        This method calculates median hight of char , we assume 15 id default height
    """
    avg_height = 15 # this is default value fixed by Quadtatyx team
    try:
        dataframe = dataframe.reset_index(drop=True)
        dataframe['height'] = dataframe['h'] - dataframe['y']
        avg_height = int(np.median(dataframe.height.tolist()))
    except Exception as ve:
        print(str(ve))
    return avg_height
 
def compute_median(row):
    """
        This method calculates median of h and y coordinates
    """
    try:
        return (int((row['h'] + row['y']) / 2))
    except Exception as ke:
        print(str(ke))
 
def add_blank_line(row,median_char_height):
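    """Prefix the word's text with a blank line when the vertical gap to the previous word is large."""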
 
    diff_of_midian = row['Mid_Point_diff']
 
    if(diff_of_midian > median_char_height+2):
        row['output'] = "\n"+str(row['output'])
 
    return row
 
 
def compute_line_number(pageDF):
    """
        This method caluculates line number for particular page data frame given as an argument
    """
    # calculating median char height
    median_char_height = computing_median_height(pageDF)
    # Sorting by page number, y and x
    pageDF = pageDF.sort_values(['page_number', 'y', 'x'], ascending=[True, True, True])
    # resetting the index
    pageDF = pageDF.reset_index(drop=True)
    # calculating the vertical midpoint of each word
    pageDF["median_word_y_coOrdinate"] = pageDF.apply(compute_median, axis=1)
    # calculating the difference between each word's midpoint and the previous word's midpoint
    pageDF["diff_of_midian"] = pageDF['median_word_y_coOrdinate'] - pageDF['median_word_y_coOrdinate'].shift(1)
    # the very first word has no previous word, so seed it with a large sentinel value
    pageDF["diff_of_midian"].iloc[0] = 100000000
    ## Copy these value to other column for further use
    pageDF['Mid_Point_diff'] = pageDF["diff_of_midian"]
    # changing to int
    pageDF['diff_of_midian'] = pageDF['diff_of_midian'].astype(int)
    # put the large sentinel wherever the gap exceeds half the median char height (a new line begins)
    pageDF.loc[pageDF['diff_of_midian'] > int(median_char_height / 2), 'diff_of_midian'] = 100000000
    # each sentinel marks the start of a new line, so a cumulative sum yields the line number
    pageDF['line_number'] = (pageDF.diff_of_midian == 100000000).cumsum()
    pageDF = pageDF.sort_values(['line_number', 'x'], ascending=[True, True])
    pageDF = pageDF.apply(lambda x : add_blank_line(x,median_char_height),axis=1)

    # dropping unused columns
    del pageDF['median_word_y_coOrdinate'], pageDF['diff_of_midian'] , pageDF['Mid_Point_diff']
 
    ## Removing the page number at the end of every page
    last_line = max(list(pageDF['line_number']))
    pageDF = pageDF[pageDF['line_number']<last_line]
 
    return pageDF
 
def add_line_number(df):
    """
        This method will add line numbers to the words, page by page
    """
    pages = list(set(df['page_number']))
    new_df = pd.DataFrame()
    for page_num in pages:
        pageDF = df[df['page_number'] == page_num]
        new_df = new_df.append(compute_line_number(pageDF))
    return new_df
 
def combine_line(line_df):
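    """Join the words of one (page_number, line_number) group into a single space-separated string."""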
 
    output_str = list(line_df['output'])
 
    line = " ".join(output_str)
 
    return line
 
 
def convert_pdf_to_txt(pdf_file_path):
    ## Convert PDF file to XML
    #pdf_file_path = r"D:/Fiverr/jerodtufte/75-09.2-01.pdf"
    xml_str = pdf_to_xml_converter(pdf_file_path)

    ## Parse the XML and calculate the line number on each page
    df = xml_parser_to_df(xml_str)

    ## Sort values by page number, then line number, then the x coordinate of the word
    df = df.sort_values(['page_number','line_number', 'x'], ascending=[True,True, True])
    ## Combine the words on the same line
    lines = df.groupby(["page_number","line_number"]).apply(combine_line)
    ## make a list of each line
    lines= list(lines)
    return lines
 
 
def Extract_PDF(link,Folder):
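    """Download one chapter PDF (unless already cached), convert it to text, and
    write a DokuWiki start.txt with chapter/section headings, anchors, and source links."""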
#   link = "https://www.legis.nd.gov/cencode/t39c08.pdf"
#   Folder = "39/39-08"
    leglink="\n\nsource: [["+str(link)+"]]\n"
    print("Folder Name :- ",Folder)
    from pathlib import Path
    local_pdf=Path(Folder+".pdf")
    filename = Folder+".pdf"
    if not local_pdf.is_file():
        print("downloading " + link + "\n")
        pdf_str = urllib.request.urlopen(link).read()
        ## Save PDF     
        with open(filename, 'wb') as out_file:
            out_file.write(pdf_str)
 
    ##  Convert PDF to Text
    print ("converting " + filename + "\n")
    rl = convert_pdf_to_txt(filename)
 
    ## Remove downloaded file
#   os.remove(filename) ## uncomment if you want to remove the PDF file
 
    firstsec = True
    file = False
    flag = 0
    skip = 0
    if(not os.path.exists(Folder)):
        os.makedirs(Folder)
    Text_file =Folder + "/start.txt"
    file = open(Text_file, "w")
 
    for cnt, line in enumerate(rl):
        #strip leading/trailing whitespace, including newlines
        line=line.strip()
        if(skip == 1): # skip this line; it was already merged into the preceding chapter heading
            skip = 0
            pass
        else:
            match = re.search('^(CHAPTER.*)$', line.strip())
            if(match): # capture chapter heading and print it as dokuwiki heading
                t=match.group(1)
                flag = 1
                skip = 1
                line = "==== " + t + " " + rl[cnt+1].strip() + " ====\n"
            match = re.search(r'^(([\d\.]+)-([\d\.]+)-([\d]+(?:\.\d{0,2})?))\.\s+[\w\s,\'-]*', line)
            if(match): # found section heading first line
                t=match.group(1)
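                # named destination for deep links: dots become "p" (e.g. "39-08-01.1" -> "39-08-01p1")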
                nameddest=re.sub(r"\.", "p", str(t))
                line = str(unidecode.unidecode(line)).strip()
                # concatenate multi-line headings: this is not the last line, because
                # a section title ends in a period or an effective-date parenthetical
                if(not re.search(r"(\.|[0-9][0-9][0-9][0-9]\)|See note\))$",line)):
#                   skip = 1
                    rl[cnt+1] = line+" "+str(unidecode.unidecode(rl[cnt+1])).strip()
                    flag = 0
                else:
                    if firstsec: # first section is not preceded by link to legis.nd.gov
                        firstsec=False
                    else:
                        file.write(leglink+"\n")
                    leglink="\n\nsource: [["+str(link)+"#nameddest="+nameddest+"]]\n"
                    file.write("{{anchor:ndcc"+t+"}}\n\n") # dokuwiki anchor
#                   file.write("=== " + line + " ===\n\n") # dokuwiki heading
                    line = "=== " + line + " ===\n\n" # dokuwiki heading
                    flag = 1
            else: 
#               flag = 0
                if file:
                    line = str(unidecode.unidecode(line))
                    if(re.search("^([0-9]{1,2}\.|[a-z]\.|\([0-9]{1,2}\)|\([a-z]\))[^\d]",line.strip())):
                        new_line = line.strip()
                        line = "\n"+new_line
#                   file.write(line+"\n")
                else:
                    pass
 
            if(flag == 1): #write the line to the file
                if file:
                    line = str(unidecode.unidecode(line))
                    if(re.search("(^[1-9]{1,2}\.[^\d])|(\(Effective [0-9a-zA-Z ,]{12,22}[0-9][0-9][0-9][0-9])|(See note\))",line)):
#                       new_line = line.strip()
#                       line = "\n"+new_line
                        line = "\n\n"+line
                    file.write(line+"\n")
 
    if file:
        file.write(leglink+"\n")
        file.close()
        file = False
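
For orientation, each generated start.txt contains DokuWiki markup along these lines (a hypothetical sketch; the chapter and section wording is invented for illustration):

==== CHAPTER 39-08 SOME CHAPTER NAME ====

{{anchor:ndcc39-08-01}}

=== 39-08-01. Some section heading. ===

1. Body text of the first subsection...

source: [[https://www.legis.nd.gov/cencode/t39c08.pdf#nameddest=39-08-01]]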