# User Tools
#
# Site Tools
#
#
# ndactxt.py

import multiprocessing
import os
import pdb
import re
import time

import unidecode

from pdf_to_txt import convert_pdf_to_txt

# Section heading such as "\n28-03.1-01-13. Some title": a leading newline,
# four dash-separated numeric components, a period, whitespace, then the title.
# The four capture groups are the numeric components, in order.
_SECTION_HEADING_RE = re.compile(
    r"^\n([\d\.]+)-([\d\.]+)-([\d\.]+)-([\d]+(?:\.\d{0,2})?)\.\s+[\w\s,\'-]*"
)
_TERMINAL_PERIOD_RE = re.compile(r"\.$")
# A leading newline followed by a non-digit: first line of a page.
_PAGE_START_RE = re.compile(r"^\n[^\d]")
# Trailer labels that get a blank line in front of them in the output.
_TRAILER_LABEL_RE = re.compile(r"^(History:|General Authority:|Law Implemented:)")
# Headings shorter than this that lack a terminal period are treated as
# complete (malformed) single-line headings rather than the first line of a
# multi-line heading, as in 28-03.1-01-13.
_SHORT_HEADING_MAX_LEN = 70


def _pdf_link(t, a, c):
    """Return the dokuwiki link line pointing at the source PDF for t-a-c."""
    return "\n[[https://ndlegis.gov/information/acdata/pdf/" + t + "-" + a + "-" + c + ".pdf]]\n\n\n"


def _format_body_line(line):
    """Return a non-heading line prepared for writing into a section file."""
    if _PAGE_START_RE.search(line):
        # Drop the leading newline that shows up on the first line of a page.
        line = line.strip()
    if _TRAILER_LABEL_RE.match(line) is not None:
        # Add a blank line before History / General Authority / Law Implemented.
        line = "\n" + line
    return line


def txt_extractor(dir_plus_name):
    """Split the text of one PDF into one dokuwiki text file per section.

    Each line matching the section-heading pattern starts a new output file
    named ``<t>-<a>-<c>-<s>.txt`` (the four numeric components of the heading)
    inside the given directory.  The file begins with a dokuwiki anchor and a
    ``=====`` heading, continues with the lines that follow the heading, and
    ends with a link to the source PDF on ndlegis.gov.  Lines that appear
    before the first heading are discarded.

    Args:
        dir_plus_name: Sequence whose item 0 is the output directory and whose
            item 1 is the path of the PDF to convert.

    Returns:
        None.  The result is the set of files written to disk.

    Note:
        ``convert_pdf_to_txt`` is assumed to return a mutable sequence (list)
        of strings, since entries are read and re-assigned by index -- confirm
        against that helper.
    """
    root = dir_plus_name[0]
    pdf_path = dir_plus_name[1]

    lines = convert_pdf_to_txt(pdf_path)
    out = None          # currently open section file, if any
    closing_link = ""   # PDF link belonging to the currently open section
    try:
        for idx, line in enumerate(lines):
            heading = _SECTION_HEADING_RE.search(line)
            if heading is None:
                # Line between the start of a section and the next section.
                if out is not None:
                    out.write(_format_body_line(line) + "\n")
                continue

            # A new heading ends the previous section: finish its file using
            # the link computed for *that* section, not for the new heading.
            if out is not None:
                out.write(closing_link)
                out.close()
                out = None

            unterminated = _TERMINAL_PERIOD_RE.search(line) is None
            has_next = idx + 1 < len(lines)
            if unterminated and len(line) >= _SHORT_HEADING_MAX_LEN and has_next:
                # First part of a multi-line heading: fold it into the next
                # line so the combined text is handled as a heading on the
                # next iteration.  ``line`` already starts with the newline
                # the heading pattern requires, so no extra one is prepended
                # (a doubled newline would stop the merged line matching).
                lines[idx + 1] = line + " " + str(unidecode.unidecode(lines[idx + 1])).strip()
                continue

            # One-line heading, last line of a multi-line heading, or an
            # unterminated heading with nothing left to merge with.
            t, a, c, s = heading.groups()
            section_id = t + "-" + a + "-" + c + "-" + s
            anchor_line = "{{anchor:ndac" + section_id + "}}\n"   # dokuwiki anchor
            heading_line = "\n===== " + line[1:] + " =====\n"     # dokuwiki heading markup

            # Store the text file in the same directory as given by the caller.
            # UTF-8 is set explicitly so non-ASCII text from the PDF does not
            # depend on the platform's default encoding.
            out = open(os.path.join(root, section_id + ".txt"), "w", encoding="utf-8")
            closing_link = _pdf_link(t, a, c)
            out.write(anchor_line + heading_line + "\n")

        # Finish the last section of the PDF.
        if out is not None:
            out.write(closing_link)
    finally:
        # Never leak the handle, even if conversion of a line fails midway.
        if out is not None:
            out.close()

def main():
    """Convert every PDF found under the current directory, in parallel.

    Walks ``.`` recursively, collects ``[directory, pdf_path]`` pairs for each
    file with a ``.pdf`` extension, and hands them to ``txt_extractor`` through
    a ``multiprocessing.Pool``.  Prints the list being processed and, at the
    end, the number of files and the elapsed wall-clock time in seconds.
    """
    pdf_list = []
    for root, _dirs, files in os.walk(".", topdown=False):
        for name in files:
            # Test the file extension itself; a substring search for ".pdf"
            # would also pick up names such as "x.pdf.bak" or any file below
            # a directory whose name contains ".pdf".
            if name.lower().endswith(".pdf"):
                pdf_list.append([root, os.path.join(root, name)])

    start_time = time.time()
    print(f"{start_time} processing following PDF list: ", pdf_list)

    # Each PDF is independent, so they are processed in parallel worker
    # processes; txt_extractor receives one [root, path] pair per call.
    with multiprocessing.Pool() as pool:
        pool.map(txt_extractor, pdf_list)

    duration = time.time() - start_time
    # f-string: len() and duration are numbers and cannot be joined with "+".
    print(f" completed processing {len(pdf_list)} files in {duration} seconds\n")

# Run only when executed as a script.  The guard also keeps multiprocessing
# worker processes that re-import this module from starting main() again.
if __name__ == '__main__':
    main()
# ndactxt.py.txt · Last modified: 2022/03/04 10:31 (external edit)