ndacpdfscrape.py

NDAC PDF scrape

# ndacpdfscrape.py
# Walk the North Dakota Legislature web site and download all the PDF files
# for the administrative rules, storing them in a local directory tree.

import os
import re
import time
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

# TODO
# build a dokuwiki txt page for each title and article - skip those with
# [Repealed] or [Superseded] (a hedged sketch appears after the script)


base_url = 'https://ndlegis.gov/agency-rules/north-dakota-administrative-code'
soup = BeautifulSoup(urllib.request.urlopen(base_url).read(), "html.parser")

count = 0  # incremented once per PDF found, so the first download prints as 1

for title in soup.find_all('a', attrs={'href': re.compile(r"Title\d+\.html")}):
    t = title.get('href')
    tmatch = re.search(r'Title(\d+)\.html', t)
    if tmatch is None:
        print("no match for t = " + t)
        continue  # skip links that do not name a title page
    tnumstr = tmatch.group(1)
    tnum = int(tnumstr)
    # urljoin handles both absolute and relative hrefs
    title_url = urllib.parse.urljoin(base_url, t)
    soupa = BeautifulSoup(urllib.request.urlopen(title_url).read(), "html.parser")
    for art in soupa.find_all('a', attrs={'href': re.compile(r"[-\d]+\.html$")}):
        a = art.get('href')
        amatch = re.search(r'(\d+)-(\d+(?:\.\d{0,2})?)\.html', a)
        if amatch is None:
            print("no match for a = " + a)
            continue  # skip links that do not look like article pages
        a1str = amatch.group(1)
        a2str = amatch.group(2)
        anumstr = a1str + "-" + a2str
        arturl = "https://ndlegis.gov/information/acdata/html/" + anumstr + ".html"
        soupc = BeautifulSoup(urllib.request.urlopen(arturl).read(), "html.parser")
        for ch in soupc.find_all('a', attrs={'href': re.compile(r"[-\d]+\.pdf$")}):
            c = ch.get('href')
            cmatch = re.search(r'pdf/(\d+)-(\d+(?:\.\d{0,2})?)-(\d+(?:\.\d{0,2})?)\.pdf', c)
            if cmatch is None:
                print("no match for c = " + c)
                continue  # skip links that do not look like chapter PDFs
            count += 1  # count only chapters that actually matched
            c1str = cmatch.group(1)
            c2str = cmatch.group(2)
            c3str = cmatch.group(3)
            cnumstr = c1str + "-" + c2str + "-" + c3str
            churl = "https://ndlegis.gov/information/acdata/pdf/" + cnumstr + ".pdf"
            print(str(count) + ". churl = " + churl)
            dirname = os.path.join(tnumstr, anumstr, cnumstr)
            filename = cnumstr + '.pdf'
            pathname = os.path.join(dirname, filename)
            print("pathname = " + pathname)
            print("dir = " + dirname)
            os.makedirs(dirname, exist_ok=True)
            time.sleep(1)  # throttle requests to be polite to the server
            with urllib.request.urlopen(churl) as response, open(pathname, 'wb') as out_file:
                out_file.write(response.read())  # PDF payload as bytes
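
The TODO at the top of the script is not implemented here. A minimal sketch of what it might look like, assuming the chapter names have already been scraped into (number, name) pairs and that the wiki's pages/ directory mirrors the download tree - both assumptions, not part of the original script:

# Hedged sketch for the TODO above, not part of the original script.
# `chapters` is a hypothetical list of (cnumstr, chapter_name) pairs
# scraped from an article page; the namespace layout is an assumption.

import os

def write_article_page(tnumstr, anumstr, chapters, pages_root='pages'):
    """Write one DokuWiki .txt page listing an article's chapter PDFs,
    skipping chapters marked [Repealed] or [Superseded]."""
    dirname = os.path.join(pages_root, tnumstr)
    os.makedirs(dirname, exist_ok=True)
    lines = ["====== Article " + anumstr + " ======", ""]
    for cnumstr, name in chapters:
        if '[Repealed]' in name or '[Superseded]' in name:
            continue  # per the TODO, drop repealed/superseded chapters
        # DokuWiki media-link syntax: {{namespace:file.pdf|label}}
        lines.append("  * {{" + tnumstr + ":" + anumstr + ":" + cnumstr
                     + ".pdf|" + name + "}}")
    with open(os.path.join(dirname, anumstr + '.txt'), 'w') as f:
        f.write("\n".join(lines) + "\n")

Called once per article from inside the art loop, this would sit naturally next to the PDF download; whether the chapter names on the live pages carry the literal strings [Repealed] and [Superseded] would need to be checked first.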