# Script to read a directory of PDF forms and extract their textual info
# using the pypdf library, an open source library for parsing PDFs.
# repo link: https://github.com/py-pdf/pypdf
# pip install pypdf

from os import listdir, makedirs, mkdir
from os.path import isfile, join
from json import dump

import pypdf


# use pypdf to parse the pdf
# works on the files that weren't being captured by the other script
def parsePDF02(fileName: str) -> dict:
    """Return the form text fields of the PDF at *fileName*.

    The result is whatever ``PdfReader.get_form_text_fields()`` returns:
    a dictionary with the field names as keys.
    """
    # The reader pulls from the stream while extracting, so the extraction
    # must happen inside the ``with`` block; the context manager also makes
    # sure the handle is closed when pypdf raises on a malformed file.
    with open(fileName, 'rb') as pdfobject:
        pdf = pypdf.PdfReader(pdfobject)
        return pdf.get_form_text_fields()


# grab the dictionary returned by parsePDF02(),
# clean up the data and create a json file
def create_json(data: dict, fname: str) -> None:
    """Strip string values of *data* in place and dump it as JSON.

    The file is written to ``fichas_jsons/<fname without .pdf>.json``; the
    output directory is created when it does not exist yet.
    """
    # remove the .pdf from the filename (only when it is really there, so
    # names with another or no extension are not truncated)
    if fname.lower().endswith('.pdf'):
        filename = fname[:-4]
    else:
        filename = fname

    # for some reason only some strings have a newline character at the end;
    # remove it. Non-string values are left untouched.
    # Re-assigning existing keys does not resize the dict, so this is safe
    # while iterating over items().
    for key, value in data.items():
        if isinstance(value, str):
            data[key] = value.strip()

    # make sure the output directory exists before writing into it
    out_dir = 'fichas_jsons'
    makedirs(out_dir, exist_ok=True)

    # create json file
    # utf8 encoding for accented characters
    with open(join(out_dir, filename + '.json'), 'w', encoding='utf8') as outfile:
        dump(data, outfile, indent=4, ensure_ascii=False)


# call functions
def run_script(path: str) -> None:
    """Convert every file in ``<path>/FICHAS/FICHAS_PRELIMINARES/`` to JSON.

    Each regular file in that directory is parsed with parsePDF02() and the
    extracted fields are written out with create_json(). Sub-directories are
    skipped.
    """
    # find the path to the directory
    tmp = path + '/FICHAS/FICHAS_PRELIMINARES/'

    # loop through the files in the directory
    for f in listdir(tmp):
        # join() avoids the doubled slash that plain concatenation produced
        pdf_path = join(tmp, f)
        if isfile(pdf_path):
            ficha_data = parsePDF02(pdf_path)
            create_json(ficha_data, f)