123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123 |
- # Script to read a directory of PDF forms and extract textual info
-
- # using pypdf library, open source library for parsing PDFs
-
- # repo link: https://github.com/py-pdf/pypdf
-
- # pip install pypdf
-
-
-
-
-
-
-
- from os import listdir, mkdir
-
- from os.path import isfile
-
- from json import dump
-
- import pypdf
-
-
-
-
-
-
-
- # use pypdf to parse the pdf
-
- # the get_form_text_fields() method returns a dictionary with the field names as keys
-
- # works on the files that weren't being captured by the other script
-
def parsePDF02(fileName):
    """Extract the text form fields of a single PDF file.

    Args:
        fileName: Path of the PDF file to read.

    Returns:
        Whatever ``pypdf.PdfReader.get_form_text_fields()`` returns for the
        document: a dictionary with the field names as keys.
    """
    # The context manager closes the handle even when pypdf raises on a
    # malformed or non-PDF file; the manual open()/close() pair leaked it.
    with open(fileName, 'rb') as pdf_file:
        reader = pypdf.PdfReader(pdf_file)
        # Extract inside the block so the stream is still open while the
        # reader pulls the field data from it.
        return reader.get_form_text_fields()
-
-
-
-
-
- # take the dictionary returned by parsePDF02()
-
- # clean up the data and create a json file
-
def create_json(data, fname):
    """Write the extracted form fields to ``fichas_jsons/<name>.json``.

    Args:
        data: Dictionary of form fields. String values are stripped of
            leading/trailing whitespace in place (some fields come back
            with a trailing newline); non-string values are left untouched.
        fname: Name of the source file. Its extension is dropped and
            replaced by ``.json`` to build the output file name.
    """
    # Function-scope imports keep this block self-contained.
    from os import makedirs
    from os.path import join, splitext

    # splitext removes only a real extension, whereas slicing off the last
    # four characters mangled names that do not end in a 4-char suffix.
    stem = splitext(fname)[0]

    # Only values of existing keys are reassigned, so the dictionary size
    # does not change while iterating.
    for key, value in data.items():
        if isinstance(value, str):
            data[key] = value.strip()

    # Make sure the output folder exists; otherwise the first write fails
    # with FileNotFoundError.
    makedirs('fichas_jsons', exist_ok=True)

    # utf8 + ensure_ascii=False keeps accented characters readable.
    with open(join('fichas_jsons', stem + '.json'), 'w', encoding='utf8') as outfile:
        dump(data, outfile, indent=4, ensure_ascii=False)
-
-
-
-
-
- # call functions
-
def run_script(path):
    """Convert every PDF form in ``<path>/FICHAS/FICHAS_PRELIMINARES/`` to JSON.

    Each PDF found directly in that folder is parsed with ``parsePDF02`` and
    the resulting fields are written out with ``create_json``.

    Args:
        path: Base directory that contains the ``FICHAS/FICHAS_PRELIMINARES``
            folder.
    """
    # Function-scope import keeps this block self-contained.
    from os.path import join

    # Folder that holds the preliminary forms.
    fichas_dir = path + '/FICHAS/FICHAS_PRELIMINARES/'

    for entry in listdir(fichas_dir):
        pdf_path = join(fichas_dir, entry)

        # Skip sub-directories.
        if not isfile(pdf_path):
            continue

        # Skip stray non-PDF files (e.g. .DS_Store, Thumbs.db); handing them
        # to the PDF parser would abort the whole batch.
        if not entry.lower().endswith('.pdf'):
            continue

        ficha_data = parsePDF02(pdf_path)
        create_json(ficha_data, entry)
-
-
-
-
-
-
|