# eduardo.santin / Fotoex
							# Script to read a directory of PDF forms and extract textual info

# using pypdf library, open source library for parsing PDFs

# repo link: https://github.com/py-pdf/pypdf

# pip install pypdf


from os import listdir, mkdir

from os.path import isfile

from json import dump

import pypdf


# use pypdf to parse the pdf

# the get_form_text_fields() method returns a dictionary with the field names as keys

# works on the files that weren't being captured by the other script

def parsePDF02(fileName):
    """Read the text form fields of a PDF file with pypdf.

    Opens ``fileName`` in binary mode, wraps it in a ``pypdf.PdfReader`` and
    returns the result of ``get_form_text_fields()``.

    Args:
        fileName: path of the PDF file to read.

    Returns:
        The value returned by ``PdfReader.get_form_text_fields()``; per the
        pypdf documentation this is a dictionary keyed by field name.
    """
    # the context manager closes the file even if pypdf raises while parsing
    with open(fileName, 'rb') as pdfobject:
        pdf = pypdf.PdfReader(pdfobject)

        # extract the data while the underlying file is still open
        return pdf.get_form_text_fields()


# grab the dictionary returned by the parsePDF02() 

# clean up the data and create a json file

def create_json(data, fname):
    """Clean up extracted form data and write it to a JSON file.

    String values in ``data`` are stripped of leading/trailing whitespace
    (in place), then the dictionary is written to
    ``fichas_jsons/<fname without its extension>.json``. The output
    directory is created if it does not exist yet.

    Args:
        data: dictionary of form fields, e.g. as returned by parsePDF02().
        fname: name of the source file (e.g. ``ficha.pdf``); only its
            extension is dropped to build the output file name.
    """
    # local imports keep this function independent of the module-level
    # import list
    from os import makedirs
    from os.path import join, splitext

    # drop the extension whatever its length (or absence) instead of
    # blindly cutting the last four characters
    filename = splitext(fname)[0]

    # only some strings come back with a trailing newline; strip them all.
    # Non-string values (e.g. None) are left untouched.
    for key, value in data.items():
        if isinstance(value, str):
            data[key] = value.strip()

    # make sure the output directory exists before writing into it
    makedirs('fichas_jsons', exist_ok=True)

    # utf8 encoding + ensure_ascii=False keep accented characters readable
    with open(join('fichas_jsons', filename + '.json'), 'w', encoding='utf8') as outfile:
        dump(data, outfile, indent=4, ensure_ascii=False)

    
# call functions

def run_script(path):
    """Convert every PDF form found under ``path`` into a JSON file.

    Lists ``<path>/FICHAS/FICHAS_PRELIMINARES/``, parses each PDF with
    parsePDF02() and hands the result to create_json(). Entries that are
    not regular files, or whose name does not end in ``.pdf``
    (case-insensitive), are skipped.

    Args:
        path: base directory that contains the ``FICHAS`` folder.
    """
    # local import keeps this function independent of the module-level
    # import list
    from os.path import join

    # find the path to the directory
    fichas_dir = path + '/FICHAS/FICHAS_PRELIMINARES/'

    # loop through the files in the directory
    for f in listdir(fichas_dir):
        full_path = join(fichas_dir, f)

        # skip sub-directories and other non-regular entries
        if not isfile(full_path):
            continue

        # stray non-PDF files (.DS_Store, Thumbs.db, notes...) would make
        # the PDF parser fail and abort the whole run
        if not f.lower().endswith('.pdf'):
            continue

        ficha_data = parsePDF02(full_path)
        create_json(ficha_data, f)