No Description

ficha_script.py 1.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123
  1. # Script to read a directory of PDF forms and extract textual info
  2. # using pypdf library, open source library for parsing PDFs
  3. # repo link: https://github.com/py-pdf/pypdf
  4. # pip install pypdf
from json import dump
from os import listdir, makedirs, mkdir
from os.path import isfile, join, splitext

import pypdf
  9. # use pypdf to parse the pdf
  10. # the get_form_text_fields() method returns a dictionary with the field names as keys
  11. # works on the files that weren't being captured by the other script
  12. def parsePDF02(fileName):
  13. # read the pdf file and create a PdfReader object
  14. pdfobject=open(fileName,'rb')
  15. pdf=pypdf.PdfReader(pdfobject)
  16. # extract data
  17. data = pdf.get_form_text_fields()
  18. pdfobject.close()
  19. return data
  20. # grab the dictionary returned by the parsePDF02()
  21. # clean up the data and create a json file
  22. def create_json(data, fname):
  23. # remove the .pdf from the filename
  24. filename = fname[:-4]
  25. # for some reason only some strings have a newline character at the end
  26. # remove it
  27. for key in data.keys():
  28. # if the value is a string
  29. if isinstance(data[key], str):
  30. data[key] = data[key].strip()
  31. # create json file
  32. # utf8 encoding for accented characters
  33. with open('fichas_jsons/' + filename + '.json', 'w', encoding='utf8') as outfile:
  34. dump(data, outfile, indent=4, ensure_ascii=False)
  35. # call functions
  36. def run_script(path):
  37. #find the path to the directory
  38. tmp = path + '/FICHAS/FICHAS_PRELIMINARES/'
  39. # loop through the files in the directory
  40. for f in listdir(tmp):
  41. if isfile(tmp + '/' + f):
  42. ficha_data = parsePDF02(tmp + '/' + f)
  43. create_json(ficha_data, f)