Eduardo vor 1 Jahr
Ursprung
Commit
3d2c5af531
3 geänderte Dateien mit 426 neuen und 0 gelöschten Zeilen
  1. 8
    0
      .gitignore
  2. 123
    0
      ficha_script.py
  3. 295
    0
      tables_script.py

+ 8
- 0
.gitignore Datei anzeigen

@@ -0,0 +1,8 @@
1
+edificios.txt
2
+FotoExploratorio 2022 2023-20230125T192022Z-001.zip
3
+rejected_fichas.txt
4
+rejected_images.txt
5
+__pycache__/
6
+.vscode/
7
+fichas_jsons/
8
+FotoExploratorio 2022 2023/

+ 123
- 0
ficha_script.py Datei anzeigen

@@ -0,0 +1,123 @@
1
+# Script to read a directory of PDF forms and extract textual info
2
+
3
+# using pypdf library, open source library for parsing PDFs
4
+
5
+# repo link: https://github.com/py-pdf/pypdf
6
+
7
+# pip install pypdf
8
+
9
+
10
+
11
+
12
+
13
+
14
+
15
+from os import listdir, mkdir
16
+
17
+from os.path import isfile
18
+
19
+from json import dump
20
+
21
+import pypdf
22
+
23
+
24
+
25
+
26
+
27
+
28
+
29
+# use pypdf to parse the pdf
30
+
31
+# the get_form_text_fields() method returns a dictionary with the field names as keys
32
+
33
+# works on the files that weren't being captured by the other script
34
+
35
def parsePDF02(fileName):
    """Return the text form fields of the PDF stored at ``fileName``.

    Args:
        fileName: Path of the PDF form to read.

    Returns:
        The value of ``pypdf.PdfReader.get_form_text_fields()`` for the file
        (a dictionary with the field names as keys).
    """
    # The context manager closes the file even if pypdf raises while parsing;
    # the previous open()/close() pair leaked the handle in that case.
    with open(fileName, 'rb') as pdf_file:
        reader = pypdf.PdfReader(pdf_file)
        return reader.get_form_text_fields()
54
+
55
+
56
+
57
+
58
+
59
+# grab the dictionary returned by the parsePDF02() 
60
+
61
+# clean up the data and create a json file
62
+
63
def create_json(data, fname):
    """Write the form fields in ``data`` to ``fichas_jsons/<name>.json``.

    String values are stripped of surrounding whitespace (some extracted
    fields carry a trailing newline). The stripping is done in place, so the
    caller's dictionary is cleaned as well.

    Args:
        data: Dictionary of form fields for one ficha.
        fname: File name of the source PDF; its extension is dropped to build
            the name of the JSON file.
    """
    from os import makedirs
    from os.path import join, splitext

    # splitext only removes a real extension; slicing off the last four
    # characters also mangled names that do not end in ".pdf".
    filename = splitext(fname)[0]

    # Only existing keys are reassigned, so iterating items() here is safe.
    for key, value in data.items():
        if isinstance(value, str):
            data[key] = value.strip()

    # Create the output folder on demand so this module also works when it is
    # used without a caller that prepared the folder beforehand.
    makedirs('fichas_jsons', exist_ok=True)

    # utf8 + ensure_ascii=False keeps accented characters readable in the file
    with open(join('fichas_jsons', filename + '.json'), 'w', encoding='utf8') as outfile:
        dump(data, outfile, indent=4, ensure_ascii=False)
96
+
97
+    
98
+
99
+
100
+
101
+# call functions
102
+
103
def run_script(path):
    """Convert every PDF ficha found under ``path`` into a JSON file.

    Looks in ``<path>/FICHAS/FICHAS_PRELIMINARES`` and, for each PDF file
    there, extracts its form fields with ``parsePDF02`` and writes them out
    with ``create_json``.

    Args:
        path: Root folder that contains the ``FICHAS`` directory.
    """
    from os.path import join

    fichas_dir = join(path, 'FICHAS', 'FICHAS_PRELIMINARES')

    for name in listdir(fichas_dir):
        ficha_path = join(fichas_dir, name)

        # Sub-folders and non-PDF files are skipped rather than handed to the
        # PDF parser, which is only meant for PDF forms.
        if not isfile(ficha_path) or not name.lower().endswith('.pdf'):
            continue

        create_json(parsePDF02(ficha_path), name)
118
+
119
+
120
+
121
+
122
+
123
+        

+ 295
- 0
tables_script.py Datei anzeigen

@@ -0,0 +1,295 @@
1
+# script to grab the zip file from the google drive and populate the database 
2
+# with the data. Phase 1 of the project since we won't have user entries yet
3
+# Not all of the buildings and images and fichas will be uploaded due to the 
4
+# formatting discrepancies in the directories, this
5
+# will be fixed later, for now we will just upload the ones we can
6
+
7
+
8
+import mysql.connector
9
+import zipfile
10
+from sys import argv
11
+from os import listdir, mkdir
12
+from os.path import isdir, isfile, abspath, join, basename
13
+
14
+# using copy for now instead of move so I can delete the folders and not have to set everything up
15
+from shutil import move, copy
16
+# import module from current directory
17
+import ficha_script
18
+import json
19
+
20
+
21
+# making a simple connection to the database
22
+# this will be changed later to a more secure connection
23
def connect_db():
    """Open a connection to the MySQL database used by the import script.

    The settings default to the local development values (root user, empty
    password, localhost:3306, database ``fotoexploratorio``). Each one can be
    overridden through an environment variable so credentials do not have to
    be edited into the source:

    ``FOTOEXP_DB_USER``, ``FOTOEXP_DB_PASSWORD``, ``FOTOEXP_DB_HOST``,
    ``FOTOEXP_DB_PORT``, ``FOTOEXP_DB_NAME``.

    Returns:
        The connection object returned by ``mysql.connector.connect``.
    """
    from os import environ

    settings = {
        'user': environ.get('FOTOEXP_DB_USER', 'root'),
        'password': environ.get('FOTOEXP_DB_PASSWORD', ''),
        'host': environ.get('FOTOEXP_DB_HOST', 'localhost'),
        'port': environ.get('FOTOEXP_DB_PORT', '3306'),
        'database': environ.get('FOTOEXP_DB_NAME', 'fotoexploratorio'),
    }
    return mysql.connector.connect(**settings)
32
+
33
+
34
def create_tables(connection):
    """Create the ``buildings`` and ``images`` tables if they do not exist.

    Args:
        connection: Open database connection; it is committed after both
            statements have run.
    """
    cursor = connection.cursor()
    try:
        cursor.execute('''CREATE TABLE IF NOT EXISTS buildings (
    id INT AUTO_INCREMENT PRIMARY KEY,
    name VARCHAR(255) CHARACTER SET utf8 UNIQUE NOT NULL
    )''')

        cursor.execute('''CREATE TABLE IF NOT EXISTS images (
    id INT AUTO_INCREMENT PRIMARY KEY,
    building_id INT,
    path VARCHAR(255) CHARACTER SET utf8 UNIQUE NOT NULL,
    year INT,
    medium VARCHAR(255) CHARACTER SET utf8 DEFAULT NULL,
    author VARCHAR(255) CHARACTER SET utf8 DEFAULT NULL,
    recovery_method VARCHAR(255) CHARACTER SET utf8 DEFAULT NULL,
    recovery_date DATE DEFAULT NULL,
    FOREIGN KEY (building_id) REFERENCES buildings(id)
    )''')

        # TODO: consider moving authors into a separate table eventually;
        # some fichas do not have an author, so it stays a plain column for now.

        connection.commit()
    finally:
        # release the cursor even if one of the statements fails
        cursor.close()
62
+
63
+
64
+
65
+# receives the path of the zip file
66
def unzip(zip):
    """Extract the archive into the working directory unless already done.

    Extraction is skipped when a folder named ``FotoExploratorio 2022 2023``
    already exists in the current working directory.

    Args:
        zip: Path of the zip file to extract.

    Returns:
        Absolute path of the ``FotoExploratorio 2022 2023`` folder.
    """
    target = 'FotoExploratorio 2022 2023'

    # isdir instead of a name lookup in listdir(): a stray regular file with
    # the same name must not be mistaken for an already extracted archive.
    if not isdir(target):
        with zipfile.ZipFile(zip, 'r') as zip_ref:
            zip_ref.extractall()
        print('unziped')

    return abspath(target)
77
+
78
+
79
+
80
+
81
def get_edificios(path, img_reject, connection):
    """Collect the building folders under ``<path>/EDIFICIOS`` and store them.

    A building folder is only considered when it contains a sub-folder named
    ``imágenes``. If that sub-folder itself contains folders, each of those
    nested folders is listed instead of the building folder. Folders whose
    absolute path contains ``Fotos`` are dropped. The remaining folder names
    (with ``_uprrp`` removed) are inserted into the ``buildings`` table.

    Args:
        path: Root folder of the extracted archive.
        img_reject: List that receives the path of every regular file found
            directly inside a building folder. It is extended in place.
        connection: Open database connection used for the inserts.

    Returns:
        List of absolute folder paths, one per building that was kept.
    """
    edificio_folder = join(path, 'EDIFICIOS')
    edificios = []

    for building_name in listdir(edificio_folder):
        building_dir = join(edificio_folder, building_name)

        # stray files next to the building folders cannot be listed; skip them
        if not isdir(building_dir):
            continue

        for entry in listdir(building_dir):
            entry_path = join(building_dir, entry)

            # Files lying directly in the building folder go to the reject
            # list. The path is built with join(); the earlier string
            # concatenation dropped the separator between the two folder
            # parts, so the isfile() check could never succeed.
            if isfile(entry_path):
                img_reject.append(entry_path)
                continue

            # only the 'imágenes' folder is of interest
            if entry != 'imágenes' or not isdir(entry_path):
                continue

            nested = [
                abspath(join(entry_path, item))
                for item in listdir(entry_path)
                if isdir(join(entry_path, item))
            ]

            # nested folders replace the building folder itself
            for candidate in nested or [abspath(building_dir)]:
                if candidate not in edificios:
                    edificios.append(candidate)

    edificios = [edificio for edificio in edificios if 'Fotos' not in edificio]

    cursor = connection.cursor()
    try:
        for edificio in edificios:
            # keep only the folder name and remove the _uprrp marker
            name = basename(edificio).replace('_uprrp', '')
            cursor.execute('INSERT IGNORE INTO buildings (name) VALUES (%s)', (name,))
        connection.commit()
    finally:
        cursor.close()

    return edificios
143
+
144
def get_images(edificios_list, connection):
    """Copy the images of every building into ``accepted_images`` and store them.

    For each folder in ``edificios_list`` the regular files directly inside
    it and the regular files inside its ``imágenes`` sub-folder are copied to
    ``accepted_images/`` (created in the working directory if needed) and
    inserted into the ``images`` table with the id of the matching building.

    Args:
        edificios_list: Building folder paths.
        connection: Open database connection.

    Returns:
        List of dictionaries ``{'path': <file name>, 'building_id': <id>}``,
        one per copied image. ``path`` is the bare file name, so that
        ``'accepted_images/' + path`` equals the path stored in the database.
    """
    if not isdir('accepted_images'):
        mkdir('accepted_images')

    images = []

    for edificio in edificios_list:
        building_name = basename(edificio.replace('_uprrp', ''))

        cursor = connection.cursor(buffered=True)
        try:
            cursor.execute('SELECT id FROM buildings WHERE name = %s', (building_name,))
            row = cursor.fetchone()
        finally:
            cursor.close()

        # without a building row there is nothing to attach the images to
        if row is None:
            print('building not found in database, skipping: ' + building_name)
            continue
        building_id = row[0]

        # gather the image files in directory order
        sources = []
        for entry in listdir(edificio):
            entry_path = join(edificio, entry)
            if isfile(entry_path):
                # due to formatting issues some images sit directly in the folder
                sources.append(entry_path)
            elif entry == 'imágenes' and isdir(entry_path):
                for name in listdir(entry_path):
                    candidate = join(entry_path, name)
                    # only regular files can be copied
                    if isfile(candidate):
                        sources.append(candidate)

        cursor = connection.cursor(buffered=True)
        try:
            for source in sources:
                img = basename(source)
                # The bare file name is recorded for both kinds of source.
                # Prefixing it with the folder name (as was done for files
                # from 'imágenes') produced a path that matched neither the
                # copied file nor the row inserted below.
                images.append({'path': img, 'building_id': building_id})
                copy(source, join('accepted_images', img))
                cursor.execute(
                    'INSERT IGNORE INTO images (path, building_id) VALUES (%s, %s)',
                    ('accepted_images/' + img, building_id),
                )
            connection.commit()
        finally:
            cursor.close()

    return images
190
+
191
def get_fichas(path, images_list, connection):
    """Fill in image metadata from the fichas.

    Runs ``ficha_script.run_script(path)`` to turn the PDF fichas into JSON
    files under ``fichas_jsons/``. Then, for every JSON file whose
    ``'Código de la imagen'`` value equals the name of an image (its path up
    to the first dot), the matching ``images`` row is updated with the year,
    author, medium, recovery method and recovery date from the ficha.

    Args:
        path: Root folder of the extracted archive.
        images_list: Dictionaries with a ``'path'`` key, as returned by
            ``get_images``.
        connection: Open database connection.
    """
    ficha_script.run_script(path)
    fichas_path = 'fichas_jsons/'

    # Index the images by name once. setdefault keeps the first image for a
    # given name, which is the one the previous linear scan would have hit.
    images_by_name = {}
    for img in images_list:
        images_by_name.setdefault(img['path'].split('.')[0], img)

    # order matches the placeholders of the UPDATE statement below
    fields = (
        'Año',
        'Autor de la fotografía o dibujo',
        'Medio',
        'Recuperado',
        'Fecha de recuperación de la imagen',
    )

    for ficha in listdir(fichas_path):
        if not ficha.endswith('.json'):
            continue

        with open(fichas_path + ficha, encoding='utf-8') as ficha_file:
            data = json.load(ficha_file)

        # fichas without an image code, or with an unknown one, are skipped
        img = images_by_name.get(data.get('Código de la imagen'))
        if img is None:
            continue

        # Missing or empty fields become NULL, so IFNULL() keeps whatever the
        # row already holds instead of failing or storing an empty value.
        values = [data.get(field) or None for field in fields]
        img_path = 'accepted_images/' + img['path']

        cursor = connection.cursor(buffered=True)
        try:
            cursor.execute('''UPDATE images SET
                    year = IFNULL(%s, year),
                    author = IFNULL(%s, author),
                    medium = IFNULL(%s, medium),
                    recovery_method = IFNULL(%s, recovery_method),
                    recovery_date = IFNULL(%s, recovery_date)
                    WHERE path = %s''',
                    (*values, img_path))
            connection.commit()
        finally:
            cursor.close()
236
+    
237
+
238
+        
239
+
240
+            
241
+        
242
def main():
    """Run the whole import: unzip, create tables, load buildings, images, fichas.

    The zip path is taken from the first command-line argument and defaults
    to ``'FotoExploratorio 2022 2023.zip'``. After the import the rejected
    image paths are written to ``rejected_images.txt`` and the building
    folders to ``edificios.txt``.
    """
    # the folder may be left over from an earlier run
    try:
        mkdir('fichas_jsons')
    except FileExistsError:
        pass

    zip_path = argv[1] if len(argv) > 1 else 'FotoExploratorio 2022 2023.zip'
    path = unzip(zip_path)

    # files that could not be assigned to a building
    img_reject = []

    connection = connect_db()
    try:
        create_tables(connection)
        edificios = get_edificios(path, img_reject, connection)
        images = get_images(edificios, connection)
        get_fichas(path, images, connection)
    finally:
        # always release the database connection, also when a step fails
        connection.close()

    with open('rejected_images.txt', 'w', encoding='utf-8') as f:
        f.writelines(img + '\n' for img in img_reject)

    # TODO: also write rejected fichas to rejected_fichas.txt once they are tracked

    # write the edificio list to review it, because some entries are not buildings
    with open('edificios.txt', 'w', encoding='utf-8') as f:
        f.writelines(edificio + '\n' for edificio in edificios)

    print('done')


if __name__ == '__main__':
    main()
295
+