"""Populate the FotoExploratorio database from the Google Drive zip export.

Phase 1 of the project: there are no user entries yet, so the database is
seeded from the extracted archive.  Not every building, image and ficha is
uploaded because the directory layout inside the archive is inconsistent;
that will be fixed later.  For now only the entries that can be recognised
are uploaded.

Usage: run the script with the path of the zip file as the first argument
(defaults to 'FotoExploratorio 2022 2023.zip' in the current directory).
"""

import json
import zipfile
from os import environ, listdir, mkdir
from os.path import abspath, basename, isdir, isfile, join, splitext
# using copy for now instead of move so the extracted folders can be deleted
# and re-created without having to set everything up again
from shutil import copy, move
from sys import argv

import mysql.connector

# import module from current directory
import ficha_script

# Directory created in the working directory when the archive is extracted.
EXTRACTED_DIR = 'FotoExploratorio 2022 2023'
# Flat directory (relative to the working directory) the images are copied to.
ACCEPTED_DIR = 'accepted_images'
# Directory listed by get_fichas() to find the ficha JSON files.
FICHAS_JSON_DIR = 'fichas_jsons'
# Name of the per-building sub-folder that holds the images.
IMAGES_SUBDIR = 'imágenes'


def connect_db():
    """Open and return a connection to the MySQL database.

    The defaults are the original development settings (user 'root', empty
    password, localhost:3306, database 'fotoexploratorio').  Each one can be
    overridden through an environment variable so credentials do not have to
    live in the source: FOTOEXPLORATORIO_DB_USER, FOTOEXPLORATORIO_DB_PASSWORD,
    FOTOEXPLORATORIO_DB_HOST, FOTOEXPLORATORIO_DB_PORT and
    FOTOEXPLORATORIO_DB_NAME.
    """
    return mysql.connector.connect(
        user=environ.get('FOTOEXPLORATORIO_DB_USER', 'root'),
        password=environ.get('FOTOEXPLORATORIO_DB_PASSWORD', ''),
        host=environ.get('FOTOEXPLORATORIO_DB_HOST', 'localhost'),
        port=environ.get('FOTOEXPLORATORIO_DB_PORT', '3306'),
        database=environ.get('FOTOEXPLORATORIO_DB_NAME', 'fotoexploratorio'),
    )


def create_tables(connection):
    """Create the `buildings` and `images` tables if they do not exist yet."""
    cursor = connection.cursor()
    try:
        cursor.execute('''CREATE TABLE IF NOT EXISTS buildings (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255) CHARACTER SET utf8 UNIQUE NOT NULL
        )''')
        cursor.execute('''CREATE TABLE IF NOT EXISTS images (
            id INT AUTO_INCREMENT PRIMARY KEY,
            building_id INT,
            path VARCHAR(255) CHARACTER SET utf8 UNIQUE NOT NULL,
            year INT,
            medium VARCHAR(255) CHARACTER SET utf8 DEFAULT NULL,
            author VARCHAR(255) CHARACTER SET utf8 DEFAULT NULL,
            recovery_method VARCHAR(255) CHARACTER SET utf8 DEFAULT NULL,
            recovery_date DATE DEFAULT NULL,
            FOREIGN KEY (building_id) REFERENCES buildings(id)
        )''')
        # TODO: authors may eventually move into their own table, but some
        # fichas have no author, so they stay as a column on `images` for now.
        connection.commit()
    finally:
        cursor.close()


def unzip(zip):
    """Extract the archive into the working directory and return its path.

    Extraction is skipped when the extracted folder already exists in the
    working directory.

    Args:
        zip: path of the zip file (the name shadows the builtin but is kept
            so existing callers keep working).

    Returns:
        Absolute path of the extracted 'FotoExploratorio 2022 2023' folder.
    """
    if EXTRACTED_DIR not in listdir():
        with zipfile.ZipFile(zip, 'r') as zip_ref:
            zip_ref.extractall()
        print('unziped')
    return abspath(EXTRACTED_DIR)


def _building_name(edificio_path):
    """Return the database name of a building folder (no path, no '_uprrp')."""
    return basename(edificio_path).replace('_uprrp', '')


def get_edificios(path, img_reject, connection):
    """Collect the building folders under EDIFICIOS and insert their names.

    A folder directly under EDIFICIOS counts as one building unless its
    'imágenes' sub-folder contains folders of its own; in that case each of
    those nested folders is listed as a building and the parent is not.
    Folders whose path contains 'Fotos' are dropped.

    Args:
        path: absolute path of the extracted archive.
        img_reject: list that is extended in place with the paths of files
            found where a folder was expected.
        connection: open database connection.

    Returns:
        List of absolute paths of the building folders that were kept.
    """
    edificio_root = join(path, 'EDIFICIOS')
    edificios = []

    for folder_name in listdir(edificio_root):
        folder_path = join(edificio_root, folder_name)
        if not isdir(folder_path):
            # a stray file directly under EDIFICIOS cannot be a building
            img_reject.append(folder_path)
            continue

        nested_folder = False
        for entry in listdir(folder_path):
            entry_path = join(folder_path, entry)
            if isfile(entry_path):
                # NOTE(review): get_images() still imports loose files of
                # folders that remain in the returned list, so this list is
                # "files found outside an imágenes folder" rather than a
                # definitive list of images that were not imported.
                img_reject.append(entry_path)
            elif isdir(entry_path) and entry == IMAGES_SUBDIR:
                for item in listdir(entry_path):
                    nested_path = abspath(join(entry_path, item))
                    if isdir(nested_path):
                        # a folder inside imágenes means the real buildings
                        # are one level down, so the parent is not listed
                        nested_folder = True
                        if nested_path not in edificios:
                            edificios.append(nested_path)

        if not nested_folder:
            building_path = abspath(folder_path)
            if building_path not in edificios:
                edificios.append(building_path)

    edificios = [edificio for edificio in edificios if 'Fotos' not in edificio]

    cursor = connection.cursor()
    try:
        for edificio in edificios:
            cursor.execute('INSERT IGNORE INTO buildings (name) VALUES (%s)',
                           (_building_name(edificio),))
        connection.commit()
    finally:
        cursor.close()
    return edificios


def _register_image(connection, cursor, images, source, building_id):
    """Copy one image into ACCEPTED_DIR, insert its row and record it."""
    name = basename(source)
    copy(source, join(ACCEPTED_DIR, name))
    # the stored path always uses '/', independent of the platform
    cursor.execute(
        'INSERT IGNORE INTO images (path, building_id) VALUES (%s, %s)',
        ('accepted_images/' + name, building_id))
    connection.commit()
    images.append({'path': name, 'building_id': building_id})


def get_images(edificios_list, connection):
    """Copy the images of every building into 'accepted_images' and insert them.

    Images are taken from the building's 'imágenes' sub-folder and, because
    of formatting issues in the archive, also from loose files placed
    directly in the building folder.  All images land in one flat folder, so
    two images with the same file name overwrite each other and only the
    first one gets a database row (INSERT IGNORE on the unique path).

    Args:
        edificios_list: building folder paths as returned by get_edificios().
        connection: open database connection.

    Returns:
        List of dicts with keys 'path' (the image file name, which is also
        its name inside 'accepted_images') and 'building_id'.
    """
    if ACCEPTED_DIR not in listdir():
        mkdir(ACCEPTED_DIR)

    images = []
    cursor = connection.cursor(buffered=True)
    try:
        for edificio in edificios_list:
            cursor.execute('SELECT id FROM buildings WHERE name = %s',
                           (_building_name(edificio),))
            row = cursor.fetchone()
            if row is None:
                # no matching building row; skip instead of crashing on None
                continue
            building_id = row[0]

            for entry in listdir(edificio):
                entry_path = join(edificio, entry)
                if isdir(entry_path) and entry == IMAGES_SUBDIR:
                    for img in listdir(entry_path):
                        img_path = join(entry_path, img)
                        # nested folders are handled as their own buildings
                        if isfile(img_path):
                            _register_image(connection, cursor, images,
                                            img_path, building_id)
                elif isfile(entry_path):
                    _register_image(connection, cursor, images,
                                    entry_path, building_id)
    finally:
        cursor.close()
    return images


def get_fichas(path, images_list, connection):
    """Update the image rows with the metadata found in the ficha JSON files.

    Runs ficha_script.run_script(path) first (presumably this writes the JSON
    files into 'fichas_jsons' -- confirm against ficha_script), then reads
    every file in that folder.  A ficha is applied to the first image in
    images_list whose file name without extension equals the ficha's
    'Código de la imagen'.  Fichas without a matching image are skipped.
    Fields missing from a ficha are passed as NULL, which leaves the stored
    column untouched because of the IFNULL in the update.

    Args:
        path: absolute path of the extracted archive.
        images_list: image dicts as returned by get_images().
        connection: open database connection.
    """
    ficha_script.run_script(path)

    # index the images by code once; setdefault keeps the first occurrence,
    # which is the image a linear scan over images_list would have found
    images_by_code = {}
    for img in images_list:
        images_by_code.setdefault(splitext(img['path'])[0], img)

    cursor = connection.cursor(buffered=True)
    try:
        for ficha in listdir(FICHAS_JSON_DIR):
            ficha_path = join(FICHAS_JSON_DIR, ficha)
            if not isfile(ficha_path):
                continue
            with open(ficha_path, encoding='utf-8') as ficha_file:
                data = json.load(ficha_file)

            code = data.get('Código de la imagen')
            if not isinstance(code, str):
                continue
            img = images_by_code.get(code)
            if img is None:
                continue

            cursor.execute(
                '''UPDATE images SET
                    year = IFNULL(%s, year),
                    author = IFNULL(%s, author),
                    medium = IFNULL(%s, medium),
                    recovery_method = IFNULL(%s, recovery_method),
                    recovery_date = IFNULL(%s, recovery_date)
                WHERE path = %s''',
                (data.get('Año'),
                 data.get('Autor de la fotografía o dibujo'),
                 data.get('Medio'),
                 data.get('Recuperado'),
                 data.get('Fecha de recuperación de la imagen'),
                 'accepted_images/' + img['path']))
            connection.commit()
    finally:
        cursor.close()


def main():
    """Extract the archive, load buildings, images and fichas, write reports."""
    try:
        mkdir(FICHAS_JSON_DIR)
    except FileExistsError:
        pass

    zip_path = argv[1] if len(argv) > 1 else 'FotoExploratorio 2022 2023.zip'
    path = unzip(zip_path)

    # files found where a folder was expected; filled by get_edificios()
    img_reject = []

    connection = connect_db()
    try:
        create_tables(connection)
        edificios = get_edificios(path, img_reject, connection)
        images = get_images(edificios, connection)
        get_fichas(path, images, connection)
    finally:
        connection.close()

    with open('rejected_images.txt', 'w', encoding='utf-8') as f:
        for img in img_reject:
            f.write(img + '\n')
    # TODO: also report the fichas that could not be matched to an image

    # write the building list so it can be reviewed, because some of the
    # folders are not actually buildings
    with open('edificios.txt', 'w', encoding='utf-8') as f:
        for edificio in edificios:
            f.write(edificio + '\n')

    print('done')


if __name__ == '__main__':
    main()