暫無描述

parse.py 3.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. import pandas as pd
  2. import json
  3. from os import remove
  4. def initial_parse(name, url):
  5. table_MN = pd.read_html(url)
  6. df = table_MN[0]
  7. df.columns = ['Curso', 'Nombre', 'c', 'Dias', 'Hora', 'Salon']
  8. upi = df[['Curso', 'Nombre', 'Dias', 'Hora', 'Salon']]
  9. # print(upi.head())
  10. # print(upi.to_json(orient='records'))
  11. # puedes usar esta alternativa o la de la linea 17
  12. # upi.to_json(r'C:\Users\diego\Documents\miupi_parse\soup\file.json')
  13. upi.to_json(f'{name}_temp.json', orient='records')
  14. def final_parse(name):
  15. with open(f"{name}_temp.json") as f_in:
  16. data = json.load(f_in)
  17. newdata = []
  18. for item in data:
  19. if(item["Curso"] == None):
  20. pass
  21. else:
  22. course = {}
  23. course["Curso"] = item["Curso"][0:8]
  24. course["Seccion"] = item["Curso"][9:12]
  25. info = item["Nombre"].split("Profesor: ")
  26. if((profesor := info[1][0:-11]) != ""):
  27. course["Profesor"] = profesor
  28. else:
  29. course["Profesor"] = "TBA"
  30. course["Creditos"] = info[1][-1]
  31. course["Nombre"] = info[0][0:-1]
  32. course["Dias"] = item["Dias"].split(" ")
  33. horario = []
  34. if(item["Hora"] != None):
  35. for i in range(len(course["Dias"])):
  36. horario.append(item["Hora"][15*i:15*(i+1)])
  37. else:
  38. horario.append("TBA")
  39. course["Horario"] = horario
  40. salones_raw = item["Salon"].split("Edificio: ")
  41. salones = []
  42. for i in salones_raw:
  43. if(i != "-" and i != "- " and i != ""):
  44. salones.append(i[:-1])
  45. course["Salones"] = salones
  46. course["Info_Extra"] = item["Curso"][13:-1]
  47. newdata.append(course)
  48. with open(f'{name}.json', 'w') as f_out:
  49. json.dump(newdata, f_out)
  50. if __name__ == "__main__":
  51. faculty_urls = {"Asuntos_Aca": "https://miupi.uprrp.edu/horarios/RBA120_AA.HTML",
  52. "Admi": "https://miupi.uprrp.edu/horarios/RBA120_AE.HTML",
  53. "Admi_Grad": "https://miupi.uprrp.edu/horarios/RBA120_AE2.HTML",
  54. "Arqui": "https://miupi.uprrp.edu/horarios/RBA120_AQ.HTML",
  55. "Arqui_Grad": "https://miupi.uprrp.edu/horarios/RBA120_AQ2.HTML",
  56. "Ciencia_Tech_Info_Grad": "https://miupi.uprrp.edu/horarios/RBA120_CB.HTML",
  57. "Ciencias_Militares": "https://miupi.uprrp.edu/horarios/RBA120_CM.HTML",
  58. "Naturales": "https://miupi.uprrp.edu/horarios/RBA120_CN.HTML",
  59. "Naturales_Grad": "https://miupi.uprrp.edu/horarios/RBA120_CN2.HTML",
  60. "Comunicaciones": "https://miupi.uprrp.edu/horarios/RBA120_CP.HTML",
  61. "Comunicaciones_Grad": "https://miupi.uprrp.edu/horarios/RBA120_CP2.HTML",
  62. "Sociales": "https://miupi.uprrp.edu/horarios/RBA120_CS.HTML",
  63. "Sociales_Grad": "https://miupi.uprrp.edu/horarios/RBA120_CS2.HTML ",
  64. "Derecho": "https://miupi.uprrp.edu/horarios/RBA120_DE.HTML",
  65. "Educacion_Continua": "https://miupi.uprrp.edu/horarios/RBA120_EC.HTML",
  66. "Educacion": "https://miupi.uprrp.edu/horarios/RBA120_ED.HTML",
  67. "Educacion_Grad": "https://miupi.uprrp.edu/horarios/RBA120_ED2.HTML",
  68. "Generales": "https://miupi.uprrp.edu/horarios/RBA120_EG.HTML",
  69. "Humanidades": "https://miupi.uprrp.edu/horarios/RBA120_HU.HTML",
  70. "Humanidades_Grad": "https://miupi.uprrp.edu/horarios/RBA120_HU2.HTML",
  71. "Planificacion": "https://miupi.uprrp.edu/horarios/RBA120_PL.HTML"}
  72. #initial_parse("Naturales", faculty_urls["Naturales"])
  73. for key in faculty_urls:
  74. initial_parse(key, faculty_urls[key])
  75. final_parse(key)
  76. remove(f"{key}_temp.json")