# primer_parse.py (3.5KB) - no description provided.
import json
from os import remove

import pandas as pd
  4. def initial_parse(name, url):
  5. table_MN = pd.read_html(url)
  6. df = table_MN[0]
  7. df.columns = ['Curso', 'Nombre', 'c', 'Dias', 'Hora', 'Salon']
  8. upi = df[['Curso', 'Nombre', 'Dias', 'Hora', 'Salon']]
  9. # print(upi.head())
  10. # print(upi.to_json(orient='records'))
  11. # puedes usar esta alternativa o la de la linea 17
  12. # upi.to_json(r'C:\Users\diego\Documents\miupi_parse\soup\file.json')
  13. upi.to_json(f'{name}_temp.json', orient='records')
  14. def final_parse(name):
  15. with open(f"{name}_temp.json") as f_in:
  16. data = json.load(f_in)
  17. newdata = {}
  18. for item in data:
  19. # print(item)
  20. if(item["Curso"] != None):
  21. course = item["Curso"][0:8]
  22. info = item["Nombre"].split("Profesor: ")
  23. labCourse = course + '_' + 'LAB'
  24. courseName = info[0]
  25. creditos = info[1][-1]
  26. if(course in newdata):
  27. if(newdata[course][0] != courseName):
  28. newKey = course + '_' + 'LAB'
  29. if(newKey not in newdata):
  30. newdata[newKey] = []
  31. newdata[newKey].append(courseName)
  32. newdata[newKey].append(creditos)
  33. else:
  34. newdata[course] = []
  35. newdata[course].append(courseName)
  36. newdata[course].append(creditos)
  37. # print(newdata)
  38. # print(len(newdata))
  39. with open(f'{name}.json', 'w') as f_out:
  40. json.dump(newdata, f_out)
  41. if __name__ == "__main__":
  42. faculty_urls = {"Asuntos_Aca": "https://miupi.uprrp.edu/horarios/RBA120_AA.HTML",
  43. "Admi": "https://miupi.uprrp.edu/horarios/RBA120_AE.HTML",
  44. "Admi_Grad": "https://miupi.uprrp.edu/horarios/RBA120_AE2.HTML",
  45. "Arqui": "https://miupi.uprrp.edu/horarios/RBA120_AQ.HTML",
  46. "Arqui_Grad": "https://miupi.uprrp.edu/horarios/RBA120_AQ2.HTML",
  47. "Escuela_Grad_Ciencia_Tech_Info": "https://miupi.uprrp.edu/horarios/RBA120_CB.HTML",
  48. "Ciencias_Militares": "https://miupi.uprrp.edu/horarios/RBA120_CM.HTML",
  49. "Ciencias_Naturales": "https://miupi.uprrp.edu/horarios/RBA120_CN.HTML",
  50. "Ciencias_Naturales_Grad": "https://miupi.uprrp.edu/horarios/RBA120_CN2.HTML",
  51. "Comunicaciones": "https://miupi.uprrp.edu/horarios/RBA120_CP.HTML",
  52. "Comunicaciones_Grad": "https://miupi.uprrp.edu/horarios/RBA120_CP2.HTML",
  53. "Ciencias_Sociales": "https://miupi.uprrp.edu/horarios/RBA120_CS.HTML",
  54. "Ciencias_Sociales_Grad": "https://miupi.uprrp.edu/horarios/RBA120_CS2.HTML ",
  55. "Escuela_Derecho": "https://miupi.uprrp.edu/horarios/RBA120_DE.HTML",
  56. "Educacion_Continua": "https://miupi.uprrp.edu/horarios/RBA120_EC.HTML",
  57. "Educacion": "https://miupi.uprrp.edu/horarios/RBA120_ED.HTML",
  58. "Educacion_Grad": "https://miupi.uprrp.edu/horarios/RBA120_ED2.HTML",
  59. "Estudios_Generales": "https://miupi.uprrp.edu/horarios/RBA120_EG.HTML",
  60. "Humanidades": "https://miupi.uprrp.edu/horarios/RBA120_HU.HTML",
  61. "Humanidades_Grad": "https://miupi.uprrp.edu/horarios/RBA120_HU2.HTML",
  62. "Planificacion": "https://miupi.uprrp.edu/horarios/RBA120_PL.HTML"}
  63. for key in faculty_urls:
  64. initial_parse(key, faculty_urls[key])
  65. final_parse(key)
  66. remove(f"{key}_temp.json")