暫無描述

second_parser.py 4.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101
import json
from os import remove

import pandas as pd
  4. def initial_parse(name, url):
  5. table_MN = pd.read_html(url)
  6. df = table_MN[0]
  7. del df[5]
  8. df.columns = ['Curso', 'Nombre', 'c', 'Dias', 'Hora', 'Salon']
  9. upi = df[['Curso', 'Nombre', 'Dias', 'Hora', 'Salon']]
  10. # print(upi.head())
  11. # print(upi.to_json(orient='records'))
  12. # puedes usar esta alternativa o la de la linea 17
  13. # upi.to_json(r'C:\Users\diego\Documents\miupi_parse\soup\file.json')
  14. upi.to_json(f'{name}2_temp.json', orient='records')
  15. def final_parse(name):
  16. with open(f"{name}2_temp.json") as f_in:
  17. data = json.load(f_in)
  18. newdata = {}
  19. for item in data:
  20. # print(item)
  21. course = item["Curso"][0:8]
  22. info = item["Nombre"]
  23. labCourse = course + '_' + 'LAB'
  24. if(course in newdata):
  25. if( (newdata[course][0] != info) and ('Créditos' not in info) and ('Prof.' not in info)):
  26. newKey = course + '_' + 'LAB'
  27. if(newKey not in newdata):
  28. newdata[newKey] = []
  29. newdata[newKey].append(info)
  30. if(info == 'LABORATORIO'):
  31. newdata[newKey].append(0)
  32. elif(info == 'LABORATORIO INTERMED II'):
  33. newdata[newKey].append(2)
  34. else:
  35. newdata[newKey].append(1)
  36. elif('Prof.' in info):
  37. pass
  38. elif('Créditos' in info):
  39. creditos = info.split()
  40. if (newdata[course][0] == "INVESTIG NO GRADUADA"):
  41. newdata[course].append(int(creditos[0]))
  42. elif(len(newdata[course]) == 1):
  43. newdata[course].append(int(creditos[0]))
  44. else:
  45. newdata[course] = []
  46. newdata[course].append(info)
  47. # print(newdata)
  48. # print(len(newdata))
  49. with open(f'{name}2.json', 'w') as f_out:
  50. json.dump(newdata, f_out)
  51. if __name__ == "__main__":
  52. faculty_urls = {"Asuntos_Aca": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_AA.HTML",
  53. "Admi": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_AE.HTML",
  54. "Admi_Grad": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_AE2.HTML",
  55. "Arqui": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_AQ.HTML",
  56. "Arqui_Grad": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_AQ2.HTML",
  57. "Escuela_Grad_Ciencia_Tech_Info": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_CB.HTML",
  58. "Ciencias_Militares": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_CM.HTML",
  59. "Ciencias_Naturales": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_CN.HTML",
  60. "Ciencias_Naturales_Grad": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_CN2.HTML",
  61. "Comunicaciones": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_CP.HTML",
  62. "Comunicaciones_Grad": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_CP2.HTML",
  63. "Ciencias_Sociales": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_CS.HTML",
  64. "Ciencias_Sociales_Grad": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_CS2.HTML ",
  65. "Escuela_Derecho": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_DE.HTML",
  66. "Educacion_Continua": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_EC.HTML",
  67. "Educacion": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_ED.HTML",
  68. "Educacion_Grad": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_ED2.HTML",
  69. "Estudios_Generales": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_EG.HTML",
  70. "Humanidades": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_HU.HTML",
  71. "Humanidades_Grad": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_HU2.HTML",
  72. "Planificacion": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_PL.HTML"}
  73. # faculty_urls = {"Generales": "https://miupi.uprrp.edu/horarios/enero-2019/RBA120_EG.HTML"}
  74. for key in faculty_urls:
  75. initial_parse(key, faculty_urls[key])
  76. final_parse(key)
  77. remove(f"{key}2_temp.json")