"""Scrape course offerings from saved HTML listing pages and print them as JSON.

Usage: python <this script> DIRECTORY

Every file in DIRECTORY whose name starts with "RBA" is parsed. The first
``<tr>`` of each file is skipped, and the remaining rows are consumed three at
a time, one course per group of three rows. The collected courses are printed
to stdout as ``data= <json>``.

The code runs unchanged on Python 2 and Python 3.
"""
import json
import os
import sys

# Only files whose names start with this prefix are processed.
FILE_PREFIX = "RBA"

# Each course spans this many consecutive table rows.
ROWS_PER_COURSE = 3

# Comment attached to a course whose credits cell contains "**".
NOT_OFFERED_COMMENT = "Cursos no se ofreceran en el proximo semestre"


def _meeting(days, times, room):
    """Return one schedule entry in the shape used by the JSON output."""
    return {"days": days, "times": times, "room": room}


def parse_course(row_texts):
    """Build one course record from the text of its three table rows.

    Each row text is split on newlines and cells are read by position:

    * row 1: code, name, type, days, times, room (cells 1-6)
    * row 2: professor, "cupo", then an optional second meeting given as
      days, times, room (cells 1-5)
    * row 3: credits (cell 1); a "**" anywhere in that cell marks the course
      with ``NOT_OFFERED_COMMENT``

    Args:
        row_texts: sequence of exactly three strings, the text of the
            course's three consecutive rows.

    Returns:
        A dict with the keys "code", "name", "type", "schedule", "cupo",
        "prof", "creds" and "comm".

    Raises:
        ValueError: if ``row_texts`` does not hold exactly three rows, or the
            credits cell does not start with an integer.
        IndexError: if a row has fewer newline-separated cells than expected.
    """
    first, second, third = [text.split("\n") for text in row_texts]

    # The meeting on the first row is always recorded, even when it is blank.
    schedule = [_meeting(first[4].strip(), first[5].strip(), first[6].strip())]

    # The second row carries an additional meeting only when its "days" cell
    # is non-empty.
    extra_days = second[3].strip()
    if extra_days:
        schedule.append(
            _meeting(extra_days, second[4].strip(), second[5].strip())
        )

    credits_cell = third[1]
    return {
        "code": first[1].strip(),
        "name": first[2].strip(),
        "type": first[3].strip(),
        "schedule": schedule,
        # Keep only what follows the last ":" in the cell.
        "cupo": second[2].strip().split(":")[-1].strip(),
        # Text after the last "Prof." marker, kept verbatim (not stripped).
        "prof": second[1].strip().split("Prof.")[-1],
        "creds": int(credits_cell.strip().split()[0]),
        "comm": NOT_OFFERED_COMMENT if "**" in credits_cell else None,
    }


def parse_courses(row_texts):
    """Parse a flat list of row texts into course records, three rows each.

    A trailing group with fewer than three rows cannot form a course; it is
    skipped with a warning on stderr instead of aborting the whole run.

    Args:
        row_texts: list of row text strings, header row already removed.

    Returns:
        A list of course dicts as produced by ``parse_course``.
    """
    leftover = len(row_texts) % ROWS_PER_COURSE
    if leftover:
        sys.stderr.write(
            "warning: ignoring %d trailing row(s) that do not form a "
            "complete course\n" % leftover
        )
    usable = len(row_texts) - leftover
    # Step through by index; popping from the front of a list is O(n) each.
    return [
        parse_course(row_texts[start:start + ROWS_PER_COURSE])
        for start in range(0, usable, ROWS_PER_COURSE)
    ]


def load_row_texts(path):
    """Return the text of every ``<tr>`` in the HTML file except the first.

    Args:
        path: path of the HTML file to read.

    Returns:
        A list of strings, one per table row; empty when the file has at most
        one row.
    """
    # Imported here rather than at module level so the parsing helpers above
    # can be imported and tested without Beautiful Soup installed.
    from bs4 import BeautifulSoup

    # Read bytes so Beautiful Soup detects the document encoding itself
    # instead of relying on the locale's default text encoding.
    with open(path, "rb") as fp:
        # NOTE(review): no parser is named, so bs4 picks the best one that is
        # installed; pin one once the intended parser is confirmed.
        soup = BeautifulSoup(fp)
    # Slicing (unlike pop(0)) is safe when the file contains no rows at all.
    return [row.text for row in soup.find_all("tr")[1:]]


def main(argv=None):
    """Parse every matching file in the given directory and print the JSON.

    Args:
        argv: argument vector; defaults to ``sys.argv``. ``argv[1]`` is the
            directory to scan.

    Returns:
        Process exit status: 0 on success, 2 when the directory is missing
        from the arguments.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        program = argv[0] if argv else "script"
        sys.stderr.write("usage: %s DIRECTORY\n" % program)
        return 2

    directory = argv[1]
    courses_list = []
    # os.listdir order is arbitrary; sorting makes the output reproducible.
    for filename in sorted(os.listdir(directory)):
        if not filename.startswith(FILE_PREFIX):
            continue
        row_texts = load_row_texts(os.path.join(directory, filename))
        courses_list.extend(parse_courses(row_texts))

    # A single parenthesised argument prints identically on Python 2 and 3.
    print("data= " + json.dumps(courses_list))
    return 0


if __name__ == "__main__":
    sys.exit(main())