import datetime
import json
import os
import sys
import re
import glob
import shutil
import errno
import subprocess
from urllib.request import urlopen

from lxml import etree

scheduleUrl = 'http://data.c3voc.de/rC3/everything.schedule.xml'

# Module-level cache for the parsed schedule; filled on first use by
# getSchedule() and reused by every later call.
scheduleTree = None

# Compiled once because they are applied to every person and event.
_WHITESPACE_RE = re.compile(r'\s+')
_TAGLINE_RE = re.compile(r'\((.*?)\)')


def _normalize_text(text):
    """Collapse whitespace runs in *text* to single spaces and strip the ends.

    ``None`` (an element without text content) is treated as an empty string.
    """
    if text is None:
        return ''
    return _WHITESPACE_RE.sub(' ', text).strip()


def _child_text(element, tag):
    """Return the raw text of the first *tag* child of *element*.

    Returns ``None`` when the child does not exist or has no text.
    """
    child = element.find(tag)
    return None if child is None else child.text


def downloadSchedule(scheduleUrl):
    """Download the schedule XML from *scheduleUrl* and return its parsed root.

    Returns:
        The root element produced by ``etree.fromstring``.
    """
    print("downloading schedule")

    # Close the HTTP response as soon as the body has been read.
    with urlopen(scheduleUrl) as response:
        xml = response.read()

    # huge_tree=True is passed so that large schedule documents are not
    # rejected by the parser's default size limits.
    parser = etree.XMLParser(huge_tree=True)
    return etree.fromstring(xml, parser)


def getSchedule(scheduleUrl):
    """Return the parsed schedule, downloading it only on the first call.

    The result is cached in the module-level ``scheduleTree``. Once the cache
    is filled, *scheduleUrl* is ignored by subsequent calls.
    """
    global scheduleTree
    # Compare against None explicitly: truth-testing an element depends on
    # whether it has children, not on whether it exists.
    if scheduleTree is None:
        scheduleTree = downloadSchedule(scheduleUrl)
    return scheduleTree


def persons(scheduleUrl, personmap=None, taglinemap=None, forEventId=None):
    """Yield one dict per person attached to the events of the schedule.

    Args:
        scheduleUrl: URL of the schedule XML (see getSchedule for caching).
        personmap: optional mapping of person id -> replacement display name.
        taglinemap: optional mapping of person id -> replacement tagline.
        forEventId: when given, only persons of the event with this integer
            id are yielded; when ``None``, persons of all events are yielded.

    Yields:
        ``{'id': int, 'person': str, 'tagline': str}``. The tagline is taken
        from the first parenthesised part of the person's text, which is then
        cut off the name. A person id is yielded at most once per event.
    """
    personmap = {} if personmap is None else personmap
    taglinemap = {} if taglinemap is None else taglinemap

    schedule = getSchedule(scheduleUrl)
    for day in schedule.iter('day'):
        for room in day.iter('room'):
            for event in room.iter('event'):
                eventid = int(event.get("id"))
                # Apply the filter only when an event id was requested.
                if forEventId is not None and eventid != forEventId:
                    continue

                persons_node = event.find('persons')
                if persons_node is None:
                    continue

                # Ids already emitted for this event.
                seen_ids = set()
                for person_node in persons_node.iter('person'):
                    person_id = int(person_node.get("id"))
                    name = _normalize_text(person_node.text)

                    # "Name (tagline)" -> name "Name", tagline "tagline".
                    tagline = ''
                    match = _TAGLINE_RE.search(name)
                    if match is not None:
                        tagline = match.group(1)
                        name = name.split(" (")[0]

                    # Explicit overrides win over what the schedule says.
                    if person_id in taglinemap:
                        tagline = taglinemap[person_id]
                    if person_id in personmap:
                        name = personmap[person_id]

                    if person_id in seen_ids:
                        continue
                    seen_ids.add(person_id)
                    yield {
                        'id': person_id,
                        'person': name,
                        'tagline': tagline
                    }


def events(scheduleUrl, titlemap=None):
    """Yield one dict per event in the schedule, in document order.

    Args:
        scheduleUrl: URL of the schedule XML (see getSchedule for caching).
        titlemap: optional mapping of event id -> replacement title.

    Yields:
        A dict with the keys ``day``, ``id``, ``title``, ``subtitle``,
        ``personnames`` (comma-separated), ``room``, ``track``, ``start``,
        ``datetime`` and ``roomguid``. ``title``, ``subtitle`` and ``start``
        are whitespace-normalised and fall back to ``''`` when missing;
        ``track`` and ``datetime`` are the raw element text or ``None`` when
        the element is missing or empty; ``roomguid`` falls back to ``''``.

    Raises:
        KeyError: if a ``room`` element has no ``name`` attribute.
    """
    titlemap = {} if titlemap is None else titlemap

    schedule = getSchedule(scheduleUrl)
    for day in schedule.iter('day'):
        for room in day.iter('room'):
            for event in room.iter('event'):
                # Names of the persons holding this talk.
                persons_node = event.find('persons')
                if persons_node is None:
                    personnames = []
                else:
                    personnames = [
                        _normalize_text(person_node.text)
                        for person_node in persons_node.iter('person')
                    ]

                event_id = int(event.get('id'))
                if event_id in titlemap:
                    title = titlemap[event_id]
                else:
                    title = _normalize_text(_child_text(event, 'title'))

                subtitle = _normalize_text(_child_text(event, 'subtitle'))
                start = _normalize_text(_child_text(event, 'start'))

                yield {
                    'day': day.get('index'),
                    'id': event_id,
                    'title': title,
                    'subtitle': subtitle,
                    'personnames': ', '.join(personnames),
                    'room': room.attrib['name'],
                    'track': _child_text(event, 'track'),
                    'start': start,
                    'datetime': _child_text(event, 'date'),
                    'roomguid': room.get('guid', ''),
                }


if __name__ == "__main__":
    # Use a distinct name so the events() generator function is not rebound.
    all_events = sorted(
        events(scheduleUrl),
        key=lambda entry: datetime.datetime.fromisoformat(entry["datetime"]),
    )

    with open("fahrplan.json", "w", encoding="utf-8") as f:
        json.dump(all_events, f)