"""Download the rC3 schedule ("Fahrplan") XML and export its events as JSON.

Run as a script, this fetches the schedule from ``scheduleUrl``, flattens
every ``<event>`` into a dict, sorts the dicts by their ``<date>`` value and
writes them to ``fahrplan.json``.
"""
import datetime
import json
import os
import sys
import re
import glob
import shutil
import errno
import subprocess
from urllib.request import urlopen

scheduleUrl = 'http://data.c3voc.de/rC3/everything.schedule.xml'

# Process-wide cache of the parsed schedule, filled by getSchedule().
scheduleTree = None

_WHITESPACE_RE = re.compile(r'\s+')
_TAGLINE_RE = re.compile(r'\((.*?)\)')
_DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S%z"


def _normalize(text):
    """Collapse whitespace runs to one space and strip; ``None`` becomes ''."""
    return _WHITESPACE_RE.sub(' ', text or '').strip()


def _child_text(element, tag):
    """Return the raw text of the first ``tag`` child, or None if absent."""
    child = element.find(tag)
    return None if child is None else child.text


def downloadSchedule(scheduleUrl):
    """Fetch the schedule XML from ``scheduleUrl`` and return its root element.

    Raises whatever ``urlopen`` or the lxml parser raise on network or
    syntax errors.
    """
    # Imported here because this is the only place lxml is needed; the rest
    # of the module only relies on the generic ElementTree element API.
    from lxml import etree

    print("downloading schedule")

    # The context manager closes the HTTP response even if read() fails.
    with urlopen(scheduleUrl) as response:
        xml = response.read()

    # huge_tree lifts lxml's size/depth safety limits for large schedules.
    parser = etree.XMLParser(huge_tree=True)
    return etree.fromstring(xml, parser)


def getSchedule(scheduleUrl):
    """Return the parsed schedule, downloading it on the first call only.

    The cache is a single slot that is not keyed by URL: once a tree is
    cached it is returned for every later call regardless of ``scheduleUrl``.
    """
    global scheduleTree
    # Compare against None explicitly: an element without children is falsy,
    # which would otherwise trigger a fresh download on every call.
    if scheduleTree is None:
        scheduleTree = downloadSchedule(scheduleUrl)
    return scheduleTree


def persons(scheduleUrl, personmap=None, taglinemap=None, forEventId=None):
    """Yield one dict per speaker of each event in the schedule.

    Args:
        scheduleUrl: URL handed to getSchedule().
        personmap: optional ``{person_id: name}`` overrides.
        taglinemap: optional ``{person_id: tagline}`` overrides.
        forEventId: when given, only speakers of that event id are yielded;
            when None, speakers of all events are yielded.

    Yields:
        ``{'id': int, 'person': str, 'tagline': str}``. A parenthesised
        suffix in the person text (``"Name (tagline)"``) is split off into
        ``tagline``. A person id is yielded at most once per event.
    """
    personmap = {} if personmap is None else personmap
    taglinemap = {} if taglinemap is None else taglinemap

    schedule = getSchedule(scheduleUrl)
    for day in schedule.iter('day'):
        for room in day.iter('room'):
            for event in room.iter('event'):
                # Only filter when a specific event was requested.
                if forEventId is not None and int(event.get("id")) != forEventId:
                    continue

                persons_el = event.find('persons')
                if persons_el is None:
                    continue

                seen = set()
                for person_el in persons_el.iter('person'):
                    person_id = int(person_el.get("id"))
                    name = _normalize(person_el.text)
                    tagline = ''

                    match = _TAGLINE_RE.search(name)
                    if match is not None:
                        tagline = match.group(1)
                        name = name.split(" (")[0]

                    # Explicit overrides win over what the XML says.
                    tagline = taglinemap.get(person_id, tagline)
                    name = personmap.get(person_id, name)

                    if person_id not in seen:
                        seen.add(person_id)
                        yield {
                            'id': person_id,
                            'person': name,
                            'tagline': tagline
                        }


def events(scheduleUrl, titlemap=None):
    """Yield one flat dict per event in the schedule, in document order.

    Args:
        scheduleUrl: URL handed to getSchedule().
        titlemap: optional ``{event_id: title}`` overrides.

    Yields:
        A dict with the keys ``day``, ``id``, ``title``, ``subtitle``,
        ``personnames``, ``room``, ``track``, ``start``, ``datetime`` and
        ``roomguid``. ``title``, ``subtitle`` and ``start`` are
        whitespace-normalized and default to ''. ``track`` and ``datetime``
        carry the raw element text, or None when the element is missing.
    """
    titlemap = {} if titlemap is None else titlemap

    schedule = getSchedule(scheduleUrl)
    for day in schedule.iter('day'):
        for room in day.iter('room'):
            for event in room.iter('event'):
                # Collect the names of the persons holding this talk.
                personnames = []
                persons_el = event.find('persons')
                if persons_el is not None:
                    personnames = [
                        _normalize(person.text)
                        for person in persons_el.iter('person')
                        if person.text is not None
                    ]

                event_id = int(event.get('id'))

                if event_id in titlemap:
                    title = titlemap[event_id]
                else:
                    title = _normalize(_child_text(event, 'title'))

                yield {
                    'day': day.get('index'),
                    'id': event_id,
                    'title': title,
                    'subtitle': _normalize(_child_text(event, 'subtitle')),
                    'personnames': ', '.join(personnames),
                    'room': room.attrib['name'],
                    'track': _child_text(event, 'track'),
                    'start': _normalize(_child_text(event, 'start')),
                    'datetime': _child_text(event, 'date'),
                    'roomguid': room.attrib.get('guid', ''),
                }


def _event_sort_key(event):
    """Sort key by parsed ``datetime``; events without a date sort first."""
    raw = event["datetime"]
    if raw is None:
        return (0, None)
    return (1, datetime.datetime.strptime(raw, _DATETIME_FORMAT))


def main(url=scheduleUrl, output_path="fahrplan.json"):
    """Write all events of the schedule at ``url`` to ``output_path`` as JSON.

    Events are sorted chronologically by their ``datetime`` field.
    """
    schedule_events = sorted(events(url), key=_event_sort_key)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(schedule_events, f)


if __name__ == "__main__":
    main()