Add Fahrplan DL script

This commit is contained in:
octycs 2020-12-22 19:50:45 +01:00
parent 1c734bf109
commit 5f1b207d6a
1 changed files with 130 additions and 0 deletions

130
fahrplan.py Normal file
View File

@ -0,0 +1,130 @@
import datetime
import json
import os
import sys
import re
import glob
import shutil
import errno
import subprocess
from lxml import etree
from urllib.request import urlopen
scheduleUrl = 'http://data.c3voc.de/rC3/everything.schedule.xml'
scheduleTree=None
# Download the Events-Schedule and parse all Events out of it. Yield a tupel for each Event
def downloadSchedule(scheduleUrl):
print("downloading schedule")
# download the schedule
response = urlopen(scheduleUrl)
# read xml-source
xml = response.read()
# parse into ElementTree
parser = etree.XMLParser(huge_tree=True)
return etree.fromstring(xml, parser)
def getSchedule(scheduleUrl):
global scheduleTree
if not scheduleTree:
scheduleTree=downloadSchedule(scheduleUrl)
return scheduleTree
def persons(scheduleUrl, personmap={}, taglinemap={}, forEventId=None):
schedule = getSchedule(scheduleUrl)
# iterate all days
for day in schedule.iter('day'):
# iterate all rooms
for room in day.iter('room'):
# iterate events on that day in this room
for event in room.iter('event'):
eventid = int(event.get("id"))
if event != None and not eventid == forEventId:
continue
# aggregate names of the persons holding this talk
persons_seen = []
if event.find('persons') is not None:
for person in event.find('persons').iter('person'):
id = int(person.get("id"))
person = re.sub(r'\s+', ' ', person.text).strip()
match = re.search(r'\((.*?)\)', person)
tagline = ''
if not match is None:
tagline = match.group(1)
person = person.split(" (")[0]
if id in taglinemap:
tagline = taglinemap[id]
if id in personmap:
person = personmap[id]
if not id in persons_seen:
persons_seen.append(id)
yield {
'id': id,
'person': person,
'tagline': tagline
}
def events(scheduleUrl, titlemap={}):
schedule = getSchedule(scheduleUrl)
# iterate all days
for day in schedule.iter('day'):
# iterate all rooms
for room in day.iter('room'):
# iterate events on that day in this room
for event in room.iter('event'):
# aggregate names of the persons holding this talk
personnames = []
if event.find('persons') is not None:
for person in event.find('persons').iter('person'):
personname = re.sub(r'\s+', ' ', person.text).strip()
personnames.append(personname)
id = int(event.get('id'))
if id in titlemap:
title = titlemap[id]
elif event.find('title') is not None and event.find('title').text is not None:
title = re.sub(r'\s+', ' ', event.find('title').text).strip()
else:
title = ''
if event.find('subtitle') is not None and event.find('subtitle').text is not None:
subtitle = re.sub(r'\s+', ' ', event.find('subtitle').text).strip()
else:
subtitle = ''
if event.find('start') is not None and event.find('start').text is not None:
start = re.sub(r'\s+', ' ', event.find('start').text).strip()
else:
start = ''
# yield a tupel with the event-id, event-title and person-names
yield {
'day': day.get('index'),
'id': id,
'title': title,
'subtitle': subtitle,
#'persons': personnames,
'personnames': ', '.join(personnames),
'room': room.attrib['name'],
'track': event.find('track').text,
'start': event.find('start').text,
'datetime': event.find('date').text,
'roomguid': room.attrib['guid'] if 'guid' in room.attrib else '',
#'url': event.find('url').text
}
if __name__ == "__main__":
events = list(events(scheduleUrl))
events = sorted(events, key=lambda x: datetime.datetime.strptime(x["datetime"], "%Y-%m-%dT%H:%M:%S%z"))
with open("fahrplan.json", "w") as f:
json.dump(events, f)