Add Fahrplan DL script
This commit is contained in:
		
							parent
							
								
									1c734bf109
								
							
						
					
					
						commit
						5f1b207d6a
					
				| 
						 | 
					@ -0,0 +1,130 @@
 | 
				
			||||||
 | 
					import datetime
 | 
				
			||||||
 | 
					import json
 | 
				
			||||||
 | 
					import os
 | 
				
			||||||
 | 
					import sys
 | 
				
			||||||
 | 
					import re
 | 
				
			||||||
 | 
					import glob
 | 
				
			||||||
 | 
					import shutil
 | 
				
			||||||
 | 
					import errno
 | 
				
			||||||
 | 
					import subprocess
 | 
				
			||||||
 | 
					from lxml import etree
 | 
				
			||||||
 | 
					from urllib.request import urlopen
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
# Location of the schedule XML; used as the default source when this file is
# run as a script.
scheduleUrl = 'http://data.c3voc.de/rC3/everything.schedule.xml'

# Module-level cache for the parsed schedule. Starts out empty (None) and is
# filled by getSchedule() on its first call.
scheduleTree=None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Download the Events-Schedule and parse all Events out of it. Yield a tupel for each Event
 | 
				
			||||||
 | 
					def downloadSchedule(scheduleUrl):
 | 
				
			||||||
 | 
					    print("downloading schedule")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # download the schedule
 | 
				
			||||||
 | 
					    response = urlopen(scheduleUrl)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # read xml-source
 | 
				
			||||||
 | 
					    xml = response.read()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # parse into ElementTree
 | 
				
			||||||
 | 
					    parser = etree.XMLParser(huge_tree=True)
 | 
				
			||||||
 | 
					    return etree.fromstring(xml, parser)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def getSchedule(scheduleUrl):
 | 
				
			||||||
 | 
					    global scheduleTree
 | 
				
			||||||
 | 
					    if not scheduleTree:
 | 
				
			||||||
 | 
					        scheduleTree=downloadSchedule(scheduleUrl)
 | 
				
			||||||
 | 
					    return scheduleTree
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def persons(scheduleUrl, personmap={}, taglinemap={}, forEventId=None):
 | 
				
			||||||
 | 
					    schedule = getSchedule(scheduleUrl)
 | 
				
			||||||
 | 
					    # iterate all days
 | 
				
			||||||
 | 
					    for day in schedule.iter('day'):
 | 
				
			||||||
 | 
					        # iterate all rooms
 | 
				
			||||||
 | 
					        for room in day.iter('room'):
 | 
				
			||||||
 | 
					            # iterate events on that day in this room
 | 
				
			||||||
 | 
					            for event in room.iter('event'):
 | 
				
			||||||
 | 
					                eventid = int(event.get("id"))
 | 
				
			||||||
 | 
					                if event != None and not eventid == forEventId:
 | 
				
			||||||
 | 
					                    continue
 | 
				
			||||||
 | 
					                # aggregate names of the persons holding this talk
 | 
				
			||||||
 | 
					                persons_seen = []
 | 
				
			||||||
 | 
					                if event.find('persons') is not None:
 | 
				
			||||||
 | 
					                    for person in event.find('persons').iter('person'):
 | 
				
			||||||
 | 
					                        id = int(person.get("id"))
 | 
				
			||||||
 | 
					                        person = re.sub(r'\s+', ' ', person.text).strip()
 | 
				
			||||||
 | 
					                        match = re.search(r'\((.*?)\)', person)
 | 
				
			||||||
 | 
					                        tagline = ''
 | 
				
			||||||
 | 
					                        if not match is None:
 | 
				
			||||||
 | 
					                            tagline = match.group(1)
 | 
				
			||||||
 | 
					                            person = person.split(" (")[0]
 | 
				
			||||||
 | 
					                        if id in taglinemap:
 | 
				
			||||||
 | 
					                            tagline = taglinemap[id]
 | 
				
			||||||
 | 
					                        if id in personmap:
 | 
				
			||||||
 | 
					                            person = personmap[id]
 | 
				
			||||||
 | 
					                        if not id in persons_seen:
 | 
				
			||||||
 | 
					                            persons_seen.append(id)
 | 
				
			||||||
 | 
					                            yield {
 | 
				
			||||||
 | 
					                                'id': id,
 | 
				
			||||||
 | 
					                                'person': person,
 | 
				
			||||||
 | 
					                                'tagline': tagline
 | 
				
			||||||
 | 
					                            }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def events(scheduleUrl, titlemap={}):
 | 
				
			||||||
 | 
					    schedule = getSchedule(scheduleUrl)
 | 
				
			||||||
 | 
					    # iterate all days
 | 
				
			||||||
 | 
					    for day in schedule.iter('day'):
 | 
				
			||||||
 | 
					        # iterate all rooms
 | 
				
			||||||
 | 
					        for room in day.iter('room'):
 | 
				
			||||||
 | 
					            # iterate events on that day in this room
 | 
				
			||||||
 | 
					            for event in room.iter('event'):
 | 
				
			||||||
 | 
					                # aggregate names of the persons holding this talk
 | 
				
			||||||
 | 
					                personnames = []
 | 
				
			||||||
 | 
					                if event.find('persons') is not None:
 | 
				
			||||||
 | 
					                    for person in event.find('persons').iter('person'):
 | 
				
			||||||
 | 
					                        personname = re.sub(r'\s+', ' ', person.text).strip()
 | 
				
			||||||
 | 
					                        personnames.append(personname)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                id = int(event.get('id'))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                if id in titlemap:
 | 
				
			||||||
 | 
					                    title = titlemap[id]
 | 
				
			||||||
 | 
					                elif event.find('title') is not None and event.find('title').text is not None:
 | 
				
			||||||
 | 
					                    title = re.sub(r'\s+', ' ', event.find('title').text).strip()
 | 
				
			||||||
 | 
					                else:
 | 
				
			||||||
 | 
					                    title = ''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                if event.find('subtitle') is not None and event.find('subtitle').text is not None:
 | 
				
			||||||
 | 
					                    subtitle = re.sub(r'\s+', ' ', event.find('subtitle').text).strip()
 | 
				
			||||||
 | 
					                else:
 | 
				
			||||||
 | 
					                    subtitle = ''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                if event.find('start') is not None and event.find('start').text is not None:
 | 
				
			||||||
 | 
					                    start = re.sub(r'\s+', ' ', event.find('start').text).strip()
 | 
				
			||||||
 | 
					                else:
 | 
				
			||||||
 | 
					                    start = ''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                # yield a tupel with the event-id, event-title and person-names
 | 
				
			||||||
 | 
					                yield {
 | 
				
			||||||
 | 
					                    'day': day.get('index'),
 | 
				
			||||||
 | 
					                    'id': id,
 | 
				
			||||||
 | 
					                    'title': title,
 | 
				
			||||||
 | 
					                    'subtitle': subtitle,
 | 
				
			||||||
 | 
					                    #'persons': personnames,
 | 
				
			||||||
 | 
					                    'personnames': ', '.join(personnames),
 | 
				
			||||||
 | 
					                    'room': room.attrib['name'],
 | 
				
			||||||
 | 
					                    'track': event.find('track').text,
 | 
				
			||||||
 | 
					                    'start': event.find('start').text,
 | 
				
			||||||
 | 
					                    'datetime': event.find('date').text,
 | 
				
			||||||
 | 
					                    'roomguid': room.attrib['guid'] if 'guid' in room.attrib else '',
 | 
				
			||||||
 | 
					                    #'url': event.find('url').text
 | 
				
			||||||
 | 
					                }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if __name__ == "__main__":	
 | 
				
			||||||
 | 
					    events = list(events(scheduleUrl))
 | 
				
			||||||
 | 
					    events = sorted(events, key=lambda x: datetime.datetime.strptime(x["datetime"], "%Y-%m-%dT%H:%M:%S%z"))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    with open("fahrplan.json", "w") as f:
 | 
				
			||||||
 | 
					        json.dump(events, f)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue