#!/usr/bin/env python3
"""Build ``data.json`` from ``input.json`` using the BoardGameGeek XML API.

``input.json`` is read as a list of entries, each providing at least ``id``
(used in the API request) and ``title`` (used for the cache key), and
optionally ``families`` (overrides the families parsed from the API
response).  Raw API responses and game images are stored in a ``DiskCache``
rooted at ``CACHE_PATH`` so they are only downloaded once.
"""
from pathlib import Path
from util import DiskCache
import html
import json
import re
import urllib.request
import xml.etree.ElementTree as ET

INPUT_PATH = Path(__file__).parent.parent.joinpath("input.json").resolve()
OUTPUT_PATH = Path(__file__).parent.parent.joinpath("data.json").resolve()
CACHE_PATH = Path(__file__).parent.parent.joinpath("cache").resolve()

# Everything after the first en dash or hyphen in a game's name is dropped.
_TITLE_SEPARATORS = re.compile(r'[–-]')


def _element_text(element):
    """Return the stripped text of *element*, or None if missing or empty."""
    if element is None or element.text is None:
        return None
    return element.text.strip() or None


def _parse_rank(rank_element):
    """Return the integer rank from a ``<rank>`` element, or None.

    The ``value`` attribute is not guaranteed to be numeric (unranked items
    are reported with a placeholder such as "Not Ranked"), so anything that
    does not parse as an integer is treated as "no rank".
    """
    if rank_element is None:
        return None
    try:
        return int(rank_element.attrib['value'])
    except (KeyError, ValueError):
        return None


def _download(url):
    """Return the raw bytes served at *url*."""
    with urllib.request.urlopen(url) as response:
        return response.read()


class BoardGame:
    """Plain record describing a single board game."""

    def __init__(self, id, name, year, player_range, time, rank, families,
                 weight, description, mechanics, thumbnail_url, image_url):
        self.id = id
        self.name = name
        self.year = year
        self.player_range = player_range
        self.time = time
        self.rank = rank
        self.families = families
        self.weight = weight
        self.description = description
        self.mechanics = mechanics
        self.thumbnail_url = thumbnail_url
        self.image_url = image_url

    @classmethod
    def from_bgg_xml(cls, xml_doc):
        """Create a BoardGame from a parsed XML API ``thing`` response.

        ``rank`` is None when the subtype rank is absent or not numeric.
        ``thumbnail_url`` / ``image_url`` are None and ``description`` is an
        empty string when the corresponding element is absent or empty.
        """
        # Only the first word of each family's friendly name is kept.
        families = [
            e.attrib['friendlyname'].split()[0]
            for e in xml_doc.findall('.//rank[@type="family"]')
        ]
        mechanics = [
            e.attrib['value']
            for e in xml_doc.findall('.//link[@type="boardgamemechanic"]')
        ]
        full_name = xml_doc.find(".//name").attrib['value']
        description = _element_text(xml_doc.find('.//description')) or ""

        return cls(
            name=_TITLE_SEPARATORS.split(full_name)[0].strip(),
            id=int(xml_doc.find(".//item").attrib['id']),
            year=int(xml_doc.find('.//yearpublished').attrib['value']),
            player_range=(
                int(xml_doc.find('.//minplayers').attrib['value']),
                int(xml_doc.find('.//maxplayers').attrib['value']),
            ),
            time=int(xml_doc.find('.//playingtime').attrib['value']),
            rank=_parse_rank(xml_doc.find('.//rank[@type="subtype"]')),
            families=families,
            weight=float(xml_doc.find('.//averageweight').attrib['value']),
            description=html.unescape(description),
            mechanics=mechanics,
            thumbnail_url=_element_text(xml_doc.find('.//thumbnail')),
            image_url=_element_text(xml_doc.find('.//image')),
        )

    def serialize(self):
        """Return the JSON-ready dict written to the output file.

        The image URLs are not part of the output.
        """
        return {
            'id': self.id,
            'title': self.name,
            'year': self.year,
            'player_range': self.player_range,
            'time': self.time,
            'rank': self.rank,
            'families': self.families,
            'description': self.description,
            'mechanics': self.mechanics,
            'weight': self.weight,
        }


def fetch_boardgames(data, cache):
    """Return a BoardGame for every entry of *data*, in order.

    The API response for an entry is stored in *cache* under
    ``"<title>.xml"`` and only requested when that key is not present yet.
    An entry's own ``families`` value, when given, replaces the parsed one.
    """
    boardgames = []
    for item in data:
        key = f"{item['title']}.xml"
        if key not in cache:
            url = f"https://api.geekdo.com/xmlapi2/thing?id={item['id']}&stats=1"
            cache[key] = _download(url)

        boardgame = BoardGame.from_bgg_xml(ET.fromstring(cache[key]))
        if 'families' in item:
            boardgame.families = item['families']
        boardgames.append(boardgame)
    return boardgames


def cache_images(boardgames, cache):
    """Download each game's thumbnail and full image into *cache*.

    Images are stored under ``"<id>_thumbnail.jpg"`` and ``"<id>.jpg"``.
    Keys already present in *cache* are not downloaded again, and games
    without a thumbnail or image URL are skipped for that image.
    """
    for boardgame in boardgames:
        targets = (
            (f"{boardgame.id}_thumbnail.jpg", boardgame.thumbnail_url),
            (f"{boardgame.id}.jpg", boardgame.image_url),
        )
        for key, url in targets:
            if not url or key in cache:
                continue
            cache[key] = _download(url)


def main():
    """Read the input list, fetch game data and images, write the output."""
    CACHE_PATH.mkdir(parents=True, exist_ok=True)
    cache = DiskCache(CACHE_PATH)

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(INPUT_PATH, encoding="utf-8") as infile:
        data = json.load(infile)

    boardgames = fetch_boardgames(data, cache)
    cache_images(boardgames, cache)

    with open(OUTPUT_PATH, 'w', encoding="utf-8") as outfile:
        json.dump([game.serialize() for game in boardgames], outfile)


if __name__ == "__main__":
    main()