#!/usr/bin/env python3
import re
import os
import argparse
from urllib.request import urlretrieve
from urllib.parse import urlparse, quote, unquote
from urllib.error import HTTPError, URLError

# example usage:
# python archive.py --page original-wiki-page.md --file-dir files --output readme.md --repo-base https://raw.githubusercontent.com/yuzu-mirror/yuzu-mod-archive/main

# group 1: game name
# group 2: contents
#
# the match is terminated by the next '#', so this assumes each section is
# followed by another heading
SECTION_REGEX = r"### ([A-Za-z0-9].+)((?:.|\n)*?)#"

# group 1: title
# group 2: link (raw)
# group 3: description
# group 4: version
# group 5: authors
# The contents of all groups, except the link group (2), may be formatted with Markdown.
TABLE_REGEX = r"\| \[(.+?)\]\((http.+?)\) *\| *(.+?) *\| *`(.+)` \| (.+)"

parser = argparse.ArgumentParser()
parser.add_argument("--page", required=True, help="path to the 'Switch Mods' wiki page Markdown file")
parser.add_argument("--file-dir", required=True, help="path to the directory to download all files to")
parser.add_argument("--output", required=True, help="filename of the output modified Markdown file, with replaced URLs")
parser.add_argument("--repo-base", required=True, help="base URL of the repository where the files will be held")
parser.add_argument("--no-dl", action="store_true", help="don't download anything, just output the modified document")
args = parser.parse_args()

with open(args.page, "r") as file:
    wiki_content = file.read()

sections = re.findall(SECTION_REGEX, wiki_content)

# (original URL, replacement URL) pairs, applied to the document at the end
replacements: list[tuple[str, str]] = []

for section in sections:
    game_name: str = section[0]
    table = section[1]

    # sanitize the game name so it is safe to use as a directory name
    folder_name = "".join([x for x in game_name if x.isalnum() or x.isspace()])
    folder = os.path.join(args.file_dir, folder_name)

    skip_dl = args.no_dl
    if not args.no_dl:
        if os.path.isdir(folder):
            print(f"[!] folder '{folder}' already exists, skipping downloads for this game")
            skip_dl = True
        else:
            os.makedirs(folder)

    for row in re.findall(TABLE_REGEX, table):
        # only the title and URL are used below; the remaining columns are
        # parsed for completeness
        title, url, description, version, authors = row

        filename = unquote(os.path.basename(urlparse(url).path))
        # use the sanitized folder name (not the raw game name) so the URL
        # matches the on-disk layout created above
        out_url = f"{args.repo_base}/{quote(folder_name)}/{quote(filename)}"

        if skip_dl:
            replacements.append((url, out_url))
            continue

        try:
            urlretrieve(url, os.path.join(folder, filename))
            print(f"[+] mod '{title}' downloaded for game {game_name}")
            replacements.append((url, out_url))
        except HTTPError as e:
            print(f"[ ] mod '{title}' not available from original source, error {e}")
            # try using the Internet Archive instead
            try:
                # the date does not matter, IA will automatically pick the closest
                # snapshot (we use the oldest date available so that we don't
                # download an archived error message)
                #
                # this assumes the files themselves don't change, which is true
                # for the mods that we need to archive
                webarchive_url = f"https://web.archive.org/web/20200101125317if_/{url}"
                urlretrieve(webarchive_url, os.path.join(folder, filename))
                print("[+] mod downloaded from the Internet Archive")
                replacements.append((url, out_url))
            except URLError:
                print("[-] mod not available on the Internet Archive or from the original source")
        except URLError as e:
            # HTTPError (a URLError subclass) is handled above; this catches
            # connection-level failures such as DNS errors
            print(f"[-] mod '{title}' NOT downloaded - URL error {e}")
            print(f"    url: {url}")

modified = wiki_content
for item in replacements:
    modified = modified.replace(item[0], item[1])

with open(args.output, "w") as file:
    file.write(modified)

print(f"[+] all done! modified document saved to '{args.output}'")
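
# As a worked example of the rewrite above (the game, mod name, and URL below
# are hypothetical, not taken from the real wiki page): given a section
# "### Some Game" containing the row
#
#   | [60 FPS](https://example.com/mods/60%20fps.zip) | Unlocks the framerate | `1.0.0` | someauthor |
#
# TABLE_REGEX captures the link, the file is saved to 'files/Some Game/60 fps.zip'
# (the filename is percent-decoded before writing), and the link in the output
# document becomes:
#
#   <repo-base>/Some%20Game/60%20fps.zip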