#!/usr/bin/env python3
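"""Download every mod file linked from the 'Switch Mods' wiki page and write a
copy of the page whose links point at the mirrored files instead."""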
import re
import os
import argparse
from urllib.request import urlretrieve
from urllib.parse import urlparse, quote, unquote
from urllib.error import HTTPError, URLError

# example usage:
# python archive.py --page original-wiki-page.md --file-dir files --output readme.md --repo-base https://raw.githubusercontent.com/yuzu-mirror/yuzu-mod-archive/main

# group 1: game name
# group 2: contents
# the trailing lookahead leaves the "#" of the next heading unconsumed, so
# back-to-back sections all match; \Z also lets the final section of the page match
SECTION_REGEX = r"### ([A-Za-z0-9].+)((?:.|\n)*?)(?=#|\Z)"
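
# For illustration only -- given a hypothetical page snippet such as:
#
#   ### Some Game
#   | [Some Mod](http://example.com/mod.zip) | A description | `1.0` | someone |
#
#   ### Another Game
#
# each match is one game: group 1 is "Some Game" and group 2 is everything up
# to the next heading (the mod table for that game).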

# group 1: title
# group 2: link (raw)
# group 3: description
# group 4: version
# group 5: authors
# The contents of all groups, except the link group (2), may be formatted with Markdown.
TABLE_REGEX = r"\| \[(.+?)\]\((http.+?)\) *\| *(.+?) *\| *`(.+)` \| (.+)"
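
# For illustration only -- a hypothetical row such as:
#   | [Some Mod](http://example.com/Some%20Mod.zip) | A description | `1.0` | someone
# captures: title "Some Mod", link "http://example.com/Some%20Mod.zip",
# description "A description", version "1.0", authors "someone"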

parser = argparse.ArgumentParser()
parser.add_argument("--page", required=True, help="path to the 'Switch Mods' wiki page Markdown file")
parser.add_argument("--file-dir", required=True, help="path to the directory to download all files to")
parser.add_argument("--output", required=True, help="filename of the output modified Markdown file, with replaced URLs")
parser.add_argument("--repo-base", required=True, help="base URL of the repository where the files will be held")
parser.add_argument("--no-dl", action="store_true", help="don't download anything, just output the modified document")
args = parser.parse_args()

with open(args.page, "r") as file:
    wiki_content = file.read()

sections = re.findall(SECTION_REGEX, wiki_content)
replacements: list[tuple[str, str]] = []

for section in sections:
    game_name: str = section[0]
    table = section[1]

    folder_name = "".join([x for x in game_name if x.isalnum() or x.isspace()])
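    # (strips punctuation: a hypothetical "Some Game: Ultimate!" becomes "Some Game Ultimate")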
    folder = os.path.join(args.file_dir, folder_name)

    skip_dl = args.no_dl
    if not args.no_dl:
        if os.path.isdir(folder):
            print(f"[!] folder '{folder}' already exists, will skip dl'ing for this game")
            skip_dl = True
        else:
            os.makedirs(folder)

    for row in re.findall(TABLE_REGEX, table):
        title = row[0]
        url = row[1]
        # description, version and authors are parsed but currently unused
        description = row[2]
        version = row[3]
        authors = row[4]
        filename = unquote(os.path.basename(urlparse(url).path))
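        # e.g. a hypothetical "http://example.com/files/Some%20Mod.zip" yields "Some Mod.zip"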
        # the files were saved under the sanitized folder_name, so the rewritten
        # URL has to use it as well (not the raw game_name)
        out_url = f"{args.repo_base}/{args.file_dir}/{quote(folder_name)}/{quote(filename)}"
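        # e.g. with the hypothetical names above:
        # https://raw.githubusercontent.com/.../files/Some%20Game%20Ultimate/Some%20Mod.zip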

        if skip_dl:
            replacements.append((url, out_url))
            continue

        try:
            urlretrieve(url, os.path.join(folder, filename))
            print(f"[+] mod '{title}' downloaded for game {game_name}")
            replacements.append((url, out_url))
        except HTTPError as e:
            print(f"[ ] mod '{title}' not available from original source, error {e}")
            # try using the Internet Archive
            try:
                # the date does not matter, IA will automatically pick the closest one (we use the
                # oldest date available so that we don't download an archived error message)
                #
                # this assumes the files themselves don't change, which is true for the mods that
                # we need to archive
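                # the "if_" suffix asks the Wayback Machine for the raw archived file,
                # without the Wayback toolbar wrapped around it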
                webarchive_url = f"https://web.archive.org/web/20200101125317if_/{url}"
                urlretrieve(webarchive_url, os.path.join(folder, filename))
                print(f"[+] mod '{title}' downloaded from the Internet Archive")
                replacements.append((url, out_url))
            except (HTTPError, URLError):
                print(f"[-] mod '{title}' not available from the Internet Archive or the original source")
        except URLError as e:
            print(f"[-] mod '{title}' NOT downloaded - URL error {e}")
            print(f"    url: {url}")

modified = wiki_content
for item in replacements:
    modified = modified.replace(item[0], item[1])

with open(args.output, "w") as file:
    file.write(modified)

print(f"[+] all done! modified document saved to '{args.output}'")