#!/usr/bin/env python3
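"""Download every mod file linked from the 'Switch Mods' wiki page and write a
copy of the page whose links point at the mirrored files instead."""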
import re
import os
import argparse
from urllib.request import urlretrieve
from urllib.parse import urlparse, quote, unquote
from urllib.error import HTTPError, URLError

# example usage:
# python archive.py --page original-wiki-page.md --file-dir files --output readme.md --repo-base https://raw.githubusercontent.com/yuzu-mirror/yuzu-mod-archive/main

# group 1: game name
# group 2: contents
# the trailing lookahead leaves the "#" of the next heading unconsumed, so
# back-to-back sections all match; \Z also lets the final section of the page match
SECTION_REGEX = r"### ([A-Za-z0-9].+)((?:.|\n)*?)(?=#|\Z)"
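
# For illustration only -- given a hypothetical page snippet such as:
#
#   ### Some Game
#   | [Some Mod](http://example.com/mod.zip) | A description | `1.0` | someone |
#
#   ### Another Game
#
# each match is one game: group 1 is "Some Game" and group 2 is everything up
# to the next heading (the mod table for that game).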

# group 1: title
# group 2: link (raw)
# group 3: description
# group 4: version
# group 5: authors
# The contents of all groups, except the link group (2), may be formatted with Markdown.
TABLE_REGEX = r"\| \[(.+?)\]\((http.+?)\) *\| *(.+?) *\| *`(.+)` \| (.+)"
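
# For illustration only -- a hypothetical row such as:
#   | [Some Mod](http://example.com/Some%20Mod.zip) | A description | `1.0` | someone
# captures: title "Some Mod", link "http://example.com/Some%20Mod.zip",
# description "A description", version "1.0", authors "someone"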

parser = argparse.ArgumentParser()
parser.add_argument("--page", required=True, help="path to the 'Switch Mods' wiki page Markdown file")
parser.add_argument("--file-dir", required=True, help="path to the directory to download all files to")
parser.add_argument("--output", required=True, help="filename of the output modified Markdown file, with replaced URLs")
parser.add_argument("--repo-base", required=True, help="base URL of the repository where the files will be held")
parser.add_argument("--no-dl", action="store_true", help="don't download anything, just output the modified document")
args = parser.parse_args()

with open(args.page, "r") as file:
    wiki_content = file.read()

sections = re.findall(SECTION_REGEX, wiki_content)
replacements: list[tuple[str, str]] = []

for section in sections:
    game_name: str = section[0]
    table = section[1]

    folder_name = "".join([x for x in game_name if x.isalnum() or x.isspace()])
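    # (strips punctuation: a hypothetical "Some Game: Ultimate!" becomes "Some Game Ultimate")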
    folder = os.path.join(args.file_dir, folder_name)

    skip_dl = args.no_dl
    if not args.no_dl:
        if os.path.isdir(folder):
            print(f"[!] folder '{folder}' already exists, will skip dl'ing for this game")
            skip_dl = True
        else:
            os.makedirs(folder)

    for row in re.findall(TABLE_REGEX, table):
        title = row[0]
        url = row[1]
        # description, version and authors are parsed but currently unused
        description = row[2]
        version = row[3]
        authors = row[4]
        filename = unquote(os.path.basename(urlparse(url).path))
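        # e.g. a hypothetical "http://example.com/files/Some%20Mod.zip" yields "Some Mod.zip"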
        # the files were saved under the sanitized folder_name, so the rewritten
        # URL has to use it as well (not the raw game_name)
        out_url = f"{args.repo_base}/{args.file_dir}/{quote(folder_name)}/{quote(filename)}"
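        # e.g. with the hypothetical names above:
        # https://raw.githubusercontent.com/.../files/Some%20Game%20Ultimate/Some%20Mod.zip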

        if skip_dl:
            replacements.append((url, out_url))
            continue

        try:
            urlretrieve(url, os.path.join(folder, filename))
            print(f"[+] mod '{title}' downloaded for game {game_name}")
            replacements.append((url, out_url))
        except HTTPError as e:
            print(f"[ ] mod '{title}' not available from original source, error {e}")
            # try using the Internet Archive
            try:
                # the date does not matter, IA will automatically pick the closest one (we use the
                # oldest date available so that we don't download an archived error message)
                #
                # this assumes the files themselves don't change, which is true for the mods that
                # we need to archive
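                # the "if_" suffix asks the Wayback Machine for the raw archived file,
                # without the Wayback toolbar wrapped around it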
                webarchive_url = f"https://web.archive.org/web/20200101125317if_/{url}"
                urlretrieve(webarchive_url, os.path.join(folder, filename))
                print(f"[+] mod '{title}' downloaded from the Internet Archive")
                replacements.append((url, out_url))
            except (HTTPError, URLError):
                print(f"[-] mod '{title}' not available from the Internet Archive or the original source")
        except URLError as e:
            print(f"[-] mod '{title}' NOT downloaded - URL error {e}")
            print(f"    url: {url}")

modified = wiki_content
for item in replacements:
    modified = modified.replace(item[0], item[1])

with open(args.output, "w") as file:
    file.write(modified)

print(f"[+] all done! modified document saved to '{args.output}'")