From f567cd1263e55838687ef5420b43510f1c353df2 Mon Sep 17 00:00:00 2001 From: Brian Lester Date: Tue, 24 Sep 2024 12:30:19 -0400 Subject: [PATCH] Tooling to download and process Wikis (#51) Add tools to scrape mediawiki wikis that don't publish dumps Add tool that exports the xml based on the list of pages. Add the ability to convert wikis to dolma Download and extract script supports multiworker Create WTF Wikipedia parsing server which uses a worker pool to allow for timeouts Creation of script that removes html tags we found in many wiki dumps. Added Shadow Paging to the creation of wikitext dolma files Added Shadow Paging to dolma preprocessing. Added script that remove `None` lines from dolma files. Added script that can combine dolma shards while tracking what was used where to allow for aligned combinations of later versions. --- .gitignore | 3 + licensed_pile/licenses.py | 39 +- licensed_pile/logs.py | 1 + licensed_pile/scrape.py | 1 + licensed_pile/scripts/.gitignore | 1 + licensed_pile/scripts/combine_dolma.py | 304 ++++++++++ licensed_pile/scripts/id_to_shard.py | 59 ++ licensed_pile/scripts/remove_html.py | 143 +++++ licensed_pile/scripts/remove_none.py | 130 ++++ licensed_pile/scripts/stats.py | 140 +++-- licensed_pile/utils.py | 21 +- licensed_pile/write.py | 139 +++-- licensed_pile/xml.py | 28 +- requirements.txt | 9 +- setup.py | 6 +- wiki/.gitignore | 1 + wiki/README.md | 47 ++ wiki/__init__.py | 0 wiki/archive/.gitignore | 1 + wiki/archive/README.md | 32 + wiki/archive/__init__.py | 0 wiki/archive/download_archive.py | 176 ++++++ wiki/archive/get_metadata.py | 99 +++ wiki/archive/to_dolma.py | 351 +++++++++++ wiki/archive/utils.py | 414 +++++++++++++ wiki/dolma_utils.py | 1 + wiki/dump/.gitignore | 1 + wiki/dump/README.md | 12 + wiki/dump/download.py | 48 ++ wiki/dump/download.sh | 32 + wiki/dump/to_dolma.sh | 34 ++ wiki/parser/.gitignore | 14 + wiki/parser/README.md | 34 ++ wiki/parser/haproxy.cfg | 48 ++ wiki/parser/package.json | 10 + wiki/parser/parser.js | 94 +++ wiki/parser/start.sh | 34 ++ wiki/parser/worker.js | 27 + wiki/preprocess.py | 221 +++++++ wiki/scrape/.gitignore | 1 + wiki/scrape/README.md | 26 + wiki/scrape/export_pages.py | 103 ++++ wiki/scrape/get_namespaces.py | 70 +++ wiki/scrape/list_pages.py | 146 +++++ wiki/scrape/list_wikis.py | 113 ++++ wiki/scrape/utils.py | 59 ++ wiki/scripts/find.py | 31 + wiki/scripts/grammar.py | 194 ++++++ wiki/scripts/remove_html.py | 133 ++++ wiki/to_dolma.py | 148 +++++ wiki/wiki.py | 804 +++++++++++++++++++++++++ 51 files changed, 4450 insertions(+), 133 deletions(-) create mode 100644 licensed_pile/scripts/.gitignore create mode 100644 licensed_pile/scripts/combine_dolma.py create mode 100644 licensed_pile/scripts/id_to_shard.py create mode 100644 licensed_pile/scripts/remove_html.py create mode 100644 licensed_pile/scripts/remove_none.py create mode 100644 wiki/.gitignore create mode 100644 wiki/README.md create mode 100644 wiki/__init__.py create mode 100644 wiki/archive/.gitignore create mode 100644 wiki/archive/README.md create mode 100644 wiki/archive/__init__.py create mode 100644 wiki/archive/download_archive.py create mode 100644 wiki/archive/get_metadata.py create mode 100644 wiki/archive/to_dolma.py create mode 100644 wiki/archive/utils.py create mode 100644 wiki/dolma_utils.py create mode 100644 wiki/dump/.gitignore create mode 100644 wiki/dump/README.md create mode 100644 wiki/dump/download.py create mode 100755 wiki/dump/download.sh create mode 100755 wiki/dump/to_dolma.sh create mode 100644 
wiki/parser/.gitignore create mode 100644 wiki/parser/README.md create mode 100644 wiki/parser/haproxy.cfg create mode 100644 wiki/parser/package.json create mode 100644 wiki/parser/parser.js create mode 100755 wiki/parser/start.sh create mode 100644 wiki/parser/worker.js create mode 100644 wiki/preprocess.py create mode 100644 wiki/scrape/.gitignore create mode 100644 wiki/scrape/README.md create mode 100644 wiki/scrape/export_pages.py create mode 100644 wiki/scrape/get_namespaces.py create mode 100644 wiki/scrape/list_pages.py create mode 100644 wiki/scrape/list_wikis.py create mode 100644 wiki/scrape/utils.py create mode 100644 wiki/scripts/find.py create mode 100644 wiki/scripts/grammar.py create mode 100644 wiki/scripts/remove_html.py create mode 100644 wiki/to_dolma.py create mode 100644 wiki/wiki.py diff --git a/.gitignore b/.gitignore index 9c3acb0..a5c31c6 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,6 @@ cython_debug/ #.idea/ .python-version **/licensed_pile_log.txt + +node_modules +package-lock.json diff --git a/licensed_pile/licenses.py b/licensed_pile/licenses.py index 6367480..b95bf28 100644 --- a/licensed_pile/licenses.py +++ b/licensed_pile/licenses.py @@ -9,6 +9,8 @@ def __str__(self): return self.value +# TODO: With all the different versions that are out in the wild, this flat enum +# is getting hard to use. We should re-think how to do this. class PermissiveLicenses(StringEnum): """By 'Permissive' we mean licenses that are in the Gold, Silver, or Bronze lists of the Blue Oak Council (https://blueoakcouncil.org/list), even if PD = "Public Domain" CC0 = "Creative Commons Zero - Public Domain - https://creativecommons.org/publicdomain/zero/1.0/" + CC_PDM = "Creative Commons Public Domain Mark - https://creativecommons.org/publicdomain/mark/1.0/" CC_BY = ( "Creative Commons - Attribution - https://creativecommons.org/licenses/by/4.0/" ) CC_BY_3 = ( "Creative Commons - Attribution - https://creativecommons.org/licenses/by/3.0/" ) + CC_BY_2_5 = ( + "Creative Commons - Attribution - https://creativecommons.org/licenses/by/2.5/" + ) + CC_BY_2 = ( + "Creative Commons - Attribution - https://creativecommons.org/licenses/by/2.0/" + ) CC_BY_SA = "Creative Commons - Attribution Share-Alike - https://creativecommons.org/licenses/by-sa/4.0/" CC_BY_SA_3 = "Creative Commons - Attribution Share-Alike - https://creativecommons.org/licenses/by-sa/3.0/" CC_BY_SA_2_5 = "Creative Commons - Attribution Share-Alike - https://creativecommons.org/licenses/by-sa/2.5/" + CC_BY_SA_2_1 = "Creative Commons - Attribution Share-Alike - https://creativecommons.org/licenses/by-sa/2.1/" + CC_BY_SA_1 = "Creative Commons - Attribution Share-Alike - https://creativecommons.org/licenses/by-sa/1.0/" GFDL = "GNU Free Documentation License" APACHE_2 = "Apache 2 License - https://www.apache.org/licenses/LICENSE-2.0" MIT = "MIT License" @@ -49,17 +60,35 @@ def from_string(cls, s: str) -> "PermissiveLicenses": s = s.lower().strip() if re.match(r".*/publicdomain/zero/1.0/?$", s): return cls.CC0 - if m := re.match(r".*/licenses/by(?P<share>-sa)?/(?P<version>\d).0/?$", s): - if m.group("version") == "4": - if m.group("share") is None: + if re.match(r".*/publicdomain/mark/1.0/?$", s): + return cls.CC_PDM + if re.match(r".*/publicdomain/.*", s): + return cls.PD + if m := re.search(r"(?:/licenses/)?by(?P<share>-sa)?/(?P<version>\d.\d)/?", s): + if m.group("version") == "4.0": + if m.group("share") is not None: return cls.CC_BY_SA return cls.CC_BY - elif m.group(1) == "3": - if
m.group("share") is None: + elif m.group("version") == "3.0": + if m.group("share") is not None: return cls.CC_BY_SA_3 return cls.CC_BY_3 + elif m.group("version") == "2.5": + if m.group("share") is not None: + return cls.CC_BY_SA_2_5 + return cls.CC_BY_2_5 + elif m.group("version") == "2.1": + if m.group("share") is not None: + return cls.CC_BY_SA_2_1 + elif m.group("version") == "2.0": + return cls.CC_BY_2 + elif m.group("version") == "1.0": + if m.group("share") is not None: + return cls.CC_BY_SA_1 else: raise ValueError(f"Unable to understand license {s}") + if s == "gfdl" or "gnu_free_documentation_license" in s: + return cls.GFDL raise ValueError(f"Unable to understand license {s}") diff --git a/licensed_pile/logs.py b/licensed_pile/logs.py index a139b2b..6d4d476 100644 --- a/licensed_pile/logs.py +++ b/licensed_pile/logs.py @@ -5,6 +5,7 @@ import sys from typing import Protocol, Sequence +import contextual_logger from logging_json import JSONFormatter diff --git a/licensed_pile/scrape.py b/licensed_pile/scrape.py index 45e28a5..d2b479a 100644 --- a/licensed_pile/scrape.py +++ b/licensed_pile/scrape.py @@ -29,6 +29,7 @@ def get_page( resp = requests.get(url, params=params, headers=headers) logging.debug(f"Sending GET to {resp.url}") if resp.status_code != 200: + # TODO: Update logger logging.warning( f"Failed request to {resp.url}: {resp.status_code}, {resp.reason}" ) diff --git a/licensed_pile/scripts/.gitignore b/licensed_pile/scripts/.gitignore new file mode 100644 index 0000000..66c5efb --- /dev/null +++ b/licensed_pile/scripts/.gitignore @@ -0,0 +1 @@ +shard_to_*.json diff --git a/licensed_pile/scripts/combine_dolma.py b/licensed_pile/scripts/combine_dolma.py new file mode 100644 index 0000000..a43c794 --- /dev/null +++ b/licensed_pile/scripts/combine_dolma.py @@ -0,0 +1,304 @@ +"""Tool to recombine many small dolma files into fewer larger ones. + +The tool tracks which example go into which shard to allow to the +alignment of shards when multiple versions need to be combined. +""" + +import argparse +import contextlib +import copy +import glob +import json +import os +from typing import Dict, List + +import contextual_logger +import smart_open + +from licensed_pile import utils +from licensed_pile.logs import configure_logging, get_logger +from licensed_pile.write import shard_name + +parser = argparse.ArgumentParser( + description="Combine many dolma files into one. " + "It also tracks where data came from to create " + "aligned shards when combining different versions." +) +parser.add_argument( + "--input", help="Where the dolma files live. A directory", required=True +) +parser.add_argument( + "--output", + help="Where the combined dolma files will live. A directory", + required=True, +) +# We can't peek at one of the current files names as it will be an iterator, also +# if we are combining files, they might have different names. +# When we are making shards for later versions based on the mapping from the first +# the filename is fixed anyway. +parser.add_argument("--filename", help="The name to give the combined shards.") +parser.add_argument( + "--shard_size", type=int, default=1, help="The size each combined shard will be." +) +parser.add_argument( + "--shard_to_files", help="A path to a shard -> source file mapping." +) +parser.add_argument( + "--shard_to_first_id", help="A path to a shard -> starting id mapping." 
+) +parser.add_argument("--shard_to_last_id", help="A path to a shard -> final id mapping.") + + +def read_dolma_file(path): + with smart_open.open(path) as f: + yield from (json.loads(l) for l in f if l) + + +def combine_dolma_files( + input_dir: str, + output_dir: str, + filename: str, + shard_size: int = 1, + quiet: bool = False, +): + logger = get_logger() + # Make sure the input_dir ends with documents + input_dir = utils.dolma_output(input_dir) + # Find all .jsonl.gz files under input_dir + files = glob.iglob(os.path.join(input_dir, "**", "*.jsonl.gz"), recursive=True) + # Make sure output_dir ends with /documents + logger.info( + "Combining dolma shards into larger files, writing results to %s", output_dir + ) + output_dir = utils.dolma_output(output_dir) + # Make sure the dir exists, the combining process removes any dir structure + # from the input dir tree so we only need to make this file. + os.makedirs(output_dir, exist_ok=True) + + shard_idx = 0 + size = 0 + max_bytes = shard_size * 1000 * 1000 * 1000 + + # Convert shard n to -> 0000n_{filename} + shard = shard_name(filename, shard_idx) + + # Track a mapping from output shard to input file + shard_to_files = {} + # Track the last example moved from the input file to the output shard + shard_to_last_id = {} + # Track the first example moved from the input file to the output shard + shard_to_first_id = {} + # A list of file contributing to the current shard + active_files = [] + # The last example id we wrote into the output file. + last_id = None + first_id = None + + shard_file = os.path.join(output_dir, shard) + with contextlib.ExitStack() as stack: + wf = stack.enter_context(smart_open.open(shard_file, "w")) + stack.enter_context(logger(shard=shard_file)) + for dolma_file in files: + # Only save the part relative to the root, this lets us find this + # input file in a new revision. + rel_dolma = os.path.relpath(dolma_file, input_dir) + logger.info( + "Starting to copy examples from %s into %s", dolma_file, shard_file + ) + # Read example (via iterator) so we don't have them all in memory + # at once. + for example in read_dolma_file(dolma_file): + # Serialize the data + data = json.dumps(example) + # Check if the new data will go over the size limit. + size += len(data) + # We need to make a new shard. + if size >= max_bytes: + logger.close() + # Close the last shard, note that the /current/ data is *not* + # part of the just closed shard. + wf.close() + # Record the files that went into this shard. They are a list + # as we want to combine them in the same order going forward. + shard_to_files[shard] = copy.deepcopy(active_files) + # Record the last example id that went into this shard. Note + # the last_id currently points to the /previous/ data item + # as we have not assigned to it based on data + shard_to_last_id[shard] = last_id + # Record the first example id that went into this shard. + shard_to_first_id[shard] = first_id + # Increment shard, create new name, path, open file etc. 
+ logger.info( + "Shard %s made from %s up to %s", shard, active_files, last_id + ) + shard_idx += 1 + shard = shard_name(filename, shard_idx) + shard_file = os.path.join(output_dir, shard) + wf = stack.enter_context(smart_open.open(shard_file, "w")) + stack.enter_context(logger(shard=shard_file)) + logger.info( + "Shard size exceeded, creating new shard at %s", shard_file + ) + # Reset size checker + size = 0 + # Reset the active files to be empty, as long as the next + # data item is written, the current file will get added to + # the list. + active_files = [] + # Set this to None so that it can be re-set now that it is + # tracking for the new shard. + first_id = None + logger.info( + "Starting to copy examples from %s into %s", + dolma_file, + shard_file, + ) + # Write the data and update the last_id to point to this item, + # which will become the previous item in the next iteration of + # the loop + wf.write(data + "\n") + last_id = example["id"] + # We only let the first id be written once per shard, by the + # first example that was output. + if first_id is None: + first_id = example["id"] + # Only add the current file to the active for this shard list + # if the current bit of data is actually written to it. By doing + # this /after/ the data is written, we avoid having a false + # positive where the first element of a file triggers a new shard. + # If we saved ourselves to "active" when the file was opened, it + # would look like we contributed to the current shard. + # + # We also make sure to only add ourselves to the list once. + if not active_files or active_files[-1] != (rel_dolma): + active_files.append(rel_dolma) + # We don't actually need this check as the final data write will always + # be into a shard that hasn't saved it's active files yet (as the size + # check/new shard is from before the writing to the file.) + if active_files: + shard_to_files[shard] = copy.deepcopy(active_files) + # In this case, the last_id *is* pointing to the /current/ data item + # this is ok as the last id is *inclusive*, so saving the current data + # id will mean include everything up-to and including this item in + # the current shard. + shard_to_last_id[shard] = last_id + # Save the first id too. + shard_to_first_id[shard] = first_id + logger.info("Shard %s made from %s up to %s", shard, active_files, last_id) + return shard_to_files, shard_to_first_id, shard_to_last_id + + +def combine_dolma_with_shard_info( + input_dir: str, + output_dir: str, + shard_to_files: Dict[str, List[str]], + shard_to_first_id: Dict[str, str], + shard_to_last_id: Dict[str, str], +): + logger = get_logger() + # Ensure both paths end with /documents + input_dir = utils.dolma_output(input_dir) + output_dir = utils.dolma_output(output_dir) + # Make sure the dir exists, the combining process removes any dir structure + # from the input dir tree so we only need to make this file. + os.makedirs(output_dir, exist_ok=True) + # Iterate though the output shards we should generate. + for shard, files in shard_to_files.items(): + with logger(shard=shard): + logger.info("Starting to populate shard") + # Find the last id in the last file that we should write to this shard. + last_id = shard_to_last_id[shard] + # Find the first id in the first file that we should write to this shard. + first_id = shard_to_first_id[shard] + # Create the new shard. 
+ with smart_open.open( + os.path.join(utils.dolma_output(output_dir), shard), "w" + ) as wf: + # Are we skipping through the starting examples because they + # were in an earlier shard? + skipping = True + # Iterate through the files that contributed to this shard. + for dolma_file in files: + with logger(source=dolma_file): + logger.info("Filling shard from new source.") + # Write each example to the shard + for example in read_dolma_file( + os.path.join(input_dir, dolma_file) + ): + if (eid := example["id"]) == first_id: + logger.info( + "Found first id in the first source file, start to fill", + extra={"first_id": first_id}, + ) + skipping = False + if skipping: + logger.debug( + "Skipping example, it was in the last shard.", + extra={"id": eid}, + ) + continue + wf.write(json.dumps(example) + "\n") + # If we are writing the final open file, stop after we write + # the example with the final id. + if dolma_file == files[-1] and eid == last_id: + logger.info( + "Found last id in final source file, closing shard.", + extra={"last_id": last_id}, + ) + break + + +def read_shard_file(path): + logger = get_logger() + logger.info("Reading shard creation map from %s", path) + with open(path) as f: + return json.load(f) + + +def write_shard_file(shard_map, path): + logger = get_logger() + logger.info("Saving shard creation map to %s", path) + with open(path, "w") as wf: + json.dump(shard_map, wf) + + +def main(): + args = parser.parse_args() + configure_logging() + logger = get_logger() + + if not (args.shard_to_files or args.shard_to_first_id or args.shard_to_last_id): + if args.filename is None: + raise ValueError( + "--filename needs to be given when creating the first combined dolma files." + ) + logger.info("Combining files into shards and tracking which go where.") + shard_to_files, shard_to_first_id, shard_to_last_id = combine_dolma_files( + args.input, args.output, args.filename, args.shard_size + ) + logger.info("Created %d new larger shards", len(shard_to_files)) + logger.info( + "Each shard if made of %.2f on average", + sum(len(fs) for fs in shard_to_files.values()) / len(shard_to_files), + ) + write_shard_file(shard_to_files, "shard_to_files.json") + write_shard_file(shard_to_first_id, "shard_to_first_id.json") + write_shard_file(shard_to_last_id, "shard_to_last_id.json") + elif args.shard_to_files and args.shard_to_first_id and args.shard_to_last_id: + logger.info("Combining files into shards based on a mapping.") + shard_to_files = read_shard_file(args.shard_to_files) + shard_to_first_id = read_shard_file(args.shard_to_first_id) + shard_to_last_id = read_shard_file(args.shard_to_last_id) + combine_dolma_with_shard_info( + args.input, args.output, shard_to_files, shard_to_first_id, shard_to_last_id + ) + else: + raise ValueError( + "Either all or none of --shard_to_files, --shard_to_first_id, and " + f"--shard_to_last_id should be given, got --shard_to_files={args.shard_to_files}, " + f"--shard_to_first_id={args.shard_to_first_id}, and --shard_to_last_id={args.shard_to_last_id}" + ) + + +if __name__ == "__main__": + main() diff --git a/licensed_pile/scripts/id_to_shard.py b/licensed_pile/scripts/id_to_shard.py new file mode 100644 index 0000000..d74fa42 --- /dev/null +++ b/licensed_pile/scripts/id_to_shard.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +import argparse +import glob +import json +import multiprocessing as mp +import os +import re +from tempfile import TemporaryDirectory + +import smart_open + +from licensed_pile import utils, write + + +class 
IdToShardParallel(write.ShardParallelProcessor): + @classmethod + def process_example(cls, example, **kwargs): + return {"id": example["id"]} + + +def main(): + mp.set_start_method("spawn") + parser = argparse.ArgumentParser(description="") + parser.add_argument("--input", help="", required=True) + parser.add_argument( + "--output", + help="", + default="id_to_shards.json", + ) + parser.add_argument("--processes", type=int, default=mp.cpu_count(), help="") + args = parser.parse_args() + + args.input = utils.dolma_input(args.input) + + with TemporaryDirectory() as tempdir: + processor = IdToShardParallel( + source_prefix=args.input, + destination_prefix=tempdir, + metadata_prefix=tempdir, + num_processes=args.processes, + ) + processor() + + id_to_shard = {} + for shard_file in glob.iglob( + os.path.join(tempdir, os.path.basename(args.input)) + ): + if shard := re.search(r"^(\d{5})_", os.path.basename(shard_file)): + shard = shard.group(1) + with smart_open.smart_open(shard_file) as f: + ids = [json.loads(l)["id"] for l in f if l] + id_to_shard |= dict.fromkeys(ids, shard) + with smart_open.smart_open(args.output, "w") as wf: + json.dump(id_to_shard, wf) + + +if __name__ == "__main__": + main() diff --git a/licensed_pile/scripts/remove_html.py b/licensed_pile/scripts/remove_html.py new file mode 100644 index 0000000..2839da9 --- /dev/null +++ b/licensed_pile/scripts/remove_html.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 + +import argparse +import multiprocessing as mp +import re +from tempfile import TemporaryDirectory + +import bs4 + +from licensed_pile import logs, utils +from licensed_pile.write import ShardParallelProcessor + +parser = argparse.ArgumentParser(description="Remove HTML from dolma documents.") +parser.add_argument( + "--input", + required=True, + help="The input version, this directory should be where the `documents` dir lives.", +) +parser.add_argument( + "--output", + required=True, + help="The output version, this directory should be where the `documents` dir will live.", +) +parser.add_argument( + "--filename", + default="*.jsonl.gz", + help="The filename to match with globs, probably needs to be escaped.", +) +# TODO: Respect this flag +parser.add_argument( + "--overwrite", + action="store_true", + help="Should we overwrite previously processed examples?", +) +parser.add_argument( + "--debug", + action="store_true", + help="Should we log when documents are not changed by preprocessing.", +) +parser.add_argument( + "--processes", + type=int, + default=mp.cpu_count(), + help="Number of processors for multicore.", +) + +logs.configure_logging(level="DEBUG") + + +class CaptureMatches: + def __init__(self): + self.matches = [] + + def __call__(self, m): + try: + self.matches.append(m.group(1)) + except IndexError: + self.matches.append(m) + return "" + + def __iter__(self): + yield from self.matches + + def __bool__(self): + return bool(self.matches) + + +class RegexRemoveHTMLParallel(ShardParallelProcessor): + @classmethod + def process_example(cls, example, **kwargs): + logger = cls.get_logger() + cm = CaptureMatches() + # Capture the smallest amount of text between < and >.
+ # This would not be ok if we cared about malicious input. + # cleaned_text = re.sub(r"(<(?:div|font).*?>)", cm, example["text"]) + cleaned_text = re.sub(r"(<[^ >][^>]*?>)", cm, example["text"]) + + if cm: + for m in cm: + logger.debug( + "Removed %s based on regex", + m, + extra={ + "source": example["source"], + "example_id": example["id"], + "match": m, + }, + ) + + example["text"] = cleaned_text + return example + + +class BS4RemoveHTMLParallel(ShardParallelProcessor): + """There are issues with using bs4 to remove partial html.""" + + @classmethod + def process_example(cls, example, **kwargs): + logger = cls.get_logger() + try: + example["text"] = bs4.BeautifulSoup( + example["text"], "html.parser" + ).get_text() + except bs4.ParserRejectedMarkup: + # If this exception is raised, it will be before the assignment so + # example["text"] is still the original text. + logger.warning( + "Failed to remove HTML, probably due to text that looks like an HTML tag, keeping text as is.", + extra={ + "source": example["source"], + "example_id": example["id"], + }, + exc_info=True, + ) + except: + logger.error( + "Failed to parse HTML", + extra={ + "source": example["source"], + "example_id": example["id"], + }, + exc_info=True, + ) + # Just pass the text through for now + return example + + +def main(args): + with TemporaryDirectory() as tempdir: + processor = RegexRemoveHTMLParallel( + source_prefix=utils.dolma_input(args.input, args.filename), + destination_prefix=utils.dolma_output(args.output), + metadata_prefix=tempdir, + num_processes=args.processes, + ) + processor(debug=args.debug, overwrite=args.overwrite) + + +if __name__ == "__main__": + # Dolma examples use spawn over fork, unsure why but let's follow them. + mp.set_start_method("spawn") + args = parser.parse_args() + main(args) diff --git a/licensed_pile/scripts/remove_none.py b/licensed_pile/scripts/remove_none.py new file mode 100644 index 0000000..461dbc1 --- /dev/null +++ b/licensed_pile/scripts/remove_none.py @@ -0,0 +1,130 @@ +"""Remove `None` lines from dolma files.""" + +import argparse +import json +import multiprocessing as mp +import os +import re +from queue import Queue +from tempfile import TemporaryDirectory + +import smart_open +from dolma.core.parallel import BaseParallelProcessor + +from licensed_pile import utils +from licensed_pile.logs import configure_logging, get_logger + +configure_logging() + + +class RemoveNoneParallel(BaseParallelProcessor): + @classmethod + def get_logger(cls): + return get_logger() + + @classmethod + def increment_progressbar( + cls, + queue: Queue, + /, + shards: int = 0, + documents: int = 0, + nones: int = 0, + ): + return super().increment_progressbar( + queue, + shards=shards, + documents=documents, + nones=nones, + ) + + @classmethod + def process_single( + cls, + source_path: str, + destination_path: str, + queue: Queue, + **kwargs, + ): + logger = cls.get_logger() + with logger(file=source_path): + logger.debug("Removing None's from Dolma files at %s", source_path) + with smart_open.open(source_path) as f, smart_open.open( + destination_path, "w" ) as wf: + document_count = 0 + none_count = 0 + update_interval = kwargs.pop("update_interval", 1) + + for i, line in enumerate(f): + with logger(line=i): + try: + try: + data = json.loads(line) + except json.JSONDecodeError as e: + logger.error( + "Failed to parse JSON from `%s...`", + line[:80], + exc_info=True, + ) + continue + + document_count += 1 + if data is None: + none_count += 1 + else: +
wf.write(json.dumps(data) + "\n") + + if document_count % update_interval == 0: + cls.increment_progressbar( + queue, + documents=document_count, + nones=none_count, + ) + if queue.qsize() >= mp.cpu_count(): + update_interval *= 2 + document_count = 0 + none_count = 0 + except Exception: + logger.error( + "Failed to process example", source_path, exc_info=True + ) + raise + cls.increment_progressbar( + queue, shards=1, documents=document_count, nones=none_count + ) + + +def main(): + mp.set_start_method("spawn") + parser = argparse.ArgumentParser(description="Remove None's from dolma files.") + parser.add_argument( + "--input", + required=True, + help="The dolma input directory, should be where the `documents` dir lives. Can also be a specific file.", + ) + parser.add_argument("--output", required=True, help="The dolma output directory") + parser.add_argument( + "--processes", + type=int, + default=mp.cpu_count(), + help="Number of processors for multicore.", + ) + parser.add_argument("--meta", help="Location of Dolma processing metadata.") + args = parser.parse_args() + + source = utils.dolma_input(args.input) + destination = utils.dolma_output(args.output) + + with utils.maybe_temp_dir(args.meta) as meta_dir: + processor = RemoveNoneParallel( + source_prefix=source, + destination_prefix=destination, + metadata_prefix=meta_dir, + num_processes=args.processes, + ) + processor() + + +if __name__ == "__main__": + main() diff --git a/licensed_pile/scripts/stats.py b/licensed_pile/scripts/stats.py index c06ab9d..ca14420 100644 --- a/licensed_pile/scripts/stats.py +++ b/licensed_pile/scripts/stats.py @@ -11,8 +11,17 @@ import smart_open from dolma.core.parallel import BaseParallelProcessor +from licensed_pile import utils +from licensed_pile.logs import configure_logging, get_logger + +configure_logging() + class SizeStatsParallel(BaseParallelProcessor): + @classmethod + def get_logger(cls): + return get_logger() + @classmethod def increment_progressbar( cls, @@ -44,62 +53,67 @@ def process_single( del destination_path logger = cls.get_logger() logger.debug("Counting Tokens from Dolma files at %s", source_path) - with smart_open.open(source_path) as f: - document_count = 0 - token_count = 0 - byte_count = 0 - char_count = 0 - update_interval = kwargs.pop("update_interval", 1) - - try: + with logger(file=source_path): + with smart_open.open(source_path) as f: + document_count = 0 + token_count = 0 + byte_count = 0 + char_count = 0 + update_interval = kwargs.pop("update_interval", 1) + for i, line in enumerate(f): - try: - data = json.loads(line) - except json.JSONDecodeError as e: - logger.warning( - "Failed to parse %s:%s `%s...`: %s", - source_path, - i, - line[:80], - e, - ) - continue - - # TODO: Make this configurable - tokens = data["text"].split() - document_count += 1 - token_count += len(tokens) - char_count += len(data["text"]) - # There are some sources that have invalid unicode that result - # in rendering errors in webpages. Thus we ignore them here. 
- # Example: https://math.stackexchange.com/a/8849 - byte_count += len(data["text"].encode("utf-8", "ignore")) - - if document_count % update_interval == 0: - cls.increment_progressbar( - queue, - documents=document_count, - tokens=token_count, - bytes_utf8=byte_count, - characters=char_count, - ) - if queue.qsize() >= mp.cpu_count(): - update_interval *= 2 - document_count = 0 - token_count = 0 - char_count = 0 - byte_count = 0 - except Exception as e: - logger.warning("Failed to process %s: %s", source_path, e) - return - cls.increment_progressbar( - queue, - shards=1, - documents=document_count, - tokens=token_count, - bytes_utf8=byte_count, - characters=char_count, - ) + with logger(line=i): + try: + try: + data = json.loads(line) + except json.JSONDecodeError: + logger.error( + "Failed to parse JSON from `%s...`", + line[:80], + exc_info=True, + ) + continue + # TODO: Dolma file generation should not be adding null lines + if data is None: + continue + # TODO: Make this configurable + if data["text"] is None: + document_count += 1 + continue + tokens = data["text"].split() + document_count += 1 + token_count += len(tokens) + char_count += len(data["text"]) + # There are some sources that have invalid unicode that result + # in rendering errors in webpages. Thus we ignore them here. + # Example: https://math.stackexchange.com/a/8849 + byte_count += len(data["text"].encode("utf-8", "ignore")) + + if document_count % update_interval == 0: + cls.increment_progressbar( + queue, + documents=document_count, + tokens=token_count, + bytes_utf8=byte_count, + characters=char_count, + ) + if queue.qsize() >= mp.cpu_count(): + update_interval *= 2 + document_count = 0 + token_count = 0 + char_count = 0 + byte_count = 0 + except Exception as e: + logger.error("Failed to process example", exc_info=True) + raise + cls.increment_progressbar( + queue, + shards=1, + documents=document_count, + tokens=token_count, + bytes_utf8=byte_count, + characters=char_count, + ) def main(): @@ -116,21 +130,19 @@ def main(): default=mp.cpu_count(), help="Number of processors for multicore.", ) + parser.add_argument( + "--meta", help="Location to store dolma metadata while processing." + ) args = parser.parse_args() - if os.path.exists(args.input) and os.path.isfile(args.input): - source = args.input - else: - source = os.path.join( - re.sub("documents/?$", "", args.input), "**", "*.jsonl.gz" - ) + source = utils.dolma_input(args.input) - with TemporaryDirectory() as tempdir: + with utils.maybe_temp_dir(path=args.meta) as meta_dir: processor = SizeStatsParallel( source_prefix=source, # Unused - destination_prefix=tempdir, - metadata_prefix=tempdir, + destination_prefix=meta_dir, + metadata_prefix=meta_dir, num_processes=args.processes, ) processor() diff --git a/licensed_pile/utils.py b/licensed_pile/utils.py index 2ee4e48..3acdeb9 100644 --- a/licensed_pile/utils.py +++ b/licensed_pile/utils.py @@ -2,6 +2,10 @@ import glob import os +import re +from contextlib import contextmanager +from tempfile import TemporaryDirectory +from typing import Optional # We don't use snake case as the string methods added in PIP616 are named like this. @@ -25,7 +29,7 @@ def dolma_input(input_path: str, filepattern: str = "*.jsonl.gz") -> str: # If the input is directly to a file, or it is a glob that returns matches, # use as is. 
if (os.path.exists(input_path) and os.path.isfile(input_path)) or glob.glob( - input_path + input_path, recursive=True ): return input_path # Otherwise it is probably meant as a directory, so add the ../documents/${filepattern} @@ -39,6 +43,15 @@ def dolma_input(input_path: str, filepattern: str = "*.jsonl.gz") -> str: def dolma_output(output_path: str): # Make sure the output ends in .../documents, many people forget this. - if os.path.basename(output_path) != "documents": - return os.path.join(output_path, "documents") - return output_path + if re.match(".*/documents/?$", output_path): + return output_path + return os.path.join(output_path, "documents") + + +@contextmanager +def maybe_temp_dir(path: Optional[str] = None): + if path is not None: + yield path + else: + with TemporaryDirectory() as tmpdir: + yield tmpdir diff --git a/licensed_pile/write.py b/licensed_pile/write.py index f5d9c34..9083061 100644 --- a/licensed_pile/write.py +++ b/licensed_pile/write.py @@ -3,12 +3,14 @@ import abc import copy import json +import logging import multiprocessing as mp import os from contextlib import ExitStack from queue import Queue from typing import Dict, Iterator +import contextual_logger import smart_open import tqdm from dolma.core.parallel import BaseParallelProcessor @@ -55,6 +57,20 @@ def to_dolma( wf.write(data + "\n") +def smart_open_exists(path): + try: + with smart_open.open(path): + return True + except: + return False + + +def create_shadow(path): + h, t = os.path.split(path) + # Add shadow at the start to not break any filename inference from smart_open + return os.path.join(h, f"shadow.{t}") + + class ShardParallelProcessor(BaseParallelProcessor): """Handle read/writes to jsonl.gz so our processor code only needs to processing a single example.""" @@ -73,6 +89,10 @@ def increment_progressbar( def process_example(cls, example, **kwargs): """Code to process a single example in the dolma format, not the whole file.""" + @classmethod + def get_logger(cls): + return get_logger() + @classmethod def process_single( cls, @@ -82,55 +102,72 @@ def process_single( **kwargs, ): logger = cls.get_logger() - logger.debug("Processing %s into %s", source_path, destination_path) - with smart_open.open(source_path) as f, smart_open.open( - destination_path, "w" - ) as wf: - document_count = 0 - update_interval = kwargs.pop("update_interval", 1) - debug = kwargs.pop("debug", False) - - try: - for i, line in enumerate(f): - try: - data = json.loads(line) - except json.JSONDecodeError as e: - logger.warning( - "Failed to parse %s:%s `%s...`: %s", - source_path, - i, - line[:80], - e, - ) - continue - - if debug: - og = copy.deepcopy(data["text"]) - - processed = cls.process_example(data, **kwargs) - - if processed is None: - logger.warning( - "Preprocessing has reduced %s:%s to nothing, skipping", - source_path, - i, - ) - continue - - if debug and og == processed["text"]: - logger.warning( - "Text unchanged for example %s:%s", source_path, i - ) - - wf.write(json.dumps(processed) + "\n") - document_count += 1 - - if document_count % update_interval == 0: - cls.increment_progressbar(queue, documents=document_count) - if queue.qsize() >= mp.cpu_count(): - update_interval *= 2 - document_count = 0 - except Exception as e: - logger.warning("Failed to process %s: %s", source_path, e) + overwrite = kwargs.pop("overwrite", False) + shadow = kwargs.pop("shadow", True) + with logger(file=source_path): + logger.debug("Processing %s into %s", source_path, destination_path) + if not overwrite and 
smart_open_exists(destination_path): + logger.info("%s already exists, skipping", destination_path) + cls.increment_progressbar(queue, shards=1) return - cls.increment_progressbar(queue, shards=1, documents=document_count) + output_path = ( + create_shadow(destination_path) if shadow else destination_path + ) + with smart_open.open(source_path) as f, smart_open.open( + output_path, "w" + ) as wf: + document_count = 0 + update_interval = kwargs.pop("update_interval", 1) + debug = kwargs.pop("debug", False) + + for i, line in enumerate(f): + with logger(line=i): + try: + try: + data = json.loads(line) + except json.JSONDecodeError as e: + logger.warning( + "Failed to parse JSON from `%s...`", + line[:80], + exc_info=True, + ) + continue + + og = copy.deepcopy(data["text"]) if debug else None + processed = cls.process_example( + data, source_file=source_path, line_number=i, **kwargs + ) + if processed is None: + logger.warning( + "Preprocessing has reduced example to nothing, skipping" + ) + document_count += 1 + continue + + if debug and og == processed["text"]: + logger.warning("Text unchanged for example.") + + wf.write(json.dumps(processed) + "\n") + document_count += 1 + + if document_count % update_interval == 0: + cls.increment_progressbar( + queue, documents=document_count + ) + if queue.qsize() >= mp.cpu_count(): + update_interval *= 2 + document_count = 0 + except Exception as e: + e.add_note( + f"Exception occured while processing {source_path}:{i}" + ) + logger.warning( + "Exception occured while processing example", + exc_info=True, + ) + raise + # Cloud Storage generally doesn't have a cheap way to rename files. So + # shadow paging should generally only be used for local data. + if shadow: + os.rename(output_path, destination_path) + cls.increment_progressbar(queue, shards=1, documents=document_count) diff --git a/licensed_pile/xml.py b/licensed_pile/xml.py index 346e761..e8bf896 100644 --- a/licensed_pile/xml.py +++ b/licensed_pile/xml.py @@ -1,6 +1,10 @@ """Tools to help with xml parsing.""" -from xml.etree import ElementTree as ET +from typing import List + +import lxml.etree as ET + +from licensed_pile import logs def iterate_xml(path: str, tag: str): @@ -13,10 +17,24 @@ def iterate_xml(path: str, tag: str): See https://web.archive.org/web/20201111201837/http://effbot.org/zone/element-iterparse.htm for more details on what it is doing. """ + logger = logs.get_logger() context = ET.iterparse(path, events=("start", "end")) context = iter(context) event, root = next(context) - for event, elem in context: - if event == "end" and elem.tag == tag: - yield elem - root.clear() + try: + for event, elem in context: + # This `.localname` only exists for lxml. Include this or so you can + # still do a full namespace match if you need too. 
+ if event == "end" and ( + ET.QName(elem.tag).localname == tag or elem.tag == tag + ): + yield elem + root.clear() + except Exception as e: + logger.exception(f"Failed iterating over <{tag}> in {path}") + + +def iterate_xmls(paths: List[str], tag: str): + """Iterable version of parsing multiple xml files with the same structure as a single iterator.""" + for path in paths: + yield from iterate_xml(path, tag) diff --git a/requirements.txt b/requirements.txt index 24a4692..e90d82c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,20 @@ +beautifulsoup4 charset_normalizer +datasets dolma google-cloud-storage +internetarchive logging_json markdown-it-py pandas +patool pre-commit +pyunpack rdflib requests>=2.13 smart_open streamlit tenacity -pandas -jsonlines -datasets tqdm ultimate-sitemap-parser +contextual-logger>=0.0.2 diff --git a/setup.py b/setup.py index 9714f51..6ae9b80 100644 --- a/setup.py +++ b/setup.py @@ -62,8 +62,12 @@ def get_version(file_name: str, version_variable: str = "__version__") -> str: "logging_json", "requests>=2.13", "tenacity", + "lxml", ], entry_points={ - "console_scripts": ["size-stats-dolma = licensed_pile.scripts.stats:main"] + "console_scripts": [ + "size-stats-dolma = licensed_pile.scripts.stats:main", + "remove-none-dolma = licensed_pile.scripts.remove_none:main", + ] }, ) diff --git a/wiki/.gitignore b/wiki/.gitignore new file mode 100644 index 0000000..60baa9c --- /dev/null +++ b/wiki/.gitignore @@ -0,0 +1 @@ +data/* diff --git a/wiki/README.md b/wiki/README.md new file mode 100644 index 0000000..9be21a8 --- /dev/null +++ b/wiki/README.md @@ -0,0 +1,47 @@ +# Wiki + +## Data Generation + +### WikiMedia + +WikiMedia wikis are ones maintained by the WikiMedia Foundataion, they run things like wikipedia. They regularly publish dumps in the `-history.xml` format that we can convert to dolma files. + +Insturctions for downloading and processing all the WikiMedia wikis can be found in the `dump/` directory. + +### WikiTeam + +The Internet Archive WikiTeam scrapes many wikis across the net and publishes them to the Internet Archive. Of these many are openly licensed. The insturctions for downloading and processing those can be found in the `archive/` directory. + +### Wiki Scaping + +We currently do not do wiki scraping, we just use published dumps or scrapes published by the Internet Archive. The `scrape/` directory has some tools to start scraping in the future. If we plan to do more scraping in the future, we should probably use [wikiteam3](https://github.com/saveweb/wikiteam3) instead of writing our own tools. + +### Conversion to Plain Text + +Following the README's in each subdirectory will result in dolma formatted files that are on-disk with wikitext versions as the `text` field. We then convert them to plain text. + +1. Start the WTF Wikipedia parsing server using the instructions in the `parser/` directory. +2. Run `python preprocessing.py ...` +3. Run `python scripts/remove_html.py ...` + +## Notes + +The following scanners output a .history.xml to parse +* "Internet Archive HTML5 Uploader ...": Seems to have .7z +* "wikiteam3 (v...)" these get released as .zstandard files. +* Official Wikipedia Dumps +* "Internet Archive Python library ..." >= 1.0.4 + + +The following use the old format +* "Internet Archive Python library 0.X.X": As a zip file, you need to make a new dir with -d when you unzip. 
+ + +The archive url can be created with `f"archive.org/details/{item_id}"` + + +Some of the items have multiple uploads, for example `wiki-kris159shoutwikicom_w` has multiple history files, so we need to parse out the date and pick the most recent one, i.e., `kris159shoutwikicom_w-20180506-history-xml.7z` over `kris159shoutwikicom_w-20140129-history.xml.7z` + +### Special Cases + +Shout Wiki, WikiTravelAllLanguages diff --git a/wiki/__init__.py b/wiki/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wiki/archive/.gitignore b/wiki/archive/.gitignore new file mode 100644 index 0000000..a7e854e --- /dev/null +++ b/wiki/archive/.gitignore @@ -0,0 +1 @@ +data*/* diff --git a/wiki/archive/README.md b/wiki/archive/README.md new file mode 100644 index 0000000..c67cb7a --- /dev/null +++ b/wiki/archive/README.md @@ -0,0 +1,32 @@ +# Wiki Dumps from the Internet Archive + +## Data Generation + +1. Use `python get_metadata.py` to download the wiki metadata from the IA with a bit of parallelism. This creates a `ia-wiki-metadata.jsonl` file that will be used in the rest of the scripts. +2. Use `python download_archive.py` to download and extract the actual wikis. In the future, this will also handle other wiki fetching methods like dump downloading and scraping. +3. Use `python to_dolma.py` from **this** directory to convert the IA archive wikis to the dolma format. This will save them as dolma formatted files with wikitext in the `text` field at `../data/...` by default. We need to use the `to_dolma.py` script from here as many IA wikis are in an old format that the generic dolma conversion script doesn't support. +4. Use the shared preprocessing pipeline to convert to plain text. + +## Notes + +We need to download 4.4 TB from the Internet Archive. + +If we had a Gigabit connection, it would take about 9 hours to download. + +Based on anecdotal reports, the IA generally has a bandwidth of 1 to 10 Mbps, and the longer you download, the less bandwidth they give you. + +| Bandwidth | Hosts | Time to DL | +|----------|------:|-----------:| +| 1 Gb/s | 1 | 9h 40m | +| | 4 | 2.3h | +| | 10 | 0.9h | +| 10 Mb/s | 1 | 40d 17h | +| | 4 | 10d + | +| | 10 | 4d + | +| 1 Mb/s | 1 | 407d 9h | +| | 4 | 101d + | +| | 10 | 40d+ | +| | 100 | 4d+ | +| | 500 | 0.8d | + +We really need hardware-based parallelism. diff --git a/wiki/archive/__init__.py b/wiki/archive/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wiki/archive/download_archive.py b/wiki/archive/download_archive.py new file mode 100644 index 0000000..7d128f7 --- /dev/null +++ b/wiki/archive/download_archive.py @@ -0,0 +1,176 @@ +"""Download wiki dumps from the internet archive.""" + +import argparse +import functools +import json +import multiprocessing.dummy as mp +import os +import random + +import internetarchive +import pyunpack +import utils + +from licensed_pile import logs + +parser = argparse.ArgumentParser( + description="Download wiki dumps from the internet archive."
+) +parser.add_argument("--dest_dir", default="data/archive/dumps", help="") +parser.add_argument("--wiki_metadata", default="data/archive/ia-wiki-metadata.jsonl") +parser.add_argument("--test_run", type=int, help="") +parser.add_argument("--num_threads", type=int, default=32, help="") +parser.add_argument("--worker_id", type=int, help="") +parser.add_argument("--num_workers", type=int, help="") + + +def download_and_extract( + ident: str, + dl_file, + output_dir: str = "data/archive/dumps", + verbose: bool = True, +): + """Download from the IA and uncompress it.""" + logger = logs.get_logger() + # Turn wiki id's into nested dirs for easier/faster traversal. + dest = os.path.join(output_dir, utils.wiki_to_dir(ident)) + # Don't re-download files. + if os.path.exists(dest): + logger.info( + f"Skipping download of {dl_file['name']} as {dest} already exists on disk." + ) + return dest + else: + # Download using the IA tools, this includes doing a checksum to verify + # the download was correct. + logger.info(f"Downloading {dl_file['name']}.") + try: + internetarchive.download( + ident, + checksum=True, + verbose=verbose, + files=dl_file["name"], + destdir=output_dir, + ) + except: + # TODO: Should we ensure the dest dir is deleted in case of failure? + logger.error(f"Failed to download {dl_file['name']}", exc_info=True) + try: + os.rmdir(dest) + except: + logger.error( + f"Failed to remove {dest} after a failed download.", exc_info=True + ) + return dest + logger.info(f"Extracting download at {dest}.") + try: + # pyunpack wraps multiple extraction tools, and picks the right one. + pyunpack.Archive(os.path.join(dest, dl_file["name"])).extractall(dest) + except: + # The main error I saw was that zstd compressed data uses the flag + # --long=31. We can't pass this flag to pyunpack so we need to run + # zstd ourselves. + logger.error("Pyunpack uncompression failed.", exc_info=True) + if dl_file["name"].endswith(".zst"): + logger.info(f"Extracting download to {dest} with tweaked zst.") + compressed = os.path.join(dest, dl_file["name"]) + uncompressed = utils.zst_uncompress(compressed) + return dest + + +def download_ia(wiki, dest_dir): + """Download a wiki from the IA.""" + logger = logs.get_logger() + if (ident := wiki["metadata"]["identifier"]) in utils.KNOWN_BAD: + logger.warning(f"Skipping wiki as it is listed under utils.KNOWN_BAD") + return None + # There are multiple files that you can download, only download the needed one.
+ dl_file = utils.find_download(wiki) + return download_and_extract(ident, dl_file, dest_dir) + + +def download_fandom(wiki, dest_dir): + """TODO: Download wiki dumps directly from fandom.""" + logger = logs.get_logger() + logger.warning(f"Fandom downloads not implemented yet, downloading from IA.") + return download_ia(wiki, dest_dir) + + +def scrape_wiki(wiki, dest_dir): + """TODO: Rescrape a wiki using wikiteam3.""" + logger = logs.get_logger() + logger.warning(f"Wiki Re-scrapes not implemented yet, downloading from IA.") + return download_ia(wiki, dest_dir) + + +def process_wiki(i, wiki, offset, dest_dir): + """Download a wiki, proxying to different fetch functions based on the wiki.""" + logger = logs.get_logger() + if "metadata" not in wiki: + logger.error( + f"Metadata missing from wiki, malformed record", extra={"line": i} + ) + return None + ident = wiki["metadata"]["identifier"] + with logger(wiki=ident): + if not utils.filter_language(lang := wiki["metadata"].get("language")): + logger.warning(f"wiki appears to not be in english, found: {lang}") + return None + if not utils.check_alive(wiki): + logger.info(f"wiki is offline, getting dump from IA.") + return download_ia(wiki, dest_dir) + if not utils.verify_license(wiki): + logger.error(f"The IA license for wiki doesn't match the source.") + return None + if utils.check_fandom(wiki): + logger.info(f"wiki is from fandom, downloading dump from them.") + return download_fandom(wiki, dest_dir) + if utils.check_wikimedia(wiki): + logger.info(f"wiki is a WikiMedia wiki, use the `../dump` tools instead.") + return None + if utils.check_out_of_date(wiki, offset): + logger.warning(f"IA dump of wiki is very out of date, re-scraping.") + return scrape_wiki(wiki, dest_dir) + + +def main(args): + logger = logs.get_logger() + logger.info(f"Reading wiki metadata from {args.wiki_metadata}") + with open(args.wiki_metadata) as f: + wiki_metadata = [json.loads(l) for l in f if l] + logger.info(f"{len(wiki_metadata)} wikis to download.") + + if args.test_run: + logger.info(f"Test Run: Only downloading {args.test_run} wikis") + # Not true shuffling as the number of permutations for so many wikis + # is much larger than the period of the RNG (python breaks after ~2100) + # but we aren't doing crypto so it isn't an issue. + random.shuffle(wiki_metadata) + wiki_metadata = wiki_metadata[: args.test_run] + + if args.num_workers and args.worker_id is not None: + # Partition downloads of different wikis to different workers. + # Each worker runs their own copy of this script, with a unique + # --worker_id + wiki_metadata = [ + w + for i, w in enumerate(wiki_metadata) + if i % args.num_workers == args.worker_id + ] + logger.info( + f"{len(wiki_metadata)} wikis to download as worker {args.worker_id}/{args.num_workers}." + ) + + # Run multiple download processes to try to have concurrent downloads from the IA + # Will still be slow, especially if all workers are writing to a shared disk. + with mp.Pool(args.num_threads) as pool: + pool.starmap( + functools.partial(process_wiki, offset=None, dest_dir=args.dest_dir), + enumerate(wiki_metadata), + ) + + +if __name__ == "__main__": + args = parser.parse_args() + logs.configure_logging() + main(args) diff --git a/wiki/archive/get_metadata.py b/wiki/archive/get_metadata.py new file mode 100644 index 0000000..a2f3d1d --- /dev/null +++ b/wiki/archive/get_metadata.py @@ -0,0 +1,99 @@ +"""Download wiki dump metadata from the internet archive.
+ +The licenseurl regexes we are using to search are mutually exclusive so we can +split the query into multiple chunks instead of `OR`ing them together to get some +parallelism out of the metadata scrape. Downloading the metadata for the 350k wikis +in a single query to the IA API took hours. + +TODO: If we include the "wikicollections" data (wikis uploaded to the IA that +aren't scraped by the wikiteam), we jump up to ~4 million wikis compared to the +350k we get from wikiteam. +""" + +import argparse +import functools +import json +import multiprocessing.dummy as mp +import os +import re +import shutil + +import internetarchive + +from licensed_pile import logs +from licensed_pile.licenses import PermissiveLicenses + +parser = argparse.ArgumentParser( + description="Download metadata for wiki dumps from the IA." +) +parser.add_argument("--output_dir", default="data/metadata/", help="") +parser.add_argument("--file_name", default="ia-wiki-metadata.jsonl") +# TODO: Respect these +parser.add_argument("--include_wikicollections", action="store_true", help="") +parser.add_argument("--licenses", choices=[], action="append", help="") + + +def get_metadata(idx: int, query: str, file_name: str, output_dir: str): + """Fetch item metadata from IA using query and save it to disk.""" + logger = logs.get_logger() + logger.info(f"Querying IA with {query}") + with open(os.path.join(output_dir, f"{idx:>05}_{file_name}"), "w") as wf: + # This is a cursor so it fetches items from the IA as we + # iterate over it. + for item in internetarchive.search_items(query): + wf.write(json.dumps(item.item_metadata) + "\n") + + +def make_queries(licenses, include_wikicollections): + """Convert the CLI args into a collection of queries to make.""" + if include_wikicollections: + raise NotImplementedError("...") + license_regexs = licenses + for license_regex in license_regexs: + yield f"collection:(wikiteam) AND licenseurl:({license_regex})" + + +def merge_shards(output_dir, file_name): + """Merge each of our response shards into one.""" + shards = [] + for fname in os.listdir(output_dir): + if m := re.match(rf"(\d{{5}}_{file_name})", fname): + shards.append(m.group(1)) + logger = logs.get_logger() + shards = sorted(shards) + logger.info(f"Found {len(shards)} shards, {shards}") + with open(os.path.join(output_dir, file_name), "wb") as wf: + for in_file in shards: + with open(os.path.join(output_dir, in_file), "rb") as f: + # Use shutil to copy the file in chunks without the overhead + # of actually finding line endings required by `for line in f` + # or .readlines() + # Note: We know that each shard file ends with a newline so we + # can just concatenate them, we don't need to insert another + # newline.
+ shutil.copyfileobj(f, wf) + + +def main(args): + # TODO have something that translates from the PermissiveLicense Enum to regexes + if args.licenses is None: + args.licenses = ( + "*\/by\/*", + "*\/by-sa\/*", + "*publicdomain*", + "*GNU_Free_Documentation_License*", + ) + queries = list(make_queries(args.licenses, args.include_wikicollections)) + with mp.Pool(len(queries)) as pool: + pool.starmap( + functools.partial( + get_metadata, file_name=args.file_name, output_dir=args.output_dir + ), + enumerate(queries), + ) + merge_shards(args.output_dir, args.file_name) + + +if __name__ == "__main__": + args = parser.parse_args() + logs.configure_logging() + main(args) diff --git a/wiki/archive/to_dolma.py b/wiki/archive/to_dolma.py new file mode 100644 index 0000000..5ced237 --- /dev/null +++ b/wiki/archive/to_dolma.py @@ -0,0 +1,351 @@ +"""Convert wiki dumps from the internet archive to dolma.""" + +import argparse +import datetime +import functools +import glob +import json +import math +import multiprocessing as mp +import os +import re +import uuid + +import pandas as pd +import pytz +import utils + +from licensed_pile import logs +from licensed_pile.licenses import PermissiveLicenses +from licensed_pile.utils import dolma_output +from licensed_pile.write import to_dolma +from licensed_pile.xml import iterate_xmls + +parser = argparse.ArgumentParser( + description="Convert Downloaded Wiki dumps from the internet archive to dolma." +) +parser.add_argument("--wiki_metadata", default="data/wiki/archive/ia-wikis.jsonl") +parser.add_argument( + "--dump_dir", help="The location of the IA dump.", default="data/wiki/archive/dumps" +) +parser.add_argument( + "--output_dir", + help="Where the dolma formatted data goes.", +) +parser.add_argument("--filename", help="The base filename for our wiki data.") +parser.add_argument( + "--shard_size", type=int, default=1, help="Size, in GB, for each shard." +) +parser.add_argument( + "--last_author", + action="store_true", + help="Should we only include the most recent author? (Faster)", +) +parser.add_argument( + "--include_redirects", + action="store_true", + help="Should we include pages that are redirects to other pages?", +) + + +def format_old( + page: str, + source_name: str, + wiki: str, + dump_url: str, + url: str, + license: PermissiveLicenses, +): + """Convert old style wiki dumps with a pages/ dir into the dolma format.""" + logger = logs.get_logger() + try: + with open(f"{page}.wikitext") as f: + text = f.read() + metadata = pd.read_csv(f"{page}.history.csv") + metadata = metadata.replace(math.nan, None) + authors = set(metadata["Author"]) + # Use .discard instead of .remove in case None isn't in the author set. + authors.discard(None) + + # The date column is formatted as "Date (timezone)". Here we find which timezone + # it is. Unclear if the tz is the same for all dumps so we infer it here. + tz = [ + tz.group("tz") + for col in metadata.columns + if (tz := re.match(r"^Date \((?P<tz>.*?)\)$", col)) + ] + tz = tz[0] if tz else "UTC" + date_col = f"Date ({tz})" + tz = pytz.timezone(tz) + + # We are going to use the most recent revision so the "created" for that + # version is the most recent date.
+ dates = metadata[date_col].apply( + lambda d: datetime.datetime.strptime(d, "%Y-%m-%d %H:%M:%S") + ) + created = max(dates) + + page_title = os.path.basename(page) + + return { + "id": page_title, + "text": text, + "source": f"{source_name}/{wiki}", + "added": datetime.datetime.utcnow().isoformat(), + "created": created.isoformat(), + "metadata": { + "license": str(license), + "authors": sorted(authors), + # Things dumped with only the old format are generally not online + # anymore so use the dump url. + "dump_url": dump_url, + "wiki": wiki, + "url": url, + # These old dumps don't include namespaces. + "namespace": None, + "title": page_title, + }, + } + except: + logger.exception(f"Failed to parse {wiki}:{page}") + + +def format_xml( + xml, + source_name: str, + wiki: str, + url: str, + dump_url: str, + license: PermissiveLicenses, + all_authors: bool = True, + skip_redirect: bool = True, +): + """Convert a -history.xml file to the dolma format.""" + # TODO: This is shared with the generic dolma version, but more robust, should be unified. + logger = logs.get_logger() + if skip_redirect and [x for x in xml if x.tag.endswith("redirect")]: + # Don't log this as we haven't extracted any information to make the log + # entry useful. + # logger.info("Skipping page as it is a redirect.") + return None + + revisions = [r for r in xml if r.tag.endswith("revision")] + if not revisions: + logger.error(f"Failed to parse revision for page", extra={"wiki": wiki}) + return None + text = [t for t in revisions[-1] if t.tag.endswith("text")] + if not text: + logger.error(f"Failed to parse page text", extra={"wiki": wiki}) + text = None + else: + text = text[0].text + + page_namespace = [ns for ns in xml if ns.tag.endswith("ns")] + if not page_namespace: + page_namespace = "" + logger.warning(f"Failed to parse namespace", extra={"wiki": wiki}) + else: + page_namespace = page_namespace[0].text + + page_id = [pid for pid in xml if pid.tag.endswith("id")] + if not page_id: + logger.warning(f"Filed to find page id, generating uuid", extra={"wiki": wiki}) + page_id = uuid.uuid4() + else: + page_id = page_id[0].text + + ts = [ts for ts in revisions[-1] if ts.tag.endswith("timestamp")] + if not ts: + logger.warning("Failed to parse timestamp, using default", extra={"wiki": wiki}) + ts = "1970-01-01" + else: + ts = ts[0].text + try: + created = datetime.datetime.fromisoformat(ts).replace(tzinfo=None) + except TypeError: + logger.warning( + f"Failed to parse timestamp: {ts}", extra={"wiki": wiki}, exc_info=True + ) + created = datetime.datetime.fromisoformat("1970-01-01").replace(tzinfo=None) + + page_title = [t for t in xml if t.tag.endswith("title")] + if not page_title: + logger.warning(f"Failed to parse page title", extra={"wiki": wiki}) + page_title = "" + else: + page_title = page_title[0].text + + contributors = set() + if all_authors: + for revision in revisions: + contribs = [c for c in revision if c.tag.endswith("contributor")] + # When there are multiple contributors, there are multiple contributor + # xml items where each one has a single username and id items. 
+ names = [u.text for c in contribs for u in c if u.tag.endswith("username")] + name = ["" if n is None else n for n in name] + # Save their id too in case they change their username + uid = [u.text for c in contribs for u in c if u.tag.endswith("id")] + uid = ["" if u is None else u for u in uid] + contributors.update(zip(names, uid)) + else: + # We already checked if revisions was empty above, so we will always + # have a revisions[-1] to check. + contrib = [c for c in revisions[-1] if c.tag.endswith("contributor")] + # When there are multiple contributors, there are multiple contributor + # xml items where each one has a single username and id items. + name = [u.text for c in contrib for u in c if u.tag.endswith("username")] + name = ["" if n is None else n for n in name] + # Save their id too in case they change their username + uid = [u.text for c in contrib for u in c if u.tag.endswith("id")] + uid = ["" if u is None else u for u in uid] + contributors.update(zip(name, uid)) + + return { + "id": f"{page_namespace}-{page_id}", + "text": text, + "source": f"{source_name}/{wiki}", + "added": datetime.datetime.utcnow().isoformat(), + "created": created.isoformat(), + "metadata": { + "license": str(license), + "authors": sorted(contributors), + "url": url, + "wiki": wiki, + "dump_url": dump_url, + "namespace": page_namespace, + "title": page_title, + }, + } + + +def convert_wiki( + wiki, + source_name: str, + dump_dir: str, + output_dir: str, + filename: str, + shard_size: int, + all_authors: bool = True, + skip_redirect: bool = True, +): + """Convert a wiki into the dolma format, support new and old style wikis.""" + logger = logs.get_logger() + if "metadata" not in wiki: + logger.error(f"Metadata missing from line, malformed record") + return None + ident = wiki["metadata"]["identifier"] + wiki_path = os.path.join(dump_dir, utils.wiki_to_dir(ident)) + if not os.path.exists(wiki_path): + logger.warning(f"Dump for wiki {ident} is missing from {wiki_path}") + return None + logger.info(f"Converting wiki: {ident} to dolma.") + filename = "wiki.jsonl.gz" if filename is None else filename + dolma_dir = os.path.join(output_dir, utils.wiki_to_dir(ident)) + # Use a shadow dir to allow for starting and stopping in the middle of + # conversion. This lets us skip processing wikis that already have an + # output without worrying that the output is incomplete. + shadow_dir = os.path.join(output_dir, utils.wiki_to_dir(ident), "shadow") + os.makedirs(shadow_dir, exist_ok=True) + if os.path.exists(dolma_dir): + logger.warning(f"{dolma_dir} already exists, skipping") + return + logger.info(f"Writing Dolma documents to {shadow_dir}, shadowing {dolma_dir}") + + # Checking for an old style wiki. + pages = os.path.join(wiki_path, "pages") + if os.path.exists(pages) and os.path.isdir(pages): + logger.info(f"Wiki: {ident} is an old-style dump.") + # Get all the wikitext files via glob, but then remove them as + # we use the basename to access wikitext and history later. 
+ pages = glob.iglob(os.path.join(pages, "*.wikitext")) + pages = map(lambda p: os.path.splitext(p)[0], pages) + pages = map( + functools.partial( + format_old, + source_name=source_name, + wiki=ident, + dump_url=wiki["metadata"].get("identifier-access"), + url=wiki["metadata"].get("originalurl"), + license=PermissiveLicenses.from_string(wiki["metadata"]["licenseurl"]), + ), + pages, + ) + else: + logger.info(f"Wiki: {ident} is a new-style dump.") + export_pages = glob.glob(os.path.join(wiki_path, "*-history.xml")) + if not export_pages: + logger.error(f"Can't find *-histroy.xml file for wiki: {ident}") + return None + pages = iterate_xmls(export_pages, tag="page") + pages = map( + functools.partial( + format_xml, + source_name=source_name, + wiki=ident, + dump_url=wiki["metadata"].get("identifier-access"), + url=wiki["metadata"].get("originalurl"), + license=PermissiveLicenses.from_string(wiki["metadata"]["licenseurl"]), + all_authors=all_authors, + skip_redirect=skip_redirect, + ), + pages, + ) + # Wiki processing is all via iterators so we don't have memory issues. + pages = filter(lambda p: p is not None, pages) + to_dolma(pages, shadow_dir, filename, shard_size) + # Move the shadow page to the real output location + try: + os.makedirs(os.path.dirname(dolma_dir), exist_ok=True) + os.rename(shadow_dir, dolma_dir) + logger.info( + f"Dolma conversion for {ident} complete, moving shadow from {shadow_dir} to {dolma_dir}" + ) + # If something goes wrong moving the shadow page, delete the output + # dir as it will be incomplete, but its presense would cause this + # wiki to be skipped when resuming processing. + except Exception: + os.remove(os.path.dirname(dolma_dir)) + finally: + os.rmdir(os.path.dirname(shadow_dir)) + + +def main(args): + logger = logs.get_logger() + + logger.info(f"Reading wiki metadata from {args.wiki_metadata}") + with open(args.wiki_metadata) as f: + wiki_metadata = [json.loads(l) for l in f if l] + logger.info(f"{len(wiki_metadata)} wikis to convert.") + + args.dump_dir = ( + args.dump_dir + if args.dump_dir is not None + else os.path.dirname(args.wiki_metadata) + ) + args.output_dir = dolma_output( + args.output_dir + if args.output_dir is not None + else os.path.join("..", "data", "wiki", "archive", "raw") + ) + + convert = functools.partial( + convert_wiki, + source_name="wiki/archive", + dump_dir=args.dump_dir, + output_dir=args.output_dir, + filename=args.filename, + shard_size=args.shard_size, + all_authors=not args.last_author, + skip_redirect=not args.include_redirects, + ) + + # Run the action convert function, without a for loop. + # Note: I looked at using mp.Pool here, but there is so much disk IO + # that I was seeing much slower speeds than doing it serially. 
+ list(map(convert, wiki_metadata)) + + +if __name__ == "__main__": + args = parser.parse_args() + logs.configure_logging() + main(args) diff --git a/wiki/archive/utils.py b/wiki/archive/utils.py new file mode 100644 index 0000000..5795bb1 --- /dev/null +++ b/wiki/archive/utils.py @@ -0,0 +1,414 @@ +"""Utilities for working with archived wiki dump.""" + +import datetime +import functools +import operator as op +import os +import re +import urllib.parse +from typing import List, Optional, Protocol, Sequence, Set, Tuple, TypeVar + +import tenacity + +from licensed_pile import licenses, logs, scrape + +ENGLISH = frozenset({"en", "en-ca", "eng", "en-gb", "English", "en_hj"}) +LOGGER = logs.get_logger() + + +def check_alive(item) -> bool: + if "originalurl" not in item["metadata"]: + return False + try: + r = scrape.get_page(item["metadata"]["originalurl"]) + return r.status_code == 200 + except tenacity.RetryError: + return False + + +def check_out_of_date(item, offset: datetime): + if "late-updated-date" not in item["metadata"]: + return False + last_updated = datetime.datetime.strptime( + item["metadata"]["last-updated-date"], "%Y-%m-%d" + ) + return False + + +def check_fandom(item): + """Is a wiki a fandom wiki?""" + url = urllib.parse.urlparse(item["metadata"]["originalurl"]).netloc + return url.endswith("fandom.com") + + +def check_wikimedia(item): + """Based on https://raw.githubusercontent.com/WikiTeam/wikiteam/master/dumpgenerator.py""" + if m := re.findall( + r"(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org", + item["metadata"]["originalurl"], + ): + return True + return False + + +# TODO: Some wiki's licenses don't match the metadata in the uploaded dump. We +# should use the AI2 CC license tool to verify that the metadata and license on +# the actual website match. +def verify_license(item): + return True + + +def filter_language(lang: Optional[str], allowed: Set[str] = ENGLISH) -> bool: + if lang: + return lang in allowed or lang == "Unknown" + # If no language (is None), try it because a lot of the internet is English. + return True + + +def find_date(s: str) -> Optional[datetime.datetime]: + """Lots of identifiers or filenames have dates in them, IA generally uses YYYYMMDD.""" + for regex in [ + r"-(?P\d{8})-", + r"(?P\d{8})", + r"-(?P\d{4}-\d{2}-\d{2})", + ]: + if date_str := re.search(regex, s): + try: + date_str = date_str.group("date").replace("-", "") + return datetime.datetime.strptime(date_str, "%Y%m%d") + except Exception as e: + LOGGER.warning(f"Failed to parse {date_str} into a real date.") + LOGGER.warning(f"Failed to find date string in {s}") + + +def parse_version(s: str) -> Tuple[int, int, int]: + """Based on https://semver.org/#is-there-a-suggested-regular-expression-regex-to-check-a-semver-string""" + if v := re.search( + r"(?P0|[1-9]\d*)\.(?P0|[1-9]\d*)(?:\.(?P0|[1-9]\d*))?(?:-(?P(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+(?P[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?", + s, + ): + # If there isn't a patch version, assume it is .0 + return ( + int(v.group("major")), + int(v.group("minor")), + int(v.group("patch")) if v.group("patch") is not None else 0, + ) + LOGGER.warning(f"Failed to parse version from {s}") + return (0, 0, 0) + + +# This was a class used to tell which version of scraping tools was used to +# collect this wiki. 
It used to be a sign of the format used for uploading +# however, that pattern no longer holds and we need to infer that from the +# files themselves. This is not currently used. +class Scanner(licenses.StringEnum): + WIKITEAM_3 = "wikiteam3" + IA_PYTHON = "Internet Archive Python Library >= 1.0.0" + IA_PYTHON_OLD = "Internet Archive Python Library < 1.0.0" + IA_HTML5 = "Internet Archive HTML5 Uploader" + UNKNOWN = "unknown" + + @classmethod + def from_string(cls, s: str) -> "Scanner": + if s is None: + return cls.UNKNOWN + if s.startswith("wikiteam3"): + return cls.WIKITEAM_3 + if s.startswith("Internet Archive HTML5 Uploader"): + return cls.IA_HTML5 + if s.startswith("Internet Archive Python library"): + version = parse_version(s) + if version < (1, 0, 0): + return cls.IA_PYTHON_OLD + return cls.IA_PYTHON + return cls.UNKNOWN + + +M = TypeVar("M") + + +# Each IA metadata blob comes with a list of files available to download. +# These can be different formats depending on how/when they were uploaded. +# There can also be files that we don't care about, e.g., images +# These finds find the files that we are most likely to want to download. +class FileFinder(Protocol): + def __call__(self, file_metadata: Sequence[M], ident: str) -> List[M]: + """Check if any of the files are worth downloading.""" + + +def find_history(file_metadata, ident): + # Remove the - instead as some files are just called history.xml.gz + LOGGER.debug("Searching for history.xml files to download.") + return [f for f in file_metadata if "history.xml" in f["name"]] + # LOGGER.debug("Searching for -history.xml files to download.") + # return [f for f in file_metadata if "-history.xml" in f["name"]] + + +def find_compressed(file_metadata, ident, ext: str): + LOGGER.debug(f"Searching for {ident}.{ext} files to download") + return [f for f in file_metadata if f["name"] == f"{ident}.{ext}"] + + +find_zip = functools.partial(find_compressed, ext="zip") +find_7z = functools.partial(find_compressed, ext="7z") +find_gz = functools.partial(find_compressed, ext="gz") + + +def find_identifier(file_metadata, ident): + LOGGER.debug( + f"Searching for files to download with the same name as the identifier: {ident}" + ) + return [f for f in file_metadata if f["name"] == ident] + + +def find_wikidump(file_metadata, ident): + LOGGER.debug(f"Searching for files that end in -wikidump to download.") + return [ + f for f in file_metadata if os.path.splitext(f["name"])[0].endswith("-wikidump") + ] + + +def find_history_zipped(file_metadata, ident): + LOGGER.debug(f"Searching for compressed files that end in -history to download.") + return [ + f for f in file_metadata if os.path.splitext(f["name"])[0].endswith("-history") + ] + + +def find_pages_full(file_metadata, ident): + LOGGER.debug(f"Searching for files that end in _pages_full.xml") + return [ + f + for f in file_metadata + if os.path.splitext(f["name"])[0].endswith("_pages_full.xml") + ] + + +def find_pages(file_metadata, ident): + LOGGER.debug(f"Search for pages.xml files to download.") + return [f for f in file_metadata if "pages.xml" in f["name"]] + + +def find_ident_plus_date(file_metadata, ident): + LOGGER.debug(f"Search for file formatted as ident-?\d+?\..*?") + return [ + f + for f in file_metadata + if re.match(rf"{ident}-?\d+?\..*?", f["name"], re.IGNORECASE) + ] + + +def find_complete(file_metadata, ident): + LOGGER.debug(f"Searching for files that end in -complete") + return [ + f + for f in file_metadata + if re.search(r"-complete(7z)?$", os.path.splitext(f["name"])[0]) + ] 
+ + +def find_xmlonly(file_metadata, ident): + LOGGER.debug(f"Searching for files that end in -wikidump.XMLONLY") + return [ + f + for f in file_metadata + if os.path.splitext(f["name"])[0].endswith("-wikidump.XMLONLY") + ] + + +def find_wikidumper(file_metadata, ident): + LOGGER.debug(f"Searching for files that end with dumped_using_wikidumper") + return [f for f in file_metadata if "dumped_using_wikidumper" in f["name"]] + + +def find_gzipped_xml(file_metadata, ident): + LOGGER.debug(f"Searching for files that end with .xml.gz") + return [f for f in file_metadata if f["name"].endswith(".xml.gz")] + + +def find_current(file_metadata, ident): + LOGGER.debug(f"Searching for files with -current") + return [f for f in file_metadata if "-current" in f["name"]] + + +def find_download( + item_metadata, + file_finders: Sequence[FileFinder] = ( + find_history, + find_zip, + find_7z, + find_gz, + find_wikidump, + find_history_zipped, + find_pages_full, + find_pages, + find_identifier, + find_ident_plus_date, + find_complete, + find_xmlonly, + find_wikidumper, + find_gzipped_xml, + find_current, + ), +): + """Given the wiki metadata, find which file of the available is the one we should download.""" + ident = re.sub(r"^[wW]iki-", "", item_metadata["metadata"]["identifier"]) + for file_fn in file_finders: + dl_files = file_fn(item_metadata["files"], ident) + if dl_files: + break + # This else runs if we didn't break out of the loop, that it, we didn't find + # good files to download. + else: + LOGGER.error(f"Failed to find files to download for {ident}") + return None + + # If multiple uploads have happened find the most recent one. + if len(dl_files) > 1: + dates = [find_date(f["name"]) for f in dl_files] + # argmax + _, dl_file = max(zip(dates, dl_files), key=op.itemgetter(0)) + else: + dl_file = dl_files[0] + return dl_file + + +# TODO: This function is duplicated, unify and remove. Need better way to +# share the code between the different subdirs (dump/, archive/, scrape/) +def wiki_to_dir(wiki_id, chars: int = 2, levels: int = 2): + """Convert wiki id to a nested dir for faster filesystem access. + + ex: wiki-car_collectionfandomcom -> wiki-ca/r_/wiki-car_collectionfandomcom + """ + prefix = "wiki-" if wiki_id.startswith("wiki-") else "" + wiki_id = re.sub(f"^{prefix}", "", wiki_id) + parts = ( + (f"{prefix}{wiki_id[:chars]}",) + + tuple(wiki_id[l * chars : (l + 1) * chars] for l in range(1, levels)) + + (f"{prefix}{wiki_id}",) + ) + return os.path.join(*parts) + + +def zst_uncompress(compressed, uncompressed=None): + """Uncompress a file using zstd with the `--long=31` flag.""" + if uncompressed is None: + uncompressed = re.sub(r"\.zst$", "", compressed) + LOGGER.info( + "Uncompressed target for {compressed} not provided, using {uncompressed}" + ) + with open(uncompressed, "w") as wf: + # TODO: Add error handling, namely, delete the (wrong) uncompressed + # file is there are errors. 
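+        # Note: `--long=31` raises zstd's window-size limit to 2**31 bytes;
+        # some of these dumps were compressed with a large window and will not
+        # decompress without it.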
+ subprocess.run( + ["/usr/bin/zstd", "-c", "-d", "--long=31", "--", compressed], stdout=wf + ) + return uncompressed + + +KNOWN_BAD = frozenset( + { + # Only has a torrent file + "hunty_cities_skylinesfandomcom", + "kenyonushistory.wikispaces.com", + "Wiki-Puella-magi.net", + "Wiki-Tanasinn.info", + "Wiki-Velowiki.org", + "fanhistory.com", + "hampediaorg-20111022-dump", + "wiki-biblioteca_wikimedia_it", + "wiki-cencilia_fashion_fantasyfandomcom", + "wiki-cynthiaokolo.wikispaces.com", + "wiki-edgetechnology.wikispaces.com", + "wiki-elictmentor.wikispaces.com", + "wiki-ems-teacher.wikispaces.com", + "wiki-es-tech.wikispaces.com", + "wiki-guild_of_heroes_2fandomcom", + "wiki-hchsapes.wikispaces.com", + "wiki-hostagesfandomcom", + "wiki-hunty_cities_skylinesfandomcom", + "wiki-icomputers3.wikispaces.com", + "wiki-kellys-oabcig-pe.wikispaces.com", + "wiki-kenyonushistory.wikispaces.com", + "wiki-misssherk.wikispaces.com", + "wiki-mrsburns.wikispaces.com", + "wiki-mixelsstories.fandom.com-20230912", + "wiki-maou-gakuen-no-hangyakusha-03282023", + "wiki-boyinstripedpyjamas.wikispaces.com_meta", + "wiki-httpsverifiedhandles.comvhidmain_page", + "tropicalwikis-feb-2013", + "Wiki-PCGamingWiki_201310", + "wiki-stockerlibrary.wikispaces.com", + "wiki-rapidpopulationgrowth.wikispaces.com", + "wiki-rgfigreenschool.wikispaces.com", + "wiki-robots2011.wikispaces.com", + "wiki-rtdufodefensefandomcom", + "wiki-scc2011.wikispaces.com", + "wiki-spanish1h.wikispaces.com", + "wiki-starter-old.fandom.com_ar-20230912", + "wiki-starter-old.fandom.com_he-20230912", + "wiki-processors.wiki.ti.com_20140107", + "wiki-webquests215.wikispaces.com", + "wiki-wiki.aynu.org", + "wikimediacommons-torrents", + "wikipediapresentations", + "wikipediappt", + "wiki-www.alphalinux.org", + # This is a dump of multiple wikis, These will be handled specially. + "shoutwiki.com", + # These are dumps of MediaWiki things so we should get them from official + # sources. + "WikitravelAllLanguages.7z", + "wikia_dump_20200214", + "wikivoyage", + # These are dumps of wikipedia pages that were quickly deleted. + # They should be handled specially + "wikipedia-delete-v3-2012-07", + # These are older versions of the speedy deletion dumps + "wikipedia-delete-v2-2012-06", + "wikipedia-delete-2012-06", + "wikipedia-delete-2012-05", + # Non-standard format/filename and not in english, not worth fixing + "wiki-chiliwikide", + "wiki-de-media-perdida", + "wiki-biblioteca_wikimedia_it_20140110", + # This is an older version of religionswiki-20200920-wikidump (which is in a + # weird formet :/) + "religionswiki-20190926-wikidump.tar", + # These are older versions of chuunibyou-demo-koi-ga-shitai-fandom_20210201 + "wiki-chuunibyou-demo-koi-ga-shitai-fandom_202008", + "wiki-chuunibyou-demo-koi-ga-shitai-fandom_202009", + "wiki-chuunibyou-demo-koi-ga-shitai-fandom_202102", + # This dump is formatted really wrong + "wiki-chuunibyou-demo-koi-ga-shitai-fandom_20210201", + # This dump is a weird epub format + "wiki-nsindexnet", + # The dump files have strange names (it is co instead of to, etc.) 
and the + # content is in Russian + "wiki-lurkmoreto", + # Non English and the dump mentions that the the License doesn't apply to all + # the content + "wiki-es.wikieducator.org_201401", + "wiki-fr.wikieducator.org_201401", + # This dump seems to only include the images + "wiki-battleborn.fandom.com-20240323", + # These are older versions of the multi-wiki dump wikia_dump_20200214 + "wikia_dump_20121109", + "wikia_dump_20121204", + "wikia_dump_20140125", + "wikia_dump_20140529", + "wikia_dump_20141219", + "wikia_dump_20180602", + # These are older versions of the dump wikiironchariotsorg-20170712-wikidump.tar + "wikiironchariotsorg-20150805-wikidump.tar", + "wikiironchariotsorg-20160714", + # This is a just a log of what wiki's they have dumpped. + "wikiteam-2018-05", + "wikiteam_2020-02-09", + # This is an old version of wiki-windowswallpapermirahezeorg_w + "wiki-windowswallpapermirahezeorg-20220509", + # This was causing errors when downloading. + "wiki-galaxiesunboundfandomcom", + "wiki-doomwiki.org-20231010", + } +) diff --git a/wiki/dolma_utils.py b/wiki/dolma_utils.py new file mode 100644 index 0000000..5c0f0bd --- /dev/null +++ b/wiki/dolma_utils.py @@ -0,0 +1 @@ +"""Utilities for converting wiki's into the dolma format.""" diff --git a/wiki/dump/.gitignore b/wiki/dump/.gitignore new file mode 100644 index 0000000..60baa9c --- /dev/null +++ b/wiki/dump/.gitignore @@ -0,0 +1 @@ +data/* diff --git a/wiki/dump/README.md b/wiki/dump/README.md new file mode 100644 index 0000000..bb85595 --- /dev/null +++ b/wiki/dump/README.md @@ -0,0 +1,12 @@ +# MediaWiki + +These are tools that are used to download wiki data from offical dumps distributed by the MediaWiki foundataion. + +The downloading tools are bespoke for the MediaWiki naming scheme, but the conversion to dolma script can be used for other wikis that share the `*-history.xml` dump format. + +## Steps: + +1. Run `download.sh YYYYMMDD` to download xml dumps +2. Run `to_dolma.sh YYYYMMDD` (date must match) to convert to the dolma format + +This results on dolma formatted data on disk with wikitext. Use the shard wikitext preprocessing pipeline to get plaintext. diff --git a/wiki/dump/download.py b/wiki/dump/download.py new file mode 100644 index 0000000..39975c0 --- /dev/null +++ b/wiki/dump/download.py @@ -0,0 +1,48 @@ +"""Download and extract official wiki dumps.""" + +import argparse +import os +import re +import urllib.parse + +import pyunpack + +from licensed_pile import logs, scrape + +parser = argparse.ArgumentParser( + description="Download and Extract official Wiki dumps." +) +parser.add_argument("--url", help="The url to download a dump from.") +parser.add_argument("--wikimedia", help="") +parser.add_argument( + "--output_dir", default="data/dumps", help="Where to save the downloaded dumps." +) + + +def wikimedia_url(wikimedia): + wikimedia = re.sub(r"^en", "", wikimedia) + return f"https://dumps.wikimedia.org/en{wikimedia}/latest/en{wikimedia}-latest-pages-articles-multistream.xml.bz2" + + +def download_and_extract(url, ident, output_dir): + filename = os.path.basename(urllib.parse.urlparse(url).path) + + +def main(args): + if args.url and args.wikimedia: + raise ValueError( + f"--url={args.url} and --wikimedia={args.wikimedia} cannot be set at the same time." + ) + if not (args.url or args.wikimedia): + raise ValueError(f"--url or --wikimedia must be set.") + if not args.url: + args.url = wikimedia_url(args.wikimedia) + + ident = ... 
+ download_and_extract(args.url, ident, args.output_dir) + + +if __name__ == "__main__": + args = parser.parse_args() + logs.configure_logging("wiki/dump") + main(args) diff --git a/wiki/dump/download.sh b/wiki/dump/download.sh new file mode 100755 index 0000000..d09bfb9 --- /dev/null +++ b/wiki/dump/download.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +DATE=${1} +data_dir=${2:-"data"} +data_dir=${data_dir%/} + +if [ -z ${DATE} ]; then + echo "usage: download.sh [date YYYYMMDD] data/" 2> /dev/null + exit 1 +fi + +declare -a wikis=( + wiki + wikibooks + wikinews + wikiquote + wikisource + wikiversity + wikivoyage + wiktionary +) + +mkdir -p "${data_dir}/dumps" + +for wiki in ${wikis[@]}; do + filename="en${wiki}-${DATE}-pages-meta-current.xml.bz2" + url="https://dumps.wikimedia.org/en${wiki}/${DATE}/${filename}" + # Use wget to avoid re-downloading and continue downloads. + wget -nc -c ${url} -O "${data_dir}/dumps/${filename}" + # bzip2 doesn't decompress if the output is already there, so we don't check + bunzip2 -k "${data_dir}/dumps/${filename}" +done diff --git a/wiki/dump/to_dolma.sh b/wiki/dump/to_dolma.sh new file mode 100755 index 0000000..789d1a3 --- /dev/null +++ b/wiki/dump/to_dolma.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +DATE=${1} +export_dir=${2:-"data/dumps"} +export_dir=${export_dir%/} +output_dir=${3:-"../data/wiki/dump/raw"} +output_dir=${output_dir%/} + +if [ -z ${DATE} ]; then + echo "usage: to_dolma.sh [date YYYYMMDD] dump/ data/wiki/raw/documents" 2> /dev/null + exit 1 +fi + +declare -a wikis=( + wiki + wikibooks + wikinews + wikiquote + wikisource + wikiversity + wikivoyage + wiktionary +) + +for wiki in ${wikis[@]}; do + filename="en${wiki}-${DATE}-pages-meta-current.xml" + # Check for output + if [[ ${wiki} == "wiki" ]]; then + url="https://wikipedia.com" + else + url="https://${wiki}.com" + fi + python ../to_dolma.py --license CC-BY-SA/4.0 --wiki "${url}" --export "${export_dir}/${filename}" --output_dir "${output_dir}" --last_author --source "wiki/dump" +done diff --git a/wiki/parser/.gitignore b/wiki/parser/.gitignore new file mode 100644 index 0000000..55145b7 --- /dev/null +++ b/wiki/parser/.gitignore @@ -0,0 +1,14 @@ +.DS_Store +.idea +*.log +tmp/ + +*.tern-port +node_modules/ +npm-debug.log* +yarn-debug.log* +yarn-error.log* +*.tsbuildinfo +.npm +.eslintcache +logs/ diff --git a/wiki/parser/README.md b/wiki/parser/README.md new file mode 100644 index 0000000..a5359b1 --- /dev/null +++ b/wiki/parser/README.md @@ -0,0 +1,34 @@ +# WTF WIKIPEDIA parsing server + +We use the dolma format and a server running `wtf_wikipedia` for wikitext parsing instead of they dumpster dip as we want to be able to parse wikitext even when it is not in the standard xml format. + +## Starting the Parser Server + +1. Install HAProxy `sudo apt install haproxy` +2. Install nvm and node +3. Install dependencies `npm install` +4. edit `haproxy.cfg` to include one `server ${name} 127.0.0.1:${port} check` line for each server you plan to run. +5. move/link `haproxy.cfg` to `/etc/haproxy/haproxy.cfg` +6. Restart haproxy (`systemctl restart haproxy` on systemd based systems) +7. Run `./start ${numserver}`. Should match the number of `server` lines in `haproxy` +8. Go to `localhost:8404/stats` to check that each server is seen by haproxy + +## Why? + +Each server uses a worker pool with `1` worker. This is because `wtf_wikipedia` is syncronous code, so we need to run it in a thread to be able to use timeouts to cancel execution for long running documents. 
This also helps in cases where the parsing causes an OoM error: the crash happens in the worker thread instead of the main server.
+
+We then have multiple copies of the server behind the load balancer (which uses least-connections scheduling); this allows for recovery in cases where the main server itself crashes.
+
+### v8 garbage collection
+
+v8, and therefore node, has a fairly complex garbage collector, with separate heaps for persistent objects and "young" short-lived objects. Despite various efforts to set the sizes of these heaps (they default to 64 GB and 32 GB per worker in our code), I have seen a lot of JavaScript OoM errors, even when the reported heap usage is much smaller than those limits. The sizes are set in the options passed to the worker pool constructor.
+
+There were also cases where, with a large worker pool and a single server, the main server itself hit OoM errors. This crashes the whole server and grinds the dolma conversion to a halt. Even with command-line arguments to set the heap size, this kept happening, again despite the heap appearing far from full. When it does happen, the load balancer stops routing traffic to that server and our start script brings a new copy online. Once it is live it is added back to the pool.
+
+These errors tend to happen on pages that have over 2 million characters.
+
+## Settings
+
+Throughput is best when each server is working on one document and has already received a second document to process next. As the python code is synchronous, this means we need roughly twice as many dolma processes as we have servers. Having the extra python processes means the servers never have to wait on python string manipulation.
+
+On a Ryzen 9 7950X using 30 dolma processes and 16 servers, the whole system processes ~5.5k documents/second and takes ~4 hours and 15 minutes to process Wikipedia, its talk pages, and the other mediawiki pages.
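+
+## Example request
+
+The server speaks a small JSON protocol: POST `{"wikitext": str, "id": str, "source": str}` to `/` and you get back `{"document": [{"title": str, "text": str}, ...]}`, one entry per section (a 408 means the parse timed out, a 500 means the worker crashed). Below is a minimal client sketch, assuming the HAProxy frontend from `haproxy.cfg` is listening on `127.0.0.1:5000`; the document id and wikitext are illustrative.
+
+```python
+import requests
+
+doc = {"id": "example-id", "source": "wiki/dump", "wikitext": "'''Hello''' [[world]]"}
+# Give the request a little longer than the server-side parse timeout.
+resp = requests.post("http://127.0.0.1:5000/", json=doc, timeout=200)
+resp.raise_for_status()
+for section in resp.json()["document"]:
+    print(section["title"], section["text"])
+```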
diff --git a/wiki/parser/haproxy.cfg b/wiki/parser/haproxy.cfg new file mode 100644 index 0000000..de0f256 --- /dev/null +++ b/wiki/parser/haproxy.cfg @@ -0,0 +1,48 @@ +defaults + mode http + timeout client 10m + timeout connect 10m + timeout server 10m + timeout http-request 10m + balance leastconn + +frontend stats + mode http + bind 127.0.0.1:8404 + stats enable + stats uri /stats + stats refresh 5s + stats admin if LOCALHOST + +frontend wtf + bind 127.0.0.1:5000 + default_backend wtf_workers + +backend wtf_workers + option httpchk + http-check send meth GET uri /health + http-check expect status 200 + server wtf1 127.0.0.1:5001 check + server wtf2 127.0.0.1:5002 check + server wtf3 127.0.0.1:5003 check + server wtf4 127.0.0.1:5004 check + server wtf5 127.0.0.1:5005 check + server wtf6 127.0.0.1:5006 check + server wtf7 127.0.0.1:5007 check + server wtf8 127.0.0.1:5008 check + server wtf9 127.0.0.1:5009 check + server wtf10 127.0.0.1:5010 check + server wtf11 127.0.0.1:5011 check + server wtf12 127.0.0.1:5012 check + server wtf13 127.0.0.1:5013 check + server wtf14 127.0.0.1:5014 check + server wtf15 127.0.0.1:5015 check + server wtf16 127.0.0.1:5016 check + server wtf17 127.0.0.1:5017 check + server wtf18 127.0.0.1:5018 check + server wtf19 127.0.0.1:5019 check + server wtf20 127.0.0.1:5020 check + server wtf21 127.0.0.1:5021 check + server wtf22 127.0.0.1:5022 check + server wtf23 127.0.0.1:5023 check + server wtf24 127.0.0.1:5024 check diff --git a/wiki/parser/package.json b/wiki/parser/package.json new file mode 100644 index 0000000..6a60e6b --- /dev/null +++ b/wiki/parser/package.json @@ -0,0 +1,10 @@ +{ + "dependencies": { + "commander": "^12.1.0", + "express": "^4.19.2", + "workerpool": "^9.1.3", + "wtf_wikipedia": "^10.3.1", + "wtf-plugin-api": "^2.0.0", + "wtf-plugin-latex": "^1.0.0" + } +} diff --git a/wiki/parser/parser.js b/wiki/parser/parser.js new file mode 100644 index 0000000..be200ea --- /dev/null +++ b/wiki/parser/parser.js @@ -0,0 +1,94 @@ +// Simple wikitext parsing server, node parser.js --port [port] +// +// Can create multiple versions that listen on multiple ports behind a load +// balancer for multiprocessing. + +// Simple webserver +const express = require("express"); +// cli parsing +const { program } = require("commander"); +const workerpool = require("workerpool"); + +// Convert the cli argument into an actual int. +function parseIntArg(value, prev) { + const parsedValue = parseInt(value, 10); + if (isNaN(parsedValue)) { + throw new commander.InvalidArgumentError("Not an Int.") + } + return parsedValue; +} + +// Parse CLI arguments +program + .option("--port ", "port", parseIntArg, 3000) + .option("--host", "host", "localhost") + .option("--timeout ", "timeout (seconds)", parseIntArg, 120) + .option("--maxworkers ", "max #workers in pool", parseIntArg, 1) + .parse(); +const args = program.opts(process.argv); + +// TODO: make pool settings configurable +console.log(`Starting worker pool with at most ${args.maxworkers} workers.`) +const pool = workerpool.pool("./worker.js", { + maxWorkers: args.maxworkers, + emitStdStreams: false, + workerThreadOpts: { + resourceLimits: { + maxOldGenerationSizeMb: 65536, + maxYoungGenerationSizeMb: 32768, + }}}); + +const app = express(); + +// TODO: How to set no size limit? +app.use(express.json({limit: "1000mb"})); +// This is an endpoint the load balancer and the runner script will hit to make +// sure the server is running. Sometime the main server and crash when multiple +// large document requests come in. 
+app.get("/health", async (req, res) => { + res.status(200).send(""); +}) +// Endpoint to parse wikitext. +app.post("/", async (req, res) => { + // Document comes as json {"wikitext": str, "id": str, "source": str} + const data = req.body; + console.log(`Parsing wikitext from document ${data['id']} of ${data['source']}`); + + // Pass this document to the worker pool. Using a worker pool allows us to + // put a timeout on syncronous code (wtf_wikipedia) as the main server will + // run async and kill the worker if it is taking too long. + pool + // Run the parsing function `wtf_parse` in the worker file `worker.js` + .exec('wtf_parse', [data["wikitext"]]) + // If the worker doesn't return a result in this time, an error is thrown + .timeout(args.timeout * 1000) + // When the worker returns, this is run + .then((response) => { + // Log finish and return parsed text. + console.log(`Finished parsing wikitext from document ${data['id']} of ${data['source']}`); + res.json(response); + }) + // If there was an error in the worker, + .catch((err) => { + console.log(err.message); + // If this is a timeout error, set the status code. + if (err.message.indexOf("timed out") != -1) { + console.error(`Parsing wikitext from document ${data['id']} of ${data['source']} timed out.`) + // This is technaially for the server to send the client when the client has + // timed out, but there isn't a server side timeout code. 504 is for when the + // server is a proxy, not just long running. + res.status(408).json({ timeout: err.message }); + // Log other errors, these are generally from the worker running out of + // memory + } else { + console.log(`~~~~~~~~~~ Error processing ${data['id']} of ${data['source']} ~~~~~~~~~~`); + console.error(err); + res.status(500).json({ error: err.message}); + } + }); + +}) +// Start the server. +app.listen(args.port, () => { + console.log(`Server started on port=${args.port} with timeout=${args.timeout} seconds.`) +}) diff --git a/wiki/parser/start.sh b/wiki/parser/start.sh new file mode 100755 index 0000000..6f19270 --- /dev/null +++ b/wiki/parser/start.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +NUMSERVERS=${1:-16} + +function port { + local id=${1} + if [[ ${id} -ge 10 ]]; then + echo "50${id}" + else + echo "500${id}" + fi +} + +function launch { + local id=${1} + node --max-old-space-size=65536 --max-semi-space-size=16384 parser.js --port $(port ${id}) --timeout 180 --maxworkers 1 >> ./logs/worker${id}.log 2>&1 & +} + +function ping { + local id=${1} + echo $(curl -I -X GET localhost:$(port ${id})/health 2> /dev/null | head -n 1 | cut -d$" " -f2) +} + +mkdir -p logs + +while true; do + for i in $(seq 1 $NUMSERVERS); do + if [[ $(ping ${i}) -ne "200" ]]; then + echo "Worker ${i} not running, starting." + launch ${i} + fi + done + sleep 5 +done diff --git a/wiki/parser/worker.js b/wiki/parser/worker.js new file mode 100644 index 0000000..c4de3c4 --- /dev/null +++ b/wiki/parser/worker.js @@ -0,0 +1,27 @@ +// Actually run wtf_wikipedia parsing. This is done in a worker thread to allow +// for timeouts as it is sync code. + +const workerpool = require("workerpool"); +const wtf = require("wtf_wikipedia"); + +function wtf_parse(text){ + // If the input is empty, at least return one empty section. This might have + // been better to have the client code deal with an empty list. + if (!text) { + return {document: [{title: "", text: ""}]} + } + + // Parse with wtf_wikipedia + var doc = wtf(text); + + // Convert to simple [{"title": str, "text": str}, ...] 
representation of + // sections for the response + const response = { + document: doc.sections().map(s => ({title: s.title(), text: s.text()})), + }; + return response; +} + +workerpool.worker({ + wtf_parse, +}); diff --git a/wiki/preprocess.py b/wiki/preprocess.py new file mode 100644 index 0000000..47e9aa6 --- /dev/null +++ b/wiki/preprocess.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python3 + +import argparse +import glob +import multiprocessing as mp +import os +import re +from tempfile import TemporaryDirectory + +import requests +import tqdm + +import wiki +from licensed_pile import logs, utils +from licensed_pile.write import ShardParallelProcessor + +parser = argparse.ArgumentParser(description="Preprocess raw wikitext in dolma format.") +parser.add_argument( + "--input", + default="dump/data/wiki/dump/raw", + help="The input version, this directory should be where the `documents` dir lives.", +) +parser.add_argument( + "--output", + default="dump/data/wiki/dump/v0", + help="The output version, this directory should be where the `documents` dir will live.", +) +parser.add_argument( + "--filename", + default="*.jsonl.gz", + help="The filename to match with globs, probably needs to be escaped.", +) +# TODO: Respect this flag +parser.add_argument( + "--overwrite", + action="store_true", + help="Should we overwrite previously processed examples?", +) +parser.add_argument( + "--debug", + action="store_true", + help="Should we log when documents are not changed by preprocessing.", +) +parser.add_argument( + "--processes", + type=int, + default=mp.cpu_count(), + help="Number of processors for multicore.", +) +parser.add_argument( + "--meta", + help="Location to store Dolma Metadata information.", +) +parser.add_argument( + "--no_shadow", + action="store_false", + help="Disable shadow paging, for things like cloud storage.", +) + +logs.configure_logging(level="INFO") + + +# These are pages that often crashed the servers. +DENYLIST = { + "Template:Attached KML/U.S. Route 62 in Kentucky", + "Template:Attached KML/U.S. Route 277", + "User:BeywheelzLetItRip/fonts.css", + "User:BeywheelzLetItRip/fonts2.cs", + "Template:Graph:Map/Inner/USA-json", +} + + +class WTFWikipediaParallel(ShardParallelProcessor): + @classmethod + def parse_wikitext(cls, wikitext, ex_id, ex_src): + logger = cls.get_logger() + try: + return wiki.parse_wikitext(wikitext, ex_id, ex_src) + except requests.Timeout: + logger.error("Wikitext parsing: timed out") + # Returning None for the whole example will filter it from the output. + return None + except (ValueError, requests.JSONDecodeError): + logger.error( + "Failed wikitext parsing for example", + exc_info=True, + ) + # Returning None for the whole example will filter it from the output. + return None + except Exception as e: + e.add_note(f"Failed to parse wikitext for example: {ex_src}/{ex_id}") + logger.error("Failed to parse wikitext for example") + raise + + @classmethod + def process_example(cls, example, **kwargs): + logger = cls.get_logger() + with logger(source=example["source"], id=example["id"]): + if (title := example["metadata"]["title"]) in DENYLIST: + logger.warning( + "Skipping example from deny list as the text is %d characters long.", + len(example["text"]), + extras={"title": title}, + ) + # Returning None for the whole example will filter it from the output. + return None + wikitext = example["text"] + # TODO: The dolma generation script should not include empty text. 
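+            # The remainder of this function roughly does: (1) rewrite
+            # <math>...</math> tags as $$...$$, (2) pull math-bearing templates
+            # out of the wikitext, leaving marker tokens behind, (3) send the
+            # stripped wikitext to the wtf_wikipedia parsing server, (4) parse
+            # each extracted template on its own, and (5) splice the parsed
+            # templates back in at the markers.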
+ if not wikitext: + logger.warning("Example is empty, skipping") + # Returning None for the whole example will filter it from the output. + return None + # Eventually add length filtering? + # if len(wikitext) > 1_000_000: + # logger.warning("Skipping example as the text is %d characters long.", len(wikitext)) + # return None + # Convert -> $$...$$ + wikitext = wiki.replace_math_tags(wikitext) + # Adjust indentation to avoid reorderings. + wikitext = wiki.adjust_indentation(wikitext) + # Extract Templates + wikitext, math_templates = wiki.extract_templates( + wikitext, ("math",), wiki.MATH_MARKER + ) + if math_templates: + logger.debug("Found %d {{math|...}} templates.", len(math_templates)) + wikitext, raw_templates = wiki.extract_templates( + wikitext, wiki.MATH_TEMPLATES, wiki.SECOND_MARKER + ) + if raw_templates: + logger.debug( + "Found %d more templates that appear to contain math.", + len(raw_templates), + ) + + # We replace these symbols after extracting any thare are part of other + # templates. Trying to extract these as their own templates (optional \) + # creates weird issues like {{Infobox ...}} getting extracted as {{In..}} + wikitext = wiki.replace_symbols(wikitext, include_money=True) + + # Parse Wiki Text + document = cls.parse_wikitext(wikitext, example["id"], example["source"]) + # TODO: Remove the double checking for document being empty + if document is None: + logger.warning( + "Wikitext parsing reduced example to nothing.", + ) + # Returning None for the whole example will filter it from the output. + return None + + # Format plaintext into document + document = wiki.format_document( + document, example.get("metadata", {}).get("title", "") + ) + if not document: + logger.warning( + "Wikitext parsing reduced example to nothing.", + ) + # Returning None for the whole example will filter it from the output. 
+ return None + + # Process Templates + math_templates = map(wiki.fix_math, math_templates) + parsed_templates = [ + cls.parse_wikitext(t, example["id"], example["source"]) + for t in math_templates + ] + parsed_templates = [ + p[0]["text"] if p is not None else "" for p in parsed_templates + ] + for mt, pt in zip(math_templates, parsed_templates): + if not pt: + logger.warning( + "Math template `%s` was parsed to nothing.", + mt, + ) + + parsed_templates = [ + t.replace(wiki.ABS_MARKER, "|") for t in parsed_templates + ] + parsed_templates = [f"${t}$" for t in parsed_templates] + + raw_templates = map(wiki.fix_math, raw_templates) + parsed_raw = [ + cls.parse_wikitext(t, example["id"], example["source"]) + for t in raw_templates + ] + parsed_raw = [p[0]["text"] if p is not None else "" for p in parsed_raw] + for rt, pr in zip(raw_templates, parsed_templates): + if not pr: + logger.warning( + "Template `%s` was parsed to nothing.", + rt, + ) + parsed_raw = [t.replace(wiki.ABS_MARKER, "|") for t in parsed_raw] + parsed_raw = [f"${t}$" for t in parsed_raw] + # Reinsert Templates + document = wiki.insert_templates(document, parsed_raw, wiki.SECOND_MARKER) + document = wiki.insert_templates( + document, parsed_templates, wiki.MATH_MARKER + ) + example["text"] = document + return example + + +def main(args): + with utils.maybe_temp_dir(path=args.meta) as meta_dir: + processor = WTFWikipediaParallel( + source_prefix=utils.dolma_input(args.input, args.filename), + destination_prefix=utils.dolma_output(args.output), + metadata_prefix=meta_dir, + num_processes=args.processes, + ) + processor(debug=args.debug, overwrite=args.overwrite, shadow=not args.no_shadow) + + +if __name__ == "__main__": + # Dolma examples use spawn over fork, unsure why but lets follow them. + mp.set_start_method("spawn") + args = parser.parse_args() + main(args) diff --git a/wiki/scrape/.gitignore b/wiki/scrape/.gitignore new file mode 100644 index 0000000..60baa9c --- /dev/null +++ b/wiki/scrape/.gitignore @@ -0,0 +1 @@ +data/* diff --git a/wiki/scrape/README.md b/wiki/scrape/README.md new file mode 100644 index 0000000..f108775 --- /dev/null +++ b/wiki/scrape/README.md @@ -0,0 +1,26 @@ +# Wiki Scrapes +## + +Going forward, large scale scraps should use the [wikiteam3](https://github.com/saveweb/wikiteam3) tools. There are multiple wiki formats (deku wiki, wikidot, etc.) that are detected by not implemented in that tool. If we want to collect those we will need custom code. + +These tools are designed for getting data from mediawiki wikis that don't publish dumps. + +Conversion to the dolma format can be used for any wiki that uses the `*-history.xml` format. + +## Data Download + +These steps are to be completed for each wiki that we are scraping. + +1. Find all the namespaces that pages are listed under with `python get_namespaces.py --wiki ${wiki_url}`. This saves a mapping of namespace names to id's in `data/${wiki_name}/namespaces.json`. +2. Get all the pages under each namespace by following pagination links using `python list_pages.py --wiki ${wiki_url} -ns 0 -ns 1...`. The namespaces we want to scrape are generally: + * `(Main)`: 0 + * `Talk`: 1 + * `UserTalk`: 3 +Either the integer or the name can be used as input. This generates lists of page titles at `data/${wiki_name}/pages/${ns}.txt`. +3. Get the XML export of these pages with `python export_pages.py --wiki ${wiki_url}`. This get xml exports of the all the pages exported pages. It currently fetches all revisions so that we can build a complete author list. 
This will create a sharded xml export at `data/${wiki_name}/export/${shard_idx}-pages.xml`. The `<text>` tag contains the wikimedia markup.
+4. Convert the XML export into the dolma format from the wiki directory with `python to_dolma.py --wiki ${wiki_url} --license ${license_str} --export ${path}`
+
+The export format is the same as the official wiki dumps.
+
+Wiki archive scrapes come in roughly three formats: two use the same format as the dumps and one has a unique, older format. Most of the wikis
+that aren't online anymore use this old format.
diff --git a/wiki/scrape/export_pages.py b/wiki/scrape/export_pages.py
new file mode 100644
index 0000000..54eb03c
--- /dev/null
+++ b/wiki/scrape/export_pages.py
@@ -0,0 +1,103 @@
+"""Export the pages we enumerated as xml.
+
+This page https://www.mediawiki.org/wiki/Manual:Parameters_to_Special:Export
+lists multiple limits to the amount of data that can be returned. In the two
+wikis I have been testing on I haven't found these limits to apply. The main
+points of concern are:
+* pages: The limit is 35.
+* limit: The maximum number of revisions to return, limited at 1000.
+* history: It mentions there are cases where this doesn't return all the revisions.
+* listauthors: This didn't seem active on any of the wikis I tested on.
+"""
+
+
+import argparse
+import os
+import urllib.parse
+from typing import List
+
+from utils import enumerate_pages, get_page, get_wiki_name
+
+from licensed_pile import logs
+
+parser = argparse.ArgumentParser(description="Export mediawikis as XML")
+parser.add_argument("--wiki", required=True, help="The wiki url we are exporting.")
+parser.add_argument(
+    "--pages",
+    action="append",
+    help="A list of files of pages to export, or a dir to export all. defaults to data/${wiki_name}/pages/.",
+)
+# Using firefox I didn't have issues sending a lot of pages at once, but I was
+# getting URI too long errors when using requests.
+parser.add_argument(
+    "--page_limit",
+    default=35,
+    type=int,
+    help="The max number of pages to export at once.",
+)
+parser.add_argument(
+    "--test_pages", default=None, type=int, help="The number of test pages to retrieve."
+)
+parser.add_argument(
+    "--output_dir",
+    help="Where to save the xml export. defaults to data/${wiki_name}/export/.",
+)
+# TODO: Implement this if we find a wiki that has this enabled.
+parser.add_argument(
+    "--listauthors",
+    help="Use the listauthors url param instead of getting multiple revisions. UNIMPLEMENTED.",
+)
+
+
+def export_pages(wiki: str, pages: List[str]):
+    # Note: We don't quote the newline ourselves as requests will do it too and
+    # you'll get `%250A` instead of `%0A` in the url.
+    pages = "\n".join(pages).strip("\n")
+    # Even though they recommend using the index.php?title=PAGETITLE url for a lot
+    # of things (with the /wiki/ being for readers), we use it here to start looking
+    # for pages because it is more consistent (some wikis want /w/index.php and
+    # some just want /index.php).
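+    # The request that goes out looks roughly like
+    #   {wiki}/wiki/Special:Export?pages=Title_1%0ATitle_2&history=1
+    # and the response body is a <mediawiki> XML export containing one <page>
+    # element (with all of its revisions) per requested title.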
+ return get_page( + urllib.parse.urljoin(wiki, "/wiki/Special:Export"), + params={"pages": pages, "history": 1}, + ) + + +def main(args): + if args.listauthors is not None: + raise NotImplementedError("--listauthors is current not implemented.") + logger = logs.get_logger() + args.pages = ( + args.pages + if args.pages is not None + else [os.path.join("data", get_wiki_name(args.wiki), "pages")] + ) + logger.info("Enumerating pages from %s", args.pages) + pages = enumerate_pages(args.pages) + logger.info("There are %d pages to export.", len(pages)) + + args.output_dir = ( + args.output_dir + if args.output_dir is not None + else os.path.join("data", get_wiki_name(args.wiki), "export") + ) + os.makedirs(args.output_dir, exist_ok=True) + logger.info("Saving export to %s", args.output_dir) + + # Save shards of exported pages to + # data/${wiki_name}/export/${shard_idx}-pages.xml + # These shards can be processed as if they are one large xml file with + # licensed_pile.xml.iterate_xmls(glob.iglob(...), tag) + # Note: These exports seem to an xml namespace so all tags are actually + # "{http://mediawiki.org/xml/export-0.11/}TAGNAME" + # with literal "{"'s. + for i, j in enumerate(range(0, len(pages), args.page_limit)): + xml = export_pages(args.wiki, pages[j : j + args.page_limit]) + with open(os.path.join(args.output_dir, f"{i:>05}-pages.xml"), "w") as wf: + wf.write(xml) + if args.test_pages and j > args.test_pages: + logger.info(f"Scraped {j + args.page_limit} pages, stopping for testing.") + break + + +if __name__ == "__main__": + args = parser.parse_args() + logs.configure_logging() + main(args) diff --git a/wiki/scrape/get_namespaces.py b/wiki/scrape/get_namespaces.py new file mode 100644 index 0000000..f39c5f3 --- /dev/null +++ b/wiki/scrape/get_namespaces.py @@ -0,0 +1,70 @@ +"""Enumerate all the namespaces in a mediawiki wiki.""" + +import argparse +import json +import os +import urllib.parse +from typing import Dict + +from utils import get_page, get_soup, get_wiki_name + +from licensed_pile import logs +from licensed_pile.utils import removesuffix + +parser = argparse.ArgumentParser(description="Find all namespaces in a mediawiki wiki.") +parser.add_argument("--wiki", required=True, help="The Url for the wiki in question.") +parser.add_argument( + "--output", + help="Where to save the id -> namespace mapping. Normally (data/${wiki_name}/namespaces.json)", +) +# Using const="" allows us to have the empty string be the value used when +# only --prefix is passed. +parser.add_argument( + "--wiki_prefix", + default="wiki/", + nargs="?", + const="", + help="Prefix for url paths, changes between wiki's, often wiki/, w/, or nothing (just pass --wiki_prefix)", +) + + +def find_namespaces(wiki_url: str, url_prefix: str = "/wiki/") -> Dict[int, str]: + options = {} + # Even though they recommend using the index.php?title=PAGETITLE url for a lot + # of things (with the /wiki/ being for readers), we use it here to start looking + # for pages because it is more consistent (some wiki's want /w/index.php and + # some just want /index.php). + # TODO: Code would probably be able to automatically try and select the prefix + # by trying each of the common ones. + # Normalize the prefix by removing any trailing slash. 
+ url_prefix = removesuffix(url_prefix, "/") + soup = get_soup( + get_page(urllib.parse.urljoin(wiki_url, f"{url_prefix}/Special:AllPages")) + ) + # Extract the list of namespaces from the URL + namespaces = soup.find(id="namespace") + for option in namespaces.find_all("option"): + options[option.text] = int(option.attrs["value"]) + return options + + +def main(args): + logger = logs.get_logger() + logger.info(f"Finding all namespaces from {args.wiki}") + namespaces = find_namespaces(args.wiki, args.wiki_prefix) + args.output = ( + args.output + if args.output is not None + else os.path.join("data", get_wiki_name(args.wiki), "namespaces.json") + ) + + os.makedirs(os.path.dirname(args.output), exist_ok=True) + logger.info(f"Writing namespaces to {args.output}") + with open(args.output, "w") as wf: + json.dump(namespaces, wf, indent=2) + + +if __name__ == "__main__": + args = parser.parse_args() + logs.configure_logging() + main(args) diff --git a/wiki/scrape/list_pages.py b/wiki/scrape/list_pages.py new file mode 100644 index 0000000..86da0b8 --- /dev/null +++ b/wiki/scrape/list_pages.py @@ -0,0 +1,146 @@ +"""Create a list of all pages under a namespace for a mediawiki.""" + +import argparse +import json +import os +import urllib.parse +from typing import List + +from requests.models import PreparedRequest +from utils import get_page, get_soup, get_wiki_name + +from licensed_pile import logs +from licensed_pile.utils import removeprefix, removesuffix + +parser = argparse.ArgumentParser( + description="Find all pages under a namespace for a mediawiki." +) +parser.add_argument("--wiki", required=True, help="The Url for the wiki in question.") +parser.add_argument( + "--namespace", + "-ns", + required=True, + action="append", + help="The namespace to enumerate.", +) +parser.add_argument( + "--namespace_map", help="The id -> namespace mapping file, stored as json." +) +parser.add_argument("--output_dir", help="Where to store the list of file outputs.") +# Using const="" allows us to have the empty string be the value used when +# only --prefix is passed. +parser.add_argument( + "--wiki_prefix", + default="wiki/", + nargs="?", + const="", + help="Prefix for url paths, changes between wiki's, often wiki/, w/, or nothing (just pass --wiki_prefix)", +) + + +def enumerate_namespace( + wiki_url: str, namespace: int, url_prefix: str = "wiki/" +) -> List[str]: + """Collect all pages of a wiki from within a namespace.""" + logger = logs.get_logger() + logger.info(f"Finding all pages under the {namespace} namespace from {wiki_url}") + # Even though they recomment using the index.php?title=PAGETITLE url for a lot + # of things (with the /wiki/ being for readers), we use it here to start looking + # for pages because it is more consistent (some wiki's want /w/index.php and + # some just want /index.php). + # TODO: Code would probably be able to automatically try and select the prefix + # by trying each of the common ones. + # Normalize the prefix by removing any trailing slash. + url_prefix = removesuffix(url_prefix, "/") + url = urllib.parse.urljoin(wiki_url, f"{url_prefix}/Special:AllPages") + # Use a prepared request to build out the url with parameters, we don't + # actually use this to make requests. + r = PreparedRequest() + r.prepare_url(url, {"namespace": namespace, "hideredirects": "1"}) + return _enumerate_namespace( + r.url, + wiki_url, + [], + ) + + +def _enumerate_namespace(url: str, wiki_url: str, pages: List[str]) -> List[str]: + """Collect all pages of a wiki from within a namespace. 
+ + Args: + url: The current pagination URL to get the next set of links from. + wiki_url: The base url as pagination links don't include the host information. + Note: If we move this recurrent function to an inner function of the above + this wouldn't need to be a parameter. + pages: The current list of pages we are building. + """ + logger = logs.get_logger("wiki.scrape") + logger.info(f"Finding page links in {url}") + soup = get_soup(get_page(url)) + # Find all the links in the page + page_count = len(pages) + # Sometimes this is in a div and sometimes it is in a table, so just look for class names. + if links := soup.find(True, {"class": ["mw-allpages-body", "mw-allpages-chunk"]}): + for link in links.find_all("a"): + href = link.attrs["href"] + href = removeprefix(href, "/") + href = removeprefix(href, "wiki/") + pages.append(urllib.parse.unquote(href)) + logger.info(f"Found {len(pages) - page_count} pages") + + # Find a pagination link + if nav := soup.find("div", {"class": "mw-allpages-nav"}): + for link in nav.find_all("a"): + # Pagination links look like "Next Page (${page title})" + # Check the start of the text to make sure we don't find a link that has + # a title that contains "next page". + if link.text.lower().startswith("next page"): + # Recurse using the pagination link as the new url. + try: + logger.info(f"Found pagination page at {link.attrs['href']}") + # The current page links have already been added to pages so we can + # just return whatever the recusion gives us. + return _enumerate_namespace( + urllib.parse.urljoin(wiki_url, link.attrs["href"]), + wiki_url, + pages, + ) + except Exception as e: + # If something goes wrong in pagination, just return the pages we + # have. + logger.info( + f"Something went wrong processing pagination at {link.attrs['href']}, returning partial results." + ) + return pages + # If no pagination link was found, just return what we have. + logger.info(f"No pagination link found, finished.") + return pages + + +def main(args): + args.namespace_map = ( + args.namespace_map + if args.namespace_map is not None + else os.path.join("data", get_wiki_name(args.wiki), "namespaces.json") + ) + with open(args.namespace_map) as f: + namespace_map = json.load(f) + args.output_dir = ( + args.output_dir + if args.output_dir is not None + else os.path.join("data", get_wiki_name(args.wiki), "pages") + ) + os.makedirs(args.output_dir, exist_ok=True) + + for namespace in args.namespace: + # Convert to int using map if it was a string, otherwise default keeps it as int. 
+ namespace = namespace_map.get(namespace, namespace) + pages = enumerate_namespace(args.wiki, namespace, args.wiki_prefix) + with open(os.path.join(args.output_dir, f"{namespace}.txt"), "w") as wf: + wf.write("\n".join(pages) + "\n") + + +if __name__ == "__main__": + args = parser.parse_args() + logs.configure_logging() + main(args) diff --git a/wiki/scrape/list_wikis.py b/wiki/scrape/list_wikis.py new file mode 100644 index 0000000..e92abf2 --- /dev/null +++ b/wiki/scrape/list_wikis.py @@ -0,0 +1,113 @@ +"""This uses the list pages from https://wikiindex.org to get the urls of all wikis.""" + +import argparse +import functools +import glob +import multiprocessing.dummy as mp +import os +import time +import urllib.parse +from typing import List + +import tenacity +from utils import enumerate_pages, get_page, get_soup, get_wiki_name, make_wiki_url + +from licensed_pile import logs + +parser = argparse.ArgumentParser(description="Convert a list of wikinames to urls.") +parser.add_argument( + "--wiki", default="https://wikiindex.org", help="The wiki url we are exporting." +) +parser.add_argument( + "--pages", + action="append", + help="A list of files of pages to export, or a dir to export all. defaults to data/${wiki_name}/pages/.", +) +parser.add_argument( + "--test_pages", default=None, type=int, help="The number of test pages to retrieve." +) +parser.add_argument( + "--output", + help="Where to save the output. defaults to data/${wiki_name}/export/.", +) +# Using const="" allows us to have the empty string be the value used when +# only --prefix is passed. +parser.add_argument( + "--wiki_prefix", + default="", + nargs="?", + const="", + help="Prefix for url paths, changes between wiki's, often wiki/, w/, or nothing (just pass --wiki_prefix)", +) +parser.add_argument( + "--num_threads", + default=64, + type=int, + help="The number of threads to use when fetching wiki names, as this is I/O blocking, you can use more than your core count without much issue.", +) +parser.add_argument( + "--wait", default=2, type=int, help="How long to wait between requests." +) + + +def get_wiki_link( + page_title: str, wiki_url: str, url_prefix: str = "", wait: int = 0 +) -> str: + url = make_wiki_url(wiki_url, page_title, url_prefix) + logger = logs.get_logger() + logger.info(f"Finding external link to {url}") + try: + soup = get_soup(get_page(url)) + ext_url = get_external_link(soup) + time.sleep(wait) + logger.info(f"Found {ext_url} as the external url for {url}") + return ext_url + except tenacity.RetryError: + logger.error(f"Failed to fetch {url}") + + +def get_external_link(soup) -> str: + if ext_link := soup.find("a", {"class": "external text"}): + return ext_link.attrs["href"] + + +def main(args): + logger = logs.get_logger("wiki.scrape") + args.pages = ( + args.pages + if args.pages is not None + else [os.path.join("data", get_wiki_name(args.wiki), "pages")] + ) + args.output = ( + args.output + if args.output is not None + else os.path.join("data", get_wiki_name(args.wiki), "wiki_list.txt") + ) + logger.info(f"Enumerating pages from {args.pages}") + pages = enumerate_pages(args.pages) + logger.info(f"There are {len(pages)} wikis") + + # Only fetch some for testing. 
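+    # Note (added): if --test_pages is not given it is None, and the slice
+    # below (pages[:None]) keeps the full list.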
+ pages = pages[: args.test_pages] + + logger.info(f"Fetching wiki links with {args.num_threads} threads.") + with mp.Pool(args.num_threads) as pool: + links = pool.map( + functools.partial( + get_wiki_link, + wiki_url=args.wiki, + url_prefix=args.wiki_prefix, + wait=args.wait, + ), + pages, + ) + + logger.info(f"Writing wiki links to {args.output}") + with open(args.output, "w") as wf: + wf.write("\n".join(filter(lambda l: l is not None, links)) + "\n") + + +if __name__ == "__main__": + args = parser.parse_args() + logs.configure_logging() + main(args) diff --git a/wiki/scrape/utils.py b/wiki/scrape/utils.py new file mode 100644 index 0000000..399003a --- /dev/null +++ b/wiki/scrape/utils.py @@ -0,0 +1,59 @@ +"""Utilities for scraping wikis.""" + +import glob +import os +import urllib.parse +from typing import Dict, List, Optional + +import requests +from bs4 import BeautifulSoup + +from licensed_pile import scrape + + +def get_page(*args, **kwargs): + r = scrape.get_page(*args, **kwargs) + return r.text + + +def get_soup(text, parser="html.parser"): + """Abstract into a function in case we want to swap how we parse html.""" + return BeautifulSoup(text, parser) + + +def get_wiki_name(url: str) -> str: + """Use a wiki's url as it's name. + + This functions is to abstract into a semantic unit, even though it doesn't do much. + """ + return urllib.parse.urlparse(url).netloc + + +def make_wiki_url(base_url: str, title: str, url_prefix: str = "wiki/") -> str: + """Create a wiki url from the wiki url and the page name.""" + url_prefix = removesuffix(url_prefix, "/") + url = urllib.parse.urljoin(base_url, f"{url_prefix}/{title.replace(' ', '_')}") + return urllib.parse.quote(url, safe=":/") + + +def read_page_titles(filename: str) -> List[str]: + with open(filename) as f: + return f.read().strip("\n").split("\n") + + +def enumerate_pages(pages: List[str], pattern: str = "*.txt") -> List[str]: + """Enumerate all pages found in a wiki scrape. + + Args: + pages: A list of paths to text files containing one page title per line or + a dir containing multiple page files. + pattern: A glob pattern to find page files within pages[i] when it is a dir. 
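+
+    For example (illustrative paths matching the default data layout): passing
+    ["data/wikiindex.org/pages"] reads every *.txt file in that directory,
+    while passing ["data/wikiindex.org/pages/0.txt"] reads just that one file.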
+ """ + results = [] + for page in pages: + if os.path.exists(page) and os.path.isdir(page): + for f in glob.iglob(os.path.join(page, pattern)): + results.extend(read_page_titles(f)) + else: + results.extend(read_page_titles(page)) + return results diff --git a/wiki/scripts/find.py b/wiki/scripts/find.py new file mode 100644 index 0000000..ad2b744 --- /dev/null +++ b/wiki/scripts/find.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +import collections +import glob +import json + +import tqdm +from smart_open import smart_open + +import wiki + +template_counts = collections.Counter() +for i, f in enumerate( + glob.glob( + "/media/brian/External-SSD/licensed_pile/wiki/data/wiki/dump/raw/documents/*_wikipedia.com.jsonl.gz" + ) +): + print(f"Extracting math templates from {f}") + with smart_open(f) as f: + for l in tqdm.tqdm(f): + if not l: + continue + data = json.loads(l) + if not data["text"]: + continue + _, templates = wiki.extract_math_templates(data["text"]) + for t in templates: + template_counts[t] += 1 +print(f"{len(template_counts)} unique math templates found.") +print(f"{sum(template_counts.values())} total math templates found.") +with open("math_templates.json", "w") as wf: + wf.write(json.dumps(dict(template_counts))) diff --git a/wiki/scripts/grammar.py b/wiki/scripts/grammar.py new file mode 100644 index 0000000..edc313c --- /dev/null +++ b/wiki/scripts/grammar.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 + +import lark + +grammar = r""" +?start: abs + | PIPE + | CHAR + +CHAR_SEQ: CHAR+ +CHAR: /[^"\\\{\}]/ + +PIPE: "{{!}}" + +abs: "{{abs|" abs "}}" + | "{{abs|" CHAR_SEQ "}}" +""" + + +grammar = r""" +math: "{{math|" template* "}}" + | template* + +template: phi + | delta + | ell + | pi + | equal + | pipe + | bra + | ket + | bra_ket + | braket + | norm + | closed_open + | closed_closed + | open_open + | open_closed + | brace + | overset + | overline + | sup + | sub + | sfrac + | italic + | bold +// | su + | WORD + + +phi: "{{phi}}" +pi: "{{pi}}" +delta: "{{delta}}" +ell: "{{ell}}" +equal: "{{=}}" +pipe: "{{!}}" + +bra: "{{bra|" template "}}" + | "{{Dbra|" template "}}" +ket: "{{ket|" template "}}" + | "{{Dket|" template "}}" +bra_ket: "{{bra-ket|" template "|" template "}}" + | "{{Dbraket|" template "|" template "}}" +// By aliasing these cases to match the versions above, we can automatically +// reuse the transformer methods. 
+braket : "{{braket|bra|" template "}}" -> bra + | "{{braket|ket|" template "}}" -> ket + | "{{braket|bra-ket|" template "|" template "}}" -> bra_ket + +closed_open: "{{closed-open|" template "}}" +closed_closed: "{{closed-closed|" template "}}" +open_open: "{{open-open|" template "}}" +open_closed: "{{open-closed|" template "}}" + +brace: "{{" "brace" "|" template "}}" +norm: "{{norm|" template "}}" + +sup: "" template "" +sub: "" template "" +sfrac: "{{sfrac|" template "|" template "}}" + +// TODO: Revisit with priority compared to the terminal that picks up ' +italic: "''" template "''" +bold: "'''" template "'''" + +// TODO: Revisit +overset: "{{overset|" template "|" template "}}" +overline: "{{overline|" template "}}" + + +// su: "{{" "su" "|" "p=" template "|" "b=" template "|" "a=" template "}}" +// | "{{" "su" "|" "p=" template "|" "a=" template "|" "b=" template "}}" +// | "{{" "su" "|" "b=" template "|" "p=" template "|" "a=" template "}}" +// | "{{" "su" "|" "b=" template "|" "a=" template "|" "p=" template "}}" +// | "{{" "su" "|" "a=" template "|" "p=" template "|" "b=" template "}}" +// | "{{" "su" "|" "a=" template "|" "b=" template "|" "p=" template "}}" + +//%import common.WORD -> WORD +LETTER: /\w/ +SYMBOL: /[^\w\{|\}<>]/ +CHARACTER: LETTER | SYMBOL +WORD: CHARACTER+ +""" + +l = lark.Lark(grammar, start="math") + + +class TemplateToLaTex(lark.Transformer): + def math(self, templates): + return "$" + "".join(templates) + "$" + + def template(self, children): + return children[0] + + def brace(self, param): + param = param[0] + return "\{" + param + "\}" + + def bra(self, param): + param = param[0] + return rf"\langle {param} |" + + def ket(self, param): + param = param[0] + return rf"| {param} \rangle" + + def bra_ket(self, params): + bra, ket = params + return rf"\langle {bra} | {ket} \rangle" + + def closed_closed(self, param): + param = param[0] + return f"[{param}]" + + def closed_open(self, param): + param = param[0] + return f"[{param})" + + def open_closed(self, param): + param = param[0] + return f"({param}]" + + def open_open(self, param): + param = param[0] + return f"({param})" + + def overset(self, param): + over, under = param + return rf"\overset{{{over}}}{{{under}}}" + + def overline(self, param): + return rf"\overline{{{param[0]}}}" + + def sup(self, inner): + return rf"^{{{inner[0]}}}" + + def sub(self, inner): + return rf"_{{{inner[0]}}}" + + def sfrac(self, args): + num, denom = args + return f"{num}/{denom}" + + def norm(self, param): + param = param[0] + return rf"\| {param} \|" + + def italic(self, param): + # Math mode is already italic. 
+ return param[0] + + def bold(self, param): + return rf"\mathbf{{{param[0]}}}" + + def phi(self, _): + return r"\phi" + + def pi(self, _): + return r"\pi" + + def delta(self, _): + return r"\delta" + + def equal(self, _): + return "=" + + def pipe(self, _): + return "|" + + def ell(self, _): + return r"\ell" + + def WORD(self, w): + return w diff --git a/wiki/scripts/remove_html.py b/wiki/scripts/remove_html.py new file mode 100644 index 0000000..b98147c --- /dev/null +++ b/wiki/scripts/remove_html.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 + +import argparse +import multiprocessing as mp +import re +from tempfile import TemporaryDirectory + +import bs4 + +from licensed_pile import logs, utils +from licensed_pile.write import ShardParallelProcessor + +parser = argparse.ArgumentParser(description="Remove HTML from dolma documents.") +parser.add_argument( + "--input", + required=True, + help="The input version, this directory should be where the `documents` dir lives.", +) +parser.add_argument( + "--output", + required=True, + help="The output version, this directory should be where the `documents` dir will live.", +) +parser.add_argument( + "--filename", + default="*.jsonl.gz", + help="The filename to match with globs, probably needs to be escaped.", +) +# TODO: Respect this flag +parser.add_argument( + "--overwrite", + action="store_true", + help="Should we overwrite previously processed examples?", +) +parser.add_argument( + "--debug", + action="store_true", + help="Should we log when documents are not changed by preprocessing.", +) +parser.add_argument( + "--processes", + type=int, + default=mp.cpu_count(), + help="Number of processors for multicore.", +) +parser.add_argument("--meta", help="Location to save dolma processing metadata.") + +logs.configure_logging(level="DEBUG") + + +class CaptureMatches: + """A class that records what matches were found when doing re.serach""" + + def __init__(self): + self.matches = [] + + def __call__(self, m): + try: + self.matches.append(m.group(1)) + except IndexError: + self.matches.append(m) + return "" + + def __iter__(self): + yield from self.matches + + def __bool__(self): + return bool(self.matches) + + +class RegexRemoveHTMLParallel(ShardParallelProcessor): + @classmethod + def process_example(cls, example, **kwargs): + logger = cls.get_logger() + with logger(source=example["source"], example_id=example["id"]): + cm = CaptureMatches() + # Capture the smallest amount of text between
+            # a "<" and a ">", so the match can't greedily span multiple tags.
+ # This would not be ok if we cared about malicious input. + # cleaned_text = re.sub(r"(<(?:div|font).*?>)", cm, example["text"]) + # cleaned_text = re.sub(r"(<[^ >][^>]*?>)", cm, example["text"]) + # Looking at matches found the long ones tended to be false positives. + cleaned_text = re.sub( + r"(<(?:[a-zA-Z/]|\\\\/|\?xml|\?php|!--)[^>]{0,500}>)", + cm, + example["text"], + re.DOTALL, + ) + + # If we found a , make sure that we didn't miss the + # <${tag} ${attrs}> version in the text. This is useful for things + # like tags in other languages that I noticed. + backtracking = set() + if cm: + for m in cm: + logger.debug("Removed %s based on regex", m, extra={"match": m}) + # Some of the ones I found have trailing spaces, but I don't want + # to grab something that has a bunch of attributes + if m := re.search(r"^$", m): + # We grab the group so the sapce is removed. + backtracking.add(re.escape(m.group(1))) + + if backtracking: + cm = CaptureMatches() + backtracking = rf"(<(?:{'|'.join(backtracking)} ?.*?)>)" + cleaned_text = re.sub(backtracking, cm, cleaned_text, re.DOTALL) + if cm: + for m in cm: + logger.debug( + "Removed %s based on backtracking regex", + m, + extra={"match": m}, + ) + + example["text"] = cleaned_text + return example + + +def main(args): + with utils.maybe_temp_dir(args.meta) as meta_dir: + processor = RegexRemoveHTMLParallel( + source_prefix=utils.dolma_input(args.input, args.filename), + destination_prefix=utils.dolma_output(args.output), + metadata_prefix=meta_dir, + num_processes=args.processes, + ) + processor(debug=args.debug, overwrite=args.overwrite) + + +if __name__ == "__main__": + # Dolma examples use spawn over fork, unsure why but lets follow them. + mp.set_start_method("spawn") + args = parser.parse_args() + main(args) diff --git a/wiki/to_dolma.py b/wiki/to_dolma.py new file mode 100644 index 0000000..662bbd0 --- /dev/null +++ b/wiki/to_dolma.py @@ -0,0 +1,148 @@ +"""Convert a wikiscrape of media-wiki dump into the dolma format.""" + +import argparse +import datetime +import functools +import glob +import os +import urllib.parse + +from utils import get_wiki_name, make_wiki_url + +from licensed_pile.licenses import PermissiveLicenses +from licensed_pile.logs import configure_logging, get_logger +from licensed_pile.utils import dolma_output +from licensed_pile.write import to_dolma +from licensed_pile.xml import iterate_xmls + +parser = argparse.ArgumentParser(description="Convert the xml export to dolma.") +parser.add_argument("--wiki", required=True, help="The wiki url we are processing.") +parser.add_argument("--license", required=True, help="The licenses this is under.") +parser.add_argument("--export", help="The location of the exported pages.") +parser.add_argument( + "--output_dir", + help="Where the dolma formatted data goes.", +) +parser.add_argument( + "--source", + choices=["wiki/scrape", "wiki/archive", "wiki/dump"], + help="Where does the data come from?", +) +parser.add_argument( + "--filename", default=None, help="The base filename for our wiki data." +) +parser.add_argument( + "--shard_size", type=int, default=1, help="Size, in GB, for each shard." +) +parser.add_argument( + "--last_author", + action="store_true", + help="Should we only include the most recent author? 
(Faster)", +) +parser.add_argument( + "--include_redirects", + action="store_true", + help="Should we skip pages that are redirects to others?", +) + + +def main(args): + # Calculate defaults + license = PermissiveLicenses.from_string(args.license) + logger = get_logger() + logger.info("Saving all exported pages as licensed with %s", license) + args.filename = ( + args.filename if args.filename else f"{get_wiki_name(args.wiki)}.jsonl.gz" + ) + logger.info("Saving to dolma format at %s", args.filename) + args.export = ( + args.export + if args.export + else os.path.join("data", get_wiki_name(args.wiki), "export", "*.xml") + ) + logger.info("Loading export from %s", args.export) + args.output_dir = dolma_output( + os.path.join("data", args.source, "raw") + if not args.output_dir + else args.output_dir + ) + + logger.info("Saving Dolma formatted data to %s", args.output_dir) + # Our parser can ignore xml-namespaces so just use `page`. + pages = iterate_xmls(glob.iglob(args.export), tag="page") + pages = map( + functools.partial( + format_dolma, + source_name=args.source, + wiki=args.wiki, + license=license, + all_authors=not args.last_author, + skip_redirect=not args.include_redirects, + ), + pages, + ) + # When we filter out pages based on things like redirects, they may be None + pages = filter(lambda p: p is not None, pages) + to_dolma(pages, args.output_dir, args.filename, args.shard_size) + + +def format_dolma( + xml, + source_name: str, + wiki: str, + license: PermissiveLicenses, + all_authors: bool = True, + skip_redirect: bool = True, +): + if skip_redirect and [x for x in xml if x.tag.endswith("redirect")]: + return None + revisions = [r for r in xml if r.tag.endswith("revision")] + # TODO Handle if this fails and add logging. + text = [t for t in revisions[-1] if t.tag.endswith("text")][0].text + page_namespace = [ns for ns in xml if ns.tag.endswith("ns")][0].text + page_id = [pid for pid in xml if pid.tag.endswith("id")][0].text + created = datetime.datetime.fromisoformat( + [ts for ts in revisions[-1] if ts.tag.endswith("timestamp")][0].text + ).replace(tzinfo=None) + page_title = [t for t in xml if t.tag.endswith("title")][0].text + + contributors = set() + if all_authors: + for revision in revisions: + contribs = [c for c in revision if c.tag.endswith("contributor")] + # When there are multiple contributors, there are multiple contributor + # xml items where each one has a single username and id items. + names = [u.text for c in contribs for u in c if u.tag.endswith("username")] + # Save their id too in case they change their username + uid = [u.text for c in contribs for u in c if u.tag.endswith("id")] + contributors.update(zip(names, uid)) + else: + contrib = [c for c in revisions[-1] if c.tag.endswith("contributor")] + # When there are multiple contributors, there are multiple contributor + # xml items where each one has a single username and id items. 
+ name = [u.text for c in contrib for u in c if u.tag.endswith("username")] + # Save their id too in case they change their username + uid = [u.text for c in contrib for u in c if u.tag.endswith("id")] + contributors.update(zip(name, uid)) + + return { + "id": f"{page_namespace}-{page_id}", + "text": text, + "source": f"{source_name}/{get_wiki_name(wiki)}", + "added": datetime.datetime.utcnow().isoformat(), + "created": created.isoformat(), + "metadata": { + "license": str(license), + "authors": sorted(contributors), + "url": make_wiki_url(wiki, page_title), + "wiki": get_wiki_name(wiki), + "namespace": page_namespace, + "title": page_title, + }, + } + + +if __name__ == "__main__": + args = parser.parse_args() + configure_logging() + main(args) diff --git a/wiki/wiki.py b/wiki/wiki.py new file mode 100644 index 0000000..ad6e2d6 --- /dev/null +++ b/wiki/wiki.py @@ -0,0 +1,804 @@ +"""Tools and utilities for parsing wikitext.""" + +import itertools +import os +import re +from typing import Dict, List, Set, Tuple + +import requests + +# ᙭᙭᙭᙭᙭ "Canadian Syllabics Chi Sign", a rare unicode that isn't touched by wtf_wikipedia +MATH_MARKER = "\u166D\u166D\u166D\u166D\u166D" +# ⇭⇭⇭⇭⇭ "Upwards White Arrow On Pedestal with Vertical Bar", a rare unicode untouched by wtf_wikipedia +SECOND_MARKER = "\u21ED\u21ED\u21ED\u21ED\u21ED" +# ¦¦¦¦¦ "Broken Bar", `|` has a lot of meaning in wikitext so we to replace actual instances of it. +ABS_MARKER = "\u00A6\u00A6\u00A6\u00A6\u00A6" + +# WTF Wikipedia strips out most templates, which is where almost all the math is +# :( What we do is find the math templates (regex to find the start then iterate +# forward to find the closing of the scope, allows for nesting) and replace them +# with a symbol that doesn't appear anywhere else. We then clean each template +# ourselves and insert them back, after wtf_wikipedia has been run on the main +# article. +# +# Sometimes wtf_wikipdia converts a template to `1/undefined` and the `/` can +# be an ascii slash or sometime various unicode verions. These are currently +# left in. + + +# Characters that appear in wikimath templates and how to translate them into +# how they would appear in latex. +CHAR_SYMBOLS = { + "[Pp]hi": r"\phi", + r"\)": ")", + r"\(": "(", + "[Dd]elta": r"\delta", + "[Pp]i": r"\pi", + "[Gg]amma": r"\gamma", + "[Ee]psilon": r"\epsilon", + "[Ss]igma": r"\sigma", + "[Tt]heta": r"\theta", + "[Vv]arepsilon": r"\epsilon", + "[Vv]arphi": r"\phi", + "[Vv]arsigma": r"\sigma", + "[Vv]artheta": r"\theta", + "[Ee]ll": r"\ell", +} + + +def insert_templates(text: str, templates: List[str], marker) -> str: + """Replace each instance of marker in text with a template. + + re.sub was being annoying about \'s in the replacements.' + """ + offset = 0 + new_text = [] + for t in templates: + if mark := re.search(marker, text[offset:], re.IGNORECASE): + new_text.append(text[offset : offset + mark.span()[0]]) + new_text.append(t) + offset = offset + mark.span()[1] + else: + # This should be an error, but the logger isn't plumbed into this + # function atm, just let it go for v0 + pass + if trailing := text[offset:]: + new_text.append(trailing) + return "".join(new_text) + + +# Replacing the templates, the math tags, and the indentation adjustment all have +# basically the algorithm, but to share code the unique function part would have +# to be super complex and have access to a bunch of things (the start, end, the +# matches, etc.) So it isn't worth it to deduplicate this code at the moment. 
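+# A rough illustration (added example, not from the original patch) of what
+# `extract_templates` below produces, using the MATH_MARKER defined above:
+#
+#   extract_templates("area is {{math|{{pi}} r}} here", ("math",), MATH_MARKER)
+#   # -> ("area is " + MATH_MARKER + " here", ["{{math|{{pi}} r}}"])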
+def extract_templates( + text: str, templates: List[str], replacement: str +) -> Tuple[str, List[str]]: + # {{ -> { when using an f-string, this creates a regex like {{(?:(...|...)) ?\| + # Escaping the last | is important, otherwise you match everything as the "or" + # is an empty string. + opening = rf"{{{{(?:{'|'.join(templates)}) *?\|" + new_text = [] + templates = [] + offset = 0 + # See `replace_math_tags` + while template := re.search(opening, text[offset:], re.IGNORECASE): + # Add everything before the template + new_text.append(text[offset : offset + template.span()[0]]) + # Find the closing }}, we expect there to be far more {{ openings inside + # the template compared the the tags, so we need to find the last + # one. This will dispatch to the special curly parser. + end_start, end_end = finish_template( + text[offset + template.span()[0] :], "{{", "}}" + ) + # If a template is opened and never finished, we just include everything + if end_start == -1: + offset = offset + template.span()[1] + continue + # Add our template replacement + new_text.append(replacement) + # Add the template text to our list of templates. + templates.append( + text[offset + template.span()[0] : offset + template.span()[0] + end_end] + ) + # Move the search offset in the text to after the template. + offset = offset + template.span()[0] + end_end + # If there is any text left over after the last time we found a template, + # add that to out new text + if text[offset:]: + new_text.append(text[offset:]) + # Combine the parts of the new texts. + new_text = "".join(new_text) + assert len(re.findall(replacement, new_text)) == len(templates) + return new_text, templates + + +def remove_template_brackets(text: str, templates: List[str]) -> str: + """This can be used to remove math templates that aren't important for latex but breaks wtf_wikipedia. + + Examples include: nobreak, nowrap, and var + """ + for template in templates: + opening = rf"{{{{{template} *?\|?" + new_text = [] + offset = 0 + while t := re.search(opening, text[offset:], re.IGNORECASE): + new_text.append(text[offset : offset + t.span()[0]]) + end_start, end_end = finish_template( + text[offset + t.span()[0] :], "{{", "}}" + ) + if end_start == -1: + offset = offset + t.span()[1] + continue + template_text = text[ + offset + t.span()[1] : offset + t.span()[0] + end_start + ] + new_text.append(template_text) + offset = offset + t.span()[0] + end_end + if text[offset:]: + new_text.append(text[offset:]) + text = "".join(new_text) + return text + + +def fix_equals(text: str) -> str: + """wtf_wikipedia can handle the {{math|1=...}} templates but not {{math| ... {{=}} ...}}""" + if re.search(r"{{ ?= ?}}|=", text, re.IGNORECASE): + text = re.sub(r"{{math ?\|", "{{math|1=", text) + return re.sub(r"{{ ?= ?}}|=", "=", text) + return text + + +## +# These function rewrite a template like {{overline|...}} to latex \overline{...} +# +def replace_template( + text: str, + opening, + closing, + start, + end, + nest_open=None, + nest_close=None, + recursive: bool = False, +) -> str: + """Replace templates found in text with a marker. See replace_math_templates + for an explaination of the main parsing code. + + Note: This function *always* allows for the nesting of *different* templates + i.e., {{math|{{overline|...}}}}, but recursive=True must be set to + allow for the nesting of the *same* template, i.e. 
Xij + """ + nest_open = nest_open if nest_open else opening + nest_close = nest_close if nest_close else closing + offset = 0 + new_text = [] + while m := re.search(opening, text[offset:], re.IGNORECASE): + new_text.append(text[offset : offset + m.span()[0]]) + end_start, end_end = finish_template( + text[offset + m.span()[0] :], nest_open, nest_close + ) + if end_start == -1: + offset = offset + m.span()[1] + continue + new_text.append(start) + between = text[offset + m.span()[1] : offset + m.span()[0] + end_start] + if recursive: + new_text.append( + replace_template( + between, + opening, + closing, + start, + end, + nest_open, + nest_close, + recursive, + ) + ) + else: + new_text.append(between) + new_text.append(end) + offset = offset + m.span()[0] + end_end + if trailing := text[offset:]: + new_text.append(trailing) + return "".join(new_text) + + +## +# These are for ease of use, giving names to the common templates we replace in +# the conversion from wikitext to latex. +# +def replace_sub(text: str) -> str: + return replace_template(text, r"", r"", "_{", "}", recursive=True) + + +def replace_sup(text: str) -> str: + return replace_template(text, r"", r"", "^{", "}", recursive=True) + + +def replace_radical(text: str) -> str: + opening = r"{{[Rr]adic(?:al)? ?\|" + closing = r"}}" + return replace_template(text, opening, closing, "\sqrt{", "}", nest_open="{{") + + +def replace_prime(text: str) -> str: + opening = r"{{(?:[Pp]rime|′) ?\|" + closing = r"}}" + return replace_template(text, opening, closing, "", "'", nest_open="{{") + + +def replace_fraction(text: str) -> str: + """{{Fraction|}} isn't handled by wtf_wikipedia but {{sfrac|...}} is.""" + text = re.sub(r"{{[Ff]ract(?:ion)?(?:/sandbox)? ?\|", "{{sfrac|", text) + return re.sub(r"{{sfrac/sandbox ?\|", "{{sfrac|", text) + + +def replace_overline(text: str) -> str: + opening = r"{{[Oo]verline ?\|?" + closing = r"}}" + return replace_template(text, opening, closing, r"\overline{", "}", nest_open="{{") + + +def replace_overbar(text: str) -> str: + opening = r"{{[Oo]verbar ?\|" + return replace_template(text, opening, "}}", r"\overbar{", "}", nest_open="{{") + + +def replace_overarc(text: str) -> str: + opening = r"{{[Oo]verarc ?\|" + return replace_template(text, opening, "}}", r"\overarc{", "}", nest_open="{{") + + +def replace_mathcal(text: str) -> str: + opening = r"{{[Mm]athcal ?\|" + return replace_template(text, opening, "}}", r"\mathcal{", "}", nest_open="{{") + + +def replace_mathbb(text: str) -> str: + opening = r"{{[Mm]athbb ?\|" + return replace_template(text, opening, "}}", r"\mathbb{", "}", nest_open="{{") + + +# TODO: Replace ''' with \mathbf{}? 
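+# e.g. (added illustrations, not from the original patch):
+#   replace_strong("{{strong|x}}")      -> r"\mathbf{x}"
+#   replace_overline("{{overline|AB}}") -> r"\overline{AB}"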
+def replace_strong(text: str) -> str: + opening = r"{{[Ss]trong ?\|" + return replace_template(text, opening, "}}", r"\mathbf{", "}", nest_open="{{") + + +def replace_ceil(text: str) -> str: + opening = r"{{[Cc]eil ?\|" + return replace_template(text, opening, "}}", r"\ceil{", "}", nest_open="{{") + + +def replace_floor(text: str) -> str: + opening = r"{{[Ff]loor ?\|" + return replace_template(text, opening, "}}", r"\floor{", "}", nest_open="{{") + + +def replace_norm(text: str) -> str: + opening = r"{{[Nn]orm ?\|" + return replace_template( + text, opening, "}}", rf"\{ABS_MARKER}", rf"\{ABS_MARKER}", nest_open="{{" + ) + + +def replace_open_closed(text: str) -> str: + opening = r"{{[Oo]pen-[Cc]losed ?\|" + return replace_template(text, opening, "}}", "(", "]", nest_open="{{") + + +def replace_open_open(text: str) -> str: + opening = r"{{[Oo]pen-[Oo]pen ?\|" + return replace_template(text, opening, "}}", "(", ")", nest_open="{{") + + +def replace_closed_closed(text: str) -> str: + opening = r"{{[Cc]losed-[Cc]losed ?\|" + return replace_template(text, opening, "}}", "[", "]", nest_open="{{") + + +def replace_closed_open(text: str) -> str: + opening = r"{{[Cc]losed-[Oo]pen ?\|" + return replace_template(text, opening, "}}", "[", ")", nest_open="{{") + + +def replace_bra(text: str) -> str: + opening = r"{{[Bb]ra ?\|" + return replace_template(text, opening, "}}", r"\langle", ABS_MARKER, nest_open="{{") + + +def replace_ket(text: str) -> str: + opening = r"{{[Kk]et ?\|" + return replace_template(text, opening, "}}", ABS_MARKER, r"\rangle", nest_open="{{") + + +def replace_brace(text: str) -> str: + opening = r"{{[Bb]race ?\|" + return replace_template(text, opening, "}}", r"\{", r"\}", nest_open="{{") + + +def replace_angle_bracket(text: str) -> str: + opening = r"{{[Aa]ngle ?[Bb]racket ?\|" + return replace_template(text, opening, "}}", r"\langle", r"\rangle", nest_open="{{") + + +def replace_symbols( + text: str, symbols: Dict[str, str] = CHAR_SYMBOLS, include_money: bool = False +) -> str: + """Replace templates that evaulate to a symbol {{pi}} -> 𝛑 with the latex version.""" + for template, latex in symbols.items(): + # re.sub was being difficult about including something like \p in the + # replacement string. So do it manually. + # text = re.sub(rf"{{{{{template}}}}}", latex, text) + if m := re.search(rf"{{{{{template}}}}}", text, re.IGNORECASE): + if include_money: + latex = f"${latex}$" + text = "".join((text[: m.span()[0]], latex, text[m.span()[1] :])) + return text + + +def replace_abs(text: str) -> str: + """Convert absolute value from wikitext to latex. + + The | symbol is used in the wikitext template syntax, so they uses various + different ways to escape them. This tries to standadize them all to the latex + format. + """ + text = text.replace("{{!}}", ABS_MARKER) + text = text.replace("|", ABS_MARKER) + text = text.replace("||", f"{ABS_MARKER}{ABS_MARKER}") + opening = r"{{[Mm]?[Aa]bs ?\|?" + closing = r"}}" + return replace_template( + text, opening, closing, ABS_MARKER, ABS_MARKER, nest_open="{{", recursive=True + ) + + +def replace_mset(text: str) -> str: + """Convert set notation from wikitext to latex. + + Where are some cases where wtf_wikipedia deletes msets that have | bars in + them despite that being legal in wikitext, those are not handled well atm. + """ + opening = r"{{[Mm]set\|?" + closing = r"}}" + return replace_template( + text, opening, closing, r"\{", r"\}", nest_open="{{", recursive=True + ) + + +## +# This joins together all the text processing we do. 
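+# For example (added illustration, not from the original patch), a fragment
+# like "{{overline|x}} {{=}} {{pi}}" is expected to come out of `fix_math` as
+# roughly r"\overline{x} = \pi".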
+def fix_math(text): + """Convert wikitext math to latex. + + Note: The order of these fixes can be important, some latex output can get + caught by regex's for other tempaltes. + """ + text = remove_template_brackets( + text, + ("var", "nobreak", "nowrap", "mvar", "linktext", "em", "italics correction"), + ) + text = fix_equals(text) + text = replace_fraction(text) + text = replace_prime(text) + text = replace_overline(text) + text = replace_overbar(text) + text = replace_overarc(text) + text = replace_radical(text) + text = replace_mathcal(text) + text = replace_mathbb(text) + text = replace_strong(text) + text = replace_ceil(text) + text = replace_floor(text) + text = replace_norm(text) + text = replace_open_closed(text) + text = replace_open_open(text) + text = replace_closed_closed(text) + text = replace_closed_open(text) + text = replace_bra(text) + text = replace_ket(text) + text = replace_brace(text) + text = replace_angle_bracket(text) + text = replace_symbols(text) + text = replace_sup(text) + text = replace_sub(text) + text = replace_mset(text) + text = replace_abs(text) + return text + + +def extract_math_templates(text: str) -> Tuple[str, List[str]]: + """Pull all math out of the page to handle later.""" + return extract_templates(text, ("math",), MATH_MARKER) + + +def replace_math_tags(text: str) -> str: + """Replace with $$ for latex. + + We try to pick $...$ or $$...$$ based on the wikitext. + """ + math_opening = r'' + math_closing = r"" + offset = 0 + new_text = [] + # Find the first math tag in the text, we will increment where we start our + # search to be after these tags to find the next on. + # All regex positions are based of the text[offset:] slice we we need to add + # offset to them when using them to index the whole string + while math := re.search(math_opening, text[offset:], re.IGNORECASE): + # Add everything before the first match. + new_text.append(text[offset : offset + math.span()[0]]) + + # Find the closing associated with this tag. This index is relative + # to the slice of text starting with the location of the opening tag match + # so we need to add the start offset and the offset to index into the + # original string. + end_start, end_end = finish_template( + text[offset + math.span()[0] :], math_opening, math_closing + ) + # This happens when there is a start tag but no end tag. For example, + # in talk page 1-9564, they have `` as a symbol (it is inside ) + if end_start == -1: + # TODO: Add logging + # Skip processing the scope for this one and then continue looking for more matches + offset = offset + math.span()[1] + continue + # and should use $ + # and should use $$ + # always uses $$ rendering (limits above and below sum for example) + # but in wikipedia, if is on it's own line then a new paragraph + # is created. In latex $$ always creates a new paragraph/line so we + # don't need any marking for this special case, just use $$ + new_text.append("$" if math.group("type") == "inline" else "$$") + + math_text = text[offset + math.span()[1] : offset + math.span()[0] + end_start] + # We shouldn't have nested tags, but it is wikitext so *shrug*. + # We could recurse to replace nested tags but that would cause + # latex errors so instead we log an error. + # if m := re.search(math_opening, math_text): + # logger.error("...") + # Add the text /after/ the opening tag, but /before/ the closing tag. + new_text.append(math_text) + # Same choices as above. 
+ new_text.append("$ " if math.group("type") == "inline" else "$$") + # Move the search offset to /after/ the closing tag. + offset = offset + math.span()[0] + end_end + if text[offset:]: + new_text.append(text[offset:]) + return "".join(new_text) + + +# Templates for math symbols will probably be inside other math so skip looking +# for them, list here https://en.wikipedia.org/wiki/Template:%3D +MATH_TEMPLATES = ( + "±", + "×", + "10^", + "x10^", + "abs", + "alpha/Fe", + "angle bracket", + "angbr", + "bigmath", + "Binom", + "bra", + "bra-ket", + "braket", + "ceil", + "closed-closed", + "closed-open", + "DBra", + "Dbraket", + "degree", + "subst:degree", + "Devanagari", + "dirprod", + "Dket", + "e-sp", + "ell", + "epsilon", + "EqNote", + "EquationNote", + "Equation", + "Equation box 1", + "EquationRef", + "#expr:", + "Fe/H", + "floor", + "Function", + "Fraction", + "Frac", + "gamma", + "hub", + "intmath", + "intorient", + "kappa", + "ket", + "lambda", + "langle", + "ldelim", + "Lg-start", + "M/H", + "Mapsto", + "Math theorem", + "Math proof", + "math-link", + "mathbb", + "mathcal", + "mexp", + "minteg", + "mset", + "mu", + "mvar", + "mvar-link", + "N-ary", + "nary", # Figure out which it actually is + "norm", + "Numbered block", + "oiiint", + "oiint", + "open-closed", + "open-open", + "otimes", + "overarc", + "overline", + "overset", + "overunderset", + "Pars", + "phi", + "pi", + "pnsign", + "radic", + "rangle", + "rdelim", + "rndhands", + "scinote", + "sigma", + "smallmath", + "starred", + "su", + "su2", + "sub", + "subsub", + "subsup", + "sup", + "sup sub", + "sfrac", + "tau", + "theta", + "tmath", + "tombstone", + "underoverset", + "underset", + "upsilon", + "Urdu numeral", + "val", + "varepsilon", + "varphi", + "varsigma", + "vartheta", + "vec", + "x10^", + "xi", + "xor", + "φ", + "All", + "And", + "Eqv", + "Exist", + "False", + "Ident", + "Imp", + "In", + "Models", + "Nand", + "Nor-", + "Not", + "Or-", + "Tee", + "True", +) + +# These are templates that are hard to process and make sense to strip out. +# "change": This creates a table, skip +# "change2": This creates a table, skip +# "changes": This creates a table, skip +# "delimiter-es": +# "dice" +# "lessthan": Is used in weird substitution situations +# "underline": Seems more widespread than just math +# "var": Seems more widespread than just math +# "var serif": Seems more widespread than just math + + +# These are sections that are often near the end of wikipedia and have non-natural +# text after them. All lowercase for easier checks +SKIP_SECTIONS = frozenset( + ( + "notes", + "bibliography", + "sources", + "citations", + "references", + "see also", + "external links", + "further reading", + "tertiary sources", + "secondary sources", + "primary sources", + "general and cited sources", + "footnotes", + "works cited", + ) +) + + +## +# These function look ahead in the text to find the end of a scope. +def finish_template(text, start="{{", end="}}"): + """Find the end of a template by looking for `end`. + + text should start with the `start` template that we are looking to finish. + + This handles nested scoping as long as the "start" regex matches all openings + to scopes, otherwise it is possible to have the end of an unfound opening be + considered the final end. 
+ """ + if start == "{{" and end == "}}": + return finish_mustache_template(text) + i = 0 + templates = 0 + while i < len(text): + if m := re.search(f"^{start}", text[i:], re.IGNORECASE): + templates += 1 + # Note: .span is based on the slice so it is basically the length of the match + i += m.span()[1] - 1 + elif m := re.search(f"^{end}", text[i:], re.IGNORECASE): + templates -= 1 + # Note: .span is based on the slice so it is basically the length of the match + begin = i + m.span()[0] + i += m.span()[1] - 1 + if templates == 0: + return begin, i + 1 + i += 1 + return -1, -1 + + +def finish_mustache_template(text): + """This is a special case of template finding where `{` and `}` are considered + scopes that we must close before finding }}. + + If there are } without preceding {, they are ignored. + + In ambiguous cases like {{{, it parses to {{, { for opening the scopes. + """ + i = 2 + scopes = ["{{"] + while i < len(text) - 1: + if text[i] == "{": + scopes.append("{") + elif text[i] == "}": + if text[i + 1] == "}": + if scopes and scopes[-1] == "{{": + scopes.pop() + i += 1 + elif scopes and scopes[-1] == "{": + scopes.pop() + if not scopes: + return i - 1, i + 1 + else: + if scopes and scopes[-1] == "{": + scopes.pop() + i += 1 + return -1, -1 + + +def wiki_to_dir(wiki_id, chars: int = 2, levels: int = 2): + """Convert wiki id to a nested dir for faster filesystem access. + + ex: wiki-car_collectionfandomcom -> wiki-ca/r_/wiki-car_collectionfandomcom + """ + prefix = "wiki-" if wiki_id.startswith("wiki-") else "" + wiki_id = re.sub(f"^{prefix}", "", wiki_id) + parts = ( + (f"{prefix}{wiki_id[:chars]}",) + + tuple(wiki_id[l * chars : (l + 1) * chars] for l in range(1, levels)) + + (f"{prefix}{wiki_id}",) + ) + return os.path.join(*parts) + + +def parse_wikitext( + text, doc_id, source, host: str = "http://localhost", port: int = 5000 +): + """Parse wikitext by hitting a server endpoint.""" + r = requests.post( + f"{host}:{port}", + json={"wikitext": text, "id": doc_id, "source": source}, + ) + # This is technaially for the server to send the client when the client has + # timed out, but there isn't a server side timeout code. 504 is for when the + # server is a proxy, not just long running. + if r.status_code == 408: + raise requests.Timeout() + # This happens when HAProxy times out + if r.status_code == 504: + message = r.text + raise ValueError(f"{r}, {r.text}, probably from an HAProxy timeout.") + if r.status_code == 200: + try: + return r.json()["document"] + except requests.JSONDecodeError as e: + e.add_note(f"JSON Decoding failed for request {r}:{r.text}") + raise + try: + # Our server returns errors with json information, but if there is a non + # 200 code because of the load balancer, it might not be as JSON. + message = r.json()["error"] + except requests.JSONDecodeError: + message = r.text + raise ValueError(message) + + +def format_section(sec) -> str: + """Convert a section dict into a string like: + + title + text... + more text... + ... 
+ """ + match sec: + case {"title": "", "text": ""}: + return "" + case {"title": title, "text": ""}: + return "" + case {"title": "", "text": text}: + return text + case {"title": title, "text": text}: + return f"{title}\n{text}" + + +def filter_section(sec, blocklist: Set[str] = SKIP_SECTIONS) -> bool: + return not sec.get("title", "").lower() in blocklist + + +def format_document(doc, title: str = "") -> str: + """Convert the list of sections into a string, filtering out boilerplate sections.""" + sections = filter(filter_section, doc) + sections = (sec for s in sections if (sec := format_section(s))) + return "\n\n".join(itertools.chain((title,), sections)).strip() + + +def adjust_indentation(text: str) -> str: + """When a :indent comment is followed by a normal line, that like gets moved + above the indentation, see https://github.com/spencermountain/wtf_wikipedia/issues/577 + + This work around adds an extra newline between the list :indent line and a + line with text to avoid this issue. + + It can cause some extra whitespace in the output, but that can easily be fixed + later. + + I had to re-write this to an iterative solution over a recursive one as the + stack seems to be much smaller when using multiprocessing (I only the max + recursion depth exceeded error when running within dolma). + """ + result = [] + while indent := re.search("^:+.+$", text, re.MULTILINE | re.IGNORECASE): + # The :ident is on the last line, "\n" isn't matched so subtract 1 + if indent.span()[1] >= (len(text) - 1): + result.append(text) + break + + result.append(text[: indent.span()[1] + 1]) + if text[indent.span()[1] + 1] not in (":", "\n"): + result.append("\n") + + text = text[indent.span()[1] + 1 :] + else: + result.append(text) + return "".join(result)