Skip to content

Commit

Permalink
Tooling to download and process Wikis (#51)
Browse files Browse the repository at this point in the history
Add tools to scrape mediawiki wikis that don't publish dumps

Add tool that exports the xml based on the list of pages.

Add the ability to convert wikis to dolma

Download and extract script supports multiworker

Create WTF Wikipedia parsing server which uses a worker pool to allow for timeouts

Creation of script that removes html tags we found in many wiki dumps.

Added Shadow Paging to the creation of wikitext dolma files

Added Shadow Paging to dolma preprocessing.

Added script that removes `None` lines from dolma files.

Added script that can combine dolma shards while tracking which shards were
used where, to allow for aligned combinations of later versions.
  • Loading branch information
blester125 authored Sep 24, 2024
1 parent 9a4d292 commit f567cd1
Show file tree
Hide file tree
Showing 51 changed files with 4,450 additions and 133 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,6 @@ cython_debug/
#.idea/
.python-version
**/licensed_pile_log.txt

node_modules
package-lock.json
39 changes: 34 additions & 5 deletions licensed_pile/licenses.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ def __str__(self):
return self.value


# TODO: With all the different versions that are out in the wild, this flat enum
# is getting hard to use. We should re-think how to do this.
class PermissiveLicenses(StringEnum):
"""By 'Permissive' we mean licenses that are in the Gold, Silver, or Bronze
lists of the Blue Oak Council (https://blueoakcouncil.org/list), even if
Expand All @@ -17,15 +19,24 @@ class PermissiveLicenses(StringEnum):

PD = "Public Domain"
CC0 = "Creative Commons Zero - Public Domain - https://creativecommons.org/publicdomain/zero/1.0/"
CC_PDM = "Creative Commons Public Domain Mark - https://creativecommons.org/publicdomain/mark/1.0/"
CC_BY = (
"Creative Commons - Attribution - https://creativecommons.org/licenses/by/4.0/"
)
CC_BY_3 = (
"Creative Commons - Attribution - https://creativecommons.org/licenses/by/3.0/"
)
CC_BY_2_5 = (
"Creative Commons - Attribution - https://creativecommons.org/licenses/by/2.5/"
)
CC_BY_2 = (
"Creative Commons - Attribution - https://creativecommons.org/licenses/by/2.0/"
)
CC_BY_SA = "Creative Commons - Attribution Share-Alike - https://creativecommons.org/licenses/by-sa/4.0/"
CC_BY_SA_3 = "Creative Commons - Attribution Share-Alike - https://creativecommons.org/licenses/by-sa/3.0/"
CC_BY_SA_2_5 = "Creative Commons - Attribution Share-Alike - https://creativecommons.org/licenses/by-sa/2.5/"
CC_BY_SA_2_1 = "Creative Commons - Attribution Share-Alike - https://creativecommons.org/licenses/by-sa/2.1/"
CC_BY_SA_1 = "Creative Commons - Attribution Share-Alike - https://creativecommons.org/licenses/by-sa/1.0/"
GFDL = "GNU Free Documentation License"
APACHE_2 = "Apache 2 License - https://www.apache.org/licenses/LICENSE-2.0"
MIT = "MIT License"
Expand All @@ -49,17 +60,35 @@ def from_string(cls, s: str) -> "PermissiveLicenses":
s = s.lower().strip()
if re.match(r".*/publicdomain/zero/1.0/?$", s):
return cls.CC0
if m := re.match(r".*/licenses/by(?P<share>-sa)?/(?P<version>\d).0/?$", s):
if m.group("version") == "4":
if m.group("share") is None:
if re.match(r".*/publicdomain/mark/1.0/?$", s):
return cls.CC_PDM
if re.match(r".*/publicdomain/.*", s):
return cls.PD
if m := re.search(r"(?:/licenses/)?by(?P<share>-sa)?/(?P<version>\d.\d)/?", s):
if m.group("version") == "4.0":
if m.group("share") is not None:
return cls.CC_BY_SA
return cls.CC_BY
elif m.group(1) == "3":
if m.group("share") is None:
elif m.group("version") == "3.0":
if m.group("share") is not None:
return cls.CC_BY_SA_3
return cls.CC_BY_3
elif m.group("version") == "2.5":
if m.group("share") is not None:
return cls.CC_BY_SA_2_5
return cls.CC_BY_2_5
elif m.group("version") == "2.1":
if m.group("share") is not None:
return cls.CC_BY_SA_2_1
elif m.group("version") == "2.0":
return cls.CC_BY_2
elif m.group("version") == "1.0":
if m.group("share") is not None:
return cls.CC_BY_SA_1
else:
raise ValueError(f"Unable to understand license {s}")
if s == "gfdl" or "gnu_free_documentation_license" in s:
return cls.GFDL
raise ValueError(f"Unable to understand license {s}")


Expand Down
1 change: 1 addition & 0 deletions licensed_pile/logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import sys
from typing import Protocol, Sequence

import contextual_logger
from logging_json import JSONFormatter


Expand Down
1 change: 1 addition & 0 deletions licensed_pile/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def get_page(
resp = requests.get(url, params=params, headers=headers)
logging.debug(f"Sending GET to {resp.url}")
if resp.status_code != 200:
# TODO: Update logger
logging.warning(
f"Failed request to {resp.url}: {resp.status_code}, {resp.reason}"
)
Expand Down
1 change: 1 addition & 0 deletions licensed_pile/scripts/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
shard_to_*.json
Loading

0 comments on commit f567cd1

Please sign in to comment.