Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hansard #76

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
64 changes: 64 additions & 0 deletions hansard/ca_to_dolma.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import argparse
baberabb marked this conversation as resolved.
Show resolved Hide resolved
from datetime import datetime

import datasets
import regex

from licensed_pile.licenses import PermissiveLicenses
from licensed_pile.write import to_dolma

# Tokenizer for the optional word count: matches runs of word characters or
# runs of punctuation (whitespace separates the matches).
WHITESPACE = regex.compile(r"\w+|[^\w\s]+")
# Running token count, accumulated by format_dolma as a side effect.
COUNT = 0


def format_dolma(row: dict) -> dict:
    """Convert one Canadian Hansard speech row into a dolma-format record.

    Also accumulates a running token count in the module-level ``COUNT`` so
    the script can report corpus size when ``--count`` is passed.
    """
    global COUNT
    # findall counts the word/punctuation tokens themselves; the previous
    # split() returned the whitespace *between* tokens (plus empty edge
    # strings), which over-counted by roughly one per row.
    COUNT += len(WHITESPACE.findall(row["text"]))
    speechdate = row["speechdate"]
    return {
        # NOTE(review): the date alone is not unique per speech, so records
        # from the same sitting day share an id — confirm downstream
        # tooling tolerates duplicate ids.
        "id": speechdate.strftime("%Y%m%d"),
        "text": row["text"],
        "created": speechdate.strftime("%Y-%m-%d"),
        "source": "ca-hansard",
        "added": str(datetime.now().date()),
        "metadata": {
            "license": str(PermissiveLicenses.PD),
            "language": "en",
            "year": str(speechdate.year),
        },
    }


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Collect Canadian Hansard into Dolma format."
    )
    parser.add_argument(
        "--output_folder",
        default="hansard/ca_hansard",
        # Fixed help text: this is the dolma output directory, not parquet.
        help="Directory where the dolma-formatted shards are written.",
    )
    parser.add_argument(
        "--file-name",
        default="cahansard.jsonl.gz",
        help="The base filename for the output shards.",
    )
    parser.add_argument(
        "--shard-size", type=int, default=1, help="Size, in GB, for each shard."
    )
    parser.add_argument(
        "--count",
        action="store_true",
        help="Count the number of words in the dataset.",
    )
    parser.add_argument(
        "--dataset",
        default="baber/canadian_hansard",
        help="HuggingFace dataset to load (flag allows pointing at a mirror).",
    )
    args = parser.parse_args()
    # Load the dataset from the HuggingFace hub.
    dataset = datasets.load_dataset(
        args.dataset,
        split="train",
    )
    # format_dolma also accumulates the module-level word COUNT as it maps.
    ds = dataset.map(format_dolma, remove_columns=dataset.column_names)
    to_dolma(
        ds, path=args.output_folder, filename=args.file_name, shard_size=args.shard_size
    )
    if args.count:
        print(COUNT)
106 changes: 106 additions & 0 deletions hansard/source_code/ca_hansard.py
baberabb marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import argparse
from pathlib import Path

import polars as pl

# Download the postgres sql file from https://www.lipad.ca/data/ and create a postgres table
baberabb marked this conversation as resolved.
Show resolved Hide resolved

# The downstream workflow assumes the ds is sorted!
query = """SELECT * from dilipadsite_basehansard ORDER BY basepk"""


def row_shifted(column: str):
    """Return a polars expression: True where `column` differs from the
    previous row's value (used to detect topic boundaries).

    A ``def`` instead of an assigned lambda (PEP 8 E731) so the function has
    a proper name in tracebacks and can carry this docstring.
    """
    return ~pl.col(column).eq(pl.col(column).shift())


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Consolidate Canadian Hansard into Dolma format."
    )
    parser.add_argument(
        "--output_folder",
        default="data/hansard/ca",
        help="Output format for parquet files",
    )
    parser.add_argument(
        "--uri",
        default="postgresql://localhost:5432/cshansard",
        help="URI to the database.",
    )
    args = parser.parse_args()
    URI = args.uri
    OUTPUT_FOLDER_PATH = Path(args.output_folder)
    OUTPUT_FOLDER_PATH.mkdir(parents=True, exist_ok=True)
    (
        pl.read_database_uri(query=query, uri=URI)
        # Normalize empty strings in every text column to null so that the
        # concat_str(..., ignore_nulls=True) below can drop them cleanly.
        .with_columns(pl.col(pl.Utf8).replace("", None))
        .with_columns(
            # Row-over-row change flags: a heading is only emitted when the
            # topic differs from the previous row (query is sorted by basepk).
            main_changed=row_shifted("maintopic"),
            minor_changed=row_shifted("subtopic"),
            # Keep only interjection/intervention/stagedirection speaker
            # positions (everything else becomes null), rendered as a
            # bracketed marker, e.g. "interjection" -> "[Interjection]".
            # NOTE(review): the second `when` is case-sensitive while the
            # first match is (?i) — a value like "StageDirection" would fall
            # through to the titlecased `otherwise` branch; confirm intended.
            speakerposition=pl.when(
                pl.col("speakerposition")
                .str.contains("(?i)interjection|intervention|stagedirection")
                .not_()
            )
            .then(None)
            .when(pl.col("speakerposition").str.contains("stagedirection"))
            .then(pl.lit("[Stage Direction]", dtype=pl.String))
            .otherwise(
                pl.concat_str(
                    pl.lit("[", dtype=pl.String),
                    pl.col("speakerposition").str.to_titlecase(),
                    pl.lit("]", dtype=pl.String),
                    ignore_nulls=False,
                )
            ),
        )
        .lazy()
        # One output document per sitting day.
        .group_by("speechdate")
        .agg(
            pl.concat_str(
                [
                    # Main topic heading, only on rows where it changed.
                    pl.when("main_changed")
                    .then(
                        pl.concat_str(
                            [
                                pl.lit("\n", dtype=pl.String),
                                pl.col("maintopic"),
                                pl.lit("\n", dtype=pl.String),
                            ],
                            ignore_nulls=False,
                        )
                    )
                    .otherwise(None),
                    # Sub-topic heading, only where it changed.
                    pl.when("minor_changed").then(pl.col("subtopic")).otherwise(None),
                    # Bracketed stage-direction / interjection marker.
                    pl.concat_str(
                        [pl.col("speakerposition"), pl.lit("\n", dtype=pl.String)],
                        ignore_nulls=False,
                    ),
                    # Speaker attribution, ending in ": ".  Prefers
                    # speakeroldname for Speakers and first appearances,
                    # otherwise the modern speakername.
                    pl.concat_str(
                        [
                            pl.when(pl.col("speakeroldname").str.contains("Speaker"))
                            .then(pl.col("speakeroldname"))
                            .when(pl.col("speakeroldname").is_null())
                            .then(pl.col("speakername"))
                            .when(pl.col("speakername").is_first_distinct())
                            .then(pl.col("speakeroldname"))
                            .otherwise(
                                pl.col("speakername"),
                            ),
                            pl.lit(": ", dtype=pl.String),
                        ],
                        ignore_nulls=False,
                    ),
                    pl.col("speechtext"),
                ],
                ignore_nulls=True,
            ).alias("text")
        )
        .with_columns(
            # Join the per-speech fragments into one document per day.
            pl.col("text").list.join("\n").str.strip_chars(),
            year=pl.col("speechdate").dt.year(),
        )
        .collect()
        .write_parquet(
            f"{OUTPUT_FOLDER_PATH}.parquet",
            use_pyarrow=True,
            pyarrow_options={"partition_cols": ["year"]},
        )
    )
164 changes: 164 additions & 0 deletions hansard/uk_to_dolma.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
import argparse
import re
from datetime import datetime
from pathlib import Path
from typing import Iterator

import lxml
import lxml.etree as ET
import regex

from licensed_pile.licenses import PermissiveLicenses
from licensed_pile.write import to_dolma

# CLI is declared at module level; parse_args() runs under __main__ below.
parser = argparse.ArgumentParser(description="Collect UK-Hansard into Dolma format")
parser.add_argument(
    "--base_folder",
    default="hansard/uk_parlparse/scrapedxml",
    help="Path to the directory of UK-Hansard XML files.",
)
parser.add_argument(
    "--output_dir",
    default="hansard/dolma_outputs",
    help="Where the dolma formatted data goes.",
)
parser.add_argument(
    "--shard_size", type=int, default=1, help="Size, in GB, for each shard."
)

parser.add_argument("--count", action="store_true", help="Count the number of words.")

# Maps each scrapedxml subfolder name to the dolma "source" tag for the
# records found in it.  "senedd" is nested: its cy/en subfolders carry
# Welsh- and English-language records respectively (see process_folder).
FILE_NAMES = {
    "debates": {"source": "commons-debates"},
    "london-mayors-questions": {
        "source": "london-mayors-questions",
    },
    "lordspages": {"source": "lords-debates"},
    "lordswms": {
        "source": "lords-written-ministerial-statements",
    },
    "lordswrans": {
        "source": "lords-written-answers",
    },
    "ni": {"source": "northern-ireland-assembly"},
    "sp": {"source": "scottish-parliament"},
    "sp-written": {
        "source": "scottish-parliament-written-answers",
    },
    "standing": {"source": "standing-committees"},
    "westminister": {"source": "westminister-hall"},
    "wms": {"source": "written-ministerial-statements"},
    "wrans": {"source": "written-answers"},
    "senedd": {
        "cy": {"source": "senedd-cy"},
        "en": {"source": "senedd-en"},
        "source": "senedd",
    },
}
# Tokenizer for the optional word count: runs of word chars or punctuation.
WHITESPACE = regex.compile(r"\w+|[^\w\s]+")
# Running token count, accumulated when --count is passed.
COUNT = 0

# recover=True makes lxml skip malformed markup instead of raising.
PARSER = lxml.etree.XMLParser(
    encoding="utf-8",
    recover=True,
)


def get_subfolders(folder_path: str | Path) -> list[Path]:
baberabb marked this conversation as resolved.
Show resolved Hide resolved
if isinstance(folder_path, str):
folder_path = Path(folder_path)
subfolders = [f for f in folder_path.iterdir() if f.is_dir()]
return subfolders


def parse_hansard_xml_file(root: ET._Element) -> str:
    """Flatten a parlparse Hansard XML tree into plain text.

    Headings become their own paragraphs; speech-like elements are prefixed
    with "SpeakerName: " when a speakername attribute is present.  Paragraphs
    are joined with blank lines.
    """
    parsed_text = []
    for element in root.iter():
        # The three heading kinds were previously handled by two duplicate
        # branches with identical bodies; render them all the same way.
        if element.tag in ("major-heading", "oral-heading", "minor-heading"):
            text = ET.tostring(element, method="text", encoding="unicode").strip()
            parsed_text.append(text)
        elif element.tag in [
            "speech",
            "ques",
            "reply",
            "question",
        ]:
            speaker = (
                element.attrib.get("speakername", "") + ": "
                if element.attrib.get("speakername", "")
                else ""
            )
            speech_text = (
                ET.tostring(element, method="text", encoding="unicode")
                .strip()
                .replace(" ", "")  # NOTE(review): strips a non-breaking-space char — confirm
            )
            parsed_text.append(speaker + speech_text)

    return "\n\n".join(parsed_text).replace(" ", "")


def process_files_in_folder(
    folder_path: Path, source: str, count: bool = False
) -> Iterator[dict]:
    """Yield one dolma record per parsable XML file in *folder_path*.

    The return annotation is Iterator[dict] (this is a generator; it was
    previously mis-annotated as ``-> dict``).  When *count* is true, tokens
    are accumulated in the module-level COUNT.
    """
    global COUNT
    language = "en" if not source == "senedd-cy" else "cy"
    # File stems embed the sitting date, e.g. "debates2001-01-01a".
    date_match = re.compile(r"\d{4}-\d{2}-\d{2}")
    for file in folder_path.iterdir():
        if file.suffix == ".xml" and file.stem != "tmp":
            root = ET.parse(file, parser=PARSER).getroot()
            parsed_text = parse_hansard_xml_file(root)
            if parsed_text:
                if count:
                    # findall counts the tokens themselves; split() returned
                    # the gaps between tokens and over-counted.
                    COUNT += len(WHITESPACE.findall(parsed_text))
                date_ = date_match.search(file.stem)
                if date_:
                    date = date_.group()
                else:
                    # Sentinel date for files whose name carries no date.
                    date = "9999-01-01"
                yield {
                    "id": file.stem,
                    "text": parsed_text,
                    "created": date,
                    "source": f"uk-hansard-{source}",
                    "added": str(datetime.now().date()),
                    "metadata": {
                        "license": str(PermissiveLicenses.OPL),
                        "language": language,
                        "year": date.split("-")[0],
                    },
                }


def process_folder(folder_path: Path, count: bool = False) -> Iterator[dict]:
    """Walk the scrapedxml tree and yield dolma records from known subfolders.

    Subfolders not listed in FILE_NAMES are ignored.  The senedd folder is
    special-cased: its cy/en children carry their own source tags.
    """
    for subfolder in get_subfolders(folder_path):
        if subfolder.name in FILE_NAMES:
            source = FILE_NAMES[subfolder.name]["source"]
            if source == "senedd":
                for nested_subfolder in get_subfolders(subfolder):
                    source = FILE_NAMES["senedd"][nested_subfolder.name]["source"]
                    yield from process_files_in_folder(
                        nested_subfolder, source, count=count
                    )
            else:
                # BUG FIX: `count` was previously dropped on this path, so
                # --count silently ignored every non-senedd folder.
                yield from process_files_in_folder(subfolder, source, count=count)


def main(args):
    """Convert the scraped UK Hansard XML tree into dolma-format shards."""
    source_root = Path(args.base_folder)
    examples = process_folder(source_root, count=args.count)
    to_dolma(
        examples=examples,
        path=args.output_dir,
        filename="ukhansard.jsonl.gz",
        shard_size=args.shard_size,
    )


if __name__ == "__main__":
    args = parser.parse_args()
    main(args)
    # COUNT is filled in as a side effect of iterating the records in main().
    if args.count:
        print(COUNT)
1 change: 1 addition & 0 deletions licensed_pile/licenses.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class PermissiveLicenses(StringEnum):
GFDL = "GNU Free Documentation License"
APACHE_2 = "Apache 2 License - https://www.apache.org/licenses/LICENSE-2.0"
MIT = "MIT License"
OPL = "Open Parliament Licence - https://www.parliament.uk/site-information/copyright-parliament/open-parliament-licence/"
BSD_2 = "BSD 2-Clause"
BSD_3 = "BSD 3-Clause"

Expand Down
Loading