From 2dcc7f0d793a621d6634199c15893c9f96ef2df3 Mon Sep 17 00:00:00 2001
From: Enrico Shippole
Date: Mon, 3 Jun 2024 11:45:10 -0400
Subject: [PATCH 1/3] Update get_data.sh

Only the most recent dump should be used. Downloading every dump
duplicates the same data many times over.
---
 courtlistener/get_data.sh | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/courtlistener/get_data.sh b/courtlistener/get_data.sh
index dde4f76..e35c9aa 100644
--- a/courtlistener/get_data.sh
+++ b/courtlistener/get_data.sh
@@ -11,21 +11,7 @@ download_dir="./data/courtlistener/raw"
 mkdir -p "$download_dir"
 
 dates=(
-    "2022-08-02"
-    "2022-08-31"
-    "2022-09-30"
-    "2022-10-31"
-    "2022-11-30"
-    "2022-12-31"
-    "2023-01-31"
-    "2023-02-28"
-    "2023-03-31"
-    "2023-04-30"
-    "2023-05-31"
-    "2023-07-31"
-    "2023-08-31"
-    "2023-12-04"
-    "2024-03-11"
+    "2024-05-06"
 )
 
 max_jobs=8

From ecbc70dd51125f791a325d0683a94050224c2c38 Mon Sep 17 00:00:00 2001
From: Enrico Shippole
Date: Mon, 3 Jun 2024 13:53:51 -0400
Subject: [PATCH 2/3] Lint download_and_convert_to_md.py and add comments

---
 courtlistener/get_data.sh                   |  3 +++
 pubmedcentral/download_and_convert_to_md.py | 13 ++++++-------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/courtlistener/get_data.sh b/courtlistener/get_data.sh
index e35c9aa..6fc26fb 100644
--- a/courtlistener/get_data.sh
+++ b/courtlistener/get_data.sh
@@ -10,6 +10,9 @@ download_dir="./data/courtlistener/raw"
 # Create the directory if it does not exist
 mkdir -p "$download_dir"
 
+# Only download the data from the most recent CL dump.
+# The newest dump contains the previous dumps' data,
+# so earlier dumps do not need to be downloaded separately.
 dates=(
     "2024-05-06"
 )

diff --git a/pubmedcentral/download_and_convert_to_md.py b/pubmedcentral/download_and_convert_to_md.py
index 1e52a52..8bef0c2 100644
--- a/pubmedcentral/download_and_convert_to_md.py
+++ b/pubmedcentral/download_and_convert_to_md.py
@@ -36,13 +36,13 @@
     help="Number of processes to use for conversion.",
 )
 
+
 def get_date_from_tree(tree):
     date_created = None
     # get date from tree
     # date can be found under a number of tags
     pub_types = ["pub", "epub", "pmc-release", "ppub"]
     for pub_type in pub_types:
-
         # try most common location first
         date = tree.find(f".//pub-date[@pub-type='{pub_type}']")
         if date is not None:
@@ -64,7 +64,6 @@ def get_date_from_tree(tree):
             date_created = f"{year}-01-01"
             continue
 
-
         # if we found the month, try the day
         try:
             day = date.find("day").text
@@ -73,12 +72,11 @@ def get_date_from_tree(tree):
             date_created = f"{year}-{month}-01"
             continue
 
-
         # If we successfully found all date components,
         # convert to YYYY-MM-DD format
         date_created = f"{year}-{month}-{day}"
         break
-
+
         # try the next location
         date = tree.find(f".//pub-date[@date-type='{pub_type}']")
         if date is not None:
@@ -100,7 +98,6 @@ def get_date_from_tree(tree):
             date_created = f"{year}-01-01"
             continue
 
-
         # if we found the month, try the day
         try:
             day = date.find("day").text
@@ -109,7 +106,6 @@ def get_date_from_tree(tree):
             date_created = f"{year}-{month}-01"
             continue
 
-
         # If we successfully found all date components,
         # convert to YYYY-MM-DD format
         date_created = f"{year}-{month}-{day}"
@@ -117,6 +113,7 @@ def get_date_from_tree(tree):
 
     return date_created
 
+
 def get_authors_and_date(nxml_file: str, pmcid: str):
     # get authors from nxml file
     authors = []
@@ -138,7 +135,9 @@ def get_authors_and_date(nxml_file: str, pmcid: str):
     # not a fatal error, just log it
     if date_created is None:
         logger = logs.get_logger("pubmedcentral")
-        logger.info(f"Date not found for {pmcid}. Setting to default value of '1900-01-01'")
+        logger.info(
+            f"Date not found for {pmcid}. Setting to default value of '1900-01-01'"
+        )
         date_created = "1900-01-01"
 
     return authors, date_created

From f5c79b60a9616efdf5eb73cb3299ffd190fa6eba Mon Sep 17 00:00:00 2001
From: Enrico Shippole
Date: Mon, 3 Jun 2024 23:29:23 -0400
Subject: [PATCH 3/3] Add column merging and correct text ordering

---
 courtlistener/process_cl.py | 135 ++++++++++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)
 create mode 100644 courtlistener/process_cl.py

diff --git a/courtlistener/process_cl.py b/courtlistener/process_cl.py
new file mode 100644
index 0000000..3581d9f
--- /dev/null
+++ b/courtlistener/process_cl.py
@@ -0,0 +1,135 @@
+import argparse
+import csv
+import os
+import re
+import sys
+
+import pandas as pd
+
+from licensed_pile.licenses import PermissiveLicenses
+from licensed_pile.logs import configure_logging
+from licensed_pile.write import to_dolma
+
+SOURCE_NAME = "CourtListenerOpinion"
+
+csv.field_size_limit(sys.maxsize)
+
+logger = configure_logging("court-listener-opinion")
+
+
+def process_court_listener(file_path):
+    df = pd.read_csv(file_path)
+
+    # add license metadata column
+    df["metadata"] = str(PermissiveLicenses.PD)
+
+    # add source column
+    df["source"] = SOURCE_NAME
+
+    """
+    Remove columns:
+    date_modified
+    author_str
+    per_curiam
+    joined_by_str
+    type
+    sha1
+    page_count
+    local_path
+    extracted_by_ocr
+    author_id
+    cluster_id
+    """
+    df = df.drop(
+        columns=[
+            "date_modified",
+            "author_str",
+            "per_curiam",
+            "joined_by_str",
+            "type",
+            "sha1",
+            "page_count",
+            "local_path",
+            "extracted_by_ocr",
+            "author_id",
+            "cluster_id",
+        ]
+    )
+
+    """
+    Merge columns based on Court Listener documentation:
+    html_with_citations
+    html_columbia
+    html_lawbox
+    xml_harvard
+    html_anon_2020
+    html
+    plain_text
+    """
+    df["text"] = (
+        df["html_with_citations"]
+        .combine_first(df["html_columbia"])
+        .combine_first(df["html_lawbox"])
+        .combine_first(df["xml_harvard"])
+        .combine_first(df["html_anon_2020"])
+        .combine_first(df["html"])
+    )
+
+    # drop the markup columns now that they are merged into text
+    df = df.drop(
+        columns=[
+            "html",
+            "html_anon_2020",
+            "html_lawbox",
+            "html_columbia",
+            "xml_harvard",
+            "html_with_citations",
+        ]
+    )
+
+    # extract text from html and xml following Harvard CAP
+    # They used r"<.+?>", ""
+    # keep null rows as-is so plain_text can still fill them below
+    df["text"] = df["text"].apply(
+        lambda x: re.sub(r"<.+?>", "", x) if pd.notna(x) else x
+    )
+
+    # fall back to plain text where no markup column was populated
+    df["text"] = df["text"].combine_first(df["plain_text"])
+
+    # drop plain text column and rows with no text at all
+    df = df.drop(columns=["plain_text"]).dropna(subset=["text"])
+
+    # return a dictionary for each row - dolma format
+    return df.to_dict(orient="records")
+
+
+def main(args):
+    example = process_court_listener(args.input_file)
+    output_file_base_name = os.path.basename(args.input_file).replace(
+        ".csv", ".jsonl.gz"
+    )
+    to_dolma(example, args.output_dir, output_file_base_name, args.shard_size)
+    logger.info(f"Saved {args.input_file} as dolma sharded files at {args.output_dir}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Convert csv data to dolma.")
+    parser.add_argument(
+        "--output_dir",
+        default="data/courtlistener/v0",
+        help="Where the dolma formatted data goes.",
+    )
+    parser.add_argument(
+        "--shard_size",
+        type=int,
+        default=1000,
+        help="The number of documents to store in each shard.",
+    )
+    parser.add_argument(
+        "--input_file",
+        default="./data/courtlistener/raw/opinions-2022-08-02.csv",
+        help="The path to the csv file to convert.",
+    )
+    args = parser.parse_args()
+    main(args)
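
As a sanity check on the merge order in process_cl.py, here is a minimal
sketch of how the combine_first chain behaves. The column names mirror the
script, but the row values are invented toy data, and only two of the six
markup columns are shown:

    import re

    import pandas as pd

    # row 0: falls back to html_columbia; row 1: html_with_citations wins;
    # row 2: only plain_text is populated
    df = pd.DataFrame(
        {
            "html_with_citations": [None, "<p>cited</p>", None],
            "html_columbia": ["<p>columbia</p>", "<p>ignored</p>", None],
            "plain_text": [None, None, "plain only"],
        }
    )

    # combine_first keeps the left value where present and fills nulls
    # from the right, so html_with_citations has the highest precedence
    df["text"] = df["html_with_citations"].combine_first(df["html_columbia"])

    # strip markup only where a markup column was populated
    df["text"] = df["text"].apply(
        lambda x: re.sub(r"<.+?>", "", x) if pd.notna(x) else x
    )

    # plain_text is merged last because it needs no tag stripping
    df["text"] = df["text"].combine_first(df["plain_text"])

    print(df["text"].tolist())  # ['columbia', 'cited', 'plain only']

This is also why null-text rows cannot be dropped before the plain_text
merge: combine_first can only fill nulls that are still present, so a row
like row 2 above would otherwise be lost.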