From 2dcc7f0d793a621d6634199c15893c9f96ef2df3 Mon Sep 17 00:00:00 2001
From: Enrico Shippole
Date: Mon, 3 Jun 2024 11:45:10 -0400
Subject: [PATCH 1/3] Update get_data.sh

Only the most recent dump should be used. Downloading every dump
duplicates the same data many times over.
---
 courtlistener/get_data.sh | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/courtlistener/get_data.sh b/courtlistener/get_data.sh
index dde4f76..e35c9aa 100644
--- a/courtlistener/get_data.sh
+++ b/courtlistener/get_data.sh
@@ -11,21 +11,7 @@ download_dir="./data/courtlistener/raw"
 mkdir -p "$download_dir"
 
 dates=(
-    "2022-08-02"
-    "2022-08-31"
-    "2022-09-30"
-    "2022-10-31"
-    "2022-11-30"
-    "2022-12-31"
-    "2023-01-31"
-    "2023-02-28"
-    "2023-03-31"
-    "2023-04-30"
-    "2023-05-31"
-    "2023-07-31"
-    "2023-08-31"
-    "2023-12-04"
-    "2024-03-11"
+    "2024-05-06"
 )
 
 max_jobs=8

From ecbc70dd51125f791a325d0683a94050224c2c38 Mon Sep 17 00:00:00 2001
From: Enrico Shippole
Date: Mon, 3 Jun 2024 13:53:51 -0400
Subject: [PATCH 2/3] Lint download_and_convert_to_md.py and add comments

---
 courtlistener/get_data.sh                   |  3 +++
 pubmedcentral/download_and_convert_to_md.py | 13 ++++++-------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/courtlistener/get_data.sh b/courtlistener/get_data.sh
index e35c9aa..6fc26fb 100644
--- a/courtlistener/get_data.sh
+++ b/courtlistener/get_data.sh
@@ -10,6 +10,9 @@ download_dir="./data/courtlistener/raw"
 # Create the directory if it does not exist
 mkdir -p "$download_dir"
 
+# Only download the data from the most recent CL dump.
+# The newest dump contains the previous dumps' data,
+# so earlier dumps do not need to be downloaded separately.
 dates=(
     "2024-05-06"
 )

diff --git a/pubmedcentral/download_and_convert_to_md.py b/pubmedcentral/download_and_convert_to_md.py
index 1e52a52..8bef0c2 100644
--- a/pubmedcentral/download_and_convert_to_md.py
+++ b/pubmedcentral/download_and_convert_to_md.py
@@ -36,13 +36,13 @@
     help="Number of processes to use for conversion.",
 )
 
+
 def get_date_from_tree(tree):
     date_created = None
     # get date from tree
     # date can be found under a number of tags
     pub_types = ["pub", "epub", "pmc-release", "ppub"]
     for pub_type in pub_types:
-
         # try most common location first
         date = tree.find(f".//pub-date[@pub-type='{pub_type}']")
         if date is not None:
@@ -64,7 +64,6 @@ def get_date_from_tree(tree):
             date_created = f"{year}-01-01"
             continue
 
-
         # if we found the month, try the day
         try:
             day = date.find("day").text
@@ -73,12 +72,11 @@ def get_date_from_tree(tree):
             date_created = f"{year}-{month}-01"
             continue
 
-
         # If we successfully found all date components,
         # convert to YYYY-MM-DD format
         date_created = f"{year}-{month}-{day}"
         break
-
+
         # try the next location
         date = tree.find(f".//pub-date[@date-type='{pub_type}']")
         if date is not None:
@@ -100,7 +98,6 @@ def get_date_from_tree(tree):
             date_created = f"{year}-01-01"
             continue
 
-
         # if we found the month, try the day
         try:
             day = date.find("day").text
@@ -109,7 +106,6 @@ def get_date_from_tree(tree):
             date_created = f"{year}-{month}-01"
             continue
 
-
         # If we successfully found all date components,
         # convert to YYYY-MM-DD format
         date_created = f"{year}-{month}-{day}"
@@ -117,6 +113,7 @@ def get_date_from_tree(tree):
 
     return date_created
 
+
 def get_authors_and_date(nxml_file: str, pmcid: str):
     # get authors from nxml file
     authors = []
@@ -138,7 +135,9 @@ def get_authors_and_date(nxml_file: str, pmcid: str):
     # not a fatal error, just log it
     if date_created is None:
         logger = logs.get_logger("pubmedcentral")
-        logger.info(f"Date not found for {pmcid}. Setting to default value of '1900-01-01'")
+        logger.info(
+            f"Date not found for {pmcid}. Setting to default value of '1900-01-01'"
+        )
         date_created = "1900-01-01"
 
     return authors, date_created

From f5c79b60a9616efdf5eb73cb3299ffd190fa6eba Mon Sep 17 00:00:00 2001
From: Enrico Shippole
Date: Mon, 3 Jun 2024 23:29:23 -0400
Subject: [PATCH 3/3] Add column merging and correct text ordering

---
 courtlistener/process_cl.py | 135 ++++++++++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)
 create mode 100644 courtlistener/process_cl.py

diff --git a/courtlistener/process_cl.py b/courtlistener/process_cl.py
new file mode 100644
index 0000000..3581d9f
--- /dev/null
+++ b/courtlistener/process_cl.py
@@ -0,0 +1,135 @@
+import argparse
+import csv
+import os
+import re
+import sys
+
+import pandas as pd
+
+from licensed_pile.licenses import PermissiveLicenses
+from licensed_pile.logs import configure_logging
+from licensed_pile.write import to_dolma
+
+SOURCE_NAME = "CourtListenerOpinion"
+
+csv.field_size_limit(sys.maxsize)
+
+logger = configure_logging("court-listener-opinion")
+
+
+def process_court_listener(file_path):
+    df = pd.read_csv(file_path)
+
+    # add license metadata column
+    df["metadata"] = str(PermissiveLicenses.PD)
+
+    # add source column
+    df["source"] = SOURCE_NAME
+
+    """
+    Remove columns:
+    date_modified
+    author_str
+    per_curiam
+    joined_by_str
+    type
+    sha1
+    page_count
+    local_path
+    extracted_by_ocr
+    author_id
+    cluster_id
+    """
+    df = df.drop(
+        columns=[
+            "date_modified",
+            "author_str",
+            "per_curiam",
+            "joined_by_str",
+            "type",
+            "sha1",
+            "page_count",
+            "local_path",
+            "extracted_by_ocr",
+            "author_id",
+            "cluster_id",
+        ]
+    )
+
+    """
+    Merge columns based on Court Listener documentation:
+    html_with_citations
+    html_columbia
+    html_lawbox
+    xml_harvard
+    html_anon_2020
+    html
+    plain_text
+    """
+    df["text"] = (
+        df["html_with_citations"]
+        .combine_first(df["html_columbia"])
+        .combine_first(df["html_lawbox"])
+        .combine_first(df["xml_harvard"])
+        .combine_first(df["html_anon_2020"])
+        .combine_first(df["html"])
+    )
+
+    # drop the markup columns now that they are merged into text
+    df = df.drop(
+        columns=[
+            "html",
+            "html_anon_2020",
+            "html_lawbox",
+            "html_columbia",
+            "xml_harvard",
+            "html_with_citations",
+        ]
+    )
+
+    # extract text from html and xml following Harvard CAP
+    # They used r"<.+?>", ""
+    # keep null rows as-is so plain_text can still fill them below
+    df["text"] = df["text"].apply(
+        lambda x: re.sub(r"<.+?>", "", x) if pd.notna(x) else x
+    )
+
+    # fall back to plain text where no markup column was populated
+    df["text"] = df["text"].combine_first(df["plain_text"])
+
+    # drop plain text column and rows with no text at all
+    df = df.drop(columns=["plain_text"]).dropna(subset=["text"])
+
+    # return a dictionary for each row - dolma format
+    return df.to_dict(orient="records")
+
+
+def main(args):
+    example = process_court_listener(args.input_file)
+    output_file_base_name = os.path.basename(args.input_file).replace(
+        ".csv", ".jsonl.gz"
+    )
+    to_dolma(example, args.output_dir, output_file_base_name, args.shard_size)
+    logger.info(f"Saved {args.input_file} as dolma sharded files at {args.output_dir}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Convert csv data to dolma.")
+    parser.add_argument(
+        "--output_dir",
+        default="data/courtlistener/v0",
+        help="Where the dolma formatted data goes.",
+    )
+    parser.add_argument(
+        "--shard_size",
+        type=int,
+        default=1000,
+        help="The number of documents to store in each shard.",
+    )
+    parser.add_argument(
+        "--input_file",
+        default="./data/courtlistener/raw/opinions-2022-08-02.csv",
+        help="The path to the csv file to convert.",
+    )
+    args = parser.parse_args()
+    main(args)
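
As a sanity check on the merge order in process_cl.py, here is a minimal
sketch of how the combine_first chain behaves. The column names mirror the
script, but the row values are invented toy data, and only two of the six
markup columns are shown:

    import re

    import pandas as pd

    # row 0: falls back to html_columbia; row 1: html_with_citations wins;
    # row 2: only plain_text is populated
    df = pd.DataFrame(
        {
            "html_with_citations": [None, "<p>cited</p>", None],
            "html_columbia": ["<p>columbia</p>", "<p>ignored</p>", None],
            "plain_text": [None, None, "plain only"],
        }
    )

    # combine_first keeps the left value where present and fills nulls
    # from the right, so html_with_citations has the highest precedence
    df["text"] = df["html_with_citations"].combine_first(df["html_columbia"])

    # strip markup only where a markup column was populated
    df["text"] = df["text"].apply(
        lambda x: re.sub(r"<.+?>", "", x) if pd.notna(x) else x
    )

    # plain_text is merged last because it needs no tag stripping
    df["text"] = df["text"].combine_first(df["plain_text"])

    print(df["text"].tolist())  # ['columbia', 'cited', 'plain only']

This is also why null-text rows cannot be dropped before the plain_text
merge: combine_first can only fill nulls that are still present, so a row
like row 2 above would otherwise be lost.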