
Commit

updated requirements
Shayne Longpre authored and Shayne Longpre committed May 12, 2024
2 parents d07ad3d + 22b8e90 commit bea3f0c
Showing 18 changed files with 818 additions and 7 deletions.
16 changes: 16 additions & 0 deletions README.md
@@ -21,6 +21,22 @@ Look at the text for item with the id of 12 (note that position in file is not c

Note: You can also use `gunzip -c ${file}.jsonl.gz | jq -s ${command}`, which is slightly faster (it reduces the amount of data flowing through pipes), but if you forget the `-c` flag you end up decompressing the file and deleting the compressed version, i.e. you need to run `gzip ${file}.jsonl` to fix it.
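
For instance, a minimal sketch of pulling one record out of a shard (here `${file}` is a placeholder for a dolma shard as above, and the id is assumed to be stored as a string, as the converters in this repository write it):

```bash
# Print the text of the record whose id is "12" from a compressed shard.
gunzip -c ${file}.jsonl.gz | jq -rs '.[] | select(.id == "12") | .text'
```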

### Capped parallelism in bash scripts
Sometimes we want to download or process multiple files in parallel, capped at a limited number of concurrent jobs. Below is an example snippet (used in [courtlistener/get_data.sh](courtlistener/get_data.sh)).
Note that `jobs -r` lists the jobs currently running in the current shell, and `wait -n` (bash 4.3+) returns as soon as any one of them finishes.

```bash
max_jobs=8
for file in "${files[@]}"; do
    download_and_process "$file" &
    # Limit the number of parallel jobs
    if (( $(jobs -r | wc -l) >= max_jobs )); then
        # Wait for any one job to finish before launching the next (bash 4.3+)
        wait -n
    fi
done
# Wait for the remaining jobs to finish
wait
```
## Development

We use git pre-commit hooks to format code and keep style consistent.
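If the hooks are managed with the `pre-commit` framework (an assumption here; follow the repository's own setup instructions if they differ), a typical setup is:

```bash
# Assumes the pre-commit framework is used to manage the git hooks.
pip install pre-commit
pre-commit install          # install the hooks into .git/hooks
pre-commit run --all-files  # optionally run all hooks against the whole tree
```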
10 changes: 10 additions & 0 deletions courtlistener/README.md
@@ -0,0 +1,10 @@
# Court Listener Data
Opinion data from the CourtListener [bulk data list](https://com-courtlistener-storage.s3-us-west-2.amazonaws.com/list.html?prefix=bulk-data/).

## Data download and processing
Run the full pipeline, including downloading the raw compressed data, decompressing it to CSV files, and converting it to the dolma format, with
``bash get_data.sh``.

To test with only the first bulk file, run ``bash get_data.sh --test_run 1``.

To change the maximum number of parallel jobs (8 by default), use ``--max_jobs``.
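
For instance (a sketch assuming the script is run from the repository root, since it calls `./courtlistener/csv_to_dolma.py` and writes to `./data/courtlistener/raw` by relative path):

```bash
# Full run with the default 8 parallel jobs.
bash courtlistener/get_data.sh

# Quick test: process only the first bulk dump, capped at 2 parallel jobs.
bash courtlistener/get_data.sh --test_run 1 --max_jobs 2
```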
73 changes: 73 additions & 0 deletions courtlistener/csv_to_dolma.py
@@ -0,0 +1,73 @@
#!/usr/bin/env python
"""
Created by zhenlinx on 01/19/2024
"""
import argparse
import csv
import logging
import os
import sys
from datetime import datetime

from licensed_pile.licenses import PermissiveLicenses
from licensed_pile.logs import configure_logging
from licensed_pile.write import to_dolma

SOURCE_NAME = "CourtListenerOpinion"

csv.field_size_limit(sys.maxsize)

logger = configure_logging("court-listener-opinion")


def make_record_generator(file_path):
with open(file_path, "r") as csvfile:
# Create a CSV reader object
reader = csv.DictReader(csvfile)

# Yield a dictionary for each row
for row in reader:
# 'row' is a dictionary with column headers as keys

if not row["plain_text"]:
pass # TODO load from row["download_url"] if not null
else:
yield {
"id": row["id"],
"text": row["plain_text"],
"source": SOURCE_NAME,
"added": datetime.utcnow().isoformat(),
"created": row["data_created"],
"metadata": {
"license": str(PermissiveLicenses.PD),
"url": row["download_url"],
},
}


def main(args):
example_generator = make_record_generator(args.input_file)
output_file_base_name = os.path.basename(args.input_file).replace(
".csv", ".jsonl.gz"
)
to_dolma(example_generator, args.output_dir, output_file_base_name, args.shard_size)
logger.info(f"Saved {args.input_file} as dolma shared files at {args.output_dir}")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Convert csv data to dolma.")
parser.add_argument(
"--output_dir",
default=f"data/courtlistener/v0",
help="Where the dolma formatted data goes.",
)
parser.add_argument(
"--input_file",
default="./data/courtlistener/raw/opinions-2022-08-02.csv",
help="The base filename stores data",
)
parser.add_argument(
"--shard_size", type=int, default=1, help="Size, in GB, for each shard."
)
args = parser.parse_args()
main(args)
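
A sketch of running the converter by itself on a single decompressed dump (the flags and defaults below mirror the argument parser above; adjust paths to your layout):

```bash
python courtlistener/csv_to_dolma.py \
    --input_file data/courtlistener/raw/opinions-2022-08-02.csv \
    --output_dir data/courtlistener/v0 \
    --shard_size 1
```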
106 changes: 106 additions & 0 deletions courtlistener/get_data.sh
@@ -0,0 +1,106 @@
#!/bin/bash
set -e

# URL of the directory
base_url="https://storage.courtlistener.com/bulk-data/"

# Define the download directory
download_dir="./data/courtlistener/raw"

# Create the directory if it does not exist
mkdir -p "$download_dir"

dates=(
"2022-08-02"
"2022-08-31"
"2022-09-30"
"2022-10-31"
"2022-11-30"
"2022-12-31"
"2023-01-31"
"2023-02-28"
"2023-03-31"
"2023-04-30"
"2023-05-31"
"2023-07-31"
"2023-08-31"
"2023-12-04"
"2024-03-11"
)

max_jobs=8

# Parse command-line options
while [[ $# -gt 0 ]]; do
key="$1"
case $key in
--test_run)
# Use the first N dates for testing
shift
test_run_count=$1
dates=("${dates[@]:0:$test_run_count}")
shift
;;
--max_jobs)
# Set the maximum number of parallel jobs
shift
max_jobs=$1
shift
;;
*)
echo "Unknown option: $key"
exit 1
;;
esac
done

# Display the dates of the files to be fetched
echo "Dates of files to be fetched:"
for date in "${dates[@]}"; do
echo "$date"
done

# Function to download and decompress a file
download_and_decompress() {
local file_name="opinions-${1}.csv"
# local file_name="financial-disclosure-investments-${1}.csv"
local file_url="${base_url}${file_name}.bz2"
local decompressed_file="${download_dir}/${file_name}"
local compressed_file="${download_dir}/${file_name}.bz2"

# Check if the decompressed file already exists
if [[ -f "$decompressed_file" ]]; then
echo "Decompressed file ${decompressed_file} already exists, skipping..."
else
# Check if the compressed file already exists
if [[ -f "$compressed_file" ]]; then
echo "Compressed file ${compressed_file} already exists, skipping download..."
else
# Download the file
wget -P "$download_dir" "$file_url"
fi
# Decompress the file
bunzip2 "$compressed_file"
echo "Decompressed file ${compressed_file} ..."
fi

    # Convert the CSV file into sharded dolma data
    echo "Saving records in ${decompressed_file} to dolma data"
    python ./courtlistener/csv_to_dolma.py --input_file "${decompressed_file}"
}


# Download each file
for date in "${dates[@]}"; do
download_and_decompress "$date" &

# Limit the number of parallel jobs
if (( $(jobs -r | wc -l) >= max_jobs )); then
wait -n
fi
done

# Wait for all background jobs to finish
wait

echo "Download and decompression completed."
3 changes: 3 additions & 0 deletions courtlistener/process_csv_file.sh
@@ -0,0 +1,3 @@
#!/usr/bin/env sh
set -e
python courtlistener/process_csv.py
1 change: 1 addition & 0 deletions food/.gitignore
@@ -0,0 +1 @@
data/*
18 changes: 18 additions & 0 deletions food/README.md
@@ -0,0 +1,18 @@
# Foodista

This code scrapes the Foodista shared-recipe site, which is licensed under CC-BY-3.0.

## Downloading the Data

1. Use `python build_index.py` to get a list of pages on the site by parsing the sitemap.
2. Use `python download_pages.py` to download the pages. `--wait` can be used to give the remote server a break between requests. `--num_threads` controls how many threads are used to download the data. This script can do incremental downloads, or it can re-download everything with the `--overwrite` flag.
3. Use `python to_dolma.py` to convert the pages from files on disk to the dolma format. Each page is raw HTML at this point.
4. Use `python preprocess.py` to parse the html into plain text. This uses dolma for multiprocessing of the various data shards.

You can also use `get-data.sh` to do all the steps above automatically.
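
A minimal sketch of running the steps by hand (assuming the commands are run from the `food/` directory; the flag values are illustrative, and `get-data.sh` itself may differ):

```bash
python build_index.py
python download_pages.py --wait 1 --num_threads 4
python to_dolma.py
python preprocess.py
```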

Note: It is normal to see messages like this in the log during test runs, but they would be concerning when running the real data collection.

``` json
{"level_name": "ERROR", "timestamp": "2024-05-07 14:02:52,846", "module_name": "to_dolma", "function_name": "format_page", "logger": "food", "message": "Article data/pages/foodista.com_tool_Z2MHM8QR_julienne-peeler.html exists in the index but is not downloaded."}
```
72 changes: 72 additions & 0 deletions food/build_index.py
@@ -0,0 +1,72 @@
"""Build a list of pages to scape based on the sitemap."""


import argparse
import json
import os
import re
from typing import List

import usp.tree

from licensed_pile import logs, scrape, utils

parser = argparse.ArgumentParser(
description="Find all pages to download based on the sitemap."
)
parser.add_argument(
"--url", default="https://www.foodista.com/", help="The site we are scraping."
)
parser.add_argument(
"--index_path",
default="data/pages/page_index.jsonl",
help="Where to save the list of pages.",
)
parser.add_argument(
"--overwrite",
action="store_true",
help="Should we overwrite a previous list of pages we made?",
)


def build_url_index(url: str) -> List[str]:
logs.configure_logging("usp.helpers")
logs.configure_logging("usp.fetch_parse")
logs.configure_logging("usp.tree")

tree = usp.tree.sitemap_tree_for_homepage(url)
page_list = sorted(set(page.url for page in tree.all_pages()))
# Remove homepage and the _ping healthcheck page.
page_list = page_list[2:]
return page_list


def url_to_filename(url: str) -> str:
url = re.sub(r"https?://(?:www\.)?", "", url)
url = re.sub(r"[?,=/]", "_", url)
url = re.sub(r"\s+", "_", url)
return url


def main(args):
logger = logs.get_logger("food")
if os.path.exists(args.index_path) and not args.overwrite:
logger.error(f"Page Index already exists at {args.index_path}, aborting.")
return
logger.info(f"Building page index from {args.url}")
page_list = build_url_index(args.url)
logger.info(f"Found {len(page_list)} pages.")
page_index = [
{"idx": idx, "url": url, "filename": f"{url_to_filename(url)}.html"}
for idx, url in enumerate(page_list)
]
logger.info(f"Saving page index to {args.index_path}")
os.makedirs(os.path.dirname(args.index_path), exist_ok=True)
with open(args.index_path, "w") as wf:
wf.write("\n".join(json.dumps(p) for p in page_index) + "\n")


if __name__ == "__main__":
args = parser.parse_args()
logs.configure_logging("food")
main(args)