Showing 18 changed files with 818 additions and 7 deletions.
@@ -0,0 +1,10 @@
# Court Listener Data

Opinion data from the CourtListener [bulk data list](https://com-courtlistener-storage.s3-us-west-2.amazonaws.com/list.html?prefix=bulk-data/).

## Data download and processing

Run the full pipeline (download the raw compressed data, decompress it to csv files, and parse them into the dolma format) with `bash get_data.sh`.

To test with only one file, run `bash get_data.sh --test_run 1`.

To change the maximum number of parallel jobs (8 by default), use `--max_jobs`.
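For example, typical invocations look like this (a sketch; the flags are the ones described above, and the `--max_jobs` value is only illustrative):

```bash
# Full pipeline: download, decompress, and convert every bulk-data snapshot.
bash get_data.sh

# Quick smoke test: process only the first snapshot, with a smaller job limit (illustrative value).
bash get_data.sh --test_run 1 --max_jobs 2
```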
@@ -0,0 +1,73 @@
#!/usr/bin/env python
"""
Created by zhenlinx on 01/19/2024
"""
import argparse
import csv
import logging
import os
import sys
from datetime import datetime

from licensed_pile.licenses import PermissiveLicenses
from licensed_pile.logs import configure_logging
from licensed_pile.write import to_dolma

SOURCE_NAME = "CourtListenerOpinion"

# Opinion texts can be very long, so lift the default csv field size limit.
csv.field_size_limit(sys.maxsize)

logger = configure_logging("court-listener-opinion")


def make_record_generator(file_path):
    with open(file_path, "r") as csvfile:
        # Each row is a dictionary keyed by the csv column headers.
        reader = csv.DictReader(csvfile)

        # Yield a dolma-format record for each row.
        for row in reader:
            if not row["plain_text"]:
                pass  # TODO load from row["download_url"] if not null
            else:
                yield {
                    "id": row["id"],
                    "text": row["plain_text"],
                    "source": SOURCE_NAME,
                    "added": datetime.utcnow().isoformat(),
                    "created": row["data_created"],
                    "metadata": {
                        "license": str(PermissiveLicenses.PD),
                        "url": row["download_url"],
                    },
                }


def main(args):
    example_generator = make_record_generator(args.input_file)
    output_file_base_name = os.path.basename(args.input_file).replace(
        ".csv", ".jsonl.gz"
    )
    to_dolma(example_generator, args.output_dir, output_file_base_name, args.shard_size)
    logger.info(f"Saved {args.input_file} as dolma sharded files at {args.output_dir}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert csv data to dolma.")
    parser.add_argument(
        "--output_dir",
        default="data/courtlistener/v0",
        help="Where the dolma formatted data goes.",
    )
    parser.add_argument(
        "--input_file",
        default="./data/courtlistener/raw/opinions-2022-08-02.csv",
        help="The csv file holding the raw opinion data.",
    )
    parser.add_argument(
        "--shard_size", type=int, default=1, help="Size, in GB, for each shard."
    )
    args = parser.parse_args()
    main(args)
@@ -0,0 +1,106 @@
#!/bin/bash
set -e

# URL of the bulk-data directory
base_url="https://storage.courtlistener.com/bulk-data/"

# Define the download directory
download_dir="./data/courtlistener/raw"

# Create the directory if it does not exist
mkdir -p "$download_dir"

dates=(
    "2022-08-02"
    "2022-08-31"
    "2022-09-30"
    "2022-10-31"
    "2022-11-30"
    "2022-12-31"
    "2023-01-31"
    "2023-02-28"
    "2023-03-31"
    "2023-04-30"
    "2023-05-31"
    "2023-07-31"
    "2023-08-31"
    "2023-12-04"
    "2024-03-11"
)

max_jobs=8

# Parse command-line options
while [[ $# -gt 0 ]]; do
    key="$1"
    case $key in
        --test_run)
            # Use only the first N dates for testing
            shift
            test_run_count=$1
            dates=("${dates[@]:0:$test_run_count}")
            shift
            ;;
        --max_jobs)
            # Set the maximum number of parallel jobs
            shift
            max_jobs=$1
            shift
            ;;
        *)
            echo "Unknown option: $key"
            exit 1
            ;;
    esac
done

# Display the dates of the files to be fetched
echo "Dates of files to be fetched:"
for date in "${dates[@]}"; do
    echo "$date"
done

# Download and decompress one bulk-data file, then convert it to dolma format
download_and_decompress() {
    local file_name="opinions-${1}.csv"
    # local file_name="financial-disclosure-investments-${1}.csv"
    local file_url="${base_url}${file_name}.bz2"
    local decompressed_file="${download_dir}/${file_name}"
    local compressed_file="${download_dir}/${file_name}.bz2"

    # Check if the decompressed file already exists
    if [[ -f "$decompressed_file" ]]; then
        echo "Decompressed file ${decompressed_file} already exists, skipping..."
    else
        # Check if the compressed file already exists
        if [[ -f "$compressed_file" ]]; then
            echo "Compressed file ${compressed_file} already exists, skipping download..."
        else
            # Download the file
            wget -P "$download_dir" "$file_url"
        fi
        # Decompress the file
        bunzip2 "$compressed_file"
        echo "Decompressed ${compressed_file} to ${decompressed_file}"
    fi

    # Transform the csv file into sharded dolma data
    echo "Saving records in ${decompressed_file} as dolma data"
    python ./courtlistener/csv_to_dolma.py --input_file "${decompressed_file}"
}


# Download each file
for date in "${dates[@]}"; do
    download_and_decompress "$date" &

    # Limit the number of parallel jobs
    if (( $(jobs -r | wc -l) >= max_jobs )); then
        wait -n
    fi
done

# Wait for all background jobs to finish
wait

echo "Download and decompression completed."
@@ -0,0 +1,3 @@
#!/usr/bin/env sh
set -e
python courtlistener/process_csv.py
@@ -0,0 +1 @@
data/*
@@ -0,0 +1,18 @@
# Foodista

This code scrapes the Foodista shared-recipe site, which is licensed under CC BY 3.0.

## Downloading the Data

1. Use `python build_index.py` to get a list of pages on the site by parsing the sitemap.
2. Use `python download_pages.py` to download the pages. `--wait` can be used to give the remote server a break between requests. `--num_threads` controls how many threads are used to download the data. This script can do incremental downloads, or it can re-download everything with the `--overwrite` flag.
3. Use `python to_dolma.py` to convert the pages from files on disk to the dolma format. Each page is raw html at this point.
4. Use `python preprocess.py` to parse the html into plain text. This uses dolma for multiprocessing of the various data shards.

You can also use `get-data.sh` to run all of the steps above automatically, or run them by hand as sketched below.
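For reference, a manual run of the pipeline looks roughly like this (a sketch; the script names and flags are the ones listed above, while the `--num_threads` and `--wait` values are only illustrative):

```bash
# Sketch of a manual end-to-end run.
python build_index.py                               # build the page index from the sitemap
python download_pages.py --num_threads 4 --wait 1   # fetch each page, pausing between requests
python to_dolma.py                                  # pack the raw html pages into the dolma format
python preprocess.py                                # parse the html into plain text via dolma sharding
```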

Note: It is normal to see log messages like the one below during test runs, but they would be concerning during a full data collection run.

```json
{"level_name": "ERROR", "timestamp": "2024-05-07 14:02:52,846", "module_name": "to_dolma", "function_name": "format_page", "logger": "food", "message": "Article data/pages/foodista.com_tool_Z2MHM8QR_julienne-peeler.html exists in the index but is not downloaded."}
```
@@ -0,0 +1,72 @@
"""Build a list of pages to scape based on the sitemap.""" | ||
|
||
|
||
import argparse | ||
import json | ||
import os | ||
import re | ||
from typing import List | ||
|
||
import usp.tree | ||
|
||
from licensed_pile import logs, scrape, utils | ||
|
||
parser = argparse.ArgumentParser( | ||
description="Find all pages to download based on the sitemap." | ||
) | ||
parser.add_argument( | ||
"--url", default="https://www.foodista.com/", help="The site we are scraping." | ||
) | ||
parser.add_argument( | ||
"--index_path", | ||
default="data/pages/page_index.jsonl", | ||
help="Where to save the list of pages.", | ||
) | ||
parser.add_argument( | ||
"--overwrite", | ||
action="store_true", | ||
help="Should we overwrite a previous list of pages we made?", | ||
) | ||
|
||
|
||
def build_url_index(url: str) -> List[str]: | ||
logs.configure_logging("usp.helpers") | ||
logs.configure_logging("usp.fetch_parse") | ||
logs.configure_logging("usp.tree") | ||
|
||
tree = usp.tree.sitemap_tree_for_homepage(url) | ||
page_list = sorted(set(page.url for page in tree.all_pages())) | ||
# Remove homepage and the _ping healthcheck page. | ||
page_list = page_list[2:] | ||
return page_list | ||
|
||
|
||
def url_to_filename(url: str) -> str: | ||
url = re.sub(r"https?://(?:www\.)?", "", url) | ||
url = re.sub(r"[?,=/]", "_", url) | ||
url = re.sub(r"\s+", "_", url) | ||
return url | ||
|
||
|
||
def main(args): | ||
logger = logs.get_logger("food") | ||
if os.path.exists(args.index_path) and not args.overwrite: | ||
logger.error(f"Page Index already exists at {args.index_path}, aborting.") | ||
return | ||
logger.info(f"Building page index from {args.url}") | ||
page_list = build_url_index(args.url) | ||
logger.info(f"Found {len(page_list)} pages.") | ||
page_index = [ | ||
{"idx": idx, "url": url, "filename": f"{url_to_filename(url)}.html"} | ||
for idx, url in enumerate(page_list) | ||
] | ||
logger.info(f"Saving page index to {args.index_path}") | ||
os.makedirs(os.path.dirname(args.index_path), exist_ok=True) | ||
with open(args.index_path, "w") as wf: | ||
wf.write("\n".join(json.dumps(p) for p in page_index) + "\n") | ||
|
||
|
||
if __name__ == "__main__": | ||
args = parser.parse_args() | ||
logs.configure_logging("food") | ||
main(args) |