
Commit

updated requirements
Shayne Longpre authored and Shayne Longpre committed May 12, 2024
2 parents d07ad3d + 22b8e90 commit bea3f0c
Showing 18 changed files with 818 additions and 7 deletions.
16 changes: 16 additions & 0 deletions README.md
@@ -21,6 +21,22 @@ Look at the text for item with the id of 12 (note that position in file is not c

Note: You can also use `gunzip -c ${file}.jsonl.gz | jq -s ${command}`, which is slightly faster (it reduces the amount of data flowing through pipes), but if you forget the `-c` flag you end up decompressing the file and deleting the compressed version, i.e. you need to run `gzip ${file}.jsonl` to fix it.
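
For instance, a minimal sketch of pulling one record out of a shard (here `${file}` is a placeholder for a dolma shard as above, and the id is assumed to be stored as a string, as the converters in this repository write it):

```bash
# Print the text of the record whose id is "12" from a compressed shard.
gunzip -c ${file}.jsonl.gz | jq -rs '.[] | select(.id == "12") | .text'
```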

### Capped parallelism in bash scripts
Sometimes we want to download or process multiple files in parallel, capped at a limited number of concurrent jobs. Below is an example snippet (used in [courtlistener/get_data.sh](courtlistener/get_data.sh)).
Note that `jobs -r` lists the jobs currently running in the current shell, and `wait -n` (bash 4.3+) returns as soon as any one of them finishes.

```bash
max_jobs=8
for file in "${files[@]}"; do
    download_and_process "$file" &
    # Limit the number of parallel jobs
    if (( $(jobs -r | wc -l) >= max_jobs )); then
        # Wait for any one job to finish before launching the next (bash 4.3+)
        wait -n
    fi
done
# Wait for the remaining jobs to finish
wait
```
## Development

We use git pre-commit hooks to format code and keep style consistent.
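If the hooks are managed with the `pre-commit` framework (an assumption here; follow the repository's own setup instructions if they differ), a typical setup is:

```bash
# Assumes the pre-commit framework is used to manage the git hooks.
pip install pre-commit
pre-commit install          # install the hooks into .git/hooks
pre-commit run --all-files  # optionally run all hooks against the whole tree
```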
10 changes: 10 additions & 0 deletions courtlistener/README.md
@@ -0,0 +1,10 @@
# Court Listener Data
Opinion data from the CourtListener [bulk data list](https://com-courtlistener-storage.s3-us-west-2.amazonaws.com/list.html?prefix=bulk-data/).

## Data download and processing
Run the full pipeline, including downloading the raw compressed data, decompressing it to CSV files, and converting it to the dolma format, with
``bash get_data.sh``.

To test with only the first bulk file, run ``bash get_data.sh --test_run 1``.

To change the maximum number of parallel jobs (8 by default), use ``--max_jobs``.
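
For instance (a sketch assuming the script is run from the repository root, since it calls `./courtlistener/csv_to_dolma.py` and writes to `./data/courtlistener/raw` by relative path):

```bash
# Full run with the default 8 parallel jobs.
bash courtlistener/get_data.sh

# Quick test: process only the first bulk dump, capped at 2 parallel jobs.
bash courtlistener/get_data.sh --test_run 1 --max_jobs 2
```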
73 changes: 73 additions & 0 deletions courtlistener/csv_to_dolma.py
@@ -0,0 +1,73 @@
#!/usr/bin/env python
"""
Created by zhenlinx on 01/19/2024
"""
import argparse
import csv
import logging
import os
import sys
from datetime import datetime

from licensed_pile.licenses import PermissiveLicenses
from licensed_pile.logs import configure_logging
from licensed_pile.write import to_dolma

SOURCE_NAME = "CourtListenerOpinion"

csv.field_size_limit(sys.maxsize)

logger = configure_logging("court-listener-opinion")


def make_record_generator(file_path):
with open(file_path, "r") as csvfile:
# Create a CSV reader object
reader = csv.DictReader(csvfile)

# Yield a dictionary for each row
for row in reader:
# 'row' is a dictionary with column headers as keys

if not row["plain_text"]:
pass # TODO load from row["download_url"] if not null
else:
yield {
"id": row["id"],
"text": row["plain_text"],
"source": SOURCE_NAME,
"added": datetime.utcnow().isoformat(),
"created": row["data_created"],
"metadata": {
"license": str(PermissiveLicenses.PD),
"url": row["download_url"],
},
}


def main(args):
example_generator = make_record_generator(args.input_file)
output_file_base_name = os.path.basename(args.input_file).replace(
".csv", ".jsonl.gz"
)
to_dolma(example_generator, args.output_dir, output_file_base_name, args.shard_size)
logger.info(f"Saved {args.input_file} as dolma shared files at {args.output_dir}")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Convert csv data to dolma.")
parser.add_argument(
"--output_dir",
default=f"data/courtlistener/v0",
help="Where the dolma formatted data goes.",
)
parser.add_argument(
"--input_file",
default="./data/courtlistener/raw/opinions-2022-08-02.csv",
help="The base filename stores data",
)
parser.add_argument(
"--shard_size", type=int, default=1, help="Size, in GB, for each shard."
)
args = parser.parse_args()
main(args)
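
A sketch of running the converter by itself on a single decompressed dump (the flags and defaults below mirror the argument parser above; adjust paths to your layout):

```bash
python courtlistener/csv_to_dolma.py \
    --input_file data/courtlistener/raw/opinions-2022-08-02.csv \
    --output_dir data/courtlistener/v0 \
    --shard_size 1
```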
106 changes: 106 additions & 0 deletions courtlistener/get_data.sh
@@ -0,0 +1,106 @@
#!/bin/bash
set -e

# URL of the directory
base_url="https://storage.courtlistener.com/bulk-data/"

# Define the download directory
download_dir="./data/courtlistener/raw"

# Create the directory if it does not exist
mkdir -p "$download_dir"

dates=(
"2022-08-02"
"2022-08-31"
"2022-09-30"
"2022-10-31"
"2022-11-30"
"2022-12-31"
"2023-01-31"
"2023-02-28"
"2023-03-31"
"2023-04-30"
"2023-05-31"
"2023-07-31"
"2023-08-31"
"2023-12-04"
"2024-03-11"
)

max_jobs=8

# Parse command-line options
while [[ $# -gt 0 ]]; do
key="$1"
case $key in
--test_run)
# Use the first N dates for testing
shift
test_run_count=$1
dates=("${dates[@]:0:$test_run_count}")
shift
;;
--max_jobs)
# Set the maximum number of parallel jobs
shift
max_jobs=$1
shift
;;
*)
echo "Unknown option: $key"
exit 1
;;
esac
done

# Display the dates of the files to be fetched
echo "Dates of files to be fetched:"
for date in "${dates[@]}"; do
echo "$date"
done

# Function to download and decompress a file
download_and_decompress() {
local file_name="opinions-${1}.csv"
# local file_name="financial-disclosure-investments-${1}.csv"
local file_url="${base_url}${file_name}.bz2"
local decompressed_file="${download_dir}/${file_name}"
local compressed_file="${download_dir}/${file_name}.bz2"

# Check if the decompressed file already exists
if [[ -f "$decompressed_file" ]]; then
echo "Decompressed file ${decompressed_file} already exists, skipping..."
else
# Check if the compressed file already exists
if [[ -f "$compressed_file" ]]; then
echo "Compressed file ${compressed_file} already exists, skipping download..."
else
# Download the file
wget -P "$download_dir" "$file_url"
fi
# Decompress the file
bunzip2 "$compressed_file"
echo "Decompressed file ${compressed_file} ..."
fi

    # Convert the CSV file into sharded dolma data
    echo "Saving records in ${decompressed_file} to dolma data"
    python ./courtlistener/csv_to_dolma.py --input_file "${decompressed_file}"
}


# Download each file
for date in "${dates[@]}"; do
download_and_decompress "$date" &

# Limit the number of parallel jobs
if (( $(jobs -r | wc -l) >= max_jobs )); then
wait -n
fi
done

# Wait for all background jobs to finish
wait

echo "Download and decompression completed."
3 changes: 3 additions & 0 deletions courtlistener/process_csv_file.sh
@@ -0,0 +1,3 @@
#!/usr/bin/env sh
set -e
python courtlistener/process_csv.py
1 change: 1 addition & 0 deletions food/.gitignore
@@ -0,0 +1 @@
data/*
18 changes: 18 additions & 0 deletions food/README.md
@@ -0,0 +1,18 @@
# Foodista

This code scrapes the Foodista shared-recipe site, which is licensed under CC-BY-3.0.

## Downloading the Data

1. Use `python build_index.py` to get a list of pages on the site by parsing the sitemap.
2. Use `python download_pages.py` to download the pages. `--wait` can be used to give the remote server a break between requests. `--num_threads` controls how many threads are used to download the data. This script can do incremental downloads, or it can re-download everything with the `--overwrite` flag.
3. Use `python to_dolma.py` to convert the pages from files on disk to the dolma format. Each page is raw HTML at this point.
4. Use `python preprocess.py` to parse the html into plain text. This uses dolma for multiprocessing of the various data shards.

You can also use `get-data.sh` to do all the steps above automatically.
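
A minimal sketch of running the steps by hand (assuming the commands are run from the `food/` directory; the flag values are illustrative, and `get-data.sh` itself may differ):

```bash
python build_index.py
python download_pages.py --wait 1 --num_threads 4
python to_dolma.py
python preprocess.py
```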

Note: It is normal to see messages like this in the log during test runs, but they would be concerning when running the real data collection.

``` json
{"level_name": "ERROR", "timestamp": "2024-05-07 14:02:52,846", "module_name": "to_dolma", "function_name": "format_page", "logger": "food", "message": "Article data/pages/foodista.com_tool_Z2MHM8QR_julienne-peeler.html exists in the index but is not downloaded."}
```
72 changes: 72 additions & 0 deletions food/build_index.py
@@ -0,0 +1,72 @@
"""Build a list of pages to scape based on the sitemap."""


import argparse
import json
import os
import re
from typing import List

import usp.tree

from licensed_pile import logs, scrape, utils

parser = argparse.ArgumentParser(
description="Find all pages to download based on the sitemap."
)
parser.add_argument(
"--url", default="https://www.foodista.com/", help="The site we are scraping."
)
parser.add_argument(
"--index_path",
default="data/pages/page_index.jsonl",
help="Where to save the list of pages.",
)
parser.add_argument(
"--overwrite",
action="store_true",
help="Should we overwrite a previous list of pages we made?",
)


def build_url_index(url: str) -> List[str]:
logs.configure_logging("usp.helpers")
logs.configure_logging("usp.fetch_parse")
logs.configure_logging("usp.tree")

tree = usp.tree.sitemap_tree_for_homepage(url)
page_list = sorted(set(page.url for page in tree.all_pages()))
# Remove homepage and the _ping healthcheck page.
page_list = page_list[2:]
return page_list


def url_to_filename(url: str) -> str:
url = re.sub(r"https?://(?:www\.)?", "", url)
url = re.sub(r"[?,=/]", "_", url)
url = re.sub(r"\s+", "_", url)
return url


def main(args):
logger = logs.get_logger("food")
if os.path.exists(args.index_path) and not args.overwrite:
logger.error(f"Page Index already exists at {args.index_path}, aborting.")
return
logger.info(f"Building page index from {args.url}")
page_list = build_url_index(args.url)
logger.info(f"Found {len(page_list)} pages.")
page_index = [
{"idx": idx, "url": url, "filename": f"{url_to_filename(url)}.html"}
for idx, url in enumerate(page_list)
]
logger.info(f"Saving page index to {args.index_path}")
os.makedirs(os.path.dirname(args.index_path), exist_ok=True)
with open(args.index_path, "w") as wf:
wf.write("\n".join(json.dumps(p) for p in page_index) + "\n")


if __name__ == "__main__":
args = parser.parse_args()
logs.configure_logging("food")
main(args)