Adding Data Provenance data and new licenses
1 parent 22b8e90 · commit 28e210b
Showing 13 changed files with 666 additions and 9 deletions.
@@ -1,3 +1,3 @@
 #!/usr/bin/env sh
 set -e
-python courtlistener/process_csv.py
+python courtlistener/process_csv.py
@@ -0,0 +1 @@
data/*
@@ -0,0 +1,20 @@
# Processing scripts for Data Provenance data

The [Data Provenance Initiative](https://www.dataprovenance.org) is a digital library for supervised datasets that have been manually annotated with their source and license information. It wraps HuggingFace datasets with extra metadata, and provides code to download, standardize, and filter them for various criteria.
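
As an illustration, here is a minimal sketch of pulling one collection out of the wrapped HuggingFace dataset; the dataset id and folder layout are the ones `download.py` below uses:

```python
from datasets import load_dataset

# Load one collection's files from the Data Provenance HF repo; "dolly_15k"
# is the folder name that constants.py maps from the "Dolly 15k" collection.
subset = load_dataset(
    "DataProvenanceInitiative/Ultra_Permissive_Test",
    data_files="data/dolly_15k/*.jsonl",
    split="train",
)
```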

In this case, we have filtered for the following criteria:
* English language or code data
* No model-generated text
* Datasets have a commercially viable license, found through the Data Provenance Initiative or the hosting GitHub repository
* All associated licenses (from the Data Provenance Initiative and GitHub) are open source compliant or appear in the Gold, Silver, or Bronze lists of the Blue Oak Council (https://blueoakcouncil.org/list); the sketch after this list illustrates the check
* The original source(s) of the text come only from the list of sources in `source_allow_list.txt`
* The relevant license sources are thoroughly documented and linked
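
A hypothetical sketch of that license gate; the allow-list contents here are stand-ins, not the Initiative's actual lists:

```python
# Stand-in allow lists: the real ones would be built from
# https://blueoakcouncil.org/list and an inventory of OSI-approved licenses.
BLUE_OAK_RATED = {"MIT", "Apache-2.0", "BSD-3-Clause", "CC0-1.0"}
OSI_APPROVED = {"MIT", "Apache-2.0", "BSD-3-Clause", "GPL-3.0-only"}


def all_licenses_allowed(licenses):
    """Keep a dataset only if every associated license is allowed."""
    return all(lic in BLUE_OAK_RATED | OSI_APPROVED for lic in licenses)
```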

The specific filter settings are here: https://github.com/Data-Provenance-Initiative/Data-Provenance-Collection/blob/main/src/configs/pile_v2_test.yaml

Here is the process to download the data, from inside the `data_provenance` dir:

1. Run `python download.py --include include.csv`
2. Run `python to-dolma.py --include include.csv`
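
For reference, `download.py` (below) reads `include.csv` expecting `Collection` and `Dataset ID` columns. A hypothetical example (the dataset ids are illustrative, not copied from the real file):

```
Collection,Dataset ID
Dolly 15k,databricks/databricks-dolly-15k
Open Assistant,OpenAssistant/oasst1
```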
@@ -0,0 +1,23 @@
HF_MAPPING = {
    "CommitPackFT": "commitpack_ft",
    "Dolly 15k": "dolly_15k",
    "Open Assistant v2": "open_assistant_v2",
    "Open Assistant OctoPack": "octopack_oa",
    "Open Assistant": "open_assistant",
    "OIG": "oig",
    "Anthropic HH-RLHF": "rlhf_anthropic_hh",
    "Flan Collection (Super-NaturalInstructions)": "flan_sni",
    "Flan Collection (P3)": "flan_p3",
    "Flan Collection (Flan 2021)": "flan_2021",
    "Tasksource Symbol-Tuning": "tasksource_symboltuning",
    "Tasksource Instruct": "tasksource_instruct",
    "Flan Collection (Chain-of-Thought)": "flan_cot",
    "HelpSteer": "helpsteer",
    "Aya Dataset": "aya_dataset",
    "AgentInstruct": "agentinstruct",
    "xP3x": "xp3x",
    "Flan Collection (Dialog)": "flan_dialog",
    "Joke Explanation": "joke_explanation",
    "StarCoder Self-Instruct": "starcoder_selfinstruct",
    "DialogStudio": "dialogstudio",
}
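
A short sketch of how this mapping is consumed (mirroring the loop in `download.py` below): `Collection` names from `include.csv` resolve to folder names inside the HuggingFace repo:

```python
from constants import HF_MAPPING

folder = HF_MAPPING["Dolly 15k"]       # -> "dolly_15k"
data_files = f"data/{folder}/*.jsonl"  # the glob handed to datasets.load_dataset
```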
@@ -0,0 +1,80 @@
"""Download Data Provenance Initiative data."""

import argparse
import gzip
import json
import logging
import multiprocessing
import os
import tarfile
import typing
from collections import defaultdict

import jsonlines
import pandas as pd
from constants import HF_MAPPING
from datasets import load_dataset
from tqdm.auto import tqdm

from licensed_pile.logs import configure_logging, get_logger

def parse_args():
    parser = argparse.ArgumentParser(description="Data Provenance Data Downloader")
    parser.add_argument(
        "--hf",
        default="DataProvenanceInitiative/Ultra_Permissive_Test",
        help="The label for the HuggingFace dataset that can be used in HuggingFace's load_dataset()",
    )
    parser.add_argument(
        "--include",
        default="include.csv",
        help="Path to csv file with `Collection Name, Dataset ID` we will include",
    )
    parser.add_argument(
        "--outdir", default="data/raw-data-provenance", help="Path to output directory"
    )
    return parser.parse_args()
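
# Usage sketch with the flags defined above (the values shown are the defaults):
#   python download.py --hf DataProvenanceInitiative/Ultra_Permissive_Test \
#       --include include.csv --outdir data/raw-data-provenance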


def write_jsonl_gz(
    data,
    outpath,
):
    dirname = os.path.dirname(outpath)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    with gzip.open(outpath, "wb") as fp:  # Open file in binary write mode
        data_bytes = (
            b"\n".join(json.dumps(d).encode() for d in data) + b"\n"
        )  # Encode strings to bytes
        fp.write(data_bytes)
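
# Sketch of reading one of these files back (the path is an example produced
# by main() below):
#   with gzip.open("data/raw-data-provenance/dolly_15k.jsonl.gz", "rt") as fp:
#       examples = [json.loads(line) for line in fp if line.strip()]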


def main(args):
    logger = get_logger()
    logger.info(f"Filtering to just the datasets in {args.include}")

    include_df = pd.read_csv(args.include)
    include_collections = list(set(include_df["Collection"]))
    include_dset_ids = set(include_df["Dataset ID"])

    for collection in include_collections:
        folder_name = HF_MAPPING[collection]
        subset = load_dataset(
            args.hf,
            split="train",
            num_proc=os.cpu_count(),
            revision="main",
            data_files=f"data/{folder_name}/*.jsonl",
        ).to_list()
        exs = [ex for ex in subset if ex["dataset"] in include_dset_ids]
        savepath = os.path.join(args.outdir, f"{folder_name}.jsonl.gz")
        write_jsonl_gz(exs, savepath)
        logger.info(f"Saved {len(exs)} examples to {savepath}")


if __name__ == "__main__":
    args = parse_args()
    configure_logging()
    main(args)