From 28e210b7c7776844745b0e77112f65b2e60862d3 Mon Sep 17 00:00:00 2001 From: shayne-longpre <69018523+shayne-longpre@users.noreply.github.com> Date: Thu, 23 May 2024 15:48:06 -0400 Subject: [PATCH] Data Provenance data (#61) Adding Data Provenance data and new licenses --- CONTRIBUTING.md | 10 +- courtlistener/process_csv_file.sh | 2 +- data_provenance/.gitignore | 1 + data_provenance/README.md | 20 ++ data_provenance/constants.py | 23 ++ data_provenance/download.py | 80 +++++++ data_provenance/include.csv | 329 ++++++++++++++++++++++++++ data_provenance/include_test.csv | 9 + data_provenance/source_allow_list.txt | 23 ++ data_provenance/to-dolma.py | 158 +++++++++++++ licensed_pile/licenses.py | 14 +- pyproject.toml | 1 - requirements.txt | 5 +- 13 files changed, 666 insertions(+), 9 deletions(-) create mode 100644 data_provenance/.gitignore create mode 100644 data_provenance/README.md create mode 100644 data_provenance/constants.py create mode 100644 data_provenance/download.py create mode 100644 data_provenance/include.csv create mode 100644 data_provenance/include_test.csv create mode 100644 data_provenance/source_allow_list.txt create mode 100644 data_provenance/to-dolma.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 92a3bfa..f143097 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -23,7 +23,7 @@ The current list of the permissive licenses allowed by this project is below and - [MIT License](https://opensource.org/license/mit/) - [BSD License](https://opensource.org/license/bsd-2-clause/) -This list contains some of the common permissive licenses that cover many large data sources, but we intend to expand this list as we continue to collect data. If you come across a source with a license that you believe should be on this list, feel free to comment in our [Allowable License Meta-Issue](https://github.com/r-three/licensed-pile/issues/34). 
+This list contains some of the common permissive licenses that cover many large data sources, but we intend to expand this list as we continue to collect data. If you come across a source with a license that you believe should be on this list, feel free to comment in our [Allowable License Meta-Issue](https://github.com/r-three/licensed-pile/issues/34). ### Finding License Information @@ -47,14 +47,14 @@ License information can sometimes be difficult to find for certain text sources 5. An "about" page can include licensing information for the website as a whole. -## Contributing Data Collection Code +## Contributing Data Collection Code Once you have selected a source from the list of [Issues](https://github.com/r-three/licensed-pile/issues), add a comment that you plan to work on it and an adim will assign the issue to you. Then, you can follow these guidelines for how to get started with contributing to the repo: 1. Clone the repo 2. Run `pip install -r requirements.txt` - + 3. Create a subdirectory for your data source (e.g., the `licensed-pile/gutenberg` directory for the Project Gutenberg data source). 4. Identify the best way to collect the raw data @@ -67,11 +67,11 @@ Once you have selected a source from the list of [Issues](https://github.com/r-t 5. If necessary, write code to filter the downloaded items down to only those with appropriate licenses. -6. Write code that outputs the resulting data to `licensed-pile/data/{SOURCE}/v0` +6. Write code that outputs the resulting data to `licensed-pile/data/{SOURCE}/v0` > The data format used in this project is [Dolma](https://github.com/allenai/dolma). To write out the resulting data as a Dolma dataset, convert each record in the dataset to a python dictionary and use the utilities in `licensed-pile/licensed_pile/write.py` to convert the list of dictionaries to a Dolma dataset. 
In cases where the dataset is very large, it is better to define a record generator rather than a list and pass the generator to the Dolma utility functions. -> Each record should minimally have the following keys: +> Each record should minimally have the following keys: ```json { "id": , diff --git a/courtlistener/process_csv_file.sh b/courtlistener/process_csv_file.sh index 19ddfec..aab909d 100644 --- a/courtlistener/process_csv_file.sh +++ b/courtlistener/process_csv_file.sh @@ -1,3 +1,3 @@ #!/usr/bin/env sh set -e -python courtlistener/process_csv.py \ No newline at end of file +python courtlistener/process_csv.py diff --git a/data_provenance/.gitignore b/data_provenance/.gitignore new file mode 100644 index 0000000..60baa9c --- /dev/null +++ b/data_provenance/.gitignore @@ -0,0 +1 @@ +data/* diff --git a/data_provenance/README.md b/data_provenance/README.md new file mode 100644 index 0000000..989df39 --- /dev/null +++ b/data_provenance/README.md @@ -0,0 +1,20 @@ +# Processing scripts for Data Provenance data + +The [Data Provenance Initiative](https://www.dataprovenance.org) is a digital library for supervised datasets that have been manually annotated with their source and license information. It wraps HuggingFace datasets with extra metadata, and provides code to download, standardize and filter for various criteria. + +In this case, we have filtered for the following criteria: +* English language or code data +* No model generated text +* Datasets have a commercially viable license, found through the Data Provenance Initiative or the hosting GitHub repository +* We only include datasets where all associated licenses (from the Data Provenance Initiative and GitHub) are open source compliant or appear in the Gold, Silver or Bronze lists of the Blue Oak Council (https://blueoakcouncil.org/list). 
+* The original source(s) of the text are only from the list of sources in `source_allow_list.txt` +* We only include datasets where the relevant license sources are thoroughly documented and linked. + +The specific filter settings are here: https://github.com/Data-Provenance-Initiative/Data-Provenance-Collection/blob/main/src/configs/pile_v2_test.yaml + + +Here is the process to download the data, from inside the `data_provenance` dir: + +1. Run `python download.py --include include.csv` + +2. Run `python to-dolma.py --include include.csv` diff --git a/data_provenance/constants.py b/data_provenance/constants.py new file mode 100644 index 0000000..2dc3e93 --- /dev/null +++ b/data_provenance/constants.py @@ -0,0 +1,23 @@ +HF_MAPPING = { + "CommitPackFT": "commitpack_ft", + "Dolly 15k": "dolly_15k", + "Open Assistant v2": "open_assistant_v2", + "Open Assistant OctoPack": "octopack_oa", + "Open Assistant": "open_assistant", + "OIG": "oig", + "Anthropic HH-RLHF": "rlhf_anthropic_hh", + "Flan Collection (Super-NaturalInstructions)": "flan_sni", + "Flan Collection (P3)": "flan_p3", + "Flan Collection (Flan 2021)": "flan_2021", + "Tasksource Symbol-Tuning": "tasksource_symboltuning", + "Tasksource Instruct": "tasksource_instruct", + "Flan Collection (Chain-of-Thought)": "flan_cot", + "HelpSteer": "helpsteer", + "Aya Dataset": "aya_dataset", + "AgentInstruct": "agentinstruct", + "xP3x": "xp3x", + "Flan Collection (Dialog)": "flan_dialog", + "Joke Explanation": "joke_explanation", + "StarCoder Self-Instruct": "starcoder_selfinstruct", + "DialogStudio": "dialogstudio", +} diff --git a/data_provenance/download.py b/data_provenance/download.py new file mode 100644 index 0000000..c2366e3 --- /dev/null +++ b/data_provenance/download.py @@ -0,0 +1,80 @@ +"""Download Data Provenance Initiative data""" + +import argparse +import gzip +import json +import logging +import multiprocessing +import os +import tarfile +import typing +from collections import defaultdict + +import jsonlines 
+import pandas as pd +from constants import HF_MAPPING +from datasets import load_dataset +from tqdm.auto import tqdm + +from licensed_pile.logs import configure_logging, get_logger + + +def parse_args(): + parser = argparse.ArgumentParser(description="Data Provenance Data Downloader") + parser.add_argument( + "--hf", + default="DataProvenanceInitiative/Ultra_Permissive_Test", + help="The label for the HuggingFace dataset that can be used in HuggingFace's load_dataset()", + ) + parser.add_argument( + "--include", + default="include.csv", + help="Path to csv file with `Collection Name, Dataset ID` we will include", + ) + parser.add_argument( + "--outdir", default="data/raw-data-provenance", help="Path to output directory" + ) + return parser.parse_args() + + +def write_jsonl_gz( + data, + outpath, +): + dirname = os.path.dirname(outpath) + if dirname: + os.makedirs(dirname, exist_ok=True) + with gzip.open(outpath, "wb") as fp: # Open file in binary write mode + data_bytes = ( + b"\n".join(json.dumps(d).encode() for d in data) + b"\n" + ) # Encode strings to bytes + fp.write(data_bytes) + + +def main(args): + logger = get_logger() + logger.info(f"Filtering to just the datasets in {args.include}") + + include_df = pd.read_csv(args.include) + include_collections = list(set(include_df["Collection"])) + include_dset_ids = set(include_df["Dataset ID"]) + + for collection in include_collections: + folder_name = HF_MAPPING[collection] + subset = load_dataset( + args.hf, + split="train", + num_proc=os.cpu_count(), + revision="main", + data_files=f"data/{folder_name}/*.jsonl", + ).to_list() + exs = [ex for ex in subset if ex["dataset"] in include_dset_ids] + savepath = os.path.join(args.outdir, f"{folder_name}.jsonl.gz") + write_jsonl_gz(exs, savepath) + logger.info(f"Saving {len(exs)} examples to {savepath}") + + +if __name__ == "__main__": + args = parse_args() + configure_logging() + main(args) diff --git a/data_provenance/include.csv b/data_provenance/include.csv new file 
mode 100644 index 0000000..5815ca4 --- /dev/null +++ b/data_provenance/include.csv @@ -0,0 +1,329 @@ +Dataset ID,Collection,Dataset Name,Languages,Text Sources,Model Generated,Derived from Datasets,License Use (DataProvenance),License Use (GitHub),Licenses,GitHub License,Dataset URL,GitHub URL,ArXiv URL +ds-ABCD,DialogStudio,ABCD,['English'],['human'],[],[],commercial,,"[{'License': 'MIT License', 'License URL': 'https://github.com/asappresearch/abcd/blob/master/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/asappresearch/abcd,https://github.com/asappresearch/abcd,https://aclanthology.org/2021.naacl-main.239 +ds-AirDialogue,DialogStudio,AirDialogue,['English'],['human'],[],[],commercial,,"[{'License': 'Apache License 2.0', 'License URL': 'https://github.com/google/airdialogue/blob/master/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/google/airdialogue,https://github.com/google/airdialogue,https://aclanthology.org/D18-1419 +ds-BiTOD,DialogStudio,BiTOD,"['English', 'Chinese']",['human'],[],[],commercial,,"[{'License': 'Apache License 2.0', 'License URL': 'https://github.com/HLTCHKUST/BiToD/blob/main/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/HLTCHKUST/BiToD,https://github.com/HLTCHKUST/BiToD, +ds-CaSiNo,DialogStudio,CaSiNo,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://github.com/kushalchawla/CaSiNo/blob/main/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/kushalchawla/CaSiNo,https://github.com/kushalchawla/CaSiNo,https://aclanthology.org/2021.naacl-main.254.pdf 
+ds-CraigslistBargains,DialogStudio,CraigslistBargains,['English'],['human'],[],[],commercial,,"[{'License': 'MIT License', 'License URL': 'https://github.com/stanfordnlp/cocoa/blob/master/LICENSE.md'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/stanfordnlp/cocoa/tree/master/craigslistbargain,https://github.com/stanfordnlp/cocoa/tree/master/craigslistbargain,https://arxiv.org/abs/1808.09637 +ds-Disambiguation,DialogStudio,Disambiguation,['English'],['human'],[],[],commercial,,"[{'License': 'MIT License', 'License URL': 'https://github.com/qbetterk/ParlAI/blob/disambiguation/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/qbetterk/ParlAI/tree/disambiguation,https://github.com/qbetterk/ParlAI/tree/disambiguation,https://aclanthology.org/2022.naacl-main.85.pdf +ds-GECOR,DialogStudio,GECOR,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://creativecommons.org/licenses/by/4.0/'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://multinlp.github.io/GECOR/,,https://aclanthology.org/D19-1462/ +ds-HDSA-Dialog,DialogStudio,HDSA-Dialog,['English'],['human'],[],[],commercial,,"[{'License': 'MIT License', 'License URL': 'https://github.com/wenhuchen/HDSA-Dialog/blob/master/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/wenhuchen/HDSA-Dialog,https://github.com/wenhuchen/HDSA-Dialog,https://arxiv.org/abs/1905.12866 +ds-KETOD,DialogStudio,KETOD,['English'],['human'],[],[],commercial,,"[{'License': 'MIT License', 'License URL': 'https://github.com/facebookresearch/ketod/blob/main/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 
'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/facebookresearch/ketod,https://github.com/facebookresearch/ketod,https://arxiv.org/abs/2205.05589 +ds-MULTIWOZ2_2,DialogStudio,MULTIWOZ2_2,['English'],['human'],[],[],commercial,,"[{'License': 'MIT License', 'License URL': 'https://github.com/budzianowski/multiwoz/blob/master/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/budzianowski/multiwoz,https://github.com/budzianowski/multiwoz,https://arxiv.org/abs/2007.12720 +ds-MulDoGO,DialogStudio,MulDoGO,['English'],['human'],[],[],commercial,,"[{'License': 'CDLA Permissive 1.0', 'License URL': 'https://github.com/awslabs/multi-domain-goal-oriented-dialogues-dataset/blob/master/LICENSE.txt'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/awslabs/multi-domain-goal-oriented-dialogues-dataset,https://github.com/awslabs/multi-domain-goal-oriented-dialogues-dataset,https://aclanthology.org/D19-1460.pdf +ds-MultiWOZ_2.1,DialogStudio,MultiWOZ_2.1,['English'],['human'],[],[],commercial,,"[{'License': 'MIT License', 'License URL': 'https://github.com/budzianowski/multiwoz/blob/master/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/budzianowski/multiwoz,https://github.com/budzianowski/multiwoz,https://arxiv.org/abs/1907.01669 +ds-SGD,DialogStudio,SGD,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/google-research-datasets/dstc8-schema-guided-dialogue/blob/master/LICENSE.txt'}, {'License': 'Apache License 2.0', 'License URL': 
'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/google-research-datasets/dstc8-schema-guided-dialogue,https://github.com/google-research-datasets/dstc8-schema-guided-dialogue,https://arxiv.org/pdf/1909.05855.pdf +ds-STAR,DialogStudio,STAR,['English'],['human'],[],[],commercial,,"[{'License': 'MIT License', 'License URL': 'https://github.com/RasaHQ/STAR/blob/master/LICENSE.txt'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/RasaHQ/STAR,https://github.com/RasaHQ/STAR,https://arxiv.org/abs/2010.11853 +ds-Taskmaster1,DialogStudio,Taskmaster1,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://creativecommons.org/licenses/by/4.0/'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/google-research-datasets/Taskmaster,https://github.com/google-research-datasets/Taskmaster,https://arxiv.org/abs/1909.05358 +ds-Taskmaster2,DialogStudio,Taskmaster2,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://creativecommons.org/licenses/by/4.0/'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/google-research-datasets/Taskmaster/tree/master/TM-2-2020,https://github.com/google-research-datasets/Taskmaster/tree/master/TM-2-2020, +ds-Taskmaster3,DialogStudio,Taskmaster3,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://creativecommons.org/licenses/by/4.0/'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/google-research-datasets/Taskmaster/tree/master/TM-3-2020,https://github.com/google-research-datasets/Taskmaster/tree/master/TM-3-2020, 
+ds-WOZ2_0,DialogStudio,WOZ2_0,['English'],['human'],[],[],commercial,,"[{'License': 'Apache License 2.0', 'License URL': 'https://github.com/nmrksic/neural-belief-tracker/blob/master/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/nmrksic/neural-belief-tracker,https://github.com/nmrksic/neural-belief-tracker,https://arxiv.org/abs/1805.11350 +ds-AMI,DialogStudio,AMI,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://groups.inf.ed.ac.uk/ami/'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://groups.inf.ed.ac.uk/ami/,https://github.com/guokan-shang/ami-and-icsi-corpora,https://aclanthology.org/P18-1062/ +ds-CRD3,DialogStudio,CRD3,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/RevanthRameshkumar/CRD3/blob/master/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/RevanthRameshkumar/CRD3,https://github.com/RevanthRameshkumar/CRD3,https://www.aclweb.org/anthology/2020.acl-main.459/ +ds-ConvoSumm,DialogStudio,ConvoSumm,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/Yale-LILY/ConvoSumm/blob/master/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/Yale-LILY/ConvoSumm,https://github.com/Yale-LILY/ConvoSumm,https://aclanthology.org/2021.acl-long.535/ +ds-DialogSum,DialogStudio,DialogSum,['English'],['human'],[],[],commercial,,"[{'License': 'MIT License', 'License URL': 'https://github.com/cylnlp/dialogsum/blob/main/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 
'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/cylnlp/dialogsum?tab=readme-ov-file,https://github.com/cylnlp/dialogsum?tab=readme-ov-file,https://aclanthology.org/2021.findings-acl.449/ +ds-ICSI,DialogStudio,ICSI,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://groups.inf.ed.ac.uk/ami/'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/guokan-shang/ami-and-icsi-corpora,https://github.com/guokan-shang/ami-and-icsi-corpora,https://ieeexplore.ieee.org/abstract/document/1198793 +ds-QMSum,DialogStudio,QMSum,['English'],['human'],[],[],commercial,,"[{'License': 'MIT License', 'License URL': 'https://github.com/Yale-LILY/QMSum/blob/main/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/Yale-LILY/QMSum,https://github.com/Yale-LILY/QMSum,https://arxiv.org/abs/2104.05938 +ds-TweetSumm,DialogStudio,TweetSumm,['English'],['human'],[],[],commercial,,"[{'License': 'CC0 1.0', 'License URL': 'https://github.com/guyfe/Tweetsumm/blob/main/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/guyfe/Tweetsumm,https://github.com/guyfe/Tweetsumm,https://arxiv.org/abs/2111.11894 +ds-AntiScam,DialogStudio,AntiScam,['English'],['human'],[],[],commercial,,"[{'License': 'CC0 1.0', 'License URL': 'https://gitlab.com/ucdavisnlp/antiscam'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://gitlab.com/ucdavisnlp/antiscam,https://gitlab.com/ucdavisnlp/antiscam,https://arxiv.org/abs/1911.10742 +ds-ConvAI2,DialogStudio,ConvAI2,['English'],['human'],[],[],commercial,,"[{'License': 'Apache License 2.0', 'License URL': 
'https://github.com/DeepPavlovAdmin/convai/tree/master/2017'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/DeepPavlovAdmin/convai/tree/master/2017,https://github.com/DeepPavlovAdmin/convai/tree/master/2017,https://arxiv.org/abs/1902.00098 +ds-HH-RLHF,DialogStudio,HH-RLHF,['English'],['human'],[],[],commercial,,"[{'License': 'MIT License', 'License URL': 'https://huggingface.co/datasets/Anthropic/hh-rlhf'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://huggingface.co/datasets/Anthropic/hh-rlhf,,https://arxiv.org/abs/2204.05862 +ds-Prosocial,DialogStudio,Prosocial,['English'],['human'],[],[],commercial,,"[{'License': 'MIT License', 'License URL': 'https://github.com/skywalker023/prosocial-dialog/blob/main/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/skywalker023/prosocial-dialog,https://github.com/skywalker023/prosocial-dialog,https://arxiv.org/abs/2205.12688 +ds-chitchat-dataset,DialogStudio,chitchat-dataset,['English'],['human'],[],[],commercial,,"[{'License': 'MIT License', 'License URL': 'https://github.com/BYU-PCCL/chitchat-dataset'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/BYU-PCCL/chitchat-dataset,https://github.com/BYU-PCCL/chitchat-dataset, +ds-CoQA,DialogStudio,CoQA,['English'],['human'],[],[],commercial,,"[{'License': 'MIT License', 'License URL': 'https://github.com/stanfordnlp/coqa-baselines/blob/master/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/stanfordnlp/coqa-baselines,https://github.com/stanfordnlp/coqa-baselines,https://arxiv.org/abs/1808.07042 
+ds-CoSQL,DialogStudio,CoSQL,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://creativecommons.org/licenses/by-sa/4.0/legalcode'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://yale-lily.github.io/cosql,,https://arxiv.org/abs/1909.05378 +ds-DART,DialogStudio,DART,['English'],['human'],[],[],commercial,,"[{'License': 'MIT License', 'License URL': 'https://github.com/Yale-LILY/dart/blob/master/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/Yale-LILY/dart,https://github.com/Yale-LILY/dart,https://aclanthology.org/2021.naacl-main.37/ +ds-FeTaQA,DialogStudio,FeTaQA,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/Yale-LILY/FeTaQA/blob/main/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/Yale-LILY/FeTaQA,https://github.com/Yale-LILY/FeTaQA,https://aclanthology.org/2022.tacl-1.3/ +ds-GrailQA,DialogStudio,GrailQA,['English'],['human'],[],[],commercial,,"[{'License': 'Apache License 2.0', 'License URL': 'https://github.com/dki-lab/GrailQA/blob/main/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://dki-lab.github.io/GrailQA/,https://github.com/dki-lab/GrailQA,https://arxiv.org/abs/2011.07743 +ds-HybridQA,DialogStudio,HybridQA,['English'],['human'],[],[],commercial,,"[{'License': 'MIT License', 'License URL': 'https://github.com/wenhuchen/HybridQA/blob/master/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://hybridqa.github.io/,https://github.com/wenhuchen/HybridQA,https://aclanthology.org/2020.findings-emnlp.91/ 
+ds-MTOP,DialogStudio,MTOP,"['English', 'Italian', 'Japanese']",['human'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/awslabs/multilingual-top/blob/main/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/awslabs/multilingual-top,https://github.com/awslabs/multilingual-top,https://aclanthology.org/2021.eacl-main.257/ +ds-SParC,DialogStudio,SParC,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://creativecommons.org/licenses/by-sa/4.0/legalcode'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://yale-lily.github.io/sparc,,https://aclanthology.org/P19-1443/ +ds-SQA,DialogStudio,SQA,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://creativecommons.org/licenses/by-sa/4.0/legalcode'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://www.microsoft.com/en-us/download/details.aspx?id=54253,,https://aclanthology.org/P17-1167/ +ds-Spider,DialogStudio,Spider,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://creativecommons.org/licenses/by-sa/4.0/legalcode'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/taoyds/spider,https://github.com/taoyds/spider,https://arxiv.org/abs/1809.08887 +ds-ToTTo,DialogStudio,ToTTo,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://creativecommons.org/licenses/by-sa/3.0/'}, {'License': 'Apache License 2.0', 'License URL': 
'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/google-research-datasets/ToTTo,https://github.com/google-research-datasets/ToTTo,https://aclanthology.org/2020.emnlp-main.89/ +ds-WebQSP,DialogStudio,WebQSP,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://creativecommons.org/licenses/by/4.0/'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://www.microsoft.com/en-us/research/publication/the-value-of-semantic-parse-labeling-for-knowledge-base-question-answering-2/,,https://aclanthology.org/P16-2033/ +ds-WikiSQL,DialogStudio,WikiSQL,['English'],['human'],[],[],commercial,,"[{'License': 'BSD 3-Clause License', 'License URL': 'https://github.com/salesforce/WikiSQL/blob/master/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/salesforce/WikiSQL,https://github.com/salesforce/WikiSQL,https://arxiv.org/abs/1709.00103 +ds-WikiTQ,DialogStudio,WikiTQ,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/ppasupat/WikiTableQuestions/blob/master/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/ppasupat/WikiTableQuestions,https://github.com/ppasupat/WikiTableQuestions,https://aclanthology.org/P15-1142/ +ds-wizard_of_internet,DialogStudio,wizard_of_internet,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://creativecommons.org/licenses/by/4.0/'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://parl.ai/projects/sea/,,https://aclanthology.org/2022.acl-long.579/ 
+ds-wizard_of_wikipedia,DialogStudio,wizard_of_wikipedia,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://creativecommons.org/licenses/by/4.0/'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://parl.ai/projects/wizard_of_wikipedia/,,https://arxiv.org/abs/1811.01241 +ds-ATIS,DialogStudio,ATIS,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://github.com/PolyAI-LDN/task-specific-datasets/blob/master/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/PolyAI-LDN/task-specific-datasets,https://github.com/PolyAI-LDN/task-specific-datasets,https://aclanthology.org/H90-1021/ +ds-ATIS-NER,DialogStudio,ATIS-NER,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://github.com/PolyAI-LDN/task-specific-datasets/blob/master/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/PolyAI-LDN/task-specific-datasets,https://github.com/PolyAI-LDN/task-specific-datasets,https://aclanthology.org/H90-1021/ +ds-BANKING77,DialogStudio,BANKING77,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://github.com/PolyAI-LDN/task-specific-datasets/commit/3bf93de788b9362c34d06cba1d271bc23dd1bcb6'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/PolyAI-LDN/task-specific-datasets,https://github.com/PolyAI-LDN/task-specific-datasets,https://aclanthology.org/2020.nlp4convai-1.5/?ref=https://githubhelp.com +ds-BANKING77-OOS,DialogStudio,BANKING77-OOS,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 
'https://github.com/PolyAI-LDN/task-specific-datasets/commit/3bf93de788b9362c34d06cba1d271bc23dd1bcb6'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/PolyAI-LDN/task-specific-datasets,https://github.com/PolyAI-LDN/task-specific-datasets,https://aclanthology.org/2020.nlp4convai-1.5/?ref=https://githubhelp.com +ds-CLINC-Single-Domain-OOS-banking,DialogStudio,CLINC-Single-Domain-OOS-banking,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY 3.0', 'License URL': 'https://github.com/jianguoz/Few-Shot-Intent-Detection'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/jianguoz/Few-Shot-Intent-Detection,https://github.com/jianguoz/Few-Shot-Intent-Detection,https://arxiv.org/abs/2106.04564 +ds-CLINC-Single-Domain-OOS-credit_cards,DialogStudio,CLINC-Single-Domain-OOS-credit_cards,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY 3.0', 'License URL': 'https://github.com/jianguoz/Few-Shot-Intent-Detection'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/jianguoz/Few-Shot-Intent-Detection,https://github.com/jianguoz/Few-Shot-Intent-Detection,https://arxiv.org/abs/2106.04564 +ds-CLINC150,DialogStudio,CLINC150,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://github.com/clinc/oos-eval/blob/master/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/clinc/oos-eval,https://github.com/clinc/oos-eval,https://aclanthology.org/D19-1131/ +ds-DSTC8-SGD,DialogStudio,DSTC8-SGD,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 
'https://github.com/google-research-datasets/dstc8-schema-guided-dialogue/blob/master/LICENSE.txt'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/google-research-datasets/dstc8-schema-guided-dialogue,https://github.com/google-research-datasets/dstc8-schema-guided-dialogue,https://arxiv.org/abs/1909.05855 +ds-HWU64,DialogStudio,HWU64,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://github.com/alexa/dialoglue?tab=readme-ov-file'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/alexa/dialoglue,https://github.com/alexa/dialoglue,https://arxiv.org/abs/1903.05566 +ds-RESTAURANTS8K,DialogStudio,RESTAURANTS8K,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://github.com/PolyAI-LDN/task-specific-datasets/commit/3bf93de788b9362c34d06cba1d271bc23dd1bcb6'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/PolyAI-LDN/task-specific-datasets,https://github.com/PolyAI-LDN/task-specific-datasets,https://arxiv.org/abs/2005.08866 +ds-SNIPS,DialogStudio,SNIPS,['English'],['human'],[],[],commercial,,"[{'License': 'Apache License 2.0', 'License URL': 'https://github.com/snipsco/snips-nlu/blob/master/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/snipsco/snips-nlu,https://github.com/snipsco/snips-nlu,https://arxiv.org/abs/1805.10190 +ds-SNIPS-NER,DialogStudio,SNIPS-NER,['English'],['human'],[],[],commercial,,"[{'License': 'Apache License 2.0', 'License URL': 'https://github.com/snipsco/snips-nlu/blob/master/LICENSE'}, {'License': 'Apache License 2.0', 'License URL': 
'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/snipsco/snips-nlu,https://github.com/snipsco/snips-nlu,https://arxiv.org/abs/1805.10190 +ds-TOP,DialogStudio,TOP,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY-SA', 'License URL': 'https://github.com/alexa/dialoglue'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/alexa/dialoglue,https://github.com/alexa/dialoglue,https://arxiv.org/abs/1810.07942 +ds-TOP-NER,DialogStudio,TOP-NER,['English'],['human'],[],[],commercial,,"[{'License': 'CC BY-SA', 'License URL': 'https://github.com/alexa/dialoglue'}, {'License': 'Apache License 2.0', 'License URL': 'https://github.com/salesforce/DialogStudio/blob/main/LICENSE.txt'}]",,https://github.com/alexa/dialoglue,https://github.com/alexa/dialoglue,https://arxiv.org/abs/1810.07942 +oasst2-en,Open Assistant v2,oasst2,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'Apache License 2.0', 'License URL': 'https://huggingface.co/datasets/OpenAssistant/oasst2'}]",,https://github.com/LAION-AI/Open-Assistant,https://github.com/LAION-AI/Open-Assistant,https://arxiv.org/abs/2304.07327 +xp3x-xquad-eng_latn,xP3x,xquad,['English'],['wikipedia.org'],[],['Squad'],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://creativecommons.org/licenses/by-sa/4.0/legalcode'}]",,https://github.com/deepmind/xquad,https://github.com/deepmind/xquad,https://arxiv.org/abs/1910.11856 +xp3x-mlqa-eng_latn,xP3x,mlqa,['English'],['wikipedia.org'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://creativecommons.org/licenses/by-sa/4.0/legalcode'}]",,https://github.com/facebookresearch/MLQA,https://github.com/facebookresearch/MLQA,https://arxiv.org/abs/1910.07475 +xp3x-tydiqa_primary-eng_latn,xP3x,tydiqa-primary,['English'],['wikipedia.org'],[],[],commercial,commercial,"[{'License': 'CC BY-SA 3.0', 'License URL': 
'https://en.wikipedia.org/wiki/Wikipedia:Reusing_Wikipedia_content'}]",Apache License 2.0,https://github.com/google-research-datasets/tydiqa,https://github.com/google-research-datasets/tydiqa,https://arxiv.org/abs/2003.05002 +xp3x-tydiqa_goldp-eng_latn,xP3x,tydiqa-goldp,['English'],['wikipedia.org'],[],[],commercial,commercial,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://en.wikipedia.org/wiki/Wikipedia:Reusing_Wikipedia_content'}]",Apache License 2.0,https://github.com/google-research-datasets/tydiqa,https://github.com/google-research-datasets/tydiqa,https://arxiv.org/abs/2003.05002 +xp3x-mbpp-python,xP3x,mbpp,['Python'],[],[],[],commercial,commercial,"[{'License': 'CC BY 4.0', 'License URL': 'https://www.tensorflow.org/datasets/community_catalog/huggingface/mbpp'}]",Apache License 2.0,https://github.com/google-research/google-research/tree/master/mbpp,https://github.com/google-research/google-research/tree/master/mbpp, +xp3x-codecomplex-java,xP3x,codecomplex,['Java'],[],[],[],commercial,commercial,"[{'License': 'CC BY 4.0', 'License URL': 'https://creativecommons.org/licenses/by/4.0/'}]",Apache License 2.0,https://github.com/yonsei-toc/CodeComple,https://github.com/yonsei-toc/CodeComple, +xp3x-flores-eng_latn,xP3x,flores,['English'],['wikipedia.org'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://creativecommons.org/licenses/by-sa/4.0/legalcode'}]",,https://github.com/facebookresearch/flores,https://github.com/facebookresearch/flores/,https://arxiv.org/abs/2106.03193 +dolly-openqa,Dolly 15k,dolly-open_qa,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://huggingface.co/datasets/databricks/databricks-dolly-15k#summary'}]",,https://huggingface.co/datasets/databricks/databricks-dolly-15k,, +dolly-closedqa,Dolly 15k,dolly-closed_qa,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 
'https://huggingface.co/datasets/databricks/databricks-dolly-15k#summary'}]",,https://huggingface.co/datasets/databricks/databricks-dolly-15k,, +dolly-classification,Dolly 15k,dolly-classification,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://huggingface.co/datasets/databricks/databricks-dolly-15k#summary'}]",,https://huggingface.co/datasets/databricks/databricks-dolly-15k,, +dolly-brainstorming,Dolly 15k,dolly-brainstorming,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://huggingface.co/datasets/databricks/databricks-dolly-15k#summary'}]",,https://huggingface.co/datasets/databricks/databricks-dolly-15k,, +dolly-infoextract,Dolly 15k,dolly-infoextract,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://huggingface.co/datasets/databricks/databricks-dolly-15k#summary'}]",,https://huggingface.co/datasets/databricks/databricks-dolly-15k,, +dolly-summarization,Dolly 15k,dolly-summarization,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://huggingface.co/datasets/databricks/databricks-dolly-15k#summary'}]",,https://huggingface.co/datasets/databricks/databricks-dolly-15k,, +dolly-creative_writing,Dolly 15k,dolly-creative_writing,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://huggingface.co/datasets/databricks/databricks-dolly-15k#summary'}]",,https://huggingface.co/datasets/databricks/databricks-dolly-15k,, +aya-english,Aya Dataset,aya-english,['English'],"['crowdsourced', 'human', 'volunteer annotations']",[],['xP3'],commercial,,"[{'License': 'Apache License 2.0', 'License URL': 'https://huggingface.co/datasets/CohereForAI/aya_dataset#licensing-information'}, {'License': 'Apache License 2.0', 'License URL': 
'https://huggingface.co/datasets/bigscience/xP3#licensing-information'}]",,https://huggingface.co/datasets/CohereForAI/aya_dataset/viewer/default/train?f[language][value]=%27English%27,,https://arxiv.org/abs/2402.06619 +AgentInstruct-alfworld,AgentInstruct,ALFworld,['English'],['crowdsourced'],[],['ALFRED'],commercial,,"[{'License': 'MIT License', 'License URL': 'https://github.com/alfworld/alfworld'}, {'License': 'MIT License', 'License URL': 'https://github.com/askforalfred/alfred/blob/master/LICENSE'}, {'License': 'MIT License', 'License URL': 'https://github.com/microsoft/TextWorld/blob/main/LICENSE.txt'}]",,https://github.com/alfworld/alfworld/tree/master/alfworld/data,https://github.com/THUDM/AgentTuning,https://arxiv.org/abs/2010.03768 +oasst-en-octopack,Open Assistant OctoPack,oasst,"['English', 'Spanish', 'Russian', 'Mandarin Chinese', 'German', 'French', 'Thai', 'Portugese (Brazilian)', 'Catalan', 'Ukrainian', 'Italian', 'Japanese', 'Polish', 'Basque', 'Vietnamese', 'Hungarian', 'Arabic', 'Danish', 'Turkish', 'Code']",['crowdsourced'],[],[],commercial,commercial,"[{'License': 'CC BY 4.0', 'License URL': 'https://open-assistant.io/'}]",Apache License 2.0,https://github.com/LAION-AI/Open-Assistant,https://github.com/LAION-AI/Open-Assistant,https://arxiv.org/abs/2304.07327 +oasst-en,Open Assistant,oasst,"['English', 'Code']",['crowdsourced'],[],[],commercial,commercial,"[{'License': 'CC BY 4.0', 'License URL': 'https://open-assistant.io/'}]",Apache License 2.0,https://github.com/LAION-AI/Open-Assistant,https://github.com/LAION-AI/Open-Assistant,https://arxiv.org/abs/2304.07327 +commitpackft-llvm,CommitPackFT,CommitPackFT,"['English', 'LLVM']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'BSD 2-Clause License', 'License URL': 
'https://opensource.org/licenses/BSD-2-Clause'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-crystal,CommitPackFT,CommitPackFT,"['English', 'Crystal']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-purescript,CommitPackFT,CommitPackFT,"['English', 'PureScript']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-unity3d-asset,CommitPackFT,CommitPackFT,"['English', 'Unity3D Asset']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'BSD 2-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-2-Clause'}, {'License': 'BSD 
3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, {'License': 'CC0 1.0', 'License URL': 'https://creativecommons.org/publicdomain/zero/1.0/'}, {'License': 'ISC License', 'License URL': 'https://opensource.org/licenses/ISC'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}, {'License': 'Unspecified', 'License URL': ''}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-idris,CommitPackFT,CommitPackFT,"['Idris', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 2-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-2-Clause'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-rouge,CommitPackFT,CommitPackFT,"['Rouge', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-liquid,CommitPackFT,CommitPackFT,"['Liquid', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'CC0 1.0', 'License URL': 
'https://creativecommons.org/publicdomain/zero/1.0/'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-solidity,CommitPackFT,CommitPackFT,"['Solidity', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}, {'License': 'Unspecified', 'License URL': ''}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-json5,CommitPackFT,CommitPackFT,"['JSON5', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 2-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-2-Clause'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-systemverilog,CommitPackFT,CommitPackFT,"['SystemVerilog', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, {'License': 'MIT License', 'License URL': 
'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-literate-coffeescript,CommitPackFT,CommitPackFT,"['English', 'Literate CoffeeScript']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-openscad,CommitPackFT,CommitPackFT,"['OpenSCAD', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 2-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-2-Clause'}, {'License': 'CC0 1.0', 'License URL': 'https://creativecommons.org/publicdomain/zero/1.0/'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-pan,CommitPackFT,CommitPackFT,"['Pan', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-pony,CommitPackFT,CommitPackFT,"['Pony', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 2-Clause License', 'License URL': 
'https://opensource.org/licenses/BSD-2-Clause'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-chapel,CommitPackFT,CommitPackFT,"['Chapel', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-ioke,CommitPackFT,CommitPackFT,"['English', 'Ioke']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-ooc,CommitPackFT,CommitPackFT,"['ooc', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-g-code,CommitPackFT,CommitPackFT,"['English', 'G-code']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, 
{'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-mirah,CommitPackFT,CommitPackFT,"['Mirah', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-logos,CommitPackFT,CommitPackFT,"['Logos', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'BSD 2-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-2-Clause'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, {'License': 'ISC License', 'License URL': 'https://opensource.org/licenses/ISC'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-vcl,CommitPackFT,CommitPackFT,"['VCL', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, 
{'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-gdscript,CommitPackFT,CommitPackFT,"['English', 'GDScript']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'CC0 1.0', 'License URL': 'https://creativecommons.org/publicdomain/zero/1.0/'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-graphql,CommitPackFT,CommitPackFT,"['GraphQL', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, {'License': 'CC0 1.0', 'License URL': 'https://creativecommons.org/publicdomain/zero/1.0/'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-hlsl,CommitPackFT,CommitPackFT,"['HLSL', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT 
License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-http,CommitPackFT,CommitPackFT,"['HTTP', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-ninja,CommitPackFT,CommitPackFT,"['English', 'Ninja']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-oz,CommitPackFT,CommitPackFT,"['Oz', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 2-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-2-Clause'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-aspectj,CommitPackFT,CommitPackFT,"['AspectJ', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 
'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-fancy,CommitPackFT,CommitPackFT,"['English', 'Fancy']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-moonscript,CommitPackFT,CommitPackFT,"['English', 'MoonScript']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-piglatin,CommitPackFT,CommitPackFT,"['English', 'PigLatin']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-urweb,CommitPackFT,CommitPackFT,"['UrWeb', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 
'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-agda,CommitPackFT,CommitPackFT,"['Agda', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-coldfusion,CommitPackFT,CommitPackFT,"['ColdFusion', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-emberscript,CommitPackFT,CommitPackFT,"['English', 'EmberScript']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT 
License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-latte,CommitPackFT,CommitPackFT,"['English', 'Latte']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-literate-haskell,CommitPackFT,CommitPackFT,"['Literate Haskell', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-scilab,CommitPackFT,CommitPackFT,"['Scilab', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-apl,CommitPackFT,CommitPackFT,"['English', 'APL']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'ISC License', 'License URL': 'https://opensource.org/licenses/ISC'}, {'License': 'MIT License', 'License URL': 
'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-genshi,CommitPackFT,CommitPackFT,"['English', 'Genshi']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-jsonld,CommitPackFT,CommitPackFT,"['JSONLD', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, {'License': 'CC0 1.0', 'License URL': 'https://creativecommons.org/publicdomain/zero/1.0/'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}, {'License': 'Unspecified', 'License URL': ''}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-krl,CommitPackFT,CommitPackFT,"['KRL', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-lean,CommitPackFT,CommitPackFT,"['English', 'Lean']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 
'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-lfe,CommitPackFT,CommitPackFT,"['LFE', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-metal,CommitPackFT,CommitPackFT,"['English', 'Metal']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-monkey,CommitPackFT,CommitPackFT,"['Monkey', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-mupad,CommitPackFT,CommitPackFT,"['mupad', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 
'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-nesc,CommitPackFT,CommitPackFT,"['nesC', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-nit,CommitPackFT,CommitPackFT,"['English', 'Nit']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-purebasic,CommitPackFT,CommitPackFT,"['PureBasic', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-xproc,CommitPackFT,CommitPackFT,"['English', 
'XProc']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-zephir,CommitPackFT,CommitPackFT,"['Zephir', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-boo,CommitPackFT,CommitPackFT,"['English', 'Boo']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-brainfuck,CommitPackFT,CommitPackFT,"['Brainfuck', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'BSD 2-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-2-Clause'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-bro,CommitPackFT,CommitPackFT,"['Bro', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 
'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-cartocss,CommitPackFT,CommitPackFT,"['CartoCSS', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-creole,CommitPackFT,CommitPackFT,"['English', 'Creole']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-dylan,CommitPackFT,CommitPackFT,"['Dylan', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-eiffel,CommitPackFT,CommitPackFT,"['Eiffel', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 
'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-flux,CommitPackFT,CommitPackFT,"['English', 'FLUX']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-lilypond,CommitPackFT,CommitPackFT,"['English', 'LilyPond']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-lsl,CommitPackFT,CommitPackFT,"['LSL', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-nu,CommitPackFT,CommitPackFT,"['English', 'Nu']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 
+commitpackft-ragel-in-ruby-host,CommitPackFT,CommitPackFT,"['English', 'Ragel in Ruby Host']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-slash,CommitPackFT,CommitPackFT,"['English', 'Slash']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-sourcepawn,CommitPackFT,CommitPackFT,"['English', 'SourcePawn']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-squirrel,CommitPackFT,CommitPackFT,"['Squirrel', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 
+commitpackft-ston,CommitPackFT,CommitPackFT,"['English', 'STON']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-uno,CommitPackFT,CommitPackFT,"['Uno', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-xbase,CommitPackFT,CommitPackFT,"['English', 'xBase']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-yacc,CommitPackFT,CommitPackFT,"['English', 'Yacc']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 2-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-2-Clause'}, {'License': 'ISC License', 'License URL': 'https://opensource.org/licenses/ISC'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT 
License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-zig,CommitPackFT,CommitPackFT,"['English', 'Zig']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-abap,CommitPackFT,CommitPackFT,"['ABAP', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-arc,CommitPackFT,CommitPackFT,"['English', 'Arc']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-ats,CommitPackFT,CommitPackFT,"['English', 'ATS']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 
+commitpackft-blitzmax,CommitPackFT,CommitPackFT,"['BlitzMax', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-bluespec,CommitPackFT,CommitPackFT,"['Bluespec', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-clean,CommitPackFT,CommitPackFT,"['English', 'Clean']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-dns-zone,CommitPackFT,CommitPackFT,"['DNS Zone', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-forth,CommitPackFT,CommitPackFT,"['Forth', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT 
License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-harbour,CommitPackFT,CommitPackFT,"['Harbour', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-igor-pro,CommitPackFT,CommitPackFT,"['English', 'IGOR Pro']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-inform-7,CommitPackFT,CommitPackFT,"['English', 'Inform 7']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-isabelle,CommitPackFT,CommitPackFT,"['English', 'Isabelle']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 2-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-2-Clause'}, 
{'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-jflex,CommitPackFT,CommitPackFT,"['JFlex', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-literate-agda,CommitPackFT,CommitPackFT,"['English', 'Literate Agda']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-maple,CommitPackFT,CommitPackFT,"['Maple', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-mathematica,CommitPackFT,CommitPackFT,"['English', 'Mathematica']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'CC0 1.0', 'License URL': 'https://creativecommons.org/publicdomain/zero/1.0/'}]",MIT 
License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-mtml,CommitPackFT,CommitPackFT,"['English', 'MTML']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-netlinx,CommitPackFT,CommitPackFT,"['NetLinx', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-propeller-spin,CommitPackFT,CommitPackFT,"['English', 'Propeller Spin']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-pure-data,CommitPackFT,CommitPackFT,"['English', 'Pure Data']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-rebol,CommitPackFT,CommitPackFT,"['English', 
'Rebol']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-red,CommitPackFT,CommitPackFT,"['English', 'Red']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 2-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-2-Clause'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-sage,CommitPackFT,CommitPackFT,"['English', 'Sage']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-sas,CommitPackFT,CommitPackFT,"['SAS', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-scaml,CommitPackFT,CommitPackFT,"['English', 'Scaml']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 2-Clause License', 'License URL': 
'https://opensource.org/licenses/BSD-2-Clause'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-smt,CommitPackFT,CommitPackFT,"['English', 'SMT']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'BSD 3-Clause License', 'License URL': 'https://opensource.org/licenses/BSD-3-Clause'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-unrealscript,CommitPackFT,CommitPackFT,"['UnrealScript', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'MIT License', 'License URL': 'https://opensource.org/licenses/MIT'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +commitpackft-xpages,CommitPackFT,CommitPackFT,"['XPages', 'English']",['github'],[],[],commercial,commercial,"[{'License': 'MIT License', 'License URL': 'https://github.com/bigcode-project/octopack/tree/main#licenses'}, {'License': 'Apache License 2.0', 'License URL': 'https://www.apache.org/licenses/LICENSE-2.0'}]",MIT License,https://github.com/bigcode-project/octopack,https://github.com/bigcode-project/octopack,https://arxiv.org/abs/2308.07124 +fc-cot-cot_gsm8k,Flan Collection (Chain-of-Thought),cot_gsm8k,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'MIT License', 'License URL': 'https://huggingface.co/datasets/gsm8k#licensing-information'}]",,https://github.com/openai/grade-school-math,https://github.com/openai/grade-school-math,https://arxiv.org/abs/2110.14168 +fc-cot-cot_strategyqa,Flan Collection 
(Chain-of-Thought),cot_strategyqa,['English'],['wikipedia.org'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'}]",,https://allenai.org/data/strategyqa,,https://arxiv.org/abs/2101.02235 +fc-cot-stream_creak,Flan Collection (Chain-of-Thought),stream_creak,['English'],['wikipedia.org'],[],[],commercial,commercial,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://paperswithcode.com/dataset/creak'}]",MIT License,https://github.com/yasumasaonoe/creak,https://github.com/yasumasaonoe/creak,https://arxiv.org/abs/2109.01653 +fc-cot-stream_esnli,Flan Collection (Chain-of-Thought),stream_esnli,['English'],['crowdsourced (amt)'],[],['snli'],commercial,commercial,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/OanaMariaCamburu/e-SNLI'}]",MIT License,https://github.com/OanaMariaCamburu/e-SNLI,https://github.com/OanaMariaCamburu/e-SNLI,https://arxiv.org/abs/1812.01193 +tsi-recast-recast_verbnet,Tasksource Instruct,tsi-recast-recast_verbnet,['English'],[],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/decompositional-semantics-initiative/decomp#license'}]",,http://decomp.io/,https://github.com/decompositional-semantics-initiative/DNC/raw/master/inference_is_everything.zip, +tsi-recast-recast_verbcorner,Tasksource Instruct,tsi-recast-recast_verbcorner,['English'],[],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/decompositional-semantics-initiative/decomp#license'}]",,https://huggingface.co/datasets/metaeval/recast,https://github.com/decompositional-semantics-initiative/DNC/raw/master/inference_is_everything.zip, +tsi-recast-recast_ner,Tasksource Instruct,tsi-recast-recast_ner,['English'],[],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 
'https://github.com/decompositional-semantics-initiative/decomp#license'}]",,https://huggingface.co/datasets/metaeval/recast,https://github.com/decompositional-semantics-initiative/DNC/raw/master/inference_is_everything.zip, +tsi-recast-recast_sentiment,Tasksource Instruct,tsi-recast-recast_sentiment,['English'],[],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/decompositional-semantics-initiative/decomp#license'}]",,https://huggingface.co/datasets/metaeval/recast,https://github.com/decompositional-semantics-initiative/DNC/raw/master/inference_is_everything.zip, +tsi-recast-recast_puns,Tasksource Instruct,tsi-recast-recast_puns,['English'],[],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/decompositional-semantics-initiative/decomp#license'}]",,https://huggingface.co/datasets/metaeval/recast,https://github.com/decompositional-semantics-initiative/DNC/raw/master/inference_is_everything.zip, +tsi-recast-recast_factuality,Tasksource Instruct,tsi-recast-recast_factuality,['English'],[],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/decompositional-semantics-initiative/decomp#license'}]",,https://huggingface.co/datasets/metaeval/recast,https://github.com/decompositional-semantics-initiative/DNC/raw/master/inference_is_everything.zip, +tsi-recast-recast_megaveridicality,Tasksource Instruct,tsi-recast-recast_megaveridicality,['English'],[],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/decompositional-semantics-initiative/decomp#license'}]",,https://huggingface.co/datasets/metaeval/recast,https://github.com/decompositional-semantics-initiative/DNC/raw/master/inference_is_everything.zip, +tsi-breaking_nli,Tasksource Instruct,tsi-breaking_nli,['English'],['crowdsourced'],[],['SNLI'],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 
'https://github.com/BIU-NLP/Breaking_NLI#data-source'}]",,https://github.com/BIU-NLP/Breaking_NLI,https://github.com/BIU-NLP/Breaking_NLI,https://arxiv.org/abs/1805.02266 +tsi-conj_nli,Tasksource Instruct,tsi-conj_nli,['English'],['wikipedia.org'],[],[],commercial,commercial,"[{'License': 'Unspecified', 'License URL': None}]",MIT License,https://github.com/swarnaHub/ConjNLI,https://github.com/swarnaHub/ConjNLI,https://arxiv.org/abs/2010.10418 +tsi-fracas,Tasksource Instruct,tsi-fracas,['English'],[],[],[],commercial,commercial,"[{'License': 'Unspecified', 'License URL': None}]",MIT License,https://github.com/felipessalvatore/NLI_datasets,https://github.com/felipessalvatore/NLI_datasets, +tsi-truthful_qa-multiple_choice,Tasksource Instruct,tsi-truthful_qa-multiple_choice,['English'],['human'],[],[],commercial,commercial,"[{'License': 'Unspecified', 'License URL': None}]",Apache License 2.0,https://github.com/sylinrl/TruthfulQA,https://github.com/sylinrl/TruthfulQA,https://arxiv.org/abs/2109.07958 +tsi-fig_qa,Tasksource Instruct,tsi-fig_qa,['English'],['crowdsourced'],[],[],commercial,commercial,"[{'License': 'Unspecified', 'License URL': None}]",MIT License,https://github.com/nightingal3/Fig-QA,https://github.com/nightingal3/Fig-QA,https://arxiv.org/abs/2204.12632 +tsi-social_i_qa,Tasksource Instruct,tsi-social_i_qa,['English'],['crowdsourced (amt)'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://allenai.org/data/socialiqa'}]",,https://allenai.org/data/socialiqa,,https://arxiv.org/abs/1904.09728 +tsi-balanced_copa,Tasksource Instruct,tsi-balanced_copa,['English'],['human'],[],['COPA dataset'],commercial,,"[{'License': 'BSD 2-Clause License', 'License URL': 'https://people.ict.usc.edu/~gordon/copa.html'}]",,https://balanced-copa.github.io/,https://balanced-copa.github.io/,https://arxiv.org/abs/1911.00225 +tsi-vitaminc-tals__vitaminc,Tasksource 
Instruct,tsi-vitaminc-tals__vitaminc,['English'],['wikipedia.org'],[],['FEVER'],commercial,commercial,"[{'License': 'Unspecified', 'License URL': None}]",MIT License,https://github.com/TalSchuster/VitaminC,https://github.com/TalSchuster/VitaminC,https://arxiv.org/abs/2103.08541 +tsi-lex_glue-case_hold,Tasksource Instruct,tsi-lex_glue-case_hold,['English'],[],[],[],commercial,commercial,"[{'License': 'Unspecified', 'License URL': 'https://github.com/reglab/casehold'}]",Apache License 2.0,https://github.com/reglab/casehold,https://github.com/reglab/casehold,https://arxiv.org/abs/2110.00976 +tsi-hyperpartisan_news,Tasksource Instruct,tsi-hyperpartisan_news,['English'],[],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': None}]",,https://huggingface.co/datasets/hyperpartisan_news_detection,, +tsi-fever_evidence_related-mwong__fever_related,Tasksource Instruct,tsi-fever_evidence_related-mwong__fever_related,['English'],['wikipedia.org'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://fever.ai/download/feverous/license.html'}]",,https://huggingface.co/datasets/mwong/fever-evidence-related,,https://arxiv.org/abs/1803.05355 +tsi-dynasent-dynabench.dynasent.r1.all-r1,Tasksource Instruct,tsi-dynasent-dynabench.dynasent.r1.all-r1,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://huggingface.co/datasets/dynabench/dynasent#license'}]",,https://dynabench.org/tasks/3,,https://arxiv.org/abs/2012.15349 +tsi-dynasent-dynabench.dynasent.r2.all-r2,Tasksource Instruct,tsi-dynasent-dynabench.dynasent.r2.all-r2,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://huggingface.co/datasets/dynabench/dynasent#license'}]",,https://dynabench.org/tasks/3,,https://arxiv.org/abs/2012.15349 +tsi-moral_stories-full,Tasksource Instruct,tsi-moral_stories-full,['English'],['crowdsourced'],[],['SOCIAL-CHEM-101 dataset'],commercial,commercial,"[{'License': 'Unspecified', 
'License URL': None}]",MIT License,https://github.com/demelin/moral_stories,https://github.com/demelin/moral_stories,https://arxiv.org/abs/2012.15738 +tsi-prost,Tasksource Instruct,tsi-prost,['English'],['grammar-based'],[],[],commercial,commercial,"[{'License': 'Apache License 2.0', 'License URL': 'https://github.com/nala-cub/prost#license'}]",Apache License 2.0,https://github.com/nala-cub/prost,https://github.com/nala-cub/prost,https://arxiv.org/abs/2106.03634 +tsi-condaqa,Tasksource Instruct,tsi-condaqa,['English'],['wikipedia.org'],[],[],commercial,commercial,"[{'License': 'Apache License 2.0', 'License URL': 'https://github.com/AbhilashaRavichander/CondaQA/blob/main/LICENSE'}]",Apache License 2.0,https://github.com/AbhilashaRavichander/CondaQA,https://github.com/AbhilashaRavichander/CondaQA,https://arxiv.org/abs/2211.00295 +tsi-wouldyourather,Tasksource Instruct,tsi-wouldyourather,['English'],[],[],[],commercial,,"[{'License': 'CC0 1.0', 'License URL': 'https://www.kaggle.com/datasets/charlieray668/would-you-rather'}]",,https://huggingface.co/datasets/metaeval/wouldyourather,, +tsi-defeasible_nli-snli,Tasksource Instruct,tsi-defeasible_nli-snli,['English'],['crowdsourced'],[],"['SNLI', 'social chemestry', 'ATOMIC']",commercial,,"[{'License': 'MIT License', 'License URL': 'https://github.com/rudinger/defeasible-nli/blob/main/LICENSE'}]",,https://huggingface.co/datasets/metaeval/defeasible-nli,,https://aclanthology.org/2020.findings-emnlp.418/ +tsi-defeasible_nli-atomic,Tasksource Instruct,tsi-defeasible_nli-atomic,['English'],['crowdsourced'],[],"['SNLI', 'social chemestry', 'ATOMIC']",commercial,,"[{'License': 'MIT License', 'License URL': 'https://github.com/rudinger/defeasible-nli/blob/main/LICENSE'}]",,https://huggingface.co/datasets/metaeval/defeasible-nli,,https://aclanthology.org/2020.findings-emnlp.418/ +tsi-lonli,Tasksource Instruct,tsi-lonli,['English'],['grammar-based'],[],[],commercial,commercial,"[{'License': 'Unspecified', 'License URL': 
None}]",MIT License,https://github.com/microsoft/LoNLI,https://github.com/microsoft/LoNLI,https://arxiv.org/abs/2107.07229 +tsi-neqa,Tasksource Instruct,tsi-neqa,['English'],[],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://github.com/inverse-scaling/prize/blob/main/LICENSE'}]",,https://huggingface.co/datasets/inverse-scaling/NeQA,, +tsi-quote_repetition,Tasksource Instruct,tsi-quote_repetition,['English'],[],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://github.com/inverse-scaling/prize/blob/main/LICENSE'}]",,https://huggingface.co/datasets/inverse-scaling/quote-repetition,, +tsi-redefine_math,Tasksource Instruct,tsi-redefine_math,['English'],[],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://github.com/inverse-scaling/prize/blob/main/LICENSE'}]",,https://huggingface.co/datasets/inverse-scaling/redefine-math,, +tsi-tracie,Tasksource Instruct,tsi-tracie,['English'],['crowdsourced'],[],['ROCStories dataset'],commercial,commercial,"[{'License': 'Unspecified', 'License URL': None}]",Apache License 2.0,https://github.com/allenai/aristo-leaderboard/tree/master/tracie/data,https://github.com/allenai/aristo-leaderboard/tree/master/tracie/data,https://arxiv.org/abs/2010.12753 +tsi-winowhy,Tasksource Instruct,tsi-winowhy,['English'],"['winograd schema challenge dataset', 'conceptnet', 'crowdsourced']",[],[],commercial,commercial,"[{'License': 'Unspecified', 'License URL': None}]",MIT License,https://github.com/HKUST-KnowComp/WinoWhy,https://github.com/HKUST-KnowComp/WinoWhy,https://arxiv.org/abs/2005.05763 +tsi-cladder,Tasksource Instruct,tsi-cladder,['English'],[],[],[],commercial,commercial,"[{'License': 'Unspecified', 'License URL': None}]",MIT License,https://github.com/causalNLP/cladder,https://github.com/causalNLP/cladder, +tsi-few_nerd-supervised,Tasksource Instruct,tsi-few_nerd-supervised,['English'],['wikipedia.org'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 
'https://ningding97.github.io/fewnerd/'}]",,https://ningding97.github.io/fewnerd/,https://ningding97.github.io/fewnerd/,https://arxiv.org/abs/2105.07464 +tsy-recast-recast_verbnet,Tasksource Symbol-Tuning,tsy-recast-recast_verbnet,['English'],[],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/decompositional-semantics-initiative/decomp#license'}]",,http://decomp.io/,https://github.com/decompositional-semantics-initiative/DNC/raw/master/inference_is_everything.zip, +tsy-recast-recast_verbcorner,Tasksource Symbol-Tuning,tsy-recast-recast_verbcorner,['English'],[],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/decompositional-semantics-initiative/decomp#license'}]",,https://huggingface.co/datasets/metaeval/recast,https://github.com/decompositional-semantics-initiative/DNC/raw/master/inference_is_everything.zip, +tsy-recast-recast_ner,Tasksource Symbol-Tuning,tsy-recast-recast_ner,['English'],[],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/decompositional-semantics-initiative/decomp#license'}]",,https://huggingface.co/datasets/metaeval/recast,https://github.com/decompositional-semantics-initiative/DNC/raw/master/inference_is_everything.zip, +tsy-recast-recast_sentiment,Tasksource Symbol-Tuning,tsy-recast-recast_sentiment,['English'],[],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/decompositional-semantics-initiative/decomp#license'}]",,https://huggingface.co/datasets/metaeval/recast,https://github.com/decompositional-semantics-initiative/DNC/raw/master/inference_is_everything.zip, +tsy-recast-recast_puns,Tasksource Symbol-Tuning,tsy-recast-recast_puns,['English'],[],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 
'https://github.com/decompositional-semantics-initiative/decomp#license'}]",,https://huggingface.co/datasets/metaeval/recast,https://github.com/decompositional-semantics-initiative/DNC/raw/master/inference_is_everything.zip, +tsy-recast-recast_factuality,Tasksource Symbol-Tuning,tsy-recast-recast_factuality,['English'],[],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/decompositional-semantics-initiative/decomp#license'}]",,https://huggingface.co/datasets/metaeval/recast,https://github.com/decompositional-semantics-initiative/DNC/raw/master/inference_is_everything.zip, +tsy-recast-recast_megaveridicality,Tasksource Symbol-Tuning,tsy-recast-recast_megaveridicality,['English'],[],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/decompositional-semantics-initiative/decomp#license'}]",,https://huggingface.co/datasets/metaeval/recast,https://github.com/decompositional-semantics-initiative/DNC/raw/master/inference_is_everything.zip, +tsy-breaking_nli,Tasksource Symbol-Tuning,tsy-breaking_nli,['English'],['crowdsourced'],[],['SNLI'],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/BIU-NLP/Breaking_NLI#data-source'}]",,https://github.com/BIU-NLP/Breaking_NLI,https://github.com/BIU-NLP/Breaking_NLI,https://arxiv.org/abs/1805.02266 +tsy-conj_nli,Tasksource Symbol-Tuning,tsy-conj_nli,['English'],['wikipedia.org'],[],[],commercial,commercial,"[{'License': 'Unspecified', 'License URL': None}]",MIT License,https://github.com/swarnaHub/ConjNLI,https://github.com/swarnaHub/ConjNLI,https://arxiv.org/abs/2010.10418 +tsy-fracas,Tasksource Symbol-Tuning,tsy-fracas,['English'],[],[],[],commercial,commercial,"[{'License': 'Unspecified', 'License URL': None}]",MIT License,https://github.com/felipessalvatore/NLI_datasets,https://github.com/felipessalvatore/NLI_datasets, +tsy-vitaminc-tals__vitaminc,Tasksource 
Symbol-Tuning,tsy-vitaminc-tals__vitaminc,['English'],['wikipedia.org'],[],['FEVER'],commercial,commercial,"[{'License': 'Unspecified', 'License URL': None}]",MIT License,https://github.com/TalSchuster/VitaminC,https://github.com/TalSchuster/VitaminC,https://arxiv.org/abs/2103.08541 +tsy-hyperpartisan_news,Tasksource Symbol-Tuning,tsy-hyperpartisan_news,['English'],[],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': None}]",,https://huggingface.co/datasets/hyperpartisan_news_detection,, +tsy-fever_evidence_related-mwong__fever_related,Tasksource Symbol-Tuning,tsy-fever_evidence_related-mwong__fever_related,['English'],['wikipedia.org'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://fever.ai/download/feverous/license.html'}]",,https://huggingface.co/datasets/mwong/fever-evidence-related,,https://arxiv.org/abs/1803.05355 +tsy-dynasent-dynabench.dynasent.r1.all-r1,Tasksource Symbol-Tuning,tsy-dynasent-dynabench.dynasent.r1.all-r1,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://huggingface.co/datasets/dynabench/dynasent#license'}]",,https://dynabench.org/tasks/3,,https://arxiv.org/abs/2012.15349 +tsy-dynasent-dynabench.dynasent.r2.all-r2,Tasksource Symbol-Tuning,tsy-dynasent-dynabench.dynasent.r2.all-r2,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://huggingface.co/datasets/dynabench/dynasent#license'}]",,https://dynabench.org/tasks/3,,https://arxiv.org/abs/2012.15349 +tsy-condaqa,Tasksource Symbol-Tuning,tsy-condaqa,['English'],['wikipedia.org'],[],[],commercial,commercial,"[{'License': 'Apache License 2.0', 'License URL': 'https://github.com/AbhilashaRavichander/CondaQA/blob/main/LICENSE'}]",Apache License 2.0,https://github.com/AbhilashaRavichander/CondaQA,https://github.com/AbhilashaRavichander/CondaQA,https://arxiv.org/abs/2211.00295 +tsy-defeasible_nli-snli,Tasksource 
Symbol-Tuning,tsy-defeasible_nli-snli,['English'],['crowdsourced'],[],"['SNLI', 'social chemestry', 'ATOMIC']",commercial,,"[{'License': 'MIT License', 'License URL': 'https://github.com/rudinger/defeasible-nli/blob/main/LICENSE'}]",,https://huggingface.co/datasets/metaeval/defeasible-nli,,https://aclanthology.org/2020.findings-emnlp.418/ +tsy-defeasible_nli-atomic,Tasksource Symbol-Tuning,tsy-defeasible_nli-atomic,['English'],['crowdsourced'],[],"['SNLI', 'social chemestry', 'ATOMIC']",commercial,,"[{'License': 'MIT License', 'License URL': 'https://github.com/rudinger/defeasible-nli/blob/main/LICENSE'}]",,https://huggingface.co/datasets/metaeval/defeasible-nli,,https://aclanthology.org/2020.findings-emnlp.418/ +tsy-lonli,Tasksource Symbol-Tuning,tsy-lonli,['English'],['grammar-based'],[],[],commercial,commercial,"[{'License': 'Unspecified', 'License URL': None}]",MIT License,https://github.com/microsoft/LoNLI,https://github.com/microsoft/LoNLI,https://arxiv.org/abs/2107.07229 +tsy-tracie,Tasksource Symbol-Tuning,tsy-tracie,['English'],['crowdsourced'],[],['ROCStories dataset'],commercial,commercial,"[{'License': 'Unspecified', 'License URL': None}]",Apache License 2.0,https://github.com/allenai/aristo-leaderboard/tree/master/tracie/data,https://github.com/allenai/aristo-leaderboard/tree/master/tracie/data,https://arxiv.org/abs/2010.12753 +tsy-winowhy,Tasksource Symbol-Tuning,tsy-winowhy,['English'],"['winograd schema challenge dataset', 'conceptnet', 'crowdsourced']",[],[],commercial,commercial,"[{'License': 'Unspecified', 'License URL': None}]",MIT License,https://github.com/HKUST-KnowComp/WinoWhy,https://github.com/HKUST-KnowComp/WinoWhy,https://arxiv.org/abs/2005.05763 +tsy-cladder,Tasksource Symbol-Tuning,tsy-cladder,['English'],[],[],[],commercial,commercial,"[{'License': 'Unspecified', 'License URL': None}]",MIT License,https://github.com/causalNLP/cladder,https://github.com/causalNLP/cladder, +fc-flan-drop,Flan Collection (Flan 
2021),drop,['English'],"['wikipedia.org', 'crowdsourced']",[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://allenai.org/data/drop'}]",,https://allenai.org/data/drop,,https://arxiv.org/abs/1903.00161 +fc-flan-e2e_nlg,Flan Collection (Flan 2021),e2e_nlg,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://gem-benchmark.com/data_cards/e2e_nlg'}]",,https://gem-benchmark.com/data_cards/e2e_nlg,,https://arxiv.org/abs/1706.09254 +fc-flan-natural_questions,Flan Collection (Flan 2021),natural_questions,['English'],"['wikipedia.org', 'crowdsourced']",[],[],commercial,commercial,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://github.com/google-research-datasets/natural-questions/tree/master/nq_open'}]",Apache License 2.0,https://github.com/google-research-datasets/natural-questions/tree/master/nq_open,https://github.com/google-research-datasets/natural-questions/tree/master/nq_open,https://aclanthology.org/Q19-1026/?utm_campaign=NLP%20News&utm_medium=email&utm_source=Revue%20newsletter +fc-flan-quac,Flan Collection (Flan 2021),quac,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://quac.ai/'}]",,https://quac.ai/,,https://arxiv.org/abs/1808.07036 +fc-flan-squad_v1,Flan Collection (Flan 2021),squad_v1,['English'],['crowdsourced (daemo)'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://creativecommons.org/licenses/by-sa/4.0/legalcode'}]",,https://rajpurkar.github.io/SQuAD-explorer/,https://rajpurkar.github.io/SQuAD-explorer/,https://arxiv.org/abs/1806.03822 +fc-flan-squad_v2,Flan Collection (Flan 2021),squad_v2,['English'],['crowdsourced (daemo)'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://creativecommons.org/licenses/by-sa/4.0/legalcode'}]",,https://rajpurkar.github.io/SQuAD-explorer/,https://rajpurkar.github.io/SQuAD-explorer/,https://arxiv.org/abs/1806.03822 +fc-flan-trec,Flan Collection (Flan 
2021),trec,['English'],[],[],[],commercial,,"[{'License': 'CC0 1.0', 'License URL': 'https://www.kaggle.com/datasets/thedevastator/the-trec-question-classification-dataset-a-longi'}]",,https://cogcomp.seas.upenn.edu/Data/QA/QC/,, +fc-flan-true_case,Flan Collection (Flan 2021),true_case,['English'],[],[],[],commercial,,"[{'License': 'CC0 1.0', 'License URL': 'https://creativecommons.org/share-your-work/public-domain/cc0/'}]",,https://www.paracrawl.eu/,, +fc-flan-wiki_lingua_english_en,Flan Collection (Flan 2021),wiki_lingua_english_en,['English'],['wikihow.com'],[],[],commercial,,"[{'License': 'CC BY 3.0', 'License URL': 'https://creativecommons.org/licenses/by/3.0/'}]",,https://gem-benchmark.com/data_cards/wiki_lingua,,https://arxiv.org/abs/2010.03093 +fc-flan-winogrande,Flan Collection (Flan 2021),winogrande,['English'],['crowdsourced'],[],[],commercial,commercial,"[{'License': 'CC BY 4.0', 'License URL': 'https://github.com/allenai/winogrande'}]",Apache License 2.0,https://github.com/allenai/winogrande,https://github.com/allenai/winogrande,https://arxiv.org/abs/1907.10641 +fc-flan-wnli,Flan Collection (Flan 2021),wnli,['English'],[],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://creativecommons.org/licenses/by/4.0/'}]",,https://cs.nyu.edu/~davise/papers/WinogradSchemas/WS.html,, +fc-flan-word_segment,Flan Collection (Flan 2021),word_segment,['English'],[],[],[],commercial,,"[{'License': 'CC0 1.0', 'License URL': 'https://creativecommons.org/share-your-work/public-domain/cc0/'}]",,https://www.paracrawl.eu/,, +fc-flan-wsc,Flan Collection (Flan 2021),wsc,['English'],[],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://cs.nyu.edu/~davise/papers/WinogradSchemas/WS.html'}]",,https://cs.nyu.edu/~davise/papers/WinogradSchemas/WS.html,, +oig-unified_grade_school_math_instructions,OIG,oig-grade_school_math_instructions,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'Apache License 2.0', 'License URL': 
'https://laion.ai/blog/oig-dataset/'}, {'License': 'MIT License', 'License URL': 'https://huggingface.co/datasets/gsm8k#licensing-information'}]",,https://github.com/openai/grade-school-math,,https://arxiv.org/abs/2110.14168 +oig-unified_sqlv2,OIG,oig-sqlv2,['English'],[],[],"['SPIDER', 'SPARC', 'COSQL']",commercial,,"[{'License': 'Apache License 2.0', 'License URL': 'https://laion.ai/blog/oig-dataset/'}, {'License': 'CC BY-SA 4.0', 'License URL': 'https://yale-lily.github.io/spider'}, {'License': 'CC BY-SA 4.0', 'License URL': 'https://yale-lily.github.io/sparc'}, {'License': 'CC BY-SA 4.0', 'License URL': 'https://yale-lily.github.io/cosql'}]",,Unspecified,, +oig-unified_nq,OIG,oig-nq,['English'],[],[],[],commercial,,"[{'License': 'Apache License 2.0', 'License URL': 'https://laion.ai/blog/oig-dataset/'}, {'License': 'CC BY-SA 3.0', 'License URL': 'https://creativecommons.org/licenses/by-sa/3.0/'}]",,https://ai.google.com/research/NaturalQuestions/download,, +oig-unified_sqlv1,OIG,oig-sqlv1,['English'],[],[],"['SPIDER', 'SPARC', 'COSQL']",commercial,,"[{'License': 'Apache License 2.0', 'License URL': 'https://laion.ai/blog/oig-dataset/'}, {'License': 'CC BY-SA 4.0', 'License URL': 'https://yale-lily.github.io/spider'}, {'License': 'CC BY-SA 4.0', 'License URL': 'https://yale-lily.github.io/sparc'}, {'License': 'CC BY-SA 4.0', 'License URL': 'https://yale-lily.github.io/cosql'}]",,https://huggingface.co/datasets/laion/OIG,, +oig-unified_canadian_parliament,OIG,oig-canadian_parliament,['English'],[],[],[],commercial,,"[{'License': 'Apache License 2.0', 'License URL': 'https://laion.ai/blog/oig-dataset/'}]",,https://openparliament.ca/data-download/,, +oig-unified_cuad,OIG,oig-cuad,['English'],['sec.gov/edgar/about'],[],[],commercial,,"[{'License': 'Apache License 2.0', 'License URL': 'https://laion.ai/blog/oig-dataset/'}, {'License': 'CC BY 4.0', 'License URL': 
'https://creativecommons.org/licenses/by/4.0/'}]",,https://www.atticusprojectai.org/cuad,,https://arxiv.org/abs/2103.06268 +oig-unified_squad_v2_more_neg,OIG,oig-squad_v2_more_neg,['English'],['crowdsourced (daemo)'],[],[],commercial,,"[{'License': 'Apache License 2.0', 'License URL': 'https://laion.ai/blog/oig-dataset/'}, {'License': 'CC BY-SA 4.0', 'License URL': 'https://creativecommons.org/licenses/by-sa/4.0/legalcode'}]",,https://rajpurkar.github.io/SQuAD-explorer/,,https://arxiv.org/abs/1806.03822 +oig-unified_squad_v2,OIG,oig-squad_v2,['English'],['crowdsourced (daemo)'],[],[],commercial,,"[{'License': 'Apache License 2.0', 'License URL': 'https://laion.ai/blog/oig-dataset/'}, {'License': 'CC BY-SA 4.0', 'License URL': 'https://creativecommons.org/licenses/by-sa/4.0/legalcode'}]",,https://rajpurkar.github.io/SQuAD-explorer/,,https://arxiv.org/abs/1806.03822 +fc-p3-adversarial_qa,Flan Collection (P3),adversarial_qa,['English'],"['wikipedia.org', 'crowdsourced']",[],['SQuADv1'],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://creativecommons.org/licenses/by-sa/3.0/'}]",,https://paperswithcode.com/dataset/adversarialqa,,https://arxiv.org/abs/2002.00293 +fc-p3-cos_e,Flan Collection (P3),cos_e,['English'],['conceptnet'],[],['CommonSenseQA'],commercial,commercial,"[{'License': 'BSD 3-Clause License', 'License URL': 'https://github.com/salesforce/cos-e/blob/master/LICENSE'}]",BSD 3-Clause License,https://github.com/salesforce/cos-e,https://github.com/salesforce/cos-e,https://arxiv.org/abs/1906.02361 +fc-p3-dbpedia_14,Flan Collection (P3),dbpedia_14,['English'],['dbpedia'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'}]",,https://huggingface.co/datasets/dbpedia_14,, +fc-p3-hotpotqa,Flan Collection (P3),hotpotqa,['English'],['wikipedia.org'],[],[],commercial,commercial,"[{'License': 'CC BY-SA 4.0', 'License URL': 
'http://creativecommons.org/licenses/by-sa/4.0/legalcode'}]",Apache License 2.0,https://github.com/hotpotqa/hotpot,https://github.com/hotpotqa/hotpot,https://arxiv.org/abs/1809.09600 +fc-p3-quarel,Flan Collection (P3),quarel,['English'],[],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://creativecommons.org/licenses/by/4.0'}]",,https://allenai.org/data/quarel,, +fc-p3-quartz,Flan Collection (P3),quartz,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://creativecommons.org/licenses/by/4.0'}]",,https://allenai.org/data/quartz,,https://arxiv.org/abs/1909.03553 +fc-p3-quoref,Flan Collection (P3),quoref,['English'],"['wikipedia.org', 'crowdsourced']",[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://creativecommons.org/licenses/by/4.0/'}]",,https://paperswithcode.com/dataset/quoref,,https://arxiv.org/abs/1908.05803 +fc-p3-web_questions,Flan Collection (P3),web_questions,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'http://creativecommons.org/licenses/by/4.0/'}]",,https://nlp.stanford.edu/software/sempre/,,https://aclanthology.org/D13-1160/ +fc-p3-wiki_bio,Flan Collection (P3),wiki_bio,['English'],['wikipedia.org'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://github.com/rlebret/wikipedia-biography-dataset/blob/master/LICENSE.txt'}]",,https://paperswithcode.com/dataset/wikibio,,https://arxiv.org/abs/1603.07771 +fc-p3-wiki_hop,Flan Collection (P3),wiki_hop,['English'],['wikipedia.org'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'http://qangaroo.cs.ucl.ac.uk/'}]",,https://paperswithcode.com/dataset/wikihop,,https://arxiv.org/abs/1603.07771 +fc-sni-adversarial_qa,Flan Collection (Super-NaturalInstructions),fc-sni-adversarial_qa,['English'],"['wikipedia.org', 'crowdsourced']",[],['SQuADv1'],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 
'https://creativecommons.org/licenses/by-sa/3.0/'}]",,https://paperswithcode.com/dataset/adversarialqa,,https://arxiv.org/abs/2002.00293 +fc-sni-adverserial_qa,Flan Collection (Super-NaturalInstructions),fc-sni-adverserial_qa,['English'],"['wikipedia.org', 'crowdsourced']",[],['SQuADv1'],commercial,,"[{'License': 'MIT License', 'License URL': 'Unspecified'}]",,https://huggingface.co/datasets/adversarial_qa,,https://arxiv.org/abs/2002.00293 +fc-sni-air_dialogue,Flan Collection (Super-NaturalInstructions),fc-sni-air_dialogue,['English'],['crowdsourced'],[],[],commercial,commercial,"[{'License': 'Unspecified', 'License URL': 'https://aclanthology.org/D18-1419.pdf'}]",Apache License 2.0,https://github.com/google/airdialogue,https://github.com/google/airdialogue,https://aclanthology.org/D18-1419/ +fc-sni-ancora_ca_ner,Flan Collection (Super-NaturalInstructions),fc-sni-ancora_ca_ner,"['Catalan', 'English']",[],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://zenodo.org/record/4761746'}]",,https://huggingface.co/datasets/bsc/ancora-ca-ner,, +fc-sni-anem,Flan Collection (Super-NaturalInstructions),fc-sni-anem,['English'],[],[],[],commercial,commercial,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://github.com/juand-r/entity-recognition-datasets/blob/master/data/AnEM/LICENSE'}]",MIT License,https://github.com/juand-r/entity-recognition-datasets/tree/master/data/AnEM,https://github.com/juand-r/entity-recognition-datasets/tree/master/data/AnEM, +fc-sni-argkp,Flan Collection (Super-NaturalInstructions),fc-sni-argkp,['English'],[],[],[],commercial,commercial,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://research.ibm.com/haifa/dept/vst/debating_data.shtml'}]",Apache License 2.0,https://github.com/IBM/KPA_2021_shared_task,https://github.com/IBM/KPA_2021_shared_task,https://arxiv.org/abs/2005.01619 +fc-sni-asian_language_treebank,Flan Collection (Super-NaturalInstructions),fc-sni-asian_language_treebank,"['Indonesian', 'Japanese', 
'English']",[],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://www2.nict.go.jp/astrec-att/member/mutiyama/ALT/'}]",,https://www2.nict.go.jp/astrec-att/member/mutiyama/ALT/,, +fc-sni-atomic,Flan Collection (Super-NaturalInstructions),fc-sni-atomic,['English'],"['crowdsourced', 'conceptnet']",[],['Atomic (Sap et al 2019)'],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://allenai.org/data/atomic-2020'}]",,https://allenai.org/data/atomic-2020,,https://arxiv.org/abs/2010.05953 +fc-sni-bard,Flan Collection (Super-NaturalInstructions),fc-sni-bard,['English'],[],[],[],commercial,commercial,"[{'License': 'Apache License 2.0', 'License URL': 'https://github.com/NancyFulda/BYU-Analogical-Reasoning-Dataset/blob/master/LICENSE'}]",Apache License 2.0,https://github.com/NancyFulda/BYU-Analogical-Reasoning-Dataset,https://github.com/NancyFulda/BYU-Analogical-Reasoning-Dataset, +fc-sni-cedr,Flan Collection (Super-NaturalInstructions),fc-sni-cedr,"['Russian', 'English']",[],[],[],commercial,commercial,"[{'License': 'Unspecified', 'License URL': 'https://www.sciencedirect.com/science/article/pii/S1877050921013247'}]",Apache License 2.0,https://github.com/sag111/CEDR,https://github.com/sag111/CEDR, +fc-sni-circa,Flan Collection (Super-NaturalInstructions),fc-sni-circa,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/google-research-datasets/circa#license'}]",,https://huggingface.co/datasets/circa,,https://arxiv.org/abs/2010.03450 +fc-sni-clue_cmrc2018,Flan Collection (Super-NaturalInstructions),fc-sni-clue_cmrc2018,"['Chinese', 'English']",['wikipedia.org'],[],[],commercial,commercial,"[{'License': 'Unspecified', 'License URL': 'https://aclanthology.org/D19-1600.pdf'}]",CC BY-SA 4.0,https://github.com/ymcui/cmrc2018,https://github.com/ymcui/cmrc2018,https://arxiv.org/abs/1810.07366 +fc-sni-coached_conv_pref,Flan Collection 
(Super-NaturalInstructions),fc-sni-coached_conv_pref,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://github.com/google-research-datasets/ccpe#copyright-notice'}]",,https://research.google/tools/datasets/coached-conversational-preference-elicitation/,,https://aclanthology.org/W19-5941/ +fc-sni-copa_hr,Flan Collection (Super-NaturalInstructions),fc-sni-copa_hr,"['Croatian', 'English']",['human'],[],['COPA dataset'],commercial,,"[{'License': 'BSD 2-Clause License', 'License URL': 'https://people.ict.usc.edu/~gordon/copa.html'}]",,https://huggingface.co/datasets/classla/copa_hr,,https://arxiv.org/abs/2005.00333 +fc-sni-crows_pairs,Flan Collection (Super-NaturalInstructions),fc-sni-crows_pairs,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/nyu-mll/crows-pairs#license'}]",,https://github.com/nyu-mll/crows-pairs,https://github.com/nyu-mll/crows-pairs,https://arxiv.org/abs/2010.00133 +fc-sni-cuad,Flan Collection (Super-NaturalInstructions),fc-sni-cuad,['English'],['sec.gov/edgar/about'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://www.atticusprojectai.org/cuad'}]",,https://huggingface.co/datasets/cuad,,https://arxiv.org/abs/2103.06268 +fc-sni-defeasible_nli_atomic,Flan Collection (Super-NaturalInstructions),fc-sni-defeasible_nli_atomic,['English'],['crowdsourced'],[],"['SNLI', 'social chemestry', 'ATOMIC']",commercial,commercial,"[{'License': 'Unspecified', 'License URL': 'https://ojs.aaai.org//index.php/AAAI/article/view/4160, https://www.aclweb.org/anthology/D15-1075/, https://www.aclweb.org/anthology/2020.emnlp-main.48/'}]",MIT License,https://github.com/rudinger/defeasible-nli,https://github.com/rudinger/defeasible-nli,https://aclanthology.org/2020.findings-emnlp.418/ +fc-sni-disfl_qa,Flan Collection (Super-NaturalInstructions),fc-sni-disfl_qa,['English'],"['wikipedia.org', 'crowdsourced']",[],['SQuADv2'],commercial,,"[{'License': 
'CC BY 4.0', 'License URL': 'https://github.com/google-research-datasets/Disfl-QA#license'}]",,https://github.com/google-research-datasets/Disfl-QA,https://github.com/google-research-datasets/Disfl-QA,https://arxiv.org/abs/2106.04016 +fc-sni-e_snli,Flan Collection (Super-NaturalInstructions),fc-sni-e_snli,['English'],['crowdsourced (amt)'],[],['snli'],commercial,commercial,"[{'License': 'Unspecified', 'License URL': 'https://proceedings.neurips.cc/paper_files/paper/2018/file/4c7a167bb329bd92580a99ce422d6fa6-Paper.pdf'}]",MIT License,https://github.com/OanaMariaCamburu/e-SNLI/,https://github.com/OanaMariaCamburu/e-SNLI/,https://arxiv.org/abs/1812.01193 +fc-sni-gap,Flan Collection (Super-NaturalInstructions),fc-sni-gap,['English'],['wikipedia.org'],[],[],commercial,commercial,"[{'License': 'Unspecified', 'License URL': 'https://github.com/google-research-datasets/gap-coreference#gap-coreference-dataset'}]",Apache License 2.0,https://github.com/google-research-datasets/gap-coreference,https://github.com/google-research-datasets/gap-coreference,https://arxiv.org/abs/1810.05201 +fc-sni-hotpotqa,Flan Collection (Super-NaturalInstructions),fc-sni-hotpotqa,['English'],['wikipedia.org'],[],[],commercial,commercial,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/hotpotqa/hotpot#license'}]",Apache License 2.0,https://github.com/hotpotqa/hotpot,https://github.com/hotpotqa/hotpot,https://arxiv.org/abs/1809.09600 +fc-sni-human_ratings_of_natural_language_generation_outputs,Flan Collection (Super-NaturalInstructions),fc-sni-human_ratings_of_natural_language_generation_outputs,['English'],[],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://researchportal.hw.ac.uk/en/datasets/human-ratings-of-natural-language-generation-outputs'}]",,https://researchportal.hw.ac.uk/en/datasets/human-ratings-of-natural-language-generation-outputs,, +fc-sni-hybridqa,Flan Collection 
(Super-NaturalInstructions),fc-sni-hybridqa,['English'],['wikipedia.org'],[],[],commercial,commercial,"[{'License': 'CC BY 4.0', 'License URL': 'https://hybridqa.github.io/'}]",MIT License,https://github.com/wenhuchen/HybridQA,https://github.com/wenhuchen/HybridQA,https://arxiv.org/abs/2004.07347 +fc-sni-iirc,Flan Collection (Super-NaturalInstructions),fc-sni-iirc,['English'],['wikipedia.org'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://allenai.org/data/iirc'}]",,https://allenai.org/data/iirc,,https://arxiv.org/abs/2011.07127 +fc-sni-jigsaw,Flan Collection (Super-NaturalInstructions),fc-sni-jigsaw,['English'],[],[],[],commercial,,"[{'License': 'CC0 1.0', 'License URL': 'https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/overview/faq'}]",,https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification,, +fc-sni-librispeech_asr,Flan Collection (Super-NaturalInstructions),"fc-sni-librispeech_asr,",['English'],[],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'http://www.openslr.org/12'}]",,https://huggingface.co/datasets/librispeech_asr,, +fc-sni-logic2text,Flan Collection (Super-NaturalInstructions),fc-sni-logic2text,['English'],"['wikipedia.org', 'crowdsourced']",[],['WikiTables'],commercial,commercial,"[{'License': 'Unspecified', 'License URL': 'https://aclanthology.org/2020.findings-emnlp.190.pdf'}]",MIT License,https://github.com/czyssrs/Logic2Text,https://github.com/czyssrs/Logic2Text,https://arxiv.org/abs/2004.14579 +fc-sni-numeric_fused_head,Flan Collection (Super-NaturalInstructions),fc-sni-numeric_fused_head,['English'],[],[],[],commercial,commercial,"[{'License': 'Unspecified', 'License URL': 'https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00280/43502/Where-s-My-Head-Definition-Data-Set-and-Models-for'}]",MIT License,https://github.com/yanaiela/num_fh,https://github.com/yanaiela/num_fh, +fc-sni-offenseval_dravidian,Flan Collection 
(Super-NaturalInstructions),fc-sni-offenseval_dravidian,"['Tamil', 'English']",[],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://aclanthology.org/2021.dravidianlangtech-1.46.pdf'}]",,https://huggingface.co/datasets/offenseval_dravidian,, +fc-sni-open_pi,Flan Collection (Super-NaturalInstructions),fc-sni-open_pi,['English'],['wikihow.com'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://allenai.org/data/openpi'}]",,https://allenai.org/data/openpi,,https://arxiv.org/abs/2011.08092 +fc-sni-paper_reviews_data_set,Flan Collection (Super-NaturalInstructions),fc-sni-paper_reviews_data_set,"['Spanish', 'English']",[],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://archive.ics.uci.edu/dataset/410/paper+reviews'}]",,https://archive.ics.uci.edu/ml/datasets/Paper+Reviews,, +fc-sni-poem_sentiment,Flan Collection (Super-NaturalInstructions),fc-sni-poem_sentiment,['English'],['project gutenberg'],[],['Gutenberg Poem Dataset'],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://github.com/google-research-datasets/poem-sentiment/blob/master/LICENSE'}]",,https://huggingface.co/datasets/poem_sentiment,,https://arxiv.org/abs/2011.02686 +fc-sni-propara,Flan Collection (Super-NaturalInstructions),fc-sni-propara,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'Apache License 2.0', 'License URL': 'https://allenai.org/data/propara'}]",,https://arxiv.org/abs/1805.06975,,https://arxiv.org/abs/1805.06975 +fc-sni-quarel,Flan Collection (Super-NaturalInstructions),fc-sni-quarel,['English'],[],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://allenai.org/data/quarel'}]",,https://allenai.org/data/quarel,, +fc-sni-quartz,Flan Collection (Super-NaturalInstructions),fc-sni-quartz,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://allenai.org/data/quartz'}]",,https://allenai.org/data/quartz,,https://arxiv.org/abs/1909.03553 +fc-sni-quoref,Flan 
Collection (Super-NaturalInstructions),fc-sni-quoref,['English'],"['wikipedia.org', 'crowdsourced']",[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://allenai.org/data/quoref'}]",,https://paperswithcode.com/dataset/quoref,,https://arxiv.org/abs/1908.05803 +fc-sni-ro_sts_parallel,Flan Collection (Super-NaturalInstructions),fc-sni-ro_sts_parallel,"['Romanian', 'English']",[],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/dumitrescustefan/RO-STS'}]",,https://huggingface.co/datasets/ro_sts_parallel,, +fc-sni-schema_guided_dstc8,Flan Collection (Super-NaturalInstructions),fc-sni-schema_guided_dstc8,['English'],[],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/google-research-datasets/dstc8-schema-guided-dialogue/blob/master/LICENSE.txt'}]",,https://huggingface.co/datasets/schema_guided_dstc8,,https://arxiv.org/abs/2002.01359 +fc-sni-scitail,Flan Collection (Super-NaturalInstructions),fc-sni-scitail,['English'],[],[],[],commercial,,"[{'License': 'Apache License 2.0', 'License URL': 'https://allenai.org/data/scitail'}]",,http://data.allenai.org.s3.amazonaws.com/downloads/SciTailV1.1.zip,,https://arxiv.org/abs/1809.05726 +fc-sni-scitailv1.1,Flan Collection (Super-NaturalInstructions),fc-sni-scitailv1.1,['English'],[],[],[],commercial,,"[{'License': 'Apache License 2.0', 'License URL': 'https://allenai.org/data/scitail'}]",,http://data.allenai.org/scitail,,https://arxiv.org/abs/1809.05726 +fc-sni-semeval_2020_task4,Flan Collection (Super-NaturalInstructions),fc-sni-semeval_2020_task4,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://competitions.codalab.org/competitions/21080#learn_the_details-terms_and_conditions'}]",,https://arxiv.org/abs/2007.00236,,https://aclanthology.org/2020.semeval-1.39/ +fc-sni-sms_spam_collection_v.1,Flan Collection 
(Super-NaturalInstructions),fc-sni-sms_spam_collection_v.1,['English'],[],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'http://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection'}]",,https://www.dt.fee.unicamp.br/~tiago/smsspamcollection/,, +fc-sni-splash,Flan Collection (Super-NaturalInstructions),fc-sni-splash,['English'],[],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/MSR-LIT/Splash'}]",,https://arxiv.org/pdf/2005.02539.pdf,https://github.com/MSR-LIT/Splash,https://arxiv.org/abs/2005.02539 +fc-sni-squad_1.1,Flan Collection (Super-NaturalInstructions),fc-sni-squad_1.1,['English'],['wikipedia.org'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://rajpurkar.github.io/SQuAD-explorer/'}]",,https://rajpurkar.github.io/SQuAD-explorer/,https://rajpurkar.github.io/SQuAD-explorer/,https://arxiv.org/abs/1606.05250 +fc-sni-squad2.0,Flan Collection (Super-NaturalInstructions),fc-sni-squad2.0,['English'],['crowdsourced (daemo)'],[],[],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://rajpurkar.github.io/SQuAD-explorer/'}]",,https://arxiv.org/pdf/1806.03822.pdf,,https://arxiv.org/abs/1806.03822 +fc-sni-strategyqa,Flan Collection (Super-NaturalInstructions),fc-sni-strategyqa,['English'],['wikipedia.org'],[],[],commercial,commercial,"[{'License': 'Unspecified', 'License URL': 'https://arxiv.org/abs/2101.02235'}]",MIT License,https://github.com/eladsegal/strategyqa,https://github.com/eladsegal/strategyqa,https://arxiv.org/abs/2101.02235 +fc-sni-universal_dependencies___english_dependency_treebank,Flan Collection (Super-NaturalInstructions),fc-sni-universal_dependencies___english_dependency_treebank,['English'],[],[],[],commercial,commercial,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/UniversalDependencies/UD_English-EWT#licensecopyright'}]",CC BY-SA 4.0,https://github.com/UniversalDependencies/UD_English-EWT,https://github.com/UniversalDependencies/UD_English-EWT, 
+fc-sni-web_questions,Flan Collection (Super-NaturalInstructions),fc-sni-web_questions,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://nlp.stanford.edu/software/sempre/'}]",,https://nlp.stanford.edu/software/sempre/,,https://aclanthology.org/D13-1160/ +fc-sni-wiki_hop,Flan Collection (Super-NaturalInstructions),fc-sni-wiki_hop,['English'],['wikipedia.org'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'http://qangaroo.cs.ucl.ac.uk/'}]",,https://paperswithcode.com/dataset/wikihop,,https://arxiv.org/abs/1603.07771 +fc-sni-wikitext,Flan Collection (Super-NaturalInstructions),fc-sni-wikitext,['English'],['wikipedia.org'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://blog.salesforceairesearch.com/the-wikitext-long-term-dependency-language-modeling-dataset/'}]",,https://blog.salesforceairesearch.com/the-wikitext-long-term-dependency-language-modeling-dataset/,,https://arxiv.org/abs/1609.07843 +fc-sni-winograd_wsc,Flan Collection (Super-NaturalInstructions),fc-sni-winograd_wsc,['English'],[],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://cs.nyu.edu/~davise/papers/WinogradSchemas/WS.html'}]",,https://huggingface.co/datasets/winograd_wsc,, +fc-sni-winomt,Flan Collection (Super-NaturalInstructions),fc-sni-winomt,['English'],['grammar-based'],[],"['Winogender', 'WinoBias']",commercial,commercial,"[{'License': 'Unspecified', 'License URL': 'https://github.com/gabrielStanovsky/mt_gender#evaluating-gender-bias-in-machine-translation'}]",MIT License,https://github.com/gabrielStanovsky/mt_gender,https://github.com/gabrielStanovsky/mt_gender,https://arxiv.org/abs/1906.00591 +fc-sni-winowhy,Flan Collection (Super-NaturalInstructions),fc-sni-winowhy,['English'],"['winograd schema challenge dataset', 'conceptnet', 'crowdsourced']",[],[],commercial,commercial,"[{'License': 'Unspecified', 'License URL': 'https://arxiv.org/abs/2005.05763'}]",MIT 
License,https://github.com/HKUST-KnowComp/WinoWhy,https://github.com/HKUST-KnowComp/WinoWhy,https://arxiv.org/abs/2005.05763 +fc-sni-wsc_fiexed,Flan Collection (Super-NaturalInstructions),fc-sni-wsc_fiexed,['English'],[],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://github.com/google-research-datasets/boolean-questions#license'}]",,https://huggingface.co/datasets/super_glue,https://github.com/google-research-datasets/boolean-questions, +fc-sni-wsc; enhanced_wsc,Flan Collection (Super-NaturalInstructions),fc-sni-wsc; enhanced_wsc,['English'],[],[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://cs.nyu.edu/~davise/papers/WinogradSchemas/WS.html'}]",,https://huggingface.co/datasets/winograd_wsc; https://github.com/mhany90/perturbed-wsc,https://huggingface.co/datasets/winograd_wsc; https://github.com/mhany90/perturbed-wsc, +fc-sni-xcopa,Flan Collection (Super-NaturalInstructions),fc-sni-xcopa,"['Haitian', 'English']",['human'],[],['COPA dataset'],commercial,commercial,"[{'License': 'CC BY 4.0', 'License URL': 'https://github.com/cambridgeltl/xcopa/blob/master/LICENSE.md'}]",CC BY 4.0,https://github.com/cambridgeltl/xcopa,https://github.com/cambridgeltl/xcopa,https://arxiv.org/abs/2005.00333 +fc-sni-xquad,Flan Collection (Super-NaturalInstructions),fc-sni-xquad,['English'],['wikipedia.org'],[],['Squad'],commercial,,"[{'License': 'CC BY-SA 4.0', 'License URL': 'https://github.com/deepmind/xquad#license'}]",,https://huggingface.co/datasets/viewer/?dataset=xquad,,https://arxiv.org/abs/1910.11856 +HelpSteer,HelpSteer,Helpfulness SteerLM,['English'],"['scale.com', 'human']",[],[],commercial,,"[{'License': 'CC BY 4.0', 'License URL': 'https://huggingface.co/datasets/nvidia/HelpSteer'}]",,https://huggingface.co/datasets/nvidia/HelpSteer,,https://arxiv.org/abs/2311.09528 diff --git a/data_provenance/include_test.csv b/data_provenance/include_test.csv new file mode 100644 index 0000000..b0769b8 --- /dev/null +++ 
b/data_provenance/include_test.csv @@ -0,0 +1,9 @@ +,Dataset ID,Collection,Dataset Name,Languages,Text Sources,Model Generated,Derived from Datasets,License Use (DataProvenance),License Use (GitHub),Licenses,GitHub License,Dataset URL,GitHub URL,ArXiv URL +68,dolly-openqa,Dolly 15k,dolly-open_qa,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://huggingface.co/datasets/databricks/databricks-dolly-15k#summary'}]",,https://huggingface.co/datasets/databricks/databricks-dolly-15k,, +69,dolly-closedqa,Dolly 15k,dolly-closed_qa,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://huggingface.co/datasets/databricks/databricks-dolly-15k#summary'}]",,https://huggingface.co/datasets/databricks/databricks-dolly-15k,, +70,dolly-classification,Dolly 15k,dolly-classification,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://huggingface.co/datasets/databricks/databricks-dolly-15k#summary'}]",,https://huggingface.co/datasets/databricks/databricks-dolly-15k,, +71,dolly-brainstorming,Dolly 15k,dolly-brainstorming,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://huggingface.co/datasets/databricks/databricks-dolly-15k#summary'}]",,https://huggingface.co/datasets/databricks/databricks-dolly-15k,, +72,dolly-infoextract,Dolly 15k,dolly-infoextract,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://huggingface.co/datasets/databricks/databricks-dolly-15k#summary'}]",,https://huggingface.co/datasets/databricks/databricks-dolly-15k,, +73,dolly-summarization,Dolly 15k,dolly-summarization,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://huggingface.co/datasets/databricks/databricks-dolly-15k#summary'}]",,https://huggingface.co/datasets/databricks/databricks-dolly-15k,, 
+74,dolly-creative_writing,Dolly 15k,dolly-creative_writing,['English'],['crowdsourced'],[],[],commercial,,"[{'License': 'CC BY-SA 3.0', 'License URL': 'https://huggingface.co/datasets/databricks/databricks-dolly-15k#summary'}]",,https://huggingface.co/datasets/databricks/databricks-dolly-15k,, +77,oasst-en-octopack,Open Assistant OctoPack,oasst,"['English', 'Spanish', 'Russian', 'Mandarin Chinese', 'German', 'French', 'Thai', 'Portugese (Brazilian)', 'Catalan', 'Ukrainian', 'Italian', 'Japanese', 'Polish', 'Basque', 'Vietnamese', 'Hungarian', 'Arabic', 'Danish', 'Turkish', 'Code']",['crowdsourced'],[],[],commercial,commercial,"[{'License': 'CC BY 4.0', 'License URL': 'https://open-assistant.io/'}]",Apache License 2.0,https://github.com/LAION-AI/Open-Assistant,https://github.com/LAION-AI/Open-Assistant,https://arxiv.org/abs/2304.07327 diff --git a/data_provenance/source_allow_list.txt b/data_provenance/source_allow_list.txt new file mode 100644 index 0000000..b9bc442 --- /dev/null +++ b/data_provenance/source_allow_list.txt @@ -0,0 +1,23 @@ +crowdflower.com +crowdsourced +crowdsourced (amt) +crowdsourced (daemo) +human +User interactions +volunteer annotations +grammar-based +wikidata +wikipedia.org +wikihow.com +wikisource +wiktionary.org +wordnet +stackexchange.com +github +dbpedia +verbnet +project gutenberg +conceptnet +scale.com +winograd schema challenge dataset +sec.gov/edgar/about diff --git a/data_provenance/to-dolma.py b/data_provenance/to-dolma.py new file mode 100644 index 0000000..b336290 --- /dev/null +++ b/data_provenance/to-dolma.py @@ -0,0 +1,158 @@ +"""Convert the downloaded data into the dolma sharded jsonl.gz format. + +We use `download.py` to save our own intermediate copy of the data, before +preparing for dolma, in this file. 
+""" + +import argparse +import functools +import gzip +import itertools +import json +import os +from datetime import datetime + +import jsonlines +import pandas as pd +from constants import HF_MAPPING + +from licensed_pile.licenses import PermissiveLicenses +from licensed_pile.logs import configure_logging, get_logger +from licensed_pile.write import to_dolma + +LICENSE_MAPPER = { + "MPL 2.0": PermissiveLicenses.MPL, + "CDLA Permissive 1.0": PermissiveLicenses.CDLA_P, + "MIT License": PermissiveLicenses.MIT, + "CC BY 4.0": PermissiveLicenses.CC_BY, + "CC0 1.0": PermissiveLicenses.CC0, + "BSD 2-Clause License": PermissiveLicenses.BSD_2, + "BSD 3-Clause License": PermissiveLicenses.BSD_3, + "Apache License 2.0": PermissiveLicenses.APACHE_2, + "ISC License": PermissiveLicenses.ISC, + "EPL 1.0": PermissiveLicenses.EPL, + "CC BY-SA": PermissiveLicenses.CC_BY_SA, + "CC BY 3.0": PermissiveLicenses.CC_BY_3, + "CC BY-SA 3.0": PermissiveLicenses.CC_BY_SA_3, + "Artistic License 2.0": PermissiveLicenses.ARTISTIC_2, + "CC BY-SA 4.0": PermissiveLicenses.CC_BY_SA, +} + +parser = argparse.ArgumentParser( + description="Collect Data Provenance datasets into Dolma format." +) +parser.add_argument( + "--indir", + default="data/raw-data-provenance", + help="Path to our directory of raw datasets.", +) +parser.add_argument( + "--outdir", + default="data/data-provenance/v0/documents/", + help="Where the dolma formatted data goes.", +) +parser.add_argument( + "--include", + default="include.csv", + help="The csv with metadata on data provenance datasets.", +) +parser.add_argument( + "--filename", default="dpi.jsonl.gz", help="The base filename for our datasets." +) +parser.add_argument( + "--shard_size", type=int, default=1, help="Size, in GB, for each shard." 
+)
+
+SOURCE_NAME = "Data Provenance Initiative"
+
+
+def listdir_nohidden(path):
+    """Return paths of the non-hidden entries in `path`; ValueError if not a dir."""
+    if not os.path.isdir(path):  # isdir() is also False when `path` is missing.
+        raise ValueError(
+            f"Provided path '{path}' is either not a directory or does not exist."
+        )
+    return [os.path.join(path, f) for f in os.listdir(path) if not f.startswith(".")]
+
+
+def read_jsonl_gz(inpath: str):
+    """Read a gzipped JSON-lines file into a list with one parsed value per line."""
+    with gzip.open(inpath, "rb") as fp:
+        return [json.loads(line) for line in fp]
+
+
+def extract_licenses(license_list, gh_license):
+    """Return the set of mapped license names for one row of the include csv.
+
+    `license_list` is the "Licenses" cell (a python-literal list of dicts) and
+    `gh_license` the "GitHub License" cell ("" if unset). Always a set, so a
+    GitHub license repeating a listed one appears once; unmapped -> KeyError.
+    """
+    import ast  # Local import; literal_eval parses the cell without executing it.
+    names = {d["License"] for d in ast.literal_eval(license_list)} - {"Unspecified"}
+    names |= {gh_license} if gh_license else set()
+    return {str(LICENSE_MAPPER[name]) for name in names}
+
+
+def file_to_dolma(path: str, include_df: pd.DataFrame, source_name: str = SOURCE_NAME):
+    """Convert one downloaded jsonl.gz file into a list of dolma records.
+
+    Metadata for each example is looked up by its "dataset" field in the
+    "Dataset ID" column of `include_df`, the include csv with NaNs set to "".
+    """
+    import ast  # Local import; literal_eval replaces eval() on the csv cells.
+    get_logger().info(f"Converting {path} to the dolma format.")
+    rows = [row for _, row in include_df.iterrows()]
+    dset_to_licenses = {
+        r["Dataset ID"]: extract_licenses(r["Licenses"], r["GitHub License"])
+        for r in rows
+    }
+    # With several license entries for one dataset only the last url is kept.
+    dset_to_license_urls = {
+        r["Dataset ID"]: entry["License URL"]
+        for r in rows
+        for entry in ast.literal_eval(r["Licenses"])
+    }
+    dset_to_langs = {r["Dataset ID"]: ast.literal_eval(r["Languages"]) for r in rows}
+    dset_to_urls = {r["Dataset ID"]: r["Dataset URL"] for r in rows}
+
+    results = []
+    for i, ex in enumerate(read_jsonl_gz(path)):
+        dataset_id = ex["dataset"]
+        target_text = ex.get("labels", ex.get("targets", ""))
+        results.append(
+            {
+                "id": f"{dataset_id}-{i}",
+                # If target_text isn't found, the strip will remove the extra newline
+                "text": f"{ex['inputs']}\n{target_text}".strip(),
+                "source": source_name,
+                "added": datetime.utcnow().isoformat(),
+                "metadata": {
+                    "license": sorted(dset_to_licenses[dataset_id]),
+                    "license_url": dset_to_license_urls[dataset_id],
+                    "language": dset_to_langs[dataset_id],
+                    "url": dset_to_urls[dataset_id],
+                    "dataset_id": dataset_id,
+                    "response": target_text,
+                },
+            }
+        )
+    return results
+
+
+def main(args):
+    """Convert every raw file in `args.indir` and write the dolma shards."""
+    os.makedirs(args.outdir, exist_ok=True)
+    include_df = pd.read_csv(args.include).fillna("")
+    # from_iterable keeps the chain lazy; chain(*...) converted every file up front.
+    examples = itertools.chain.from_iterable(
+        file_to_dolma(path, include_df=include_df)
+        for path in listdir_nohidden(args.indir)
+    )
+    to_dolma(examples, args.outdir, args.filename, args.shard_size)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    configure_logging()
+    main(args)
diff --git a/licensed_pile/licenses.py b/licensed_pile/licenses.py
index f1e9baf..6367480 100644
--- a/licensed_pile/licenses.py
+++ b/licensed_pile/licenses.py
@@ -10,6 +10,11 @@ def __str__(self):
 
 
 class PermissiveLicenses(StringEnum):
+    """By 'Permissive' we mean licenses that are in the Gold, Silver, or Bronze
+    lists of the Blue Oak Council (https://blueoakcouncil.org/list), even if
+    they have copyleft requirements.
+    """
+
     PD = "Public Domain"
     CC0 = "Creative Commons Zero - Public Domain - https://creativecommons.org/publicdomain/zero/1.0/"
     CC_BY = (
@@ -24,7 +29,14 @@ class PermissiveLicenses(StringEnum):
     GFDL = "GNU Free Documentation License"
     APACHE_2 = "Apache 2 License - https://www.apache.org/licenses/LICENSE-2.0"
     MIT = "MIT License"
-    BSD = "BSD License"
+    BSD_2 = "BSD 2-Clause"
+    BSD_3 = "BSD 3-Clause"
+
+    ISC = "ISC License"
+    ARTISTIC_2 = "Artistic License 2.0"
+
+    # Not in the Blue Oak Council list, but open source compliant.
+    CDLA_P = "Community Data License Agreement - Permissive 1.0 - https://cdla.dev/"
 
 # TODO: Fill out this function to match in more cases.
# Note: This kind of function will always be messy and probably require diff --git a/pyproject.toml b/pyproject.toml index a431c42..e9d5561 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,4 +7,3 @@ profile = "black" [build-system] # Minimum requirements for the build system to execute. requires = ["setuptools", "wheel"] # PEP 508 specifications. - diff --git a/requirements.txt b/requirements.txt index 7081d52..0d60bfa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,7 @@ rdflib requests>=2.13 smart_open tenacity -tqdm \ No newline at end of file +pandas +jsonlines +datasets +tqdm