diff --git a/seacrowd/sea_datasets/belebele/belebele.py b/seacrowd/sea_datasets/belebele/belebele.py index c409018b6..de9d48a2c 100644 --- a/seacrowd/sea_datasets/belebele/belebele.py +++ b/seacrowd/sea_datasets/belebele/belebele.py @@ -130,12 +130,14 @@ def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datase path = dl_manager.download_and_extract(_URLS[_DATASETNAME]) file = "{path}/Belebele/{lang}.jsonl".format(path=path, lang=lang) - return datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={ - "file": file, - }, - ) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "file": file, + }, + ), + ] def _generate_examples(self, file: str) -> Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" @@ -152,8 +154,8 @@ def _generate_examples(self, file: str) -> Tuple[int, Dict]: answer = choices[int(line['correct_answer_num'])-1] yield key, { "id": key, - "question_id": line['question_number'], - "document_id": hashlib.md5(line['question_number'].encode('utf-8')).hexdigest(), + "question_id": str(line['question_number']), + "document_id": hashlib.md5(line['flores_passage'].encode('utf-8')).hexdigest(), "question": line['question'], "type": 'multiple_choice', "choices": choices,