Merge pull request SEACrowd#85 from sabilmakbar/sabil/qa_backward_compatibility

Relates SEACrowd#36 | Add Commonsense Reasoning Task & Extend QA Schema to cater for Metadata (as optional field)
sabilmakbar · Nov 20, 2023 · e5db66c · e5db66c
2 parents bfd4f23 + 4202870
commit e5db66c
Show file tree

Hide file tree

Showing 9 changed files with 21 additions and 4 deletions.
diff --git a/seacrowd/sea_datasets/belebele/belebele.py b/seacrowd/sea_datasets/belebele/belebele.py
@@ -182,6 +182,7 @@ def _generate_examples(self, file: str) -> Tuple[int, Dict]:
                         "choices": choices,
                         "context": line['flores_passage'],
                         "answer": [answer],
+                        "meta": {}
                     }
         else:
-            raise ValueError(f"Invalid config {self.config.name}")
+            raise ValueError(f"Invalid config {self.config.name}")
diff --git a/seacrowd/sea_datasets/facqa/facqa.py b/seacrowd/sea_datasets/facqa/facqa.py
@@ -151,5 +151,6 @@ def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
                     "choices": [],
                     "context": listToString(ast.literal_eval(row.passage)),
                     "answer": [getAnswerString(ast.literal_eval(row.passage), ast.literal_eval(row.seq_label))],
+                    "meta": {}
                 }
                 yield row.index, entry
diff --git a/seacrowd/sea_datasets/idk_mrc/idk_mrc.py b/seacrowd/sea_datasets/idk_mrc/idk_mrc.py
@@ -228,4 +228,5 @@ def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:
                         "choices": [],
                         "context": example["context"],
                         "answer": [ans["text"] for ans in qa["answers"]],
+                        "meta": {}
                     }
diff --git a/seacrowd/sea_datasets/mlqa/mlqa.py b/seacrowd/sea_datasets/mlqa/mlqa.py
@@ -240,5 +240,5 @@ def _generate_examples(self, filepath: Path, split: str, files=None) -> Tuple[in
                             count += 1
 
                         elif self.config.schema == "seacrowd_qa":
-                            yield count, {"question_id": id_, "context": context, "question": question, "answer": {"answer_start": answers_start[0], "text": answers_text[0]}, "id": id_, "choices": [], "type": "extractive", "document_id": count}
+                            yield count, {"question_id": id_, "context": context, "question": question, "answer": {"answer_start": answers_start[0], "text": answers_text[0]}, "id": id_, "choices": [], "type": "extractive", "document_id": count, "meta":{}}
                             count += 1
diff --git a/seacrowd/sea_datasets/squad_id/squad_id.py b/seacrowd/sea_datasets/squad_id/squad_id.py
@@ -120,6 +120,7 @@ def _generate_examples(self, filepath: Path):
                                     "choices": [],
                                     "context": each_data["context"],
                                     "answer": answers,
+                                    "meta": {}
                                 }
 
                             else:

diff --git a/seacrowd/sea_datasets/tydiqa_id/tydiqa_id.py b/seacrowd/sea_datasets/tydiqa_id/tydiqa_id.py
@@ -180,7 +180,8 @@ def _generate_examples(self, filepath: Path):
                     "type": 'abstractive',
                     "choices": [],
                     "context": example['context'],
-                    "answer": [example['label']]
+                    "answer": [example['label']],
+                    "meta": {}
                 }
         else:
             raise ValueError(f"Invalid config: {self.config.name}")
diff --git a/seacrowd/sea_datasets/xcopa/xcopa.py b/seacrowd/sea_datasets/xcopa/xcopa.py
@@ -54,7 +54,7 @@
     ]
 }
 
-_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING]
+_SUPPORTED_TASKS = [Tasks.COMMONSENSE_REASONING]
 
 _SOURCE_VERSION = "1.0.0"
 
@@ -157,6 +157,7 @@ def _generate_examples(self, filepath):
                         "choices": [data["choice1"], data["choice2"]],
                         "context": data["premise"],
                         "answer": [data["choice1"] if data["label"] == 0 else data["choice2"]],
+                        "meta": {}
                     }
                     yield idx, sample
 

diff --git a/seacrowd/utils/constants.py b/seacrowd/utils/constants.py
@@ -58,6 +58,7 @@ class Tasks(Enum):
     TOKEN_LEVEL_LANGUAGE_IDENTIFICATION = "LANGID"
 
     # Pair Text Classification
+    COMMONSENSE_REASONING = "CR"
     QUESTION_ANSWERING = "QA"
     TEXTUAL_ENTAILMENT = "TE"
     SEMANTIC_SIMILARITY = "STS"
@@ -191,6 +192,7 @@ class Licenses(Enum):
     Tasks.KEYWORD_TAGGING: "SEQ_LABEL",
     Tasks.SENTENCE_ORDERING: "SEQ_LABEL",
     Tasks.TOKEN_LEVEL_LANGUAGE_IDENTIFICATION: "SEQ_LABEL",
+    Tasks.COMMONSENSE_REASONING: "QA",
     Tasks.QUESTION_ANSWERING: "QA",
     Tasks.TEXTUAL_ENTAILMENT: "PAIRS",
     Tasks.SEMANTIC_SIMILARITY: "PAIRS_SCORE",

diff --git a/seacrowd/utils/schemas/qa.py b/seacrowd/utils/schemas/qa.py
@@ -13,5 +13,14 @@
         "choices": datasets.Sequence(datasets.Value("string")),
         "context": datasets.Value("string"),
         "answer": datasets.Sequence(datasets.Value("string")),
+
+        # the schema of 'meta' is intentionally left unspecified to allow some flexibility
+        "meta": {}
+
+        # notes on how to use the 'meta' field
+        # you can choose one of two options:
+        # 1. define it as an empty dict if you don't think it's usable in `_generate_examples`, or
+        # 2. define meta as a dict whose keys are the intended meta column names and whose values are `datasets.Features` types
+        #    in the Dataloader's `_info` method, then populate it with the values in the Dataloader's `_generate_examples` method
     }
 )