diff --git a/seacrowd/utils/constants.py b/seacrowd/utils/constants.py index cb8c537bd..965701caf 100644 --- a/seacrowd/utils/constants.py +++ b/seacrowd/utils/constants.py @@ -5,6 +5,7 @@ from seacrowd.utils.schemas import ( image_text_features, kb_features, + tree_features, pairs_features, pairs_features_score, pairs_multi_features, @@ -43,9 +44,17 @@ class Tasks(Enum): WORD_SENSE_DISAMBIGUATION = "WSD" COREFERENCE_RESOLUTION = "COREF" + + # Tree + CONSTITUENCY_PARSING = "CONST_PAR" + + # Single Text Classification + ASPECT_BASED_SENTIMENT_ANALYSIS = "ABSA" + # Single Text Classification (single-label) ABUSIVE_LANGUAGE_PREDICTION = "ABL" DOMAIN_KNOWLEDGE_CLASSIFICATION = "DKC" # classification for non NLP-oriented label + EMOTION_CLASSIFICATION = "EC" LANGUAGE_IDENTIFICATION = "LI" HOAX_NEWS_CLASSIFICATION = "HNC" @@ -207,6 +216,7 @@ class Licenses(Enum): TASK_TO_SCHEMA = { Tasks.DEPENDENCY_PARSING: "KB", + Tasks.CONSTITUENCY_PARSING: "TREE", Tasks.WORD_SENSE_DISAMBIGUATION: "T2T", Tasks.WORD_ANALOGY: "T2T", Tasks.KEYWORD_EXTRACTION: "SEQ_LABEL", @@ -276,6 +286,7 @@ class Licenses(Enum): SCHEMA_TO_FEATURES = { "KB": kb_features, + "TREE": tree_features, "QA": qa_features, "T2T": text2text_features, "TEXT": text_features(), diff --git a/seacrowd/utils/schemas/__init__.py b/seacrowd/utils/schemas/__init__.py index b517372e2..d95c8e4c4 100644 --- a/seacrowd/utils/schemas/__init__.py +++ b/seacrowd/utils/schemas/__init__.py @@ -1,5 +1,6 @@ from .image_text import features as image_text_features from .kb import features as kb_features +from .tree import features as tree_features from .pairs import features as pairs_features from .pairs import features_with_continuous_label as pairs_features_score from .pairs_multilabel import features as pairs_multi_features @@ -19,6 +20,7 @@ __all__ = [ "image_text_features", "kb_features", + "tree_features", "pairs_features", "pairs_features_score", "pairs_multi_features", diff --git a/seacrowd/utils/schemas/tree.py 
b/seacrowd/utils/schemas/tree.py
new file mode 100644
index 000000000..b84c062e9
--- /dev/null
+++ b/seacrowd/utils/schemas/tree.py
@@ -0,0 +1,126 @@
+"""\
+Tree Schema
+
+This schema describes a passage (e.g. a sentence) as a tree of nodes. Each
+node carries a type, the text span it covers, the offsets of that span, and
+the ids of its direct subnodes.
+
+For example:
+
+               SUBNODE1 - word1
+             //
+       NODE1 - SUBNODE2 - word2
+     //
+ROOT - NODE2 - SUBNODE3 - word3
+     \\
+       NODE3 - SUBNODE4 - word4
+             \\
+               SUBNODE5 - word5
+
+Schema structure:
+
+    "id": sentence_id,
+    "passage": {
+        "id": sentence_id,
+        "type": None,
+        "text": "word1 word2 word3 word4 word5",
+        "offsets": [0, 29]
+    },
+    "nodes": [
+        {
+            "id": 0,
+            "type": ROOT,
+            "text": "word1 word2 word3 word4 word5",
+            "offsets": [0, 29],
+            "subnodes": [1, 2, 3]
+        },
+        {
+            "id": 1,
+            "type": NODE1,
+            "text": "word1 word2",
+            "offsets": [0, 11],
+            "subnodes": [4, 5]
+        },
+        {
+            "id": 2,
+            "type": NODE2,
+            "text": "word3",
+            "offsets": [12, 17],
+            "subnodes": [6]
+        },
+        {
+            "id": 3,
+            "type": NODE3,
+            "text": "word4 word5",
+            "offsets": [18, 29],
+            "subnodes": [7, 8]
+        },
+        {
+            "id": 4,
+            "type": SUBNODE1,
+            "text": "word1",
+            "offsets": [0, 5],
+            "subnodes": []
+        },
+        {
+            "id": 5,
+            "type": SUBNODE2,
+            "text": "word2",
+            "offsets": [6, 11],
+            "subnodes": []
+        },
+        {
+            "id": 6,
+            "type": SUBNODE3,
+            "text": "word3",
+            "offsets": [12, 17],
+            "subnodes": []
+        },
+        {
+            "id": 7,
+            "type": SUBNODE4,
+            "text": "word4",
+            "offsets": [18, 23],
+            "subnodes": []
+        },
+        {
+            "id": 8,
+            "type": SUBNODE5,
+            "text": "word5",
+            "offsets": [24, 29],
+            "subnodes": []
+        }
+    ]
+
+In the example, offsets are [start, end) character positions into the passage
+text, and a node with an empty "subnodes" list has no children.
+
+NOTE(review): the example is illustrative and does not match the declared
+types exactly. `features` below types node "id" and the "subnodes" entries as
+strings and "passage.text" as a sequence of strings, while the example writes
+integer ids and a single string. Confirm which form dataloaders should emit.
+"""
+import datasets
+
+# Hugging Face `datasets` feature specification for the tree schema described in
+# the module docstring.
+features = datasets.Features(
+    {
+        "id": datasets.Value("string"),
+        "passage": {
+            "id": datasets.Value("string"),
+            "type": datasets.Value("string"),
+            "text": datasets.Sequence(datasets.Value("string")),
+            "offsets": datasets.Sequence(datasets.Value("int32")),
+        },
+        "nodes": [
+            {
+                "id": datasets.Value("string"),
+                "type": datasets.Value("string"),
+                "text": datasets.Value("string"),
+                "offsets": datasets.Sequence(datasets.Value("int32")),
+                "subnodes": datasets.Sequence(datasets.Value("string")),  # ids of subnodes
+            }
+        ],
+    }
+)