SEACrowd · holylovenia · May 1, 2024 · Apr 2, 2024 · Apr 2, 2024 · Apr 18, 2024
@@ -0,0 +1,277 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+import gdown
+
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses, Tasks
+
+# BibTeX entry passed to `datasets.DatasetInfo(citation=...)` in `_info`.
+_CITATION = """\
+@inproceedings{10.1145/3587819.3592545,
+    author = {Prakash, Nirmalendu and Hee, Ming Shan and Lee, Roy Ka-Wei},
+    title = {TotalDefMeme: A Multi-Attribute Meme dataset on Total Defence in Singapore},
+    year = {2023},
+    isbn = {9798400701481},
+    publisher = {Association for Computing Machinery},
+    address = {New York, NY, USA},
+    url = {https://doi.org/10.1145/3587819.3592545},
+    doi = {10.1145/3587819.3592545},
+    booktitle = {Proceedings of the 14th Conference on ACM Multimedia Systems},
+    pages = {369–375},
+    numpages = {7},
+    keywords = {multimodal, meme, dataset, topic clustering, stance classification},
+    location = {Vancouver, BC, Canada},
+    series = {MMSys '23}
+}
+"""
+
+# Used as the prefix of every config name, as the `subset_id`, and as the name
+# of the local download directory / zip file in `_split_generators`.
+_DATASETNAME = "total_defense_meme"
+
+_DESCRIPTION = """\
+This is a large-scale multimodal and multi-attribute dataset containing memes
+about Singapore's Total Defence policy from different social media platforms.
+The type (Singaporean or generic), pillars (military, civil, economic, social,
+psychological, digital, others), topics and stances (against, neutral,
+supportive) of each meme are manually identified by annotators.
+"""
+
+_HOMEPAGE = "https://gitlab.com/bottle_shop/meme/TotalDefMemes"
+
+_LANGUAGES = ["eng"]
+
+_LICENSE = Licenses.UNKNOWN.value
+
+_LOCAL = False
+
+# "image" is a Google Drive share link fetched with `gdown` (fuzzy URL matching);
+# "annotations" is a raw JSON file fetched through the `datasets` download manager.
+_URLS = {
+    "image": "https://drive.google.com/file/d/1oJIh4QQS3Idff2g6bZORstS5uBROjUUz/view?usp=share_link",
+    "annotations": "https://gitlab.com/bottle_shop/meme/TotalDefMemes/-/raw/main/report/annotation.json?ref_type=heads",
+}
+
+_SUPPORTED_TASKS = [Tasks.OPTICAL_CHARACTER_RECOGNITION, Tasks.IMAGE_CLASSIFICATION_MULTILABEL]
+# Keyed by each task's enum value; the mapped schema name is the lower-cased
+# TASK_TO_SCHEMA entry prefixed with "seacrowd_".
+_SEACROWD_SCHEMA = {
+    task.value: f"seacrowd_{TASK_TO_SCHEMA[task].lower()}" for task in _SUPPORTED_TASKS
+}  # ocr: imtext, imc_multi: image_multi
+
+_SOURCE_VERSION = "1.0.0"
+
+_SEACROWD_VERSION = "1.0.0"
+
+
+class TotalDefenseMemeDataset(datasets.GeneratorBasedBuilder):
+    """Multimodal dataset containing memes about Singapore's Total Defence policy"""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+
+    BUILDER_CONFIGS = [
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_source",
+            version=SOURCE_VERSION,
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=_DATASETNAME,
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_{_SEACROWD_SCHEMA['OCR']}",
+            version=SEACROWD_VERSION,
+            description=f"{_DATASETNAME} SEACrowd schema",
+            schema=_SEACROWD_SCHEMA["OCR"],
+            subset_id=_DATASETNAME,
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_{_SEACROWD_SCHEMA['IMC_MULTI']}",
+            version=SEACROWD_VERSION,
+            description=f"{_DATASETNAME} SEACrowd schema",
+            schema=_SEACROWD_SCHEMA["IMC_MULTI"],
+            subset_id=_DATASETNAME,
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        # define labelling
+        meme_type = ["Non_Memes", "Non_SG_Memes", "SG_Memes"]
+        pillar_type = [
+            "Social",
+            "Economic",
+            "Psychological",
+            "Military",
+            "Civil",
+            "Digital",
+            "Others",
+        ]
+        stance_type = ["Against", "Neutral", "Supportive"]
+
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "image_path": datasets.Value("string"),
+                    "categories": datasets.Sequence(datasets.ClassLabel(names=meme_type)),
+                    "text": datasets.Value("string"),
+                    "tags": datasets.Sequence(datasets.Value("string")),
+                    "pillar_stances": datasets.Sequence(
+                        {
+                            "category": datasets.ClassLabel(names=pillar_type),
+                            "stance": datasets.Sequence(datasets.ClassLabel(names=stance_type)),
+                        }
+                    ),
+                }
+            )
+
+        elif self.config.schema == _SEACROWD_SCHEMA["OCR"]:  # all images
+            features = schemas.image_text_features(label_names=meme_type)
+            features["metadata"] = {
+                "tags": datasets.Sequence(datasets.Value("string")),
+                "pillar_stances": datasets.Sequence(
+                    {
+                        "category": datasets.ClassLabel(names=pillar_type),
+                        "stance": datasets.Sequence(datasets.ClassLabel(names=stance_type)),
+                    }
+                ),
+            }
+        elif self.config.schema == _SEACROWD_SCHEMA["IMC_MULTI"]:  # sg meme images only
+            features = schemas.image_multi_features(label_names=pillar_type)
+            features["metadata"] = {
+                "tags": datasets.Sequence(datasets.Value("string")),
+                "stances": datasets.Sequence(datasets.Sequence(datasets.ClassLabel(names=stance_type))),
+            }
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        # download image from gdrive
+        output_dir = Path.cwd() / "data" / _DATASETNAME
+        output_dir.mkdir(parents=True, exist_ok=True)
+        output_file = output_dir / f"{_DATASETNAME}.zip"
+        if not output_file.exists():
+            gdown.download(_URLS["image"], str(output_file), fuzzy=True)
+        else:
+            print(f"File already downloaded: {str(output_file)}")
+        # extract image data
+        image_dir = Path(dl_manager.extract(output_file)) / "TD_Memes"
+
+        # download annotations
+        annotation_path = Path(dl_manager.download(_URLS["annotations"]))
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "image_dir": image_dir,
+                    "annotation_file": annotation_path,
+                },
+            ),
+        ]
+
+    def _generate_examples(self, image_dir: Path, annotation_file: Path) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+        # load annotation
+        with open(annotation_file, "r", encoding="utf-8") as file:
+            annotation = json.load(file)
+
+        # get unique image names
+        image_names = sorted(
+            list(
+                set(annotation["Non_Memes"])
+                | set(annotation["Non_SG_Memes"])
+                | set(annotation["SG_Memes"])
+            )
+        )
+
+        # annotation data is a list of dict, instead of dict of image names
+        def get_value(image_name, list_of_dicts):
+            for dictionary in list_of_dicts:
+                if image_name in dictionary:
+                    return dictionary[image_name]
+            return None
+
+        key = 0
+        for image_name in image_names:
+            # assert image exist in directory
+            assert (image_dir / image_name).exists(), f"Image {image_name} not found"
+            image_path = str(image_dir / image_name)
+
+            # get categories, can be multiple
+            categories = []
+            if image_name in annotation["Non_Memes"]:
+                categories.append("Non_Memes")
+            if image_name in annotation["Non_SG_Memes"]:
+                categories.append("Non_SG_Memes")
+            if image_name in annotation["SG_Memes"]:
+                categories.append("SG_Memes")
+
+            # get attributes
+            text = get_value(image_name, annotation["Text"])
+            tags = get_value(image_name, annotation["Tags"])
+            raw_pillar_stances = get_value(image_name, annotation["Pillar_Stances"])
+
+            # process pillar stances
+            pillar_stances = []
+            if raw_pillar_stances:
+                for pillar, stances in raw_pillar_stances:
+                    category = pillar.split(" ")[0]
+                    pillar_stances.append({"category": category, "stance": stances})
+
+            # source schema
+            if self.config.schema == "source":
+                yield key, {
+                    "image_path": image_path,
+                    "categories": categories,
+                    "text": text,
+                    "tags": tags,
+                    "pillar_stances": pillar_stances,
+                }
+                key += 1
+
+            # ocr seacrowd schema
+            elif self.config.schema == _SEACROWD_SCHEMA["OCR"]:
+                yield key, {
+                    "id": str(key),
+                    "image_paths": [image_path],
+                    "texts": text,
+                    "metadata": {
+                        "tags": tags,
+                        "pillar_stances": pillar_stances,
+                    },
+                }
+                key += 1
+
+            # pillar classification seacrowd schema
+            elif self.config.schema == _SEACROWD_SCHEMA["IMC_MULTI"]:
+                if pillar_stances:  # only those with pillar stances
+                    yield key, {
+                        "id": str(key),
+                        "labels": [pillar["category"] for pillar in pillar_stances],
+                        "image_path": image_path,
+                        "metadata": {
+                            "tags": tags,
+                            "stances": [pillar["stance"] for pillar in pillar_stances],
+                        },
+                    }
+                    key += 1
@@ -10,6 +10,8 @@
     pairs_features_score,
     pairs_multi_features,
     qa_features,
+    image_features,
+    image_multi_features,
     imqa_features,
     seq_label_features,
     speech2speech_features,
@@ -121,6 +123,10 @@ class Tasks(Enum):
     # SpeechSpeech
     SPEECH_TO_SPEECH_TRANSLATION = "S2ST"
 
+    # Image
+    IMAGE_CLASSIFICATION = "IMC"
+    IMAGE_CLASSIFICATION_MULTILABEL = "IMC_MULTI"
+
     # ImageText
     IMAGE_CAPTIONING = "IC"
     VISUAL_QUESTION_ANSWERING = "VQA"
@@ -281,6 +287,8 @@ class Licenses(Enum):
     Tasks.SPEECH_EMOTION_RECOGNITION: "SPEECH",
     Tasks.SPEECH_EMOTION_RECOGNITION_MULTILABEL: "SPEECH_MULTI",
     Tasks.VISUAL_QUESTION_ANSWERING: "IMQA",
+    Tasks.IMAGE_CLASSIFICATION: "IMAGE",
+    Tasks.IMAGE_CLASSIFICATION_MULTILABEL: "IMAGE_MULTI",
     Tasks.IMAGE_CAPTIONING: "IMTEXT",
     Tasks.SIGN_LANGUAGE_RECOGNITION: "IMTEXT",
     Tasks.OPTICAL_CHARACTER_RECOGNITION: "IMTEXT",
@@ -317,6 +325,8 @@ class Licenses(Enum):
     "S2S": speech2speech_features,
     "SPEECH": speech_features(),
     "SPEECH_MULTI": speech_multi_features(),
+    "IMAGE": image_features(),
+    "IMAGE_MULTI": image_multi_features(),
     "IMTEXT": image_text_features(),
     "IMQA": imqa_features,
     "VIDTEXT": video_features,

@@ -5,6 +5,8 @@
 from .pairs import features_with_continuous_label as pairs_features_score
 from .pairs_multilabel import features as pairs_multi_features
 from .qa import features as qa_features
+from .image import features as image_features
+from .image import multi_features as image_multi_features
 from .imqa import features as imqa_features
 from .self_supervised_pretraining import features as ssp_features
 from .seq_label import features as seq_label_features
@@ -26,6 +28,8 @@
     "pairs_features_score",
     "pairs_multi_features",
     "qa_features",
+    "image_features",
+    "image_multi_features",
     "imqa_features",
     "ssp_features",
     "seq_label_features",

@@ -0,0 +1,35 @@
+"""
+General Image Classification Schema
+
+The "metadata" field is left unspecified to allow some flexibility.
+To use "metadata", choose one of the following:
+1. leave it as an empty dict if you don't need it in
+    `_generate_examples`, or
+2. define it in the `_info` dataloader method as a dict mapping each intended
+    column name to its `datasets` feature type, then populate it with the
+    values in the `_generate_examples` dataloader method
+"""
+
+import datasets
+
+
+def features(label_names=["Yes", "No"]):
+    return datasets.Features(
+        {
+            "id": datasets.Value("string"),
+            "labels": datasets.ClassLabel(names=label_names),
+            "image_path": datasets.Value("string"),
+            "metadata": {},
+        }
+    )
+
+
+def multi_features(label_names=["Yes", "No"]):
+    return datasets.Features(
+        {
+            "id": datasets.Value("string"),
+            "labels": datasets.Sequence(datasets.ClassLabel(names=label_names)),
+            "image_path": datasets.Value("string"),
+            "metadata": {},
+        }
+    )