Closes #355 | Add Dataloader TotalDefMeme #602

Merged (3 commits, May 1, 2024)
277 changes: 277 additions & 0 deletions seacrowd/sea_datasets/total_defense_meme/total_defense_meme.py
@@ -0,0 +1,277 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
import gdown

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses, Tasks

_CITATION = """\
@inproceedings{10.1145/3587819.3592545,
    author = {Prakash, Nirmalendu and Hee, Ming Shan and Lee, Roy Ka-Wei},
    title = {TotalDefMeme: A Multi-Attribute Meme dataset on Total Defence in Singapore},
    year = {2023},
    isbn = {9798400701481},
    publisher = {Association for Computing Machinery},
    address = {New York, NY, USA},
    url = {https://doi.org/10.1145/3587819.3592545},
    doi = {10.1145/3587819.3592545},
    booktitle = {Proceedings of the 14th Conference on ACM Multimedia Systems},
    pages = {369–375},
    numpages = {7},
    keywords = {multimodal, meme, dataset, topic clustering, stance classification},
    location = {Vancouver, BC, Canada},
    series = {MMSys '23}
}
"""

_DATASETNAME = "total_defense_meme"

_DESCRIPTION = """\
This is a large-scale multimodal and multi-attribute dataset containing memes
about Singapore's Total Defence policy from different social media platforms.
The type (Singaporean or generic), pillars (military, civil, economic, social,
psychological, digital, others), topics and stances (against, neutral,
supportive) of each meme are manually identified by annotators.
"""

_HOMEPAGE = "https://gitlab.com/bottle_shop/meme/TotalDefMemes"

_LANGUAGES = ["eng"]

_LICENSE = Licenses.UNKNOWN.value

_LOCAL = False

_URLS = {
"image": "https://drive.google.com/file/d/1oJIh4QQS3Idff2g6bZORstS5uBROjUUz/view?usp=share_link",
"annotations": "https://gitlab.com/bottle_shop/meme/TotalDefMemes/-/raw/main/report/annotation.json?ref_type=heads",
}

_SUPPORTED_TASKS = [Tasks.OPTICAL_CHARACTER_RECOGNITION, Tasks.IMAGE_CLASSIFICATION_MULTILABEL]
_SEACROWD_SCHEMA = {
    task.value: f"seacrowd_{TASK_TO_SCHEMA[task].lower()}" for task in _SUPPORTED_TASKS
}  # ocr: imtext, imc_multi: image_multi

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"


class TotalDefenseMemeDataset(datasets.GeneratorBasedBuilder):
    """Multimodal dataset containing memes about Singapore's Total Defence policy"""

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=SOURCE_VERSION,
            description=f"{_DATASETNAME} source schema",
            schema="source",
            subset_id=_DATASETNAME,
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_{_SEACROWD_SCHEMA['OCR']}",
            version=SEACROWD_VERSION,
            description=f"{_DATASETNAME} SEACrowd schema",
            schema=_SEACROWD_SCHEMA["OCR"],
            subset_id=_DATASETNAME,
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_{_SEACROWD_SCHEMA['IMC_MULTI']}",
[Review comment from Contributor]
f"{_DATASETNAME}_{_SEACROWD_SCHEMA['IMC_MULTI']}" --> f"{_DATASETNAME}_topic_{_SEACROWD_SCHEMA['IMC_MULTI']}"

[Reply from Collaborator (Author)]
Changing this would break the test, since config names must follow a fixed template based on constants.py.
            version=SEACROWD_VERSION,
            description=f"{_DATASETNAME} SEACrowd schema",
            schema=_SEACROWD_SCHEMA["IMC_MULTI"],
            subset_id=_DATASETNAME,
        ),
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _info(self) -> datasets.DatasetInfo:
        # define labelling
        meme_type = ["Non_Memes", "Non_SG_Memes", "SG_Memes"]
        pillar_type = [
            "Social",
            "Economic",
            "Psychological",
            "Military",
            "Civil",
            "Digital",
            "Others",
        ]
        stance_type = ["Against", "Neutral", "Supportive"]

        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "image_path": datasets.Value("string"),
                    "meme_type": datasets.Sequence(datasets.ClassLabel(names=meme_type)),
                    "text": datasets.Value("string"),
                    "tags": datasets.Sequence(datasets.Value("string")),
                    "pillar_stances": datasets.Sequence(
                        {
                            "category": datasets.ClassLabel(names=pillar_type),
                            "stance": datasets.Sequence(datasets.ClassLabel(names=stance_type)),
                        }
                    ),
                }
            )

        elif self.config.schema == _SEACROWD_SCHEMA["OCR"]:  # all images
            features = schemas.image_text_features(label_names=meme_type)
            features["metadata"] = {
                "tags": datasets.Sequence(datasets.Value("string")),
                "pillar_stances": datasets.Sequence(
                    {
                        "category": datasets.ClassLabel(names=pillar_type),
                        "stance": datasets.Sequence(datasets.ClassLabel(names=stance_type)),
                    }
                ),
            }
        elif self.config.schema == _SEACROWD_SCHEMA["IMC_MULTI"]:  # sg meme images only
            features = schemas.image_multi_features(label_names=pillar_type)
            features["metadata"] = {
                "tags": datasets.Sequence(datasets.Value("string")),
                "stances": datasets.Sequence(datasets.Sequence(datasets.ClassLabel(names=stance_type))),
            }

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators."""
        # download image from gdrive
        output_dir = Path.cwd() / "data" / _DATASETNAME
        output_dir.mkdir(parents=True, exist_ok=True)
        output_file = output_dir / f"{_DATASETNAME}.zip"
        if not output_file.exists():
            gdown.download(_URLS["image"], str(output_file), fuzzy=True)
        else:
            print(f"File already downloaded: {str(output_file)}")
        # extract image data
        image_dir = Path(dl_manager.extract(output_file)) / "TD_Memes"

        # download annotations
        annotation_path = Path(dl_manager.download(_URLS["annotations"]))
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "image_dir": image_dir,
                    "annotation_file": annotation_path,
                },
            ),
        ]

    def _generate_examples(self, image_dir: Path, annotation_file: Path) -> Tuple[int, Dict]:
        """Yields examples as (key, example) tuples."""
        # load annotations
        with open(annotation_file, "r", encoding="utf-8") as file:
            annotation = json.load(file)

        # get unique image names
        image_names = sorted(
            set(annotation["Non_Memes"])
            | set(annotation["Non_SG_Memes"])
            | set(annotation["SG_Memes"])
        )

        # annotation data is a list of dicts, rather than a dict keyed by image name
        def get_value(image_name, list_of_dicts):
            for dictionary in list_of_dicts:
                if image_name in dictionary:
                    return dictionary[image_name]
            return None

        key = 0
        for image_name in image_names:
            # assert that the image exists in the directory
            assert (image_dir / image_name).exists(), f"Image {image_name} not found"
            image_path = str(image_dir / image_name)

            # get categories; an image can belong to several
            categories = []
            if image_name in annotation["Non_Memes"]:
                categories.append("Non_Memes")
            if image_name in annotation["Non_SG_Memes"]:
                categories.append("Non_SG_Memes")
            if image_name in annotation["SG_Memes"]:
                categories.append("SG_Memes")

            # get attributes
            text = get_value(image_name, annotation["Text"])
            tags = get_value(image_name, annotation["Tags"])
            raw_pillar_stances = get_value(image_name, annotation["Pillar_Stances"])

            # process pillar stances
            pillar_stances = []
            if raw_pillar_stances:
                for pillar, stances in raw_pillar_stances:
                    category = pillar.split(" ")[0]
                    pillar_stances.append({"category": category, "stance": stances})
[Review comment from Contributor]
Could you please rename "stance" --> "agreed_stances" and add another variable, "all_stances"?

The all_stances labels would carry all of the stance annotations, while the agreed_stances labels are the "correct labels" based on this processing detail from Section 3.3 Quality Control Measures of the paper:

"Lastly, the annotators annotate the meme’s stances towards the assigned pillars: support, against, or neutral. To ensure the reliability of the dataset, each meme is annotated by two annotators. If the disagreements contain similar opinions, the overlap annotations will be considered correct labels. However, if there are disagreements with entirely different perspectives, a third annotator will be brought in to provide an additional annotation for the meme. The overlapping annotations between at least two annotators will then be considered the correct labels. In the extreme case where all three annotators have different opinions, the meme will be flagged and removed from the dataset."

[Reply from Collaborator (Author)]
See my reply on the thread below.

            # source schema
            if self.config.schema == "source":
                yield key, {
                    "image_path": image_path,
                    "meme_type": categories,
                    "text": text,
                    "tags": tags,
                    "pillar_stances": pillar_stances,
                }
                key += 1

            # ocr seacrowd schema
            elif self.config.schema == _SEACROWD_SCHEMA["OCR"]:
                yield key, {
                    "id": str(key),
                    "image_paths": [image_path],
                    "texts": text,
                    "metadata": {
                        "tags": tags,
                        "pillar_stances": pillar_stances,
                    },
                }
                key += 1

            # pillar/topic classification seacrowd schema
            elif self.config.schema == _SEACROWD_SCHEMA["IMC_MULTI"]:
                if pillar_stances:  # only those with pillar stances
                    yield key, {
                        "id": str(key),
                        "labels": [pillar["category"] for pillar in pillar_stances],
                        "image_path": image_path,
                        "metadata": {
                            "tags": tags,
                            "stances": [pillar["stance"] for pillar in pillar_stances],
[Review comment from Contributor]
Could you please rename "stances" --> "agreed_stances" and add another variable, "all_stances"? (Same reasoning and Section 3.3 quote as in the comment above.)

[Reply from Collaborator (Author)]
"In the extreme case where all three annotators have different opinions, the meme will be flagged and removed from the dataset."

Actually, it is not removed. As an example, this is the raw data from annotation.json (see img_4124):
(screenshot of the raw annotation.json entry for img_4124)

"To ensure the reliability of the dataset, each meme is annotated by two annotators. If the disagreements contain similar opinions, the overlap annotations will be considered correct labels. However, if there are disagreements with entirely different perspectives, a third annotator will be brought in to provide an additional annotation for the meme. The overlapping annotations between at least two annotators will then be considered the correct labels."

There is also no "agreed stances" property in the dataset. We could add a script to compute it, but IMO, since we are just loading the dataset, that processing should be left to the user (a sketch of what such post-processing might look like follows this file).

[Reply from Collaborator (Author)]
Also, to add about the inconsistencies in the dataset itself:

  • img_4129: a perfect example with three annotators; we can obtain the agreed stances.
  • img_4130: where is the third annotator?
  • img_4131: the first and second annotators already agreed on Neutral, so why a third annotator?

(screenshot of the corresponding annotation.json entries)
                        },
                    }
                    key += 1
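To make the deferred post-processing above concrete, here is a minimal user-side sketch of how agreed stances could be derived from one meme-pillar's raw stance list under the majority-overlap rule quoted from Section 3.3. It is illustrative only: the function name and input format are assumptions, and nothing like it ships in this PR.

from collections import Counter
from typing import List


def agreed_stances(all_stances: List[str]) -> List[str]:
    # A stance counts as "agreed" if at least two annotators chose it; an
    # empty result means no overlap (the case the paper says is removed, but
    # which the author observes can still appear in annotation.json).
    counts = Counter(all_stances)
    return [stance for stance, n in counts.items() if n >= 2]


print(agreed_stances(["Neutral", "Neutral", "Supportive"]))  # ['Neutral']
print(agreed_stances(["Against", "Neutral", "Supportive"]))  # []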
10 changes: 10 additions & 0 deletions seacrowd/utils/constants.py
@@ -10,6 +10,8 @@
    pairs_features_score,
    pairs_multi_features,
    qa_features,
    image_features,
    image_multi_features,
    imqa_features,
    seq_label_features,
    speech2speech_features,
@@ -121,6 +123,10 @@ class Tasks(Enum):
    # SpeechSpeech
    SPEECH_TO_SPEECH_TRANSLATION = "S2ST"

    # Image
    IMAGE_CLASSIFICATION = "IMC"
    IMAGE_CLASSIFICATION_MULTILABEL = "IMC_MULTI"

    # ImageText
    IMAGE_CAPTIONING = "IC"
    VISUAL_QUESTION_ANSWERING = "VQA"
@@ -281,6 +287,8 @@ class Licenses(Enum):
    Tasks.SPEECH_EMOTION_RECOGNITION: "SPEECH",
    Tasks.SPEECH_EMOTION_RECOGNITION_MULTILABEL: "SPEECH_MULTI",
    Tasks.VISUAL_QUESTION_ANSWERING: "IMQA",
    Tasks.IMAGE_CLASSIFICATION: "IMAGE",
    Tasks.IMAGE_CLASSIFICATION_MULTILABEL: "IMAGE_MULTI",
    Tasks.IMAGE_CAPTIONING: "IMTEXT",
    Tasks.SIGN_LANGUAGE_RECOGNITION: "IMTEXT",
    Tasks.OPTICAL_CHARACTER_RECOGNITION: "IMTEXT",
@@ -317,6 +325,8 @@ class Licenses(Enum):
    "S2S": speech2speech_features,
    "SPEECH": speech_features(),
    "SPEECH_MULTI": speech_multi_features(),
    "IMAGE": image_features(),
    "IMAGE_MULTI": image_multi_features(),
    "IMTEXT": image_text_features(),
    "IMQA": imqa_features,
    "VIDTEXT": video_features,
4 changes: 4 additions & 0 deletions seacrowd/utils/schemas/__init__.py
@@ -5,6 +5,8 @@
from .pairs import features_with_continuous_label as pairs_features_score
from .pairs_multilabel import features as pairs_multi_features
from .qa import features as qa_features
from .image import features as image_features
from .image import multi_features as image_multi_features
from .imqa import features as imqa_features
from .self_supervised_pretraining import features as ssp_features
from .seq_label import features as seq_label_features
@@ -26,6 +28,8 @@
    "pairs_features_score",
    "pairs_multi_features",
    "qa_features",
    "image_features",
    "image_multi_features",
    "imqa_features",
    "ssp_features",
    "seq_label_features",
35 changes: 35 additions & 0 deletions seacrowd/utils/schemas/image.py
@@ -0,0 +1,35 @@
"""
General Image Classification Schema

The field "metadata" is not specified to allow some flexibility.
On how to use "metadata", choose one:
1. defining as empty dict if you don't think it's usable in
`_generate_examples`, or
2. defining meta as dict of key with intended colname meta and its val with
dataset.Features class in `_info` Dataloader method then populate it with the
values in `_general_examples` Dataloader method
"""

import datasets


def features(label_names=["Yes", "No"]):
    return datasets.Features(
        {
            "id": datasets.Value("string"),
            "labels": datasets.ClassLabel(names=label_names),
            "image_path": datasets.Value("string"),
            "metadata": {},
        }
    )


def multi_features(label_names=["Yes", "No"]):
    return datasets.Features(
        {
            "id": datasets.Value("string"),
            "labels": datasets.Sequence(datasets.ClassLabel(names=label_names)),
            "image_path": datasets.Value("string"),
            "metadata": {},
        }
    )
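As a quick usage sketch (per option 2 of the docstring; the label names and the "tags" metadata column here are illustrative assumptions, mirroring how total_defense_meme.py calls this schema above):

import datasets

from seacrowd.utils import schemas

# Take the multi-label image features, then declare the metadata columns this
# loader will populate later in _generate_examples.
features = schemas.image_multi_features(label_names=["Military", "Civil", "Others"])
features["metadata"] = {
    "tags": datasets.Sequence(datasets.Value("string")),
}
info = datasets.DatasetInfo(description="illustrative example", features=features)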