Merge pull request #122 from IvanHalimP/cc3m_35l
Closes #77 | Create dataset loader for CC3M-35L
SamuelCahyawijaya authored Apr 8, 2024
2 parents aa5b0be + cc7d2b5 commit b080f35
Showing 2 changed files with 242 additions and 0 deletions.
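For context, a minimal usage sketch of the new loader, assuming the two GCC TSV files (Train_GCC-training.tsv and Validation_GCC-1.1.0-Validation.tsv) have already been downloaded from https://ai.google.com/research/ConceptualCaptions/download; the data_dir value is a placeholder:

import datasets

# data_dir must contain the two GCC TSV files; the path here is illustrative
ds = datasets.load_dataset(
    "seacrowd/sea_datasets/cc3m_35l/cc3m_35l.py",
    name="cc3m_35l_ind_seacrowd_imtext",  # Indonesian subset, SEACrowd image-text schema
    data_dir="/path/to/gcc/tsvs",
)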
242 changes: 242 additions & 0 deletions seacrowd/sea_datasets/cc3m_35l/cc3m_35l.py
@@ -0,0 +1,242 @@
import os
from typing import Dict, List, Tuple

import datasets
import jsonlines as jl
import pandas as pd

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """\
@inproceedings{thapliyal-etal-2022-crossmodal,
title = "Crossmodal-3600: A Massively Multilingual Multimodal Evaluation Dataset",
author = "Thapliyal, Ashish V. and
Pont Tuset, Jordi and
Chen, Xi and
Soricut, Radu",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-main.45",
doi = "10.18653/v1/2022.emnlp-main.45",
pages = "715--729",
}
"""

_DATASETNAME = "cc3m_35l"

_DESCRIPTION = """\
CC3M-35L is created by translating Conceptual Captions 3M (Sharma et al., 2018),
originally in English, into 34 other languages using Google's machine translation API.
"""

_HOMEPAGE = "https://google.github.io/crossmodal-3600/"

_LICENSE = Licenses.CC_BY_4_0.value

# the image URLs and the original captions are contained in TSV files, which can be downloaded locally with a Google account.
# the TSV files can be found and downloaded from https://ai.google.com/research/ConceptualCaptions/download
# there is no ready-made image folder, so the images need to be downloaded one by one
# some warnings may occur during download, e.g. due to security certificates
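# each image is fetched individually inside fill_img_path below via the datasets
# download manager; illustrative example (placeholder URL):
#   local_path = datasets.DownloadManager().download("https://example.com/image.jpg")
# the returned cache path is stored in the dataframe's "img_path" column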
_URLS = {
"trans_train": "https://storage.googleapis.com/crossmodal-3600/cc3m_mt_train.jsonl.gz",
"trans_dev": "https://storage.googleapis.com/crossmodal-3600/cc3m_mt_dev.jsonl.gz",
}

_SUPPORTED_TASKS = [Tasks.IMAGE_CAPTIONING]

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"

_LANGUAGES = ["fil", "ind", "tha", "vie"]

_LOCAL = True


class CC3M35L(datasets.GeneratorBasedBuilder):
"""
    CC3M-35L is created by translating Conceptual Captions 3M (Sharma et al., 2018),
    originally in English, into 34 other languages using Google's machine translation API.
"""

SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_{lang}_source",
            version=datasets.Version(_SOURCE_VERSION),
            description=f"{_DATASETNAME}_{lang} source schema",
            schema="source",
            subset_id=f"{_DATASETNAME}_{lang}",
        )
        for lang in _LANGUAGES
    ] + [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_{lang}_seacrowd_imtext",
            version=datasets.Version(_SEACROWD_VERSION),
            description=f"{_DATASETNAME}_{lang} SEACrowd schema",
            schema="seacrowd_imtext",
            subset_id=f"{_DATASETNAME}_{lang}",
        )
        for lang in _LANGUAGES
    ]

    # "ind" matches the language codes in _LANGUAGES ("id" is not a valid subset)
    DEFAULT_CONFIG_NAME = "cc3m_35l_ind_source"

def _info(self) -> datasets.DatasetInfo:
if self.config.schema == "source":
features = datasets.Features(
{
"id": datasets.Value("string"),
"image_paths": datasets.Value("string"),
"src_lang": datasets.Value("string"),
"caption_tokenized": datasets.Value("string"),
"trg_lang": datasets.Value("string"),
"translation_tokenized": datasets.Value("string"),
"backtranslation_tokenized": datasets.Value("string"),
}
)
elif self.config.schema == "seacrowd_imtext":
features = schemas.image_text_features()

return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)

    def fill_img_path(self, df: pd.DataFrame, line: dict):
        exceptions = []
        selected_row = df.query('caption==@line["caption_tokenized"]')
        # the query may match several rows; skip if empty
        if not selected_row.empty:
            # for each matching row, download the image, keep its local path, and attach the translations.
            # note: assign via .loc so the changes stick to selected_row (iterating with iterrows()
            # yields copies, so assignments to those rows would be silently lost)
            for idx in selected_row.index:
                selected_row.loc[idx, "trans_caption"] = line["translation_tokenized"]
                selected_row.loc[idx, "backtrans_caption"] = line["backtranslation_tokenized"]
                # if the image cannot be downloaded for some reason, skip it;
                # this may cause the total number of examples to differ between runs
                try:
                    selected_row.loc[idx, "img_path"] = datasets.DownloadManager().download(selected_row.loc[idx, "img_url"])
                except Exception:
                    exceptions.append(idx)

        return selected_row, exceptions

    def is_target(self, line: dict, trg_lang: str) -> bool:
        """Helper predicate: does this jsonlines entry target the requested language?"""
        return line["trg_lang"] == trg_lang

def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
"""Returns SplitGenerators."""
dev_path = dl_manager.download_and_extract(_URLS["trans_dev"])
train_path = dl_manager.download_and_extract(_URLS["trans_train"])

if self.config.data_dir is None:
raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.")
else:
data_dir = self.config.data_dir

# read tsv from local train and validation files
gcc_val = os.path.join(data_dir, "Validation_GCC-1.1.0-Validation.tsv")
gcc_train = os.path.join(data_dir, "Train_GCC-training.tsv")

# make it into pandas dataframe
colnames = ["caption", "img_url"]
gcc_val_df = pd.read_csv(gcc_val, sep="\t", header=None, names=colnames)
gcc_train_df = pd.read_csv(gcc_train, sep="\t", header=None, names=colnames)

# add new column to keep the downloaded image path
gcc_val_df["img_path"] = None
gcc_train_df["img_path"] = None

# add new column to keep the translated caption
gcc_val_df["trans_caption"] = None
gcc_train_df["trans_caption"] = None

gcc_val_df["backtrans_caption"] = None
gcc_train_df["backtrans_caption"] = None

        # match the original captions in the translated set against the dataframe captions,
        # download the images from their URLs, and store the local paths
train_exceptions = []
val_exceptions = []

current_lang = self.config.subset_id.split("_")[2]
val_caption_targets = []
train_caption_targets = []

        # filter validation data
        with jl.open(dev_path, mode="r") as j:
            val_caption_targets = [line for line in j if line["trg_lang"] == current_lang]

        for line in val_caption_targets:
            res = self.fill_img_path(gcc_val_df, line)
            val_exceptions.extend(res[1])
            gcc_val_df.update(res[0])

        # free the memory
        val_caption_targets = []

        # filter train data
        with jl.open(train_path, mode="r") as j:
            train_caption_targets = [line for line in j if line["trg_lang"] == current_lang]

        for line in train_caption_targets:
            res = self.fill_img_path(gcc_train_df, line)
            train_exceptions.extend(res[1])
            gcc_train_df.update(res[0])

        # free the memory
        train_caption_targets = []

return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"filepath": gcc_train_df,
"exceptions": train_exceptions,
},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={
"filepath": gcc_val_df,
"exceptions": val_exceptions,
},
),
]

    def _generate_examples(self, filepath: pd.DataFrame, exceptions: list) -> Tuple[int, Dict]:
"""Yields examples as (key, example) tuples."""
for idx, row in filepath.iterrows():
if idx not in exceptions:
if self.config.schema == "source":
yield idx, {
"id": str(idx),
"image_paths": row["img_path"],
"src_lang": "en",
"caption_tokenized": row["caption"],
"trg_lang": self.config.subset_id.split("_")[2],
"translation_tokenized": row["trans_caption"],
"backtranslation_tokenized": row["backtrans_caption"],
}

elif self.config.schema == "seacrowd_imtext":
yield idx, {
"id": str(idx),
"image_paths": [row["img_path"]],
"texts": row["trans_caption"],
"metadata": {
"context": None,
"labels": None,
},
}

else:
raise ValueError(f"Invalid config: {self.config.name}")
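
For reference, a record yielded under the seacrowd_imtext schema has the following shape; all values below are illustrative placeholders, not actual dataset content:

{
    "id": "0",
    "image_paths": ["/local/cache/path/to/image.jpg"],
    "texts": "contoh keterangan gambar yang diterjemahkan",
    "metadata": {"context": None, "labels": None},
}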
