Skip to content

Commit

Permalink
datasets: fix train files for UD_CZECH and UD_RUSSIAN
Browse files · Browse the repository at this point in the history
  • Loading branch information
stefan-it committed Apr 2, 2024
1 parent a5edbd6 commit 44a2f9d
Showing 1 changed file with 31 additions and 26 deletions.
57 changes: 31 additions & 26 deletions flair/datasets/treebanks.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -711,30 +711,17 @@ def __init__(
ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Czech-PDT/{revision}"
cached_path(f"{ud_path}/cs_pdt-ud-dev.conllu", Path("datasets") / dataset_name)
cached_path(f"{ud_path}/cs_pdt-ud-test.conllu", Path("datasets") / dataset_name)
cached_path(
f"{ud_path}/cs_pdt-ud-train-c.conllu",
Path("datasets") / dataset_name / "original",
)
cached_path(
f"{ud_path}/cs_pdt-ud-train-l.conllu",
Path("datasets") / dataset_name / "original",
)
cached_path(
f"{ud_path}/cs_pdt-ud-train-m.conllu",
Path("datasets") / dataset_name / "original",
)
cached_path(
f"{ud_path}/cs_pdt-ud-train-v.conllu",
Path("datasets") / dataset_name / "original",
)

train_suffixes = ["ca", "ct", "la", "lt", "ma", "mt", "va"]

for train_suffix in train_suffixes:
cached_path(
f"{ud_path}/cs_pdt-ud-train-{train_suffix}.conllu",
Path("datasets") / dataset_name / "original",
)
data_path = flair.cache_root / "datasets" / dataset_name

train_filenames = [
"cs_pdt-ud-train-c.conllu",
"cs_pdt-ud-train-l.conllu",
"cs_pdt-ud-train-m.conllu",
"cs_pdt-ud-train-v.conllu",
]
train_filenames = [f"cs_pdt-ud-train-{train_suffix}.conllu" for train_suffix in train_suffixes]

new_train_file: Path = data_path / "cs_pdt-ud-train-all.conllu"

Expand Down Expand Up @@ -1105,7 +1092,25 @@ def __init__(
ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/{revision}"
cached_path(f"{ud_path}/ru_syntagrus-ud-dev.conllu", Path("datasets") / dataset_name)
cached_path(f"{ud_path}/ru_syntagrus-ud-test.conllu", Path("datasets") / dataset_name)
cached_path(f"{ud_path}/ru_syntagrus-ud-train.conllu", Path("datasets") / dataset_name)

train_filenames = [
"ru_syntagrus-ud-train-a.conllu",
"ru_syntagrus-ud-train-b.conllu",
"ru_syntagrus-ud-train-c.conllu",
]

for train_file in train_filenames:
cached_path(f"{ud_path}/{train_file}", Path("datasets") / dataset_name / "original")

data_path = flair.cache_root / "datasets" / dataset_name

new_train_file: Path = data_path / "ru_syntagrus-ud-train-all.conllu"

if not new_train_file.is_file():
with open(new_train_file, "w") as f_out:
for train_filename in train_filenames:
with open(data_path / "original" / train_filename) as f_in:
f_out.write(f_in.read())

super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

Expand Down Expand Up @@ -1504,9 +1509,9 @@ def __init__(

# download data if necessary
web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Old_French-SRCMF/{revision}"
cached_path(f"{web_path}/fro_srcmf-ud-dev.conllu", Path("datasets") / dataset_name)
cached_path(f"{web_path}/fro_srcmf-ud-test.conllu", Path("datasets") / dataset_name)
cached_path(f"{web_path}/fro_srcmf-ud-train.conllu", Path("datasets") / dataset_name)
cached_path(f"{web_path}/fro_profiterole-ud-dev.conllu", Path("datasets") / dataset_name)
cached_path(f"{web_path}/fro_profiterole-ud-test.conllu", Path("datasets") / dataset_name)
cached_path(f"{web_path}/fro_profiterole-ud-train.conllu", Path("datasets") / dataset_name)

super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

Expand Down

0 comments on commit 44a2f9d

Please sign in to comment.