Skip to content

Commit

Permalink
Add WikiWars corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
hmosousa committed Aug 8, 2022
1 parent ecf822c commit 0dd12fc
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 5 deletions.
18 changes: 13 additions & 5 deletions tests/test_datasets/test_download_and_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,19 @@ def test_download_and_read_narrative_container(tmp_path):
assert len(train_docs & test_docs) == 0


def test_download_and_read_wikiwars(tmp_path):
    """Download the WikiWars corpus and read it back from a temporary directory.

    Asserts that, after the download, ``data/wikiwars`` exists as a directory
    under ``tmp_path`` and that the corpus read back holds 22 documents.
    """
    # Run from the temporary directory: the data folder asserted below is
    # looked up under tmp_path.
    os.chdir(tmp_path)
    corpus_name = "wikiwars"

    download(corpus_name)
    assert (tmp_path / f"data/{corpus_name}").is_dir()

    corpus = read(corpus_name)
    assert len(corpus.documents) == 22


def test_download_and_read_wikiwars_de(tmp_path):

os.chdir(tmp_path)
Expand All @@ -182,8 +195,3 @@ def test_download_and_read_wikiwars_de(tmp_path):

data = read(corpus_name)
assert len(data.documents) == 22

test_docs = set(doc.name for doc in data.test)
train_docs = set(doc.name for doc in data.train)
assert len(test_docs & train_docs) == 0
assert len(train_docs & test_docs) == 0
8 changes: 8 additions & 0 deletions tieval/datasets/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,14 @@ def description(self):
doc_reader=TempEval3DocumentReader
),

"wikiwars": DatasetMetadata(
name="wikiwars",
language="english",
url="https://drive.inesctec.pt/s/8ZPnNPfofwyyLT9/download",
reader=XMLDatasetReader,
doc_reader=WikiWarsDocumentReader
),

"wikiwars_de": DatasetMetadata(
name="wikiwars_de",
language="german",
Expand Down

0 comments on commit 0dd12fc

Please sign in to comment.