From 435df84f49737938c1bde44e6ef59c0cf2c9fcad Mon Sep 17 00:00:00 2001
From: hugosousa
Date: Mon, 22 Aug 2022 17:54:51 +0100
Subject: [PATCH] Added KRAUTS partitions. Closes #5

---
NOTE(review): line breaks and leading indentation were reconstructed from a
whitespace-collapsed copy of this patch. Indentation assumes 4-space Python
style (and a doubly nested loop for the first context line) -- confirm with
`git apply --check` before applying.

 tests/test_datasets/test_download_and_read.py | 36 +++++++++++++++++++
 tieval/datasets/metadata.py                   | 26 ++++++++++++++
 2 files changed, 62 insertions(+)

diff --git a/tests/test_datasets/test_download_and_read.py b/tests/test_datasets/test_download_and_read.py
index e86a65c..061ffa3 100644
--- a/tests/test_datasets/test_download_and_read.py
+++ b/tests/test_datasets/test_download_and_read.py
@@ -115,6 +115,42 @@ def test_download_and_read_krauts(tmp_path):
             assert timex.text == doc.text[s: e]
 
 
+def test_download_and_read_krauts_diezeit(tmp_path):
+
+    os.chdir(tmp_path)
+
+    download("krauts_diezeit")
+    data_path = tmp_path / "data/krauts_diezeit"
+    assert data_path.is_dir()
+
+    krauts = read("krauts_diezeit")
+    assert len(krauts.documents) == 50
+
+
+def test_download_and_read_krauts_dolomiten_42(tmp_path):
+
+    os.chdir(tmp_path)
+
+    download("krauts_dolomiten_42")
+    data_path = tmp_path / "data/krauts_dolomiten_42"
+    assert data_path.is_dir()
+
+    krauts = read("krauts_dolomiten_42")
+    assert len(krauts.documents) == 42
+
+
+def test_download_and_read_krauts_dolomiten_100(tmp_path):
+
+    os.chdir(tmp_path)
+
+    download("krauts_dolomiten_100")
+    data_path = tmp_path / "data/krauts_dolomiten_100"
+    assert data_path.is_dir()
+
+    krauts = read("krauts_dolomiten_100")
+    assert len(krauts.documents) == 100
+
+
 def test_download_and_read_matres(tmp_path):
 
     os.chdir(tmp_path)
diff --git a/tieval/datasets/metadata.py b/tieval/datasets/metadata.py
index 726af66..f8faa33 100644
--- a/tieval/datasets/metadata.py
+++ b/tieval/datasets/metadata.py
@@ -94,6 +94,32 @@ def description(self):
         doc_reader=KRAUTSDocumentReader
     ),
 
+    "krauts_diezeit": DatasetMetadata(
+        name="krauts_diezeit",
+        language="german",
+        url="https://drive.inesctec.pt/s/f98ZDGLqEjqCXEp/download",
+        reader=XMLDatasetReader,
+        doc_reader=KRAUTSDocumentReader
+    ),
+
+
+    "krauts_dolomiten_42": DatasetMetadata(
+        name="krauts_dolomiten_42",
+        language="german",
+        url="https://drive.inesctec.pt/s/AEwAod8Wn2L9E9G/download",
+        reader=XMLDatasetReader,
+        doc_reader=KRAUTSDocumentReader
+    ),
+
+
+    "krauts_dolomiten_100": DatasetMetadata(
+        name="krauts_dolomiten_100",
+        language="german",
+        url="https://drive.inesctec.pt/s/x3N7Kf446RFeDse/download",
+        reader=XMLDatasetReader,
+        doc_reader=KRAUTSDocumentReader
+    ),
+
     "matres": DatasetMetadata(
         name="matres",
         language="english",