Skip to content

Commit

Permalink
Added KRAUTS partitions.
Browse files Browse the repository at this point in the history
Closes #5
  • Loading branch information
hmosousa committed Aug 22, 2022
1 parent 0dd12fc commit 435df84
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 0 deletions.
36 changes: 36 additions & 0 deletions tests/test_datasets/test_download_and_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,42 @@ def test_download_and_read_krauts(tmp_path):
assert timex.text == doc.text[s: e]


def test_download_and_read_krauts_diezeit(tmp_path):
    """Download the ``krauts_diezeit`` partition and read it back.

    The test switches into ``tmp_path`` before downloading, then checks that
    ``tmp_path / "data/krauts_diezeit"`` exists as a directory and that the
    dataset read back contains 50 documents.
    """
    previous_cwd = os.getcwd()
    os.chdir(tmp_path)
    try:
        download("krauts_diezeit")
        data_path = tmp_path / "data/krauts_diezeit"
        assert data_path.is_dir()

        krauts = read("krauts_diezeit")
        assert len(krauts.documents) == 50
    finally:
        # os.chdir is process-wide: restore the original directory so tests
        # that run afterwards are not left inside this test's temp folder.
        os.chdir(previous_cwd)


def test_download_and_read_krauts_dolomiten_42(tmp_path):
    """Download the ``krauts_dolomiten_42`` partition and read it back.

    The test switches into ``tmp_path`` before downloading, then checks that
    ``tmp_path / "data/krauts_dolomiten_42"`` exists as a directory and that
    the dataset read back contains 42 documents.
    """
    previous_cwd = os.getcwd()
    os.chdir(tmp_path)
    try:
        download("krauts_dolomiten_42")
        data_path = tmp_path / "data/krauts_dolomiten_42"
        assert data_path.is_dir()

        krauts = read("krauts_dolomiten_42")
        assert len(krauts.documents) == 42
    finally:
        # os.chdir is process-wide: restore the original directory so tests
        # that run afterwards are not left inside this test's temp folder.
        os.chdir(previous_cwd)


def test_download_and_read_krauts_dolomiten_100(tmp_path):
    """Download the ``krauts_dolomiten_100`` partition and read it back.

    The test switches into ``tmp_path`` before downloading, then checks that
    ``tmp_path / "data/krauts_dolomiten_100"`` exists as a directory and that
    the dataset read back contains 100 documents.
    """
    previous_cwd = os.getcwd()
    os.chdir(tmp_path)
    try:
        download("krauts_dolomiten_100")
        data_path = tmp_path / "data/krauts_dolomiten_100"
        assert data_path.is_dir()

        krauts = read("krauts_dolomiten_100")
        assert len(krauts.documents) == 100
    finally:
        # os.chdir is process-wide: restore the original directory so tests
        # that run afterwards are not left inside this test's temp folder.
        os.chdir(previous_cwd)


def test_download_and_read_matres(tmp_path):

os.chdir(tmp_path)
Expand Down
26 changes: 26 additions & 0 deletions tieval/datasets/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,32 @@ def description(self):
doc_reader=KRAUTSDocumentReader
),

"krauts_diezeit": DatasetMetadata(
name="krauts_diezeit",
language="german",
url="https://drive.inesctec.pt/s/f98ZDGLqEjqCXEp/download",
reader=XMLDatasetReader,
doc_reader=KRAUTSDocumentReader
),


"krauts_dolomiten_42": DatasetMetadata(
name="krauts_dolomiten_42",
language="german",
url="https://drive.inesctec.pt/s/AEwAod8Wn2L9E9G/download",
reader=XMLDatasetReader,
doc_reader=KRAUTSDocumentReader
),


"krauts_dolomiten_100": DatasetMetadata(
name="krauts_dolomiten_100",
language="german",
url="https://drive.inesctec.pt/s/x3N7Kf446RFeDse/download",
reader=XMLDatasetReader,
doc_reader=KRAUTSDocumentReader
),

"matres": DatasetMetadata(
name="matres",
language="english",
Expand Down

0 comments on commit 435df84

Please sign in to comment.