diff --git a/data_preparation/70.releasing/Makefile b/data_preparation/70.releasing/Makefile
index d86e460..d568b21 100644
--- a/data_preparation/70.releasing/Makefile
+++ b/data_preparation/70.releasing/Makefile
@@ -17,6 +17,11 @@ RELEASE_NAME=Evaldio_Dataset-1.0
FORMAT_SIMPLE_SCRIPT=../60.format_simple/format_simple.sh
+# CANONICAL=1 (e.g. `make CANONICAL=1 <target>`) marks the annotation being built as canonical.
+CANONICAL=
+# Expands to --canonical when CANONICAL is 1 (surrounding whitespace ignored); always defined, empty otherwise.
+CANONICAL_FLAG=$(if $(filter 1,$(CANONICAL)),--canonical)
+
DATE=$(shell date +%Y%m%d)
############ RULES ################
@@ -42,7 +47,7 @@ $(TGT_DIR)/%.xml : $(TGT_DIR)/%.xml.with_header
xmllint --format - < $< > $@
$(TGT_DIR)/%.xml.with_header : $(TGT_DIR)/%.xml.parsed
- python add_proper_tei_header.py $< $@
+ python add_proper_tei_header.py $(CANONICAL_FLAG) $< $@ # CANONICAL_FLAG is --canonical when CANONICAL=1 and expands to nothing otherwise
$(TGT_DIR)/%.xml.parsed : $(TGT_DIR)/%.xml.tokenized
cp $< $@
@@ -60,12 +65,36 @@ $(TGT_DIR)/%.xml.specstr_replaced : $(SRC_DIR)/%.xml
+# Build the released .xml and .txt for every annotation file matched by the
+# exercise-number table (first prerequisite, one "fid exer" pair per line) and
+# flag exactly one annotation per pair as canonical via CANONICAL=1.
+# The pick uses the shell RANDOM variable seeded with a constant, so reruns are
+# meant to choose the same files; presumes a bash-like recipe shell (TODO confirm).
+# A pair that matches no annotation file aborts the recipe with a clear message
+# instead of hitting a modulo-by-zero. Sub-makes are started through MAKE so
+# that -n, -j and other command-line options are propagated.
+# NOTE(review): CANONICAL only matters when the .xml.with_header step really
+# runs; a target that is already up to date is not regenerated.
 copy_finalize_annot : $(BATCH_PATH).exer_numbers.csv runparser.pl
 	mkdir -p $(TGT_DIR)
+	RANDOM=2024; \
 	while read fid exer; do \
+		annot_count=`ls $(SRC_DIR)/$$fid*-$$exer.xml | wc -l`; \
+		if [ $$annot_count -eq 0 ]; then \
+			echo "ERROR: no annotation file matches $(SRC_DIR)/$$fid*-$$exer.xml" >&2; \
+			exit 1; \
+		fi; \
+		rand_i=$$((RANDOM % $$annot_count)); \
+		i=0; \
 		for f in $(SRC_DIR)/$$fid*-$$exer.xml; do \
+			if [ $$i -eq $$rand_i ]; then \
+				canonical=1; \
+			else \
+				canonical=0; \
+			fi; \
 			bf=`basename $$f .xml`; \
 			echo "===== $$f ====="; \
-			make $(TGT_DIR)/$$bf.xml; \
-			make $(TGT_DIR)/$$bf.txt; \
+			$(MAKE) $(TGT_DIR)/$$bf.xml CANONICAL=$$canonical; \
+			$(MAKE) $(TGT_DIR)/$$bf.txt; \
+			i=$$((i+1)); \
 		done; \
 	done < $<
diff --git a/data_preparation/70.releasing/add_proper_tei_header.py b/data_preparation/70.releasing/add_proper_tei_header.py
index eb14808..4613fae 100644
--- a/data_preparation/70.releasing/add_proper_tei_header.py
+++ b/data_preparation/70.releasing/add_proper_tei_header.py
@@ -39,7 +39,6 @@
{pub_version}
-
@@ -108,11 +107,13 @@
+ Databáze mluvených projevů v češtině jako cizím jazyce (trvalý pobyt v ČR)
{examid}
{level}
{exerno}
{type}
- Databáze mluvených projevů v češtině jako cizím jazyce (trvalý pobyt v ČR)
+ {annotator_short}
+ {canonical:d}
@@ -140,6 +141,7 @@ def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('input', help='Path to the input file')
parser.add_argument('output', help='Path to the output file')
+ parser.add_argument('--canonical', action='store_true', help='The annotation is canonical')
parser.add_argument('--pub-date', type=str, default='2024-10-31', help='Publication date')
parser.add_argument('--pub-version', type=str, default='1.0', help='Publication version')
parser.add_argument('--handle-uri', type=str, default='http://hdl.handle.net/11234/1-5731', help='Handle URI')
@@ -185,7 +187,7 @@ def add_info_from_file_content(info, doctree):
info['reviewer'] = ANNOTATOR_NAMES[review_duration_elems[0].attrib.get('user') if review_duration_elems else 'MR']
def add_info_from_args(info, args):
- for name in ['handle_uri', 'pub_version', 'rec_inst', 'rec_inst_en', 'rec_inst_short']:
+ for name in ['handle_uri', 'pub_version', 'rec_inst', 'rec_inst_en', 'rec_inst_short', 'canonical']:
info[name] = getattr(args, name)
info['pub_date'] = datetime.strptime(args.pub_date, '%Y-%m-%d')