From 3995c034b4bdb68bec9e1fd8567a4d9bcbe5b9e2 Mon Sep 17 00:00:00 2001
From: Michal Novak <mnovak@ufal.mff.cuni.cz>
Date: Wed, 6 Nov 2024 17:39:55 +0100
Subject: [PATCH] implementation of canonical transcript selection

---
 data_preparation/70.releasing/Makefile        | 19 +++++++++++++++++--
 .../70.releasing/add_proper_tei_header.py     |  8 +++++---
 2 files changed, 22 insertions(+), 5 deletions(-)
diff --git a/data_preparation/70.releasing/Makefile b/data_preparation/70.releasing/Makefile
index d86e460..d568b21 100644
--- a/data_preparation/70.releasing/Makefile
+++ b/data_preparation/70.releasing/Makefile
@@ -17,6 +17,11 @@ RELEASE_NAME=Evaldio_Dataset-1.0
 
 FORMAT_SIMPLE_SCRIPT=../60.format_simple/format_simple.sh
 
+# CANONICAL=1 (given on the make command line; copy_finalize_annot passes it per
+# file) adds --canonical to the add_proper_tei_header.py call, else flag is empty.
+CANONICAL=
+CANONICAL_FLAG=$(if $(filter 1,$(CANONICAL)),--canonical)
+
 DATE=$(shell date +%Y%m%d)
 
 ############ RULES ################
@@ -42,7 +47,7 @@ $(TGT_DIR)/%.xml : $(TGT_DIR)/%.xml.with_header
 	xmllint --format - < $< > $@
 
 $(TGT_DIR)/%.xml.with_header : $(TGT_DIR)/%.xml.parsed
-	python add_proper_tei_header.py $< $@
+	python add_proper_tei_header.py $(CANONICAL_FLAG) $< $@
 
 $(TGT_DIR)/%.xml.parsed : $(TGT_DIR)/%.xml.tokenized
 	cp $< $@
@@ -60,12 +65,22 @@ $(TGT_DIR)/%.xml.specstr_replaced : $(SRC_DIR)/%.xml
 
 copy_finalize_annot : $(BATCH_PATH).exer_numbers.csv runparser.pl
 	mkdir -p $(TGT_DIR)
+	RANDOM=2024; \
 	while read fid exer; do \
+		annot_count=`ls $(SRC_DIR)/$$fid*-$$exer.xml | wc -l`; \
+		rand_i=$$((RANDOM % $$annot_count)); \
+		i=0; \
 		for f in $(SRC_DIR)/$$fid*-$$exer.xml; do \
+			if [ $$i -eq $$rand_i ]; then \
+				canonical=1; \
+			else \
+				canonical=0; \
+			fi; \
 			bf=`basename $$f .xml`; \
 			echo "===== $$f ====="; \
-			make $(TGT_DIR)/$$bf.xml; \
+			make $(TGT_DIR)/$$bf.xml CANONICAL=$$canonical; \
 			make $(TGT_DIR)/$$bf.txt; \
+			i=$$((i+1)); \
 		done; \
 	done < $<
 
diff --git a/data_preparation/70.releasing/add_proper_tei_header.py b/data_preparation/70.releasing/add_proper_tei_header.py
index eb14808..4613fae 100644
--- a/data_preparation/70.releasing/add_proper_tei_header.py
+++ b/data_preparation/70.releasing/add_proper_tei_header.py
@@ -39,7 +39,6 @@
     <editionStmt>
       <edition>{pub_version}</edition>
     </editionStmt>
-
     
     <publicationStmt>
       <publisher>
@@ -108,11 +107,13 @@
     </langUsage>
     <textClass>
       <keywords scheme="custom">
+        <term type="database">Databáze mluvených projevů v češtině jako cizím jazyce (trvalý pobyt v ČR)</term>
         <term type="exam-id">{examid}</term>
         <term type="cefr-level">{level}</term>
         <term type="task-number">{exerno}</term>
         <term type="preannot-source">{type}</term>
-        <term type="database">Databáze mluvených projevů v češtině jako cizím jazyce (trvalý pobyt v ČR)</term>
+        <term type="annotator">{annotator_short}</term>
+        <term type="canonical">{canonical:d}</term>
       </keywords>
     </textClass>
   </profileDesc>
@@ -140,6 +141,7 @@ def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('input', help='Path to the input file')
     parser.add_argument('output', help='Path to the output file')
+    parser.add_argument('--canonical', action='store_true', help='The annotation is canonical')
     parser.add_argument('--pub-date', type=str, default='2024-10-31', help='Publication date')
     parser.add_argument('--pub-version', type=str, default='1.0', help='Publication version')
     parser.add_argument('--handle-uri', type=str, default='http://hdl.handle.net/11234/1-5731', help='Handle URI')
@@ -185,7 +187,7 @@ def add_info_from_file_content(info, doctree):
     info['reviewer'] = ANNOTATOR_NAMES[review_duration_elems[0].attrib.get('user') if review_duration_elems else 'MR']
 
 def add_info_from_args(info, args):
-    for name in ['handle_uri', 'pub_version', 'rec_inst', 'rec_inst_en', 'rec_inst_short']:
+    for name in ['handle_uri', 'pub_version', 'rec_inst', 'rec_inst_en', 'rec_inst_short', 'canonical']:
         info[name] = getattr(args, name)
     info['pub_date'] = datetime.strptime(args.pub_date, '%Y-%m-%d')