From 3995c034b4bdb68bec9e1fd8567a4d9bcbe5b9e2 Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Wed, 6 Nov 2024 17:39:55 +0100 Subject: [PATCH] implementation of canonical transcript selection --- data_preparation/70.releasing/Makefile | 19 +++++++++++++++++-- .../70.releasing/add_proper_tei_header.py | 8 +++++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/data_preparation/70.releasing/Makefile b/data_preparation/70.releasing/Makefile index d86e460..d568b21 100644 --- a/data_preparation/70.releasing/Makefile +++ b/data_preparation/70.releasing/Makefile @@ -17,6 +17,11 @@ RELEASE_NAME=Evaldio_Dataset-1.0 FORMAT_SIMPLE_SCRIPT=../60.format_simple/format_simple.sh +CANONICAL= +ifeq ($(CANONICAL),1) +CANONICAL_FLAG=--canonical +endif + DATE=$(shell date +%Y%m%d) ############ RULES ################ @@ -42,7 +47,7 @@ $(TGT_DIR)/%.xml : $(TGT_DIR)/%.xml.with_header xmllint --format - < $< > $@ $(TGT_DIR)/%.xml.with_header : $(TGT_DIR)/%.xml.parsed - python add_proper_tei_header.py $< $@ + python add_proper_tei_header.py $(CANONICAL_FLAG) $< $@ $(TGT_DIR)/%.xml.parsed : $(TGT_DIR)/%.xml.tokenized cp $< $@ @@ -60,12 +65,22 @@ $(TGT_DIR)/%.xml.specstr_replaced : $(SRC_DIR)/%.xml copy_finalize_annot : $(BATCH_PATH).exer_numbers.csv runparser.pl mkdir -p $(TGT_DIR) + RANDOM=2024; \ while read fid exer; do \ + annot_count=`ls $(SRC_DIR)/$$fid*-$$exer.xml | wc -l`; \ + rand_i=$$((RANDOM % $$annot_count)); \ + i=0; \ for f in $(SRC_DIR)/$$fid*-$$exer.xml; do \ + if [ $$i -eq $$rand_i ]; then \ + canonical=1; \ + else \ + canonical=0; \ + fi; \ bf=`basename $$f .xml`; \ echo "===== $$f ====="; \ - make $(TGT_DIR)/$$bf.xml; \ + make $(TGT_DIR)/$$bf.xml CANONICAL=$$canonical; \ make $(TGT_DIR)/$$bf.txt; \ + i=$$((i+1)); \ done; \ done < $< diff --git a/data_preparation/70.releasing/add_proper_tei_header.py b/data_preparation/70.releasing/add_proper_tei_header.py index eb14808..4613fae 100644 --- a/data_preparation/70.releasing/add_proper_tei_header.py +++ b/data_preparation/70.releasing/add_proper_tei_header.py @@ -39,7 +39,6 @@ {pub_version} - @@ -108,11 +107,13 @@ + Databáze mluvených projevů v češtině jako cizím jazyce (trvalý pobyt v ČR) {examid} {level} {exerno} {type} - Databáze mluvených projevů v češtině jako cizím jazyce (trvalý pobyt v ČR) + {annotator_short} + {canonical:d} @@ -140,6 +141,7 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument('input', help='Path to the input file') parser.add_argument('output', help='Path to the output file') + parser.add_argument('--canonical', action='store_true', help='The annotation is canonical') parser.add_argument('--pub-date', type=str, default='2024-10-31', help='Publication date') parser.add_argument('--pub-version', type=str, default='1.0', help='Publication version') parser.add_argument('--handle-uri', type=str, default='http://hdl.handle.net/11234/1-5731', help='Handle URI') @@ -185,7 +187,7 @@ def add_info_from_file_content(info, doctree): info['reviewer'] = ANNOTATOR_NAMES[review_duration_elems[0].attrib.get('user') if review_duration_elems else 'MR'] def add_info_from_args(info, args): - for name in ['handle_uri', 'pub_version', 'rec_inst', 'rec_inst_en', 'rec_inst_short']: + for name in ['handle_uri', 'pub_version', 'rec_inst', 'rec_inst_en', 'rec_inst_short', 'canonical']: info[name] = getattr(args, name) info['pub_date'] = datetime.strptime(args.pub_date, '%Y-%m-%d')