Eval trained models


declare -a model_names_list=(
baseline_bidaf.tar.gz
baseline_qanet.tar.gz
dassa_coref_feats.tar.gz
dassa_multi_srl_sdp_exp.tar.gz
dassa_multi_srl_sdp_exp_coref_feats.tar.gz
dassa_multi_srl_sdp_exp_nonexp.tar.gz
dassa_multi_srl_sdp_nonexp.tar.gz
dassa_sdp_exp.tar.gz
dassa_sdp_exp_nonexp.tar.gz
dassa_sdp_exp_nosense.tar.gz
dassa_sdp_ne.tar.gz
dassa_sdp_ne_nosense.tar.gz
dassa_sentspan3.tar.gz
dassa_srl_3verbs.tar.gz
dassa_srl_4verbs.tar.gz
)
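# Optional sanity check (a hypothetical addition, not part of the original script):
# verify that each model archive from the list above is present under
# trained_models/emnlp_paper/ before starting the evaluation loop.
for model_name_curr in "${model_names_list[@]}"
do
    if [ ! -f "trained_models/emnlp_paper/${model_name_curr}" ]; then
        echo "Missing archive: trained_models/emnlp_paper/${model_name_curr}" >&2
    fi
done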

# Whether to run the trained model on the evaluation input and write predictions.
EVAL_PREDICTIONS=TRUE

# Whether to compute additional metrics, e.g. broken down by question type/length.
GENERATE_METRICS=TRUE

# If a previous evaluation run failed part-way, append predictions for the remaining
# examples to the existing file (or skip the split entirely if it is already complete).
SKIP_OR_APPEND_FILE=TRUE
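# Example (a possible variation, not part of the original script): to only recompute
# metrics on predictions that already exist, without re-running the models, set
#   EVAL_PREDICTIONS=FALSE
#   GENERATE_METRICS=TRUE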

for model_name_curr in "${model_names_list[@]}"
do
    JOB_NAME=${model_name_curr}

    BATCH_SIZE=1
    python_exec=/home/mitarb/mihaylov/anaconda3/envs/dassa/bin/python
    ARCHIVE_FILE=trained_models/emnlp_paper/${model_name_curr}


    OUTPUT_DIR=_output/emnlp_paper/${JOB_NAME}
    mkdir -p ${OUTPUT_DIR}
    mkdir -p _jobs

    cp logs/job_${JOB_NAME}.log logs/job_${JOB_NAME}_train.log

    RESEARCH_DIR=/home/mitarb/mihaylov/research

    CUDA_DEVICE=0

    JOB_NAME=eval_${JOB_NAME}
    JOB_BASH_FILE=_jobs/${JOB_NAME}.sh

    TRAIN_FILE=data/narrativeqa_annotated/summaries_annotated.jsonl.train
    DEV_FILE=data/narrativeqa_annotated/summaries_annotated.jsonl.valid
    TEST_FILE=data/narrativeqa_annotated/summaries_annotated.jsonl.test
    TEST2_FILE=

    TRAIN_EVAL_OUT=predictions_train.json
    DEV_EVAL_OUT=predictions_dev.json
    TEST_EVAL_OUT=predictions_test.json
    TEST2_EVAL_OUT=

    TRAIN_EVAL_ANNO=
    DEV_EVAL_ANNO=
    TEST_EVAL_ANNO=tools/annotations/narrativeqa-annotation.json
    TEST2_EVAL_ANNO=

    DEV_EVAL_NUM=3461
    TEST_EVAL_NUM=10557
    TEST2_EVAL_NUM=

    ### Create bash file
    ### We create a bash file with the required evaluation commands and execute it.
    echo -e "#Script for ${JOB_NAME}\n" > ${JOB_BASH_FILE}

     # Predict DEV
     EVAL_FILE=${DEV_FILE}
     ANNOTATION_FILE=${DEV_EVAL_ANNO}
     if [ -n "${ANNOTATION_FILE}" ]; then
        ANNOTATION_FILE=" -a ${ANNOTATION_FILE}"
     fi
     PREDICTIONS_OUT_FILE=${OUTPUT_DIR}/${DEV_EVAL_OUT}
     EVAL_NUM=${DEV_EVAL_NUM}
     if [ "${EVAL_PREDICTIONS}" == "TRUE" ]; then
         START_ID=-1
         END_ID=-1
         FILE_OPEN_MODE=w
         SKIP_CURR=FALSE
         EVAL_FILE_LEN=$(wc -l < "${EVAL_FILE}")
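         # Note on the resume logic below (the same logic is repeated in the TEST and
         # TEST2 blocks): if a predictions file already exists and SKIP_OR_APPEND_FILE=TRUE,
         # START_ID is set to the number of lines already written, so evaluation continues
         # from that example in append mode; if the file already contains EVAL_NUM
         # predictions, this split is skipped. START_ID/END_ID of -1 appear to mean
         # "evaluate the whole file".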
         if [ -f ${PREDICTIONS_OUT_FILE} ]; then
            if [ "${SKIP_OR_APPEND_FILE}" == "TRUE" ]; then
                START_ID=$(wc -l < "${PREDICTIONS_OUT_FILE}")
                if [ "${EVAL_NUM}" == "${START_ID}" ]; then
                    SKIP_CURR=TRUE
                fi
                FILE_OPEN_MODE=a
            fi
         fi

         if [ "${SKIP_CURR}" == "FALSE" ]; then
             JOB_SCRIPT="RESEARCH_DIR=${RESEARCH_DIR};PYTHONPATH=. ${python_exec} docqa/run.py evaluate_custom --archive_file ${ARCHIVE_FILE} --evaluation_data_file ${EVAL_FILE} --output_file ${PREDICTIONS_OUT_FILE} --cuda_device ${CUDA_DEVICE} --batch_size=${BATCH_SIZE} --start_id ${START_ID} --end_id ${END_ID} --file_open_mode=${FILE_OPEN_MODE}"
             echo ${JOB_SCRIPT} >> ${JOB_BASH_FILE}
             echo -e "\n" >> ${JOB_BASH_FILE}
         fi
     fi

     # DEV generate metrics
     if [ "${GENERATE_METRICS}" == "TRUE" ]; then
         JOB_SCRIPT="RESEARCH_DIR=${RESEARCH_DIR};PYTHONPATH=. ${python_exec} tools/narrativeqa_eval_generation.py -i ${PREDICTIONS_OUT_FILE} ${ANNOTATION_FILE}"
         echo ${JOB_SCRIPT} >> ${JOB_BASH_FILE}
         echo -e "\n" >> ${JOB_BASH_FILE}
     fi

     # Predict TEST
     EVAL_FILE=${TEST_FILE}
     ANNOTATION_FILE=${TEST_EVAL_ANNO}
     if [ -n "${ANNOTATION_FILE}" ]; then
        ANNOTATION_FILE=" -a ${ANNOTATION_FILE}"
     fi
     PREDICTIONS_OUT_FILE=${OUTPUT_DIR}/${TEST_EVAL_OUT}
     EVAL_NUM=${TEST_EVAL_NUM}
     if [ "${EVAL_PREDICTIONS}" == "TRUE" ]; then
         START_ID=-1
         END_ID=-1
         FILE_OPEN_MODE=w
         SKIP_CURR=FALSE
         EVAL_FILE_LEN=$(wc -l < "${EVAL_FILE}")
         if [ -f ${PREDICTIONS_OUT_FILE} ]; then
            if [ "${SKIP_OR_APPEND_FILE}" == "TRUE" ]; then
                START_ID=$(wc -l < "${PREDICTIONS_OUT_FILE}")
                if [ "${EVAL_NUM}" == "${START_ID}" ]; then
                    SKIP_CURR=TRUE
                fi
                FILE_OPEN_MODE=a
            fi
         fi

         if [ "${SKIP_CURR}" == "FALSE" ]; then
             JOB_SCRIPT="RESEARCH_DIR=${RESEARCH_DIR};PYTHONPATH=. ${python_exec} docqa/run.py evaluate_custom --archive_file ${ARCHIVE_FILE} --evaluation_data_file ${EVAL_FILE} --output_file ${PREDICTIONS_OUT_FILE} --cuda_device ${CUDA_DEVICE} --batch_size=${BATCH_SIZE} --start_id ${START_ID} --end_id ${END_ID} --file_open_mode=${FILE_OPEN_MODE}"
             echo ${JOB_SCRIPT} >> ${JOB_BASH_FILE}
             echo -e "\n" >> ${JOB_BASH_FILE}
         fi
     fi

     # TEST generate metrics
     if [ "${GENERATE_METRICS}" == "TRUE" ]; then
         JOB_SCRIPT="RESEARCH_DIR=${RESEARCH_DIR};PYTHONPATH=. ${python_exec} tools/narrativeqa_eval_generation.py -i ${PREDICTIONS_OUT_FILE} ${ANNOTATION_FILE}"
         echo ${JOB_SCRIPT} >> ${JOB_BASH_FILE}
         echo -e "\n" >> ${JOB_BASH_FILE}
     fi

    # Predict TEST2
    if [ "$TEST2_FILE" -ne "" ]; then
        EVAL_FILE=${$TEST2_FILE}
        ANNOTATION_FILE=${TEST2_EVAL_ANNO}
        if [ -n "${ANNOTATION_FILE}" ]; then
            ANNOTATION_FILE=" -a ${ANNOTATION_FILE}"
        fi
        PREDICTIONS_OUT_FILE=${OUTPUT_DIR}/${TEST2_EVAL_OUT}
        EVAL_NUM=${TEST2_EVAL_NUM}
        if [ "${EVAL_PREDICTIONS}" == "TRUE" ]; then
            START_ID=-1
            END_ID=-1
            FILE_OPEN_MODE=w
            SKIP_CURR=FALSE
            EVAL_FILE_LEN=$(wc -l < "${EVAL_FILE}")
            if [ -f ${PREDICTIONS_OUT_FILE} ]; then
               if [ "${SKIP_OR_APPEND_FILE}" == "TRUE" ]; then
                   START_ID=$(wc -l < "${PREDICTIONS_OUT_FILE}")
                   if [ "${EVAL_NUM}" == "${START_ID}" ]; then
                       SKIP_CURR=TRUE
                   fi
                   FILE_OPEN_MODE=a
               fi
            fi

            if [ "${SKIP_CURR}" == "FALSE" ]; then
                JOB_SCRIPT="RESEARCH_DIR=${RESEARCH_DIR};PYTHONPATH=. ${python_exec} docqa/run.py evaluate_custom --archive_file ${ARCHIVE_FILE} --evaluation_data_file ${EVAL_FILE} --output_file ${PREDICTIONS_OUT_FILE} --cuda_device ${CUDA_DEVICE} --batch_size=${BATCH_SIZE} --start_id ${START_ID} --end_id ${END_ID} --file_open_mode=${FILE_OPEN_MODE}"
                echo ${JOB_SCRIPT} >> ${JOB_BASH_FILE}
                echo -e "\n" >> ${JOB_BASH_FILE}
            fi
        fi

        # TEST2 generate metrics
        if [ "${GENERATE_METRICS}" == "TRUE" ]; then
            JOB_SCRIPT="RESEARCH_DIR=${RESEARCH_DIR};PYTHONPATH=. ${python_exec} tools/narrativeqa_eval_generation.py -i ${PREDICTIONS_OUT_FILE} ${ANNOTATION_FILE}"
            echo ${JOB_SCRIPT} >> ${JOB_BASH_FILE}
            echo -e "\n" >> ${JOB_BASH_FILE}
        fi
    fi


    echo "bash ${JOB_BASH_FILE}"
    bash ${JOB_BASH_FILE}

done

Export results

bash tools/display_out_metrics_narrativeqa.sh
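
A quick way to check which models already have predictions written out, before exporting the results, is to list the per-model output directories created by the loop above. This is only a minimal sketch and assumes the _output/emnlp_paper/<model>/ layout used by the evaluation script; it is not part of the original tooling.

for out_dir in _output/emnlp_paper/*/; do
    echo "${out_dir}:"
    ls -lh "${out_dir}"
done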