#!/bin/bash
# create_memmaps.sh: build tokenized memmaps for every benchmark dataset / OCR
# engine pair, writing results to memmaps/<dataset>/<ocr_engine>.
# Run from the repository root so that ./benchmarker/cli/l5/create_memmaps.py resolves.
DATASETS_ROOT="/data-c/shared/athena/datasets/public/challenges_benchmark_paper"
TOKENIZER="/data-c/shared/athena/models/hf/t5-base"
MAX_LENGTHS=(1024 6144 6144 1024 4096 1024 6144)
TRAIN_STRATEGIES=(all_items concat all_items all_items concat all_items all_items)
DATASETS=(DocVQA PWC DeepForm TabFact WikiTableQuestions InfographicsVQA KleisterCharity)
OCRS=(microsoft_cv tesseract djvu)
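
# The three arrays above are index-aligned: entry i of MAX_LENGTHS and
# TRAIN_STRATEGIES configures dataset i of DATASETS, i.e.:
#   DocVQA             1024  all_items
#   PWC                6144  concat
#   DeepForm           6144  all_items
#   TabFact            1024  all_items
#   WikiTableQuestions 4096  concat
#   InfographicsVQA    1024  all_items
#   KleisterCharity    6144  all_items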
for task_idx in "${!DATASETS[@]}"
do
    mkdir -p "memmaps/${DATASETS[task_idx]}"
    for ocr_engine in "${OCRS[@]}"
    do
        # Skip OCR engines whose output is absent from this dataset's documents_content.jsonl.
        if ! grep -q "\"tool_name\": \"$ocr_engine\"" "$DATASETS_ROOT/${DATASETS[task_idx]}/documents_content.jsonl"; then
            echo "Skipping ${DATASETS[task_idx]}/$ocr_engine"
            continue
        fi
echo "Producing memmaps for ${DATASETS[task_idx]}/$ocr_engine..."
TOKENIZERS_PARALLELISM=false ./benchmarker/cli/l5/create_memmaps.py \
--dataset_path_or_name $DATASETS_ROOT/${DATASETS[task_idx]}/ \
--model_path $TOKENIZER \
--memmap_path memmaps/${DATASETS[task_idx]}/$ocr_engine \
--max_encoder_length ${MAX_LENGTHS[task_idx]} \
--segment_levels "(tokens,pages)" \
--processes 20 \
--ocr_engine $ocr_engine \
--train_strategy ${TRAIN_STRATEGIES[task_idx]} \
--dev_strategy concat \
--test_strategy concat \
--use_fast_tokenizer
done
done
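
# A successful run is expected to leave one memmap directory per dataset/OCR
# pair whose output appears in the content files; for example (assuming DocVQA
# ships output from these engines):
#   memmaps/DocVQA/microsoft_cv
#   memmaps/DocVQA/tesseract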