Training a model: notes
Things to pay attention to in Pie for training:

- Geste has no punctuation, yet the peculiarity `"breakline_data": "$."` has to be taken into account (see the TSV sketch below).
- Wauchier does not have properly punctuated data (?)
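
As a reminder, Pie breaks sentences wherever the column named by `breakline_ref` carries the `breakline_data` value. A minimal sketch of a matching `.tab` excerpt (tokens and tags are made up for illustration; columns are the token plus `tasks_order`, i.e. lemma, pos, morph, with no header):

```
la	le	DET	_
dame	dame	NOUN	_
parla	parler	VERB	_
.	.	$.	_
```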
What else?

Write a settings file for PPA Splitter? A first draft:
```yaml
# For this file, the tokens will be split in windows of 20 words and
# dispatched accordingly
datasets/chrestien.tsv:
  column_marker: TAB
  splitter: token_window
  window: 20
datasets/dotmarkers.tsv:
  column_marker: TAB
  sentence_markers: ';.:'
  splitter: punctuation
datasets/empty_line.tsv:
  column_marker: TAB
  splitter: empty_line
datasets/flow.tsv:
  column_marker: TAB
  splitter: file_split
```
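
For intuition, a rough Python sketch of what the two main strategies above do. This is not PPA Splitter's actual code; the function names and the flat token-list input are made up for illustration:

```python
from typing import Iterable, Iterator, List


def token_window(tokens: List[str], window: int = 20) -> Iterator[List[str]]:
    """Cut the token stream into fixed-size chunks, as with
    `splitter: token_window` / `window: 20` above (last chunk may be shorter)."""
    for start in range(0, len(tokens), window):
        yield tokens[start:start + window]


def punctuation_split(tokens: Iterable[str], markers: str = ";.:") -> Iterator[List[str]]:
    """Close a chunk whenever a sentence marker is met, as with
    `splitter: punctuation` / `sentence_markers: ';.:'` above."""
    chunk: List[str] = []
    for token in tokens:
        chunk.append(token)
        if token in markers:  # single-character punctuation tokens
            yield chunk
            chunk = []
    if chunk:  # trailing tokens with no final marker
        yield chunk


# 50 tokens -> chunks of 20, 20 and 10, which can then be
# dispatched between train/dev/test sets
chunks = list(token_window([f"tok{i}" for i in range(50)]))
```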
The Pie configuration itself (JSON with `//` comments, which Pie strips when loading):

```json
{
  // * General
  // model name to be used for saving
  "modelname": "latest-fro",
  // model path to be used for saving
  "modelpath": "./",
  // run test (no serialization)
  "run_test": false,
  // max length of sentences (longer sentences will be split)
  "max_sent_len": 35,
  // maximum number of sentences to process
  "max_sents": 1000000,
  // * Data
  // path or unix-like expression to file(s)/dir with training data,
  // e.g. "datasets/capitula_classic/train0.tsv"
  "input_path": "./datasets/jbc/train/train.tab",
  // path to the test set (same format as input_path)
  "test_path": "./datasets/jbc/test/test.tab",
  // path to the dev set (same format as input_path)
  "dev_path": "./datasets/jbc/dev/dev.tab",
  // data to use as reference for breaking lines (e.g. "pos")
  "breakline_ref": "pos",
  // value needed to decide on a sentence boundary (e.g. "$.")
  "breakline_data": "$.",
  // maximum character vocabulary size
  "char_max_size": 500,
  // maximum vocabulary size for word input
  "word_max_size": 20000,
  // min freq per item to be incorporated in the vocabulary (only used if *_max_size is 0)
  "char_min_freq": 1,
  "word_min_freq": 1,
  // char-level encoding
  "char_eos": true,
  "char_bos": true,
  // tab-format only: whether the file has a header row
  "header": false,
  // separator for csv-like files
  "sep": "\t",
  // expected order of tasks (TabReader only)
  "tasks_order": ["lemma", "pos", "morph"],
  // * Tasks
  "tasks": [
    // each task's name refers to the corresponding data field;
    // this behaviour can be changed, in case the name differs from the data field,
    // by using a "target" key that refers to the target data field,
    // e.g. {"name": "lemma-char", "settings": {"target": "lemma"}}
    // or   {"name": "lemma-word", "settings": {"target": "lemma"}}
    {
      "name": "lemma",
      "target": true,
      "context": "sentence",
      "level": "char",
      "decoder": "attentional",
      "settings": {
        "bos": true,
        "eos": true,
        "lower": true,
        "target": "lemma"
      },
      "layer": -1
    },
    {"name": "pos"},
    {"name": "morph"}
  ],
  "task_defaults": {
    "level": "token",
    "layer": -1,
    "decoder": "linear",
    "context": "sentence"
  },
  // general task schedule params (can be overwritten in the "settings" entry of each task)
  "patience": 5,
  "factor": 0.5, // decrease the loss weight by this amount (in (0, 1))
  "threshold": 0, // minimum decrease in loss to be considered an improvement
  "min_weight": 0, // minimum value a task weight can be decreased to
  // whether to include the autoregressive language-model loss
  "include_lm": true,
  // whether to share the output layer for both fwd and bwd lm
  "lm_shared_softmax": true,
  // lm settings in case it is included as an extra task
  "lm_schedule": {
    "patience": 2,
    "factor": 0.5,
    "weight": 0.2,
    "mode": "min"
  },
  // * Training
  "batch_size": 25,
  "dropout": 0.25,
  "word_dropout": 0,
  "lr": 0.001,
  // learning-rate scheduler
  "lr_factor": 0.75,
  "lr_patience": 2,
  "epochs": 100,
  "checks_per_epoch": 1,
  "report_freq": 200,
  "verbose": true,
  "device": "cuda",
  "buffer_size": 10000,
  "minimize_pad": false,
  "shuffle": true,
  "optimizer": "Adam",
  "clip_norm": 5, // gradient clipping
  // * Model
  "cell": "GRU",
  "num_layers": 1,
  "hidden_size": 150,
  "wemb_dim": 0, // word embedding dimension
  "cemb_dim": 300, // character embedding dimension
  "cemb_type": "rnn",
  "cemb_layers": 2,
  "custom_cemb_cell": false,
  "merge_type": "concat",
  "scorer": "general",
  "linear_layers": 1,
  // pretrained embeddings / encoder
  "pretrain_embeddings": false,
  "load_pretrained_embeddings": "",
  "load_pretrained_encoder": "",
  "freeze_embeddings": false
}
```
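
Before launching a run, one quick sanity check is to make sure the file is still valid JSON once the `//` comments are stripped (Pie does something similar internally when it loads settings). A standalone sketch, assuming the file is saved as `config.json`:

```python
import json
import re

# naive comment stripping for this style of config: drop everything
# from "//" to the end of the line (safe here, since no string value
# in the file contains "//")
with open("config.json", encoding="utf-8") as f:
    raw = f.read()
stripped = re.sub(r"//[^\n]*", "", raw)

settings = json.loads(stripped)  # raises ValueError on broken JSON
assert settings["breakline_data"] == "$."
print(sorted(settings))  # quick overview of the available keys
```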