Training a model: notes

Data preparation

Things to watch out for when training with Pie:

Geste

Geste has no punctuation, but it has the peculiarity "breakline_data": "$.", which needs to be taken into account.
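
For illustration, a minimal sketch of how this option pair behaves (the rows below are made up, not actual Geste data): with "breakline_ref": "pos", any row whose POS value equals the "breakline_data" value "$." is treated as a sentence boundary.

# Illustrative only: rows are (token, lemma, pos) triples, invented for the example
rows = [
    ("Molt", "molt", "ADVgen"),
    ("fu", "estre", "VERcjg"),
    ("liez", "lie", "ADJqua"),
    ("$.", "$.", "$."),        # boundary row: its pos matches breakline_data
    ("Li", "li", "DETdef"),
    ("rois", "roi", "NOMcom"),
]

def sentences(rows, breakline_data="$."):
    """Split a token stream on rows whose POS equals breakline_data."""
    sent = []
    for token, lemma, pos in rows:
        if pos == breakline_data:  # "breakline_ref": "pos" means: look at the pos column
            if sent:
                yield sent
            sent = []
        else:
            sent.append((token, lemma, pos))
    if sent:
        yield sent

print([[t for t, _, _ in s] for s in sentences(rows)])
# [['Molt', 'fu', 'liez'], ['Li', 'rois']]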

Wauchier

Wauchier has no properly punctuated data (?)

Other corpora?

What else?

Configurations?

Write a settings file for PPA Splitter?

# For this file, the tokens will be split into windows of 20 words and
#   dispatched accordingly
datasets/chrestien.tsv:
  column_marker: TAB
  splitter: token_window
  window: 20
datasets/dotmarkers.tsv:
  column_marker: TAB
  sentence_markers: ';.:'
  splitter: punctuation
datasets/empty_line.tsv:
  column_marker: TAB
  splitter: empty_line
datasets/flow.tsv:
  column_marker: TAB
  splitter: file_split
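
As a rough Python sketch of what the token_window strategy above amounts to (illustrative, not PPA Splitter's actual implementation): the token stream is cut into fixed-size chunks of window tokens, and each chunk is dispatched as one unit.

def token_windows(tokens, window=20):
    """Cut a token stream into successive fixed-size chunks."""
    for start in range(0, len(tokens), window):
        yield tokens[start:start + window]

# 45 tokens -> chunks of 20, 20 and 5 tokens
tokens = ["tok%d" % i for i in range(45)]
print([len(chunk) for chunk in token_windows(tokens)])  # [20, 20, 5]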

Training configuration

{
  // * General
  // model name to be used for saving
  "modelname": "latest-fro",
  // model path to be used for saving
  "modelpath": "./",
  // run test (no serialization)
  "run_test": false,
  // max length of sentences (longer sentences will be split)
  "max_sent_len": 35,
  // maximum number of sentences to process
  "max_sents": 1000000,
  // * Data
  // path or unix-like expression to file(s)/dir with training data:
  // e.g. "datasets/capitula_classic/train0.tsv"
  "input_path": "./datasets/jbc/train/train.tab",
  // path to test set (same format as input_path)
  "test_path": "./datasets/jbc/test/test.tab",
  // path to dev set (same format as input_path)
  "dev_path": "./datasets/jbc/dev/dev.tab",
  // data to use as reference for breaking lines (e.g. "pos")
  "breakline_ref": "pos",
  // needed to decide for a sentence boundary (e.g. "$.")
  "breakline_data": "$.",
  // maximum vocabulary size for char input
  "char_max_size": 500,
  // maximum vocabulary size for word input
  "word_max_size": 20000,
  // min freq per item to be incorporated in the vocabulary (only used if *_max_size is 0)
  "char_min_freq": 1,
  "word_min_freq": 1,
  // char-level encoding
  "char_eos": true,
  "char_bos": true,
  // tab-format only:
  "header": false,
  // separator for csv-like files
  "sep": "\t",
  // expected order of tasks (only for TabReader)
  "tasks_order": ["lemma", "pos", "morph"],
  // task-related config
  "tasks": [
    // each task's name refers to the corresponding data field
    // this behaviour can be changed in case the name differs from the data field
    // by using a "target" key that refers to the target data field
    // e.g. {"name": "lemma-char", "settings": {"target": "lemma"}}
    // e.g. {"name": "lemma-word", "settings": {"target": "lemma"}}
    {
      "name": "lemma",
      "target": true,
      "context": "sentence",
      "level": "char",
      "decoder": "attentional",
      "settings": {
        "bos": true,
        "eos": true,
        "lower": true,
        "target": "lemma"
      },
      "layer": -1
    },
    {"name": "pos"},
    {"name": "morph"}
  ],
  "task_defaults": {
    "level": "token",
    "layer": -1,
    "decoder": "linear",
    "context": "sentence"
  },
  // general task schedule params (can be overwritten in the "settings" entry of each)
  "patience": 5,
  "factor": 0.5, // decrease the loss weight by this amount (0, 1)
  "threshold": 0, // minimum decrease in loss to be considered an improvement
  "min_weight": 0, // minimum value a task weight can be decreased to

  // whether to include autoregressive loss
  "include_lm": true,
  // whether to share the output layer for both fwd and bwd lm
  "lm_shared_softmax": true,
  // lm settings in case it's included as an extra task
  "lm_schedule": {
    "patience": 2,
    "factor": 0.5,
    "weight": 0.2,
    "mode": "min"
  },
  "batch_size": 25,
  "patience": 5,
  "factor": 0.5,
  "dropout": 0.25,
  "lr": 0.001,
  "lr_factor": 0.75,
  "lr_patience": 2,
  "epochs": 100,
  "cell": "GRU",
  "num_layers": 1,
  "hidden_size": 150,
  "wemb_dim": 0,
  "cemb_dim": 300,
  "cemb_type": "rnn",
  "cemb_layers": 2,
  "checks_per_epoch": 1,
  "report_freq": 200,
  "verbose": true,
  "device": "cuda",
  "run_test": false,
  "max_sents": 1000000,
  "char_max_size": 500,
  "char_min_freq": 1,
  "word_min_freq": 1,
  "char_eos": true,
  "char_bos": true,
  "threshold": 0,
  "min_weight": 0,
  "buffer_size": 10000,
  "minimize_pad": false,
  "word_dropout": 0,
  "shuffle": true,
  "optimizer": "Adam",
  "clip_norm": 5,
  "pretrain_embeddings": false,
  "load_pretrained_embeddings": "",
  "load_pretrained_encoder": "",
  "freeze_embeddings": false,
  "custom_cemb_cell": false,
  "merge_type": "concat",
  "scorer": "general",
  "linear_layers": 1
}
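
Note that the file above is not strict JSON because of the // comments; pie's settings loader is expected to strip them itself (its shipped example configs carry the same comment style). To read such a file outside of pie, here is a minimal sketch, assuming no string value ever contains "//" (true of the config above):

import json
import re

def load_commented_json(path):
    """Parse a pie-style settings file after dropping //-comments."""
    with open(path) as f:
        text = f.read()
    # strip everything from "//" to the end of each line
    text = re.sub(r"//[^\n]*", "", text)
    return json.loads(text)

settings = load_commented_json("latest-fro.json")  # hypothetical filename
print(settings["modelname"], settings["batch_size"])  # latest-fro 25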