-
Notifications
You must be signed in to change notification settings - Fork 20
/
example.yaml
42 lines (37 loc) · 1.92 KB
/
example.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#
# YAML vocabulary can be used to create a new vocabulary or modify an existing vocabulary.
# Note that indentation must be done with 2 spaces and not with tabs.
#
# The following variables only affect a new vocabulary and will not modify an existing one:
# Has no effect on an existing vocabulary.
charset: 'utf-8'
# Has no effect on an existing vocabulary.
capcode: 2
# Multiple normalizations can be applied:
# nfd lowercase accents quotemarks collapse trim leadingspace unixlines
normalization: 'nfd'

# The following variables will modify an existing vocabulary if present:
include-256-bytes: false
include-128-bytes: false
include-utf8-bytes: false
include-ascii-bytes: false
include-extended-bytes: false
exclude-other-bytes: false
# Whether to use an UNK token for missing characters.
unk: true
# Optionally assign an ID to the UNK token (if it exists).
unk-id: 20000
# If true, ids will be ignored and assigned from zero alphabetically.
reset-token-ids: false
# Regular tokens and single byte tokens
tokens:
  # Each list item is one token; its optional fields are nested under it
  # with 2-space indentation so they belong to that item.
  - token: 'token1'
    id: 27  # the id field is optional
    score: 0.127  # the score field is optional
    # If false, normalization and capcode will be applied to the token
    # before adding it.
    encoded: true
  - token: 'token2'
  # A token can also be provided hex-encoded, like this:
  - token: 'TokenMonsterHexEncode{ff}'
# Regular tokens cannot contain special tokens, any that do will be ignored/deleted
special:
  # Special tokens should not begin with a letter or number.
  - token: '<SPECIAL1>'
    # The id and score fields are optional; they are nested under the
    # list item so they apply to this token.
    id: 1000
    score: 0.002
  - token: '<SPECIAL2>'
# Tokens specified under "delete" will be deleted
delete:
  # Deletes by token.
  - token: ' token'
    # If false, normalization and capcode will be applied before searching
    # for the token to delete.
    encoded: false
  # Deletes by ID.
  - id: 77