-
Notifications
You must be signed in to change notification settings - Fork 3
/
morphadorner.properties
290 lines (173 loc) · 7.44 KB
/
morphadorner.properties
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
##### Adorner processing classes.
# Each "*.class" key below names the class used for one processing stage.
# NOTE(review): the values are unqualified class names; how they are
# resolved to full class names is not shown in this file.
## Use list adorned word outputter.
adornedwordoutputter.class=ListAdornedWordOutputter
## Use word tokenizer that treats apostrophes as distinct from single quotes.
wordtokenizer.class=DefaultWordTokenizer
## Use default post tokenizer.
posttokenizer.class=DefaultPostTokenizer
## Use trigram part of speech tagger.
partofspeechtagger.class=TrigramTagger
## Transition matrix file for part of speech tags.
partofspeechtagger.transition_matrix=data/ncftransmat.mat
## Use default retagger.
partofspeechretagger.class=DefaultPartOfSpeechRetagger
## Use default part of speech guesser.
partofspeechguesser.class=DefaultPartOfSpeechGuesser
## Check possessives when guessing parts of speech? Turned off here.
partofspeechguesser.check_possessives=false
## Try standard spellings to assist in guessing parts of speech for
## unknown words.
partofspeechguesser.try_standard_spellings=true
## Use default sentence splitter.
sentencesplitter.class=DefaultSentenceSplitter
## Use noop spelling standardizer as initial standardizer.
initialspellingstandardizer.class=NoopSpellingStandardizer
## Use extended simple search spelling standardizer.
spellingstandardizer.class=ExtendedSimpleSpellingStandardizer
## Use U.S. to British spelling mapper.
spellingmapper.class=USToBritishSpellingMapper
## Use noop name standardizer.
namestandardizer.class=NoopNameStandardizer
## Use simple XML file text inputter.
textinputter.class=SimpleXMLTextInputter
## Use default XML writer.
morphadornerxmlwriter.class=DefaultMorphAdornerXMLWriter
##### Configuration settings for Adorner output.
# Most settings come in pairs: a true/false flag, followed by a
# "*_attribute" key.
# NOTE(review): presumably the "*_attribute" value is the attribute name
# under which the item is written -- confirm against the output writer.
## No sentence number.
adorner.output.sentence_number=false
adorner.output.sentence_number_attribute=sn
## No word number.
adorner.output.word_number=false
adorner.output.word_number_attribute=wn
## Running word numbers are off: word numbers restart with each sentence.
adorner.output.running_word_numbers=false
## Output word ordinal.
adorner.output.word_ordinal=true
adorner.output.word_ordinal_attribute=ord
## Output original token text (before post tokenization is applied).
adorner.output.original_token=true
adorner.output.original_token_attribute=tok
## Output original spelling.
adorner.output.spelling=true
adorner.output.spelling_attribute=spe
## Output part of speech.
adorner.output.part_of_speech=true
adorner.output.part_of_speech_attribute=pos
## Output lemma.
adorner.output.lemma=true
adorner.output.lemma_attribute=lem
## Ignore lemmata in lexicon when lemmatizing? (No.)
adorner.lemmatization.ignorelexiconentries=false
## Output standard spelling.
adorner.output.standard_spelling=true
adorner.output.standard_spelling_attribute=reg
## Do not output KWIC (keyword in context) text for each token.
## Separate attributes are configured for the left and right context.
adorner.output.kwic=false
adorner.output.kwic_left_attribute=kl
adorner.output.kwic_right_attribute=kr
## Width in characters of KWIC context.
adorner.output.kwic.width=80
## Output end of sentence flag for each token.
adorner.output.end_of_sentence_flag=true
adorner.output.end_of_sentence_flag_attribute=eos
## True to perform extended XML processing.
adorner.handle_xml=true
##### Lexicons.
# NOTE(review): the paths are relative; the directory they are resolved
# against is not shown in this file -- confirm.
## Word lexicon file.
lexicon.word_lexicon=data/ncflexicon.lex
## Suffix lexicon file.
lexicon.suffix_lexicon=data/ncfsuffixlexicon.lex
##### Spelling lists.
# Files of standard spellings, spelling pairs, and spelling pairs grouped
# by word class.
# NOTE(review): the paths are relative; the directory they are resolved
# against is not shown in this file -- confirm.
spelling.standard_spellings=data/standardspellings.txt
spelling.spelling_pairs=data/ncfmergedspellingpairs.tab
spelling.spelling_pairs_by_word_class=data/spellingsbywordclass.txt
##### Additional word lists.
## Do not use the Latin word list.
wordlists.use_latin_word_list=false
##### Configuration settings for XML handling follow.
## Name of the word id attribute.
# This is generated only for XML output.
xml.id.attribute = xml:id
## Type of ID.
#
# Word IDs start with the work identifier, taken from the file name
# of the work.
#
# "reading_context_order" appends integer values
# whose order gives the reading context order defined
# by the classification of hard, soft, and jump tags.
#
# "word_within_page_block" appends two integer values in
# the form pageblocknumber-wordinblock, where pageblocknumber
# is the ordinal of the current <pb> (page break) entry,
# and wordinblock is the number of the word within
# the page block (starting at 1).
#
# "spacing" (xml.id.spacing) gives the spacing between ID values. For
# example, an increment of 10 spaces subsequent
# reading_context_order or wordinblock values by 10.
# This allows new values to be interpolated for editing
# purposes.
#
# IDs for words split by soft tags have a ".n" appended,
# where "n" is an integer starting at 1 giving the part
# number of the word in order.
#
# Alternative ID type, currently disabled:
#xml.id.type=word_within_page_block
xml.id.type=reading_context_order
xml.id.spacing=10
## word_tag_name is a required parameter representing the name
## of the added text tags.
##
## The name 'id' is forbidden if "id" = true below.
## NOTE(review): no key named "id" is set anywhere in this file; the
## restriction above may refer to an option that was renamed or removed --
## confirm.
xml.word_tag_name = w
## Field delimiter for adorner output.
## Delimiter cannot be a quotation mark '"'.
## NOTE(review): the delimiter values below are wrapped in double quotes
## and use backslash escapes (\t, \r\n); presumably the reader strips the
## quotes and turns the escapes into tab / carriage return + line feed --
## confirm against the properties loader.
xml.field_delimiters = "\t"
## Word delimiter for adorned output.
## Delimiter cannot be a quotation mark '"'.
xml.word_delimiters = "\r\n"
## Marker for surrounding distinct segments of text.
## NOTE(review): written as the escape \ue500 with a blank on each side,
## inside double quotes; presumably decoded to the single character U+E500
## -- confirm.
xml.surround_marker = " \ue500 "
## Ignore tag case when checking for soft/jump/hard tags.
xml.ignore_tag_case = true
## List of jump tag names. Separate tag names with a blank.
## NOTE(review): both "figdesc" and "figDesc" are listed; with
## xml.ignore_tag_case = true one of the two may be redundant -- confirm
## before removing either.
xml.jump_tags = bibl figdesc figDesc figure footnote note ref stage tailnote
## List of soft tag names. Separate tag names with a blank.
## Note: zzzzsw is used internally. Don't remove it or bad things will happen.
## The value spans several lines; each line to be continued ends with a
## backslash. Do not insert comment lines between the continued lines.
xml.soft_tags = abbr add address author c choice cl corr date emph \
foreign gap hi l lb location m mentioned milestone money name num \
organization orig pb person phr reg rs s sb seg sic soCalled sub sup term time \
title unclear w zzzzsw
## True to enable extended logging. Off here.
xml.log = false
## Output doctype name. Optional. You must specify
## xml.doctype.system in addition.
## Currently disabled (commented out):
#xml.doctype.name = TEI
## Output doctype system name (DTD). Optional. You must specify
## xml.doctype.name in addition.
## Currently disabled (commented out):
#xml.doctype.system = http://ariadne.northwestern.edu/monk/dtds/teisimple.dtd
## Schema to use for XML input parsing, if not given in input XML file.
xml.xml_schema = http://ariadne.northwestern.edu/monk/schemata/TEIAnalytics.rng
## True to output whitespace elements (e.g., <c> </c>) between
## word elements in XML.
xml.output_whitespace_elements = true
## True to emit only non-redundant word tag attributes. Off here.
xml.output_nonredundant_attributes_only = false
## True to emit only non-redundant token attributes. Off here.
xml.output_nonredundant_token_attribute = false
## Fix gap tags in XML input. Off here.
xml.fix_gap_tags = false
## Fix orig tags in XML input. Off here.
xml.fix_orig_tags = false
## Fix selected split words in XML input. Off here.
xml.fix_split_words = false
## Output pseudo-page boundaries. Off here.
xml.output_pseudo_page_boundary_milestones = false
## Size of a pseudo-page.
## NOTE(review): the unit is not stated in this file (presumably words)
## -- confirm.
xml.pseudo_page_size = 300
## Blank-separated list.
## NOTE(review): presumably the div "type" values inside which pseudo-page
## boundaries may be generated -- confirm.
xml.pseudo_page_container_div_types = volume chapter sermon
## Force end of sentence at end of hard tag. Off here.
xml.close_sentence_at_end_of_hard_tag = false
## Force end of sentence at end of jump tag. On here.
xml.close_sentence_at_end_of_jump_tag = true
## XML elements in which to disallow generated <w> and <c> elements.
## Separate element names by blanks.
xml.disallow_word_elements_in=figDesc sic
## Adorn XML files with existing adorned version in output directory.
## Off here.
xml.adorn_existing_xml_files=false