-
Notifications
You must be signed in to change notification settings - Fork 3
/
eme.properties
46 lines (33 loc) · 1.9 KB
/
eme.properties
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# Post-tokenizer: use the one which understands EEBO/TCP encodings.
posttokenizer.class=EEBOPostTokenizer
# Text inputter: use the simple XML text inputter. For very large texts it is
# better (but much slower) to use DiskBasedXMLTextInputter, which shuttles
# portions of the text to and from disk as needed. To switch, uncomment the
# DiskBasedXMLTextInputter line and comment out the SimpleXMLTextInputter line
# so that the key is defined only once.
#textinputter.class=DiskBasedXMLTextInputter
textinputter.class=SimpleXMLTextInputter
# Name standardizer: use the simple name standardizer.
namestandardizer.class=EEBOSimpleNameStandardizer
# Use the Early Modern English ("eme") data files: the part-of-speech tagger
# transition matrix, the word and suffix lexicons, the spelling pairs, and the
# abbreviations list. All are given as relative paths under data/
# (NOTE(review): presumably resolved against the working directory -- confirm).
partofspeechtagger.transition_matrix=data/emetransmat.mat
lexicon.word_lexicon=data/emelexicon.lex
lexicon.suffix_lexicon=data/emesuffixlexicon.lex
spelling.spelling_pairs=data/ememergedspellingpairs.tab
abbreviations.abbreviations_url=data/emeabbreviations.txt
# Use the Latin word list.
wordlists.use_latin_word_list=true
# Fix selected split words in XML input.
xml.fix_split_words=true
# Java-style regular expression patterns for matching and replacing split
# words. Each matchN pattern goes with the replaceN string of the same number.
# Currently the patterns fix up whitespace before a combining macron, split
# reflexive pronouns, the run-together "ofthe", and a stray period after
# "the" when a number follows.
# In the values below a doubled backslash stands for a single regex backslash
# and a Unicode escape stands for the character itself (standard .properties
# escaping; assumes the file is loaded as Java properties -- confirm).
# The zzzzsw element is used internally as a marker tag.
#
# Pair 1: a whitespace character immediately followed by U+0304 (combining
# macron) is replaced by the macron alone, i.e. the whitespace is removed.
xml.fix_split_words.match1 = \\s\u0304
xml.fix_split_words.replace1 = \u0304
# Pair 2: (?iu) makes the match case-insensitive with Unicode-aware case
# folding. Matches a pronoun form, one whitespace character, and a
# "self"/"selves" form. The replacement drops the whitespace and wraps the
# second part in the zzzzsw marker tag.
xml.fix_split_words.match2 = (?iu)(here|her|him|hir|hym|It|it|me|mi|my|not|one|oure|our|owne|own|their|theim|theym|theyr|them|the\u0304|thy|us|youre|your)\\s(selfes|selues|selves|selfe|selfs|self)
xml.fix_split_words.replace2 = $1<zzzzsw>$2</zzzzsw>
# Pair 3: "ofthe" (any letter case) between two whitespace characters is
# replaced by a <choice> holding the original in <sic> and "of the" in <corr>.
# The replacement does not reference the captured groups, so the surrounding
# whitespace becomes plain spaces and the <sic> text is always lowercase.
xml.fix_split_words.match3 = (?iu)(\\s)(ofthe)(\\s)
xml.fix_split_words.replace3 = <choice><sic> ofthe </sic><corr> of the </corr></choice>
# Pair 4: "the." followed by whitespace and a <hi> element containing only
# digits and periods. "the." is marked as <sic> with "the" as <corr>; the
# whitespace and <hi> element ($1) are kept unchanged.
# NOTE(review): there is no word boundary before "the" and no (?i) flag, so
# this also matches the tail of longer words ending in "the." and does not
# match "The." -- confirm both are intended.
xml.fix_split_words.match4 = the\\.(\\s<hi>[0123456789.]+</hi>)
xml.fix_split_words.replace4 =<choice><sic>the.</sic><corr>the</corr></choice>$1
# Pair 5: same as pair 4, but for a number that is not wrapped in <hi>.
# The same NOTE(review) about word boundary and letter case applies.
xml.fix_split_words.match5 = the\\.(\\s[0123456789.]+)
xml.fix_split_words.replace5 =<choice><sic>the.</sic><corr>the</corr></choice>$1