# morphadorner.properties

##### Adorner processing classes.

## Use list adorned word outputter.

adornedwordoutputter.class=ListAdornedWordOutputter

## Use word tokenizer that treats apostrophes as distinct from single quotes.
## NOTE(review): the class configured here is DefaultWordTokenizer; confirm
## that it provides the apostrophe handling described above.

wordtokenizer.class=DefaultWordTokenizer

## Use default post tokenizer.

posttokenizer.class=DefaultPostTokenizer

## Use trigram part of speech tagger.

partofspeechtagger.class=TrigramTagger

## Transition matrix for part of speech tags.

partofspeechtagger.transition_matrix=data/ncftransmat.mat

## Use default retagger.

partofspeechretagger.class=DefaultPartOfSpeechRetagger

## Use default part of speech guesser.

partofspeechguesser.class=DefaultPartOfSpeechGuesser

## Whether the part of speech guesser checks possessives (set to false here).

partofspeechguesser.check_possessives=false

## Try standard spellings to assist in guessing parts of speech for unknown words.

partofspeechguesser.try_standard_spellings=true

## Use default sentence splitter.

sentencesplitter.class=DefaultSentenceSplitter

## Use noop spelling standardizer as initial standardizer.

initialspellingstandardizer.class=NoopSpellingStandardizer

## Use extended simple search spelling standardizer.

spellingstandardizer.class=ExtendedSimpleSpellingStandardizer

## Use U.S. to British spelling mapper.

spellingmapper.class=USToBritishSpellingMapper

## Use noop name standardizer.

namestandardizer.class=NoopNameStandardizer

## Use simple XML file text inputter.

textinputter.class=SimpleXMLTextInputter

## Use default XML writer.

morphadornerxmlwriter.class=DefaultMorphAdornerXMLWriter

#####  Configuration settings for Adorner output.
#
#  Each output item below has an on/off flag and, where present, a
#  companion "*_attribute" key giving the attribute name for that item.


## Do not output sentence number.

adorner.output.sentence_number=false
adorner.output.sentence_number_attribute=sn

## Do not output word number.

adorner.output.word_number=false
adorner.output.word_number_attribute=wn

## Word numbers restart with each sentence.

adorner.output.running_word_numbers=false

## Output word ordinal.

adorner.output.word_ordinal=true
adorner.output.word_ordinal_attribute=ord

## Output original token text (before post tokenization is applied).

adorner.output.original_token=true
adorner.output.original_token_attribute=tok

## Output original spelling.

adorner.output.spelling=true
adorner.output.spelling_attribute=spe

## Output part of speech.

adorner.output.part_of_speech=true
adorner.output.part_of_speech_attribute=pos

## Output lemma.

adorner.output.lemma=true
adorner.output.lemma_attribute=lem

## Ignore lemmata in lexicon when lemmatizing?

adorner.lemmatization.ignorelexiconentries=false

## Output standard spelling.

adorner.output.standard_spelling=true
adorner.output.standard_spelling_attribute=reg

## Do not output KWIC context for token.

adorner.output.kwic=false
adorner.output.kwic_left_attribute=kl
adorner.output.kwic_right_attribute=kr

## Width in characters of KWIC context.

adorner.output.kwic.width=80

## Output end of sentence flag for each token.

adorner.output.end_of_sentence_flag=true
adorner.output.end_of_sentence_flag_attribute=eos

##  True to perform extended XML processing.

adorner.handle_xml=true

##### Lexicons.

## Paths to the word lexicon and the suffix lexicon.

lexicon.word_lexicon=data/ncflexicon.lex
lexicon.suffix_lexicon=data/ncfsuffixlexicon.lex

##### Spelling lists.

## Paths to the standard spellings list, the spelling pairs list,
## and the spelling pairs by word class list.

spelling.standard_spellings=data/standardspellings.txt
spelling.spelling_pairs=data/ncfmergedspellingpairs.tab
spelling.spelling_pairs_by_word_class=data/spellingsbywordclass.txt

##### Additional word lists.

## Do not use the Latin word list.

wordlists.use_latin_word_list=false

#####  Configuration settings for XML handling follow.

##  Name of the word id attribute.
#   This is generated only for XML output.

xml.id.attribute = xml:id

##  Type of ID.
#
#	Word IDs start with the work identifier, taken from the file name
#	of the work.
#
#	"reading_context_order" appends integer values
#	whose order gives the reading context order defined
#	by the classification of hard, soft, and jump tags.
#
#	"word_within_page_block" appends two integer values in
#	the form pageblocknumber-wordinblock, where pageblocknumber
#	is the ordinal of the current <pb> (page break) entry,
#	and wordinblock is the number of the word within
#	the page block (starting at 1).
#
#	"spacing" gives the spacing between ID values.  For
#	example, an increment of 10 spaces subsequent
#	reading_context_order or wordinblock values by 10.
#	This allows new values to be interpolated for editing
#	purposes.
#
#	IDs for words split by soft tags have a ".n" appended,
#	where "n" is an integer starting at 1 giving the part
#	number of the word in order.
#

#	Alternative ID type, currently commented out:
#xml.id.type=word_within_page_block
xml.id.type=reading_context_order
xml.id.spacing=10

##  word_tag_name is a required parameter representing the name
##  of the added text tags.
##
##  The name 'id' is forbidden if "id" = true below.
##  NOTE(review): no "id" setting is visible below in this file; this
##  restriction may be stale -- confirm.

xml.word_tag_name = w

##  Field delimiter for adorned output.
##  Delimiter cannot be a quotation mark '"'.

xml.field_delimiters = "\t"

##  Word delimiter for adorned output.
##  Delimiter cannot be a quotation mark '"'.

xml.word_delimiters = "\r\n"

##  Marker for surrounding distinct segments of text.

xml.surround_marker = " \ue500 "

## Ignore tag case when checking for soft/jump/hard tags.

xml.ignore_tag_case = true

##  List of jump tag names.  Separate tag names with a blank.
##  NOTE(review): both "figdesc" and "figDesc" are listed; with
##  xml.ignore_tag_case = true one of them is presumably redundant -- confirm.

xml.jump_tags = bibl figdesc figDesc figure footnote note ref stage tailnote

##  List of soft tag names.  Separate tag names with a blank.
##  The list is continued across lines with a trailing backslash.
## Note:  zzzzsw is used internally.  Don't remove it or bad things will happen.

xml.soft_tags = abbr add address author c choice cl corr date emph \
   foreign gap hi l lb location m mentioned milestone money name num \
   organization orig pb person phr reg rs s sb seg sic soCalled sub sup term time \
   title unclear w zzzzsw

##  True to enable extended logging.

xml.log = false

##  Output doctype name.  Optional.  You must specify
##  xml.doctype.system in addition.

#xml.doctype.name = TEI

##  Output doctype system identifier (DTD).  Optional.   You must specify
##  xml.doctype.name in addition.

#xml.doctype.system = http://ariadne.northwestern.edu/monk/dtds/teisimple.dtd

##  Schema to use for XML input parsing, if not given in input XML file.

xml.xml_schema = http://ariadne.northwestern.edu/monk/schemata/TEIAnalytics.rng

## True to output whitespace elements (e.g., <c> </c>) between
## word elements in XML.

xml.output_whitespace_elements = true

## True to emit only non-redundant word tag attributes.

xml.output_nonredundant_attributes_only = false

## True to emit only non-redundant token attributes.

xml.output_nonredundant_token_attribute = false

## True to fix gap tags in XML input.

xml.fix_gap_tags = false

## True to fix orig tags in XML input.

xml.fix_orig_tags = false

## True to fix selected split words in XML input.

xml.fix_split_words = false

## True to output pseudo-page boundary milestones.  The pseudo-page size
## and the div types that may contain pseudo-pages are given by the two
## keys that follow the flag.  Separate div types with a blank.

xml.output_pseudo_page_boundary_milestones = false
xml.pseudo_page_size = 300
xml.pseudo_page_container_div_types = volume chapter sermon

## True to force end of sentence at end of hard tag.

xml.close_sentence_at_end_of_hard_tag = false

## True to force end of sentence at end of jump tag.

xml.close_sentence_at_end_of_jump_tag = true

## XML elements in which to disallow generated <w> and <c> elements.
## Separate element names by blanks.

xml.disallow_word_elements_in=figDesc sic

## Adorn XML files with existing adorned version in output directory.

xml.adorn_existing_xml_files=false