Skip to content

Commit

Permalink
Export corpus: include syllable info (#750)
Browse files: browse the repository at this point in the history
  • Loading branch information
stannam committed Sep 27, 2020
1 parent abc80b5 commit 372debe
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 14 deletions.
6 changes: 5 additions & 1 deletion corpustools/corpus/classes/lexicon.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ def __init__(self, seg_list):
has_syllable_level = True
else:
has_syllable_level = False
except Exception:
except AttributeError:
has_syllable_level = False

if has_syllable_level:
Expand Down Expand Up @@ -587,6 +587,10 @@ def __ne__(self, other):
def __len__(self):
return len(self._list)

@property
def list(self):
return self._list


class FeatureMatrix(object):
"""
Expand Down
29 changes: 20 additions & 9 deletions corpustools/corpus/io/csv.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from csv import DictReader, DictWriter
import os

from corpustools.corpus.classes.lexicon import Corpus, FeatureMatrix, Word, Attribute
from corpustools.corpus.classes.lexicon import Corpus, FeatureMatrix, Word, Attribute, Transcription
from corpustools.corpus.io.binary import save_binary, load_binary

from .helper import parse_transcription, AnnotationType
from .helper import parse_transcription, AnnotationType, SyllableBaseAnnotation

from corpustools.exceptions import DelimiterError, PCTError
import corpustools.gui.modernize as modernize
Expand Down Expand Up @@ -322,7 +322,7 @@ def load_feature_matrix_csv(name, path, delimiter, stop_check = None, call_back
feature_matrix.validate()
return feature_matrix

def make_safe(value, delimiter):
def make_safe(value, seg_delimiter, syll_delimiter=None):
"""
Recursively parse transcription lists into strings for saving
Expand All @@ -331,20 +331,29 @@ def make_safe(value, delimiter):
value : object
Object to make into string
delimiter : str
Character to mark boundaries between list elements
seg_delimiter : str
Character to mark boundaries between segments
syll_delimiter : str, optional
Character to mark boundaries between syllables
Returns
-------
str
Safe string
"""
if isinstance(value,list):
return delimiter.join(map(lambda x: make_safe(x, delimiter),value))
if isinstance(value, Transcription):
if syll_delimiter is not None:
return syll_delimiter.join(map(lambda x: make_safe(list(x), seg_delimiter), value._syllable_list))
else:
return seg_delimiter.join(map(lambda x: make_safe(x, seg_delimiter), value.list))
elif isinstance(value, list):
return seg_delimiter.join(map(lambda x: make_safe(x, seg_delimiter), value))

return str(value)

def export_corpus_csv(corpus, path,
delimiter = ',', trans_delimiter = '.',
delimiter = ',', trans_delimiter = '.', syll_delimiter = None,
variant_behavior = None):
"""
Save a corpus as a column-delimited text file
Expand All @@ -359,6 +368,8 @@ def export_corpus_csv(corpus, path,
Character to mark boundaries between columns. Defaults to ','
trans_delimiter : str
Character to mark boundaries in transcriptions. Defaults to '.'
syll_delimiter : str, optional
Character to mark boundaries between syllables. Defaults to None. Only takes effect when the corpus has syllables.
variant_behavior : str, optional
How to treat variants, 'token' will have a line for each variant,
'column' will have a single column for all variants for a word,
Expand All @@ -382,7 +393,7 @@ def export_corpus_csv(corpus, path,
for word in corpus.iter_sort():
word_outline = []
for a in corpus.attributes:
word_outline.append(make_safe(getattr(word, a.name), trans_delimiter))
word_outline.append(make_safe(getattr(word, a.name), trans_delimiter, syll_delimiter))
if variant_behavior == 'token':
var = word.variants()
for v, freq in var.items():
Expand Down
6 changes: 3 additions & 3 deletions corpustools/corpus/io/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,11 @@ def __init__(self, syllable, feature_matrix, annotation_type, begin=None, end=No
def __iter__(self):
segs = list()
for o in self.onset:
segs.append(o)
segs.append(o.label)
for n in self.nucleus:
segs.append(n)
segs.append(n.label)
for c in self.coda:
segs.append(c)
segs.append(c.label)
return iter(segs)

def get_onset(self):
Expand Down
17 changes: 16 additions & 1 deletion corpustools/gui/iogui.py
Original file line number Diff line number Diff line change
Expand Up @@ -1259,6 +1259,14 @@ def __init__(self, parent, corpus):

inlayout.addRow('Transcription delimiter:', self.transDelimiterEdit)

self.syllDelimiterEdit = QLineEdit()
self.syllDelimiterEdit.setText('-')
if len(self.corpus.inventory.syllables) == 0: # if there's no syllable in the corpus,
self.syllDelimiterEdit.setEnabled(False) # then grey out the syllable delimiter option.
self.syllDelimiterEdit.setText('')

inlayout.addRow('Syllable delimiter:', self.syllDelimiterEdit)

self.variantWidget = QComboBox()
for o in self.variantOptions:
self.variantWidget.addItem(o[0])
Expand Down Expand Up @@ -1303,8 +1311,15 @@ def accept(self):
reply = QMessageBox.critical(self,
"Invalid information", "The column delimiter must be a single character.")
return

transDelim = self.transDelimiterEdit.text()
syllDelim = self.syllDelimiterEdit.text() if self.syllDelimiterEdit.text() != '' else None
if colDelim == transDelim or colDelim == syllDelim or transDelim == syllDelim:
reply = QMessageBox.critical(self,
"Invalid information", "The delimiters must be different from each other.")
return

variant_behavior = self.variantOptions[self.variantWidget.currentIndex()][1]
export_corpus_csv(self.corpus, filename, colDelim, transDelim, variant_behavior)
export_corpus_csv(self.corpus, filename, colDelim, transDelim, syllDelim, variant_behavior)

QDialog.accept(self)

0 comments on commit 372debe

Please sign in to comment.