Skip to content

Commit

Permalink
Extrator de dados de licitacoes adicionado
Browse files Browse the repository at this point in the history
  • Loading branch information
famube committed Aug 5, 2020
1 parent ad0fc4a commit fc51818
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 9 deletions.
44 changes: 41 additions & 3 deletions inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,43 @@ def load_conll(filename, col=2):
start = ind
ind += len(spl[0])
end = ind
sent_labels.append( (start, end, spl[col-1]) )
sent_labels.append( [start, end, spl[col-1]] )
ind += 1
infile.close()
return sents, labels

def load_conll_probs(filename, col=2):
    """Load a CoNLL-style file whose rows also carry per-label probabilities.

    Every non-blank line is split on whitespace. The first field is the
    token, the field at 0-based index ``col - 1`` is the label, and every
    field from index ``col`` onwards is a ``name=value`` pair whose value is
    parsed as a float. A blank line terminates the current sentence.

    Args:
        filename: Path of the UTF-8 encoded file to read.
        col: 1-based column of the label; probability fields start right
            after it.

    Returns:
        A ``(sents, labels)`` pair. ``sents[i]`` is the i-th sentence, built
        by appending each token followed by one space. ``labels[i]`` holds
        one ``[start, end, label, probs]`` list per token, where ``start``
        and ``end`` are the token's character offsets inside ``sents[i]``
        and ``probs`` maps each name to its float value.

    Raises:
        IndexError: If a row has fewer than ``col`` fields or a probability
            field contains no ``=``.
        ValueError: If a probability value cannot be parsed as a float.

    Note:
        A sentence is emitted only when a blank line is reached, so tokens
        that follow the last blank line of the file are not returned.
    """
    sents = []
    labels = []
    ind = 0  # character offset of the next token inside the current sentence
    acc = ""
    sent_labels = []
    # The context manager guarantees the file is closed even when a malformed
    # row makes the parsing below raise.
    with open(filename, encoding="utf-8") as infile:
        for line in infile:
            lin = line.strip()
            if lin == "":
                # Sentence boundary: flush what was accumulated and reset.
                ind = 0
                sents.append(acc)
                labels.append(sent_labels)
                acc = ""
                sent_labels = []
            else:
                spl = lin.split()
                acc += spl[0] + " "
                start = ind
                ind += len(spl[0])
                end = ind
                probs = {}
                for s in spl[col:]:
                    spl_ = s.split("=")
                    probs[spl_[0]] = float(spl_[1])
                sent_labels.append([start, end, spl[col - 1], probs])
                ind += 1  # account for the space appended after the token
    return sents, labels



def conll2spacy_train_data(filename):
sents, labels = load_conll(filename)
Expand Down Expand Up @@ -185,11 +217,17 @@ def merge_bio_tags(ents):
if i >= len(ents):
break
if current != "O":
res.append( (start, end, current) )
res.append( [start, end, current] )
return res



def read_lower_cased_strings(filename):
    """Return the set of stripped, lower-cased lines of a UTF-8 text file.

    Args:
        filename: Path of the file to read.

    Returns:
        A set with one entry per distinct line after surrounding whitespace
        is stripped and the text is lower-cased. Blank lines contribute the
        empty string.
    """
    # The context manager closes the file even if reading or decoding fails.
    with open(filename, encoding="utf-8") as infile:
        return {line.strip().lower() for line in infile}



Expand Down
6 changes: 0 additions & 6 deletions wcc.sh

This file was deleted.

0 comments on commit fc51818

Please sign in to comment.