Showing 10 changed files with 274 additions and 13 deletions.
@@ -0,0 +1,48 @@
import sys


def is_null(key):
    # A key is null if any of its fields is empty.
    for k in key:
        if k == "":
            return True
    return False


def add_row(row, spl):
    # Fill the still-empty fields of row with the values from spl.
    for i, s in enumerate(spl):
        if row[i] == "":
            row[i] = s


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("usage: %s <input file> <outfile>" % sys.argv[0])
        sys.exit(-1)

    infile = open(sys.argv[1], encoding="utf-8")
    outfile = open(sys.argv[2], "w", encoding="utf-8")
    data = []
    print("id,processo_licitatorio,num_exercicio,modalidade,municipio,tipo_licitacao,data_rec_doc", file=outfile)

    # Read the first line and use its first two columns as the merge key.
    spl = infile.readline().strip().split(",")
    key = (spl[0], spl[1])
    row = ["" for i in range(len(spl))]
    for line in infile:
        next_spl = line.strip().split(",")
        next_key = (next_spl[0], next_spl[1])
        if not is_null(key):
            add_row(row, spl)
        if key != next_key:
            data.append(row)
            row = ["" for i in range(len(spl))]
        spl = next_spl
        key = next_key

    # Flush the last group.
    if not is_null(key):
        add_row(row, spl)
    data.append(row)

    for i, spl in enumerate(data):
        print(str(i + 1) + "," + ",".join(spl), file=outfile)
    infile.close()
    outfile.close()
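For clarity, a hypothetical illustration of the merge (all field values below are invented). Two consecutive input lines sharing the (processo_licitatorio, num_exercicio) key

    001/2020,2020,,Belo Horizonte,,
    001/2020,2020,Pregão,,Menor Preço,01/02/2020

collapse into a single output row, with the running id prepended:

    1,001/2020,2020,Pregão,Belo Horizonte,Menor Preço,01/02/2020

Note that the script assumes lines with the same key are adjacent, i.e. the input is sorted (or at least grouped) by its first two columns.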
4 binary files not shown.
@@ -0,0 +1,27 @@
lower_case = set("o a os as e em no na nos nas de para ou do da dos das".split())


# Function to convert into title case
def title_case(input_string):

    # variable declaration for the output text
    output_list = []

    # separating each word in the string
    input_list = input_string.split(" ")

    # checking each word
    for word in input_list:

        # if the word exists in the set
        # then no need to capitalize it
        if word in lower_case:
            output_list.append(word)

        # if the word does not exist in
        # the set, then capitalize it
        else:
            output_list.append(word.title())

    return " ".join(output_list)
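A quick usage sketch for the function above (the input string is only an example):

    >>> title_case("prefeitura municipal de contagem")
    'Prefeitura Municipal de Contagem'

Words in the lower_case set, such as "de", are left untouched; every other word is capitalized with str.title().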
@@ -0,0 +1,150 @@
# Fixed rules for the initial identification of entities

import re
import sys
from preprocessing.text_cleaner import *
from pycpfcnpj import cpfcnpj
import json
from datetime import datetime
import spacy

# Portuguese spaCy model
nlp = spacy.load("pt")

# Number of sentences processed per window
NSENTS = 10
# Two or more capitalized words (accented letters included)
NAME_RE = r"([A-ZÀ-Ú][a-zà-ú]+\s)([A-ZÀ-Ú][a-zà-ú]+\s)+"
# All-uppercase names
UPPER_NAME_RE = r"([A-ZÀ-Ú]+\s)([A-ZÀ-Ú]\s?)+"
UPPER_NAME_PATTERN = re.compile(UPPER_NAME_RE)
# Uppercase name followed by ", MASP" or "/ MASP"
UPPER_NAME_MASP_RE = r"([A-ZÀ-Ú]+\s*)+[,/]\s[Mm][Aa][Ss][Pp]"
UPPER_NAME_MASP_PATTERN = re.compile(UPPER_NAME_MASP_RE)

# Words that rule out a PER entity when they start or end the span
NON_PER_STARTS = set("secretaria polic centro extrato companhia conselho endereço ee e.e. escola empresa".split())
NON_PER_ENDS = set("ltda ltd.".split())

# Words that are never reported as ORG entities
NON_ORGS = set("ano diário ato cpf cnpj termo caderno processo objeto partes licitação prêmio".split())
def load_regex_file(filename):
    # Load (entity label, compiled regex) pairs from a tab-separated file.
    # Lines starting with "#" are treated as comments.
    patterns = []
    infile = open(filename, encoding="utf-8")
    for line in infile:
        lin = line.strip()
        if lin.startswith("#"):
            continue
        spl = lin.strip().split("\t")
        if len(spl) < 2:
            continue
        name = spl[0]
        expr = spl[1]
        patterns.append((name, re.compile(expr)))
    infile.close()
    return patterns
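# For illustration: each rules.tsv entry is an entity label and a pattern
# separated by a single tab. A hypothetical line (the actual patterns used by
# the project are not part of this commit) could be:
#
#   CNPJ	\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2}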
def additional_validation(ent_type, token):
    # CPF/CNPJ matches must also pass the checksum validation.
    if ent_type == "CPF" or ent_type == "CNPJ":
        return cpfcnpj.validate(token)
    return True


def rule_based_ner(rules, text):
    # Apply every (label, regex) rule to the text.
    ents = []
    for ent_type, pattern in rules:
        for match in pattern.finditer(text):
            start, end = match.span()
            token = text[start:end]
            if additional_validation(ent_type, token):
                ents.append([start, end, ent_type])
    return ents
def person_org_ner(text):
    # Run the spaCy model and keep PER/ORG entities that pass the filters.
    ents = []
    doc = nlp(text)
    for ent in doc.ents:
        spl = ent.text.split()
        if ent.label_ == "PER":
            if re.match(NAME_RE, ent.text) is None:
                continue
            if spl[0].lower() in NON_PER_STARTS:
                continue
            if spl[-1].lower() in NON_PER_ENDS:
                continue
            ents.append([ent.start_char, ent.end_char, "PESSOA"])

        if ent.label_ == "ORG":
            if ent.text.strip().lower() in NON_ORGS:
                continue
            ents.append([ent.start_char, ent.end_char, "ORG"])
    return ents


def additional_person_ner(text, ents_dict):
    # Catch all-uppercase names followed by a MASP identifier that the
    # model-based pass missed.
    ents = []
    for match in UPPER_NAME_PATTERN.finditer(text):
        start, end = match.span()
        if "MASP" not in text[end:end + 6]:
            continue
        if (start, end) not in ents_dict:
            ents.append([start, end, "PESSOA"])
    return ents


def ents2dict(ents):
    # Index entities by their (start, end) span.
    res = {}
    for start, end, label in ents:
        res[(start, end)] = label
    return res
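# Illustration with a hypothetical snippet: in a text such as
#   "... servidor JOAO DA SILVA, MASP 123456 ..."
# UPPER_NAME_PATTERN matches "JOAO DA SILVA"; since "MASP" occurs within the
# six characters after the match, the span is tagged PESSOA, unless an
# earlier pass already produced an entity with the same (start, end) span.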
def print_output_line(out, outfile):
    # Emit one entry per line in the JSON format expected by doccano.
    # Note: the text itself is not JSON-escaped here.
    print("{\"text\": \"%s\", \"labels\":" % out["text"], end=" ", file=outfile)
    labels = str(out["labels"]).replace("\'", "\"")
    print(labels, end="", file=outfile)
    #for l in out["labels"]:
    #    print(", [%d, %d, \"%s\"]" % (l[0], l[1], l[2]), end="", file=outfile)
    print("}", file=outfile)
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("usage: %s <input file> <outfile>" % sys.argv[0])
        sys.exit(-1)

    patterns = load_regex_file("rules.tsv")

    infile = open(sys.argv[1], encoding="utf-8")
    out_mp = {"file": sys.argv[1], "entities": [], "timestamp": str(datetime.now())}
    mp_ents = out_mp["entities"]

    outfile = open(sys.argv[2] + ".aux", "w", encoding="utf-8")
    outjson = open(sys.argv[2] + "_doccano.json", "w", encoding="utf-8")
    outfile_mp = open(sys.argv[2] + ".json", "w", encoding="utf-8")

    # Undo hyphenation at line breaks, then normalize special characters.
    text = infile.read().replace("-\n", "")
    text = clear_special_chars(text)
    infile.close()
    sents = merge_sentences(split_sentences(text))

    # Process the text in windows of NSENTS sentences.
    for i in range(0, len(sents), NSENTS):
        sep = ". "
        text = sep.join(sents[i : i + NSENTS]).strip()
        ents = person_org_ner(text) + rule_based_ner(patterns, text)
        ents_dic = ents2dict(ents)
        ents = ents + additional_person_ner(text, ents_dic)
        ents = sorted(ents)
        out = {"text": text, "labels": ents}
        print_output_line(out, outjson)
        for start, end, ent_type in ents:
            span = text[start:end]
            # Context window around the entity, clamped at the start of the text.
            print(ent_type, "\t", span, "\t\t", text[max(0, start - 50):end + 50], file=outfile)
            mp_ents.append({"entity": span, "start": start, "end": end, "label": ent_type})
    json.dump(out_mp, outfile_mp, indent=3)
    outfile.close()
    outjson.close()
    outfile_mp.close()
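Each line written to the _doccano.json output then has the following shape (text and offsets below are hypothetical):

    {"text": "... servidor JOAO DA SILVA, MASP 123456 ...", "labels": [[13, 26, "PESSOA"]]}

The .aux file lists each entity with up to 50 characters of surrounding context, and the .json file aggregates all entities found in the input file.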
@@ -1,20 +1,17 @@
 #!/bin/sh
 
-dir=/data/users/fmuniz/editais
+dir=$1
 
-for f in $dir/*/*/*/*/*.entidades.json $dir/*/*/*/*.entidades.json
+for f in `find "$dir" -type f -name "*.entidades.json"`
 do
-	#if [ ! -s "$f" ]
-	#then
-		echo "$f"
-		python3 -m data_extraction.licitacao "$f" "$f.attribs"
-	#fi
+	echo "$f"
+	python3 -m data_extraction.licitacao "$f" "$f.attribs"
 done
 
-for f in $dir/*/*/*/*/*json.attribs $dir/*/*/*json.attribs
+for f in `find "$dir" -type f -name "*.attribs"`
 do
 	cat "$f"
 done > licitacoes.csv
@@ -0,0 +1,25 @@
#!/bin/sh

dir=$1
outdir=/datalake/ufmg/m02

# Reproduce the input directory tree under the output directory
for d in `find "$dir" -type d`
do
	# Drop the leading path components so the subtree can be mirrored
	outsubdir=`echo "$d" | cut -d "/" -f 1,2,3 --complement`
	mkdir -p "$outdir/$outsubdir"

	# Only the PDFs directly in this directory; subdirectories get their own pass
	for f in `find "$d" -maxdepth 1 -type f -name "*.pdf"`
	do
		echo "$f"
		filename=`basename "$f" ".pdf"`
		out="$outdir/$outsubdir/$filename"
		echo "pdftotext \"$f\" \"$out.txt\""
		pdftotext "$f" "$out.txt"

		python3 -m preprocessing.text_cleaner "$out.txt" "$out.clean"
		./run_pipeline.sh "$out.clean" "$out.entidades"
	done
done