General update
famube committed Aug 11, 2020
1 parent ae3aba0 commit 5c45acd
Showing 10 changed files with 274 additions and 13 deletions.
48 changes: 48 additions & 0 deletions data_extraction/merge_rows.py
@@ -0,0 +1,48 @@
import sys

# A key is null if any of its fields is empty
def is_null(key):
    for k in key:
        if k == "":
            return True
    return False


# Fill the empty fields of row with the values from spl
def add_row(row, spl):
    for i, s in enumerate(spl):
        if row[i] == "":
            row[i] = s


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("usage: %s <input file> <outfile>" % sys.argv[0])
        sys.exit(-1)

    infile = open(sys.argv[1], encoding="utf-8")
    outfile = open(sys.argv[2], "w", encoding="utf-8")
    data = []
    print("id,processo_licitatorio,num_exercicio,modalidade,municipio,tipo_licitacao,data_rec_doc", file=outfile)

    # Merge consecutive rows that share the same (processo, exercicio) key,
    # filling empty fields from later rows; rows with a null key are dropped
    spl = infile.readline().strip().split(",")
    key = (spl[0], spl[1])
    row = ["" for i in range(len(spl))]
    for line in infile:
        next_spl = line.strip().split(",")
        next_key = (next_spl[0], next_spl[1])
        if not is_null(key):
            add_row(row, spl)
            if key != next_key:
                data.append(row)
                row = ["" for i in range(len(spl))]
        spl = next_spl
        key = next_key

    # Flush the last group
    if not is_null(key):
        add_row(row, spl)
        data.append(row)

    for i, spl in enumerate(data):
        print(str(i + 1) + "," + ",".join(spl), file=outfile)
    infile.close()
    outfile.close()


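merge_rows.py collapses consecutive CSV rows that share the same (processo_licitatorio, num_exercicio) key, filling each empty field with the first non-empty value seen for that group; rows whose key fields are empty are dropped. A sketch of the expected behavior with hypothetical data and file names (in practice the input is presumably the concatenated .attribs output of scripts/run_licitacao.sh):

    $ cat partial.csv
    001/2020,2020,Pregao,,,
    001/2020,2020,,Belo Horizonte,,
    001/2020,2020,,,Menor Preco,10/08/2020
    002/2020,2020,Tomada de Precos,Contagem,,
    $ python3 -m data_extraction.merge_rows partial.csv merged.csv
    $ cat merged.csv
    id,processo_licitatorio,num_exercicio,modalidade,municipio,tipo_licitacao,data_rec_doc
    1,001/2020,2020,Pregao,Belo Horizonte,Menor Preco,10/08/2020
    2,002/2020,2020,Tomada de Precos,Contagem,,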
Binary file added preprocessing/__pycache__/__init__.cpython-36.pyc
Binary file added preprocessing/__pycache__/casing.cpython-35.pyc
Binary file modified preprocessing/__pycache__/text_cleaner.cpython-35.pyc
27 changes: 27 additions & 0 deletions preprocessing/casing.py
@@ -0,0 +1,27 @@

lower_case = set("o a os as e em no na nos nas de para ou do da dos das".split())

# Function to convert a string into title case
def title_case(input_string):

    # output words
    output_list = []

    # separating each word in the string
    input_list = input_string.split(" ")

    # checking each word
    for word in input_list:

        # if the word is in the lower-case set,
        # then no need to capitalize it
        if word in lower_case:
            output_list.append(word)

        # if the word is not in the set,
        # then capitalize it
        else:
            output_list.append(word.title())

    return " ".join(output_list)

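A quick doctest-style check of the intended behavior (assuming the module is importable as preprocessing.casing):

    >>> from preprocessing.casing import title_case
    >>> title_case("secretaria de estado da educacao")
    'Secretaria de Estado da Educacao'

The stop-word check is exact and position-blind, so a string that starts with one of the listed words ("a", "de", ...) keeps its first word in lower case.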
20 changes: 17 additions & 3 deletions preprocessing/text_cleaner.py
@@ -5,11 +5,18 @@

nlp = spacy.load("pt")

# Return only the digit characters of a string
def extract_digits(string):
    digits = []
    for char in string:
        if char.isdigit():
            digits.append(char)
    return "".join(digits)


def clear_special_chars(text):
    res = ""
    for c in text:
-       if (c >= "\u0020" and c <= "\u007E") or (c >= "\u00A1" and c <= "\u00FF"):
+       if (c >= "\u0020" and c <= "\u007E") or (c >= "\u00A1" and c <= "\u00FF") or c == "\n":
            res += c
        else:
            res += " "
@@ -26,7 +33,7 @@ def tokenize(string):

#@param text The text that must be split into sentences.

def split_sentences(text):
-   sentence_delimiters = re.compile(u'[\\[\\].!?;:]\s|\n')
+   sentence_delimiters = re.compile(u'[\\[\\].!?;]\s|\n')
    sentences = sentence_delimiters.split(text)
    return sentences
@@ -50,6 +57,11 @@ def merge_sentences(sentences):


# Apply a list of (expression, replacement) pairs to the text
def replacements(text, replace_list):
    for expr, repl in replace_list:
        text = text.replace(expr, repl)
    return text

# Clean the text and drop lines that have no named entities

def clean_text(text):
@@ -74,8 +86,10 @@ def clean_text(text):
    outfile = open(sys.argv[2], "w", encoding="utf-8")
    infile = open(sys.argv[1], encoding="utf-8")

    replace_list = [["-\n", ""], [" / ", "/"], ["Av.", "Av"], ["\u00ba.", "\u00ba"], ["\u00aa.", "\u00aa"]]

    text = infile.read()
-   text = clean_text(text)
+   text = replacements(clear_special_chars(text), replace_list)
    outfile.write(text)
    outfile.close()
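With this change the entry point no longer calls clean_text: it normalizes the character set (now preserving newlines) and then applies the literal replacements. A small illustration of the two helpers, with made-up strings:

    >>> clear_special_chars("Edital\tnº 01\n")
    'Edital nº 01\n'
    >>> replacements("PREGAO ELETRONICO - Av. Amazonas / 123", [["Av.", "Av"], [" / ", "/"]])
    'PREGAO ELETRONICO - Av Amazonas/123'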
150 changes: 150 additions & 0 deletions rule_based_ner_bkp.py
@@ -0,0 +1,150 @@

# Fixed rules for the initial identification of entities

import re
import sys
from preprocessing.text_cleaner import *
from pycpfcnpj import cpfcnpj
import json
from datetime import datetime
import spacy

nlp = spacy.load("pt")

NSENTS = 10
NAME_RE = "([A-Z\u00c0-\u00da][a-z\u00e0-\u00fa]+\s)([A-Z\u00c0-\u00da][a-z\u00e0-\u00fa]+\s)+"
UPPER_NAME_RE = "([A-Z\u00c0-\u00da]+\s)([A-Z\u00c0-\u00da]\s?)+"
UPPER_NAME_PATTERN = re.compile(UPPER_NAME_RE)
UPPER_NAME_MASP_RE = "([A-Z\u00c0-\u00da]+\s*)+[,/]\s[Mm][Aa][Ss][Pp]"
UPPER_NAME_MASP_PATTERN = re.compile(UPPER_NAME_MASP_RE)

# Words that rule out a PER entity when they start or end the span
NON_PER_STARTS = set("secretaria polic centro extrato companhia conselho endere\u00e7o ee e.e. escola empresa".split())
NON_PER_ENDS = set("ltda ltd.".split())

# Generic words that the model tends to mislabel as ORG
NON_ORGS = set("ano di\u00e1rio ato cpf cnpj termo caderno processo objeto partes licita\u00e7\u00e3o pr\u00eamio".split())


# Load (label, regex) pairs from a tab-separated file;
# lines starting with "#" are comments
def load_regex_file(filename):
    patterns = []
    infile = open(filename, encoding="utf-8")
    for line in infile:
        lin = line.strip()
        if lin.startswith("#"):
            continue
        spl = lin.split("\t")
        if len(spl) < 2:
            continue
        name = spl[0]
        expr = spl[1]
        patterns.append((name, re.compile(expr)))
    infile.close()
    return patterns


def additional_validation(ent_type, token):
    if ent_type == "CPF" or ent_type == "CNPJ":
        return cpfcnpj.validate(token)
    return True


def rule_based_ner(rules, text):
    ents = []
    for ent_type, pattern in rules:
        for match in pattern.finditer(text):
            start, end = match.span()
            token = text[start:end]
            if additional_validation(ent_type, token):
                ents.append([start, end, ent_type])
    return ents


def person_org_ner(text):
    ents = []
    doc = nlp(text)
    for ent in doc.ents:
        spl = ent.text.split()
        if ent.label_ == "PER":
            if re.match(NAME_RE, ent.text) is None:
                continue
            if spl[0].lower() in NON_PER_STARTS:
                continue
            if spl[-1].lower() in NON_PER_ENDS:
                continue
            ents.append([ent.start_char, ent.end_char, "PESSOA"])

        if ent.label_ == "ORG":
            if ent.text.strip().lower() in NON_ORGS:
                continue
            ents.append([ent.start_char, ent.end_char, "ORG"])
    return ents


# Upper-case names followed by "MASP" that the other passes missed
def additional_person_ner(text, ents_dict):
    ents = []
    for match in UPPER_NAME_PATTERN.finditer(text):
        start, end = match.span()
        if "MASP" not in text[end:end + 6]:
            continue
        if (start, end) not in ents_dict:
            ents.append([start, end, "PESSOA"])
    return ents


def ents2dict(ents):
    res = {}
    for start, end, label in ents:
        res[(start, end)] = label
    return res


# Naive JSON serialization in the doccano format; breaks if the
# text itself contains double quotes
def print_output_line(out, outfile):
    print("{\"text\": \"%s\", \"labels\":" % out["text"], end=" ", file=outfile)
    labels = str(out["labels"]).replace("'", "\"")
    print(labels, end="", file=outfile)
    #for l in out["labels"]:
    #    print(", [%d, %d, \"%s\"]" % (l[0], l[1], l[2]), end="", file=outfile)
    print("}", file=outfile)


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("usage: %s <input file> <outfile>" % sys.argv[0])
        sys.exit(-1)

    patterns = load_regex_file("rules.tsv")

    infile = open(sys.argv[1], encoding="utf-8")
    out_mp = {"file": sys.argv[1], "entities": [], "timestamp": str(datetime.now())}
    mp_ents = out_mp["entities"]

    outfile = open(sys.argv[2] + ".aux", "w", encoding="utf-8")
    outjson = open(sys.argv[2] + "_doccano.json", "w", encoding="utf-8")
    outfile_mp = open(sys.argv[2] + ".json", "w", encoding="utf-8")

    text = infile.read().replace("-\n", "")
    text = clear_special_chars(text)
    infile.close()
    sents = merge_sentences(split_sentences(text))

    # Process the text in windows of NSENTS sentences
    for i in range(0, len(sents), NSENTS):
        sep = ". "
        text = sep.join(sents[i : i + NSENTS]).strip()
        ents = person_org_ner(text) + rule_based_ner(patterns, text)
        ents_dic = ents2dict(ents)
        ents = ents + additional_person_ner(text, ents_dic)
        ents = sorted(ents)
        out = {"text": text, "labels": ents}
        print_output_line(out, outjson)
        for start, end, ent_type in ents:
            span = text[start:end]
            # clamp the left context so a small start does not wrap around
            print(ent_type, "\t", span, "\t\t", text[max(0, start - 50):end + 50], file=outfile)
            mp_ents.append({"entity": span, "start": start, "end": end, "label": ent_type})
    json.dump(out_mp, outfile_mp, indent=3)
    outfile.close()
    outjson.close()
    outfile_mp.close()
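
load_regex_file reads one entity label and one regular expression per line, tab-separated, skipping # comment lines. rules.tsv itself is not in this commit, so these two entries are only a hypothetical sketch of the format (CPF/CNPJ matches are additionally verified with cpfcnpj.validate):

    # label <TAB> regex
    CPF	\d{3}\.\d{3}\.\d{3}-\d{2}
    CNPJ	\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2}

Running, e.g., python3 rule_based_ner_bkp.py input.txt out then writes out.aux (tab-separated spans with surrounding context), out_doccano.json (one JSON object per sentence window) and out.json (the aggregated entity list).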



17 changes: 7 additions & 10 deletions scripts/run_licitacao.sh
@@ -1,20 +1,17 @@
#!/bin/sh

-dir=/data/users/fmuniz/editais
+dir=$1

-for f in $dir/*/*/*/*/*.entidades.json $dir/*/*/*/*.entidades.json
+for f in `find "$dir" -type f -name "*.entidades.json"`
do
-    #if [ ! -s "$f" ]
-    #then
-        echo "$f"
-        python3 -m data_extraction.licitacao "$f" "$f.attribs"
-    #fi
+    echo "$f"
+    python3 -m data_extraction.licitacao "$f" "$f.attribs"
done

-for f in $dir/*/*/*/*/*json.attribs $dir/*/*/*/*json.attribs
+for f in `find "$dir" -type f -name "*.attribs"`
do
    cat "$f"
done > licitacoes.csv



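The hard-coded input directory is gone: the root now comes in as the first positional argument, and find replaces the fixed-depth globs so the tree may be arbitrarily deep. Assuming the *.entidades.json files were produced by an earlier pipeline run over a hypothetical root:

    ./scripts/run_licitacao.sh /path/to/editais
    # concatenates every *.attribs file into licitacoes.csv in the current directory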
25 changes: 25 additions & 0 deletions scripts/runall.sh
@@ -0,0 +1,25 @@
#!/bin/sh

dir=$1
outdir=/datalake/ufmg/m02

# Reproduce the input directory tree under $outdir
for d in `find "$dir" -type d`
do
    outsubdir=`echo "$d" | cut -d "/" -f 1,2,3 --complement`
    mkdir -p "$outdir/$outsubdir"

    for f in `find "$d" -type f -name "*.pdf"`
    do
        echo "$f"
        filename=`basename "$f" ".pdf"`
        out="$outdir/$outsubdir/$filename"
        echo "pdftotext \"$f\" \"$out.txt\""
        pdftotext "$f" "$out.txt"

        python3 -m preprocessing.text_cleaner "$out.txt" "$out.clean"
        ./run_pipeline.sh "$out.clean" "$out.entidades"
    done
done

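runall.sh drives the whole extraction: it mirrors the source tree under /datalake/ufmg/m02, converts each PDF with pdftotext, cleans the text with preprocessing.text_cleaner and hands the result to run_pipeline.sh. A typical invocation, reusing the path the old run_licitacao.sh had hard-coded:

    ./scripts/runall.sh /data/users/fmuniz/editais

Note that cut -d "/" -f 1,2,3 --complement drops the first three /-separated fields (for an absolute path, the empty leading field plus the two top-level directories), so the mirrored subtree starts below that prefix.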