General update
famube committed Aug 11, 2020
1 parent ae3aba0 commit 5c45acd
Showing 10 changed files with 274 additions and 13 deletions.
48 changes: 48 additions & 0 deletions data_extraction/merge_rows.py
@@ -0,0 +1,48 @@
import sys

# A key is null if any of its fields is empty
def is_null(key):
    for k in key:
        if k == "":
            return True
    return False


# Fill the empty fields of row with the values from spl
def add_row(row, spl):
    for i, s in enumerate(spl):
        if row[i] == "":
            row[i] = s


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("usage: %s <input file> <outfile>" % sys.argv[0])
        sys.exit(-1)

    infile = open(sys.argv[1], encoding="utf-8")
    outfile = open(sys.argv[2], "w", encoding="utf-8")
    data = []
    print("id,processo_licitatorio,num_exercicio,modalidade,municipio,tipo_licitacao,data_rec_doc", file=outfile)

    # Merge consecutive rows that share the same (processo, exercicio) key,
    # filling empty fields from later rows; rows with a null key are dropped
    spl = infile.readline().strip().split(",")
    key = (spl[0], spl[1])
    row = ["" for i in range(len(spl))]
    for line in infile:
        next_spl = line.strip().split(",")
        next_key = (next_spl[0], next_spl[1])
        if not is_null(key):
            add_row(row, spl)
            if key != next_key:
                data.append(row)
                row = ["" for i in range(len(spl))]
        spl = next_spl
        key = next_key

    # Flush the last group
    if not is_null(key):
        add_row(row, spl)
        data.append(row)

    for i, spl in enumerate(data):
        print(str(i + 1) + "," + ",".join(spl), file=outfile)
    infile.close()
    outfile.close()


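merge_rows.py collapses consecutive CSV rows that share the same (processo_licitatorio, num_exercicio) key, filling each empty field with the first non-empty value seen for that group; rows whose key fields are empty are dropped. A sketch of the expected behavior with hypothetical data and file names (in practice the input is presumably the concatenated .attribs output of scripts/run_licitacao.sh):

    $ cat partial.csv
    001/2020,2020,Pregao,,,
    001/2020,2020,,Belo Horizonte,,
    001/2020,2020,,,Menor Preco,10/08/2020
    002/2020,2020,Tomada de Precos,Contagem,,
    $ python3 -m data_extraction.merge_rows partial.csv merged.csv
    $ cat merged.csv
    id,processo_licitatorio,num_exercicio,modalidade,municipio,tipo_licitacao,data_rec_doc
    1,001/2020,2020,Pregao,Belo Horizonte,Menor Preco,10/08/2020
    2,002/2020,2020,Tomada de Precos,Contagem,,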
Binary file added preprocessing/__pycache__/__init__.cpython-36.pyc
Binary file added preprocessing/__pycache__/casing.cpython-35.pyc
Binary file modified preprocessing/__pycache__/text_cleaner.cpython-35.pyc
27 changes: 27 additions & 0 deletions preprocessing/casing.py
@@ -0,0 +1,27 @@

lower_case = set("o a os as e em no na nos nas de para ou do da dos das".split())

# Function to convert a string into title case
def title_case(input_string):

    # output words
    output_list = []

    # separating each word in the string
    input_list = input_string.split(" ")

    # checking each word
    for word in input_list:

        # if the word is in the lower-case set,
        # then no need to capitalize it
        if word in lower_case:
            output_list.append(word)

        # if the word is not in the set,
        # then capitalize it
        else:
            output_list.append(word.title())

    return " ".join(output_list)

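A quick doctest-style check of the intended behavior (assuming the module is importable as preprocessing.casing):

    >>> from preprocessing.casing import title_case
    >>> title_case("secretaria de estado da educacao")
    'Secretaria de Estado da Educacao'

The stop-word check is exact and position-blind, so a string that starts with one of the listed words ("a", "de", ...) keeps its first word in lower case.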
20 changes: 17 additions & 3 deletions preprocessing/text_cleaner.py
@@ -5,11 +5,18 @@

nlp = spacy.load("pt")

# Return only the digit characters of a string
def extract_digits(string):
    digits = []
    for char in string:
        if char.isdigit():
            digits.append(char)
    return "".join(digits)


def clear_special_chars(text):
    res = ""
    for c in text:
-       if (c >= "\u0020" and c <= "\u007E") or (c >= "\u00A1" and c <= "\u00FF"):
+       if (c >= "\u0020" and c <= "\u007E") or (c >= "\u00A1" and c <= "\u00FF") or c == "\n":
            res += c
        else:
            res += " "
@@ -26,7 +33,7 @@ def tokenize(string):

#@param text The text that must be split into sentences.

def split_sentences(text):
-   sentence_delimiters = re.compile(u'[\\[\\].!?;:]\s|\n')
+   sentence_delimiters = re.compile(u'[\\[\\].!?;]\s|\n')
    sentences = sentence_delimiters.split(text)
    return sentences
@@ -50,6 +57,11 @@ def merge_sentences(sentences):


# Apply a list of (expression, replacement) pairs to the text
def replacements(text, replace_list):
    for expr, repl in replace_list:
        text = text.replace(expr, repl)
    return text

# Clean the text and drop lines that have no named entities

def clean_text(text):
@@ -74,8 +86,10 @@ def clean_text(text):
    outfile = open(sys.argv[2], "w", encoding="utf-8")
    infile = open(sys.argv[1], encoding="utf-8")

    replace_list = [["-\n", ""], [" / ", "/"], ["Av.", "Av"], ["\u00ba.", "\u00ba"], ["\u00aa.", "\u00aa"]]

    text = infile.read()
-   text = clean_text(text)
+   text = replacements(clear_special_chars(text), replace_list)
    outfile.write(text)
    outfile.close()
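With this change the entry point no longer calls clean_text: it normalizes the character set (now preserving newlines) and then applies the literal replacements. A small illustration of the two helpers, with made-up strings:

    >>> clear_special_chars("Edital\tnº 01\n")
    'Edital nº 01\n'
    >>> replacements("PREGAO ELETRONICO - Av. Amazonas / 123", [["Av.", "Av"], [" / ", "/"]])
    'PREGAO ELETRONICO - Av Amazonas/123'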
150 changes: 150 additions & 0 deletions rule_based_ner_bkp.py
@@ -0,0 +1,150 @@

# Fixed rules for the initial identification of entities

import re
import sys
from preprocessing.text_cleaner import *
from pycpfcnpj import cpfcnpj
import json
from datetime import datetime
import spacy

nlp = spacy.load("pt")

NSENTS = 10
NAME_RE = "([A-Z\u00c0-\u00da][a-z\u00e0-\u00fa]+\s)([A-Z\u00c0-\u00da][a-z\u00e0-\u00fa]+\s)+"
UPPER_NAME_RE = "([A-Z\u00c0-\u00da]+\s)([A-Z\u00c0-\u00da]\s?)+"
UPPER_NAME_PATTERN = re.compile(UPPER_NAME_RE)
UPPER_NAME_MASP_RE = "([A-Z\u00c0-\u00da]+\s*)+[,/]\s[Mm][Aa][Ss][Pp]"
UPPER_NAME_MASP_PATTERN = re.compile(UPPER_NAME_MASP_RE)

# Words that rule out a PER entity when they start or end the span
NON_PER_STARTS = set("secretaria polic centro extrato companhia conselho endere\u00e7o ee e.e. escola empresa".split())
NON_PER_ENDS = set("ltda ltd.".split())

# Generic words that the model tends to mislabel as ORG
NON_ORGS = set("ano di\u00e1rio ato cpf cnpj termo caderno processo objeto partes licita\u00e7\u00e3o pr\u00eamio".split())


# Load (label, regex) pairs from a tab-separated file;
# lines starting with "#" are comments
def load_regex_file(filename):
    patterns = []
    infile = open(filename, encoding="utf-8")
    for line in infile:
        lin = line.strip()
        if lin.startswith("#"):
            continue
        spl = lin.split("\t")
        if len(spl) < 2:
            continue
        name = spl[0]
        expr = spl[1]
        patterns.append((name, re.compile(expr)))
    infile.close()
    return patterns


def additional_validation(ent_type, token):
    if ent_type == "CPF" or ent_type == "CNPJ":
        return cpfcnpj.validate(token)
    return True


def rule_based_ner(rules, text):
    ents = []
    for ent_type, pattern in rules:
        for match in pattern.finditer(text):
            start, end = match.span()
            token = text[start:end]
            if additional_validation(ent_type, token):
                ents.append([start, end, ent_type])
    return ents


def person_org_ner(text):
    ents = []
    doc = nlp(text)
    for ent in doc.ents:
        spl = ent.text.split()
        if ent.label_ == "PER":
            if re.match(NAME_RE, ent.text) is None:
                continue
            if spl[0].lower() in NON_PER_STARTS:
                continue
            if spl[-1].lower() in NON_PER_ENDS:
                continue
            ents.append([ent.start_char, ent.end_char, "PESSOA"])

        if ent.label_ == "ORG":
            if ent.text.strip().lower() in NON_ORGS:
                continue
            ents.append([ent.start_char, ent.end_char, "ORG"])
    return ents


# Upper-case names followed by "MASP" that the other passes missed
def additional_person_ner(text, ents_dict):
    ents = []
    for match in UPPER_NAME_PATTERN.finditer(text):
        start, end = match.span()
        if "MASP" not in text[end:end + 6]:
            continue
        if (start, end) not in ents_dict:
            ents.append([start, end, "PESSOA"])
    return ents


def ents2dict(ents):
    res = {}
    for start, end, label in ents:
        res[(start, end)] = label
    return res


# Naive JSON serialization in the doccano format; breaks if the
# text itself contains double quotes
def print_output_line(out, outfile):
    print("{\"text\": \"%s\", \"labels\":" % out["text"], end=" ", file=outfile)
    labels = str(out["labels"]).replace("'", "\"")
    print(labels, end="", file=outfile)
    #for l in out["labels"]:
    #    print(", [%d, %d, \"%s\"]" % (l[0], l[1], l[2]), end="", file=outfile)
    print("}", file=outfile)


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("usage: %s <input file> <outfile>" % sys.argv[0])
        sys.exit(-1)

    patterns = load_regex_file("rules.tsv")

    infile = open(sys.argv[1], encoding="utf-8")
    out_mp = {"file": sys.argv[1], "entities": [], "timestamp": str(datetime.now())}
    mp_ents = out_mp["entities"]

    outfile = open(sys.argv[2] + ".aux", "w", encoding="utf-8")
    outjson = open(sys.argv[2] + "_doccano.json", "w", encoding="utf-8")
    outfile_mp = open(sys.argv[2] + ".json", "w", encoding="utf-8")

    text = infile.read().replace("-\n", "")
    text = clear_special_chars(text)
    infile.close()
    sents = merge_sentences(split_sentences(text))

    # Process the text in windows of NSENTS sentences
    for i in range(0, len(sents), NSENTS):
        sep = ". "
        text = sep.join(sents[i : i + NSENTS]).strip()
        ents = person_org_ner(text) + rule_based_ner(patterns, text)
        ents_dic = ents2dict(ents)
        ents = ents + additional_person_ner(text, ents_dic)
        ents = sorted(ents)
        out = {"text": text, "labels": ents}
        print_output_line(out, outjson)
        for start, end, ent_type in ents:
            span = text[start:end]
            # clamp the left context so a small start does not wrap around
            print(ent_type, "\t", span, "\t\t", text[max(0, start - 50):end + 50], file=outfile)
            mp_ents.append({"entity": span, "start": start, "end": end, "label": ent_type})
    json.dump(out_mp, outfile_mp, indent=3)
    outfile.close()
    outjson.close()
    outfile_mp.close()
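
load_regex_file reads one entity label and one regular expression per line, tab-separated, skipping # comment lines. rules.tsv itself is not in this commit, so these two entries are only a hypothetical sketch of the format (CPF/CNPJ matches are additionally verified with cpfcnpj.validate):

    # label <TAB> regex
    CPF	\d{3}\.\d{3}\.\d{3}-\d{2}
    CNPJ	\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2}

Running, e.g., python3 rule_based_ner_bkp.py input.txt out then writes out.aux (tab-separated spans with surrounding context), out_doccano.json (one JSON object per sentence window) and out.json (the aggregated entity list).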



17 changes: 7 additions & 10 deletions scripts/run_licitacao.sh
@@ -1,20 +1,17 @@
#!/bin/sh

-dir=/data/users/fmuniz/editais
+dir=$1

-for f in $dir/*/*/*/*/*.entidades.json $dir/*/*/*/*.entidades.json
+for f in `find "$dir" -type f -name "*.entidades.json"`
do
-    #if [ ! -s "$f" ]
-    #then
-        echo "$f"
-        python3 -m data_extraction.licitacao "$f" "$f.attribs"
-    #fi
+    echo "$f"
+    python3 -m data_extraction.licitacao "$f" "$f.attribs"
done

-for f in $dir/*/*/*/*/*json.attribs $dir/*/*/*/*json.attribs
+for f in `find "$dir" -type f -name "*.attribs"`
do
    cat "$f"
done > licitacoes.csv



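The hard-coded input directory is gone: the root now comes in as the first positional argument, and find replaces the fixed-depth globs so the tree may be arbitrarily deep. Assuming the *.entidades.json files were produced by an earlier pipeline run over a hypothetical root:

    ./scripts/run_licitacao.sh /path/to/editais
    # concatenates every *.attribs file into licitacoes.csv in the current directory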
25 changes: 25 additions & 0 deletions scripts/runall.sh
@@ -0,0 +1,25 @@
#!/bin/sh

dir=$1
outdir=/datalake/ufmg/m02

# Reproduce the input directory tree under $outdir
for d in `find "$dir" -type d`
do
    outsubdir=`echo "$d" | cut -d "/" -f 1,2,3 --complement`
    mkdir -p "$outdir/$outsubdir"

    for f in `find "$d" -type f -name "*.pdf"`
    do
        echo "$f"
        filename=`basename "$f" ".pdf"`
        out="$outdir/$outsubdir/$filename"
        echo "pdftotext \"$f\" \"$out.txt\""
        pdftotext "$f" "$out.txt"

        python3 -m preprocessing.text_cleaner "$out.txt" "$out.clean"
        ./run_pipeline.sh "$out.clean" "$out.entidades"
    done
done

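runall.sh drives the whole extraction: it mirrors the source tree under /datalake/ufmg/m02, converts each PDF with pdftotext, cleans the text with preprocessing.text_cleaner and hands the result to run_pipeline.sh. A typical invocation, reusing the path the old run_licitacao.sh had hard-coded:

    ./scripts/runall.sh /data/users/fmuniz/editais

Note that cut -d "/" -f 1,2,3 --complement drops the first three /-separated fields (for an absolute path, the empty leading field plus the two top-level directories), so the mirrored subtree starts below that prefix.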