Commit e905e9b (1 parent: 6653360)
Showing 148 changed files with 40,825 additions and 0 deletions.
@@ -1,2 +1,21 @@
# academy-government-data
A project in support of transparency of the grants and concessions awarded by the Government of Spain.

## Search service
The search service itself is provided by Empathy.co. After a bit of tuning and automation, it can deploy a complete indexing and search service with a single command.

### Prerequisites
A few things are needed before running the deployment script (the sketch after this list shows one way to check them):
- A running **Docker** service to create and compose the containers (that is, open Docker Desktop).
- The *jq* command (*it might not be necessary*), installed on your machine so the whole script can run. If you don't have it, it is easy to install with a package manager such as **brew**: `brew install jq`.
- **python3**, which is essential to parse the data files.
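
Not part of the repository, but a minimal Python sketch of how those three checks could be automated (probing the daemon with `docker info` is an assumption about how you would detect that Docker is running):

```python
#!/usr/bin/env python3
"""Minimal prerequisite check: a sketch, not part of the repo."""
import shutil
import subprocess
import sys

def docker_running() -> bool:
    # Assumption: a responding `docker info` means the daemon is up
    docker = shutil.which("docker")
    if docker is None:
        return False
    return subprocess.run([docker, "info"], capture_output=True).returncode == 0

checks = {
    "docker daemon": docker_running(),
    "jq": shutil.which("jq") is not None,
    "python3": shutil.which("python3") is not None,
}
for name, ok in checks.items():
    print(f"{name}: {'OK' if ok else 'MISSING'}")

sys.exit(0 if all(checks.values()) else 1)
```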

### Search deployment
To deploy the search service, only two steps are needed:
1. Run Docker Desktop so that a Docker service is available.
2. Move to the **search-plugin** folder and execute `./pipelineDeployer.sh` from the terminal.

#### What does the script do?
First, it parses the compressed TSV with the data to index (located in */data-plugin/datos_limpios*) into JSON, so that it can be sent in the indexing request. Then it brings up two containers shared between the search plugin and the index plugin: the *elasticsearch* and the *cerebro* containers.
After that, all the containers of the index plugin are brought up, creating a functional indexing service that receives an automated request with the previously parsed data and indexes it.
Once indexing is finished, the script shuts down the index-service containers (so as not to blow up your computer :smiley:) and starts the containers of the search service, which will receive and handle the queries.
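
As a rough illustration of that first parsing step (the real logic lives in the deployment script; the file names, the gzip compression, and the output shape here are assumptions):

```python
"""Sketch of the TSV-to-JSON step, assuming a gzipped, tab-separated file
with a header row; the actual names under data-plugin/datos_limpios differ."""
import csv
import gzip
import json

TSV_PATH = "data-plugin/datos_limpios/concesiones.tsv.gz"  # hypothetical name
JSON_PATH = "concesiones.json"                             # hypothetical name

with gzip.open(TSV_PATH, "rt", encoding="utf-8") as tsv_file:
    # The first row is assumed to hold the column headers
    rows = list(csv.DictReader(tsv_file, delimiter="\t"))

with open(JSON_PATH, "w", encoding="utf-8") as json_file:
    # One JSON object per TSV row, ready to embed in the indexing request
    json.dump(rows, json_file, ensure_ascii=False)
```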

@@ -0,0 +1,2 @@
# academy-government-data
A project in support of transparency of the grants and concessions awarded by the Government of Spain.

@@ -0,0 +1,76 @@
from urllib import request
from urllib.error import HTTPError
import ssl

import pandas as pd
from bs4 import BeautifulSoup


def complete_bdns_mk2(original_data):
    """Crawl the BDNS detail page of every convocatoria code in
    `original_data` and collect the extracted fields into a DataFrame.
    Returns the DataFrame and the list of codes whose page returned 404."""
    no_encontrado = []
    indice = list(original_data.index)

    crawled_data = pd.DataFrame(original_data.codigo_bdns)
    crawled_data['importe_total'] = ''
    crawled_data['tipo_beneficiario'] = ''
    crawled_data['sector_beneficiario'] = ''
    crawled_data['region_impacto'] = ''
    crawled_data['finalidad'] = ''

    for i in indice:
        # Allows the crawler to connect to the web page (skips SSL verification)
        ssl._create_default_https_context = ssl._create_unverified_context

        url = 'https://www.pap.hacienda.gob.es/bdnstrans/GE/es/convocatoria/' + str(crawled_data.codigo_bdns[i])

        try:
            rawpage = request.urlopen(url)  # Open the url
        except HTTPError as err:
            if err.code == 404:
                no_encontrado.append(crawled_data.codigo_bdns[i])
        else:
            # Parse the raw page to HTML
            soupedpage = BeautifulSoup(rawpage, "html5lib")

            # Extract only the article of the page
            contenido = soupedpage.article

            # The fields of interest sit at fixed positions among the
            # 'bloque' divs of the article
            bloques = contenido.find_all('div', attrs='bloque')

            # .loc replaces the chained assignment crawled_data.campo[i] = ...,
            # which pandas warns may write to a copy instead of the frame
            importe_total = bloques[7]
            if importe_total.find('p') is None:
                crawled_data.loc[i, 'importe_total'] = 'NaN'
            else:
                crawled_data.loc[i, 'importe_total'] = importe_total.find('p').get_text()

            tipo_beneficiario = bloques[10]
            if tipo_beneficiario.find('li') is None:
                crawled_data.loc[i, 'tipo_beneficiario'] = 'NaN'
            else:
                crawled_data.loc[i, 'tipo_beneficiario'] = tipo_beneficiario.find('li').get_text()

            sector_beneficiario = bloques[11]
            if sector_beneficiario.find('li') is None:
                crawled_data.loc[i, 'sector_beneficiario'] = 'NaN'
            else:
                crawled_data.loc[i, 'sector_beneficiario'] = sector_beneficiario.find('li').get_text()

            region_impacto = bloques[12]
            if region_impacto.find('li') is None:
                crawled_data.loc[i, 'region_impacto'] = 'NaN'
            else:
                crawled_data.loc[i, 'region_impacto'] = region_impacto.find('li').get_text()

            finalidad = bloques[13]
            if finalidad.find('p') is None:
                crawled_data.loc[i, 'finalidad'] = 'NaN'
            else:
                crawled_data.loc[i, 'finalidad'] = finalidad.find('p').get_text()

        # Print the progress every 100 rows
        if i % 100 == 0:
            print(round(i / len(crawled_data.index) * 100, 2), '%')

    return crawled_data, no_encontrado


# `datos` is assumed to be a DataFrame with a `codigo_bdns` column,
# built elsewhere in the project
crawled_data, no_encontrado = complete_bdns_mk2(datos)

@@ -0,0 +1,85 @@
from urllib import request
from urllib.error import HTTPError
import ssl

import pandas as pd
from bs4 import BeautifulSoup


def complete_bdns_mk3(original_data):
    """Crawl the BDNS detail page of every convocatoria code and append one
    semicolon-separated row per code to a CSV file, so that already scraped
    rows survive a crash."""
    indice = list(original_data.index)
    crawled_data = list(original_data.codigo_bdns)

    no_encontrado = []

    headers = 'codigo_bdns;importe_total;tipo_beneficiario;sector_beneficiario;región_impacto;finalidad'

    # The `with` block closes the file by itself; no explicit close() is needed
    with open('/Users/enriquecarnerofernandez/Documents/BDNS/cured_data/prueba.csv', 'a') as csv_file:
        csv_file.write(headers + '\n')

    for i in indice:
        # Allows the crawler to connect to the web page (skips SSL verification)
        ssl._create_default_https_context = ssl._create_unverified_context

        url = 'https://www.pap.hacienda.gob.es/bdnstrans/GE/es/convocatoria/' + str(crawled_data[i])

        convocatoria = ''

        try:
            rawpage = request.urlopen(url)  # Open the url
        except HTTPError as err:
            if err.code == 404:
                no_encontrado.append(crawled_data[i])
        else:
            convocatoria += str(crawled_data[i]) + ';'

            # Parse the raw page to HTML
            soupedpage = BeautifulSoup(rawpage, "html5lib")

            # Extract only the article of the page
            contenido = soupedpage.article

            # The fields of interest sit at fixed positions among the
            # 'bloque' divs of the article
            bloques = contenido.find_all('div', attrs='bloque')

            importe_total = bloques[7]
            if importe_total.find('p') is None:
                convocatoria += 'NaN' + ';'
            else:
                convocatoria += str(importe_total.find('p').get_text()) + ';'

            tipo_beneficiario = bloques[10]
            if tipo_beneficiario.find('li') is None:
                convocatoria += 'NaN' + ';'
            else:
                convocatoria += str(tipo_beneficiario.find('li').get_text()) + ';'

            sector_beneficiario = bloques[11]
            if sector_beneficiario.find('li') is None:
                convocatoria += 'NaN' + ';'
            else:
                # Semicolons inside the field would break the delimiter,
                # so they are replaced with dashes
                convocatoria += str(sector_beneficiario.find('li').get_text()).replace(';', '-') + ';'

            region_impacto = bloques[12]
            if region_impacto.find('li') is None:
                convocatoria += 'NaN' + ';'
            else:
                convocatoria += str(region_impacto.find('li').get_text()) + ';'

            finalidad = bloques[13]
            if finalidad.find('p') is None:
                convocatoria += 'NaN' + ';'
            else:
                convocatoria += str(finalidad.find('p').get_text()) + ';'

            # Append the finished row immediately so progress is not lost
            with open('/Users/enriquecarnerofernandez/Documents/BDNS/cured_data/prueba.csv', 'a') as csv_file:
                csv_file.write(convocatoria + '\n')

        # Print the progress every 100 rows
        if i % 100 == 0:
            print(round(i / len(indice) * 100, 2), '%')


# `convocatorias` is assumed to be a DataFrame with a `codigo_bdns` column,
# built elsewhere in the project
complete_bdns_mk3(convocatorias)
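
A note on the three generations of the crawler in this commit: `complete_bdns_mk2` accumulates everything in an in-memory DataFrame and only returns it at the end, so a crash mid-crawl loses all progress; `complete_bdns_mk3` appends each row to a CSV as soon as it is scraped; and the `complete_bdns`/`complete_bdns_concesiones` pair in the next file keeps a single TSV handle open, switches to tab delimiters, and logs 404 codes to a separate file.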

@@ -0,0 +1,179 @@
from urllib import request
from urllib.error import HTTPError
import ssl

import pandas as pd
from bs4 import BeautifulSoup


def complete_bdns(original_data):
    """Crawl the BDNS detail page of every convocatoria code, appending one
    tab-separated row per code to concesiones.tsv and logging the codes that
    return 404 to a separate file."""
    indice = list(original_data.index)
    crawled_data = list(original_data.codigo_bdns)

    no_encontrado = ''

    headers = ('codigo_bdns' + '\t' +
               'importe_total' + '\t' +
               'tipo_beneficiario' + '\t' +
               'sector_beneficiario' + '\t' +
               'región_impacto' + '\t' +
               'finalidad')

    # The output file stays open for the whole crawl and is closed at the end
    tsv_1 = open('/Users/enriquecarnerofernandez/Documents/BDNS/cured_data/concesiones.tsv', 'a')
    tsv_1.write(headers + '\n')

    for i in indice:
        # Allows the crawler to connect to the web page (skips SSL verification)
        ssl._create_default_https_context = ssl._create_unverified_context

        url = 'https://www.pap.hacienda.gob.es/bdnstrans/GE/es/convocatoria/' + str(crawled_data[i])

        convocatoria = ''

        try:
            rawpage = request.urlopen(url)  # Open the url
        except HTTPError as err:
            if err.code == 404:
                no_encontrado = str(crawled_data[i])

                # Log the missing code straight away
                with open('/Users/enriquecarnerofernandez/Documents/BDNS/cured_data/no_encontrados2.tsv', 'a') as tsv_2:
                    tsv_2.write(no_encontrado + '\n')
        else:
            convocatoria += str(crawled_data[i]) + '\t'

            # Parse the raw page to HTML
            soupedpage = BeautifulSoup(rawpage, "html5lib")

            # Extract only the article of the page
            contenido = soupedpage.article

            # The fields of interest sit at fixed positions among the
            # 'bloque' divs of the article
            bloques = contenido.find_all('div', attrs='bloque')

            importe_total = bloques[7]
            if importe_total.find('p') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(importe_total.find('p').get_text()) + '\t'

            tipo_beneficiario = bloques[10]
            if tipo_beneficiario.find('li') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(tipo_beneficiario.find('li').get_text()) + '\t'

            sector_beneficiario = bloques[11]
            if sector_beneficiario.find('li') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(sector_beneficiario.find('li').get_text()) + '\t'

            region_impacto = bloques[12]
            if region_impacto.find('li') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(region_impacto.find('li').get_text()) + '\t'

            finalidad = bloques[13]
            if finalidad.find('p') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(finalidad.find('p').get_text()) + '\t'

            tsv_1.write(convocatoria + '\n')

        # Print the progress every 10000 rows
        if i % 10000 == 0:
            print(round(i / len(indice) * 100, 2), '%')

    tsv_1.close()


# `concesiones` is assumed to be a DataFrame with a `codigo_bdns` column,
# built elsewhere in the project
complete_bdns(concesiones)


def complete_bdns_concesiones(original_data):
    """Variant of complete_bdns that also writes the DataFrame index as an
    `id` column in front of each row."""
    indice = list(original_data.index)
    crawled_data = list(original_data.codigo_bdns)

    no_encontrado = ''

    headers = ('id' + '\t' +
               'codigo_bdns' + '\t' +
               'importe_total' + '\t' +
               'tipo_beneficiario' + '\t' +
               'sector_beneficiario' + '\t' +
               'región_impacto' + '\t' +
               'finalidad')

    tsv_1 = open('/Users/enriquecarnerofernandez/Documents/BDNS/cured_data/concesiones.tsv', 'a')
    tsv_1.write(headers + '\n')

    for i in indice:
        # Allows the crawler to connect to the web page (skips SSL verification)
        ssl._create_default_https_context = ssl._create_unverified_context

        url = 'https://www.pap.hacienda.gob.es/bdnstrans/GE/es/convocatoria/' + str(crawled_data[i])

        convocatoria = ''

        try:
            rawpage = request.urlopen(url)  # Open the url
        except HTTPError as err:
            if err.code == 404:
                no_encontrado = str(crawled_data[i])

                with open('/Users/enriquecarnerofernandez/Documents/BDNS/cured_data/no_encontrados2.tsv', 'a') as tsv_2:
                    tsv_2.write(no_encontrado + '\n')
        else:
            # The DataFrame index doubles as the row id
            convocatoria += str(i) + '\t' + str(crawled_data[i]) + '\t'

            # Parse the raw page to HTML
            soupedpage = BeautifulSoup(rawpage, "html5lib")

            # Extract only the article of the page
            contenido = soupedpage.article

            bloques = contenido.find_all('div', attrs='bloque')

            importe_total = bloques[7]
            if importe_total.find('p') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(importe_total.find('p').get_text()) + '\t'

            tipo_beneficiario = bloques[10]
            if tipo_beneficiario.find('li') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(tipo_beneficiario.find('li').get_text()) + '\t'

            sector_beneficiario = bloques[11]
            if sector_beneficiario.find('li') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(sector_beneficiario.find('li').get_text()) + '\t'

            region_impacto = bloques[12]
            if region_impacto.find('li') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(region_impacto.find('li').get_text()) + '\t'

            finalidad = bloques[13]
            if finalidad.find('p') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(finalidad.find('p').get_text()) + '\t'

            tsv_1.write(convocatoria + '\n')

        # Print the progress every 10000 rows
        if i % 10000 == 0:
            print(round(i / len(indice) * 100, 2), '%')

    tsv_1.close()

@@ -0,0 +1,2 @@
# academy-government-data
A project in support of transparency of the grants and concessions awarded by the Government of Spain.