Commit e905e9b (1 parent: 6653360)
Showing 148 changed files with 40,825 additions and 0 deletions.
@@ -1,2 +1,21 @@
# academy-government-data
A project in support of transparency of the grants and concessions awarded by the Government of Spain.

## Search service
The search service itself is provided by Empathy.co. After a bit of tuning and automation, it can deploy a complete indexing and search service with a single command.

### Prerequisites
A few things are needed before running the deployment script (the sketch after this list shows one way to check them):
- A running **Docker** service to create and compose the containers (that is, open Docker Desktop).
- The *jq* command (*it might not be necessary*), installed on your machine so the whole script can run. If you don't have it, it is easy to install with a package manager such as **brew**: `brew install jq`.
- **python3**, which is essential to parse the data files.
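
Not part of the repository, but a minimal Python sketch of how those three checks could be automated (probing the daemon with `docker info` is an assumption about how you would detect that Docker is running):

```python
#!/usr/bin/env python3
"""Minimal prerequisite check: a sketch, not part of the repo."""
import shutil
import subprocess
import sys

def docker_running() -> bool:
    # Assumption: a responding `docker info` means the daemon is up
    docker = shutil.which("docker")
    if docker is None:
        return False
    return subprocess.run([docker, "info"], capture_output=True).returncode == 0

checks = {
    "docker daemon": docker_running(),
    "jq": shutil.which("jq") is not None,
    "python3": shutil.which("python3") is not None,
}
for name, ok in checks.items():
    print(f"{name}: {'OK' if ok else 'MISSING'}")

sys.exit(0 if all(checks.values()) else 1)
```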

### Search deployment
To deploy the search service, only two steps are needed:
1. Run Docker Desktop so that a Docker service is available.
2. Move to the **search-plugin** folder and execute `./pipelineDeployer.sh` from the terminal.

#### What does the script do?
First, it parses the compressed TSV with the data to index (located in */data-plugin/datos_limpios*) into JSON, so that it can be sent in the indexing request. Then it brings up two containers shared between the search plugin and the index plugin: the *elasticsearch* and the *cerebro* containers.
After that, all the containers of the index plugin are brought up, creating a functional indexing service that receives an automated request with the previously parsed data and indexes it.
Once indexing is finished, the script shuts down the index-service containers (so as not to blow up your computer :smiley:) and starts the containers of the search service, which will receive and handle the queries.
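
As a rough illustration of that first parsing step (the real logic lives in the deployment script; the file names, the gzip compression, and the output shape here are assumptions):

```python
"""Sketch of the TSV-to-JSON step, assuming a gzipped, tab-separated file
with a header row; the actual names under data-plugin/datos_limpios differ."""
import csv
import gzip
import json

TSV_PATH = "data-plugin/datos_limpios/concesiones.tsv.gz"  # hypothetical name
JSON_PATH = "concesiones.json"                             # hypothetical name

with gzip.open(TSV_PATH, "rt", encoding="utf-8") as tsv_file:
    # The first row is assumed to hold the column headers
    rows = list(csv.DictReader(tsv_file, delimiter="\t"))

with open(JSON_PATH, "w", encoding="utf-8") as json_file:
    # One JSON object per TSV row, ready to embed in the indexing request
    json.dump(rows, json_file, ensure_ascii=False)
```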

@@ -0,0 +1,2 @@
# academy-government-data
A project in support of transparency of the grants and concessions awarded by the Government of Spain.

@@ -0,0 +1,76 @@
from urllib import request
from urllib.error import HTTPError
import ssl

import pandas as pd
from bs4 import BeautifulSoup


def complete_bdns_mk2(original_data):
    """Crawl the BDNS detail page of every convocatoria code in
    `original_data` and collect the extracted fields into a DataFrame.
    Returns the DataFrame and the list of codes whose page returned 404."""
    no_encontrado = []
    indice = list(original_data.index)

    crawled_data = pd.DataFrame(original_data.codigo_bdns)
    crawled_data['importe_total'] = ''
    crawled_data['tipo_beneficiario'] = ''
    crawled_data['sector_beneficiario'] = ''
    crawled_data['region_impacto'] = ''
    crawled_data['finalidad'] = ''

    for i in indice:
        # Allows the crawler to connect to the web page (skips SSL verification)
        ssl._create_default_https_context = ssl._create_unverified_context

        url = 'https://www.pap.hacienda.gob.es/bdnstrans/GE/es/convocatoria/' + str(crawled_data.codigo_bdns[i])

        try:
            rawpage = request.urlopen(url)  # Open the url
        except HTTPError as err:
            if err.code == 404:
                no_encontrado.append(crawled_data.codigo_bdns[i])
        else:
            # Parse the raw page to HTML
            soupedpage = BeautifulSoup(rawpage, "html5lib")

            # Extract only the article of the page
            contenido = soupedpage.article

            # The fields of interest sit at fixed positions among the
            # 'bloque' divs of the article
            bloques = contenido.find_all('div', attrs='bloque')

            # .loc replaces the chained assignment crawled_data.campo[i] = ...,
            # which pandas warns may write to a copy instead of the frame
            importe_total = bloques[7]
            if importe_total.find('p') is None:
                crawled_data.loc[i, 'importe_total'] = 'NaN'
            else:
                crawled_data.loc[i, 'importe_total'] = importe_total.find('p').get_text()

            tipo_beneficiario = bloques[10]
            if tipo_beneficiario.find('li') is None:
                crawled_data.loc[i, 'tipo_beneficiario'] = 'NaN'
            else:
                crawled_data.loc[i, 'tipo_beneficiario'] = tipo_beneficiario.find('li').get_text()

            sector_beneficiario = bloques[11]
            if sector_beneficiario.find('li') is None:
                crawled_data.loc[i, 'sector_beneficiario'] = 'NaN'
            else:
                crawled_data.loc[i, 'sector_beneficiario'] = sector_beneficiario.find('li').get_text()

            region_impacto = bloques[12]
            if region_impacto.find('li') is None:
                crawled_data.loc[i, 'region_impacto'] = 'NaN'
            else:
                crawled_data.loc[i, 'region_impacto'] = region_impacto.find('li').get_text()

            finalidad = bloques[13]
            if finalidad.find('p') is None:
                crawled_data.loc[i, 'finalidad'] = 'NaN'
            else:
                crawled_data.loc[i, 'finalidad'] = finalidad.find('p').get_text()

        # Print the progress every 100 rows
        if i % 100 == 0:
            print(round(i / len(crawled_data.index) * 100, 2), '%')

    return crawled_data, no_encontrado


# `datos` is assumed to be a DataFrame with a `codigo_bdns` column,
# built elsewhere in the project
crawled_data, no_encontrado = complete_bdns_mk2(datos)

@@ -0,0 +1,85 @@
from urllib import request
from urllib.error import HTTPError
import ssl

import pandas as pd
from bs4 import BeautifulSoup


def complete_bdns_mk3(original_data):
    """Crawl the BDNS detail page of every convocatoria code and append one
    semicolon-separated row per code to a CSV file, so that already scraped
    rows survive a crash."""
    indice = list(original_data.index)
    crawled_data = list(original_data.codigo_bdns)

    no_encontrado = []

    headers = 'codigo_bdns;importe_total;tipo_beneficiario;sector_beneficiario;región_impacto;finalidad'

    # The `with` block closes the file by itself; no explicit close() is needed
    with open('/Users/enriquecarnerofernandez/Documents/BDNS/cured_data/prueba.csv', 'a') as csv_file:
        csv_file.write(headers + '\n')

    for i in indice:
        # Allows the crawler to connect to the web page (skips SSL verification)
        ssl._create_default_https_context = ssl._create_unverified_context

        url = 'https://www.pap.hacienda.gob.es/bdnstrans/GE/es/convocatoria/' + str(crawled_data[i])

        convocatoria = ''

        try:
            rawpage = request.urlopen(url)  # Open the url
        except HTTPError as err:
            if err.code == 404:
                no_encontrado.append(crawled_data[i])
        else:
            convocatoria += str(crawled_data[i]) + ';'

            # Parse the raw page to HTML
            soupedpage = BeautifulSoup(rawpage, "html5lib")

            # Extract only the article of the page
            contenido = soupedpage.article

            # The fields of interest sit at fixed positions among the
            # 'bloque' divs of the article
            bloques = contenido.find_all('div', attrs='bloque')

            importe_total = bloques[7]
            if importe_total.find('p') is None:
                convocatoria += 'NaN' + ';'
            else:
                convocatoria += str(importe_total.find('p').get_text()) + ';'

            tipo_beneficiario = bloques[10]
            if tipo_beneficiario.find('li') is None:
                convocatoria += 'NaN' + ';'
            else:
                convocatoria += str(tipo_beneficiario.find('li').get_text()) + ';'

            sector_beneficiario = bloques[11]
            if sector_beneficiario.find('li') is None:
                convocatoria += 'NaN' + ';'
            else:
                # Semicolons inside the field would break the delimiter,
                # so they are replaced with dashes
                convocatoria += str(sector_beneficiario.find('li').get_text()).replace(';', '-') + ';'

            region_impacto = bloques[12]
            if region_impacto.find('li') is None:
                convocatoria += 'NaN' + ';'
            else:
                convocatoria += str(region_impacto.find('li').get_text()) + ';'

            finalidad = bloques[13]
            if finalidad.find('p') is None:
                convocatoria += 'NaN' + ';'
            else:
                convocatoria += str(finalidad.find('p').get_text()) + ';'

            # Append the finished row immediately so progress is not lost
            with open('/Users/enriquecarnerofernandez/Documents/BDNS/cured_data/prueba.csv', 'a') as csv_file:
                csv_file.write(convocatoria + '\n')

        # Print the progress every 100 rows
        if i % 100 == 0:
            print(round(i / len(indice) * 100, 2), '%')


# `convocatorias` is assumed to be a DataFrame with a `codigo_bdns` column,
# built elsewhere in the project
complete_bdns_mk3(convocatorias)
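
A note on the three generations of the crawler in this commit: `complete_bdns_mk2` accumulates everything in an in-memory DataFrame and only returns it at the end, so a crash mid-crawl loses all progress; `complete_bdns_mk3` appends each row to a CSV as soon as it is scraped; and the `complete_bdns`/`complete_bdns_concesiones` pair in the next file keeps a single TSV handle open, switches to tab delimiters, and logs 404 codes to a separate file.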

@@ -0,0 +1,179 @@
from urllib import request
from urllib.error import HTTPError
import ssl

import pandas as pd
from bs4 import BeautifulSoup


def complete_bdns(original_data):
    """Crawl the BDNS detail page of every convocatoria code, appending one
    tab-separated row per code to concesiones.tsv and logging the codes that
    return 404 to a separate file."""
    indice = list(original_data.index)
    crawled_data = list(original_data.codigo_bdns)

    no_encontrado = ''

    headers = ('codigo_bdns' + '\t' +
               'importe_total' + '\t' +
               'tipo_beneficiario' + '\t' +
               'sector_beneficiario' + '\t' +
               'región_impacto' + '\t' +
               'finalidad')

    # The output file stays open for the whole crawl and is closed at the end
    tsv_1 = open('/Users/enriquecarnerofernandez/Documents/BDNS/cured_data/concesiones.tsv', 'a')
    tsv_1.write(headers + '\n')

    for i in indice:
        # Allows the crawler to connect to the web page (skips SSL verification)
        ssl._create_default_https_context = ssl._create_unverified_context

        url = 'https://www.pap.hacienda.gob.es/bdnstrans/GE/es/convocatoria/' + str(crawled_data[i])

        convocatoria = ''

        try:
            rawpage = request.urlopen(url)  # Open the url
        except HTTPError as err:
            if err.code == 404:
                no_encontrado = str(crawled_data[i])

                # Log the missing code straight away
                with open('/Users/enriquecarnerofernandez/Documents/BDNS/cured_data/no_encontrados2.tsv', 'a') as tsv_2:
                    tsv_2.write(no_encontrado + '\n')
        else:
            convocatoria += str(crawled_data[i]) + '\t'

            # Parse the raw page to HTML
            soupedpage = BeautifulSoup(rawpage, "html5lib")

            # Extract only the article of the page
            contenido = soupedpage.article

            # The fields of interest sit at fixed positions among the
            # 'bloque' divs of the article
            bloques = contenido.find_all('div', attrs='bloque')

            importe_total = bloques[7]
            if importe_total.find('p') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(importe_total.find('p').get_text()) + '\t'

            tipo_beneficiario = bloques[10]
            if tipo_beneficiario.find('li') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(tipo_beneficiario.find('li').get_text()) + '\t'

            sector_beneficiario = bloques[11]
            if sector_beneficiario.find('li') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(sector_beneficiario.find('li').get_text()) + '\t'

            region_impacto = bloques[12]
            if region_impacto.find('li') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(region_impacto.find('li').get_text()) + '\t'

            finalidad = bloques[13]
            if finalidad.find('p') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(finalidad.find('p').get_text()) + '\t'

            tsv_1.write(convocatoria + '\n')

        # Print the progress every 10000 rows
        if i % 10000 == 0:
            print(round(i / len(indice) * 100, 2), '%')

    tsv_1.close()


# `concesiones` is assumed to be a DataFrame with a `codigo_bdns` column,
# built elsewhere in the project
complete_bdns(concesiones)


def complete_bdns_concesiones(original_data):
    """Variant of complete_bdns that also writes the DataFrame index as an
    `id` column in front of each row."""
    indice = list(original_data.index)
    crawled_data = list(original_data.codigo_bdns)

    no_encontrado = ''

    headers = ('id' + '\t' +
               'codigo_bdns' + '\t' +
               'importe_total' + '\t' +
               'tipo_beneficiario' + '\t' +
               'sector_beneficiario' + '\t' +
               'región_impacto' + '\t' +
               'finalidad')

    tsv_1 = open('/Users/enriquecarnerofernandez/Documents/BDNS/cured_data/concesiones.tsv', 'a')
    tsv_1.write(headers + '\n')

    for i in indice:
        # Allows the crawler to connect to the web page (skips SSL verification)
        ssl._create_default_https_context = ssl._create_unverified_context

        url = 'https://www.pap.hacienda.gob.es/bdnstrans/GE/es/convocatoria/' + str(crawled_data[i])

        convocatoria = ''

        try:
            rawpage = request.urlopen(url)  # Open the url
        except HTTPError as err:
            if err.code == 404:
                no_encontrado = str(crawled_data[i])

                with open('/Users/enriquecarnerofernandez/Documents/BDNS/cured_data/no_encontrados2.tsv', 'a') as tsv_2:
                    tsv_2.write(no_encontrado + '\n')
        else:
            # The DataFrame index doubles as the row id
            convocatoria += str(i) + '\t' + str(crawled_data[i]) + '\t'

            # Parse the raw page to HTML
            soupedpage = BeautifulSoup(rawpage, "html5lib")

            # Extract only the article of the page
            contenido = soupedpage.article

            bloques = contenido.find_all('div', attrs='bloque')

            importe_total = bloques[7]
            if importe_total.find('p') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(importe_total.find('p').get_text()) + '\t'

            tipo_beneficiario = bloques[10]
            if tipo_beneficiario.find('li') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(tipo_beneficiario.find('li').get_text()) + '\t'

            sector_beneficiario = bloques[11]
            if sector_beneficiario.find('li') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(sector_beneficiario.find('li').get_text()) + '\t'

            region_impacto = bloques[12]
            if region_impacto.find('li') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(region_impacto.find('li').get_text()) + '\t'

            finalidad = bloques[13]
            if finalidad.find('p') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(finalidad.find('p').get_text()) + '\t'

            tsv_1.write(convocatoria + '\n')

        # Print the progress every 10000 rows
        if i % 10000 == 0:
            print(round(i / len(indice) * 100, 2), '%')

    tsv_1.close()

@@ -0,0 +1,2 @@
# academy-government-data
A project in support of transparency of the grants and concessions awarded by the Government of Spain.