Restoring repo
guillermotti committed Dec 15, 2022
1 parent 6653360 commit e905e9b
Showing 148 changed files with 40,825 additions and 0 deletions.
19 changes: 19 additions & 0 deletions README.md
@@ -1,2 +1,21 @@
# academy-government-data
A project to promote transparency in the grants and concessions of the Government of Spain.

## Search service
The search service itself is provided by Empathy.co. With some tuning and automation, it allows a complete indexing and search service to be deployed by executing a single command.

### Prerequisites
There are some things needed prior to the execution of the deployment script.
- A **docker** service running to create and compose the containers (that is, open Docker Desktop).
- The *jq* command (*it might not be necessary*). It should be installed on your machine to run the whole script. If you don't have it, it can easily be installed with a package manager such as **brew**: `brew install jq`.
- **python3**, which is essential to parse the data files.

### Search deployment
In order to deploy the search service, we only need to perform two steps:
1. Run Docker Desktop so that a docker service is running.
2. Move to the **search-plugin** folder and execute `./pipelineDeployer.sh` from the terminal.

#### What does the script do?
Firstly, it parses the compressed TSV with the data to index (placed in */data-plugin/datos_limpios*) into JSON so it can be sent in the indexing request. Then it starts two containers that are shared between the search plugin and the index plugin: the *elasticsearch* and *cerebro* containers.
After that, all the containers of the index plugin are started, creating a functional indexing service that receives an automated request with the previously parsed data in order to index it.
Once the indexing is finished, the script shuts down the index-service containers (so as not to blow up your computer :smiley:) and starts the containers of the search service, which will receive and handle the queries.
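
As an illustration of the first parsing step, here is a minimal Python sketch that converts a compressed TSV into the list of JSON documents an indexing request typically expects. The file names and paths below are hypothetical; the actual script may differ.

```python
import json

import pandas as pd


def tsv_to_json_documents(tsv_path: str, json_path: str) -> None:
    # pandas infers gzip compression from a .gz extension
    data = pd.read_csv(tsv_path, sep="\t", dtype=str)
    # One JSON document per row, the shape an indexing request usually expects
    documents = data.to_dict(orient="records")
    with open(json_path, "w", encoding="utf-8") as out:
        json.dump(documents, out, ensure_ascii=False)


if __name__ == "__main__":
    # Hypothetical paths, for illustration only
    tsv_to_json_documents("data-plugin/datos_limpios/datos.tsv.gz", "datos.json")
```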
2 changes: 2 additions & 0 deletions crawler/README.md
@@ -0,0 +1,2 @@
# academy-government-data
A project to promote transparency in the grants and concessions of the Government of Spain.
76 changes: 76 additions & 0 deletions crawler/crawler.py
@@ -0,0 +1,76 @@
from urllib import request
from urllib.error import HTTPError
import ssl

from bs4 import BeautifulSoup
import pandas as pd


def complete_bdns_mk2(original_data):

    no_encontrado = []
    indice = list(original_data.index)

    crawled_data = pd.DataFrame(original_data.codigo_bdns)
    crawled_data['importe_total'] = ''
    crawled_data['tipo_beneficiario'] = ''
    crawled_data['sector_beneficiario'] = ''
    crawled_data['region_impacto'] = ''
    crawled_data['finalidad'] = ''

    for i in indice:
        # Allow the crawler to connect to the web page without certificate verification
        ssl._create_default_https_context = ssl._create_unverified_context

        url = 'https://www.pap.hacienda.gob.es/bdnstrans/GE/es/convocatoria/' + str(crawled_data.codigo_bdns[i])

        try:
            rawpage = request.urlopen(url)  # Open the url
        except HTTPError as err:
            if err.code == 404:
                no_encontrado.append(crawled_data.codigo_bdns[i])
        else:
            # Parse the raw page into HTML
            soupedpage = BeautifulSoup(rawpage, "html5lib")

            # Extract only the article of the page
            contenido = soupedpage.article

            # Assign values from each 'bloque' div
            bloques = contenido.find_all('div', attrs='bloque')

            # .loc is used to avoid pandas chained-assignment issues
            importe_total = bloques[7]
            if importe_total.find('p') is None:
                crawled_data.loc[i, 'importe_total'] = 'NaN'
            else:
                crawled_data.loc[i, 'importe_total'] = importe_total.find('p').get_text()

            tipo_beneficiario = bloques[10]
            if tipo_beneficiario.find('li') is None:
                crawled_data.loc[i, 'tipo_beneficiario'] = 'NaN'
            else:
                crawled_data.loc[i, 'tipo_beneficiario'] = tipo_beneficiario.find('li').get_text()

            sector_beneficiario = bloques[11]
            if sector_beneficiario.find('li') is None:
                crawled_data.loc[i, 'sector_beneficiario'] = 'NaN'
            else:
                crawled_data.loc[i, 'sector_beneficiario'] = sector_beneficiario.find('li').get_text()

            region_impacto = bloques[12]
            if region_impacto.find('li') is None:
                crawled_data.loc[i, 'region_impacto'] = 'NaN'
            else:
                crawled_data.loc[i, 'region_impacto'] = region_impacto.find('li').get_text()

            finalidad = bloques[13]
            if finalidad.find('p') is None:
                crawled_data.loc[i, 'finalidad'] = 'NaN'
            else:
                crawled_data.loc[i, 'finalidad'] = finalidad.find('p').get_text()

        if i % 100 == 0:
            print(round(i / len(crawled_data.index) * 100, 2), '%')

    return crawled_data, no_encontrado


# 'datos' is expected to be a DataFrame with a 'codigo_bdns' column, loaded elsewhere
crawled_data, no_encontrado = complete_bdns_mk2(datos)
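
A minimal usage sketch for the function above, assuming the BDNS codes are available in a hypothetical TSV with a `codigo_bdns` column (the real input data is not part of this file):

```python
import pandas as pd

# Hypothetical input file with one BDNS code per row; the real data source may differ
datos = pd.read_csv("convocatorias.tsv", sep="\t", dtype={"codigo_bdns": str})

# complete_bdns_mk2 is the function defined in crawler.py above
crawled_data, no_encontrado = complete_bdns_mk2(datos)

crawled_data.to_csv("convocatorias_crawled.tsv", sep="\t", index=False)
print(len(no_encontrado), 'codes returned 404')
```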
85 changes: 85 additions & 0 deletions crawler/crawler_strings.py
@@ -0,0 +1,85 @@
from urllib import request
from urllib.error import HTTPError
import ssl

from bs4 import BeautifulSoup
import pandas as pd


def complete_bdns_mk3(original_data):
    indice = list(original_data.index)
    crawled_data = list(original_data.codigo_bdns)

    no_encontrado = []

    headers = 'codigo_bdns;importe_total;tipo_beneficiario;sector_beneficiario;región_impacto;finalidad'

    # The 'with' block closes the file automatically
    with open('/Users/enriquecarnerofernandez/Documents/BDNS/cured_data/prueba.csv', 'a') as csv_file:
        csv_file.write(headers + '\n')

    for i in indice:
        # Allow the crawler to connect to the web page without certificate verification
        ssl._create_default_https_context = ssl._create_unverified_context

        url = 'https://www.pap.hacienda.gob.es/bdnstrans/GE/es/convocatoria/' + str(crawled_data[i])

        convocatoria = ''

        try:
            rawpage = request.urlopen(url)  # Open the url
        except HTTPError as err:
            if err.code == 404:
                no_encontrado.append(crawled_data[i])
        else:
            convocatoria += str(crawled_data[i]) + ';'

            # Parse the raw page into HTML
            soupedpage = BeautifulSoup(rawpage, "html5lib")

            # Extract only the article of the page
            contenido = soupedpage.article

            # Assign values from each 'bloque' div
            bloques = contenido.find_all('div', attrs='bloque')

            importe_total = bloques[7]
            if importe_total.find('p') is None:
                convocatoria += 'NaN' + ';'
            else:
                convocatoria += str(importe_total.find('p').get_text()) + ';'

            tipo_beneficiario = bloques[10]
            if tipo_beneficiario.find('li') is None:
                convocatoria += 'NaN' + ';'
            else:
                convocatoria += str(tipo_beneficiario.find('li').get_text()) + ';'

            sector_beneficiario = bloques[11]
            if sector_beneficiario.find('li') is None:
                convocatoria += 'NaN' + ';'
            else:
                # Replace stray ';' so the field does not break the CSV separator
                convocatoria += str(sector_beneficiario.find('li').get_text()).replace(';', '-') + ';'

            region_impacto = bloques[12]
            if region_impacto.find('li') is None:
                convocatoria += 'NaN' + ';'
            else:
                convocatoria += str(region_impacto.find('li').get_text()) + ';'

            finalidad = bloques[13]
            if finalidad.find('p') is None:
                convocatoria += 'NaN' + ';'
            else:
                convocatoria += str(finalidad.find('p').get_text()) + ';'

            with open('/Users/enriquecarnerofernandez/Documents/BDNS/cured_data/prueba.csv', 'a') as csv_file:
                csv_file.write(convocatoria + '\n')

        if i % 100 == 0:
            print(round(i / len(indice) * 100, 2), '%')


# 'convocatorias' is expected to be a DataFrame with a 'codigo_bdns' column, loaded elsewhere
complete_bdns_mk3(convocatorias)
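
For reference, a small sketch of reading back the semicolon-separated file produced by `complete_bdns_mk3`, assuming the hard-coded path above; purely illustrative:

```python
import pandas as pd

# Path matches the one hard-coded in complete_bdns_mk3 above
resultados = pd.read_csv(
    '/Users/enriquecarnerofernandez/Documents/BDNS/cured_data/prueba.csv',
    sep=';',
    dtype=str,
)
print(resultados.head())
```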
179 changes: 179 additions & 0 deletions crawler/crawler_tabuladores.py
@@ -0,0 +1,179 @@
from urllib import request
from urllib.error import HTTPError
import ssl

from bs4 import BeautifulSoup
import pandas as pd


def complete_bdns(original_data):
    indice = list(original_data.index)
    crawled_data = list(original_data.codigo_bdns)

    no_encontrado = ''

    headers = ('codigo_bdns' + '\t' +
               'importe_total' + '\t' +
               'tipo_beneficiario' + '\t' +
               'sector_beneficiario' + '\t' +
               'región_impacto' + '\t' +
               'finalidad')

    tsv_1 = open('/Users/enriquecarnerofernandez/Documents/BDNS/cured_data/concesiones.tsv', 'a')
    tsv_1.write(headers + '\n')

    for i in indice:
        # Allow the crawler to connect to the web page without certificate verification
        ssl._create_default_https_context = ssl._create_unverified_context

        url = 'https://www.pap.hacienda.gob.es/bdnstrans/GE/es/convocatoria/' + str(crawled_data[i])

        convocatoria = ''

        try:
            rawpage = request.urlopen(url)  # Open the url

        except HTTPError as err:
            if err.code == 404:
                no_encontrado = str(crawled_data[i])

                # Log codes whose page returned 404 to a separate file
                with open('/Users/enriquecarnerofernandez/Documents/BDNS/cured_data/no_encontrados2.tsv', 'a') as tsv_2:
                    tsv_2.write(no_encontrado + '\n')

        else:
            convocatoria += str(crawled_data[i]) + '\t'

            # Parse the raw page into HTML
            soupedpage = BeautifulSoup(rawpage, "html5lib")

            # Extract only the article of the page
            contenido = soupedpage.article

            # Assign values from each 'bloque' div
            bloques = contenido.find_all('div', attrs='bloque')

            importe_total = bloques[7]
            if importe_total.find('p') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(importe_total.find('p').get_text()) + '\t'

            tipo_beneficiario = bloques[10]
            if tipo_beneficiario.find('li') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(tipo_beneficiario.find('li').get_text()) + '\t'

            sector_beneficiario = bloques[11]
            if sector_beneficiario.find('li') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(sector_beneficiario.find('li').get_text()) + '\t'

            region_impacto = bloques[12]
            if region_impacto.find('li') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(region_impacto.find('li').get_text()) + '\t'

            finalidad = bloques[13]
            if finalidad.find('p') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(finalidad.find('p').get_text()) + '\t'

            tsv_1.write(convocatoria + '\n')

        if i % 10000 == 0:
            print(round(i / len(indice) * 100, 2), '%')

    tsv_1.close()


# 'concesiones' is expected to be a DataFrame with a 'codigo_bdns' column, loaded elsewhere
complete_bdns(concesiones)




def complete_bdns_concesiones(original_data):
    indice = list(original_data.index)
    crawled_data = list(original_data.codigo_bdns)

    no_encontrado = ''

    headers = ('id' + '\t' +
               'codigo_bdns' + '\t' +
               'importe_total' + '\t' +
               'tipo_beneficiario' + '\t' +
               'sector_beneficiario' + '\t' +
               'región_impacto' + '\t' +
               'finalidad')

    tsv_1 = open('/Users/enriquecarnerofernandez/Documents/BDNS/cured_data/concesiones.tsv', 'a')
    tsv_1.write(headers + '\n')

    for i in indice:
        # Allow the crawler to connect to the web page without certificate verification
        ssl._create_default_https_context = ssl._create_unverified_context

        url = 'https://www.pap.hacienda.gob.es/bdnstrans/GE/es/convocatoria/' + str(crawled_data[i])

        convocatoria = ''

        try:
            rawpage = request.urlopen(url)  # Open the url

        except HTTPError as err:
            if err.code == 404:
                no_encontrado = str(crawled_data[i])

                # Log codes whose page returned 404 to a separate file
                with open('/Users/enriquecarnerofernandez/Documents/BDNS/cured_data/no_encontrados2.tsv', 'a') as tsv_2:
                    tsv_2.write(no_encontrado + '\n')

        else:
            convocatoria += str(i) + '\t' + str(crawled_data[i]) + '\t'

            # Parse the raw page into HTML
            soupedpage = BeautifulSoup(rawpage, "html5lib")

            # Extract only the article of the page
            contenido = soupedpage.article

            # Assign values from each 'bloque' div
            bloques = contenido.find_all('div', attrs='bloque')

            importe_total = bloques[7]
            if importe_total.find('p') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(importe_total.find('p').get_text()) + '\t'

            tipo_beneficiario = bloques[10]
            if tipo_beneficiario.find('li') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(tipo_beneficiario.find('li').get_text()) + '\t'

            sector_beneficiario = bloques[11]
            if sector_beneficiario.find('li') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(sector_beneficiario.find('li').get_text()) + '\t'

            region_impacto = bloques[12]
            if region_impacto.find('li') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(region_impacto.find('li').get_text()) + '\t'

            finalidad = bloques[13]
            if finalidad.find('p') is None:
                convocatoria += 'NaN' + '\t'
            else:
                convocatoria += str(finalidad.find('p').get_text()) + '\t'

            tsv_1.write(convocatoria + '\n')

        if i % 10000 == 0:
            print(round(i / len(indice) * 100, 2), '%')

    tsv_1.close()
2 changes: 2 additions & 0 deletions data-plugin/README.md
@@ -0,0 +1,2 @@
# academy-government-data
A project to promote transparency in the grants and concessions of the Government of Spain.