Skip to content

Commit

Permalink
Merge pull request #3 from datasets/scripts
Browse files Browse the repository at this point in the history
[UP][m] Updating the scripts with v2 source of world bank
  • Loading branch information
Mikanebu authored Dec 3, 2024
2 parents 1538eb3 + 99c8186 commit 8f5f419
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 73 deletions.
22 changes: 20 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,27 @@ The repository of the data package of the GINI Index.
The data that is contained in the `gini-index.csv` file, under `/data` was
retrieved from [the World Bank](http://data.worldbank.org/indicator/SI.POV.GINI).

## Preparation

## License
Install the packages the script requires with `pip`:

`pip install -r scripts/requirements.txt`

The process is recorded and automated in a Python script:

`scripts/process.py`

Or, alternatively just by running:

`make`

## Automation

An up-to-date (auto-updated every year) GINI index dataset can be found on datahub.io:
https://datahub.io/core/gini-index

## Licence

All data is licensed under the Open Data Commons Public Domain Dedication and License. All code is licensed under the MIT/BSD license.

Note that while no credit is formally required a link back or credit to Rufus Pollock and the Open Knowledge Foundation is much appreciated.
Note that while no credit is formally required a link back or credit to Rufus Pollock and the Open Knowledge Foundation is much appreciated.
39 changes: 0 additions & 39 deletions scripts/README.md

This file was deleted.

92 changes: 63 additions & 29 deletions scripts/process.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,69 @@
#!/usr/bin/python

import csv, os, sys
import numpy as np
import os, csv
import requests
import zipfile
import tempfile
import pandas as pd

# Assemble the World Bank API request URL for the chosen indicator.
apiBase = "http://api.worldbank.org/indicator/"
apiIndicator = "SI.POV.GINI"  # swap in any other indicator code here
FILE_NAME = 'gini-index.csv'
source = "".join([apiBase, apiIndicator, "?format=csv"])
print(source)  # show the URL that will be queried

def main():
    """Fetch the indicator CSV, archive it and write the tidy dataset.

    Reads the CSV at ``source`` with pandas and stores a copy as
    ``archive/gini-index.csv``.  That copy is then unpivoted to one row per
    country and year (columns ``Country Name``, ``Country Code``, ``Year``,
    ``Value``), sorted by country name and year, and every row without
    missing values is written to ``data/gini-index.csv``.
    """
    giniIndex = pd.read_csv(source)
    giniIndex.to_csv('archive/gini-index.csv', sep=",", index=False)
    print("Saved archive CSV file.")
    print(giniIndex)

    # Processing the data
    df = pd.read_csv('archive/gini-index.csv')  # Reading the source csv

    # The first header reportedly comes back with literal quote characters
    # around "Country Name", which breaks the id_vars lookup below, so it is
    # renamed by position.  rename() is used instead of writing into
    # df.columns.values, because an Index is immutable and mutating its
    # backing array in place is not a supported way to relabel a column.
    df = df.rename(columns={df.columns[0]: 'Country Name'})

    # Unpivoting: one (country, year, value) row per original year column
    df = pd.melt(
        df,
        id_vars=['Country Name', 'Country Code'],
        var_name="Year",
        value_name="Value",
    )
    # Sorting by country, then year
    df = df.sort_values(by=['Country Name', 'Year'], ascending=[True, True])

    # Saving CSV without the rows that contain missing values
    df.dropna().to_csv('data/gini-index.csv', sep=",", index=False)
    print ("File has been saved and it is ready for data packaging.")
# Where the raw download is archived and where the processed CSV is written
# (both relative to the current working directory).
ARCHIVE_FILE = 'archive/gini-index.csv'
OUTPUT_FILE = 'data/gini-index.csv'

# World Bank indicator code and the v2 bulk-download URL built from it.
API_INDICATOR = "SI.POV.GINI"
SOURCE_URL = f"https://api.worldbank.org/v2/en/indicator/{API_INDICATOR}?downloadformat=csv"

# Scratch space created at import time: a directory for the extracted archive
# and a ".zip" file for the download itself.  The file is created with
# delete=False, so it is not removed automatically when its handle is closed.
tmpdir = tempfile.TemporaryDirectory()
tmpfile = tempfile.NamedTemporaryFile(suffix='.zip', delete=False)

def download_zip_file():
    """Download the indicator zip archive and store its data CSV.

    Fetches ``SOURCE_URL``, writes the response body to the module-level
    temporary file ``tmpfile`` and extracts it into the module-level
    temporary directory ``tmpdir``.  Every extracted file whose name does
    not contain "metadata" (case-insensitive) is then moved to
    ``archive/gini-index.csv``; the ``archive`` folder is created if it is
    missing.  The temporary zip file is deleted before returning, including
    when the download or the extraction fails.

    Raises:
        requests.HTTPError: if the server replies with an error status.
        requests.RequestException: on connection failure or timeout.
        zipfile.BadZipFile: if the downloaded payload is not a zip archive.
    """
    # Function-scope import: the file's import block does not provide shutil.
    import shutil

    archive_path = os.path.join('archive', 'gini-index.csv')

    # The handle opened at import time is never used for writing; release it
    # so only the handle opened below refers to the file (close() is a no-op
    # if it was already closed).
    tmpfile.close()

    try:
        # A timeout keeps the script from hanging forever on a stalled server.
        response = requests.get(SOURCE_URL, timeout=60)
        # Fail loudly on 4xx/5xx instead of trying to unzip an error page.
        response.raise_for_status()

        with open(tmpfile.name, 'wb') as d:
            d.write(response.content)

        with zipfile.ZipFile(tmpfile.name, 'r') as zip_ref:
            zip_ref.extractall(tmpdir.name)
    finally:
        # Always clean up the downloaded zip, even on failure.
        if os.path.exists(tmpfile.name):
            os.unlink(tmpfile.name)

    # Snapshot the matching entries first so the directory is not modified
    # while it is still being scanned.
    with os.scandir(tmpdir.name) as entries:
        data_files = [
            entry.path
            for entry in entries
            if entry.is_file() and 'metadata' not in entry.name.lower()
        ]

    for extracted in data_files:
        # Ensure the archive folder exists
        os.makedirs('archive', exist_ok=True)

        # shutil.move falls back to copy + delete when a plain rename is not
        # possible, e.g. when the temp directory is on another filesystem.
        shutil.move(extracted, archive_path)
        print(f"File saved to: {archive_path}")

def process_population_data(filename, output_file):
    """Reshape a wide, one-column-per-year indicator CSV into a tidy CSV.

    The input is expected to carry four preamble lines, a header row on the
    fifth line and data rows after it.  In the header and in every data row
    the first two cells are the country name and country code, the next two
    cells are ignored, and the remaining cells are one column per year.

    For each non-empty value a row ``Country Name, Country Code, Year,
    Value`` is produced.  The result is sorted by country name and year and
    written to ``output_file`` without an index column.

    Args:
        filename: path of the raw CSV to read.
        output_file: path of the tidy CSV to write.

    Raises:
        IndexError: if the file has fewer than five lines.
        ValueError: if a non-empty year heading is not an integer.
    """
    # newline='' is what the csv module asks for so that quoted fields with
    # embedded line breaks are parsed correctly.  utf-8-sig decodes UTF-8 and
    # drops a leading byte-order mark if there is one, instead of depending
    # on the platform's default encoding.
    # NOTE(review): assumes the source file is UTF-8 encoded - confirm.
    with open(filename, encoding='utf-8-sig', newline='') as fo:
        lines = list(csv.reader(fo))

    # Extract headings and data rows
    headings = lines[4]
    year_columns = headings[4:]
    data_rows = lines[5:]

    # Define output structure
    outheadings = ['Country Name', 'Country Code', 'Year', 'Value']
    outlines = []

    # Process each row and reshape the data
    for row in data_rows:
        # zip() stops at the shorter sequence, so a row with fewer cells than
        # the header (or a blank line) is handled without an IndexError.
        for year, value in zip(year_columns, row[4:]):
            # Skip missing values and unnamed columns such as the empty cell
            # produced by a trailing comma.
            if year and value:
                outlines.append(row[:2] + [int(year), value])

    df = pd.DataFrame(outlines, columns=outheadings)

    df = df.sort_values(by=['Country Name', 'Year'])
    df.to_csv(output_file, index=False)
    print(f"Processed data saved to {output_file}")

# Script entry point: run the legacy fetch, then download the v2 archive and
# reshape the archived CSV into the published dataset.
if __name__ == "__main__":
    main()
    download_zip_file()
    process_population_data(filename=ARCHIVE_FILE, output_file=OUTPUT_FILE)

5 changes: 2 additions & 3 deletions scripts/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
process.py
pandas
numpy
pandas==2.2.3
requests==2.32.3

0 comments on commit 8f5f419

Please sign in to comment.