Merge pull request #3 from datasets/scripts

[UP][m] Updating the scripts with v2 source of world bank
datasets · Dec 3, 2024 · 8f5f419 · 8f5f419
2 parents 1538eb3 + 99c8186
commit 8f5f419
Show file tree

Hide file tree

Showing 4 changed files with 85 additions and 73 deletions.
diff --git a/README.md b/README.md
@@ -7,9 +7,27 @@ The repository of the data package of the GINI Index.
 The data that is contained in the `gini-index.csv` file, under `/data` was
 retrieved from [the World Bank](http://data.worldbank.org/indicator/SI.POV.GINI).
 
+## Preparation
 
-## License
+Install the packages required by the script with `pip`:
+
+`pip install -r scripts/requirements.txt`
+
+The process is recorded and automated in a Python script:
+
+`scripts/process.py`
+
+Alternatively, run the whole process with:
+
+`make`
+
+## Automation
+
+An up-to-date GINI index dataset (auto-updated every year) can be found on datahub.io:
+https://datahub.io/core/gini-index
+
+## License
 
 All data is licensed under the Open Data Commons Public Domain Dedication and License. All code is licensed under the MIT/BSD license.
 
-Note that while no credit is formally required a link back or credit to Rufus Pollock and the Open Knowledge Foundation is much appreciated.
+Note that while no credit is formally required, a link back or credit to Rufus Pollock and the Open Knowledge Foundation is much appreciated.
diff --git a/scripts/README.md b/scripts/README.md
diff --git a/scripts/process.py b/scripts/process.py
@@ -1,35 +1,69 @@
 #!/usr/bin/python
 
-import csv, os, sys
-import numpy as np
+import os, csv
+import requests
+import zipfile
+import tempfile
 import pandas as pd
 
-# Building query to fetch data from API
-apiBase = "http://api.worldbank.org/indicator/"
-apiIndicator = "SI.POV.GINI"    # This can be changed to any other indicator
-FILE_NAME = 'gini-index.csv'
-source = apiBase+apiIndicator+"?format=csv"
-print(source)
-
-def main():
-    giniIndex = pd.read_csv(source)
-    giniIndex.to_csv('archive/gini-index.csv', sep=",", index=False)
-    print("Saved archive CSV file.")
-    print(giniIndex)
-
-    # Processing the data
-    df = pd.read_csv('archive/gini-index.csv')      # Reading the source csv
-    """
-    Python is printing "Country Name" with quotes in data frame and does not
-    work for the remaining code
-    """
-    df.columns.values[0] = 'Country Name'
-
-    df = pd.melt(df, id_vars=['Country Name', 'Country Code'], var_name="Year", value_name="Value")     # Unpivoting
-    df = df.sort_values(by=['Country Name', 'Year'], ascending=[True, True])  # Sorting by country
-
-    df.dropna().to_csv('data/gini-index.csv', sep=",", index=False)   # Saving CSV
-    print ("File has been saved and it is ready for data packaging.")
+tmpfile = tempfile.NamedTemporaryFile(delete=False, suffix='.zip')
+tmpdir = tempfile.TemporaryDirectory()
 
+API_INDICATOR = "SI.POV.GINI"
+SOURCE_URL = f"https://api.worldbank.org/v2/en/indicator/{API_INDICATOR}?downloadformat=csv"
+ARCHIVE_FILE = 'archive/gini-index.csv'
+OUTPUT_FILE = 'data/gini-index.csv'
+
+def download_zip_file():
+    response = requests.get(SOURCE_URL)
+
+    with open(tmpfile.name, 'wb') as d:
+        d.write(response.content)
+
+    with zipfile.ZipFile(tmpfile.name, 'r') as zip_ref:
+        zip_ref.extractall(tmpdir.name)
+
+    os.unlink(tmpfile.name)
+
+    for path in os.scandir(tmpdir.name):
+        if path.is_file() and 'metadata' not in path.name.lower():
+            filename = os.path.join(tmpdir.name, path.name)
+            archive_path = os.path.join('archive', 'gini-index.csv')
+
+            # Ensure the archive folder exists
+            os.makedirs('archive', exist_ok=True)
+
+            os.rename(filename, archive_path)
+            print(f"File saved to: {archive_path}")
+
+def process_population_data(filename, output_file):
+    # Read the raw CSV file
+    with open(filename) as fo:
+        lines = [row for row in csv.reader(fo)]
+
+    # Extract headings and data rows
+    headings = lines[4]
+    lines = lines[5:]
+
+    # Define output structure
+    outheadings = ['Country Name', 'Country Code', 'Year', 'Value']
+    outlines = []
+
+    # Process each row and reshape the data
+    for row in lines:
+        for idx, year in enumerate(headings[4:]):
+            if row[idx + 4]:  # Check if the value exists
+                value = row[idx + 4]
+                outlines.append(row[:2] + [int(year), value])
+
+    df = pd.DataFrame(outlines, columns=outheadings)
+
+    df = df.sort_values(by=['Country Name', 'Year'])
+    df.to_csv(output_file, index=False)
+    print(f"Processed data saved to {output_file}")
+
+# Example usage
 if __name__ == "__main__":
-    main()
+    download_zip_file()
+    process_population_data(ARCHIVE_FILE, OUTPUT_FILE)
+
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
@@ -1,3 +1,2 @@
-process.py
-pandas
-numpy
+pandas==2.2.3
+requests==2.32.3