From 7096451e5feb3f23dab4bdf6abf5846caab5c929 Mon Sep 17 00:00:00 2001
From: gradedSystem <homesiknessgerald@gmail.com>
Date: Wed, 27 Nov 2024 17:14:54 +0800
Subject: [PATCH 1/5] [UP][m] Updating the scripts with v2 source of world bank

---
 scripts/README.md        |  2 +-
 scripts/process.py       | 92 +++++++++++++++++++++++++++-------------
 scripts/requirements.txt |  5 +--
 3 files changed, 66 insertions(+), 33 deletions(-)

diff --git a/scripts/README.md b/scripts/README.md
index 47153d6..aacec64 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -11,7 +11,7 @@ Afterwards, it will delete rows with blank inputs.
 Run the requirements through `pip` in order to install all required packages to 
 run the script.
 
-`pip install -r /path/to/requirements.txt`
+`pip install -r scripts/requirements.txt`
 
 ## Contributing
 
diff --git a/scripts/process.py b/scripts/process.py
index 90c94da..53e5405 100644
--- a/scripts/process.py
+++ b/scripts/process.py
@@ -1,35 +1,69 @@
 #!/usr/bin/python
 
-import csv, os, sys
-import numpy as np
+import os, csv
+import requests
+import zipfile
+import tempfile
 import pandas as pd
 
-# Building query to fetch data from API
-apiBase = "http://api.worldbank.org/indicator/"
-apiIndicator = "SI.POV.GINI"    # This can be changed to any other indicator
-FILE_NAME = 'gini-index.csv'
-source = apiBase+apiIndicator+"?format=csv"
-print(source)
-
-def main():
-    giniIndex = pd.read_csv(source)
-    giniIndex.to_csv('archive/gini-index.csv', sep=",", index=False)
-    print("Saved archive CSV file.")
-    print(giniIndex)
-
-    # Processing the data
-    df = pd.read_csv('archive/gini-index.csv')      # Reading the source csv
-    """
-    Python is printing "Country Name" with quotes in data frame and does not
-    work for the remaining code
-    """
-    df.columns.values[0] = 'Country Name'
-
-    df = pd.melt(df, id_vars=['Country Name', 'Country Code'], var_name="Year", value_name="Value")     # Unpivoting
-    df = df.sort_values(by=['Country Name', 'Year'], ascending=[True, True])  # Sorting by country
-
-    df.dropna().to_csv('data/gini-index.csv', sep=",", index=False)   # Saving CSV
-    print ("File has been saved and it is ready for data packaging.")
+tmpfile = tempfile.NamedTemporaryFile(delete=False, suffix='.zip')
+tmpdir = tempfile.TemporaryDirectory()
 
+API_INDICATOR = "SI.POV.GINI"
+SOURCE_URL = f"https://api.worldbank.org/v2/en/indicator/{API_INDICATOR}?downloadformat=csv"
+ARCHIVE_FILE = 'archive/gini-index.csv'
+OUTPUT_FILE = 'data/gini-index.csv'
+
+def download_zip_file():
+    """Download the indicator zip from SOURCE_URL and store its data CSV at ARCHIVE_FILE."""
+    import shutil
+    response = requests.get(SOURCE_URL, timeout=60)  # never hang forever on a stalled connection
+    response.raise_for_status()  # fail here rather than handing an HTTP error page to zipfile
+    tmpfile.close()  # release the module-level handle so the path can be rewritten and unlinked
+    try:
+        with open(tmpfile.name, 'wb') as d:
+            d.write(response.content)
+        with zipfile.ZipFile(tmpfile.name, 'r') as zip_ref:
+            zip_ref.extractall(tmpdir.name)
+    finally:
+        os.unlink(tmpfile.name)  # created with delete=False, so nothing else removes it
+    # Files with "metadata" in their name are skipped; what remains is the data CSV.
+    for path in os.scandir(tmpdir.name):
+        if path.is_file() and 'metadata' not in path.name.lower():
+            # Ensure the archive folder exists
+            os.makedirs(os.path.dirname(ARCHIVE_FILE) or '.', exist_ok=True)
+            # shutil.move also works when the temp dir is on another filesystem; os.rename does not
+            shutil.move(path.path, ARCHIVE_FILE)
+            print(f"File saved to: {ARCHIVE_FILE}")
+
+def process_population_data(filename, output_file):
+    """Reshape a wide World Bank indicator CSV into long Country/Year/Value rows.
+
+    filename: raw CSV with its header on the fifth line and one column per year
+        starting at the fifth column.
+    output_file: destination CSV, sorted by country name and then year; cells
+        without a value are left out. Its parent folder is created if missing.
+    """
+    # utf-8-sig drops a leading BOM if present; an explicit encoding keeps names intact everywhere
+    with open(filename, newline='', encoding='utf-8-sig') as fo:
+        lines = list(csv.reader(fo))
+    headings, lines = lines[4], lines[5:]
+    outheadings = ['Country Name', 'Country Code', 'Year', 'Value']
+    # zip() stops at the shorter side, so blank or truncated rows cannot raise IndexError
+    outlines = [
+        row[:2] + [int(year), value]
+        for row in lines
+        for year, value in zip(headings[4:], row[4:])
+        if year and value
+    ]
+    df = pd.DataFrame(outlines, columns=outheadings)
+    df = df.sort_values(by=['Country Name', 'Year'])
+    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
+    df.to_csv(output_file, index=False)
+    print(f"Processed data saved to {output_file}")
+
+# Example usage
 if __name__ == "__main__":
-    main()
+    download_zip_file()
+    process_population_data(ARCHIVE_FILE, OUTPUT_FILE)
+
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
index 06ce716..5b466cd 100644
--- a/scripts/requirements.txt
+++ b/scripts/requirements.txt
@@ -1,3 +1,2 @@
-process.py
-pandas
-numpy
+pandas==2.2.3
+requests==2.32.3
\ No newline at end of file

From a53c5808589d8f2247548f72940538ea89f83e38 Mon Sep 17 00:00:00 2001
From: gradedSystem <homesiknessgerald@gmail.com>
Date: Tue, 3 Dec 2024 19:39:25 +0700
Subject: [PATCH 2/5] [remove-update][s] Moved README.md to the root folder

---
 README.md         | 31 +++++++++++++++++++++++++++++++
 scripts/README.md | 39 ---------------------------------------
 2 files changed, 31 insertions(+), 39 deletions(-)
 delete mode 100644 scripts/README.md

diff --git a/README.md b/README.md
index 05ee0a9..c365c83 100644
--- a/README.md
+++ b/README.md
@@ -7,9 +7,40 @@ The repository of the data package of the GINI Index.
 The data that is contained in the `gini-index.csv` file, under `/data` was
 retrieved from [the World Bank](http://data.worldbank.org/indicator/SI.POV.GINI).
 
+## Requirements
+
+Run the requirements through `pip` in order to install all required packages to 
+run the script.
+
+`pip install -r scripts/requirements.txt`
+
 
 ## License
 
 All data is licensed under the Open Data Commons Public Domain Dedication and License. All code is licensed under the MIT/BSD license.
 
 Note that while no credit is formally required a link back or credit to Rufus Pollock and the Open Knowledge Foundation is much appreciated.
+
+
+## Contributing
+
+This dataset will eventually need an update, specially because there are many
+pieces missing.
+In order to contribute, please fork this repository and do the changes you must,
+but please follow [OKFN's curator guide](http://data.okfn.org/doc/core-data-curators)
+to keep things as organized as possible.
+
+## Tags and committing changes
+
+When you push your modifications to GitHub, you should do it based on a few tags
+to help identify the work progress. There is not an exact definition but here
+are a few and important tags:
+ * [WK] - working update
+
+ * [TP] - Ready for packaging
+
+ * [UP] - Update, of any sort
+
+ * [FL] - Failure, of any sort (please report in the issues page)
+
+ * [ED] - Smaller edits
diff --git a/scripts/README.md b/scripts/README.md
deleted file mode 100644
index aacec64..0000000
--- a/scripts/README.md
+++ /dev/null
@@ -1,39 +0,0 @@
-# Usage of process.py
-
-The script will use the csv file under `/data` and it will unpivote the 
-original `xls` file and sort data by Country (alphabetically) and by Year 
-(ascending order).
-
-Afterwards, it will delete rows with blank inputs.
-
-## Requirements
-
-Run the requirements through `pip` in order to install all required packages to 
-run the script.
-
-`pip install -r scripts/requirements.txt`
-
-## Contributing
-
-This dataset will eventually need an update, specially because there are many
-pieces missing.
-In order to contribute, please fork this repository and do the changes you must,
-but please follow [OKFN's curator guide](http://data.okfn.org/doc/core-data-curators)
-to keep things as organized as possible.
-
-## Tags and committing changes
-
-When you push your modifications to GitHub, you should do it based on a few tags
-to help identify the work progress. There is not an exact definition but here
-are a few and important tags:
- * [WK] - working update
-
- * [TP] - Ready for packaging
-
- * [UP] - Update, of any sort
-
- * [FL] - Failure, of any sort (please report in the issues page)
-
- * [ED] - Smaller edits
-
-

From c4ea44e5d6bffbef4cd018bc2ba3037965a9ec93 Mon Sep 17 00:00:00 2001
From: gradedSystem <homesiknessgerald@gmail.com>
Date: Tue, 3 Dec 2024 19:55:34 +0700
Subject: [PATCH 3/5] [update][s] Update README

---
 README.md | 34 +++++++---------------------------
 1 file changed, 7 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md
index c365c83..ade818a 100644
--- a/README.md
+++ b/README.md
@@ -7,40 +7,20 @@ The repository of the data package of the GINI Index.
 The data that is contained in the `gini-index.csv` file, under `/data` was
 retrieved from [the World Bank](http://data.worldbank.org/indicator/SI.POV.GINI).
 
-## Requirements
+## Preparation
 
 Run the requirements through `pip` in order to install all required packages to 
 run the script.
 
 `pip install -r scripts/requirements.txt`
 
+## Automation
 
-## License
+Up-to-date (auto-updates every year) gdp dataset could be found on the datahub.io:
+https://datahub.io/core/gini-index
 
-All data is licensed under the Open Data Commons Public Domain Dedication and License. All code is licensed under the MIT/BSD license.
-
-Note that while no credit is formally required a link back or credit to Rufus Pollock and the Open Knowledge Foundation is much appreciated.
-
-
-## Contributing
-
-This dataset will eventually need an update, specially because there are many
-pieces missing.
-In order to contribute, please fork this repository and do the changes you must,
-but please follow [OKFN's curator guide](http://data.okfn.org/doc/core-data-curators)
-to keep things as organized as possible.
+## License
 
-## Tags and committing changes
-
-When you push your modifications to GitHub, you should do it based on a few tags
-to help identify the work progress. There is not an exact definition but here
-are a few and important tags:
- * [WK] - working update
-
- * [TP] - Ready for packaging
-
- * [UP] - Update, of any sort
-
- * [FL] - Failure, of any sort (please report in the issues page)
+All data is licensed under the Open Data Commons Public Domain Dedication and License. All code is licensed under the MIT/BSD license.
 
- * [ED] - Smaller edits
+Note that while no credit is formally required a link back or credit to Rufus Pollock and the Open Knowledge Foundation is much appreciated.
\ No newline at end of file

From aab7dc4517bcfb50e462acf337480a143154dc46 Mon Sep 17 00:00:00 2001
From: gradedSystem <homesiknessgerald@gmail.com>
Date: Tue, 3 Dec 2024 20:05:05 +0700
Subject: [PATCH 4/5] [update][s] Updating the README.md

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index ade818a..6bd2bbf 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,10 @@ run the script.
 
 `pip install -r scripts/requirements.txt`
 
+Process is recorded and automated in python script:
+
+`scripts/process.py`
+
 ## Automation
 
 Up-to-date (auto-updates every year) gdp dataset could be found on the datahub.io:

From 99c81866d53becbbb9b73b6b970734e8a50acbf3 Mon Sep 17 00:00:00 2001
From: gradedSystem <homesiknessgerald@gmail.com>
Date: Tue, 3 Dec 2024 20:14:53 +0700
Subject: [PATCH 5/5] [update][s] Update README.md

---
 README.md | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 6bd2bbf..e91a801 100644
--- a/README.md
+++ b/README.md
@@ -9,15 +9,18 @@ retrieved from [the World Bank](http://data.worldbank.org/indicator/SI.POV.GINI)
 
 ## Preparation
 
-Run the requirements through `pip` in order to install all required packages to 
-run the script.
+Use `pip` to install the packages required to run the script:
 
 `pip install -r scripts/requirements.txt`
 
-Process is recorded and automated in python script:
+The process is recorded and automated in a Python script:
 
 `scripts/process.py`
 
+Alternatively, run the whole process with:
+
+`make`
+
 ## Automation
 
 Up-to-date (auto-updates every year) gdp dataset could be found on the datahub.io: