From 7096451e5feb3f23dab4bdf6abf5846caab5c929 Mon Sep 17 00:00:00 2001 From: gradedSystem Date: Wed, 27 Nov 2024 17:14:54 +0800 Subject: [PATCH 1/5] [UP][m] Updating the scripts with v2 source of world bank --- scripts/README.md | 2 +- scripts/process.py | 92 +++++++++++++++++++++++++++------------- scripts/requirements.txt | 5 +-- 3 files changed, 66 insertions(+), 33 deletions(-) diff --git a/scripts/README.md b/scripts/README.md index 47153d6..aacec64 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -11,7 +11,7 @@ Afterwards, it will delete rows with blank inputs. Run the requirements through `pip` in order to install all required packages to run the script. -`pip install -r /path/to/requirements.txt` +`pip install -r scripts/requirements.txt` ## Contributing diff --git a/scripts/process.py b/scripts/process.py index 90c94da..53e5405 100644 --- a/scripts/process.py +++ b/scripts/process.py @@ -1,35 +1,69 @@ #!/usr/bin/python -import csv, os, sys -import numpy as np +import os, csv +import requests +import zipfile +import tempfile import pandas as pd -# Building query to fetch data from API -apiBase = "http://api.worldbank.org/indicator/" -apiIndicator = "SI.POV.GINI" # This can be changed to any other indicator -FILE_NAME = 'gini-index.csv' -source = apiBase+apiIndicator+"?format=csv" -print(source) - -def main(): - giniIndex = pd.read_csv(source) - giniIndex.to_csv('archive/gini-index.csv', sep=",", index=False) - print("Saved archive CSV file.") - print(giniIndex) - - # Processing the data - df = pd.read_csv('archive/gini-index.csv') # Reading the source csv - """ - Python is printing "Country Name" with quotes in data frame and does not - work for the remaining code - """ - df.columns.values[0] = 'Country Name' - - df = pd.melt(df, id_vars=['Country Name', 'Country Code'], var_name="Year", value_name="Value") # Unpivoting - df = df.sort_values(by=['Country Name', 'Year'], ascending=[True, True]) # Sorting by country - - 
df.dropna().to_csv('data/gini-index.csv', sep=",", index=False) # Saving CSV - print ("File has been saved and it is ready for data packaging.") +tmpfile = tempfile.NamedTemporaryFile(delete=False, suffix='.zip') +tmpdir = tempfile.TemporaryDirectory() +API_INDICATOR = "SI.POV.GINI" +SOURCE_URL = f"https://api.worldbank.org/v2/en/indicator/{API_INDICATOR}?downloadformat=csv" +ARCHIVE_FILE = 'archive/gini-index.csv' +OUTPUT_FILE = 'data/gini-index.csv' + +def download_zip_file(): + response = requests.get(SOURCE_URL) + + with open(tmpfile.name, 'wb') as d: + d.write(response.content) + + with zipfile.ZipFile(tmpfile.name, 'r') as zip_ref: + zip_ref.extractall(tmpdir.name) + + os.unlink(tmpfile.name) + + for path in os.scandir(tmpdir.name): + if path.is_file() and 'metadata' not in path.name.lower(): + filename = os.path.join(tmpdir.name, path.name) + archive_path = os.path.join('archive', 'gini-index.csv') + + # Ensure the archive folder exists + os.makedirs('archive', exist_ok=True) + + os.rename(filename, archive_path) + print(f"File saved to: {archive_path}") + +def process_population_data(filename, output_file): + # Read the raw CSV file + with open(filename) as fo: + lines = [row for row in csv.reader(fo)] + + # Extract headings and data rows + headings = lines[4] + lines = lines[5:] + + # Define output structure + outheadings = ['Country Name', 'Country Code', 'Year', 'Value'] + outlines = [] + + # Process each row and reshape the data + for row in lines: + for idx, year in enumerate(headings[4:]): + if row[idx + 4]: # Check if the value exists + value = row[idx + 4] + outlines.append(row[:2] + [int(year), value]) + + df = pd.DataFrame(outlines, columns=outheadings) + + df = df.sort_values(by=['Country Name', 'Year']) + df.to_csv(output_file, index=False) + print(f"Processed data saved to {output_file}") + +# Example usage if __name__ == "__main__": - main() + download_zip_file() + process_population_data(ARCHIVE_FILE, OUTPUT_FILE) + diff --git 
a/scripts/requirements.txt b/scripts/requirements.txt index 06ce716..5b466cd 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,3 +1,2 @@ -process.py -pandas -numpy +pandas==2.2.3 +requests==2.32.3 \ No newline at end of file From a53c5808589d8f2247548f72940538ea89f83e38 Mon Sep 17 00:00:00 2001 From: gradedSystem Date: Tue, 3 Dec 2024 19:39:25 +0700 Subject: [PATCH 2/5] [remove-update][s] Moved README.md to the root folder --- README.md | 31 +++++++++++++++++++++++++++++++ scripts/README.md | 39 --------------------------------------- 2 files changed, 31 insertions(+), 39 deletions(-) delete mode 100644 scripts/README.md diff --git a/README.md b/README.md index 05ee0a9..c365c83 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,40 @@ The repository of the data package of the GINI Index. The data that is contained in the `gini-index.csv` file, under `/data` was retrieved from [the World Bank](http://data.worldbank.org/indicator/SI.POV.GINI). +## Requirements + +Run the requirements through `pip` in order to install all required packages to +run the script. + +`pip install -r scripts/requirements.txt` + ## License All data is licensed under the Open Data Commons Public Domain Dedication and License. All code is licensed under the MIT/BSD license. Note that while no credit is formally required a link back or credit to Rufus Pollock and the Open Knowledge Foundation is much appreciated. + + +## Contributing + +This dataset will eventually need an update, specially because there are many +pieces missing. +In order to contribute, please fork this repository and do the changes you must, +but please follow [OKFN's curator guide](http://data.okfn.org/doc/core-data-curators) +to keep things as organized as possible. + +## Tags and committing changes + +When you push your modifications to GitHub, you should do it based on a few tags +to help identify the work progress. 
There is not an exact definition but here +are a few and important tags: + * [WK] - working update + + * [TP] - Ready for packaging + + * [UP] - Update, of any sort + + * [FL] - Failure, of any sort (please report in the issues page) + + * [ED] - Smaller edits diff --git a/scripts/README.md b/scripts/README.md deleted file mode 100644 index aacec64..0000000 --- a/scripts/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# Usage of process.py - -The script will use the csv file under `/data` and it will unpivote the -original `xls` file and sort data by Country (alphabetically) and by Year -(ascending order). - -Afterwards, it will delete rows with blank inputs. - -## Requirements - -Run the requirements through `pip` in order to install all required packages to -run the script. - -`pip install -r scripts/requirements.txt` - -## Contributing - -This dataset will eventually need an update, specially because there are many -pieces missing. -In order to contribute, please fork this repository and do the changes you must, -but please follow [OKFN's curator guide](http://data.okfn.org/doc/core-data-curators) -to keep things as organized as possible. - -## Tags and committing changes - -When you push your modifications to GitHub, you should do it based on a few tags -to help identify the work progress. 
There is not an exact definition but here -are a few and important tags: - * [WK] - working update - - * [TP] - Ready for packaging - - * [UP] - Update, of any sort - - * [FL] - Failure, of any sort (please report in the issues page) - - * [ED] - Smaller edits - - From c4ea44e5d6bffbef4cd018bc2ba3037965a9ec93 Mon Sep 17 00:00:00 2001 From: gradedSystem Date: Tue, 3 Dec 2024 19:55:34 +0700 Subject: [PATCH 3/5] [update][s] Update README --- README.md | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index c365c83..ade818a 100644 --- a/README.md +++ b/README.md @@ -7,40 +7,20 @@ The repository of the data package of the GINI Index. The data that is contained in the `gini-index.csv` file, under `/data` was retrieved from [the World Bank](http://data.worldbank.org/indicator/SI.POV.GINI). -## Requirements +## Preparation Run the requirements through `pip` in order to install all required packages to run the script. `pip install -r scripts/requirements.txt` +## Automation -## License +Up-to-date (auto-updates every year) gdp dataset could be found on the datahub.io: +https://datahub.io/core/gini-index -All data is licensed under the Open Data Commons Public Domain Dedication and License. All code is licensed under the MIT/BSD license. - -Note that while no credit is formally required a link back or credit to Rufus Pollock and the Open Knowledge Foundation is much appreciated. - - -## Contributing - -This dataset will eventually need an update, specially because there are many -pieces missing. -In order to contribute, please fork this repository and do the changes you must, -but please follow [OKFN's curator guide](http://data.okfn.org/doc/core-data-curators) -to keep things as organized as possible. +## Licence -## Tags and committing changes - -When you push your modifications to GitHub, you should do it based on a few tags -to help identify the work progress. 
There is not an exact definition but here -are a few and important tags: - * [WK] - working update - - * [TP] - Ready for packaging - - * [UP] - Update, of any sort - - * [FL] - Failure, of any sort (please report in the issues page) +All data is licensed under the Open Data Commons Public Domain Dedication and License. All code is licensed under the MIT/BSD license. - * [ED] - Smaller edits +Note that while no credit is formally required a link back or credit to Rufus Pollock and the Open Knowledge Foundation is much appreciated. \ No newline at end of file From aab7dc4517bcfb50e462acf337480a143154dc46 Mon Sep 17 00:00:00 2001 From: gradedSystem Date: Tue, 3 Dec 2024 20:05:05 +0700 Subject: [PATCH 4/5] [update][s] Updating the README.md --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index ade818a..6bd2bbf 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,10 @@ run the script. `pip install -r scripts/requirements.txt` +Process is recorded and automated in python script: + +`scripts/process.py` + ## Automation Up-to-date (auto-updates every year) gdp dataset could be found on the datahub.io: From 99c81866d53becbbb9b73b6b970734e8a50acbf3 Mon Sep 17 00:00:00 2001 From: gradedSystem Date: Tue, 3 Dec 2024 20:14:53 +0700 Subject: [PATCH 5/5] [update][s] Update README.md --- README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6bd2bbf..e91a801 100644 --- a/README.md +++ b/README.md @@ -9,15 +9,18 @@ retrieved from [the World Bank](http://data.worldbank.org/indicator/SI.POV.GINI) ## Preparation -Run the requirements through `pip` in order to install all required packages to -run the script. +Run the requirements through `pip` in order to install all required packages to run the script. 
`pip install -r scripts/requirements.txt` -Process is recorded and automated in python script: +The process is recorded and automated in a Python script: `scripts/process.py` +Or, alternatively, just run: + +`make` + ## Automation Up-to-date (auto-updates every year) gdp dataset could be found on the datahub.io: