Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clean up files/directories, add auto-formatting #11

Merged
merged 2 commits into from
Jan 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .github/workflows/black.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
name: Lint
on:
pull_request:
paths: ['**/*.py']
jobs:
lint:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v3
- uses: psf/black@stable
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
name: Check YAML Consistency
name: Check Output Consistency
'on':
pull_request:
paths:
- '**/*.yaml'
jobs:
check_yaml:
check_output:
runs-on: ubuntu-latest
steps:
- name: Checkout code
Expand All @@ -17,5 +17,5 @@ jobs:
run: pip install -r requirements.txt
- name: Run export and check consistency
run: |
python export.py
python scripts/export.py
git diff --exit-code
5 changes: 2 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
- name: Validate YAML
run: |
python --version
python validate_yaml.py
python scripts/validate_yaml.py

if_merged:
if: github.event.pull_request.merged == true
Expand All @@ -49,8 +49,7 @@ jobs:
cache: 'pip' # caching pip dependencies
- run: |
pip install -r requirements.txt
python validate_yaml.py
python generate_model.py
python scripts/validate_yaml.py
- uses: mikeal/publish-to-github-action@master
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
Expand Down
16 changes: 13 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,19 @@
repos:
- repo: local
hooks:
- id: validate-yaml
name: Validate YAML against the schema
language: python
entry: python scripts/validate_yaml.py
pass_filenames: false
additional_dependencies: [pyyaml, jsonschema]
- id: export-yaml
name: Export YAML to CSV and JSON
language: python
entry: python export.py
additional_dependencies: ['pyyaml']

entry: python scripts/export.py
pass_filenames: false
additional_dependencies: [pyyaml]
- repo: https://github.com/psf/black
rev: 22.10.0
hooks:
- id: black
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ Output formats, such as CSV and JSON are in the `generated/` directory.

Schemas for the YAML data can be found in the `schemas` directory, along with descriptions for each field.
These schemas are in [JSON Schema](https://json-schema.org/) format, but represented in YAML for simplicity.
The `validate_yaml.py` script validates all brands and companies using the schemas.
The `scripts/validate_yaml.py` script validates all brands and companies using the schemas.

## Useful Resources & Links

Expand Down
1 change: 0 additions & 1 deletion generate_model.py

This file was deleted.

Empty file removed generated/placeholder.txt
Empty file.
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
black == 23.12.1
jsonschema == 4.20.0
pre-commit == 3.6.0
pyyaml == 6.0.1
77 changes: 42 additions & 35 deletions export.py → scripts/export.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,32 @@

import yaml
import csv
import json
import csv
import os


def read_yaml(file_path):
    """Load and return the parsed contents of a YAML file.

    Args:
        file_path: Path to a YAML document.

    Returns:
        The object produced by ``yaml.safe_load`` (typically a dict or list;
        ``None`` for an empty file).
    """
    # safe_load never constructs arbitrary Python objects, unlike yaml.load.
    with open(file_path, "r") as file:
        return yaml.safe_load(file)


def clean_value(value):
    """Normalize a YAML value for CSV output.

    Lists are flattened into a single comma-separated string so they fit in
    one CSV cell; every other value (including None) passes through unchanged.
    """
    if isinstance(value, list):
        return ", ".join(map(str, value))
    return value


def export_to_csv(input_dir, output_csv, schema_file):
schema = read_yaml(schema_file)

with open(output_csv, 'w', newline='') as csvfile:
schema_fields = list(schema['properties'].keys())
if 'stakeholders' in schema_fields:
with open(output_csv, "w", newline="") as csvfile:
schema_fields = list(schema["properties"].keys())
if "stakeholders" in schema_fields:
# Haven't decided how to represent stakeholders in the CSV format, so just remove it for now.
schema_fields.remove('stakeholders')
fieldnames = ['id'] + schema_fields
schema_fields.remove("stakeholders")
fieldnames = ["id"] + schema_fields
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

# Write header based on the schema
Expand All @@ -31,59 +35,62 @@ def export_to_csv(input_dir, output_csv, schema_file):
for yaml_file in sorted(os.listdir(input_dir)):
if yaml_file.endswith(".yaml"):
yaml_file_path = os.path.join(input_dir, yaml_file)
with open(yaml_file_path, 'r') as file:
with open(yaml_file_path, "r") as file:
data = yaml.safe_load(file)

cleaned_data = {key: clean_value(data.get(key, None)) for key in fieldnames}
cleaned_data['id'] = os.path.splitext(yaml_file)[0]

cleaned_data = {
key: clean_value(data.get(key, None)) for key in fieldnames
}
cleaned_data["id"] = os.path.splitext(yaml_file)[0]

writer.writerow(cleaned_data)

print(f"Converted {yaml_file} to CSV")


def convert_yaml_to_json(directory_path, key):
    """Read every ``.yaml`` file in *directory_path* into one mapping.

    Each file becomes an entry keyed by its basename (without extension); the
    entry is the parsed YAML mapping with an added ``"id"`` field equal to
    that basename.

    Args:
        directory_path: Directory containing ``.yaml`` files.
        key: Top-level key to wrap the result under (e.g. "brands").

    Returns:
        ``{key: {file_id: {"id": file_id, **yaml_contents}, ...}}``
    """
    data = {}
    # Sorted listing gives a deterministic output order (reproducible diffs).
    for file_name in sorted(os.listdir(directory_path)):
        if file_name.endswith(".yaml"):
            file_path = os.path.join(directory_path, file_name)
            with open(file_path, "r") as yaml_file:
                yaml_data = yaml.safe_load(yaml_file)
            file_id = os.path.splitext(file_name)[0]
            data[file_id] = {"id": file_id, **yaml_data}
    return {key: data}


def export_to_json(directory1, directory2, output_json):
    """Combine brand and company YAML directories into a single JSON file.

    Args:
        directory1: Directory of brand YAML files.
        directory2: Directory of company YAML files.
        output_json: Path of the JSON file to write.
    """
    brands_data = convert_yaml_to_json(directory1, "brands")
    companies_data = convert_yaml_to_json(directory2, "companies")

    combined_data = {**brands_data, **companies_data}

    with open(output_json, "w") as json_file:
        json.dump(combined_data, json_file, indent=2)

    # Plain string: the original used an f-string with nothing to interpolate.
    print("Converted data to JSON")


if __name__ == "__main__":
    # Input: one YAML file per brand/company under data/.
    brands_yaml = "data/brands"
    companies_yaml = "data/companies"

    # Outputs: per-collection CSVs plus one combined JSON document.
    brands_csv_file = "output/csv/brands.csv"
    companies_csv_file = "output/csv/companies.csv"
    data_json_file = "output/json/data.json"

    # Schemas drive the CSV column set (see export_to_csv).
    brand_schema = "schemas/brand_schema.yaml"
    company_schema = "schemas/company_schema.yaml"

    export_to_csv(brands_yaml, brands_csv_file, brand_schema)
    export_to_csv(companies_yaml, companies_csv_file, company_schema)
    export_to_json(brands_yaml, companies_yaml, data_json_file)
78 changes: 39 additions & 39 deletions import.py → scripts/import.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,18 +57,21 @@ def parent_from_details(details):
return parse_for_parent[1]
return ""


# Custom representer controlling how strings are styled in dumped YAML.
def literal_presenter(dumper, data):
    """Choose a YAML scalar style for the string *data*.

    Rules, in order:
      - literal block style (``|``) for multiline or long (>30 char) strings;
      - double-quoted style for strings containing a space;
      - an empty string is emitted as a YAML null (bare ``key:``);
      - anything else uses the default plain style.
    """
    if "\n" in data or len(data) > 30:
        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
    if " " in data:
        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style='"')
    # `not data` already covers "" — the original's `or data == ""` was redundant.
    if not data:
        return dumper.represent_scalar("tag:yaml.org,2002:null", "")
    return dumper.represent_scalar("tag:yaml.org,2002:str", data)


def none_representer(dumper, data):
    """Emit Python ``None`` as an empty scalar (bare ``key:``) instead of ``null``."""
    return dumper.represent_scalar("tag:yaml.org,2002:null", "")


# Apply the custom representer
yaml.add_representer(str, literal_presenter)
Expand All @@ -90,18 +93,20 @@ def create_data_model(data, default_level=ALTERNATIVE):
# country = alternative.get('attributes').get('Market', None)
# categories_raw = alternative.get('attributes').get('tags')
# categories = categories.split(",") if categories_raw else [] #split_categories(categories_raw)
brand_name = row.get('attributes').get('name')
reason = row.get('attributes').get('proof')

brand_name = row.get("attributes").get("name")
reason = row.get("attributes").get("proof")
parent_name = parent_from_details(reason)
source_url = row.get('attributes').get('proofUrl')
image_url = row.get('attributes').get('imageUrl')
website = row.get('attributes').get('Website', '')
level = row.get('attributes').get('Level', default_level)
country = row.get('attributes').get('Market', None)
categories_raw = row.get('attributes').get('tags')
categories = categories.split(",") if categories_raw else [] #split_categories(categories_raw)
location = GLOBAL if not country or country == '' else country
source_url = row.get("attributes").get("proofUrl")
image_url = row.get("attributes").get("imageUrl")
website = row.get("attributes").get("Website", "")
level = row.get("attributes").get("Level", default_level)
country = row.get("attributes").get("Market", None)
categories_raw = row.get("attributes").get("tags")
categories = (
categories.split(",") if categories_raw else []
) # split_categories(categories_raw)
location = GLOBAL if not country or country == "" else country

if not brand_name:
continue
Expand All @@ -111,51 +116,46 @@ def create_data_model(data, default_level=ALTERNATIVE):
NAME: parent_name,
LOCATION: location,
LEVEL: level,
DETAILS: {
REASON: reason,
SOURCE: source_url
}
DETAILS: {REASON: reason, SOURCE: source_url},
}
yaml_data[brand_name][PARENTS].append(new_parent)
else:
yaml_data[brand_name] = {
NAME: brand_name,
WEBSITE: website,
IMAGE_URL: image_url,
CATEGORIES: categories,
PARENTS: [{
NAME: parent_name,
LOCATION: location,
LEVEL: level,
DETAILS: {
REASON: reason,
SOURCE: source_url
}
}]
NAME: brand_name,
WEBSITE: website,
IMAGE_URL: image_url,
CATEGORIES: categories,
PARENTS: [
{
NAME: parent_name,
LOCATION: location,
LEVEL: level,
DETAILS: {REASON: reason, SOURCE: source_url},
}
],
}
return yaml_data


def write_yaml(data, file_name):
    """Dump the brand mapping to *file_name* as one YAML document.

    The values of *data* are wrapped under a top-level ``brands`` list.
    ``sort_keys=False`` preserves insertion order so the output matches the
    order the data model was built in.
    """
    data_list = {"brands": list(data.values())}
    with open(file_name, "w", encoding="utf-8") as yaml_file:
        yaml.dump(data_list, yaml_file, default_flow_style=False, sort_keys=False)


def json_to_csv(json_file, output):
    """Convert a scraped JSON dump into the repo's YAML data model.

    NOTE(review): despite the name, this writes YAML (via write_yaml), not
    CSV — renaming to json_to_yaml would be clearer, kept for caller
    compatibility.

    Args:
        json_file: Path to the input JSON file.
        output: Path of the YAML file to write.
    """
    with open(json_file, encoding="utf-8") as fh:
        data = json.load(fh)
    yaml_data = create_data_model(data)
    write_yaml(yaml_data, output)
    # Removed leftover pdb.set_trace(): a forgotten debugger breakpoint that
    # would halt every run waiting for interactive input.


if __name__ == "__main__":
    # Usage: python scripts/import.py <input.json> <output.yaml>
    args = sys.argv[1:]

    file_to_parse = args[0]
    output_yaml = args[1]

    json_to_csv(file_to_parse, output_yaml)


Loading
Loading