forked from opensciencegrid/topology
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1,710 changed files
with
15,407 additions
and
1,972 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
# Run through all the FACILITY.yaml files and check that each has an InstitutionID field that is either null or a known institution id. | ||
|
||
import yaml | ||
import sys | ||
import glob | ||
import requests | ||
|
||
# When True, provide_human_check_interface() writes the facility -> institution
# name mapping to facility_institution_mapping.yaml instead of printing it.
export_mapping = False

# Get the list of valid institution ids.
# The timeout keeps the job from hanging on an unresponsive server, and
# raise_for_status() reports an HTTP error directly instead of letting it
# surface later as a confusing JSON/KeyError failure.
response = requests.get(
    "https://topology-institutions.osg-htc.org/api/institution_ids",
    timeout=30,
)
response.raise_for_status()
topology_institutions = response.json()

# Lookup of the full institution record by id, and the bare set of known ids.
topology_institutions_by_id = {x['id']: x for x in topology_institutions}
topology_institution_ids = {x['id'] for x in topology_institutions}
|
||
def check_facility_institution_id(yaml_string: str):
    """Validate the InstitutionID entry of a single FACILITY.yaml document.

    Args:
        yaml_string: YAML content handed straight to ``yaml.load``.

    Raises:
        Exception: if the document is not a mapping, has no ``InstitutionID``
            key, or the value is neither ``None`` nor one of the ids in
            ``topology_institution_ids``.
    """
    # NOTE(review): yaml.Loader can construct arbitrary Python objects; consider
    # yaml.SafeLoader if these files can originate from untrusted contributions.
    facility = yaml.load(yaml_string, Loader=yaml.Loader)

    # An empty document loads as None and other non-mapping documents cannot be
    # keyed by 'InstitutionID'; report them as a missing field rather than
    # failing with an unrelated TypeError.
    if not isinstance(facility, dict) or 'InstitutionID' not in facility:
        raise Exception("FACILITY.yaml does not have an InstitutionID field")

    # A null InstitutionID is accepted; any other value must be a known id.
    institution_id = facility['InstitutionID']
    if institution_id is not None and institution_id not in topology_institution_ids:
        raise Exception(f"Invalid InstitutionID: {institution_id}")
|
||
def provide_human_check_interface(facility_files: list):
    """Report the institution name each facility resolves to.

    Builds a mapping from facility directory name to the ``name`` of the
    institution record matching the file's ``InstitutionID`` (``None`` when
    there is no matching record). The mapping is written to
    ``facility_institution_mapping.yaml`` when ``export_mapping`` is set,
    otherwise it is printed.

    Args:
        facility_files: paths of the FACILITY.yaml files to inspect.
    """
    names_by_facility = {}
    for path in facility_files:
        with open(path, 'r') as handle:
            # The facility name is the directory holding the FACILITY.yaml file.
            directory_name = path.split('/')[-2]
            document = yaml.load(handle, Loader=yaml.Loader)
            institution = topology_institutions_by_id.get(document['InstitutionID'], {})
            names_by_facility[directory_name] = institution.get('name', None)

    if not export_mapping:
        print(names_by_facility)
        return

    with open("facility_institution_mapping.yaml", 'w') as out:
        yaml.dump(names_by_facility, out)
|
||
def main():
    """Check every FACILITY.yaml and exit with status 1 if any is invalid.

    Each file is passed to ``check_facility_institution_id``; failures are
    collected per facility, printed, and turned into a non-zero exit. When all
    files pass, the facility -> institution mapping is shown through
    ``provide_human_check_interface``.
    """
    import os  # local import: only needed to locate this script

    # Anchor the pattern to this script's directory instead of the current
    # working directory. A cwd-relative pattern matches nothing when the script
    # is launched from elsewhere (e.g. the repository root), which would make
    # the check pass without inspecting a single file.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    facility_files = glob.glob(os.path.join(script_dir, "../../../topology/**/FACILITY.yaml"))

    # Check the files
    errors = []
    for file in facility_files:
        with open(file, 'r') as f:
            try:
                check_facility_institution_id(f)
            except Exception as e:
                # Record the facility directory name alongside the failure.
                errors.append((file.split("/")[-2], e))

    # Print the errors and exit if needed
    if errors:
        for facility_name, error in errors:
            print(f"Error in {facility_name}: \n\t {error}")
        sys.exit(1)

    provide_human_check_interface(facility_files)


if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions
2
.github/scripts/check_facility_has_institution_id/requirements.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
PyYAML==6.0.1 | ||
requests==2.32.0 |
188 changes: 188 additions & 0 deletions
188
.github/scripts/check_project_fos_precision/field_of_science.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
from functools import lru_cache | ||
from typing import Union | ||
import string | ||
|
||
import pandas as pd | ||
|
||
|
||
@lru_cache()
def get_cip_df():
    """Load the SED-CIP spreadsheet and add the parsed id columns.

    Reads ``data/SED-CIP-2022.xlsx`` (relative to the working directory),
    promotes the row at position 2 to column titles, keeps the rows after it,
    and derives ``BroadFieldId``, ``MajorFieldId`` and ``DetailedFieldId`` from
    the ``SED-CIP code`` column via ``get_id``.

    The result is memoized by ``lru_cache``, so every caller receives the same
    DataFrame object.

    Returns:
        The prepared DataFrame.
    """
    cip_df = pd.read_excel("data/SED-CIP-2022.xlsx")

    # Use the row at position 2 as the column titles and keep only the rows
    # that follow it.
    cip_df.columns = cip_df.iloc[2]
    # Take an explicit copy so the column assignments below write to an
    # independent frame rather than to a slice of the original
    # (avoids pandas' chained-assignment warning).
    cip_df = cip_df.iloc[3:].copy()

    cip_df["BroadFieldId"] = cip_df['SED-CIP code'].apply(lambda x: get_id(x, 0))
    cip_df["MajorFieldId"] = cip_df['SED-CIP code'].apply(lambda x: get_id(x, 1))
    cip_df["DetailedFieldId"] = cip_df['SED-CIP code'].apply(lambda x: get_id(x, 2))

    return cip_df
|
||
|
||
def get_matching_rows(cip_df, broad_id, major_id, detailed_id):
    """Return the rows of ``cip_df`` matching the ids at the finest grain available.

    The detailed match (broad + major + detailed) is tried first, then the
    major match (broad + major), then the broad match alone. The first
    non-empty selection is returned.

    Raises:
        ValueError: when not even the broad id matches any row.
    """
    same_broad = cip_df["BroadFieldId"] == broad_id
    same_major = same_broad & (cip_df['MajorFieldId'] == major_id)
    same_detailed = same_major & (cip_df["DetailedFieldId"] == detailed_id)

    # Walk from the finest grain to the coarsest and stop at the first hit.
    for selector in (same_detailed, same_major, same_broad):
        candidates = cip_df[selector]
        if len(candidates) > 0:
            return candidates

    raise ValueError(f"No matching rows for {broad_id}.{major_id}{detailed_id}")
|
||
|
||
def _most_common_value(cip_df, scope, column, candidates):
    """Return the candidate that labels the most rows of ``cip_df`` inside ``scope``.

    ``scope`` is a boolean row mask. Returns ``None`` when no candidate labels
    at least one row. On ties the first candidate iterated wins.
    """
    best_option = None
    max_rows = 0
    for candidate in candidates:
        row_count = len(cip_df[scope & (cip_df[column] == candidate)])
        if row_count > max_rows:
            max_rows = row_count
            best_option = candidate
    return best_option


def map_id_to_fields_of_science(id: str):
    """Map a field-of-science id to its broad, major and detailed field names.

    An exact match on the ``SED-CIP code`` column is returned directly.
    Otherwise the id is split with ``get_id`` and, for every component that is
    present, the label covering the most rows with that id prefix is chosen
    from the candidates found by ``get_matching_rows``.

    Args:
        id: the field-of-science id to look up.

    Returns:
        ``[broad_field, major_field, detailed_field]``; entries are ``None``
        when the corresponding component is absent or nothing matched.
    """
    cip_df = get_cip_df()

    # If we have a direct match, return it
    direct_match = cip_df[cip_df["SED-CIP code"] == id]
    if len(direct_match) > 0:
        return [
            direct_match["New broad field"].values[0],
            direct_match["New major field"].values[0],
            direct_match["New detailed field"].values[0],
        ]

    broad_id = get_id(id, 0)
    major_id = get_id(id, 1)
    detailed_id = get_id(id, 2)

    # Define the fields we hope to populate
    broad_field_of_science = None
    major_field_of_science = None
    detailed_field_of_science = None

    try:
        matching_rows = get_matching_rows(cip_df, broad_id, major_id, detailed_id)
    except ValueError:
        # Nothing matched even at the broad grain: report the id and give up.
        print(id)
        return [broad_field_of_science, major_field_of_science, detailed_field_of_science]

    # Row masks for each id prefix; every finer mask narrows the previous one.
    broad_scope = cip_df["BroadFieldId"] == broad_id
    major_scope = broad_scope & (cip_df["MajorFieldId"] == major_id)
    detailed_scope = major_scope & (cip_df["DetailedFieldId"] == detailed_id)

    possible_broad_fields = set(matching_rows["New broad field"])
    if broad_id is not None:
        best_option = _most_common_value(cip_df, broad_scope, "New broad field", possible_broad_fields)

        print(f"Broad Field: {broad_id}.{major_id}{detailed_id} has possible values {possible_broad_fields} we picked {best_option}")

        broad_field_of_science = best_option

    possible_major_fields = set(matching_rows["New major field"])
    if major_id is not None:
        best_option = _most_common_value(cip_df, major_scope, "New major field", possible_major_fields)

        print(f"Major Field: {broad_id}.{major_id}{detailed_id} has rows {possible_major_fields} we picked {best_option}")

        major_field_of_science = best_option

    possible_detailed_fields = set(matching_rows["New detailed field"])
    if detailed_id is not None:
        best_option = _most_common_value(cip_df, detailed_scope, "New detailed field", possible_detailed_fields)

        print(f"Detailed Field: {broad_id}.{major_id}{detailed_id} has rows {possible_detailed_fields} we picked {best_option}")

        detailed_field_of_science = best_option

    return [broad_field_of_science, major_field_of_science, detailed_field_of_science]
|
||
|
||
def get_id(id: Union[float, str], granularity: int) -> Union[str, None]:
    """Extract one two-digit component of a field-of-science id.

    The id is reduced to its digits and normalised to an even number of
    digits: a single-character part before the ``.`` gets a leading ``0``
    (``1.23`` -> ``0123``) and an odd digit count gets a trailing ``0``
    (``10.232`` -> ``102320``), which undoes the zeros lost when the id was
    read as a float.

    Args:
        id: the id as a string or number; missing values (``None``/NaN) are
            accepted.
        granularity: ``0`` for the broad component (first two digits), ``1``
            for the major component (digits 3-4), ``2`` for the detailed
            component (everything after the fourth digit).

    Returns:
        The requested component, or ``None`` when the id is missing, does not
        carry that many digits, or ``granularity`` is not 0, 1 or 2.
    """
    # Check if None
    if pd.isna(id):
        return None

    # Fix up issues from reading the id as a float
    text = str(id)
    digits = [char for char in text if char in string.digits]

    # If the first part is preceded with a 0, (01.2023)
    if len(text.split(".")[0]) == 1:
        digits = ['0', *digits]

    # If the number ends with a 0, (10.2320)
    if len(digits) % 2 == 1:
        digits = [*digits, '0']

    if granularity == 0:
        return "".join(digits[:2])

    if granularity == 1:
        return "".join(digits[2:4]) if len(digits) >= 4 else None

    if granularity == 2:
        return "".join(digits[4:]) if len(digits) >= 6 else None

    return None
|
||
|
||
def tests():
    """Run the module's self-checks, raising ``ValueError`` on the first failure.

    Covers ``get_id`` at each granularity (including ids that lost zeros when
    read as floats) and one ``map_id_to_fields_of_science`` lookup.
    """
    # ((id, granularity), expected component)
    get_id_expectations = (
        ((1.0, 0), "01"),
        ((1.0, 1), "00"),
        ((10.2320, 2), "20"),
        ((10.2320, 1), "23"),
        ((10.2320, 0), "10"),
        ((1.23, 2), None),
        ((1.23, 0), "01"),
    )
    for arguments, expected in get_id_expectations:
        if get_id(*arguments) != expected:
            raise ValueError("Test failed")

    expected_fields = ['Biological and biomedical sciences', 'Neurobiology and neurosciences', None]
    if map_id_to_fields_of_science("26.15") != expected_fields:
        raise ValueError("Test failed")


if __name__ == "__main__":
    tests()
    print("All tests passed")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import sys | ||
import datetime | ||
|
||
import yaml | ||
import requests | ||
|
||
from field_of_science import get_id | ||
|
||
|
||
def get_active_projects(start_date: datetime.datetime):
    """Return the project names that have Payload records since ``start_date``.

    Sends a search request to the ``gracc.osg.summary`` ``_search`` endpoint,
    filtered to ``ResourceType == "Payload"`` and an ``EndTime`` between
    ``start_date`` and now (both sent as epoch milliseconds), aggregated by
    ``ProjectName``.

    Args:
        start_date: lower bound of the ``EndTime`` range.

    Returns:
        The keys of the ``projects`` aggregation buckets.

    Raises:
        requests.HTTPError: if the service answers with an error status.
    """
    response = requests.get(
        "https://gracc.opensciencegrid.org/q/gracc.osg.summary/_search",
        json={
            "size": 0,
            "query": {
                "bool": {
                    "filter": [
                        {
                            "term": {
                                "ResourceType": "Payload"
                            }
                        },
                        {
                            "range": {
                                "EndTime": {
                                    "lte": int(datetime.datetime.now().timestamp() * 1000),
                                    "gte": int(start_date.timestamp() * 1000)
                                }
                            }
                        }
                    ]
                },
            },
            "aggs": {
                "projects": {
                    "terms": {
                        "field": "ProjectName",
                        "size": 99999999
                    },
                    "aggs": {
                        "projectJobsRan": {
                            "sum": {
                                "field": "Njobs"
                            }
                        }
                    }
                }
            }
        },
        # Do not let an unresponsive service hang the check indefinitely.
        timeout=60,
    )
    # Fail on an HTTP error here rather than with a KeyError on
    # 'aggregations' further down.
    response.raise_for_status()

    data = response.json()

    return [bucket['key'] for bucket in data['aggregations']['projects']['buckets']]
|
||
|
||
|
||
def has_detailed_precision(id: str):
    """Return True when ``get_id`` finds a granularity-1 component in ``id``."""
    major_component = get_id(id, granularity=1)
    return major_component is not None
|
||
|
||
def main():
    """Fail when a recently active project lacks a precise FieldOfScienceID.

    Fetches the projects active during the last 365 days, loads each project's
    YAML file, and collects those whose ``FieldOfScienceID`` is missing or
    fails ``has_detailed_precision``.

    Raises:
        Exception: if at least one such project was found; the individual
            messages are written to stderr first.
    """
    from pathlib import Path  # local import: only needed to locate this script

    # Resolve the projects directory relative to this script, not the current
    # working directory, so the files are found wherever the script is run from.
    projects_dir = Path(__file__).resolve().parent / "../../../projects"

    one_year_ago = datetime.datetime.now() - datetime.timedelta(days=365)
    active_project_names = get_active_projects(one_year_ago)

    print(active_project_names)

    exceptions = []
    for project_name in active_project_names:
        try:
            # Use a context manager so the file handle is closed promptly.
            with open(projects_dir / f"{project_name}.yaml") as project_file:
                project_data = yaml.load(project_file, Loader=yaml.Loader)
        except FileNotFoundError:
            # NOTE(review): projects without a file here are skipped, as in the
            # original code — presumably they are out of scope; confirm.
            continue

        # An empty document loads as None; treat it like a missing field.
        if (
            not project_data
            or "FieldOfScienceID" not in project_data
            or not has_detailed_precision(project_data["FieldOfScienceID"])
        ):
            exceptions.append(f"Project {project_name} is running in the OSPool without detailed precision.")

    if exceptions:
        # `file=` is required: a positional sys.stderr would just be printed
        # to stdout as a second value.
        print("\n".join(exceptions), file=sys.stderr)
        raise Exception("Projects without detailed precision need to be updated.")


if __name__ == "__main__":
    main()
12 changes: 12 additions & 0 deletions
12
.github/scripts/check_project_fos_precision/requirements.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
certifi==2024.7.4 | ||
charset-normalizer==3.3.2 | ||
idna==3.7 | ||
numpy==1.26.4 | ||
pandas==2.2.2 | ||
python-dateutil==2.9.0.post0 | ||
pytz==2024.1 | ||
PyYAML==6.0.1 | ||
requests==2.32.0 | ||
six==1.16.0 | ||
tzdata==2024.1 | ||
urllib3==2.2.2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Runs on pull requests targeting main: sets up Python, installs the pinned
# requirements and executes the check script.
# NOTE(review): the workflow is named "Check Project FOS Precision" but the
# steps install and run check_facility_has_institution_id — confirm which of
# the two is intended.
name: Check Project FOS Precision
on:
  pull_request:
    branches:
      - main

jobs:
  check:
    name: Check
    runs-on: ubuntu-latest
    # Only run when the repository name starts with 'opensciencegrid/'.
    if: startsWith(github.repository, 'opensciencegrid/')
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.9.15
          cache: 'pip' # caching pip dependencies
      - run: pip install -r ./.github/scripts/check_facility_has_institution_id/requirements.txt
      - run: python ./.github/scripts/check_facility_has_institution_id/main.py
Oops, something went wrong.