Skip to content

Commit

Permalink
Merge branch 'master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
akmohapatra authored Oct 28, 2024
2 parents de3d83d + 247f31c commit 2dd6830
Show file tree
Hide file tree
Showing 1,710 changed files with 15,407 additions and 1,972 deletions.
64 changes: 64 additions & 0 deletions .github/scripts/check_facility_has_institution_id/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Run through all the FACILITY.yaml files and check that each one has a valid InstitutionID field.

import yaml
import sys
import glob
import requests

# When True, provide_human_check_interface() writes the facility -> institution
# mapping to a YAML file instead of printing it.
export_mapping = False

# Get the list of valid institution ids.
# A timeout keeps the job from hanging on an unresponsive service, and
# raise_for_status() surfaces an HTTP error directly instead of letting
# .json() choke on an error page.
response = requests.get("https://topology-institutions.osg-htc.org/api/institution_ids", timeout=30)
response.raise_for_status()
topology_institutions = response.json()
# Each entry is indexed by its 'id'; the id set is what the validation uses.
topology_institutions_by_id = {x['id']: x for x in topology_institutions}
topology_institution_ids = set(topology_institutions_by_id)

def check_facility_institution_id(yaml_string: str):
    """Validate the InstitutionID of a single FACILITY.yaml document.

    Args:
        yaml_string: The YAML to parse. ``yaml.load`` accepts text or an open
            stream; ``main`` passes an open file object.

    Raises:
        Exception: If the document is not a mapping, has no ``InstitutionID``
            key, or has a non-null ``InstitutionID`` that is not in
            ``topology_institution_ids``.
    """
    # NOTE(review): yaml.Loader can construct arbitrary Python objects. If
    # these files can come from untrusted contributors, consider
    # yaml.safe_load instead.
    facility = yaml.load(yaml_string, Loader=yaml.Loader)

    # An empty file parses to None and a malformed one may parse to a scalar
    # or list; report those as a missing field rather than an opaque TypeError.
    if not isinstance(facility, dict) or 'InstitutionID' not in facility:
        raise Exception("FACILITY.yaml does not have an InstitutionID field")

    # The key must be present, but an explicit null value is accepted.
    institution_id = facility['InstitutionID']
    if institution_id is not None and institution_id not in topology_institution_ids:
        raise Exception(f"Invalid InstitutionID: {institution_id}")

def provide_human_check_interface(facility_files: list):
    """Show which institution name each facility resolves to.

    Builds a ``{facility directory name: institution name}`` mapping, using
    ``None`` when the facility's InstitutionID is not a known id. The mapping
    is written to ``facility_institution_mapping.yaml`` when ``export_mapping``
    is set, and printed otherwise.

    Args:
        facility_files: Paths of the FACILITY.yaml files to summarise.
    """
    mapping = {}
    for path in facility_files:
        with open(path, 'r') as handle:
            # The facility is named after the directory holding FACILITY.yaml.
            name = path.split('/')[-2]
            parsed = yaml.load(handle, Loader=yaml.Loader)
            institution = topology_institutions_by_id.get(parsed['InstitutionID'], {})
            mapping[name] = institution.get('name', None)

    if not export_mapping:
        print(mapping)
        return

    with open("facility_institution_mapping.yaml", 'w') as out:
        yaml.dump(mapping, out)

def main():
    """Validate every facility's InstitutionID and exit non-zero on failure.

    Collects one error per failing FACILITY.yaml, prints them all, and exits
    with status 1 if there are any. When every file passes, hands the file
    list to ``provide_human_check_interface``.
    """
    import os  # local import: only needed to anchor the search path

    # Anchor the search to this script's location instead of the current
    # working directory, so the check finds the same files no matter where it
    # is launched from (a cwd-relative "../../../topology" matches nothing when
    # the script is started from the repository root).
    script_dir = os.path.dirname(os.path.abspath(__file__))
    topology_dir = os.path.normpath(os.path.join(script_dir, "..", "..", "..", "topology"))
    # Without recursive=True, "**" behaves like "*": one directory level.
    facility_files = glob.glob(os.path.join(glob.escape(topology_dir), "**", "FACILITY.yaml"))

    # Zero matches means the check would pass without checking anything.
    if not facility_files:
        print(f"No FACILITY.yaml files found under {topology_dir}")
        sys.exit(1)

    # Check the files
    errors = []
    for file in facility_files:
        with open(file, 'r') as f:
            try:
                check_facility_institution_id(f)
            except Exception as e:
                # Collect every failure so one run reports them all; the
                # facility is named after the directory holding the file.
                errors.append((os.path.basename(os.path.dirname(file)), e))

    # Print the errors and exit if needed
    if errors:
        for facility_name, error in errors:
            print(f"Error in {facility_name}: \n\t {error}")
        sys.exit(1)

    provide_human_check_interface(facility_files)


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
PyYAML==6.0.1
requests==2.32.0
188 changes: 188 additions & 0 deletions .github/scripts/check_project_fos_precision/field_of_science.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
from functools import lru_cache
from typing import Union
import string

import pandas as pd


@lru_cache()
def get_cip_df():
    """Load the SED-CIP 2022 spreadsheet and add per-level id columns.

    The result is cached, so every caller receives the same DataFrame object;
    callers should treat it as read-only.

    Returns:
        The spreadsheet rows with three extra columns, ``BroadFieldId``,
        ``MajorFieldId`` and ``DetailedFieldId``, derived from the
        ``SED-CIP code`` column via ``get_id``.
    """
    # NOTE(review): this path is relative to the current working directory.
    cip_df = pd.read_excel("data/SED-CIP-2022.xlsx")

    # The real column titles are in the row at position 2; promote that row to
    # the header and keep only the rows after it.
    cip_df.columns = cip_df.iloc[2]
    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the original (avoids SettingWithCopyWarning).
    cip_df = cip_df.iloc[3:].copy()

    for column, granularity in (("BroadFieldId", 0), ("MajorFieldId", 1), ("DetailedFieldId", 2)):
        cip_df[column] = cip_df['SED-CIP code'].apply(get_id, args=(granularity,))

    return cip_df


def get_matching_rows(cip_df, broad_id, major_id, detailed_id):
    """Return the rows of ``cip_df`` at the finest level the ids match.

    Tries broad+major+detailed first, then broad+major, then broad only, and
    returns the first non-empty selection.

    Raises:
        ValueError: If not even the broad id matches any row.
    """
    broad_mask = cip_df["BroadFieldId"] == broad_id
    major_mask = broad_mask & (cip_df['MajorFieldId'] == major_id)
    detailed_mask = major_mask & (cip_df["DetailedFieldId"] == detailed_id)

    # Finest grain first, falling back to coarser ones.
    for mask in (detailed_mask, major_mask, broad_mask):
        selection = cip_df[mask]
        if len(selection) > 0:
            return selection

    raise ValueError(f"No matching rows for {broad_id}.{major_id}{detailed_id}")


def _most_frequent_label(cip_df, scope_mask, column, candidates):
    """Return the candidate that labels the most ``cip_df`` rows inside ``scope_mask`` (None if none do)."""
    best_option = None
    max_rows = 0
    for candidate in candidates:
        row_count = int((scope_mask & (cip_df[column] == candidate)).sum())
        # Strict '>' keeps the earliest candidate on a tie, so the outcome
        # depends only on the order of ``candidates``.
        if row_count > max_rows:
            max_rows = row_count
            best_option = candidate
    return best_option


def map_id_to_fields_of_science(id: str):
    """Map a SED-CIP code to ``[broad, major, detailed]`` field-of-science names.

    An exact match on the ``SED-CIP code`` column is returned as-is. Otherwise
    the code is split into broad/major/detailed ids, the closest rows are found
    with ``get_matching_rows``, and for each level present in the code the name
    attached to the most rows sharing that id prefix is chosen. Ties go to the
    name that appears first in the matching rows, which keeps the result stable
    from run to run (iterating a set of strings is not).

    Args:
        id: The SED-CIP code to look up.

    Returns:
        A three-item list. A level is ``None`` when the code does not reach
        that level; all three are ``None`` when no row matches at all (the
        code is printed in that case).
    """
    cip_df = get_cip_df()

    # If we have a direct match, return it
    direct_match = cip_df[cip_df["SED-CIP code"] == id]
    if len(direct_match) > 0:
        return [
            direct_match["New broad field"].values[0],
            direct_match["New major field"].values[0],
            direct_match["New detailed field"].values[0],
        ]

    broad_id = get_id(id, 0)
    major_id = get_id(id, 1)
    detailed_id = get_id(id, 2)

    try:
        matching_rows = get_matching_rows(cip_df, broad_id, major_id, detailed_id)
    except ValueError:
        print(id)
        return [None, None, None]

    # Row scopes narrow from broad to detailed; each level's votes are counted
    # only among rows sharing the id prefix down to that level.
    broad_scope = cip_df["BroadFieldId"] == broad_id
    major_scope = broad_scope & (cip_df['MajorFieldId'] == major_id)
    detailed_scope = major_scope & (cip_df["DetailedFieldId"] == detailed_id)

    broad_field_of_science = None
    major_field_of_science = None
    detailed_field_of_science = None

    if broad_id is not None:
        # dict.fromkeys de-duplicates while keeping first-appearance order.
        candidates = list(dict.fromkeys(matching_rows["New broad field"]))
        broad_field_of_science = _most_frequent_label(cip_df, broad_scope, "New broad field", candidates)
        print(f"Broad Field: {broad_id}.{major_id}{detailed_id} has possible values {set(candidates)} we picked {broad_field_of_science}")

    if major_id is not None:
        candidates = list(dict.fromkeys(matching_rows["New major field"]))
        major_field_of_science = _most_frequent_label(cip_df, major_scope, "New major field", candidates)
        print(f"Major Field: {broad_id}.{major_id}{detailed_id} has rows {set(candidates)} we picked {major_field_of_science}")

    if detailed_id is not None:
        candidates = list(dict.fromkeys(matching_rows["New detailed field"]))
        detailed_field_of_science = _most_frequent_label(cip_df, detailed_scope, "New detailed field", candidates)
        print(f"Detailed Field: {broad_id}.{major_id}{detailed_id} has rows {set(candidates)} we picked {detailed_field_of_science}")

    return [broad_field_of_science, major_field_of_science, detailed_field_of_science]


def get_id(id: Union[float, str], granularity: int):
    """Extract one level of a SED-CIP code as a digit string.

    The code may arrive as a float (for example when read from a spreadsheet),
    which drops a leading zero (``01.2023`` -> ``1.2023``) and trailing zeros
    (``10.2320`` -> ``10.232``); both are restored before slicing.

    Args:
        id: The code, as a string or a float. NaN/None is accepted.
        granularity: 0 for the broad level (first two digits), 1 for the major
            level (digits 3-4), 2 for the detailed level (digit 5 onwards).

    Returns:
        The digits for the requested level, or ``None`` when ``id`` is missing,
        the code has no digits at that level, or ``granularity`` is not 0, 1
        or 2.
    """
    # Check if None
    if pd.isna(id):
        return None

    # Fix up issues from reading the id as a float
    code_text = str(id)
    digits = [x for x in code_text if x in string.digits]

    # A one-character integer part means the leading zero was lost (01.2023)
    if len(code_text.split(".")[0]) == 1:
        digits = ['0', *digits]

    # An odd digit count is taken to mean a trailing zero was lost (10.2320).
    # After this step the count is always even.
    if len(digits) % 2 == 1:
        digits = [*digits, '0']

    if granularity == 0:
        return "".join(digits[:2])

    if granularity == 1:
        return "".join(digits[2:4]) if len(digits) >= 4 else None

    if granularity == 2:
        return "".join(digits[4:]) if len(digits) >= 6 else None

    return None


def tests():
    """Run this module's self-checks.

    Raises:
        ValueError: On the first failing check, naming the call and the
            actual and expected values.
    """
    # (code, granularity, expected result of get_id)
    get_id_cases = [
        (1.0, 0, "01"),
        (1.0, 1, "00"),
        (10.2320, 2, "20"),
        (10.2320, 1, "23"),
        (10.2320, 0, "10"),
        (1.23, 2, None),
        (1.23, 0, "01"),
    ]
    for code, granularity, expected in get_id_cases:
        actual = get_id(code, granularity)
        if actual != expected:
            raise ValueError(
                f"Test failed: get_id({code!r}, {granularity}) returned {actual!r}, expected {expected!r}"
            )

    expected_fields = ['Biological and biomedical sciences', 'Neurobiology and neurosciences', None]
    actual_fields = map_id_to_fields_of_science("26.15")
    if actual_fields != expected_fields:
        raise ValueError(
            f"Test failed: map_id_to_fields_of_science('26.15') returned {actual_fields!r}, expected {expected_fields!r}"
        )


if __name__ == "__main__":
    tests()
    print("All tests passed")
88 changes: 88 additions & 0 deletions .github/scripts/check_project_fos_precision/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import sys
import datetime

import yaml
import requests

from field_of_science import get_id


def get_active_projects(start_date: datetime.datetime):
    """Return the project names with accounting records since ``start_date``.

    Queries the ``gracc.osg.summary`` index for records with
    ``ResourceType`` "Payload" whose ``EndTime`` lies between ``start_date``
    and now, aggregated by ``ProjectName``.

    Args:
        start_date: Lower bound of the ``EndTime`` range.

    Returns:
        The ``key`` of every bucket in the ``projects`` aggregation.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    # The range bounds are sent as epoch milliseconds.
    now_ms = int(datetime.datetime.now().timestamp() * 1000)
    start_ms = int(start_date.timestamp() * 1000)

    query = {
        "size": 0,
        "query": {
            "bool": {
                "filter": [
                    {
                        "term": {
                            "ResourceType": "Payload"
                        }
                    },
                    {
                        "range": {
                            "EndTime": {
                                "lte": now_ms,
                                "gte": start_ms
                            }
                        }
                    }
                ]
            },
        },
        "aggs": {
            "projects": {
                "terms": {
                    "field": "ProjectName",
                    "size": 99999999
                },
                "aggs": {
                    "projectJobsRan": {
                        "sum": {
                            "field": "Njobs"
                        }
                    }
                }
            }
        }
    }

    # A timeout keeps the job from hanging, and raise_for_status() reports an
    # HTTP failure directly instead of as a KeyError on the missing
    # 'aggregations' key below.
    response = requests.get(
        "https://gracc.opensciencegrid.org/q/gracc.osg.summary/_search",
        json=query,
        timeout=60,
    )
    response.raise_for_status()

    data = response.json()

    return [x['key'] for x in data['aggregations']['projects']['buckets']]



def has_detailed_precision(id: str):
    """Return True when ``get_id`` finds a granularity-1 component in ``id``."""
    major_component = get_id(id, granularity=1)
    return major_component is not None


def main():
    """Fail when a recently active project lacks a precise FieldOfScienceID.

    Looks up the projects active in the last 365 days, loads each project's
    YAML file, and collects those with no ``FieldOfScienceID`` or one that
    ``has_detailed_precision`` rejects.

    Raises:
        Exception: If at least one such project was found; the list is written
            to stderr first.
    """
    import os  # local import: only needed to anchor the projects path

    one_year_ago = datetime.datetime.now() - datetime.timedelta(days=365)
    active_project_names = get_active_projects(one_year_ago)

    print(active_project_names)

    # Anchor the lookup to this script's location so the result does not
    # depend on the directory the script is launched from.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    projects_dir = os.path.join(script_dir, "..", "..", "..", "projects")

    exceptions = []
    for project_name in active_project_names:
        project_path = os.path.join(projects_dir, f"{project_name}.yaml")
        try:
            # NOTE(review): yaml.Loader can construct arbitrary Python
            # objects; consider yaml.safe_load if these files may be untrusted.
            with open(project_path) as project_file:
                project_data = yaml.load(project_file, Loader=yaml.Loader)
        except FileNotFoundError:
            # Active project names with no YAML file are skipped on purpose.
            continue

        # An empty file parses to None; treat that like a missing field.
        if (
            not isinstance(project_data, dict)
            or "FieldOfScienceID" not in project_data
            or not has_detailed_precision(project_data["FieldOfScienceID"])
        ):
            exceptions.append(f"Project {project_name} is running in the OSPool without detailed precision.")

    if exceptions:
        # file= sends the report to stderr; passing sys.stderr positionally
        # would print the stream object to stdout instead.
        print("\n".join(exceptions), file=sys.stderr)
        raise Exception("Projects without detailed precision need to be updated.")


if __name__ == "__main__":
    main()
12 changes: 12 additions & 0 deletions .github/scripts/check_project_fos_precision/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
certifi==2024.7.4
charset-normalizer==3.3.2
idna==3.7
numpy==1.26.4
pandas==2.2.2
python-dateutil==2.9.0.post0
pytz==2024.1
PyYAML==6.0.1
requests==2.32.0
six==1.16.0
tzdata==2024.1
urllib3==2.2.2
20 changes: 20 additions & 0 deletions .github/workflows/check_facility_institution_id.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Runs the facility InstitutionID check on pull requests.
name: Check Facility Institution ID
on:
  pull_request:
    # NOTE(review): confirm 'main' is the branch pull requests actually target.
    branches:
      - main

jobs:
  check:
    name: Check
    runs-on: ubuntu-latest
    # Only run for repositories owned by the opensciencegrid organization.
    if: startsWith(github.repository, 'opensciencegrid/')
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.9.15
          cache: 'pip' # caching pip dependencies
      - run: pip install -r ./.github/scripts/check_facility_has_institution_id/requirements.txt
      - run: python ./.github/scripts/check_facility_has_institution_id/main.py
Loading

0 comments on commit 2dd6830

Please sign in to comment.