Skip to content

Commit

Permalink
Field of science ids (#3809)
Browse files Browse the repository at this point in the history
* Add GHA

* Add FieldOfScienceIDs
  • Loading branch information
CannonLock authored Apr 23, 2024
1 parent 5910d49 commit d88340c
Show file tree
Hide file tree
Showing 1,132 changed files with 1,494 additions and 0 deletions.
188 changes: 188 additions & 0 deletions .github/scripts/check_project_fos_precision/field_of_science.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
from functools import lru_cache
from typing import Union
import string

import pandas as pd


@lru_cache()
def get_cip_df():
    """Load the SED-CIP 2022 spreadsheet and add per-level id columns.

    Reads ``data/SED-CIP-2022.xlsx`` (resolved against the current working
    directory), promotes the row at position 2 to the column header, keeps
    only the rows after it, and derives ``BroadFieldId``, ``MajorFieldId``
    and ``DetailedFieldId`` from the ``SED-CIP code`` column with ``get_id``.

    The result is memoized by ``lru_cache``: every caller receives the same
    DataFrame object, so callers must treat it as read-only.

    Returns:
        pandas.DataFrame: the spreadsheet rows plus the three id columns.
    """
    cip_df = pd.read_excel("data/SED-CIP-2022.xlsx")

    # The row at position 2 holds the real column titles; the rows above it
    # are dropped together with it below.
    cip_df.columns = cip_df.iloc[2]

    # Take an explicit copy: assigning new columns onto a bare ``iloc`` slice
    # is chained assignment and triggers pandas' SettingWithCopyWarning.
    cip_df = cip_df.iloc[3:].copy()

    codes = cip_df['SED-CIP code']
    for column, granularity in (
        ("BroadFieldId", 0),
        ("MajorFieldId", 1),
        ("DetailedFieldId", 2),
    ):
        cip_df[column] = codes.apply(get_id, args=(granularity,))

    return cip_df


def get_matching_rows(cip_df, broad_id, major_id, detailed_id):
    """Return the rows of ``cip_df`` matching the ids at the finest level possible.

    The ids are compared against the ``BroadFieldId``, ``MajorFieldId`` and
    ``DetailedFieldId`` columns. A match on all three is preferred, then
    broad + major, then broad alone; the first non-empty selection wins.

    Raises:
        ValueError: if not even the broad id matches any row.
    """
    # Build the three filters cumulatively, coarsest to finest.
    same_broad = cip_df["BroadFieldId"] == broad_id
    same_major = same_broad & (cip_df['MajorFieldId'] == major_id)
    same_detailed = same_major & (cip_df["DetailedFieldId"] == detailed_id)

    # Try them finest first and stop at the first one that selects anything.
    for row_filter in (same_detailed, same_major, same_broad):
        selected = cip_df[row_filter]
        if len(selected) > 0:
            return selected

    raise ValueError(f"No matching rows for {broad_id}.{major_id}{detailed_id}")


def _pick_most_common(cip_df, id_mask, column, candidates):
    """Return the candidate that labels the most rows selected by ``id_mask``.

    Candidates are tried in the order given and only a strictly larger row
    count replaces the current pick, so ties go to the earliest candidate.
    Returns None when no candidate labels any selected row.
    """
    best_option = None
    max_rows = 0
    for candidate in candidates:
        row_count = int((id_mask & (cip_df[column] == candidate)).sum())
        if row_count > max_rows:
            max_rows = row_count
            best_option = candidate
    return best_option


def map_id_to_fields_of_science(id: str):
    """Map a SED-CIP code to ``[broad, major, detailed]`` field-of-science names.

    An exact match on the ``SED-CIP code`` column is returned directly.
    Otherwise the code is split into broad/major/detailed ids with ``get_id``,
    the closest rows are found with ``get_matching_rows``, and for each level
    present in the code the name carried by the most rows sharing that id
    prefix is chosen. Levels missing from the code stay ``None``.

    Args:
        id: the field-of-science code, e.g. ``"26.15"``.

    Returns:
        list: ``[broad, major, detailed]``; all ``None`` when nothing matches
        (the unmatched id is printed in that case).
    """
    cip_df = get_cip_df()

    # If we have a direct match, return it
    direct_match = cip_df[cip_df["SED-CIP code"] == id]
    if len(direct_match) > 0:
        return [
            direct_match["New broad field"].values[0],
            direct_match["New major field"].values[0],
            direct_match["New detailed field"].values[0],
        ]

    broad_id = get_id(id, 0)
    major_id = get_id(id, 1)
    detailed_id = get_id(id, 2)

    try:
        matching_rows = get_matching_rows(cip_df, broad_id, major_id, detailed_id)
    except ValueError:
        # Not even the broad id is known: report the id and give up.
        print(id)
        return [None, None, None]

    # Cumulative id filters: each level is counted among rows sharing its prefix.
    broad_mask = cip_df["BroadFieldId"] == broad_id
    major_mask = broad_mask & (cip_df['MajorFieldId'] == major_id)
    detailed_mask = major_mask & (cip_df["DetailedFieldId"] == detailed_id)

    code_label = f"{broad_id}.{major_id}{detailed_id}"
    levels = (
        ("Broad Field", "possible values", broad_id, broad_mask, "New broad field"),
        ("Major Field", "rows", major_id, major_mask, "New major field"),
        ("Detailed Field", "rows", detailed_id, detailed_mask, "New detailed field"),
    )

    fields_of_science = []
    for label, wording, level_id, level_mask, column in levels:
        # De-duplicate in row order rather than iterating a set: set order for
        # strings changes between runs (hash randomization), which made the
        # pick on equal counts non-reproducible.
        candidates = list(dict.fromkeys(matching_rows[column]))

        picked = None
        if level_id is not None:
            picked = _pick_most_common(cip_df, level_mask, column, candidates)
            print(f"{label}: {code_label} has {wording} {set(candidates)} we picked {picked}")

        fields_of_science.append(picked)

    return fields_of_science


def get_id(id: Union[float, str], granularity: int):
    """Extract one two-digit level from a field-of-science code.

    The code is reduced to its ASCII digits and padded to an even length, then
    sliced: granularity 0 is the first two digits, 1 the next two, and 2
    everything from the fifth digit on.

    Examples: ``get_id(1.0, 0) == "01"``, ``get_id(10.2320, 2) == "20"``,
    ``get_id("26.15", 2) is None``.

    Args:
        id: the code as a string or as a number.
        granularity: 0, 1 or 2.

    Returns:
        The digits for the requested level as a string, or ``None`` when ``id``
        is missing (``pd.isna``), the code has too few digits for that level,
        or ``granularity`` is not 0, 1 or 2.
    """
    if pd.isna(id):
        return None

    text = str(id)
    code = "".join(ch for ch in text if ch in string.digits)

    # A one-character part before the "." means the leading zero was lost,
    # e.g. 01.2023 read as the float 1.2023 — put it back.
    if len(text.split(".")[0]) == 1:
        code = "0" + code

    # An odd digit count means a trailing zero was lost, e.g. 10.2320 read
    # as 10.232 — put it back so the code splits into whole pairs.
    if len(code) % 2 == 1:
        code = code + "0"

    if granularity == 0:
        return code[:2]

    if granularity == 1:
        return code[2:4] if len(code) >= 4 else None

    if granularity == 2:
        return code[4:] if len(code) >= 6 else None

    return None


def tests():
    """Self-checks for ``get_id`` and ``map_id_to_fields_of_science``.

    Raises:
        ValueError: with the message "Test failed" on the first mismatch.
    """
    # (arguments to get_id, expected result), checked in this order.
    id_cases = (
        ((1.0, 0), "01"),
        ((1.0, 1), "00"),
        ((10.2320, 2), "20"),
        ((10.2320, 1), "23"),
        ((10.2320, 0), "10"),
        ((1.23, 2), None),
        ((1.23, 0), "01"),
    )
    for arguments, expected in id_cases:
        if get_id(*arguments) != expected:
            raise ValueError("Test failed")

    # A four-digit code has no detailed level, so the third entry stays None.
    expected_fields = [
        'Biological and biomedical sciences',
        'Neurobiology and neurosciences',
        None,
    ]
    if map_id_to_fields_of_science("26.15") != expected_fields:
        raise ValueError("Test failed")

# Run the self-checks when this module is executed directly.
if __name__ == "__main__":
    tests()
    print("All tests passed")
88 changes: 88 additions & 0 deletions .github/scripts/check_project_fos_precision/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import sys
import datetime

import yaml
import requests

from field_of_science import get_id


def get_active_projects(start_date: datetime.datetime, timeout: float = 120):
    """Return the names of projects with payload records since ``start_date``.

    Queries the ``gracc.osg.summary`` index for records whose ``ResourceType``
    is ``Payload`` and whose ``EndTime`` lies between ``start_date`` and now,
    aggregated by ``ProjectName``. Only the bucket keys (project names) are
    returned; the per-project ``Njobs`` sum requested in the query is not used
    here.

    Args:
        start_date: lower bound of the ``EndTime`` range.
        timeout: seconds to wait for the server before giving up, so a stalled
            connection cannot hang the job indefinitely.

    Returns:
        list: one project name per aggregation bucket.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if no response arrives within ``timeout`` seconds.
    """
    # The EndTime range bounds are sent as epoch milliseconds.
    now_ms = int(datetime.datetime.now().timestamp() * 1000)
    start_ms = int(start_date.timestamp() * 1000)

    query = {
        "size": 0,  # no individual hits, only the aggregation
        "query": {
            "bool": {
                "filter": [
                    {
                        "term": {
                            "ResourceType": "Payload"
                        }
                    },
                    {
                        "range": {
                            "EndTime": {
                                "lte": now_ms,
                                "gte": start_ms
                            }
                        }
                    }
                ]
            },
        },
        "aggs": {
            "projects": {
                "terms": {
                    "field": "ProjectName",
                    "size": 99999999
                },
                "aggs": {
                    "projectJobsRan": {
                        "sum": {
                            "field": "Njobs"
                        }
                    }
                }
            }
        }
    }

    response = requests.get(
        "https://gracc.opensciencegrid.org/q/gracc.osg.summary/_search",
        json=query,
        timeout=timeout,
    )

    # Fail with the HTTP error itself instead of a confusing KeyError or JSON
    # decoding error further down.
    response.raise_for_status()

    data = response.json()

    return [bucket['key'] for bucket in data['aggregations']['projects']['buckets']]



def has_detailed_precision(id: str):
    """Return True when ``get_id`` yields a value for ``id`` at granularity 1.

    NOTE(review): the name says "detailed", but the check uses granularity 1,
    which ``get_id`` resolves to the second digit pair of the code — confirm
    that this is the intended threshold.
    """
    second_level_id = get_id(id, granularity=1)
    return second_level_id is not None


def main():
    """Fail if a project active in the last year lacks a precise FieldOfScienceID.

    Fetches the active project names with ``get_active_projects``, loads each
    project's YAML file from the repository's ``projects/`` directory, and
    collects every project that has no ``FieldOfScienceID`` or whose id does
    not pass ``has_detailed_precision``. Active projects without a YAML file
    are skipped.

    Raises:
        Exception: if at least one project needs updating; the offending
            projects are listed on stderr first.
    """
    # Function-scope import keeps this block independent of the file header.
    from pathlib import Path

    # Resolve projects/ from this script's location, not the working
    # directory: the script lives three levels below the repository root, and
    # a cwd-relative "../../../projects" only works when run from its own
    # directory. From anywhere else every file is "missing" and is skipped.
    projects_dir = Path(__file__).resolve().parents[3] / "projects"

    one_year_ago = datetime.datetime.now() - datetime.timedelta(days=365)
    active_project_names = get_active_projects(one_year_ago)

    print(active_project_names)

    exceptions = []
    for project_name in active_project_names:
        try:
            with open(projects_dir / f"{project_name}.yaml") as project_file:
                # NOTE(review): yaml.Loader can construct arbitrary Python
                # objects; if project files never need custom tags, switch to
                # yaml.safe_load.
                project_data = yaml.load(project_file, Loader=yaml.Loader)
        except FileNotFoundError:
            # No project file for this name in the repository: nothing to check.
            continue

        if "FieldOfScienceID" not in project_data or not has_detailed_precision(project_data["FieldOfScienceID"]):
            exceptions.append(f"Project {project_name} is running in the OSPool without detailed precision.")

    if exceptions:
        # ``file=`` is required: a positional sys.stderr is printed as a value
        # on stdout instead of redirecting the output.
        print("\n".join(exceptions), file=sys.stderr)
        raise Exception("Projects without detailed precision need to be updated.")


# Script entry point: run the precision check when this file is executed directly.
if __name__ == "__main__":
    main()
68 changes: 68 additions & 0 deletions .github/scripts/check_project_fos_precision/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
asn1==2.7.0
async-generator==1.10
attrs==21.4.0
beautifulsoup4==4.11.1
blinker==1.6.3
certifi==2024.2.2
cffi==1.15.0
chardet==5.1.0
click==6.7
configobj==5.0.8
cryptography==37.0.2
Deprecated==1.2.13
enum-compat==0.0.3
Flask==1.0.4
Flask-WTF==0.14.3
gitdb==4.0.11
GitPython==3.1.43
gunicorn==20.1.0
h11==0.13.0
icalendar==5.0.12
idna==3.7
iniconfig==1.1.1
itsdangerous==0.24
Jinja2==2.11.3
ldap3==2.9.1
MarkupSafe==2.0.1
numpy==1.26.4
outcome==1.1.0
packaging==21.3
pandas==2.2.2
pluggy==1.0.0
prometheus-client==0.20.0
py==1.11.0
pyasn1==0.5.1
pyasn1-modules==0.2.8
pycparser==2.21
PyGithub==1.57
PyJWT==2.6.0
PyNaCl==1.5.0
pyOpenSSL==22.0.0
pyparsing==3.0.7
PySocks==1.7.1
pytest==7.1.1
pytest-mock==3.7.0
python-dateutil==2.8.2
python-gnupg==0.5.2
python-ldap==3.3.1
pytz==2024.1
PyYAML==6.0.1
requests==2.25.1
selenium==4.1.3
six==1.16.0
smmap==5.0.1
sniffio==1.2.0
sortedcontainers==2.4.0
soupsieve==2.3.2.post1
tomli==2.0.1
tqdm==4.64.0
trio==0.20.0
trio-websocket==0.9.2
tzdata==2024.1
urllib3==1.26.6
webdriverdownloader==1.1.0.3
Werkzeug==0.15.6
wrapt==1.14.1
wsproto==1.1.0
WTForms==3.0.1
xmltodict==0.13.0
21 changes: 21 additions & 0 deletions .github/workflows/check_project_fos_precision.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
name: Check Project FOS Precision
on:
pull_request:
branches:
- main
schedule:
- cron: '0 0 * * *'

jobs:
check:
name: Check
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.9.15
cache: 'pip' # caching pip dependencies
- run: pip install -r ./.github/scripts/check_project_fos_precision/requirements.txt
- run: python ./.github/scripts/check_project_fos_precision/main.py
1 change: 1 addition & 0 deletions projects/ACE_NIAID.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ Sponsor:
CampusGrid:
Name: OSG Connect
InstitutionID: 'https://osg-htc.org/iid/451cgt72wj62'
FieldOfScienceID: '26.1103'
1 change: 1 addition & 0 deletions projects/AMFORA.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ Sponsor:
CampusGrid:
Name: OSG Connect
InstitutionID: 'https://osg-htc.org/iid/o14joi278jrs'
FieldOfScienceID: '11'
1 change: 1 addition & 0 deletions projects/AMNH.astro.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ Sponsor:
CampusGrid:
Name: OSG Connect
InstitutionID: 'https://osg-htc.org/iid/em2w05s9c1uc'
FieldOfScienceID: '40.02'
1 change: 1 addition & 0 deletions projects/AMNH.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ Sponsor:
CampusGrid:
Name: OSG Connect
InstitutionID: 'https://osg-htc.org/iid/em2w05s9c1uc'
FieldOfScienceID: '54.0101'
1 change: 1 addition & 0 deletions projects/AMNH_Burbrink.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ Sponsor:
CampusGrid:
Name: OSG Connect
InstitutionID: 'https://osg-htc.org/iid/em2w05s9c1uc'
FieldOfScienceID: '26'
1 change: 1 addition & 0 deletions projects/AMNH_MacLow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ Sponsor:
CampusGrid:
Name: OSG Connect
InstitutionID: 'https://osg-htc.org/iid/em2w05s9c1uc'
FieldOfScienceID: '40.02'
Loading

0 comments on commit d88340c

Please sign in to comment.