Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Field of science ids #3809

Merged
merged 8 commits into from
Apr 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
188 changes: 188 additions & 0 deletions .github/scripts/check_project_fos_precision/field_of_science.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
from functools import lru_cache
from typing import Union
import string

import pandas as pd


@lru_cache()
def get_cip_df():
    """Load the SED-CIP 2022 spreadsheet and attach the split id columns.

    The workbook is read from ``data/SED-CIP-2022.xlsx`` (relative to the
    current working directory). The row at position 2 of the parsed sheet is
    promoted to be the column header and the rows before the data are
    discarded. Three columns are then derived from ``SED-CIP code`` with
    ``get_id``: ``BroadFieldId``, ``MajorFieldId`` and ``DetailedFieldId``.

    The result is memoised by ``lru_cache``, so the file is read once per
    process and every caller receives the same DataFrame object.
    """
    cip_df = pd.read_excel("data/SED-CIP-2022.xlsx")

    # Promote the row at position 2 to the header, then keep only the rows
    # that follow it.
    cip_df.columns = cip_df.iloc[2]
    cip_df = cip_df.iloc[3:]

    # Granularity 0, 1 and 2 correspond to the broad, major and detailed ids.
    id_columns = ("BroadFieldId", "MajorFieldId", "DetailedFieldId")
    for granularity, column_name in enumerate(id_columns):
        cip_df[column_name] = cip_df['SED-CIP code'].apply(
            lambda code: get_id(code, granularity)
        )

    return cip_df


def get_matching_rows(cip_df, broad_id, major_id, detailed_id):
    """Return the rows of ``cip_df`` matching the ids at the finest grain available.

    The lookup is attempted from most to least specific: first rows matching
    all three ids, then rows matching the broad and major ids, and finally
    rows matching only the broad id. The first non-empty selection wins.

    Raises:
        ValueError: if not even the broad id matches any row.
    """
    same_broad = cip_df["BroadFieldId"] == broad_id
    same_major = cip_df['MajorFieldId'] == major_id
    same_detailed = cip_df["DetailedFieldId"] == detailed_id

    # Ordered from the finest grain to the coarsest.
    masks_by_grain = (
        same_broad & same_major & same_detailed,
        same_broad & same_major,
        same_broad,
    )

    for mask in masks_by_grain:
        selected = cip_df[mask]
        if len(selected):
            return selected

    raise ValueError(f"No matching rows for {broad_id}.{major_id}{detailed_id}")


def _most_common_field(cip_df, scope, column, candidates):
    """Return the candidate label found on the most rows of ``cip_df`` within ``scope``.

    Args:
        cip_df: the full crosswalk table.
        scope: boolean Series aligned with ``cip_df`` selecting the rows that
            share the id prefix being resolved.
        column: name of the label column to count in.
        candidates: iterable of labels to choose between, in priority order.

    Returns:
        The winning label, or None when no candidate labels any row in scope.
    """
    best_option = None
    max_rows = 0
    for candidate in candidates:
        row_count = int((scope & (cip_df[column] == candidate)).sum())
        # Strict ">" means the earliest candidate wins a tie.
        if row_count > max_rows:
            max_rows = row_count
            best_option = candidate
    return best_option


def map_id_to_fields_of_science(id: str):
    """Map a field-of-science id to ``[broad, major, detailed]`` field names.

    If ``id`` equals a ``SED-CIP code`` in the table, that row's
    ``New broad field`` / ``New major field`` / ``New detailed field`` values
    are returned directly. Otherwise the id is split with ``get_id`` and the
    closest rows are found with ``get_matching_rows``. For each level that
    the id actually specifies, the label chosen is the one that appears on
    the most rows sharing that id prefix.

    Returns:
        A three-element list. Levels the id does not specify, or that cannot
        be resolved, are None. If nothing matches even the broad id, the id
        is printed and ``[None, None, None]`` is returned.
    """
    # Define the fields we hope to populate
    broad_field_of_science = None
    major_field_of_science = None
    detailed_field_of_science = None

    cip_df = get_cip_df()

    # If we have a direct match, return it
    direct_match = cip_df[cip_df["SED-CIP code"] == id]
    if len(direct_match) > 0:
        return [
            direct_match["New broad field"].values[0],
            direct_match["New major field"].values[0],
            direct_match["New detailed field"].values[0],
        ]

    broad_id = get_id(id, 0)
    major_id = get_id(id, 1)
    detailed_id = get_id(id, 2)

    try:
        matching_rows = get_matching_rows(cip_df, broad_id, major_id, detailed_id)
    except ValueError:
        print(id)
        return [broad_field_of_science, major_field_of_science, detailed_field_of_science]

    # Rows of the full table that share each successively longer id prefix.
    in_broad = cip_df["BroadFieldId"] == broad_id
    in_major = in_broad & (cip_df['MajorFieldId'] == major_id)
    in_detailed = in_major & (cip_df["DetailedFieldId"] == detailed_id)

    # Series.unique() keeps order of first appearance, so ties between
    # equally common labels are resolved the same way on every run (iterating
    # a set of strings is hash-order dependent).
    possible_broad_fields = matching_rows["New broad field"].unique()
    if broad_id is not None:
        broad_field_of_science = _most_common_field(
            cip_df, in_broad, "New broad field", possible_broad_fields
        )
        print(f"Broad Field: {broad_id}.{major_id}{detailed_id} has possible values {set(possible_broad_fields)} we picked {broad_field_of_science}")

    possible_major_fields = matching_rows["New major field"].unique()
    if major_id is not None:
        major_field_of_science = _most_common_field(
            cip_df, in_major, "New major field", possible_major_fields
        )
        print(f"Major Field: {broad_id}.{major_id}{detailed_id} has rows {set(possible_major_fields)} we picked {major_field_of_science}")

    possible_detailed_fields = matching_rows["New detailed field"].unique()
    if detailed_id is not None:
        detailed_field_of_science = _most_common_field(
            cip_df, in_detailed, "New detailed field", possible_detailed_fields
        )
        print(f"Detailed Field: {broad_id}.{major_id}{detailed_id} has rows {set(possible_detailed_fields)} we picked {detailed_field_of_science}")

    return [broad_field_of_science, major_field_of_science, detailed_field_of_science]


def get_id(id: Union[float, str], granularity: int):
    """Extract one two-digit component of a field-of-science id.

    The id is read as a run of digits ``BBMMDD...`` where ``BB`` is the broad
    field, ``MM`` the major field and the remainder the detailed field. Ids
    that were parsed as floats lose zeros, so they are repaired first:
    ``1.0`` becomes ``0100`` and ``10.232`` becomes ``102320``.

    Args:
        id: the id as a string (``"26.1103"``) or number (``26.1103``).
        granularity: 0 for the broad id, 1 for the major id, 2 for the
            detailed id.

    Returns:
        The requested component as a string, or None when ``id`` is missing
        (None/NaN), contains no digits, is not precise enough for the
        requested granularity, or ``granularity`` is not 0, 1 or 2.
    """
    # Check if None
    if pd.isna(id):
        return None

    text = str(id)

    # Fix up issues from reading the id as a float
    digits = [char for char in text if char in string.digits]

    # Nothing to extract; do not hand back an empty string as an id.
    if not digits:
        return None

    # A single character before the "." means the leading zero was dropped
    # (01.2023 read as 1.2023).
    if len(text.split(".")[0]) == 1:
        digits = ['0', *digits]

    # An odd number of digits means a trailing zero was dropped
    # (10.2320 read as 10.232). After this the length is always even.
    if len(digits) % 2 == 1:
        digits = [*digits, '0']

    if granularity == 0:
        return "".join(digits[:2])

    if granularity == 1:
        return "".join(digits[2:4]) if len(digits) >= 4 else None

    if granularity == 2:
        return "".join(digits[4:]) if len(digits) >= 6 else None

    return None


def tests():
    """Run the module's self-checks, raising ValueError on the first failure.

    The ``get_id`` cases cover float inputs that lost a leading or trailing
    zero. The final check calls ``map_id_to_fields_of_science`` and therefore
    needs the SED-CIP spreadsheet to be readable.
    """
    # ((id, granularity), expected result)
    get_id_cases = (
        ((1.0, 0), "01"),
        ((1.0, 1), "00"),
        ((10.2320, 2), "20"),
        ((10.2320, 1), "23"),
        ((10.2320, 0), "10"),
        ((1.23, 2), None),
        ((1.23, 0), "01"),
    )

    for arguments, expected in get_id_cases:
        if get_id(*arguments) != expected:
            raise ValueError("Test failed")

    expected_fields = [
        'Biological and biomedical sciences',
        'Neurobiology and neurosciences',
        None,
    ]
    if map_id_to_fields_of_science("26.15") != expected_fields:
        raise ValueError("Test failed")

# When executed as a script (not imported), run the self-checks; tests()
# raises on the first failure, so the message below is only reached on success.
if __name__ == "__main__":
    tests()
    print("All tests passed")
88 changes: 88 additions & 0 deletions .github/scripts/check_project_fos_precision/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import sys
import datetime

import yaml
import requests

from field_of_science import get_id


def get_active_projects(start_date: datetime.datetime, *, timeout: float = 60.0):
    """Return the names of projects with payload records since ``start_date``.

    Queries the GRACC summary index for records whose ``ResourceType`` is
    ``Payload`` and whose ``EndTime`` lies between ``start_date`` and now,
    aggregated by ``ProjectName``.

    Args:
        start_date: lower bound of the ``EndTime`` range.
        timeout: seconds to wait for the server before giving up, so a stalled
            connection cannot hang the calling job indefinitely.

    Returns:
        A list with the key of every ``ProjectName`` bucket in the response.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    # The range bounds are sent as epoch milliseconds.
    end_ms = int(datetime.datetime.now().timestamp() * 1000)
    start_ms = int(start_date.timestamp() * 1000)

    query = {
        "size": 0,
        "query": {
            "bool": {
                "filter": [
                    {
                        "term": {
                            "ResourceType": "Payload"
                        }
                    },
                    {
                        "range": {
                            "EndTime": {
                                "lte": end_ms,
                                "gte": start_ms
                            }
                        }
                    }
                ]
            },
        },
        "aggs": {
            "projects": {
                "terms": {
                    "field": "ProjectName",
                    "size": 99999999
                },
                "aggs": {
                    "projectJobsRan": {
                        "sum": {
                            "field": "Njobs"
                        }
                    }
                }
            }
        }
    }

    response = requests.get(
        "https://gracc.opensciencegrid.org/q/gracc.osg.summary/_search",
        json=query,
        timeout=timeout,
    )
    # Fail with the HTTP status instead of an opaque KeyError/JSON error below.
    response.raise_for_status()

    data = response.json()

    return [bucket['key'] for bucket in data['aggregations']['projects']['buckets']]



def has_detailed_precision(id: str):
    """Return True when ``id`` carries at least a major-field component.

    The check is whether ``get_id`` can extract the granularity-1 part of
    the id; an id such as ``'11'`` has none, while ``'40.02'`` does.
    """
    major_field_id = get_id(id, granularity=1)
    return major_field_id is not None


def main():
    """Fail if any recently active project lacks a precise FieldOfScienceID.

    Fetches the projects active during the last 365 days, loads each
    project's YAML file from the repository's ``projects`` directory and
    collects every project whose ``FieldOfScienceID`` is missing or does not
    satisfy ``has_detailed_precision``. Projects without a YAML file are
    skipped.

    Raises:
        Exception: if at least one project needs to be updated; the offending
            projects are listed on stderr first.
    """
    from pathlib import Path

    # Anchor the lookup on this file (<repo>/.github/scripts/<dir>/main.py)
    # rather than on the working directory: run from the repository root, the
    # relative "../../../projects" path pointed outside the checkout, every
    # open() raised FileNotFoundError and the check silently passed.
    projects_dir = Path(__file__).resolve().parents[3] / "projects"

    one_year_ago = datetime.datetime.now() - datetime.timedelta(days=365)
    active_project_names = get_active_projects(one_year_ago)

    print(active_project_names)

    exceptions = []
    for project_name in active_project_names:
        try:
            with open(projects_dir / f"{project_name}.yaml") as project_file:
                # NOTE(review): yaml.Loader can construct arbitrary Python
                # objects; these files may come from pull requests, so
                # consider yaml.safe_load if no custom tags are needed.
                project_data = yaml.load(project_file, Loader=yaml.Loader)
        except FileNotFoundError:
            # Deliberate best-effort: not every reported project name has a
            # YAML file in this repository.
            continue

        # An empty YAML document loads as None; treat it as having no fields.
        project_data = project_data or {}

        if "FieldOfScienceID" not in project_data or not has_detailed_precision(project_data["FieldOfScienceID"]):
            exceptions.append(f"Project {project_name} is running in the OSPool without detailed precision.")

    if exceptions:
        print("\n".join(exceptions), file=sys.stderr)
        raise Exception("Projects without detailed precision need to be updated.")


# Script entry point: run the precision check only when executed directly.
if __name__ == "__main__":
    main()
68 changes: 68 additions & 0 deletions .github/scripts/check_project_fos_precision/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
asn1==2.7.0
async-generator==1.10
attrs==21.4.0
beautifulsoup4==4.11.1
blinker==1.6.3
certifi==2024.2.2
cffi==1.15.0
chardet==5.1.0
click==6.7
configobj==5.0.8
cryptography==37.0.2
Deprecated==1.2.13
enum-compat==0.0.3
Flask==1.0.4
Flask-WTF==0.14.3
gitdb==4.0.11
GitPython==3.1.43
gunicorn==20.1.0
h11==0.13.0
icalendar==5.0.12
idna==3.7
iniconfig==1.1.1
itsdangerous==0.24
Jinja2==2.11.3
ldap3==2.9.1
MarkupSafe==2.0.1
numpy==1.26.4
outcome==1.1.0
packaging==21.3
pandas==2.2.2
pluggy==1.0.0
prometheus-client==0.20.0
py==1.11.0
pyasn1==0.5.1
pyasn1-modules==0.2.8
pycparser==2.21
PyGithub==1.57
PyJWT==2.6.0
PyNaCl==1.5.0
pyOpenSSL==22.0.0
pyparsing==3.0.7
PySocks==1.7.1
pytest==7.1.1
pytest-mock==3.7.0
python-dateutil==2.8.2
python-gnupg==0.5.2
python-ldap==3.3.1
pytz==2024.1
PyYAML==6.0.1
requests==2.25.1
selenium==4.1.3
six==1.16.0
smmap==5.0.1
sniffio==1.2.0
sortedcontainers==2.4.0
soupsieve==2.3.2.post1
tomli==2.0.1
tqdm==4.64.0
trio==0.20.0
trio-websocket==0.9.2
tzdata==2024.1
urllib3==1.26.6
webdriverdownloader==1.1.0.3
Werkzeug==0.15.6
wrapt==1.14.1
wsproto==1.1.0
WTForms==3.0.1
xmltodict==0.13.0
21 changes: 21 additions & 0 deletions .github/workflows/check_project_fos_precision.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Runs the project field-of-science precision check script in CI.
name: Check Project FOS Precision
on:
  # Run for pull requests that target main ...
  pull_request:
    branches:
      - main
  # ... and on a daily schedule (minute 0, hour 0).
  schedule:
    - cron: '0 0 * * *'

jobs:
  check:
    name: Check
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.9.15
          cache: 'pip' # caching pip dependencies
      # Install the script's pinned dependencies, then run the check from the
      # repository root; the job fails if the script raises.
      - run: pip install -r ./.github/scripts/check_project_fos_precision/requirements.txt
      - run: python ./.github/scripts/check_project_fos_precision/main.py
1 change: 1 addition & 0 deletions projects/ACE_NIAID.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ Sponsor:
CampusGrid:
Name: OSG Connect
InstitutionID: 'https://osg-htc.org/iid/451cgt72wj62'
FieldOfScienceID: '26.1103'
1 change: 1 addition & 0 deletions projects/AMFORA.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ Sponsor:
CampusGrid:
Name: OSG Connect
InstitutionID: 'https://osg-htc.org/iid/o14joi278jrs'
FieldOfScienceID: '11'
1 change: 1 addition & 0 deletions projects/AMNH.astro.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ Sponsor:
CampusGrid:
Name: OSG Connect
InstitutionID: 'https://osg-htc.org/iid/em2w05s9c1uc'
FieldOfScienceID: '40.02'
1 change: 1 addition & 0 deletions projects/AMNH.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ Sponsor:
CampusGrid:
Name: OSG Connect
InstitutionID: 'https://osg-htc.org/iid/em2w05s9c1uc'
FieldOfScienceID: '54.0101'
1 change: 1 addition & 0 deletions projects/AMNH_Burbrink.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ Sponsor:
CampusGrid:
Name: OSG Connect
InstitutionID: 'https://osg-htc.org/iid/em2w05s9c1uc'
FieldOfScienceID: '26'
1 change: 1 addition & 0 deletions projects/AMNH_MacLow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ Sponsor:
CampusGrid:
Name: OSG Connect
InstitutionID: 'https://osg-htc.org/iid/em2w05s9c1uc'
FieldOfScienceID: '40.02'
Loading
Loading