Field of science ids (#3809)

* Add GHA * Add FieldOfScienceIDs
opensciencegrid · Apr 23, 2024 · d88340c · d88340c
1 parent 5910d49
commit d88340c
Show file tree

Hide file tree

Showing 1,132 changed files with 1,494 additions and 0 deletions.
diff --git a/.github/scripts/check_project_fos_precision/field_of_science.py b/.github/scripts/check_project_fos_precision/field_of_science.py
@@ -0,0 +1,188 @@
+from functools import lru_cache
+from typing import Union
+import string
+
+import pandas as pd
+
+
+@lru_cache()
+def get_cip_df():
+    """Load the SED-CIP 2022 spreadsheet and add one id column per level.
+
+    The result is memoised by ``lru_cache``, so every caller receives the
+    same DataFrame object.
+
+    Returns:
+        The rows below the header row, with three extra columns
+        (``BroadFieldId``, ``MajorFieldId``, ``DetailedFieldId``) derived
+        from the ``SED-CIP code`` column via ``get_id``.
+    """
+    cip_df = pd.read_excel("data/SED-CIP-2022.xlsx")
+
+    # The row at position 2 carries the real column titles: promote it to the
+    # header and keep only the rows underneath it.
+    cip_df.columns = cip_df.iloc[2]
+    cip_df = cip_df.iloc[3:]
+
+    # get_id granularity 0 / 1 / 2 yields the broad / major / detailed part.
+    sed_cip_codes = cip_df['SED-CIP code']
+    for granularity, id_column in enumerate(("BroadFieldId", "MajorFieldId", "DetailedFieldId")):
+        cip_df[id_column] = sed_cip_codes.apply(get_id, args=(granularity,))
+
+    return cip_df
+
+
+def get_matching_rows(cip_df, broad_id, major_id, detailed_id):
+    """Return the rows of ``cip_df`` matching the id at the finest level possible.
+
+    The full (broad, major, detailed) id is tried first, then (broad, major),
+    then broad alone; the first non-empty selection is returned.
+
+    Args:
+        cip_df: DataFrame with ``BroadFieldId``, ``MajorFieldId`` and
+            ``DetailedFieldId`` columns.
+        broad_id: Value compared against ``BroadFieldId``.
+        major_id: Value compared against ``MajorFieldId``.
+        detailed_id: Value compared against ``DetailedFieldId``.
+
+    Raises:
+        ValueError: If not even the broad id matches any row.
+    """
+    # Each finer mask is the coarser one narrowed by one more column.
+    broad_mask = cip_df["BroadFieldId"] == broad_id
+    major_mask = broad_mask & (cip_df['MajorFieldId'] == major_id)
+    detailed_mask = major_mask & (cip_df["DetailedFieldId"] == detailed_id)
+
+    # Finest grain first, falling back to coarser ones.
+    for mask in (detailed_mask, major_mask, broad_mask):
+        selected_rows = cip_df[mask]
+        if len(selected_rows) > 0:
+            return selected_rows
+
+    raise ValueError(f"No matching rows for {broad_id}.{major_id}{detailed_id}")
+
+
+def _most_frequent_label(cip_df, id_mask, column, candidates):
+    """Return the candidate labelling the most rows selected by ``id_mask``.
+
+    Returns ``None`` when no candidate labels at least one selected row.
+    Ties keep the first candidate encountered while iterating ``candidates``.
+    """
+    best_option = None
+    max_rows = 0
+    for candidate in candidates:
+        row_count = int((id_mask & (cip_df[column] == candidate)).sum())
+        if row_count > max_rows:
+            max_rows = row_count
+            best_option = candidate
+    return best_option
+
+
+def map_id_to_fields_of_science(id: str):
+    """Map a SED-CIP code to its broad / major / detailed field names.
+
+    An exact match on the ``SED-CIP code`` column wins. Otherwise the code is
+    split with ``get_id`` and, for each level present in the code, the label
+    attached to the most rows sharing that id prefix is chosen.
+
+    Args:
+        id: The code to look up (e.g. ``"26.15"``).
+
+    Returns:
+        ``[broad, major, detailed]``. A level is ``None`` when the code does
+        not carry that level or nothing could be matched. The choice made at
+        each level is printed to stdout, as is the raw id when no row
+        matches at all.
+    """
+    cip_df = get_cip_df()
+
+    # If we have a direct match, return it
+    direct_match = cip_df[cip_df["SED-CIP code"] == id]
+    if len(direct_match) > 0:
+        return [
+            direct_match["New broad field"].values[0],
+            direct_match["New major field"].values[0],
+            direct_match["New detailed field"].values[0],
+        ]
+
+    broad_id = get_id(id, 0)
+    major_id = get_id(id, 1)
+    detailed_id = get_id(id, 2)
+
+    try:
+        matching_rows = get_matching_rows(cip_df, broad_id, major_id, detailed_id)
+    except ValueError:
+        print(id)
+        return [None, None, None]
+
+    # Row selections for each id prefix; every finer mask narrows the coarser one.
+    broad_mask = cip_df["BroadFieldId"] == broad_id
+    major_mask = broad_mask & (cip_df["MajorFieldId"] == major_id)
+    detailed_mask = major_mask & (cip_df["DetailedFieldId"] == detailed_id)
+
+    broad_field_of_science = None
+    if broad_id is not None:
+        possible_broad_fields = set(matching_rows["New broad field"])
+        broad_field_of_science = _most_frequent_label(
+            cip_df, broad_mask, "New broad field", possible_broad_fields)
+        print(f"Broad Field: {broad_id}.{major_id}{detailed_id} has possible values {possible_broad_fields} we picked {broad_field_of_science}")
+
+    major_field_of_science = None
+    if major_id is not None:
+        possible_major_fields = set(matching_rows["New major field"])
+        major_field_of_science = _most_frequent_label(
+            cip_df, major_mask, "New major field", possible_major_fields)
+        print(f"Major Field: {broad_id}.{major_id}{detailed_id} has rows {possible_major_fields} we picked {major_field_of_science}")
+
+    detailed_field_of_science = None
+    if detailed_id is not None:
+        possible_detailed_fields = set(matching_rows["New detailed field"])
+        detailed_field_of_science = _most_frequent_label(
+            cip_df, detailed_mask, "New detailed field", possible_detailed_fields)
+        print(f"Detailed Field: {broad_id}.{major_id}{detailed_id} has rows {possible_detailed_fields} we picked {detailed_field_of_science}")
+
+    return [broad_field_of_science, major_field_of_science, detailed_field_of_science]
+
+
+def get_id(id: Union[float, str], granularity: int):
+    """Extract one level of a SED-CIP code as a digit string.
+
+    Codes look like ``BB.MMDD`` (broad, major, detailed). They may arrive as
+    strings or as floats read from a spreadsheet; a float loses a leading
+    zero (``01.23`` -> ``1.23``) and trailing zeros (``10.2320`` ->
+    ``10.232``), so both are restored before slicing.
+
+    Args:
+        id: The code, or a missing value (``None`` / NaN).
+        granularity: ``0`` for the broad part, ``1`` for the major part,
+            ``2`` for the detailed part.
+
+    Returns:
+        The requested digits, or ``None`` when ``id`` is missing, when the
+        code is too short to carry that level, or when ``granularity`` is
+        not 0, 1 or 2.
+    """
+    if pd.isna(id):
+        return None
+
+    text = str(id)
+    digits = [x for x in text if x in string.digits]
+
+    # A single-character integer part means the leading zero was dropped.
+    if len(text.split(".")[0]) == 1:
+        digits = ['0', *digits]
+
+    # Levels are two digits each, so an odd count means a trailing zero was
+    # dropped. After this the count is always even.
+    if len(digits) % 2 == 1:
+        digits = [*digits, '0']
+
+    if granularity == 0:
+        return "".join(digits[:2])
+
+    if granularity == 1:
+        return "".join(digits[2:4]) if len(digits) >= 4 else None
+
+    if granularity == 2:
+        return "".join(digits[4:]) if len(digits) >= 6 else None
+
+    return None
+
+
+def tests():
+    """Run the module's self-checks.
+
+    Raises:
+        ValueError: On the first failing case; the message names the call
+            and shows the actual and expected values.
+    """
+    # (id, granularity, expected). 1.23 stands for the code 01.23 once its
+    # leading zero has been lost to float parsing.
+    get_id_cases = [
+        (1.0, 0, "01"),
+        (1.0, 1, "00"),
+        (10.2320, 2, "20"),
+        (10.2320, 1, "23"),
+        (10.2320, 0, "10"),
+        (1.23, 2, None),
+        (1.23, 0, "01"),
+    ]
+    for value, granularity, expected in get_id_cases:
+        actual = get_id(value, granularity)
+        if actual != expected:
+            raise ValueError(
+                f"Test failed: get_id({value!r}, {granularity}) returned {actual!r}, expected {expected!r}")
+
+    expected_fields = ['Biological and biomedical sciences', 'Neurobiology and neurosciences', None]
+    actual_fields = map_id_to_fields_of_science("26.15")
+    if actual_fields != expected_fields:
+        raise ValueError(
+            f"Test failed: map_id_to_fields_of_science('26.15') returned {actual_fields!r}, expected {expected_fields!r}")
+
+
+if __name__ == "__main__":
+    tests()
+    print("All tests passed")
diff --git a/.github/scripts/check_project_fos_precision/main.py b/.github/scripts/check_project_fos_precision/main.py
@@ -0,0 +1,88 @@
+import sys
+import datetime
+
+import yaml
+import requests
+
+from field_of_science import get_id
+
+
+def get_active_projects(start_date: datetime.datetime, timeout: float = 60):
+    """Return the names of projects with Payload records since ``start_date``.
+
+    Queries the GRACC summary index for records whose ``EndTime`` lies
+    between ``start_date`` and now, aggregated by ``ProjectName``.
+
+    Args:
+        start_date: Lower bound of the ``EndTime`` range.
+        timeout: Seconds to wait for the HTTP request before giving up, so a
+            stalled connection cannot hang the job indefinitely.
+
+    Returns:
+        A list with the key of every bucket in the ``projects`` aggregation.
+
+    Raises:
+        requests.HTTPError: If the server answers with an error status.
+        requests.Timeout: If the request exceeds ``timeout``.
+    """
+    query = {
+        "size": 0,
+        "query": {
+            "bool": {
+                "filter": [
+                    {
+                        "term": {
+                            "ResourceType": "Payload"
+                        }
+                    },
+                    {
+                        "range": {
+                            "EndTime": {
+                                "lte": int(datetime.datetime.now().timestamp() * 1000),
+                                "gte": int(start_date.timestamp() * 1000)
+                            }
+                        }
+                    }
+                ]
+            },
+        },
+        "aggs": {
+            "projects": {
+                "terms": {
+                    "field": "ProjectName",
+                    "size": 99999999
+                },
+                "aggs": {
+                    "projectJobsRan": {
+                        "sum": {
+                            "field": "Njobs"
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    response = requests.get(
+        "https://gracc.opensciencegrid.org/q/gracc.osg.summary/_search",
+        json=query,
+        timeout=timeout,
+    )
+    # Fail loudly on an HTTP error instead of a confusing KeyError below.
+    response.raise_for_status()
+
+    data = response.json()
+
+    return [bucket['key'] for bucket in data['aggregations']['projects']['buckets']]
+
+
+
+def has_detailed_precision(id: str):
+    """Tell whether ``id`` carries more than the broad (first two-digit) level.
+
+    Note that this checks ``get_id`` granularity 1, the second level of the
+    code, which field_of_science.py stores as ``MajorFieldId``.
+    """
+    second_level = get_id(id, granularity=1)
+    return second_level is not None
+
+
+def main():
+    """Fail if any project active in the last year lacks a precise FieldOfScienceID.
+
+    For every active project name that has a ``projects/<name>.yaml`` file,
+    the file must define ``FieldOfScienceID`` and the id must pass
+    ``has_detailed_precision``. Offending projects are listed on stderr.
+
+    Raises:
+        Exception: If at least one project fails the check.
+    """
+    from pathlib import Path
+
+    # Resolve the projects directory from this file's location rather than
+    # the current working directory: the workflow starts the script from the
+    # repository root, where "../../../projects" points outside the checkout
+    # and every lookup would be skipped as "file not found".
+    projects_dir = Path(__file__).resolve().parents[3] / "projects"
+
+    one_year_ago = datetime.datetime.now() - datetime.timedelta(days=365)
+    active_project_names = get_active_projects(one_year_ago)
+
+    print(active_project_names)
+
+    exceptions = []
+    for project_name in active_project_names:
+        try:
+            with open(projects_dir / f"{project_name}.yaml", encoding="utf-8") as project_file:
+                # NOTE(review): switched from yaml.load(..., Loader=yaml.Loader)
+                # to safe_load. These files come from pull requests, and the
+                # full loader can construct arbitrary Python objects.
+                project_data = yaml.safe_load(project_file)
+        except FileNotFoundError:
+            # Active project names without a project file are skipped on purpose.
+            continue
+
+        # An empty or non-mapping document has no FieldOfScienceID either.
+        if (
+            not isinstance(project_data, dict)
+            or "FieldOfScienceID" not in project_data
+            or not has_detailed_precision(project_data["FieldOfScienceID"])
+        ):
+            exceptions.append(f"Project {project_name} is running in the OSPool without detailed precision.")
+
+    if exceptions:
+        print("\n".join(exceptions), file=sys.stderr)
+        raise Exception("Projects without detailed precision need to be updated.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/scripts/check_project_fos_precision/requirements.txt b/.github/scripts/check_project_fos_precision/requirements.txt
@@ -0,0 +1,68 @@
+asn1==2.7.0
+async-generator==1.10
+attrs==21.4.0
+beautifulsoup4==4.11.1
+blinker==1.6.3
+certifi==2024.2.2
+cffi==1.15.0
+chardet==5.1.0
+click==6.7
+configobj==5.0.8
+cryptography==37.0.2
+Deprecated==1.2.13
+enum-compat==0.0.3
+Flask==1.0.4
+Flask-WTF==0.14.3
+gitdb==4.0.11
+GitPython==3.1.43
+gunicorn==20.1.0
+h11==0.13.0
+icalendar==5.0.12
+idna==3.7
+iniconfig==1.1.1
+itsdangerous==0.24
+Jinja2==2.11.3
+ldap3==2.9.1
+MarkupSafe==2.0.1
+numpy==1.26.4
+outcome==1.1.0
+packaging==21.3
+pandas==2.2.2
+pluggy==1.0.0
+prometheus-client==0.20.0
+py==1.11.0
+pyasn1==0.5.1
+pyasn1-modules==0.2.8
+pycparser==2.21
+PyGithub==1.57
+PyJWT==2.6.0
+PyNaCl==1.5.0
+pyOpenSSL==22.0.0
+pyparsing==3.0.7
+PySocks==1.7.1
+pytest==7.1.1
+pytest-mock==3.7.0
+python-dateutil==2.8.2
+python-gnupg==0.5.2
+python-ldap==3.3.1
+pytz==2024.1
+PyYAML==6.0.1
+requests==2.25.1
+selenium==4.1.3
+six==1.16.0
+smmap==5.0.1
+sniffio==1.2.0
+sortedcontainers==2.4.0
+soupsieve==2.3.2.post1
+tomli==2.0.1
+tqdm==4.64.0
+trio==0.20.0
+trio-websocket==0.9.2
+tzdata==2024.1
+urllib3==1.26.6
+webdriverdownloader==1.1.0.3
+Werkzeug==0.15.6
+wrapt==1.14.1
+wsproto==1.1.0
+WTForms==3.0.1
+xmltodict==0.13.0
diff --git a/.github/workflows/check_project_fos_precision.yml b/.github/workflows/check_project_fos_precision.yml
@@ -0,0 +1,21 @@
+name: Check Project FOS Precision
+on:
+  pull_request:
+    branches:
+      - main
+  schedule:
+    - cron: '0 0 * * *'
+
+jobs:
+  check:
+    name: Check
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.9.15
+          cache: 'pip' # caching pip dependencies
+      - run: pip install -r ./.github/scripts/check_project_fos_precision/requirements.txt
+      - run: python ./.github/scripts/check_project_fos_precision/main.py
diff --git a/projects/ACE_NIAID.yaml b/projects/ACE_NIAID.yaml
@@ -18,3 +18,4 @@ Sponsor:
   CampusGrid:
     Name: OSG Connect
 InstitutionID: 'https://osg-htc.org/iid/451cgt72wj62'
+FieldOfScienceID: '26.1103'
diff --git a/projects/AMFORA.yaml b/projects/AMFORA.yaml
@@ -10,3 +10,4 @@ Sponsor:
   CampusGrid:
     Name: OSG Connect
 InstitutionID: 'https://osg-htc.org/iid/o14joi278jrs'
+FieldOfScienceID: '11'
diff --git a/projects/AMNH.astro.yaml b/projects/AMNH.astro.yaml
@@ -10,3 +10,4 @@ Sponsor:
   CampusGrid:
     Name: OSG Connect
 InstitutionID: 'https://osg-htc.org/iid/em2w05s9c1uc'
+FieldOfScienceID: '40.02'
diff --git a/projects/AMNH.yaml b/projects/AMNH.yaml
@@ -10,3 +10,4 @@ Sponsor:
   CampusGrid:
     Name: OSG Connect
 InstitutionID: 'https://osg-htc.org/iid/em2w05s9c1uc'
+FieldOfScienceID: '54.0101'
diff --git a/projects/AMNH_Burbrink.yaml b/projects/AMNH_Burbrink.yaml
@@ -10,3 +10,4 @@ Sponsor:
   CampusGrid:
     Name: OSG Connect
 InstitutionID: 'https://osg-htc.org/iid/em2w05s9c1uc'
+FieldOfScienceID: '26'
diff --git a/projects/AMNH_MacLow.yaml b/projects/AMNH_MacLow.yaml
@@ -9,3 +9,4 @@ Sponsor:
   CampusGrid:
     Name: OSG Connect
 InstitutionID: 'https://osg-htc.org/iid/em2w05s9c1uc'
+FieldOfScienceID: '40.02'