Merge branch 'master' into update_ucsd

opensciencegrid · Jul 15, 2024 · 724815e · 724815e
2 parents 8bc9abe + 1e61375
commit 724815e
Show file tree

Hide file tree

Showing 1,491 changed files with 24,050 additions and 1,778 deletions.
diff --git a/.github/scripts/check_project_fos_precision/field_of_science.py b/.github/scripts/check_project_fos_precision/field_of_science.py
@@ -0,0 +1,188 @@
+from functools import lru_cache
+from typing import Union
+import string
+
+import pandas as pd
+
+
+@lru_cache()
+def get_cip_df():
+    """Load the SED-CIP 2022 spreadsheet and add parsed id columns.
+
+    Reads ``data/SED-CIP-2022.xlsx`` (resolved against the current working
+    directory), uses the row at position 2 of the loaded frame as the column
+    titles, keeps the rows from position 3 onward, and adds the
+    ``BroadFieldId``, ``MajorFieldId`` and ``DetailedFieldId`` columns derived
+    from ``SED-CIP code`` via ``get_id``.
+
+    The result is memoized by ``lru_cache``: every caller receives the same
+    DataFrame object, so callers must not mutate it.
+    """
+    cip_df = pd.read_excel("data/SED-CIP-2022.xlsx")
+
+    # Drop the leading rows and make the row at position 2 the column title
+    cip_df.columns = cip_df.iloc[2]
+
+    # Copy the slice so the column assignments below write to an independent
+    # frame rather than to a slice of the loaded one (which pandas reports
+    # with SettingWithCopyWarning).
+    cip_df = cip_df.iloc[3:].copy()
+
+    cip_df["BroadFieldId"] = cip_df['SED-CIP code'].apply(lambda x: get_id(x, 0))
+    cip_df["MajorFieldId"] = cip_df['SED-CIP code'].apply(lambda x: get_id(x, 1))
+    cip_df["DetailedFieldId"] = cip_df['SED-CIP code'].apply(lambda x: get_id(x, 2))
+
+    return cip_df
+
+
+def get_matching_rows(cip_df, broad_id, major_id, detailed_id):
+    """Return the rows of *cip_df* matching the ids at the finest grain available.
+
+    Tries broad + major + detailed first, then broad + major, then broad
+    alone, and returns the first non-empty selection.
+
+    Raises:
+        ValueError: if not even the broad id matches any row.
+    """
+    same_broad = cip_df["BroadFieldId"] == broad_id
+    same_major = cip_df['MajorFieldId'] == major_id
+    same_detailed = cip_df["DetailedFieldId"] == detailed_id
+
+    # Ordered from the finest grain to the coarsest.
+    masks_by_grain = (
+        same_broad & same_major & same_detailed,
+        same_broad & same_major,
+        same_broad,
+    )
+
+    for mask in masks_by_grain:
+        candidates = cip_df[mask]
+        if len(candidates) > 0:
+            return candidates
+
+    raise ValueError(f"No matching rows for {broad_id}.{major_id}{detailed_id}")
+
+
+def _most_frequent_field(cip_df, id_mask, column, candidates):
+    """Return the candidate value of *column* found on the most rows selected by *id_mask*.
+
+    Returns None when no candidate matches any selected row. On a tie the
+    candidate encountered first while iterating *candidates* is kept.
+    """
+    best_option = None
+    max_rows = 0
+    for candidate in candidates:
+        row_count = int((id_mask & (cip_df[column] == candidate)).sum())
+        if row_count > max_rows:
+            max_rows = row_count
+            best_option = candidate
+    return best_option
+
+
+def map_id_to_fields_of_science(id: str):
+    """Map a SED-CIP code to ``[broad, major, detailed]`` field-of-science names.
+
+    An exact match on ``SED-CIP code`` is returned directly. Otherwise the
+    code is split with ``get_id``, the closest rows are found with
+    ``get_matching_rows``, and for every level present in the code the name
+    used by the most rows sharing that level's ids is chosen. Levels absent
+    from the code stay None; if nothing matches, the id is printed and
+    ``[None, None, None]`` is returned.
+
+    The parameter name ``id`` shadows the builtin; it is kept because it is
+    part of the function's interface.
+    """
+    cip_df = get_cip_df()
+
+    # If we have a direct match, return it
+    direct_match = cip_df[cip_df["SED-CIP code"] == id]
+    if len(direct_match) > 0:
+        return [
+            direct_match["New broad field"].values[0],
+            direct_match["New major field"].values[0],
+            direct_match["New detailed field"].values[0],
+        ]
+
+    broad_id = get_id(id, 0)
+    major_id = get_id(id, 1)
+    detailed_id = get_id(id, 2)
+
+    try:
+        matching_rows = get_matching_rows(cip_df, broad_id, major_id, detailed_id)
+    except ValueError:
+        # Nothing in the table shares even the broad id; report the id and give up.
+        print(id)
+        return [None, None, None]
+
+    code_label = f"{broad_id}.{major_id}{detailed_id}"
+
+    # Row selections for each level, each one narrowing the previous.
+    same_broad = cip_df["BroadFieldId"] == broad_id
+    same_major = same_broad & (cip_df['MajorFieldId'] == major_id)
+    same_detailed = same_major & (cip_df["DetailedFieldId"] == detailed_id)
+
+    broad_field_of_science = None
+    if broad_id is not None:
+        possible_broad_fields = set(matching_rows["New broad field"])
+        broad_field_of_science = _most_frequent_field(
+            cip_df, same_broad, "New broad field", possible_broad_fields)
+        print(f"Broad Field: {code_label} has possible values {possible_broad_fields} we picked {broad_field_of_science}")
+
+    major_field_of_science = None
+    if major_id is not None:
+        possible_major_fields = set(matching_rows["New major field"])
+        major_field_of_science = _most_frequent_field(
+            cip_df, same_major, "New major field", possible_major_fields)
+        print(f"Major Field: {code_label} has rows {possible_major_fields} we picked {major_field_of_science}")
+
+    detailed_field_of_science = None
+    if detailed_id is not None:
+        possible_detailed_fields = set(matching_rows["New detailed field"])
+        detailed_field_of_science = _most_frequent_field(
+            cip_df, same_detailed, "New detailed field", possible_detailed_fields)
+        print(f"Detailed Field: {code_label} has rows {possible_detailed_fields} we picked {detailed_field_of_science}")
+
+    return [broad_field_of_science, major_field_of_science, detailed_field_of_science]
+
+
+def get_id(id: Union[float, str], granularity: int):
+    """Extract one level of a SED-CIP code as a digit string.
+
+    The code is reduced to its digits and normalized to an even number of
+    them: a single-digit part before the ``.`` gets a leading ``0`` (``1.23``
+    is treated as ``01.23``), and an odd digit count gets a trailing ``0``
+    (the float ``10.2320`` prints as ``10.232``).
+
+    Args:
+        id: The code, as a string or as the float pandas may have parsed it into.
+        granularity: 0 for the broad level (first two digits), 1 for the
+            major level (digits 3-4), 2 for the detailed level (every digit
+            after the fourth).
+
+    Returns:
+        The digits for the requested level, or None when *id* is missing,
+        the code does not reach that level, or *granularity* is not 0, 1 or 2.
+    """
+    # Check if None
+    if pd.isna(id):
+        return None
+
+    # Fix up issues from reading the id as a float
+    code = str(id)
+    digits = [char for char in code if char in string.digits]
+
+    # If the first part is preceded with a 0, (01.2023)
+    if len(code.split(".")[0]) == 1:
+        digits = ['0', *digits]
+
+    # If the number ends with a 0, (10.2320)
+    if len(digits) % 2 == 1:
+        digits = [*digits, '0']
+
+    if granularity == 0:
+        return "".join(digits[:2])
+
+    if granularity == 1:
+        return "".join(digits[2:4]) if len(digits) >= 4 else None
+
+    if granularity == 2:
+        return "".join(digits[4:]) if len(digits) >= 6 else None
+
+    return None
+
+
+def tests():
+    """Self-check ``get_id`` and ``map_id_to_fields_of_science``.
+
+    Raises:
+        ValueError: on the first case whose result differs from the expected
+            value; the message names the failing call.
+    """
+    # (code, granularity, expected result)
+    get_id_cases = [
+        (1.0, 0, "01"),
+        (1.0, 1, "00"),
+        (10.2320, 2, "20"),
+        (10.2320, 1, "23"),
+        (10.2320, 0, "10"),
+        (1.23, 2, None),
+        (1.23, 0, "01"),
+    ]
+
+    for code, granularity, expected in get_id_cases:
+        actual = get_id(code, granularity)
+        if actual != expected:
+            raise ValueError(
+                f"Test failed: get_id({code!r}, {granularity}) returned {actual!r}, expected {expected!r}")
+
+    expected_fields = ['Biological and biomedical sciences', 'Neurobiology and neurosciences', None]
+    actual_fields = map_id_to_fields_of_science("26.15")
+    if actual_fields != expected_fields:
+        raise ValueError(
+            f"Test failed: map_id_to_fields_of_science('26.15') returned {actual_fields!r}, expected {expected_fields!r}")
+
+
+# Run the self-checks when this module is executed directly; tests() raises
+# ValueError on failure, so the message below is only reached on success.
+if __name__ == "__main__":
+    tests()
+    print("All tests passed")
diff --git a/.github/scripts/check_project_fos_precision/main.py b/.github/scripts/check_project_fos_precision/main.py
@@ -0,0 +1,88 @@
+import sys
+import datetime
+
+import yaml
+import requests
+
+from field_of_science import get_id
+
+
+def get_active_projects(start_date: datetime.datetime):
+    """Return the project names GRACC reports between *start_date* and now.
+
+    Queries the ``gracc.osg.summary`` index for records with
+    ``ResourceType`` equal to ``Payload`` whose ``EndTime`` lies in the
+    window, aggregated by ``ProjectName``.
+
+    Args:
+        start_date: Beginning of the time window.
+
+    Returns:
+        A list with the key of every ``projects`` aggregation bucket.
+
+    Raises:
+        requests.RequestException: if the request fails, times out, or the
+            server answers with an HTTP error status.
+    """
+    window_start_ms = int(start_date.timestamp() * 1000)
+    window_end_ms = int(datetime.datetime.now().timestamp() * 1000)
+
+    response = requests.get(
+        "https://gracc.opensciencegrid.org/q/gracc.osg.summary/_search",
+        json={
+            # Only the aggregation is needed, not the matching documents.
+            "size": 0,
+            "query": {
+                "bool": {
+                    "filter": [
+                        {
+                            "term": {
+                                "ResourceType": "Payload"
+                            }
+                        },
+                        {
+                            "range": {
+                                "EndTime": {
+                                    "lte": window_end_ms,
+                                    "gte": window_start_ms
+                                }
+                            }
+                        }
+                    ]
+                },
+            },
+            "aggs": {
+                "projects": {
+                    "terms": {
+                        "field": "ProjectName",
+                        "size": 99999999
+                    },
+                    "aggs": {
+                        "projectJobsRan": {
+                            "sum": {
+                                "field": "Njobs"
+                            }
+                        }
+                    }
+                }
+            }
+        },
+        # Without a timeout an unresponsive server would hang the job forever.
+        timeout=60,
+    )
+
+    # Fail with the HTTP error itself rather than a KeyError on the body below.
+    response.raise_for_status()
+
+    data = response.json()
+
+    active_projects = [x['key'] for x in data['aggregations']['projects']['buckets']]
+
+    return active_projects
+
+
+
+def has_detailed_precision(id: str):
+    """Report whether *id* yields a value at granularity 1.
+
+    Returns True when ``get_id(id, granularity=1)`` is not None, else False.
+
+    NOTE(review): the name says "detailed" but the check uses granularity 1
+    rather than 2 -- confirm this is the intended threshold.
+    """
+    major_component = get_id(id, granularity=1)
+    return major_component is not None
+
+
+def main():
+    """Fail when a project active in the last 365 days lacks a precise field of science.
+
+    For every project name returned by ``get_active_projects``, loads
+    ``../../../projects/<name>.yaml`` (relative to the current working
+    directory) and records the project when ``FieldOfScienceID`` is missing
+    or fails ``has_detailed_precision``. Names without a matching file are
+    skipped.
+
+    Raises:
+        Exception: if at least one project was recorded; the individual
+            findings are written to stderr first.
+    """
+    one_year_ago = datetime.datetime.now() - datetime.timedelta(days=365)
+    active_project_names = get_active_projects(one_year_ago)
+
+    print(active_project_names)
+
+    exceptions = []
+    for project_name in active_project_names:
+        try:
+            with open(f"../../../projects/{project_name}.yaml") as project_file:
+                # NOTE(review): yaml.Loader can construct arbitrary Python
+                # objects; if project files may come from untrusted pull
+                # requests, consider yaml.safe_load instead.
+                project_data = yaml.load(project_file, Loader=yaml.Loader)
+        except FileNotFoundError:
+            # Deliberately skip names that have no project file.
+            continue
+
+        # An empty YAML file loads as None; treat it as having no fields.
+        project_data = project_data or {}
+
+        if "FieldOfScienceID" not in project_data or not has_detailed_precision(project_data["FieldOfScienceID"]):
+            exceptions.append(f"Project {project_name} is running in the OSPool without detailed precision.")
+
+    if exceptions:
+        # file= is required: a positional sys.stderr is just printed as text.
+        print("\n".join(exceptions), file=sys.stderr)
+        raise Exception("Projects without detailed precision need to be updated.")
+
+
+# Run the precision check when executed as a script; main() raises if any
+# project is reported, which makes the process exit with a failure status.
+if __name__ == "__main__":
+    main()
diff --git a/.github/scripts/check_project_fos_precision/requirements.txt b/.github/scripts/check_project_fos_precision/requirements.txt
@@ -0,0 +1,12 @@
+certifi==2024.7.4
+charset-normalizer==3.3.2
+idna==3.7
+numpy==1.26.4
+pandas==2.2.2
+python-dateutil==2.9.0.post0
+pytz==2024.1
+PyYAML==6.0.1
+requests==2.32.0
+six==1.16.0
+tzdata==2024.1
+urllib3==2.2.2
diff --git a/.github/workflows/build-client-container.yml b/.github/workflows/build-client-container.yml
@@ -19,7 +19,7 @@ jobs:
     steps:
     - name: make date tag
       id: mkdatetag
-      run: echo "::set-output name=dtag::$(date +%Y%m%d-%H%M)"
+      run: echo "dtag=$(date +%Y%m%d-%H%M)" >> $GITHUB_OUTPUT
 
   build:
     runs-on: ubuntu-latest
@@ -29,7 +29,7 @@ jobs:
       fail-fast: False
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
 
     - name: Generate tag list
       id: generate-tag-list
@@ -43,26 +43,26 @@ jobs:
           tag_list+=($registry/$docker_repo:release-$TIMESTAMP)
         done
         IFS=,
-        echo "::set-output name=taglist::${tag_list[*]}"
+        echo "taglist=${tag_list[*]}" >> $GITHUB_OUTPUT
 
     - name: Set up Docker Buildx
-      uses: docker/setup-buildx-action@v1
+      uses: docker/setup-buildx-action@v2.7.0
 
     - name: Log in to Docker Hub
-      uses: docker/login-action@v1
+      uses: docker/login-action@v2.2.0
       with:
         username: ${{ secrets.DOCKER_USERNAME }}
         password: ${{ secrets.DOCKER_PASSWORD }}
 
     - name: Log in to OSG Harbor
-      uses: docker/login-action@v1
+      uses: docker/login-action@v2.2.0
       with:
         registry: hub.opensciencegrid.org
         username: ${{ secrets.OSG_HARBOR_ROBOT_USER }}
         password: ${{ secrets.OSG_HARBOR_ROBOT_PASSWORD }}
 
     - name: Build and push Client Docker images
-      uses: docker/build-push-action@v2.2.0
+      uses: docker/build-push-action@v4
       with:
         push: true
         tags: "${{ steps.generate-tag-list.outputs.taglist }}"

diff --git a/.github/workflows/build-sw-container.yml b/.github/workflows/build-sw-container.yml
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     if: startsWith(github.repository, 'opensciencegrid/')
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
 
     - name: Generate tag list
       id: generate-tag-list
@@ -28,26 +28,26 @@ jobs:
         # This causes the tag_list array to be comma-separated below,
         # which is required for build-push-action
         IFS=,
-        echo "::set-output name=taglist::${tag_list[*]}"
+        echo "taglist=${tag_list[*]}" >> $GITHUB_OUTPUT
 
     - name: Set up Docker Buildx
-      uses: docker/setup-buildx-action@v1
+      uses: docker/setup-buildx-action@v2.7.0
 
     - name: Log in to Docker Hub
-      uses: docker/login-action@v1
+      uses: docker/login-action@v2.2.0
       with:
         username: ${{ secrets.DOCKER_USERNAME }}
         password: ${{ secrets.DOCKER_PASSWORD }}
 
     - name: Log in to OSG Harbor
-      uses: docker/login-action@v1
+      uses: docker/login-action@v2.2.0
       with:
         registry: hub.opensciencegrid.org
         username: ${{ secrets.OSG_HARBOR_ROBOT_USER }}
         password: ${{ secrets.OSG_HARBOR_ROBOT_PASSWORD }}
 
     - name: Build and push Docker images
-      uses: docker/build-push-action@v2.2.0
+      uses: docker/build-push-action@v4
       with:
         context: .
         push: true