Skip to content

Commit

Permalink
Merge branch 'master' into update_ucsd
Browse files Browse the repository at this point in the history
  • Loading branch information
matyasselmeci authored Jul 15, 2024
2 parents 8bc9abe + 1e61375 commit 724815e
Show file tree
Hide file tree
Showing 1,491 changed files with 24,050 additions and 1,778 deletions.
188 changes: 188 additions & 0 deletions .github/scripts/check_project_fos_precision/field_of_science.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
from functools import lru_cache
from typing import Union
import string

import pandas as pd


@lru_cache()
def get_cip_df():
    """Load the SED-CIP 2022 spreadsheet and add parsed id columns.

    The workbook is read from the relative path ``data/SED-CIP-2022.xlsx``,
    so the call depends on the current working directory. The result is
    memoized by ``lru_cache``: every caller receives the same DataFrame
    object and should treat it as read-only.

    Returns:
        A DataFrame whose column labels are taken from row index 2 of the
        frame as read, holding the rows from index 3 onwards, plus three
        derived columns (``BroadFieldId``, ``MajorFieldId``,
        ``DetailedFieldId``) computed from ``SED-CIP code`` with ``get_id``.
    """
    cip_df = pd.read_excel("data/SED-CIP-2022.xlsx")

    # Drop the first two rows and make the third row the column title.
    cip_df.columns = cip_df.iloc[2]
    # copy() detaches the result from the frame it was sliced from, so the
    # column assignments below write to an independent frame rather than to a
    # slice (which triggers pandas' SettingWithCopyWarning).
    cip_df = cip_df.iloc[3:].copy()

    id_columns = (("BroadFieldId", 0), ("MajorFieldId", 1), ("DetailedFieldId", 2))
    for column, granularity in id_columns:
        # Bind granularity as a default so each lambda keeps its own value.
        cip_df[column] = cip_df["SED-CIP code"].apply(
            lambda code, granularity=granularity: get_id(code, granularity)
        )

    return cip_df


def get_matching_rows(cip_df, broad_id, major_id, detailed_id):
    """Return the rows of *cip_df* matching the ids at the finest grain available.

    The lookup is attempted on broad + major + detailed ids first, then on
    broad + major, and finally on the broad id alone; the first non-empty
    selection is returned.

    Raises:
        ValueError: if not even the broad id matches any row.
    """
    same_broad = cip_df["BroadFieldId"] == broad_id
    same_major = same_broad & (cip_df["MajorFieldId"] == major_id)
    same_detailed = same_major & (cip_df["DetailedFieldId"] == detailed_id)

    # Ordered from the finest grain to the coarsest.
    for selector in (same_detailed, same_major, same_broad):
        selected = cip_df[selector]
        if len(selected) > 0:
            return selected

    raise ValueError(f"No matching rows for {broad_id}.{major_id}{detailed_id}")


def _pick_most_common_field(cip_df, scope_mask, column, candidates):
    """Return the candidate labelling the most rows of *cip_df* within *scope_mask*.

    Candidates are tried in the order given and only a strictly larger row
    count replaces the current pick, so ties go to the earliest candidate.
    Returns None when no candidate labels any row in scope.
    """
    best_option = None
    max_rows = 0
    for candidate in candidates:
        row_count = int((scope_mask & (cip_df[column] == candidate)).sum())
        if row_count > max_rows:
            max_rows = row_count
            best_option = candidate
    return best_option


def map_id_to_fields_of_science(id: str):
    """Map a field-of-science id to ``[broad, major, detailed]`` field names.

    An exact match on the ``SED-CIP code`` column is returned directly.
    Otherwise the id is split with ``get_id`` and, for every granularity the
    id provides, the field name used by the most rows sharing that id prefix
    is chosen from the names present in the rows found by
    ``get_matching_rows``.

    Candidates are evaluated in order of first appearance in the matching
    rows, so equal counts resolve the same way on every run (iterating a
    ``set`` of strings would make the tie-break depend on hash randomization).

    Args:
        id: the id to look up, e.g. ``"26.15"``.

    Returns:
        A three-item list ``[broad, major, detailed]``; an entry is None when
        the id lacks that granularity or no row supports a choice. All three
        are None (and the id is printed) when nothing matches at all.
    """
    # Define the fields we hope to populate
    broad_field_of_science = None
    major_field_of_science = None
    detailed_field_of_science = None

    cip_df = get_cip_df()

    # If we have a direct match, return it
    direct_match = cip_df[cip_df["SED-CIP code"] == id]
    if len(direct_match) > 0:
        return [
            direct_match["New broad field"].values[0],
            direct_match["New major field"].values[0],
            direct_match["New detailed field"].values[0],
        ]

    broad_id = get_id(id, 0)
    major_id = get_id(id, 1)
    detailed_id = get_id(id, 2)

    try:
        matching_rows = get_matching_rows(cip_df, broad_id, major_id, detailed_id)
    except ValueError:
        print(id)
        return [broad_field_of_science, major_field_of_science, detailed_field_of_science]

    # Cumulative row scopes: each one narrows the previous by one more id part.
    broad_scope = cip_df["BroadFieldId"] == broad_id
    major_scope = broad_scope & (cip_df["MajorFieldId"] == major_id)
    detailed_scope = major_scope & (cip_df["DetailedFieldId"] == detailed_id)

    # dict.fromkeys de-duplicates while keeping first-appearance order.
    possible_broad_fields = list(dict.fromkeys(matching_rows["New broad field"]))
    if broad_id is not None:
        broad_field_of_science = _pick_most_common_field(
            cip_df, broad_scope, "New broad field", possible_broad_fields
        )
        print(f"Broad Field: {broad_id}.{major_id}{detailed_id} has possible values {set(possible_broad_fields)} we picked {broad_field_of_science}")

    possible_major_fields = list(dict.fromkeys(matching_rows["New major field"]))
    if major_id is not None:
        major_field_of_science = _pick_most_common_field(
            cip_df, major_scope, "New major field", possible_major_fields
        )
        print(f"Major Field: {broad_id}.{major_id}{detailed_id} has rows {set(possible_major_fields)} we picked {major_field_of_science}")

    possible_detailed_fields = list(dict.fromkeys(matching_rows["New detailed field"]))
    if detailed_id is not None:
        detailed_field_of_science = _pick_most_common_field(
            cip_df, detailed_scope, "New detailed field", possible_detailed_fields
        )
        print(f"Detailed Field: {broad_id}.{major_id}{detailed_id} has rows {set(possible_detailed_fields)} we picked {detailed_field_of_science}")

    return [broad_field_of_science, major_field_of_science, detailed_field_of_science]


def get_id(id: Union[float, str], granularity: int):
    """Extract one two-digit component from a field-of-science id.

    The id is reduced to its digit characters and repaired for damage caused
    by having been read as a number: a single-character part before the
    ``.`` gets its leading zero back (``1.23`` -> ``0123``) and an odd digit
    count gets a trailing zero back (``10.232`` -> ``102320``).

    NOTE: a float that lost *two* trailing zeros (e.g. ``10.2300`` read as
    ``10.23``) cannot be told apart from a shorter id and is not repaired.

    Args:
        id: the id as a string or number; NA values yield None.
        granularity: 0 for the broad part (first two digits), 1 for the
            major part (digits 3-4), 2 for the detailed part (everything
            from digit 5 onwards).

    Returns:
        The requested digits as a string, or None when the id is NA, does
        not carry that granularity, or *granularity* is not 0, 1 or 2.
    """
    # Check if None
    if pd.isna(id):
        return None

    text = str(id)

    # Fix up issues from reading the id as a float
    digits = [char for char in text if char in string.digits]

    # If the first part is preceded with a 0, (01.2023)
    if len(text.split(".")[0]) == 1:
        digits.insert(0, "0")

    # If the number ends with a 0, (10.2320)
    # After this the digit count is always even, so no further padding is needed.
    if len(digits) % 2 == 1:
        digits.append("0")

    if granularity == 0:
        return "".join(digits[:2])

    if granularity == 1:
        return "".join(digits[2:4]) if len(digits) >= 4 else None

    if granularity == 2:
        return "".join(digits[4:]) if len(digits) >= 6 else None

    # Unknown granularity: same result as the original implicit fall-through.
    return None


def tests():
    """Run the module's self-checks, raising ValueError on the first failure.

    The ``get_id`` cases run first, in order; the final check calls
    ``map_id_to_fields_of_science`` and therefore needs the spreadsheet that
    ``get_cip_df`` reads. Each failure message names the failing call and the
    value it returned so the broken case can be identified.
    """
    get_id_cases = [
        ((1.0, 0), "01"),
        ((1.0, 1), "00"),
        ((10.2320, 2), "20"),
        ((10.2320, 1), "23"),
        ((10.2320, 0), "10"),
        # 1.23 is the value of the literal 01.23: the leading zero is lost.
        ((1.23, 2), None),
        ((1.23, 0), "01"),
    ]
    for args, expected in get_id_cases:
        actual = get_id(*args)
        if actual != expected:
            raise ValueError(f"Test failed: get_id{args} returned {actual!r}, expected {expected!r}")

    expected_fields = ['Biological and biomedical sciences', 'Neurobiology and neurosciences', None]
    actual_fields = map_id_to_fields_of_science("26.15")
    if actual_fields != expected_fields:
        raise ValueError(
            f"Test failed: map_id_to_fields_of_science('26.15') returned {actual_fields!r}, "
            f"expected {expected_fields!r}"
        )

# Run the self-checks when this module is executed as a script; tests()
# raises on the first failure, so the message below is only printed when
# every check passed.
if __name__ == "__main__":
    tests()
    print("All tests passed")
88 changes: 88 additions & 0 deletions .github/scripts/check_project_fos_precision/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import sys
import datetime

import yaml
import requests

from field_of_science import get_id


def get_active_projects(start_date: datetime.datetime, timeout: float = 60.0):
    """Return the project names GRACC reports between *start_date* and now.

    Sends one search request to the ``gracc.osg.summary`` index, filtered to
    records whose ``ResourceType`` is ``"Payload"`` and whose ``EndTime`` lies
    between *start_date* and the current time, aggregated by ``ProjectName``.

    Args:
        start_date: lower bound of the ``EndTime`` range.
        timeout: seconds to wait for the server before giving up; without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The list of aggregation bucket keys (project names).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within *timeout*.
    """
    query = {
        # No individual hits are needed, only the aggregation buckets.
        "size": 0,
        "query": {
            "bool": {
                "filter": [
                    {
                        "term": {
                            "ResourceType": "Payload"
                        }
                    },
                    {
                        "range": {
                            "EndTime": {
                                # The range bounds are sent as epoch milliseconds.
                                "lte": int(datetime.datetime.now().timestamp() * 1000),
                                "gte": int(start_date.timestamp() * 1000)
                            }
                        }
                    }
                ]
            },
        },
        "aggs": {
            "projects": {
                "terms": {
                    "field": "ProjectName",
                    "size": 99999999
                },
                "aggs": {
                    "projectJobsRan": {
                        "sum": {
                            "field": "Njobs"
                        }
                    }
                }
            }
        }
    }

    response = requests.get(
        "https://gracc.opensciencegrid.org/q/gracc.osg.summary/_search",
        json=query,
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of a KeyError/JSON error further down.
    response.raise_for_status()

    data = response.json()

    return [bucket['key'] for bucket in data['aggregations']['projects']['buckets']]



def has_detailed_precision(id: str):
    """Return True when ``get_id`` can extract a granularity-1 part from *id*.

    NOTE(review): the function name says "detailed" but the check uses
    granularity 1 rather than 2 - confirm this is the intended precision.
    """
    major_part = get_id(id, granularity=1)
    return major_part is not None


def main():
    """Fail when a recently active project lacks a precise FieldOfScienceID.

    Fetches the projects active during the last 365 days, loads each
    project's YAML file from ``../../../projects/`` (relative to the current
    working directory) and collects every project whose ``FieldOfScienceID``
    is missing or rejected by ``has_detailed_precision``.

    Raises:
        Exception: if at least one such project was found; the individual
            findings are written to stderr first.
    """
    one_year_ago = datetime.datetime.now() - datetime.timedelta(days=365)
    active_project_names = get_active_projects(one_year_ago)

    print(active_project_names)

    exceptions = []
    for project_name in active_project_names:
        try:
            with open(f"../../../projects/{project_name}.yaml") as project_file:
                # NOTE(review): yaml.Loader can construct arbitrary Python
                # objects; if these files can come from untrusted
                # contributions, yaml.safe_load would be the safer choice.
                project_data = yaml.load(project_file, Loader=yaml.Loader)
        except FileNotFoundError:
            # Missing project files are skipped, as before - presumably
            # reported names without a YAML file here; TODO confirm.
            continue

        # An empty YAML document loads as None; treat it as having no fields.
        project_data = project_data or {}

        if "FieldOfScienceID" not in project_data or not has_detailed_precision(project_data["FieldOfScienceID"]):
            exceptions.append(f"Project {project_name} is running in the OSPool without detailed precision.")

    if exceptions:
        # file= is required: a positional sys.stderr would just be printed
        # to stdout as a second value.
        print("\n".join(exceptions), file=sys.stderr)
        raise Exception("Projects without detailed precision need to be updated.")


# Script entry point: run the precision check only when executed directly.
if __name__ == "__main__":
    main()
12 changes: 12 additions & 0 deletions .github/scripts/check_project_fos_precision/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
certifi==2024.7.4
charset-normalizer==3.3.2
idna==3.7
numpy==1.26.4
pandas==2.2.2
python-dateutil==2.9.0.post0
pytz==2024.1
PyYAML==6.0.1
requests==2.32.0
six==1.16.0
tzdata==2024.1
urllib3==2.2.2
14 changes: 7 additions & 7 deletions .github/workflows/build-client-container.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
steps:
- name: make date tag
id: mkdatetag
run: echo "::set-output name=dtag::$(date +%Y%m%d-%H%M)"
run: echo "dtag=$(date +%Y%m%d-%H%M)" >> $GITHUB_OUTPUT

build:
runs-on: ubuntu-latest
Expand All @@ -29,7 +29,7 @@ jobs:
fail-fast: False

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3

- name: Generate tag list
id: generate-tag-list
Expand All @@ -43,26 +43,26 @@ jobs:
tag_list+=($registry/$docker_repo:release-$TIMESTAMP)
done
IFS=,
echo "::set-output name=taglist::${tag_list[*]}"
echo "taglist=${tag_list[*]}" >> $GITHUB_OUTPUT
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
uses: docker/setup-buildx-action@v2.7.0

- name: Log in to Docker Hub
uses: docker/login-action@v1
uses: docker/login-action@v2.2.0
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

- name: Log in to OSG Harbor
uses: docker/login-action@v1
uses: docker/login-action@v2.2.0
with:
registry: hub.opensciencegrid.org
username: ${{ secrets.OSG_HARBOR_ROBOT_USER }}
password: ${{ secrets.OSG_HARBOR_ROBOT_PASSWORD }}

- name: Build and push Client Docker images
uses: docker/build-push-action@v2.2.0
uses: docker/build-push-action@v4
with:
push: true
tags: "${{ steps.generate-tag-list.outputs.taglist }}"
Expand Down
12 changes: 6 additions & 6 deletions .github/workflows/build-sw-container.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-latest
if: startsWith(github.repository, 'opensciencegrid/')
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3

- name: Generate tag list
id: generate-tag-list
Expand All @@ -28,26 +28,26 @@ jobs:
# This causes the tag_list array to be comma-separated below,
# which is required for build-push-action
IFS=,
echo "::set-output name=taglist::${tag_list[*]}"
echo "taglist=${tag_list[*]}" >> $GITHUB_OUTPUT
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
uses: docker/setup-buildx-action@v2.7.0

- name: Log in to Docker Hub
uses: docker/login-action@v1
uses: docker/login-action@v2.2.0
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

- name: Log in to OSG Harbor
uses: docker/login-action@v1
uses: docker/login-action@v2.2.0
with:
registry: hub.opensciencegrid.org
username: ${{ secrets.OSG_HARBOR_ROBOT_USER }}
password: ${{ secrets.OSG_HARBOR_ROBOT_PASSWORD }}

- name: Build and push Docker images
uses: docker/build-push-action@v2.2.0
uses: docker/build-push-action@v4
with:
context: .
push: true
Expand Down
Loading

0 comments on commit 724815e

Please sign in to comment.