Skip to content

Commit

Permalink
Merge pull request #1273 from Sage-Bionetworks/develop
Browse files Browse the repository at this point in the history
Schematic 23.8.1
  • Loading branch information
GiaJordan authored Aug 3, 2023
2 parents f382831 + 388584f commit d861295
Show file tree
Hide file tree
Showing 11 changed files with 232 additions and 79 deletions.
9 changes: 8 additions & 1 deletion .github/workflows/docker_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v2

- name: Set env variable for version tag
run: echo "RELEASE_VERSION=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV

- name: Log in to the Container registry
uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
with:
Expand All @@ -41,10 +44,14 @@ jobs:
type=ref,event=branch
type=ref,event=pr
type=semver,pattern={{raw}}
- name: Build and push Docker image
uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
with:
file: schematic_api/Dockerfile
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
labels: ${{ steps.meta.outputs.labels }}
build-args: |
TAG=${{ env.RELEASE_VERSION }}
4 changes: 2 additions & 2 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,8 @@ jobs:
# See also: https://api.slack.com/methods/chat.postMessage#channels
# You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
# ibc-fair-data channel and data-curator-schematic channel
channel-id: 'C01HSSMPQBG,C01ANC02U59'
channel-id: 'C050YD75QRL,C01ANC02U59'
# For posting a simple plain text message
slack-message: "Schematic has just been released. Check out new version: ${{ github.ref_name }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ This command will install the dependencies based on what we specify in poetry.lo
*Note*: If you won't interact with Synapse, please ignore this section.

There are two main configuration files that need to be edited:
[config.yml](https://github.com/Sage-Bionetworks/schematic/blob/develop/config.yml)
config.yml
and [synapseConfig](https://raw.githubusercontent.com/Sage-Bionetworks/synapsePythonClient/v2.3.0-rc/synapseclient/.synapseConfig)

<strong>Configure .synapseConfig File</strong>
Expand Down
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,9 @@ markers = [
Google credentials (skipped on GitHub CI) \
""",
"""\
not_windows: tests that don't work on windows machines
""",
"""\
schematic_api: marks tests covering \
API functionality (skipped on regular GitHub CI test suite)
""",
Expand All @@ -143,4 +146,4 @@ markers = [
rule_benchmark: marks tests covering \
validation rule benchmarking
"""
]
]
4 changes: 2 additions & 2 deletions schematic/manifest/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -1600,7 +1600,7 @@ def _get_end_columns(self, current_schema_headers, existing_manifest_headers, ou
end_columns.append(id_name)

# Add entity_id to the end columns if it should be there but isn't
if 'entityId' in (current_schema_headers or existing_manfiest_headers) and 'entityId' not in end_columns:
if 'entityId' in (current_schema_headers or existing_manifest_headers) and 'entityId' not in end_columns:
end_columns.append('entityId')
return end_columns

Expand All @@ -1621,7 +1621,7 @@ def _update_dataframe_with_existing_df(self, empty_manifest_url: str, existing_d

# Get headers for the current schema and existing manifest df.
current_schema_headers = list(self.get_dataframe_by_url(manifest_url=empty_manifest_url).columns)
existing_manfiest_headers = list(existing_df.columns)
existing_manifest_headers = list(existing_df.columns)

# Find columns that exist in the current schema, but are not in the manifest being downloaded.
new_columns = self._get_missing_columns(current_schema_headers, existing_manifest_headers)
Expand Down
53 changes: 25 additions & 28 deletions schematic/store/synapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import logging
import secrets
from dataclasses import dataclass
import tempfile
import shutil

# allows specifying explicit variable types
from typing import Dict, List, Tuple, Sequence, Union
Expand Down Expand Up @@ -43,17 +43,17 @@
from schematic_db.rdb.synapse_database import SynapseDatabase


from schematic.utils.df_utils import update_df, load_df, col_in_dataframe, populate_df_col_with_another_col
from schematic.utils.df_utils import update_df, load_df, col_in_dataframe
from schematic.utils.validate_utils import comma_separated_list_regex, rule_in_rule_list
from schematic.utils.general import entity_type_mapping, get_dir_size, convert_size, convert_gb_to_bytes, create_temp_folder
from schematic.utils.general import entity_type_mapping, get_dir_size, convert_gb_to_bytes, create_temp_folder, check_synapse_cache_size, clear_synapse_cache
from schematic.schemas.explorer import SchemaExplorer
from schematic.schemas.generator import SchemaGenerator
from schematic.store.base import BaseStorage
from schematic.exceptions import MissingConfigValueError, AccessCredentialsError

from schematic.configuration.configuration import CONFIG

from schematic.utils.general import profile
from schematic.utils.general import profile, calculate_datetime

logger = logging.getLogger("Synapse storage")

Expand All @@ -75,12 +75,16 @@ def _download_manifest_to_folder(self) -> File:
"""
if "SECRETS_MANAGER_SECRETS" in os.environ:
temporary_manifest_storage = "/var/tmp/temp_manifest_download"
# clear out all the existing manifests
if os.path.exists(temporary_manifest_storage):
shutil.rmtree(temporary_manifest_storage)
# create a new directory to store manifest
if not os.path.exists(temporary_manifest_storage):
os.mkdir("/var/tmp/temp_manifest_download")
os.mkdir(temporary_manifest_storage)
# create temporary folders for storing manifests
download_location = create_temp_folder(temporary_manifest_storage)
else:
download_location=CONFIG.manifest_folder

manifest_data = self.syn.get(
self.manifest_id,
downloadLocation=download_location,
Expand Down Expand Up @@ -177,41 +181,34 @@ def __init__(
Typical usage example:
syn_store = SynapseStorage()
"""

# TODO: turn root_synapse_cache to a parameter in init
self.syn = self.login(token, access_token)
self.project_scope = project_scope
self.storageFileview = CONFIG.synapse_master_fileview_id
self.manifest = CONFIG.synapse_manifest_basename
self.root_synapse_cache = "/root/.synapseCache"
self._query_fileview()

def _purge_synapse_cache(self, maximum_storage_allowed_cache_gb=1):
    """
    Purge the synapse cache if it exceeds a certain size. Default to 1 GB.

    Args:
        maximum_storage_allowed_cache_gb: the maximum storage (in GB) allowed
            before purging the cache. Default is 1 GB.
    """
    # Nothing to do if the cache directory does not exist
    # (e.g. when running outside the expected container environment).
    if os.path.exists(self.root_synapse_cache):
        maximum_storage_allowed_cache_bytes = convert_gb_to_bytes(maximum_storage_allowed_cache_gb)
        nbytes = get_dir_size(self.root_synapse_cache)
        dir_size_bytes = check_synapse_cache_size(directory=self.root_synapse_cache)
        # if the allowed storage has already been taken, purge cache entries
        # older than 15 minutes
        if dir_size_bytes >= maximum_storage_allowed_cache_bytes:
            num_of_deleted_files = clear_synapse_cache(self.syn.cache, minutes=15)
            logger.info(f'{num_of_deleted_files} files have been deleted from {self.root_synapse_cache}')
        else:
            # on AWS, OS takes around 14-17% of our ephemeral storage (20GiB)
            # instead of guessing how much space that we left, print out .synapseCache here
            logger.info(f'the total size of .synapseCache is: {nbytes} bytes')

def _query_fileview(self):
self._purge_synapse_cache()
Expand Down
89 changes: 69 additions & 20 deletions schematic/utils/general.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
# allows specifying explicit variable types
from typing import Any, Dict, Optional, Text
import os
import math
import logging
import math
import os
import pstats
import subprocess
import tempfile
from cProfile import Profile
from datetime import datetime, timedelta
from functools import wraps

import tempfile
from typing import Union

from synapseclient.core.exceptions import SynapseHTTPError
from synapseclient.table import EntityViewSchema
from synapseclient.entity import File, Folder, Project
from synapseclient.table import EntityViewSchema

import synapseclient.core.cache as cache

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -57,24 +60,69 @@ def get_dir_size(path: str):
total += get_dir_size(entry.path)
return total

def calculate_datetime(minutes: int, input_date: datetime, before_or_after: str = "before") -> datetime:
    """Shift a datetime by a given number of minutes.

    Args:
        minutes (int): number of minutes to shift by
        input_date (datetime): date time object provided by users
        before_or_after (str): "before" subtracts the minutes from input_date,
            "after" adds them. Defaults to "before".

    Returns:
        datetime: the shifted datetime

    Raises:
        ValueError: if before_or_after is neither "before" nor "after"
    """
    delta = timedelta(minutes=minutes)
    if before_or_after == "before":
        return input_date - delta
    if before_or_after == "after":
        return input_date + delta
    raise ValueError("Invalid value. Use either 'before' or 'after'.")


def check_synapse_cache_size(directory='/root/.synapseCache')-> Union[float, int]:
    """Use the `du -sh` command to calculate the size of the .synapseCache directory.

    Args:
        directory (str, optional): .synapseCache directory. Defaults to '/root/.synapseCache'

    Returns:
        float or int: size of the .synapseCache directory in bytes

    Raises:
        ValueError: if the size unit reported by `du` is not recognized
    """
    # Note: this command might fail on windows user. But since this command is primarily for running on AWS, it is fine.
    command = ['du', '-sh', directory]
    output = subprocess.run(command, capture_output=True).stdout.decode('utf-8')

    # `du -sh` prints "<size>\t<path>"; keep only the size token.
    size = output.split('\t')[0]

    # `du -h` reports sizes in powers of 1024 (K = 1024 bytes, M = 1024**2, ...),
    # so all suffixes must be converted with the same 1024 base.
    # Multi-byte suffixes are checked before the bare-"B" fallback.
    if "T" in size:
        byte_size = float(size.rstrip('T')) * 1024 ** 4
    elif "G" in size:
        byte_size = float(size.rstrip('G')) * 1024 ** 3
    elif "M" in size:
        byte_size = float(size.rstrip('M')) * 1024 ** 2
    elif "K" in size:
        byte_size = float(size.rstrip('K')) * 1024
    elif "B" in size:
        byte_size = float(size.rstrip('B'))
    elif size.replace('.', '', 1).isdigit():
        # GNU du prints a plain "0" for an empty directory.
        byte_size = float(size)
    else:
        # Previously this path logged an error and then raised an
        # UnboundLocalError on return; fail explicitly instead.
        raise ValueError(f'Cannot recognize the file size unit: {size}')
    return byte_size

def clear_synapse_cache(cache: cache.Cache, minutes: int) -> int:
    """Purge files from the synapse cache that were cached before a cutoff time.

    Args:
        cache: an object of synapseclient Cache.
        minutes (int): all files cached before this many minutes ago will be removed

    Returns:
        int: number of files that get deleted
    """
    cutoff = calculate_datetime(
        input_date=datetime.utcnow(), minutes=minutes, before_or_after="before"
    )
    return cache.purge(before_date=cutoff)

def convert_gb_to_bytes(gb: int):
    """Convert a size in gigabytes (GiB) to bytes.

    Args:
        gb (int): size in gigabytes

    Returns:
        int: equivalent size in bytes (gb * 1024**3)
    """
    bytes_per_gb = 1024 * 1024 * 1024
    return gb * bytes_per_gb


def entity_type_mapping(syn, entity_id):
"""
Return the entity type of manifest
Expand Down
10 changes: 8 additions & 2 deletions schematic_api/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
FROM tiangolo/uwsgi-nginx-flask:python3.10

# add version tag as a build argument
ARG TAG

# the environment variables defined here are the default
# and can be overwritten by docker run -e VARIABLE = XX
# or can be overwritten by .env when using docker compose
Expand All @@ -15,7 +18,8 @@ ENV PYTHONFAULTHANDLER=1 \
APP_DIR=/app/app \
ROOT=/ \
UWSGI_INI=/app/uwsgi.ini \
NGINX_WORKER_PROCESSES=1
NGINX_WORKER_PROCESSES=1 \
VERSION=$TAG

# Note:
# The starting number of uWSGI processes is controlled by the variable UWSGI_CHEAPER, by default set to 2.
Expand Down Expand Up @@ -73,7 +77,9 @@ WORKDIR ${APP_DIR}

# copy other files to app/app
# Note: run_api.py is not needed
COPY ./pyproject.toml ./poetry.lock ./config.yml ./main.py ./

COPY ./pyproject.toml ./poetry.lock ./main.py ./
COPY ./config_example.yml ./config.yml
RUN poetry config virtualenvs.create false
RUN poetry install --no-interaction --no-ansi --no-root --with aws

Expand Down
18 changes: 18 additions & 0 deletions schematic_api/api/openapi/api.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1262,4 +1262,22 @@ paths:
Family History,FamilyHistory,TBD,False,True,," If Diagnosis is ""Cancer"" then ""Family History"" is required",Patient
tags:
- Visualization Operations

/version:
get:
summary: Get the version of schematic currently being used
description: >-
Get the version of schematic that is currently deployed and being used
operationId: schematic_api.api.routes.get_schematic_version
responses:
"200":
description: Returns a JSON String containing the version of schematic.
content:
text/plain:
schema:
type: string
"500":
description: Schematic version was not able to be identified.
tags:
- Version

17 changes: 13 additions & 4 deletions schematic_api/api/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,10 +560,8 @@ def download_manifest(access_token, manifest_id, new_manifest_name='', as_json=T
# call config_handler()
config_handler()

# use Synapse Storage
store = SynapseStorage(access_token=access_token)
# try logging in to asset store
syn = store.login(access_token=access_token)
# use login method in synapse storage
syn = SynapseStorage.login(access_token=access_token)
try:
md = ManifestDownload(syn, manifest_id)
manifest_data = ManifestDownload.download_manifest(md, new_manifest_name)
Expand Down Expand Up @@ -817,3 +815,14 @@ def get_nodes_display_names(schema_url: str, node_list: list[str]) -> list:
node_display_names = gen.get_nodes_display_names(node_list, mm_graph)
return node_display_names

def get_schematic_version() -> str:
    """
    Return the current version of schematic.

    Returns:
        str: the deployed schematic version, read from the VERSION
            environment variable (set as a build arg in the docker image).

    Raises:
        NotImplementedError: if VERSION is not present in the environment,
            i.e. when the API is not running in a docker container.
    """
    version = os.environ.get("VERSION")
    if version is None:
        raise NotImplementedError(
            "Using this endpoint to check the version of schematic is only supported when the API is running in a docker container."
        )
    return version
Loading

0 comments on commit d861295

Please sign in to comment.