create package in training, add tests to CI
Terézia Slanináková committed Oct 1, 2024
1 parent 47de63b commit e0c7589
Showing 22 changed files with 495 additions and 311 deletions.
12 changes: 4 additions & 8 deletions .github/workflows/ci.yml
@@ -37,20 +37,16 @@ jobs:
python -m pip install --upgrade pip
pip install -r training/requirements.txt
pip install pytest pytest-cov
cd training && pip install -e .
- name: Run tests with pytest and generate coverage
run: |
cd training
pytest --cov=. --cov-report=xml --cov-report=term-missing
cd /home/runner/work/AlphaFind/AlphaFind/training
export PYTHONPATH=$PYTHONPATH:$(pwd)
pytest -v --cov=. --cov-report=xml --cov-report=term-missing
- name: Upload coverage report
uses: actions/upload-artifact@v4
with:
name: coverage-report
path: training/coverage.xml

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
with:
file: ./training/coverage.xml
fail_ci_if_error: true
1 change: 1 addition & 0 deletions training/.gitignore
@@ -13,3 +13,4 @@ data/kmeans.idx
models/
.coverage
coverage.xml
*.egg-info
2 changes: 2 additions & 0 deletions training/Dockerfile
@@ -17,5 +17,7 @@ RUN pip install -r /var/requirements.txt && rm -rf ~/.cache

COPY . /training
WORKDIR /training
RUN pip install -e .
RUN chmod +x /training/run.sh

CMD [ "/bin/bash", "/training/run.sh" ]
Empty file.
63 changes: 63 additions & 0 deletions training/alphafind_training/cluster.py
@@ -0,0 +1,63 @@
import logging

import numpy as np
import torch
from alphafind_training.clustering import run_clustering
from alphafind_training.utils import dir_exists, file_exists, load_dataset, load_pickle

LOG = logging.getLogger(__name__)

torch.manual_seed(2023)
np.random.seed(2023)


def create_kmeans(input_path, output_path, n_clusters=2, sample_size=108, n_iterations=10):
"""
Function for clustering the embeddings using K-Means.
Args:
input_path (str): Path to the embeddings pickle file or directory of pickle files
output_path (str): Path to the output K-Means file
n_clusters (int): Number of clusters (default: 2)
sample_size (int): Size of the sample (default: 108)
n_iterations (int): Number of k-means iterations (default: 10)
Returns:
None
"""
assert file_exists(input_path) or dir_exists(input_path), 'Input file or directory does not exist'

LOG.info('Loading embeddings')
if dir_exists(input_path) and not file_exists(input_path):
embeddings, _ = load_dataset(input_path, sample_size, shuffle=True)
else:
embeddings = load_pickle(input_path)

assert embeddings.shape[0] >= sample_size, 'Sample size must not exceed the number of embeddings'

LOG.info(f'Loaded embeddings of shape: {embeddings.shape}')
LOG.info(f'Running clustering, result k-means object will be saved to: {output_path}')

run_clustering(
output_path,
embeddings.values,
sample_size,
n_clusters,
n_iterations,
)


if __name__ == '__main__':
import argparse

parser = argparse.ArgumentParser(description="Cluster embeddings using K-Means")
parser.add_argument(
'--input', type=str, required=True, help='Path to the embeddings pickle file or directory of pickle files'
)
parser.add_argument('--output', type=str, required=True, help='Path to the output K-Means file')
parser.add_argument('--n-clusters', type=int, default=2, help='Number of clusters')
parser.add_argument('--sample-size', type=int, default=108, help='Size of the sample')
parser.add_argument('--n-iterations', type=int, default=10, help='Number of k-means iterations')
args = parser.parse_args()

create_kmeans(args.input, args.output, args.n_clusters, args.sample_size, args.n_iterations)
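
Since cluster.py is now part of the alphafind_training package, the clustering step can also be driven from Python rather than only through the CLI above. A minimal usage sketch, assuming an editable install of the package and placeholder data paths:

# Minimal usage sketch; paths are placeholders and the values mirror the CLI defaults above.
from alphafind_training.cluster import create_kmeans

create_kmeans(
    input_path='./data/embedding.pkl',  # embeddings pickle file or directory of pickle files
    output_path='./data/kmeans.idx',    # where the trained K-Means object is saved
    n_clusters=2,
    sample_size=108,
    n_iterations=10,
)
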
@@ -2,8 +2,7 @@

import faiss
import numpy as np

from utils import measure_memory_usage, measure_runtime
from alphafind_training.utils import measure_memory_usage, measure_runtime

np.random.seed(2023)

@@ -7,10 +7,8 @@
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

from model import LIDatasetPredict, load_model
from utils import (
from alphafind_training.model import LIDatasetPredict, load_model
from alphafind_training.utils import (
create_dir,
dir_exists,
file_exists,
@@ -20,6 +18,7 @@
save_pickle,
save_predictions,
)
from tqdm import tqdm

torch.manual_seed(2023)
np.random.seed(2023)
@@ -42,16 +41,23 @@ def load_all_embeddings(path):
def parse_model_params(model_path):
LOG.info(f'Parsing out model params from model path: {model_path}')
pattern = r'model-(\w+)--.*?n_classes-(\d+)(?:--.*?dimensionality-(\d+))?'

if model_path is None:
model = 'MLP'
dimensionality = DEFAULT_DIMENSIONALITY
n_classes = 2
LOG.info(f'Parsed out model={model}, dimensionality={dimensionality}, n_classes={n_classes}')
return model, dimensionality, n_classes

match = re.search(pattern, model_path, re.MULTILINE)
# new model format
if match and len(match.groups()) == 3:
model = match.group(1)
n_classes = int(match.group(2))
dimensionality = match.group(3)
model, n_classes, dimensionality = match.groups()
dimensionality = int(dimensionality) if dimensionality is not None else DEFAULT_DIMENSIONALITY
n_classes = int(n_classes)
else:
LOG.info(f'Failed to parse out model params from model path: {model_path}')
exit(1)

LOG.info(f'Parsed out model={model}, dimensionality={dimensionality}, n_classes={n_classes}')
return model, dimensionality, n_classes

@@ -108,6 +114,64 @@ def assign_proteins_to_buckets(config):
LOG.info(f'Saved predictions per class in `{config.output_predictions}`')


def create_buckets(
output_chunks, output_predictions, input_path, model_dir_path, output_bucket_path, chunk_size=1000000
):
"""
Create buckets for protein IDs based on model predictions.
Args:
output_chunks (str): Path to a folder where temporary (per class + per slice) predictions will be saved.
output_predictions (str): Path to a folder where the per bucket objects will be saved.
input_path (str): Path to the dataset.
model_dir_path (str): Path to the model.
output_bucket_path (str): Path to output bucket data.
chunk_size (int): Chunk size for processing data.
Returns:
None
"""
assert output_chunks is not None
assert output_predictions is not None

LOG.info('Saving predictions per chunk and class')

# the dir can be models/<dirs> or <specific-model-dir>/checkpoint.pt
files = listdir(model_dir_path)

if not any(f.endswith('.pt') for f in files):
model_dir_path = load_newest_file_in_dir(model_dir_path)

args = argparse.Namespace(
output_chunks=output_chunks,
output_predictions=output_predictions,
input=input_path,
model_dir_path=model_dir_path,
output_bucket_path=output_bucket_path,
chunk_size=chunk_size,
)

assign_proteins_to_buckets(args)

LOG.info('Loading all data')
df = load_all_embeddings(input_path)

create_dir(output_bucket_path)

LOG.info(f'Saving predictions per bucket in `{output_bucket_path}`')
for f in tqdm(listdir(output_predictions)):
data_subset = df[df.index.isin(load_pickle(f'{output_predictions}/{f}'))]
save_pickle(f'{output_bucket_path}/{f}', data_subset)

LOG.info(f'Saved predictions per bucket in `{output_bucket_path}`')

LOG.info(f'Removing temporary files in `{output_chunks}`, `{output_predictions}`')
remove_dir(output_chunks)
remove_dir(output_predictions)

LOG.info('Done')


'''
The script loads a model and assigns protein IDs to buckets based on the model's predictions.
@@ -128,73 +192,34 @@ def assign_proteins_to_buckets(config):
--model-dir-path "./data/models/"
'''
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser = argparse.ArgumentParser(description="Create buckets for protein IDs based on model predictions")
parser.add_argument(
'--output-chunks',
type=str,
default=('./data/chunks'),
help=(
'Path to a folder where temporary (per class + per slice) '
'predictions will be saved (without the / at the end)'
),
default='./data/chunks',
help='Path to a folder where temporary (per class + per slice) predictions will be saved (without the / at the end)',
)
parser.add_argument(
'--output-predictions',
type=str,
default=('./data/overall'),
default='./data/overall',
help='Path to a folder where the per bucket objects will be saved (without the / at the end)',
)
parser.add_argument('--input', type=str, default='./data/embeddings', help='Path to the dataset')
parser.add_argument('--model-dir-path', type=str, default='./data/models/', help='Path to the model')
parser.add_argument(
'--input',
type=str,
default='./data/embeddings',
help='Path to the dataset',
)
parser.add_argument(
'--model-dir-path',
type=str,
default=('./data/models/'),
help='Path to the model',
)
parser.add_argument(
'--output-bucket-path',
type=str,
default='./data/bucket-data/',
help='path to output bucket data',
'--output-bucket-path', type=str, default='./data/bucket-data/', help='path to output bucket data'
)
parser.add_argument('--chunk-size', type=int, default=1000000, help='Chunk size')

args = parser.parse_args()

assert args.output_chunks is not None
assert args.output_predictions is not None

logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s')

LOG.info('Saving predictions per chunk and class')

# the dir can be models/<dirs> or <specific-model-dir>/checkpoint.pt
files = listdir(args.model_dir_path)

if not any([f.endswith('.pt') for f in listdir(args.model_dir_path)]):
args.model_dir_path = load_newest_file_in_dir(args.model_dir_path)

assign_proteins_to_buckets(args)

LOG.info('Loading all data')
df = load_all_embeddings(args.input)

create_dir(args.output_bucket_path)

LOG.info(f'Saving predictions per bucket in `{args.output_bucket_path}`')
for f in tqdm(listdir(args.output_predictions)):
data_subset = df[df.index.isin(load_pickle(f'{args.output_predictions}/{f}'))]
save_pickle(f'{args.output_bucket_path}/{f}', data_subset)

LOG.info(f'Saved predictions per bucket in `{args.output_bucket_path}`')

LOG.info(f'Removing temporary files in `{args.output_chunks}`, `{args.output_predictions}`')
remove_dir(args.output_chunks)
remove_dir(args.output_predictions)

LOG.info('Done')
create_buckets(
args.output_chunks,
args.output_predictions,
args.input,
args.model_dir_path,
args.output_bucket_path,
args.chunk_size,
)
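
The same refactoring pattern applies here: the logic formerly in the __main__ block now lives in a create_buckets function that can be imported and called directly, while the script entry point only parses arguments and delegates. A minimal usage sketch with the argparse defaults shown above; the module path in the import is an assumption, since the file name is not shown in this view.

# Minimal usage sketch; the module name is assumed, arguments mirror the argparse defaults above.
from alphafind_training.create_buckets import create_buckets

create_buckets(
    output_chunks='./data/chunks',             # temporary per-class/per-slice predictions
    output_predictions='./data/overall',       # temporary per-bucket ID lists
    input_path='./data/embeddings',            # embeddings dataset
    model_dir_path='./data/models/',           # trained model directory or checkpoint directory
    output_bucket_path='./data/bucket-data/',  # final per-bucket pickles
    chunk_size=1000000,
)
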
@@ -19,24 +19,30 @@
DST_THRESHOLD = 20.0


def run(cif_path, output_path, granularity):
def create_embedding(input_path, output_path, granularity):
"""Calculate all protein descriptors
Args:
cif_path (str): path to CIF
output_path (str): output file
input_path (str or Path): path to CIF directory
output_path (str or Path): output file path
granularity (int): granularity of the descriptors
Returns:
None
"""
proteins = os.listdir(cif_path)
proteins = [file for file in proteins if file.endswith(".cif")]
input_path = Path(input_path)
output_path = Path(output_path)

proteins = [file for file in os.listdir(input_path) if file.endswith(".cif")]
LOG.info(f'Found {len(proteins)} proteins to create the embedding for')

with Pool() as pool:
results = []
data = []
index = []

for protein in proteins:
result = pool.apply_async(process_protein, (cif_path / protein, granularity))
result = pool.apply_async(process_protein, (input_path / protein, granularity))
results.append(result)

LOG.info("Processing started")
@@ -46,7 +52,7 @@ def run(cif_path, output_path, granularity):
]
index = [n for sublist in [result.get()['index'] for result in results] for n in sublist]
df = pd.DataFrame(index=index, data=data)
df.to_pickle(Path(output_path))
df.to_pickle(output_path)
t = time() - t
LOG.info(f'Processing took {t:.1f} seconds')
LOG.info(f'Output saved to {output_path}')
@@ -194,17 +200,17 @@ def remap(n, min_, max_):
python3 create-embedding.py --input=./data/cifs --output=./data/embedding.pkl --granularity 10
"""
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser = argparse.ArgumentParser(description="Create protein descriptors from CIF files")
parser.add_argument("--input", type=str, required=True, help="Path to the directory containing CIF files")
parser.add_argument("--output", type=str, required=True, help="Path to the output file")
parser.add_argument(
"--granularity", type=int, required=False, default=10, help="How detailed should the descriptor be"
)
parser.add_argument("--granularity", type=int, default=10, help="How detailed should the descriptor be")

args = parser.parse_args()

logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s')

input_path = Path(args.input)
output_path = Path(args.output)
assert input_path.exists()
assert input_path.exists(), f"Input path {input_path} does not exist"

run(input_path, output_path, args.granularity)
create_embedding(input_path, output_path, args.granularity)
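
As in the other scripts, the descriptor computation is now exposed as a create_embedding function for programmatic use. A minimal usage sketch using the paths from the CLI example above; the module path in the import is an assumption, since the file name is not shown in this view. The function accepts str or Path arguments, as noted in its docstring.

# Minimal usage sketch; the module name is assumed, paths match the CLI example above.
from alphafind_training.create_embedding import create_embedding

create_embedding(
    input_path='./data/cifs',            # directory containing .cif files
    output_path='./data/embedding.pkl',  # output pickle with the protein descriptors
    granularity=10,                      # level of detail of the descriptors
)
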