create package in training, add tests to CI
Terézia Slanináková committed Oct 1, 2024
1 parent 47de63b commit e0c7589
Showing 22 changed files with 495 additions and 311 deletions.
12 changes: 4 additions & 8 deletions .github/workflows/ci.yml
@@ -37,20 +37,16 @@ jobs:
python -m pip install --upgrade pip
pip install -r training/requirements.txt
pip install pytest pytest-cov
cd training && pip install -e .
- name: Run tests with pytest and generate coverage
run: |
cd training
pytest --cov=. --cov-report=xml --cov-report=term-missing
cd /home/runner/work/AlphaFind/AlphaFind/training
export PYTHONPATH=$PYTHONPATH:$(pwd)
pytest -v --cov=. --cov-report=xml --cov-report=term-missing
- name: Upload coverage report
uses: actions/upload-artifact@v4
with:
name: coverage-report
path: training/coverage.xml

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
with:
file: ./training/coverage.xml
fail_ci_if_error: true
1 change: 1 addition & 0 deletions training/.gitignore
@@ -13,3 +13,4 @@ data/kmeans.idx
models/
.coverage
coverage.xml
*.egg-info
2 changes: 2 additions & 0 deletions training/Dockerfile
@@ -17,5 +17,7 @@ RUN pip install -r /var/requirements.txt && rm -rf ~/.cache

COPY . /training
WORKDIR /training
RUN pip install -e .
RUN chmod +x /training/run.sh

CMD [ "/bin/bash", "/training/run.sh" ]
Empty file.
63 changes: 63 additions & 0 deletions training/alphafind_training/cluster.py
@@ -0,0 +1,63 @@
import logging

import numpy as np
import torch
from alphafind_training.clustering import run_clustering
from alphafind_training.utils import dir_exists, file_exists, load_dataset, load_pickle

LOG = logging.getLogger(__name__)

torch.manual_seed(2023)
np.random.seed(2023)


def create_kmeans(input_path, output_path, n_clusters=2, sample_size=108, n_iterations=10):
"""
Function for clustering the embeddings using K-Means.
Args:
input_path (str): Path to the embeddings pickle file or directory of pickle files
output_path (str): Path to the output K-Means file
n_clusters (int): Number of clusters (default: 2)
sample_size (int): Size of the sample (default: 108)
n_iterations (int): Number of k-means iterations (default: 10)
Returns:
None
"""
assert file_exists(input_path) or dir_exists(input_path), 'Input file or directory does not exist'

LOG.info('Loading embeddings')
if dir_exists(input_path) and not file_exists(input_path):
embeddings, _ = load_dataset(input_path, sample_size, shuffle=True)
else:
embeddings = load_pickle(input_path)

assert embeddings.shape[0] >= sample_size, 'Sample size must not exceed the number of embeddings'

LOG.info(f'Loaded embeddings of shape: {embeddings.shape}')
LOG.info(f'Running clustering, result k-means object will be saved to: {output_path}')

run_clustering(
output_path,
embeddings.values,
sample_size,
n_clusters,
n_iterations,
)


if __name__ == '__main__':
import argparse

parser = argparse.ArgumentParser(description="Cluster embeddings using K-Means")
parser.add_argument(
'--input', type=str, required=True, help='Path to the embeddings pickle file or directory of pickle files'
)
parser.add_argument('--output', type=str, required=True, help='Path to the output K-Means file')
parser.add_argument('--n-clusters', type=int, default=2, help='Number of clusters')
parser.add_argument('--sample-size', type=int, default=108, help='Size of the sample')
parser.add_argument('--n-iterations', type=int, default=10, help='Number of k-means iterations')
args = parser.parse_args()

create_kmeans(args.input, args.output, args.n_clusters, args.sample_size, args.n_iterations)
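
Since cluster.py is now part of the alphafind_training package, the clustering step can also be driven from Python rather than only through the CLI above. A minimal usage sketch, assuming an editable install of the package and placeholder data paths:

# Minimal usage sketch; paths are placeholders and the values mirror the CLI defaults above.
from alphafind_training.cluster import create_kmeans

create_kmeans(
    input_path='./data/embedding.pkl',  # embeddings pickle file or directory of pickle files
    output_path='./data/kmeans.idx',    # where the trained K-Means object is saved
    n_clusters=2,
    sample_size=108,
    n_iterations=10,
)
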
@@ -2,8 +2,7 @@

import faiss
import numpy as np

from utils import measure_memory_usage, measure_runtime
from alphafind_training.utils import measure_memory_usage, measure_runtime

np.random.seed(2023)

@@ -7,10 +7,8 @@
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

from model import LIDatasetPredict, load_model
from utils import (
from alphafind_training.model import LIDatasetPredict, load_model
from alphafind_training.utils import (
create_dir,
dir_exists,
file_exists,
@@ -20,6 +18,7 @@
save_pickle,
save_predictions,
)
from tqdm import tqdm

torch.manual_seed(2023)
np.random.seed(2023)
@@ -42,16 +41,23 @@ def load_all_embeddings(path):
def parse_model_params(model_path):
LOG.info(f'Parsing out model params from model path: {model_path}')
pattern = r'model-(\w+)--.*?n_classes-(\d+)(?:--.*?dimensionality-(\d+))?'

if model_path is None:
model = 'MLP'
dimensionality = DEFAULT_DIMENSIONALITY
n_classes = 2
LOG.info(f'Parsed out model={model}, dimensionality={dimensionality}, n_classes={n_classes}')
return model, dimensionality, n_classes

match = re.search(pattern, model_path, re.MULTILINE)
# new model format
if match and len(match.groups()) == 3:
model = match.group(1)
n_classes = int(match.group(2))
dimensionality = match.group(3)
model, n_classes, dimensionality = match.groups()
dimensionality = int(dimensionality) if dimensionality is not None else DEFAULT_DIMENSIONALITY
n_classes = int(n_classes)
else:
LOG.info(f'Failed to parse out model params from model path: {model_path}')
exit(1)

LOG.info(f'Parsed out model={model}, dimensionality={dimensionality}, n_classes={n_classes}')
return model, dimensionality, n_classes

@@ -108,6 +114,64 @@ def assign_proteins_to_buckets(config):
LOG.info(f'Saved predictions per class in `{config.output_predictions}`')


def create_buckets(
output_chunks, output_predictions, input_path, model_dir_path, output_bucket_path, chunk_size=1000000
):
"""
Create buckets for protein IDs based on model predictions.
Args:
output_chunks (str): Path to a folder where temporary (per class + per slice) predictions will be saved.
output_predictions (str): Path to a folder where the per bucket objects will be saved.
input_path (str): Path to the dataset.
model_dir_path (str): Path to the model.
output_bucket_path (str): Path to output bucket data.
chunk_size (int): Chunk size for processing data.
Returns:
None
"""
assert output_chunks is not None
assert output_predictions is not None

LOG.info('Saving predictions per chunk and class')

# the dir can be models/<dirs> or <specific-model-dir>/checkpoint.pt
files = listdir(model_dir_path)

if not any(f.endswith('.pt') for f in files):
model_dir_path = load_newest_file_in_dir(model_dir_path)

args = argparse.Namespace(
output_chunks=output_chunks,
output_predictions=output_predictions,
input=input_path,
model_dir_path=model_dir_path,
output_bucket_path=output_bucket_path,
chunk_size=chunk_size,
)

assign_proteins_to_buckets(args)

LOG.info('Loading all data')
df = load_all_embeddings(input_path)

create_dir(output_bucket_path)

LOG.info(f'Saving predictions per bucket in `{output_bucket_path}`')
for f in tqdm(listdir(output_predictions)):
data_subset = df[df.index.isin(load_pickle(f'{output_predictions}/{f}'))]
save_pickle(f'{output_bucket_path}/{f}', data_subset)

LOG.info(f'Saved predictions per bucket in `{output_bucket_path}`')

LOG.info(f'Removing temporary files in `{output_chunks}`, `{output_predictions}`')
remove_dir(output_chunks)
remove_dir(output_predictions)

LOG.info('Done')


'''
The script loads a model and assigns protein IDs to buckets based on the model's predictions.
@@ -128,73 +192,34 @@ def assign_proteins_to_buckets(config):
--model-dir-path "./data/models/"
'''
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser = argparse.ArgumentParser(description="Create buckets for protein IDs based on model predictions")
parser.add_argument(
'--output-chunks',
type=str,
default=('./data/chunks'),
help=(
'Path to a folder where temporary (per class + per slice) '
'predictions will be saved (without the / at the end)'
),
default='./data/chunks',
help='Path to a folder where temporary (per class + per slice) predictions will be saved (without the / at the end)',
)
parser.add_argument(
'--output-predictions',
type=str,
default=('./data/overall'),
default='./data/overall',
help='Path to a folder where the per bucket objects will be saved (without the / at the end)',
)
parser.add_argument('--input', type=str, default='./data/embeddings', help='Path to the dataset')
parser.add_argument('--model-dir-path', type=str, default='./data/models/', help='Path to the model')
parser.add_argument(
'--input',
type=str,
default='./data/embeddings',
help='Path to the dataset',
)
parser.add_argument(
'--model-dir-path',
type=str,
default=('./data/models/'),
help='Path to the model',
)
parser.add_argument(
'--output-bucket-path',
type=str,
default='./data/bucket-data/',
help='path to output bucket data',
'--output-bucket-path', type=str, default='./data/bucket-data/', help='path to output bucket data'
)
parser.add_argument('--chunk-size', type=int, default=1000000, help='Chunk size')

args = parser.parse_args()

assert args.output_chunks is not None
assert args.output_predictions is not None

logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s')

LOG.info('Saving predictions per chunk and class')

# the dir can be models/<dirs> or <specific-model-dir>/checkpoint.pt
files = listdir(args.model_dir_path)

if not any([f.endswith('.pt') for f in listdir(args.model_dir_path)]):
args.model_dir_path = load_newest_file_in_dir(args.model_dir_path)

assign_proteins_to_buckets(args)

LOG.info('Loading all data')
df = load_all_embeddings(args.input)

create_dir(args.output_bucket_path)

LOG.info(f'Saving predictions per bucket in `{args.output_bucket_path}`')
for f in tqdm(listdir(args.output_predictions)):
data_subset = df[df.index.isin(load_pickle(f'{args.output_predictions}/{f}'))]
save_pickle(f'{args.output_bucket_path}/{f}', data_subset)

LOG.info(f'Saved predictions per bucket in `{args.output_bucket_path}`')

LOG.info(f'Removing temporary files in `{args.output_chunks}`, `{args.output_predictions}`')
remove_dir(args.output_chunks)
remove_dir(args.output_predictions)

LOG.info('Done')
create_buckets(
args.output_chunks,
args.output_predictions,
args.input,
args.model_dir_path,
args.output_bucket_path,
args.chunk_size,
)
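
The same refactoring pattern applies here: the logic formerly in the __main__ block now lives in a create_buckets function that can be imported and called directly, while the script entry point only parses arguments and delegates. A minimal usage sketch with the argparse defaults shown above; the module path in the import is an assumption, since the file name is not shown in this view.

# Minimal usage sketch; the module name is assumed, arguments mirror the argparse defaults above.
from alphafind_training.create_buckets import create_buckets

create_buckets(
    output_chunks='./data/chunks',             # temporary per-class/per-slice predictions
    output_predictions='./data/overall',       # temporary per-bucket ID lists
    input_path='./data/embeddings',            # embeddings dataset
    model_dir_path='./data/models/',           # trained model directory or checkpoint directory
    output_bucket_path='./data/bucket-data/',  # final per-bucket pickles
    chunk_size=1000000,
)
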
@@ -19,24 +19,30 @@
DST_THRESHOLD = 20.0


def run(cif_path, output_path, granularity):
def create_embedding(input_path, output_path, granularity):
"""Calculate all protein descriptors
Args:
cif_path (str): path to CIF
output_path (str): output file
input_path (str or Path): path to CIF directory
output_path (str or Path): output file path
granularity (int): granularity of the descriptors
Returns:
None
"""
proteins = os.listdir(cif_path)
proteins = [file for file in proteins if file.endswith(".cif")]
input_path = Path(input_path)
output_path = Path(output_path)

proteins = [file for file in os.listdir(input_path) if file.endswith(".cif")]
LOG.info(f'Found {len(proteins)} proteins to create the embedding for')

with Pool() as pool:
results = []
data = []
index = []

for protein in proteins:
result = pool.apply_async(process_protein, (cif_path / protein, granularity))
result = pool.apply_async(process_protein, (input_path / protein, granularity))
results.append(result)

LOG.info("Processing started")
@@ -46,7 +52,7 @@ def run(cif_path, output_path, granularity):
]
index = [n for sublist in [result.get()['index'] for result in results] for n in sublist]
df = pd.DataFrame(index=index, data=data)
df.to_pickle(Path(output_path))
df.to_pickle(output_path)
t = time() - t
LOG.info(f'Processing took {t:.1f} seconds')
LOG.info(f'Output saved to {output_path}')
@@ -194,17 +200,17 @@ def remap(n, min_, max_):
python3 create-embedding.py --input=./data/cifs --output=./data/embedding.pkl --granularity 10
"""
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser = argparse.ArgumentParser(description="Create protein descriptors from CIF files")
parser.add_argument("--input", type=str, required=True, help="Path to the directory containing CIF files")
parser.add_argument("--output", type=str, required=True, help="Path to the output file")
parser.add_argument(
"--granularity", type=int, required=False, default=10, help="How detailed should the descriptor be"
)
parser.add_argument("--granularity", type=int, default=10, help="How detailed should the descriptor be")

args = parser.parse_args()

logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s')

input_path = Path(args.input)
output_path = Path(args.output)
assert input_path.exists()
assert input_path.exists(), f"Input path {input_path} does not exist"

run(input_path, output_path, args.granularity)
create_embedding(input_path, output_path, args.granularity)
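
As in the other scripts, the descriptor computation is now exposed as a create_embedding function for programmatic use. A minimal usage sketch using the paths from the CLI example above; the module path in the import is an assumption, since the file name is not shown in this view. The function accepts str or Path arguments, as noted in its docstring.

# Minimal usage sketch; the module name is assumed, paths match the CLI example above.
from alphafind_training.create_embedding import create_embedding

create_embedding(
    input_path='./data/cifs',            # directory containing .cif files
    output_path='./data/embedding.pkl',  # output pickle with the protein descriptors
    granularity=10,                      # level of detail of the descriptors
)
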