Graphium 3.0 #519

Draft · wants to merge 189 commits into base: main

Commits (189)
5ffe261
Added and integrated C++ graphium_cpp library, a Python module implem…
ndickson-nvidia Apr 13, 2024
8286383
Small changes to support not needing label data during data loading
ndickson-nvidia Apr 17, 2024
dca9b2b
Removed FakeDataset, FakeDataModule, and SingleTaskDataset. SingleTa…
ndickson-nvidia Apr 17, 2024
8304210
Removed option to featurize using Python, (but didn't delete everythi…
ndickson-nvidia Apr 17, 2024
4ee35d4
Removed newly deprecated options from yaml files
ndickson-nvidia Apr 18, 2024
cf23e37
Added support for limiting the number of threads used by prepare_and_…
ndickson-nvidia Apr 18, 2024
5db0e2a
Fixed compiler warning about signed vs. unsigned comparison
ndickson-nvidia Apr 18, 2024
c75a452
Fixed Python syntax issues
ndickson-nvidia Apr 18, 2024
4aa1f85
Changed asymmetric inverse normalization type to be implemented using…
ndickson-nvidia Apr 18, 2024
c53451a
Fixed compile errors
ndickson-nvidia Apr 18, 2024
268e245
Some simplification in collate.py
ndickson-nvidia Apr 19, 2024
e032e8e
Deleting most of the Python featurization code
ndickson-nvidia Apr 19, 2024
bdefe89
Implemented conformer generation in get_conformer_features, trying to…
ndickson-nvidia Apr 23, 2024
5298444
Deleted deprecated properties.py
ndickson-nvidia Apr 23, 2024
c38aa06
Handle case of no label data in prepare_and_save_data. Also added con…
ndickson-nvidia Apr 25, 2024
86abf21
Changed prepare_data to support having no label data
ndickson-nvidia Apr 25, 2024
bd59098
Removed ipu metrics, since not compatible with latest torchmetrics
DomInvivo Apr 26, 2024
734ba55
Updated `MetricWrapper` to work with `update` and `compute`, compatib…
DomInvivo Apr 26, 2024
b6c578f
Changed requirements for torchmetrics
DomInvivo Apr 26, 2024
4f6e816
fixed the loss by adding `MetricToTorchMetrics`, and added a few comm…
DomInvivo Apr 26, 2024
80276da
Updated license passed to setup call in setup.py
ndickson-nvidia May 2, 2024
7933ae5
Major updates to `predictor_summaries.py`
DomInvivo May 3, 2024
5849927
Improved the predictor summaries. Added GradientNormMetric
DomInvivo May 3, 2024
9492e62
Changes to get test_dataset.py and test_multitask_datamodule.py passing
ndickson-nvidia May 6, 2024
d94097c
Removed load_type option from test_training.py, because it's no longe…
ndickson-nvidia May 6, 2024
11e6935
Updated comment in setup.py about how to build graphium_cpp package
ndickson-nvidia May 14, 2024
ff93c2d
Rewrote test_featurizer.py. Fixed bug in mask_nans C++ function, and …
ndickson-nvidia May 14, 2024
a892068
Removed deprecation warnings and deprecated parameters from datamodul…
ndickson-nvidia May 23, 2024
38a5510
Recommended tweaks to extract_labels in multilevel_utils.py
ndickson-nvidia May 23, 2024
f7771b3
Fixed "else if"->"elif"
ndickson-nvidia May 23, 2024
4256839
Rewrote test_pe_nodepair.py to use graphium_cpp
ndickson-nvidia May 24, 2024
91c37a3
Rewrote test_pe_rw.py to use graphium_cpp. Comment update in test_pe_…
ndickson-nvidia May 24, 2024
f347a0d
Rewrote test_pe_spectral.py to use graphium_cpp
ndickson-nvidia May 24, 2024
26b5531
Removed tests/test_positional_encodings.py, because it's a duplicate …
ndickson-nvidia May 24, 2024
1ded38b
Fixed handling of disconnected components vs. single component for la…
ndickson-nvidia May 28, 2024
314d636
Fixed compile warnings in one_hot.cpp
ndickson-nvidia May 28, 2024
e49b4da
Rewrote test_positional_encoders.py, though it's still failing the te…
ndickson-nvidia May 28, 2024
f001464
Removed commented out lines from setup.py
ndickson-nvidia Jun 4, 2024
2782fbc
Ran linting on Python files
ndickson-nvidia Jun 4, 2024
77d27b5
Hopefully explicitly installing graphium_cpp fixes the automated test…
ndickson-nvidia Jun 5, 2024
cb1df19
Test fix
ndickson-nvidia Jun 5, 2024
f3f6a0d
Another test fix
ndickson-nvidia Jun 5, 2024
c5c0085
Another test fix
ndickson-nvidia Jun 5, 2024
6dd827f
Make sure RDKit can find Boost headers
ndickson-nvidia Jun 5, 2024
59c84a2
Reimplemented test_pos_transfer_funcs.py to test all supported conver…
ndickson-nvidia Jun 12, 2024
7bc8ade
Linting fixes
ndickson-nvidia Jun 12, 2024
6903243
Fixed collections.abs.Callable to typing.Callable for type hint
ndickson-nvidia Jun 12, 2024
f355eed
Improved the task summaries and started to fix the training logging.
DomInvivo Jun 13, 2024
9f38afb
Removed file_opener and its test
ndickson-nvidia Jun 17, 2024
5ab9ca9
Fixed the issue with boolean masking, introduced by `F._canonical_mas…
DomInvivo Jul 9, 2024
9c7504f
Fixed the float vs double issue in laplacian pos encoding
DomInvivo Jul 9, 2024
f8358f3
Added comment
DomInvivo Jul 9, 2024
692decc
Fixed the ipu tests by making sure that `IPUStrategy` is not imported…
DomInvivo Jul 9, 2024
8891e66
Update test.yml to only test python 3.10
DomInvivo Jul 9, 2024
c2d3c87
Removed positional encodings from the docs
DomInvivo Jul 9, 2024
d3d19d7
Merge remote-tracking branch 'origin/dom_unittest' into dom_unittest
DomInvivo Jul 9, 2024
0a1696f
Upgraded python versions in the tests
DomInvivo Jul 9, 2024
50265df
Removed reference to old files now in C++
DomInvivo Jul 9, 2024
58fc2aa
Downgraded python version
DomInvivo Jul 9, 2024
5852467
Fixed other docs broken references
DomInvivo Jul 9, 2024
ea9a775
Merge pull request #1 from ndickson-nvidia/dom_unittest
ndickson-nvidia Jul 9, 2024
7f933b7
Merge pull request #510 from ndickson-nvidia/graphium_cpp
DomInvivo Jul 9, 2024
4372ace
Fixed test_metrics. Moved lots of `spaces.py` imports to inner functi…
DomInvivo Jul 10, 2024
7b89998
duplicated some unit-test fixes from graphium_3.0 branch
DomInvivo Jul 11, 2024
ab88952
Fixed the loading of a previous dummy model using older metrics by re…
DomInvivo Jul 11, 2024
6c58733
Minor documentation
DomInvivo Jul 11, 2024
a9a8810
Removed the loss from `predictor_summaries`
DomInvivo Jul 11, 2024
2185697
Removed epochs from task summaries
DomInvivo Jul 11, 2024
d37d818
Draft implementing the update/compute logic in the predictor.
DomInvivo Jul 11, 2024
b4524f9
Fix the std metric. Still needs testing.
DomInvivo Jul 11, 2024
5040c47
fixed all errors arising in `test_finetuning.py`
DomInvivo Jul 11, 2024
e761e08
Fixed the `test_training.py` unit test
DomInvivo Jul 12, 2024
5d60fbf
Standardized the test names
DomInvivo Jul 12, 2024
b59428a
Fixed some unit-tests that were broken by previous changes
DomInvivo Jul 12, 2024
632d4dc
Added `pytdc` to the tests
DomInvivo Jul 12, 2024
0fa2d86
Changed mamba install tdc to pip install, in the `test.yml` file
DomInvivo Jul 12, 2024
2441f43
Added '--no-deps' to TDC installation in `test.yml`
DomInvivo Jul 12, 2024
326b6e7
Woops
DomInvivo Jul 12, 2024
641fa37
Fixed issue with building docs
DomInvivo Jul 12, 2024
2b85dce
Removed old file from breaking docs building
DomInvivo Jul 12, 2024
0c93a0f
Changed to micromamba to install pytdc
DomInvivo Jul 12, 2024
ec235fc
Added tests for the `STDMetric` and `GradientNormMetric` and fixed th…
DomInvivo Jul 12, 2024
38d03e1
Implemented test of MultiTaskSummaries. Only an error left for the me…
DomInvivo Jul 12, 2024
d6f62a4
Fixed the `preds` and `targets` that were inverted in `TaskSummary`
DomInvivo Jul 13, 2024
3673884
Tried to add grad_norm to the metrics, but won't work because it's no…
DomInvivo Jul 13, 2024
29598a2
Moved the gradient metric directly to the `Predictor`
DomInvivo Jul 13, 2024
6260fa1
Removed file_opener and read_file
DomInvivo Jul 13, 2024
10a1017
Fixed predictor grad_norm
DomInvivo Jul 13, 2024
8aa0f2b
Merge branch 'graphium_3.0' into torchmetrics
DomInvivo Jul 13, 2024
90c0ca4
Fixed the progress bar logging to newest version. Fixed minor issues …
DomInvivo Jul 15, 2024
be99d94
Merge remote-tracking branch 'origin/torchmetrics' into torchmetrics
DomInvivo Jul 15, 2024
44b66b5
fixed some issue with older version of torchmetrics
DomInvivo Jul 15, 2024
5c421a6
Fixed reversed preds/targets. Fixed random sampling to take in the DF…
DomInvivo Jul 16, 2024
f15cd9a
fixed missing metrics computation on `on_train_batch_end`
DomInvivo Jul 16, 2024
2142313
Added toymix training to the unit-tests. Also useful to run in debug …
DomInvivo Jul 16, 2024
99e0cd6
Adding `_global/` to some metrics logging into wandb
DomInvivo Jul 16, 2024
045ea53
Added better handling of metrics failure with `logger.warn`
DomInvivo Jul 16, 2024
d8ba606
Fixed metric issues on gpu by casting to the right device prior to `.…
DomInvivo Jul 16, 2024
1bf2734
Added losses to the metrics, such that they are computed on val and t…
DomInvivo Jul 17, 2024
68b9361
Restricting the numpy version due to issues with wandb
DomInvivo Jul 17, 2024
911dfe9
detaching preds
DomInvivo Jul 17, 2024
d34ac60
Removed cuda version restriction
DomInvivo Jul 17, 2024
b1f2e86
Removed unnecessary detach, that broke the loss
DomInvivo Jul 17, 2024
62b385a
Updating dep versions for bh2 install
Andrewq11 Jul 30, 2024
41490a0
Merge pull request #522 from datamol-io/package/bh2-install
DomInvivo Jul 31, 2024
7f9112a
Fix lightning backend issue; add predict_step for inference
WenkelF Aug 8, 2024
47b7d1c
Fixing device issue in metrics calculation
WenkelF Aug 9, 2024
9dbd021
Minor gitignore
DomInvivo Aug 15, 2024
5a77cbe
Fixed the error due to time metrics on CPU `No backend type associate…
DomInvivo Aug 16, 2024
7fba29d
Added val epoch time
DomInvivo Aug 16, 2024
b59dc36
Added logic to avoid crashing when resetting unused metrics
DomInvivo Aug 17, 2024
da3e3a1
Added `MetricWrapper.device`
DomInvivo Aug 19, 2024
8bf0d41
Disable caching model checkpoint through WandbLogger
Aug 19, 2024
1ec4969
Disabled caching model checkpoint through WandbLogger
AnujaSomthankar Aug 19, 2024
9ba5a16
Drafting unit test for node ordering
WenkelF Aug 19, 2024
6f35ea9
Improved the testing of the metrics reset, update, compute
DomInvivo Aug 21, 2024
d2f84f2
Reverted wrong change in `train_finetune_test.py
DomInvivo Aug 21, 2024
e9be441
Improved __len__ in MultitaskDataModule
DomInvivo Aug 21, 2024
eaf9077
Added a new logic to allow saving all preds and targets more efficien…
DomInvivo Aug 22, 2024
5432531
Fixed the concatenation to work with and without DDP. Moved to CPU fo…
DomInvivo Aug 22, 2024
8c75d77
Fixed the issue with memory leaks and devices.
DomInvivo Aug 22, 2024
5abd769
Fixed the CPU syncing of `MetricToConcatenatedTorchMetrics` and GPU f…
DomInvivo Aug 22, 2024
fac3052
Fixed the training metrics, and grouped all epoch-time and tput metrics
DomInvivo Aug 22, 2024
6603014
Fixing unit tests
WenkelF Aug 22, 2024
d0ed816
Fixed epoch_time tracking (because train ends after val)
DomInvivo Aug 22, 2024
9b7063f
Using the `torchmetrics.Metric.sync` instead of torch_distributed
DomInvivo Aug 23, 2024
136b8b0
Fixed issue that NaNs are always removed with `mean-per-label`
DomInvivo Aug 29, 2024
2724b4c
Changed the name of logging variables
DomInvivo Aug 29, 2024
141f48b
Removed some IPU logic
DomInvivo Aug 29, 2024
62f2224
Fixed the syncing of `MetricToConcatenatedTorchMetrics`
DomInvivo Aug 29, 2024
2b58fed
Fixed classification metric calculation when multitask_handling=flatten
AnujaSomthankar Aug 29, 2024
c23dc02
Partial fix of node label ordering
WenkelF Sep 5, 2024
2fb7f4b
Fixed all unit-test, except those for IPU
DomInvivo Sep 7, 2024
607e71b
First pass at removing IPU
DomInvivo Sep 7, 2024
49e9984
More removal of ipu
DomInvivo Sep 7, 2024
d8786e9
More removal of ipus
DomInvivo Sep 7, 2024
6ed6bb8
More removal of ipus
DomInvivo Sep 7, 2024
495f3f6
Remove packing
DomInvivo Sep 7, 2024
90c7af2
Fixing most unit-tests
DomInvivo Sep 7, 2024
29229ff
Updated env file
DomInvivo Sep 7, 2024
019be26
Fixed the dummy model, toymix run, and most unit-tests
DomInvivo Sep 7, 2024
21a63a8
Minor
DomInvivo Sep 7, 2024
4bbc1f9
Fixed all remaining unit-tests - mostly the attention layers
DomInvivo Sep 7, 2024
be17a1f
minor changes to env
DomInvivo Sep 7, 2024
318694f
Added comments to env file
DomInvivo Sep 7, 2024
ce4f94d
Merge branch 'graphium_3.0' into torchmetrics
AnujaSomthankar Sep 10, 2024
f723632
Merge pull request #517 from datamol-io/torchmetrics
AnujaSomthankar Sep 10, 2024
c32be78
Merge branch 'torchmetrics' into remove_ipu
WenkelF Sep 11, 2024
5097e2a
Forcing `gcc_linux_64` in the env file
DomInvivo Sep 11, 2024
29044f5
Update test.yml
DomInvivo Sep 16, 2024
1771b78
Update test.yml
DomInvivo Sep 16, 2024
f87ee26
Update test.yml
DomInvivo Sep 16, 2024
24412f6
Changed persistent_workers=True to False
AnujaSomthankar Sep 19, 2024
1d7bfeb
Reorder atoms in node-level and nodepair-level label data, when the s…
ndickson-nvidia Jul 22, 2024
b7d9fe7
Merging of equivalent molecules is now optional, but still defaults t…
ndickson-nvidia Jul 24, 2024
5932fd8
Fixed bug with recent change in smiles_to_brief_data
ndickson-nvidia Jul 25, 2024
020b08a
Fix graphium_cpp.prepare_and_save_data call in test_dataset.py to inc…
ndickson-nvidia Jul 25, 2024
6cd5e26
In MultitaskFromSmilesDataModule.get_data_hash, include options used …
ndickson-nvidia Jul 25, 2024
d1cad44
Linter fixes in python files already modified in this branch
ndickson-nvidia Jul 25, 2024
508abbd
Split prepare_and_save_data into get_task_data, get_indices_and_strin…
ndickson-nvidia Aug 1, 2024
044cd47
Added support for reordering edge label data if there are multiple ta…
ndickson-nvidia Aug 1, 2024
50adb1b
Changed parse_mol in graphium_cpp.cpp to order based only on explicit…
ndickson-nvidia Aug 10, 2024
c870af4
The datasets use 0-based indexing for explicit ordering via atom clas…
ndickson-nvidia Aug 12, 2024
90f2403
Started adding doxygen comments to the C++ code. Also changed comput…
ndickson-nvidia Sep 10, 2024
bafdfe8
Adding unit test for node ordering
WenkelF Sep 5, 2024
7fd40e7
Added doxygen comments for functions and enums related to one-hot fea…
ndickson-nvidia Sep 11, 2024
06b12b2
Added more doxygen comments
ndickson-nvidia Sep 11, 2024
5d798a5
Added and updated more comments
ndickson-nvidia Sep 17, 2024
7261279
Added comments to each function in features.cpp
ndickson-nvidia Sep 23, 2024
b82f582
Investigating failing unit tests
WenkelF Sep 23, 2024
4a152b2
Added more comments to labels.cpp
ndickson-nvidia Sep 23, 2024
618bbb1
Merge branch 'atom_order' of ssh://github.com/ndickson-nvidia/graphiu…
ndickson-nvidia Sep 23, 2024
92ab751
Build fix in features.cpp
ndickson-nvidia Sep 23, 2024
f123565
Skipping test_training.py for now
WenkelF Sep 23, 2024
e887176
Merge pull request #521 from ndickson-nvidia/atom_order
DomInvivo Sep 23, 2024
1c4aa3b
Updated documentation
AnujaSomthankar Sep 27, 2024
75df01a
Updated documentation
AnujaSomthankar Sep 27, 2024
8436111
Wrapping up finetuning updates
WenkelF Oct 31, 2024
9cfabb7
Update readme
WenkelF Nov 1, 2024
bf504a8
Fixed finetuning unit test
WenkelF Nov 1, 2024
42c9dbc
Fixing docs
WenkelF Nov 1, 2024
cbf9dc7
Merge pull request #530 from datamol-io/remove_ipu
WenkelF Nov 4, 2024
807913f
Reducing size of example finetuning dataset
WenkelF Nov 4, 2024
697e3d1
Cleaning up configs and naming conventions
WenkelF Nov 5, 2024
d9aa407
Minor changes and documentation
WenkelF Nov 5, 2024
7759266
Minor change
WenkelF Nov 5, 2024
7cfa89a
Merge pull request #529 from datamol-io/upgrade-finetuning
WenkelF Nov 5, 2024
8a65d84
Added C++ file description comments
ndickson-nvidia Nov 14, 2024
5ffc2f6
Merge pull request #532 from ndickson-nvidia/cpp_file_descriptions
DomInvivo Nov 16, 2024
16 changes: 12 additions & 4 deletions .github/workflows/test.yml
@@ -16,8 +16,13 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.8", "3.9", "3.10"]
pytorch-version: ["2.0"]
include:
- python-version: "3.10"
pytorch-version: "2.0"
- python-version: "3.11"
pytorch-version: "2.0"
- python-version: "3.12"
pytorch-version: "2.3"

runs-on: "ubuntu-latest"
timeout-minutes: 30
@@ -49,8 +54,11 @@ jobs:
- name: Install library
run: python -m pip install --no-deps -e . # `-e` required for correct `coverage` run.

- name: Run tests
run: pytest -m 'not ipu'
- name: Install test dependencies
run: micromamba install -c conda-forge pytdc # Required to run the `test_finetuning.py`

- name: Install C++ library
run: cd graphium/graphium_cpp && git clone https://github.com/pybind/pybind11.git && export PYTHONPATH=$PYTHONPATH:./pybind11 && python -m pip install . && cd ../..

- name: Test CLI
run: graphium --help
69 changes: 0 additions & 69 deletions .github/workflows/test_ipu.yml

This file was deleted.

11 changes: 2 additions & 9 deletions .gitignore
@@ -29,6 +29,7 @@ draft/
scripts-expts/
sweeps/
mup/
loc-*

# Data and predictions
graphium/data/ZINC_bench_gnn/
@@ -38,6 +39,7 @@ graphium/data/cache/
graphium/data/b3lyp/
graphium/data/PCQM4Mv2/
graphium/data/PCQM4M/
graphium/data/largemix/
graphium/data/neurips2023/small-dataset/
graphium/data/neurips2023/large-dataset/
graphium/data/neurips2023/dummy-dataset/
@@ -53,15 +55,6 @@ debug/
change_commits.sh
graphium/features/test_new_pes.ipynb

# IPU related ignores and profiler outputs
*.a
*.cbor
*.capnp
*.pop
*.popart
*.pop_cache
*.popef
*.pvti*

############ END graphium Custom GitIgnore ##############

1 change: 1 addition & 0 deletions LICENSE
@@ -189,6 +189,7 @@
Copyright 2023 Valence Labs
Copyright 2023 Recursion Pharmaceuticals
Copyright 2023 Graphcore Limited
Copyright 2024 NVIDIA CORPORATION & AFFILIATES

Various Academic groups have also contributed to this software under
the given license. These include, but are not limited, to the following
105 changes: 78 additions & 27 deletions README.md
@@ -13,7 +13,6 @@
[![GitHub Repo stars](https://img.shields.io/github/stars/datamol-io/graphium)](https://github.com/datamol-io/graphium/stargazers)
[![GitHub Repo stars](https://img.shields.io/github/forks/datamol-io/graphium)](https://github.com/datamol-io/graphium/network/members)
[![test](https://github.com/datamol-io/graphium/actions/workflows/test.yml/badge.svg)](https://github.com/datamol-io/graphium/actions/workflows/test.yml)
[![test-ipu](https://github.com/datamol-io/graphium/actions/workflows/test_ipu.yml/badge.svg)](https://github.com/datamol-io/graphium/actions/workflows/test_ipu.yml)
[![release](https://github.com/datamol-io/graphium/actions/workflows/release.yml/badge.svg)](https://github.com/datamol-io/graphium/actions/workflows/release.yml)
[![code-check](https://github.com/datamol-io/graphium/actions/workflows/code-check.yml/badge.svg)](https://github.com/datamol-io/graphium/actions/workflows/code-check.yml)
[![doc](https://github.com/datamol-io/graphium/actions/workflows/doc.yml/badge.svg)](https://github.com/datamol-io/graphium/actions/workflows/doc.yml)
@@ -35,8 +34,6 @@ Visit https://graphium-docs.datamol.io/.

## Installation for developers

### For CPU and GPU developers

Use [`mamba`](https://github.com/mamba-org/mamba), a faster and better alternative to `conda`.

If you are using a GPU, we recommend enforcing the CUDA version that you need with `CONDA_OVERRIDE_CUDA=XX.X`.
@@ -53,25 +50,67 @@ mamba activate graphium
pip install --no-deps -e .
```
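
Graphium 3.0 also introduces the compiled `graphium_cpp` featurization module, which currently has to be built from source as well. Below is a minimal sketch mirroring the CI step added to `.github/workflows/test.yml` in this PR; it assumes a working C++ toolchain and the RDKit/Boost headers from the environment above are available:

```bash
# Build and install the graphium_cpp extension (mirrors the CI step in test.yml)
cd graphium/graphium_cpp
git clone https://github.com/pybind/pybind11.git   # header-only build dependency
export PYTHONPATH=$PYTHONPATH:./pybind11
python -m pip install .
cd ../..
```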

### For IPU developers
## Training a model

To learn how to train a model, we invite you to look at the documentation or the Jupyter notebooks available [here](https://github.com/datamol-io/graphium/tree/master/docs/tutorials/model_training).

If you are not familiar with [PyTorch](https://pytorch.org/docs) or [PyTorch-Lightning](https://pytorch-lightning.readthedocs.io/en/latest/), we highly recommend going through their tutorial first.

## Running an experiment

### Datasets

Graphium provides configs for two datasets: `toymix` and `largemix`.
`Toymix` uses three datasets, which are referenced in the datamodule [here](https://github.com/datamol-io/graphium/blob/d12df7e06828fa7d7f8792141d058a60b2b2d258/expts/hydra-configs/tasks/loss_metrics_datamodule/toymix.yaml#L59-L102). The datasets and their split files can be downloaded as follows:

```bash
# Install Graphcore's SDK and Graphium dependencies in a new environment called `.graphium_ipu`
./install_ipu.sh .graphium_ipu
# Create or change to the directory where the dataset should be downloaded
cd expts/data/neurips2023/small-dataset

# QM9
wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/qm9.csv.gz
wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/qm9_random_splits.pt

# Tox21
wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/Tox21-7k-12-labels.csv.gz
wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/Tox21_random_splits.pt

# Zinc
wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/ZINC12k.csv.gz
wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/ZINC12k_random_splits.pt
```

The above step needs to be done once. After that, enable the SDK and the environment as follows:
`Largemix` uses the datasets referenced in the datamodule [here](https://github.com/datamol-io/graphium/blob/e887176f71ee95c3b82f8f6b56c706eaa9765bf1/expts/hydra-configs/tasks/loss_metrics_datamodule/largemix.yaml#L82C1-L155C37). The datasets and their split files can be downloaded as follows:


```bash
source enable_ipu.sh .graphium_ipu
```
# Create or change to the directory where the dataset should be downloaded
cd ../data/graphium/large-dataset/

## Training a model
# L1000_VCAP
wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz
wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt

To learn how to train a model, we invite you to look at the documentation, or the jupyter notebooks available [here](https://github.com/datamol-io/graphium/tree/master/docs/tutorials/model_training).
# L1000_MCF7
wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz
wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt

If you are not familiar with [PyTorch](https://pytorch.org/docs) or [PyTorch-Lightning](https://pytorch-lightning.readthedocs.io/en/latest/), we highly recommend going through their tutorial first.
# PCBA_1328
wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet
wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt

# PCQM4M_G25
wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet
wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt

# PCQM4M_N4
wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet
wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt
```
These datasets can then be used for pretraining.

### Pretraining

## Running an experiment
We have set up Graphium with `hydra` for managing config files. To run an experiment, go to the `expts/` folder. For example, to benchmark a GCN on the ToyMix dataset run
```bash
graphium-train architecture=toymix tasks=toymix training=toymix model=gcn
@@ -86,34 +125,46 @@ Integrating `hydra` also allows you to quickly switch between accelerators. E.g.
graphium-train architecture=toymix tasks=toymix training=toymix model=gcn accelerator=gpu
```
automatically selects the correct configs to run the experiment on GPU.
Finally, you can also run a fine-tuning loop:
```bash
graphium-train +finetuning=admet
```
To use the Largemix dataset instead, replace `toymix` with `largemix` in the above commands.
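For example, the GCN benchmark above becomes (assuming matching `largemix` config groups exist for each override):
```bash
graphium-train architecture=largemix tasks=largemix training=largemix model=gcn
```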

To use a config file you built from scratch you can run
```bash
graphium-train --config-path [PATH] --config-name [CONFIG]
```
Thanks to the modular nature of `hydra`, you can reuse many of our config settings for your own experiments with Graphium.

## Preparing the data in advance
The data preparation including the featurization (e.g., of molecules from smiles to pyg-compatible format) is embedded in the pipeline and will be performed when executing `graphium-train [...]`.
### Finetuning

However, when working with larger datasets, it is recommended to perform data preparation in advance using a machine with sufficient allocated memory (e.g., ~400GB in the case of `LargeMix`). Preparing data in advance is also beneficial when running lots of concurrent jobs with identical molecular featurization, so that resources aren't wasted and processes don't conflict reading/writing in the same directory.
After pretraining a model and saving a model checkpoint, the model can be finetuned to a new task.

The following command-line will prepare the data and cache it, then use it to train a model.
```bash
# First prepare the data and cache it in `path_to_cached_data`
graphium data prepare ++datamodule.args.processed_graph_data_path=[path_to_cached_data]
graphium-train +finetuning [example-custom OR example-tdc] finetuning.pretrained_model=[model_identifier]
```

# Then train the model on the prepared data
graphium-train [...] datamodule.args.processed_graph_data_path=[path_to_cached_data]
The `[model_identifier]` selects the pretrained model among those maintained in `GRAPHIUM_PRETRAINED_MODELS_DICT` in `graphium/utils/spaces.py`, which maps each identifier to the location of that model's checkpoint.
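
To see which identifiers are available in your installation, one option is to inspect that dictionary directly (a quick sketch, assuming `GRAPHIUM_PRETRAINED_MODELS_DICT` is a plain dict importable from `graphium.utils.spaces`):

```bash
# List the available pretrained-model identifiers, one per line
python -c "from graphium.utils.spaces import GRAPHIUM_PRETRAINED_MODELS_DICT as m; print('\n'.join(m))"
```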

We have provided two example yaml configs under `expts/hydra-configs/finetuning` for finetuning on a custom dataset (`example-custom.yaml`) or for a task from the TDC benchmark collection (`example-tdc.yaml`).

When using `example-custom.yaml`, to finetune on a custom dataset, we need to provide the location of the data (`constants.data_path=[path_to_data]`) and the type of task (`constants.task_type=[cls OR reg]`).

When using `example-tdc.yaml`, to finetune on a TDC task, we only need to provide the task name (`constants.task=[task_name]`); the task type is inferred automatically.

Custom datasets to finetune from consist of two files, `raw.csv` and `split.csv`. The `raw.csv` file contains two columns: `smiles` with the SMILES strings and `target` with the corresponding targets. In `split.csv`, the three columns `train`, `val`, and `test` contain the indices of the rows in `raw.csv`. Examples can be found under `expts/data/finetuning_example-reg` (regression) and `expts/data/finetuning_example-cls` (binary classification), and a minimal sketch of the layout is shown below.
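
A minimal sketch of this layout (hypothetical toy values; padding the shorter `val`/`test` columns with empty cells is an assumption based on the csv format described above):

```bash
mkdir -p expts/data/finetuning_example-toy
# raw.csv: one smiles column and one target column
cat > expts/data/finetuning_example-toy/raw.csv <<'EOF'
smiles,target
CCO,0
c1ccccc1,1
CC(=O)O,0
CCN,1
CCCC,0
EOF
# split.csv: each column holds row indices of raw.csv; shorter columns stay empty
cat > expts/data/finetuning_example-toy/split.csv <<'EOF'
train,val,test
0,3,4
1,,
2,,
EOF
```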

### Fingerprinting

Alternatively, we can obtain molecular embeddings (fingerprints) from a pretrained model:
```bash
graphium fps create [example-custom OR example-tdc] pretrained.model=[model_identifier] pretrained.layers=[layer_identifiers]
```

**Note** that `datamodule.args.processed_graph_data_path` can also be specified at `expts/hydra_configs/`.
We have provided two example yaml configs under `expts/hydra-configs/fingerprinting` for extracting fingerprints for a custom dataset (`example-custom.yaml`) or for a dataset from the TDC benchmark collection (`example-tdc.yaml`).

After specifying the `[model_identifier]`, we need to provide a list of layers from that model where we want to read out embeddings via `[layer_identifiers]` (which requires knowledge of the architecture of the pretrained model).

When using `example-custom.yaml`, the location of the SMILES strings to be embedded needs to be passed via `datamodule.df_path=[path_to_data]`. The data can be passed as a csv/parquet file with a column `smiles`, similar to `expts/data/finetuning_example-reg/raw.csv`.

**Note** that, every time the configs of `datamodule.args.featurization` change, you will need to run a new data preparation, which will automatically be saved in a separate directory that uses a hash unique to the configs.
When extracting fingerprints for a TDC task using `example-tdc.yaml`, we need to specify `datamodule.benchmark` and `datamodule.task` instead of `datamodule.df_path`.
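
Putting the pieces together, two sketched invocations of the command above (the model identifier, layer names, benchmark, and task are placeholders, not verified values):

```bash
# Custom dataset: embed the smiles from a local csv file
graphium fps create example-custom \
    pretrained.model=my_pretrained_model \
    'pretrained.layers=[my_layer_1,my_layer_2]' \
    datamodule.df_path=expts/data/finetuning_example-reg/raw.csv

# TDC benchmark: point at a benchmark/task instead of a file
graphium fps create example-tdc \
    pretrained.model=my_pretrained_model \
    'pretrained.layers=[my_layer_1,my_layer_2]' \
    datamodule.benchmark=my_benchmark \
    datamodule.task=my_task
```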

## License

5 changes: 0 additions & 5 deletions codecov.yml
@@ -16,8 +16,3 @@ component_management:
target: auto
branches:
- "!main"
individual_components:
- component_id: ipu # this is an identifier that should not be changed
name: ipu # this is a display name, and can be changed freely
paths:
- graphium/ipu/**
29 changes: 0 additions & 29 deletions docs/api/graphium.features.md
@@ -5,37 +5,8 @@ Feature extraction and manipulation
=== "Contents"

* [Featurizer](#featurizer)
* [Positional Encoding](#positional-encoding)
* [Properties](#properties)
* [Spectral PE](#spectral-pe)
* [Random Walk PE](#random-walk-pe)
* [NMP](#nmp)

## Featurizer
------------
::: graphium.features.featurizer


## Positional Encoding
------------
::: graphium.features.positional_encoding


## Properties
------------
::: graphium.features.properties


## Spectral PE
------------
::: graphium.features.spectral


## Random Walk PE
------------
::: graphium.features.rw


## NMP
------------
::: graphium.features.nmp
2 changes: 1 addition & 1 deletion docs/api/graphium.finetuning.md
@@ -10,4 +10,4 @@ Module for finetuning models and doing linear probing (fingerprinting).

::: graphium.finetuning.finetuning_architecture.FinetuningHead

::: graphium.finetuning.fingerprinting.Fingerprinter
::: graphium.fingerprinting.fingerprinter.Fingerprinter
48 changes: 0 additions & 48 deletions docs/api/graphium.ipu.md

This file was deleted.
