From 6facdb1088d93ff679b5e5704b4dcfb3baaf010b Mon Sep 17 00:00:00 2001 From: maclandrol Date: Thu, 20 Jul 2023 13:57:40 -0400 Subject: [PATCH 1/8] better error for Moleculetransformer --- molfeat/trans/base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/molfeat/trans/base.py b/molfeat/trans/base.py index 6f7f499..2bb4311 100644 --- a/molfeat/trans/base.py +++ b/molfeat/trans/base.py @@ -32,6 +32,7 @@ from molfeat.utils.cache import CacheList from molfeat.utils.commons import fn_to_hex from molfeat.utils.commons import hex_to_fn +from molfeat.utils.commons import is_callable from molfeat.utils.parsing import get_input_args from molfeat.utils.parsing import import_from_string from molfeat.utils.state import map_dtype @@ -198,6 +199,8 @@ def __init__( self._fitted = False self._save_input_args() + if self.featurizer and (not isinstance(self.featurizer, str) or not is_callable(self.featurizer)): + raise AttributeError("Featurizer must be a callable or a string") def _save_input_args(self): """Save the input arguments of a transformer to the attribute @@ -319,7 +322,7 @@ def _to_mol(x): if not ignore_errors: for ind, feat in enumerate(features): if feat is None: - raise ValueError(f"Cannot transform molecule at index {ind}") + raise ValueError(f"Cannot transform molecule at index {ind}. Please check logs (set verbose to True) to see errors!") return features From 38206135c70d746608b21c45441624f4c39a74ed Mon Sep 17 00:00:00 2001 From: maclandrol Date: Thu, 20 Jul 2023 13:58:29 -0400 Subject: [PATCH 2/8] add is_callable util function --- molfeat/utils/cache.py | 4 ++-- molfeat/utils/commons.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/molfeat/utils/cache.py b/molfeat/utils/cache.py index 13ffd87..49749bb 100644 --- a/molfeat/utils/cache.py +++ b/molfeat/utils/cache.py @@ -260,7 +260,7 @@ def fetch( try: cacher = copy.deepcopy(self) n_jobs = self.n_jobs - except: + except: # noqa # cannot parallelize process, ensure n_jobs is 0 cacher = self n_jobs = 0 @@ -357,7 +357,7 @@ def clear(self, delete: bool = False): for path in glob.glob(str(self.cache_file) + "*"): try: os.unlink(path) - except: + except: # noqa pass else: self._initialize_cache() diff --git a/molfeat/utils/commons.py b/molfeat/utils/commons.py index 7b2e2dd..09f0e2c 100644 --- a/molfeat/utils/commons.py +++ b/molfeat/utils/commons.py @@ -6,6 +6,7 @@ from typing import List from typing import Union +import types import os import inspect import hashlib @@ -25,6 +26,15 @@ from molfeat.utils import datatype +FUNCTYPES = (types.FunctionType, types.MethodType, functools.partial) + +def is_callable(func): + r""" + Check if func is a function or a callable + """ + return func and (isinstance(func, FUNCTYPES) or callable(func)) + + def sha256sum(filepath: Union[str, os.PathLike]): """Return the sha256 sum hash of a file or a directory From 6cd9c1235ae46dd3900525555146682a11b631ed Mon Sep 17 00:00:00 2001 From: maclandrol Date: Thu, 20 Jul 2023 13:59:01 -0400 Subject: [PATCH 3/8] improve docs --- docs/usage.md | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++ mkdocs.yml | 1 + 2 files changed, 78 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index 04d1328..c3ac6a1 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -46,3 +46,80 @@ model_card = store.search(name="ChemBERTa-77M-MLM")[0] model_card.usage() ``` + +## FAQ +
+#### What is a molecular featurizer ?
+
+A molecular featurizer is a function or model that computes numerical representations of molecular structures. These numerical features can then be used as input for machine learning models to predict molecular properties and activities, to design new molecules, to perform molecular analyses, or to search for similar molecules.
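+A minimal sketch of what this looks like in practice, assuming the built-in `ecfp` fingerprint featurizer (any other featurizer shipped with `molfeat` works the same way):
+
+```python
+import numpy as np
+from molfeat.trans import MoleculeTransformer
+
+# turn SMILES strings into fixed-size fingerprint vectors
+featurizer = MoleculeTransformer(featurizer="ecfp", dtype=np.float32)
+feats = featurizer(["CCO", "c1ccccc1"])  # one vector per molecule
+```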
+ + +
+#### Why so many molecular featurizers in `molfeat`?
+
+To date, it's not clear which molecular representation performs best. There are multiple ways of representing molecules (e.g. using their physico-chemical descriptors, using a fingerprint corresponding to a hash of the molecular structure, using deep learning embeddings, etc.). Depending on your task, one representation could perform better than another, which is why `molfeat` attempts to provide a broad range of featurizers, ensuring that everyone has access to their favorite ones.
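+Switching representations is then just a matter of changing the featurizer name, as in this sketch (assuming the built-in `desc2D`, `ecfp` and `maccs` featurizers):
+
+```python
+import numpy as np
+from molfeat.trans import MoleculeTransformer
+
+smiles = ["CCO", "c1ccccc1"]
+# try several representations on the same molecules to compare them on your task
+for kind in ["desc2D", "ecfp", "maccs"]:
+    featurizer = MoleculeTransformer(featurizer=kind, dtype=np.float32)
+    feats = featurizer(smiles)
+```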
+ + +
+#### What is the difference between a calculator and a featurizer in `molfeat`?
+
+In `molfeat`,
+
+- a `calculator` operates at the level of a single molecule: it dictates how to transform one input molecule into a numerical representation.
+- a `featurizer` operates on batches of molecules, because deep learning models are often more efficient on batches of samples. Some `featurizers` use a `calculator` internally on each molecule and stitch the results together. `featurizers` also provide convenient tools, such as parallelism and caching, to make the computation of molecular representations efficient. The sketch after this list illustrates the distinction.
+
+`molfeat` is designed to be extremely flexible, because the space of actions that users wish to perform is huge and there is often no single "right" way to perform them.
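+Here is a minimal sketch of that distinction, assuming the built-in `ecfp` calculator:
+
+```python
+from molfeat.calc import FPCalculator
+from molfeat.trans import MoleculeTransformer
+
+calc = FPCalculator("ecfp")
+single_feat = calc("CCO")  # a calculator featurizes one molecule at a time
+
+# a featurizer wraps the calculator to process a whole batch, possibly in parallel
+featurizer = MoleculeTransformer(calc, n_jobs=-1)
+batch_feats = featurizer(["CCO", "CC(=O)O", "c1ccccc1"])
+```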
+ +
+#### What are the functions I should know when using a `featurizer`?
+
+Every featurizer has:
+
+- a `preprocess` method that can perform preprocessing of your input molecules, to ensure compatibility with the featurizer class you are using. The preprocessing step is not called automatically, to keep it decoupled from the molecular transformation itself: it is a suggestion for the preprocessing you should perform when using a given featurizer. The `preprocess` method expects your molecule inputs, as well as some optional labels, and can be redefined when creating your own custom featurizer.
+- a `transform` method that operates on a batch of molecules and returns a list of representations; this is where the magic happens. Positions where featurization failed can be `None` when you elect to `ignore_errors`.
+- a `_transform` method that operates on a single input molecule; `transform` builds on it to handle batches.
+- a `__call__` method that uses `transform` under the hood and adds some conveniences, such as enforcing on the outputs the datatype you defined when initializing your featurizer. If you ask to `ignore_errors`, a vector of the indexes where featurization did not fail will also be returned.
+
+In addition to the methods described above, `PretrainedMolTransformer` also defines the following:
+
+- `_embed`: since pre-trained models benefit from batched featurization, this method is called internally during `transform` instead of a per-molecule calculator.
+- `_convert`: this method is called by the transformer to convert the molecule input into the format expected by the underlying ML model. For example, for a pre-trained language model expecting SELFIES strings, the input is converted into SELFIES strings here.
+
+The sketch after these lists shows the batch-level methods in action.
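+A small sketch of these methods, assuming the built-in `maccs` featurizer and following the `ignore_errors` behavior described above (the second SMILES is deliberately invalid):
+
+```python
+import numpy as np
+from molfeat.trans import MoleculeTransformer
+
+featurizer = MoleculeTransformer(featurizer="maccs", dtype=np.float32)
+smiles = ["CCO", "not-a-smiles", "c1ccccc1"]
+
+# transform returns a list with None at the positions that failed
+feats = featurizer.transform(smiles, ignore_errors=True)
+
+# __call__ additionally returns the indexes where featurization did not fail
+feats, valid_idx = featurizer(smiles, ignore_errors=True)
+```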
+ +
+#### I am getting an error and I am not sure what to do
+
+Users can decide to `ignore_errors` when featurization fails on some molecules of their dataset, with the hope of filtering them out afterwards. As a consequence, some errors are silently caught during `transform`. Set the verbosity of the featurizer to `True` to get a log of all errors:
+
+```python
+import numpy as np
+from molfeat.trans import MoleculeTransformer
+
+featurizer = MoleculeTransformer(..., dtype=np.float32, verbose=True)
+featurizer(["CSc1nc2cc3c(cc2[nH]1)N(Cc1ccc(S(=O)(=O)c2ccccc2)cc1)CCC3"], enforce_dtype=True)
+```
+
+With `verbose=True`, you will always have a log of all errors.
+ +
+#### What are the base featurizer classes in molfeat and how do I use them?
+
+| Class | Module | Why? |
+|-------|--------|------|
+| [`BaseFeaturizer`](https://molfeat-docs.datamol.io/stable/api/molfeat.trans.base.html#molfeat.trans.base.BaseFeaturizer) | `molfeat.trans.base` | Lowest-level featurizer class. All featurizers (even non-molecular ones) inherit from this class. It's recommended to use `MoleculeTransformer` as the root class instead. |
+| [`MoleculeTransformer`](https://molfeat-docs.datamol.io/stable/api/molfeat.trans.base.html#molfeat.trans.base.MoleculeTransformer) | `molfeat.trans.base` | Base class for all molecule featurizers. This is where you start if you want to implement a new featurizer. You can provide either an existing `calculator` or your own (a **python callable**) directly to define a new `featurizer`, as in the sketch below. |
+| [`PrecomputedMolTransformer`](https://molfeat-docs.datamol.io/stable/api/molfeat.trans.base.html#molfeat.trans.base.PrecomputedMolTransformer) | `molfeat.trans.base` | Class for dealing with precomputed features. You can leverage this class to compute features, save them in a file, and reload them later for other tasks efficiently. [See this tutorial!](https://molfeat-docs.datamol.io/stable/tutorials/datacache.html#using-a-cache-with-a-precomputed-transformer) |
+| [`FeatConcat`](https://molfeat-docs.datamol.io/stable/api/molfeat.trans.concat.html#molfeat.trans.concat.FeatConcat) | `molfeat.trans.concat` | Convenient class for concatenating multiple vector featurizers automatically. If you want to combine multiple fingerprints and descriptors, this is the class you use. [See example!](https://molfeat-docs.datamol.io/stable/tutorials/types_of_featurizers.html#concatenate-featurizers) |
+| [`PretrainedMolTransformer`](https://molfeat-docs.datamol.io/stable/api/molfeat.trans.pretrained.base.html) | `molfeat.trans.pretrained.base` | Base class for all `pretrained featurizers`, i.e. featurizers derived from a pretrained machine learning model. Implement a subclass of this to define your new pretrained featurizer. [See example!](https://molfeat-docs.datamol.io/stable/tutorials/add_your_own.html#define-your-own-transformer) |
+| `PretrainedDGLTransformer` | `molfeat.trans.pretrained.dgl_pretrained` | Base class for all `dgl pretrained featurizers`. You can initialize a new dgl/dgllife pretrained model as a `molfeat featurizer` easily using this class. You only need to add the dgl model object to a store. |
+| [`PretrainedHFTransformer`](https://molfeat-docs.datamol.io/stable/api/molfeat.trans.pretrained.hf_transformers.html#molfeat.trans.pretrained.hf_transformers.PretrainedHFTransformer) | `molfeat.trans.pretrained.hf_transformer` | Base class for all `huggingface pretrained featurizers`. You can initialize a new 🤗 Transformers pretrained model as a `molfeat featurizer` easily using this class. [See this example!](https://github.com/datamol-io/molfeat/blob/main/nb/etl/molt5-etl.ipynb) |
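+For instance, here is a minimal sketch of defining a new featurizer from a plain python callable with `MoleculeTransformer` (the callable below is a hypothetical example computing two trivial descriptors):
+
+```python
+import datamol as dm
+import numpy as np
+from molfeat.trans import MoleculeTransformer
+
+def my_calculator(mol):
+    # any python callable that takes a molecule and returns a vector works
+    mol = dm.to_mol(mol)
+    return np.array([mol.GetNumAtoms(), mol.GetNumBonds()])
+
+featurizer = MoleculeTransformer(featurizer=my_calculator, dtype=np.float32)
+feats = featurizer(["CCO", "c1ccccc1"])
+```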
diff --git a/mkdocs.yml b/mkdocs.yml index cf82d31..a4739ca 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -46,6 +46,7 @@ nav: - molfeat.trans.struct: api/molfeat.trans.struct.md - molfeat.trans.concat: api/molfeat.trans.concat.md - molfeat.trans.pretrained: + - Base Pretrained Models: api/molfeat.trans.pretrained.base.md - HuggingFace: api/molfeat.trans.pretrained.hf_transformers.md - Graphormer: api/molfeat.trans.pretrained.graphormer.md - DGL: api/molfeat.trans.pretrained.dgl_pretrained.md From ff31666897da40ec03eb712059c54a730c3d4dac Mon Sep 17 00:00:00 2001 From: maclandrol Date: Thu, 20 Jul 2023 14:00:02 -0400 Subject: [PATCH 4/8] doc --- docs/api/molfeat.trans.pretrained.base.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 docs/api/molfeat.trans.pretrained.base.md diff --git a/docs/api/molfeat.trans.pretrained.base.md b/docs/api/molfeat.trans.pretrained.base.md new file mode 100644 index 0000000..6e9d88b --- /dev/null +++ b/docs/api/molfeat.trans.pretrained.base.md @@ -0,0 +1,3 @@ +## Pretrained Model + +::: molfeat.trans.pretrained.base \ No newline at end of file From 1eac67d65aacb0b2b368e3441ed0a3b862169b08 Mon Sep 17 00:00:00 2001 From: maclandrol Date: Thu, 20 Jul 2023 16:29:16 -0400 Subject: [PATCH 5/8] wip --- molfeat/trans/base.py | 8 ++++++-- molfeat/trans/pretrained/hf_transformers.py | 2 +- molfeat/utils/cache.py | 4 ++-- molfeat/utils/commons.py | 1 + 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/molfeat/trans/base.py b/molfeat/trans/base.py index 2bb4311..c5eb42e 100644 --- a/molfeat/trans/base.py +++ b/molfeat/trans/base.py @@ -199,7 +199,9 @@ def __init__( self._fitted = False self._save_input_args() - if self.featurizer and (not isinstance(self.featurizer, str) or not is_callable(self.featurizer)): + if self.featurizer and ( + not isinstance(self.featurizer, str) or not is_callable(self.featurizer) + ): raise AttributeError("Featurizer must be a callable or a string") def _save_input_args(self): @@ -322,7 +324,9 @@ def _to_mol(x): if not ignore_errors: for ind, feat in enumerate(features): if feat is None: - raise ValueError(f"Cannot transform molecule at index {ind}. Please check logs (set verbose to True) to see errors!") + raise ValueError( + f"Cannot transform molecule at index {ind}. Please check logs (set verbose to True) to see errors!" 
+ ) return features diff --git a/molfeat/trans/pretrained/hf_transformers.py b/molfeat/trans/pretrained/hf_transformers.py index 2593e7f..f254144 100644 --- a/molfeat/trans/pretrained/hf_transformers.py +++ b/molfeat/trans/pretrained/hf_transformers.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import List, Optional +from typing import List from typing import Union from typing import Optional diff --git a/molfeat/utils/cache.py b/molfeat/utils/cache.py index 49749bb..4e9c2d7 100644 --- a/molfeat/utils/cache.py +++ b/molfeat/utils/cache.py @@ -260,7 +260,7 @@ def fetch( try: cacher = copy.deepcopy(self) n_jobs = self.n_jobs - except: # noqa + except: # noqa # cannot parallelize process, ensure n_jobs is 0 cacher = self n_jobs = 0 @@ -357,7 +357,7 @@ def clear(self, delete: bool = False): for path in glob.glob(str(self.cache_file) + "*"): try: os.unlink(path) - except: # noqa + except: # noqa pass else: self._initialize_cache() diff --git a/molfeat/utils/commons.py b/molfeat/utils/commons.py index 09f0e2c..83cb72e 100644 --- a/molfeat/utils/commons.py +++ b/molfeat/utils/commons.py @@ -28,6 +28,7 @@ FUNCTYPES = (types.FunctionType, types.MethodType, functools.partial) + def is_callable(func): r""" Check if func is a function or a callable From dcb66f2271739a5bb90d7c571c32b99b9a2ec950 Mon Sep 17 00:00:00 2001 From: maclandrol Date: Thu, 20 Jul 2023 16:29:31 -0400 Subject: [PATCH 6/8] wip --- docs/assets/css/custom-molfeat.css | 45 ++++++++++------- docs/usage.md | 81 ++++++++++++++---------------- 2 files changed, 64 insertions(+), 62 deletions(-) diff --git a/docs/assets/css/custom-molfeat.css b/docs/assets/css/custom-molfeat.css index 7ef1a27..7c89bc7 100644 --- a/docs/assets/css/custom-molfeat.css +++ b/docs/assets/css/custom-molfeat.css @@ -1,20 +1,20 @@ :root { - --datamol-primary: #217EBB; - --datamol-secondary: #343a40; + --molfeat-primary: #217EBB; + --molfeat-secondary: #5f6d7a; /* Primary color shades */ - --md-primary-fg-color: var(--datamol-primary); - --md-primary-fg-color--light: var(--datamol-primary); - --md-primary-fg-color--dark: var(--datamol-primary); - --md-primary-bg-color: var(--datamol-secondary); - --md-primary-bg-color--light: var(--datamol-secondary); - --md-text-link-color: var(--datamol-secondary); + --md-primary-fg-color: var(--molfeat-primary); + --md-primary-fg-color--light: var(--molfeat-primary); + --md-primary-fg-color--dark: var(--molfeat-primary); + --md-primary-bg-color: var(--molfeat-secondary); + --md-primary-bg-color--light: var(--molfeat-secondary); + --md-text-link-color: var(--molfeat-secondary); /* Accent color shades */ - --md-accent-fg-color: var(--datamol-secondary); - --md-accent-fg-color--transparent: var(--datamol-secondary); - --md-accent-bg-color: var(--datamol-secondary); - --md-accent-bg-color--light: var(--datamol-secondary); + --md-accent-fg-color: var(--molfeat-secondary); + --md-accent-fg-color--transparent: var(--molfeat-secondary); + --md-accent-bg-color: var(--molfeat-secondary); + --md-accent-bg-color--light: var(--molfeat-secondary); } :root>* { @@ -23,11 +23,11 @@ --md-code-fg-color: hsla(200, 18%, 26%, 1); /* Footer */ - --md-footer-bg-color: var(--datamol-primary); + --md-footer-bg-color: var(--molfeat-primary); /* --md-footer-bg-color--dark: hsla(0, 0%, 0%, 0.32); */ - --md-footer-fg-color: var(--datamol-secondary); - --md-footer-fg-color--light: var(--datamol-secondary); - --md-footer-fg-color--lighter: var(--datamol-secondary); + --md-footer-fg-color: var(--molfeat-secondary); + 
--md-footer-fg-color--light: var(--molfeat-secondary); + --md-footer-fg-color--lighter: var(--molfeat-secondary); } @@ -40,7 +40,7 @@ } .md-tabs { - background-image: linear-gradient(to right, #F4F6F9, #C3CFE2); + background-image: linear-gradient(to right, #F4F6F9, #CCE3f8); } .md-header__topic { @@ -63,7 +63,16 @@ } .md-search__form { - background-color: rgba(255, 255, 255, 0.2); + background-color: rgba(255, 255, 255, 0.4); +} + +.md-search-result__article:hover { + background-color: #CCE3f8; +} + +.md-search-result__more:hover, +.md-search-result__more:focus { + background-color: #CCE3f8 !important; } .md-search__input { diff --git a/docs/usage.md b/docs/usage.md index c3ac6a1..26b2ee8 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -48,55 +48,50 @@ model_card.usage() ## FAQ -
-What is a molecular featurizer ? +#### What is a molecular featurizer ? +A molecular featurizer is a function or model that provides numerical representations for molecular structures. These numerical features serve as inputs for machine learning models, enabling them to predict molecular properties and activities, design novel molecules, perform molecular analyses, or conduct searches for similar molecules. -A molecular featurizer is function or model that provides numerical representations from molecular structures. These numerical features can then be used as input for machine learning models to predict molecular properties and activities, to design new molecules, to perform molecular analyses, or to search for similar molecules. -
+#### Why so many molecular featurizers in `molfeat`? +The reason for providing a diverse range of molecular featurizers in `molfeat` is to address the inherent uncertainty in determining which molecular representation performs best for a given task. Different featurization methods exist, such as using physico-chemical descriptors, molecular structure fingerprints, deep learning embeddings, and more. The effectiveness of these representations varies depending on the specific application. Therefore, the availability of multiple featurizers in `molfeat` ensures that users can access the most suitable featurizer for their unique needs. -
-Why so many molecular featurizers in `molfeat`? -To date, it's not clear which molecular representation performs better. There are multiple ways of representing molecules (e.g using their physico-chemical descriptors, using a fingerprint corresponding to a hash of the molecular structure, using deep learning embeddings, etc). Depending on your tasks, one representation could perform better than another, this is why `molfeat` attempt to provide a broad range of featurizer to ensure, everyone has access to their favorite featurizers. -
+#### What is the difference between a calculator and a featurizer in `molfeat`? +In `molfeat`, -
+#### What is the difference between a calculator and a featurizer in `molfeat`?
+
+In `molfeat`,
+
+- a `calculator` operates on individual molecules and specifies the process of transforming an input molecule into a numerical representation.
+- a `featurizer` works with batches of molecules, leveraging the efficiency of deep learning models on batch processing. Some `featurizers` use a `calculator` internally to featurize each molecule individually and then stitch the outputs together. Additionally, `featurizers` offer convenient tools, such as parallelism and caching, to optimize the computation of molecular representations.
+
+`molfeat` has been designed with utmost flexibility, recognizing that the space of actions users wish to perform with molecular data is vast and diverse, and there often isn't a single "right" way to approach them.
-What are the function I should know when using a `featurizer` ? +- `preprocess()`: This method performs preprocessing of your input molecules to ensure compatibility with the expected featurizer class you are using. It's essential to note that the preprocessing steps **are not automatically applied to your inputs** to maintain independence from the molecular transformation. The preprocess function takes your molecule inputs, along with optional labels, and can be redefined when creating a custom featurizer. +- `transform()`: This method operates on a batch of molecules and returns a list of representations, where the actual featurization occurs. In cases where featurization fails, the position can be denoted as `None`, especially when you choose to `ignore_errors`. +- `_transform()`: This method operates on a single input molecule, performing the actual featurization. +- `__call__()`: This method uses `transform()` under the hood and provides convenient arguments, such as enforcing the datatype defined during the initialization of your model, to the outputs. If you specify `ignore_errors`, a vector of indexes where featurization did not fail will also be returned. -Every featurizer would have: - - a `preprocess` method that can perform preprocessing of your input molecules, to ensure compatibility with the expected featurizer class you are using. The preprocess steps is not called automatically for you to decouple it from the molecular transformation. It's a suggestion for the preprocessing steps you should perform when using a given featurizer. +In addition to the methods described above, PretrainedMolTransformer introduces the following functions: -The `preprocess` function expect your molecule inputs, but also some optional labels and can be redefined when creating your own custom featurizer. +- `_embed()`: For pre-trained models that benefit from batched featurization, this method is internally called during transform instead of an internal calculator. +- `_convert()`: This method is called by the transformer to convert the molecule input into the expected format of the underlying ML model. For example, for a pre-trained language model expecting SELFIES strings, we will perform the conversion to SELFIES strings here. - - a `transform` method that operates on a batch of molecules and returns a list of representation, this is where the `magic` happens. Position where featurization failed can be `None` when you elect to `ignore_errors`. - - a `_transform` method that operates on a single input molecule, this is where the `magic` happens - - a `__call__` method that uses `transform` under the hood and add some convenient argument such as enforcing the datatype you defined when initializing your model to the outputs. If you ask to `ignore_errors`, a vector of indexes where featurization did not fail will also be returned. -In addition to the method described above, `PretrainedMolTransformer` also defines the following functions: -- `_embed`: since pre-trained models benefit from batched featurization, this method is called by internally during `transform` instead of an internal calculator. -- `_convert`: this method is called by the transformer to convert the molecule input into the expected format of the underlying ML model. For example for a pre-trained language model expecting SELFIES strings, we will convert for input into SELFIES strings here. +#### I am getting an error and I am not sure what to do ? -
+When encountering an error during the featurization process, you have a couple of options to handle it: -
+#### I am getting an error and I am not sure what to do
+
+When encountering an error during the featurization process, you have a couple of options to handle it:
+
+- Ignore errors: you can set the `ignore_errors` parameter to `True` when using the featurizer. This allows the featurizer to continue processing even if it encounters errors on some molecules in your dataset. The featurizer will still attempt to compute representations for all molecules, and any molecule that failed featurization will have its position in the output list marked as `None`.
+
+- Increase verbosity: if you're unsure about the specific errors occurring during featurization, you can set the verbosity of the featurizer to `True`. This will make the featurizer log all errors encountered during the process, providing more detailed information about the cause of the issue, since some errors are silently caught but not propagated when `ignore_errors` is enabled.
+
+For example, the following will ensure that all errors are logged.
+
+```python
+import numpy as np
+from molfeat.trans import MoleculeTransformer
+
+featurizer = MoleculeTransformer(..., dtype=np.float32, verbose=True)
+featurizer(["CSc1nc2cc3c(cc2[nH]1)N(Cc1ccc(S(=O)(=O)c2ccccc2)cc1)CCC3"], enforce_dtype=True)
+```
+#### What are the base featurizer classes in molfeat and how do I use them?
+
+| Class | Module | Why? |
+|-------|--------|------|
+| [BaseFeaturizer](https://molfeat-docs.datamol.io/stable/api/molfeat.trans.base.html#molfeat.trans.base.BaseFeaturizer) | `molfeat.trans.base` | Lowest-level featurizer class. All featurizers (even non-molecular ones) inherit from this class. It's recommended to use `MoleculeTransformer` as the root class instead. |
+| [MoleculeTransformer](https://molfeat-docs.datamol.io/stable/api/molfeat.trans.base.html#molfeat.trans.base.MoleculeTransformer) | `molfeat.trans.base` | Base class for all molecule featurizers. This is where you start if you want to implement a new featurizer. You can provide either an existing `calculator` or your own (a **python callable**) directly to define a new `featurizer`. |
+| [PrecomputedMolTransformer](https://molfeat-docs.datamol.io/stable/api/molfeat.trans.base.html#molfeat.trans.base.PrecomputedMolTransformer) | `molfeat.trans.base` | Class for dealing with precomputed features. You can leverage this class to compute features, save them in a file, and reload them later for other tasks efficiently. [See this tutorial!](https://molfeat-docs.datamol.io/stable/tutorials/datacache.html#using-a-cache-with-a-precomputed-transformer) |
+| [FeatConcat](https://molfeat-docs.datamol.io/stable/api/molfeat.trans.concat.html#molfeat.trans.concat.FeatConcat) | `molfeat.trans.concat` | Convenient class for concatenating multiple vector featurizers automatically. If you want to combine multiple fingerprints and descriptors, this is the class you use, as in the sketch below. [See example!](https://molfeat-docs.datamol.io/stable/tutorials/types_of_featurizers.html#concatenate-featurizers) |
+| [PretrainedMolTransformer](https://molfeat-docs.datamol.io/stable/api/molfeat.trans.pretrained.base.html) | `molfeat.trans.pretrained.base` | Base class for all `pretrained featurizers`, i.e. featurizers derived from a pretrained machine learning model. Implement a subclass of this to define your new pretrained featurizer. [See example!](https://molfeat-docs.datamol.io/stable/tutorials/add_your_own.html#define-your-own-transformer) |
+| [PretrainedDGLTransformer](https://molfeat-docs.datamol.io/stable/api/molfeat.trans.pretrained.dgl_pretrained.html#molfeat.trans.pretrained.dgl_pretrained.PretrainedDGLTransformer) | `molfeat.trans.pretrained.dgl_pretrained` | Base class for all `dgl pretrained featurizers`. You can initialize a new dgl/dgllife pretrained model as a `molfeat featurizer` easily using this class. You only need to add the dgl model object to a store. [See this example!](https://github.com/datamol-io/molfeat/blob/main/nb/etl/dgl-etl.ipynb) |
+| [PretrainedHFTransformer](https://molfeat-docs.datamol.io/stable/api/molfeat.trans.pretrained.hf_transformers.html#molfeat.trans.pretrained.hf_transformers.PretrainedHFTransformer) | `molfeat.trans.pretrained.hf_transformer` | Base class for all `huggingface pretrained featurizers`. You can initialize a new 🤗 Transformers pretrained model as a `molfeat featurizer` easily using this class. [See this example!](https://github.com/datamol-io/molfeat/blob/main/nb/etl/molt5-etl.ipynb) |
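+As an illustration of `FeatConcat`, here is a minimal sketch concatenating two fingerprint featurizers into a single vector (assuming the built-in `maccs` and `ecfp` featurizers):
+
+```python
+import numpy as np
+from molfeat.trans.concat import FeatConcat
+
+# concatenate a MACCS and an ECFP fingerprint into one feature vector
+featurizer = FeatConcat(["maccs", "ecfp"], dtype=np.float32)
+feats = featurizer(["CCO", "c1ccccc1"])
+```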
From 4c79173bda7bb8f27179815f90efaf38d93436a3 Mon Sep 17 00:00:00 2001 From: maclandrol Date: Thu, 20 Jul 2023 17:30:04 -0400 Subject: [PATCH 7/8] misc updates --- molfeat/trans/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/molfeat/trans/base.py b/molfeat/trans/base.py index c5eb42e..464476b 100644 --- a/molfeat/trans/base.py +++ b/molfeat/trans/base.py @@ -199,10 +199,10 @@ def __init__( self._fitted = False self._save_input_args() - if self.featurizer and ( - not isinstance(self.featurizer, str) or not is_callable(self.featurizer) + if self.featurizer and not ( + isinstance(self.featurizer, str) or is_callable(self.featurizer) ): - raise AttributeError("Featurizer must be a callable or a string") + raise AttributeError(f"Featurizer {self.featurizer} must be a callable or a string") def _save_input_args(self): """Save the input arguments of a transformer to the attribute From 322503d44439613de7ecf0f263564fec20b92917 Mon Sep 17 00:00:00 2001 From: maclandrol Date: Thu, 20 Jul 2023 17:46:03 -0400 Subject: [PATCH 8/8] fix dgl etl --- nb/etl/dgl-etl.ipynb | 178 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 155 insertions(+), 23 deletions(-) diff --git a/nb/etl/dgl-etl.ipynb b/nb/etl/dgl-etl.ipynb index 82b5e2c..cb6e21e 100644 --- a/nb/etl/dgl-etl.ipynb +++ b/nb/etl/dgl-etl.ipynb @@ -100,6 +100,39 @@ "cell_type": "code", "execution_count": 6, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading gin_supervised_contextpred_pre_trained.pth from https://data.dgl.ai/dgllife/pre_trained/gin_supervised_contextpred.pth...\n", + "Pretrained model loaded\n" + ] + } + ], + "source": [ + "# an example of supervised GIN model\n", + "gin_contextpred= ModelInfo(\n", + " name = \"gin_supervised_contextpred\",\n", + " inputs = \"smiles\",\n", + " type=\"pretrained\",\n", + " group=\"dgllife\",\n", + " version=0,\n", + " submitter=\"Datamol\",\n", + " description=\"GIN neural network model pre-trained with supervised learning and context prediction on molecules from ChEMBL.\",\n", + " representation=\"graph\",\n", + " require_3D=False,\n", + " tags = [\"GIN\", \"dgl\", \"pytorch\", \"graph\"],\n", + " authors= [\"Weihua Hu\", \"Bowen Liu\", \"Joseph Gomes\", \"Marinka Zitnik\", \"Percy Liang\", \"Vijay Pande\", \"Jure Leskovec\"],\n", + " reference = \"https://arxiv.org/abs/1905.12265\" \n", + ")\n", + "gin_contextpred_model = dgllife.model.load_pretrained('gin_supervised_contextpred')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -131,7 +164,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -141,23 +174,58 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['gin_supervised_contextpred',\n", + "['cats2d',\n", + " 'cats3d',\n", + " 'scaffoldkeys',\n", + " 'gin_supervised_edgepred',\n", + " 'gin_supervised_infomax',\n", + " 'gin_supervised_masking',\n", " 'jtvae_zinc_no_kl',\n", + " 'map4',\n", + " 'secfp',\n", " 'pcqm4mv2_graphormer_base',\n", + " 'ChemBERTa-77M-MLM',\n", + " 'ChemBERTa-77M-MTR',\n", " 'ChemGPT-1.2B',\n", + " 'ChemGPT-19M',\n", " 'ChemGPT-4.7M',\n", - " 'DeepChem-ChemBERTa-77M-MLM',\n", - " 'DeepChem-ChemBERTa-77M-MTR',\n", - " 'maccs']" + " 'GPT2-Zinc480M-87M',\n", + " 'MolT5',\n", + " 'Roberta-Zinc480M-102M',\n", + " 'pharm2D-cats',\n", + " 'pharm2D-default',\n", 
+ " 'pharm2D-gobbi',\n", + " 'pharm2D-pmapper',\n", + " 'pharm3D-cats',\n", + " 'pharm3D-gobbi',\n", + " 'pharm3D-pmapper',\n", + " 'atompair-count',\n", + " 'avalon',\n", + " 'desc2D',\n", + " 'desc3D',\n", + " 'ecfp-count',\n", + " 'ecfp',\n", + " 'erg',\n", + " 'estate',\n", + " 'fcfp-count',\n", + " 'fcfp',\n", + " 'maccs',\n", + " 'pattern',\n", + " 'rdkit',\n", + " 'topological-count',\n", + " 'topological',\n", + " 'electroshape',\n", + " 'usr',\n", + " 'usrcat']" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -168,13 +236,13 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1fffc7c2d6d144f79c41c8b2d944a68c", + "model_id": "10895bee541a41ed98689c2e74da8225", "version_major": 2, "version_minor": 0 }, @@ -189,7 +257,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-02-14 17:42:54.663 | INFO | molfeat.store.modelstore:register:124 - Successfuly registered model gin_supervised_edgepred !\n" + "\u001b[32m2023-07-20 17:45:12.918\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmolfeat.store.modelstore\u001b[0m:\u001b[36mregister\u001b[0m:\u001b[36m147\u001b[0m - \u001b[1mSuccessfuly registered model gin_supervised_edgepred !\u001b[0m\n" ] } ], @@ -199,13 +267,13 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "fe97498dacb14aefba894664eb48d21a", + "model_id": "eb859420bcfa4bf0a0e5660fbb0c3ff2", "version_major": 2, "version_minor": 0 }, @@ -220,7 +288,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-02-14 17:43:03.718 | INFO | molfeat.store.modelstore:register:124 - Successfuly registered model gin_supervised_infomax !\n" + "\u001b[32m2023-07-20 17:45:21.508\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmolfeat.store.modelstore\u001b[0m:\u001b[36mregister\u001b[0m:\u001b[36m147\u001b[0m - \u001b[1mSuccessfuly registered model gin_supervised_infomax !\u001b[0m\n" ] } ], @@ -230,13 +298,13 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2b263cfa9945403398b9f3c8c0a4d63a", + "model_id": "7dda56220b3c48878c040997a4f9d525", "version_major": 2, "version_minor": 0 }, @@ -251,7 +319,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-02-14 17:43:17.363 | INFO | molfeat.store.modelstore:register:124 - Successfuly registered model gin_supervised_masking !\n" + "\u001b[32m2023-07-20 17:45:30.070\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmolfeat.store.modelstore\u001b[0m:\u001b[36mregister\u001b[0m:\u001b[36m147\u001b[0m - \u001b[1mSuccessfuly registered model gin_supervised_masking !\u001b[0m\n" ] } ], @@ -261,26 +329,90 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "95155893ec7d4d5eab4de463283d2ed6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0.00/7.12M [00:00