From a543369cec472b33a3e31c45bafb774364a8fe53 Mon Sep 17 00:00:00 2001 From: Eric Jia Date: Mon, 3 Jun 2024 15:40:56 -0700 Subject: [PATCH 1/7] fixed adjustment function so its based on enrichment strength --- docs/tutorials/generate_in_silico_data.ipynb | 134 +- docs/tutorials/hyperparameter_sweep.ipynb | 538 +++- docs/tutorials/lightning_crash_course.ipynb | 333 ++- docs/tutorials/testing_model_metrics.ipynb | 1228 ++++++++- ..._and_testing_data_generation_methods.ipynb | 2327 +++++++++++++++-- experiments/simple_model_synthetic_data.py | 2 +- pyproject.toml | 1 + .../data_loaders/real_data_loader.py | 2 +- .../data_loaders/synthetic_data_loader.py | 48 +- .../probability_models/generate_data.py | 246 +- .../probability_models/test_generate_data.py | 94 +- 11 files changed, 4431 insertions(+), 522 deletions(-) diff --git a/docs/tutorials/generate_in_silico_data.ipynb b/docs/tutorials/generate_in_silico_data.ipynb index 26de65e..14dd53c 100644 --- a/docs/tutorials/generate_in_silico_data.ipynb +++ b/docs/tutorials/generate_in_silico_data.ipynb @@ -11,7 +11,15 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Matplotlib is building the font cache; this may take a moment.\n" + ] + } + ], "source": [ "from yeastdnnexplorer.probability_models.relation_classes import And, Or\n", "from yeastdnnexplorer.probability_models.generate_data import (generate_gene_population, \n", @@ -36,9 +44,9 @@ "\n", "The first step is to generate a gene population, or set of gene populations.\n", "A gene population is simply a class that stores a 1D tensor called `labels`.\n", - "`labels` is a boolean vector where 1 means the gene is part of the signal group\n", + "`labels` is a boolean vector where 1 means the gene is part of the bound group\n", "(a gene which is both bound and responsive to the TF) while 0 means the gene is\n", - "part of the background or noise group. 
The length of `labels` is the number of\n", + "part of the background or unbound group. The length of `labels` is the number of\n", "genes in the population, and the index should be considered the unique gene\n", "identifier. In other words, the indicies should never change." ] @@ -50,14 +58,14 @@ "outputs": [], "source": [ "n_genes = 1000\n", - "signal = [0.1, 0.15, 0.2, 0.25, 0.3]\n", + "bound = [0.1, 0.15, 0.2, 0.25, 0.3]\n", "n_sample = [1, 1, 2, 2, 4]\n", "\n", "# this will be a list of length 10 with a GenePopulation object in each element\n", "gene_populations_list = []\n", - "for signal_proportion, n_draws in zip(signal, n_sample):\n", + "for bound_proportion, n_draws in zip(bound, n_sample):\n", " for _ in range(n_draws):\n", - " gene_populations_list.append(generate_gene_population(n_genes, signal_proportion))\n" + " gene_populations_list.append(generate_gene_population(n_genes, bound_proportion))\n" ] }, { @@ -121,7 +129,7 @@ "source": [ "### Method 1: Generating perturbation data with no mean adjustment\n", "\n", - "If you don't pass in a value for `max_mean_adjustment` to `generate_perturbation_effects` it will default to zero, meaning the means of the perturbation effects will not be adjusted in any way and will all be equal to `signal_mean` (deault is 3.0) for bound TF-gene pairs and `noise_mean` (default is 0.0) for unbound TF-gene pairs." + "If you don't pass in a value for `max_mean_adjustment` to `generate_perturbation_effects` it will default to zero, meaning the means of the perturbation effects will not be adjusted in any way and will all be equal to `bound_mean` (deault is 3.0) for bound TF-gene pairs and `unbound_mean` (default is 0.0) for unbound TF-gene pairs." 
] }, { @@ -150,7 +158,7 @@ "metadata": {}, "outputs": [], "source": [ - "# if you want to modify the default mean for bound genes, you can pass in the 'signal_mean' parameter\n", + "# if you want to modify the default mean for bound genes, you can pass in the 'bound_mean' parameter\n", "perturbation_effects_list_normal_mean_adjustment = generate_perturbation_effects(\n", " binding_data_tensor, \n", " max_mean_adjustment=10.0\n", @@ -260,7 +268,7 @@ "The final step is to assemble the data into a single tensor. Here is one way.\n", "The order of the matrix in the last dimension is:\n", "\n", - "1. signal/noise label\n", + "1. bound/unbound label\n", "1. binding effect\n", "1. binding pvalue\n", "1. perturbation effect\n", @@ -340,7 +348,7 @@ "\n", "Ensure that the generated data matches expectations.\n", "\n", - "### The signal/noise ratios should match exactly the initial signal ratio" + "### The bound/unbound ratios should match exactly the initial bound ratio" ] }, { @@ -352,7 +360,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "signal/nosie ratio is correct: True\n" + "bound/nosie ratio is correct: True\n" ] } ], @@ -360,11 +368,11 @@ "tolerance = 1e-5\n", "are_equal = torch.isclose(\n", " torch.sum(final_data_tensor[:, :, 0] == 1, axis=0),\n", - " torch.tensor([val * n_genes for val, count in zip(signal, n_sample) for _ in range(count)],\n", + " torch.tensor([val * n_genes for val, count in zip(bound, n_sample) for _ in range(count)],\n", " dtype=torch.long),\n", " atol=tolerance)\n", "\n", - "print(f\"signal/nosie ratio is correct: {are_equal.all()}\")" + "print(f\"bound/nosie ratio is correct: {are_equal.all()}\")" ] }, { @@ -385,26 +393,26 @@ "name": "stdout", "output_type": "stream", "text": [ - "The noise binding max is 13.157892227172852 and the min is 0.0\n", - "the noise min is 0.0\n", - "the noise mean is 0.3589712679386139 and the std is 1.1559306383132935\n", - "The signal binding max is 78.94734954833984 and the min is 
0.1315789520740509\n", - "the signal min is 0.1315789520740509\n", - "the signal mean is 2.4840002059936523 and the std is 6.374814510345459\n" + "The unbound binding max is 13.157892227172852 and the min is 0.0\n", + "the unbound min is 0.0\n", + "the unbound mean is 0.3589712679386139 and the std is 1.1559306383132935\n", + "The bound binding max is 78.94734954833984 and the min is 0.1315789520740509\n", + "the bound min is 0.1315789520740509\n", + "the bound mean is 2.4840002059936523 and the std is 6.374814510345459\n" ] } ], "source": [ "labels = final_data_tensor[:, :, 0].flatten()\n", - "noise_binding = final_data_tensor[:, :, 1].flatten()[labels == 0]\n", - "signal_binding = final_data_tensor[:, :, 1].flatten()[labels == 1]\n", + "unbound_binding = final_data_tensor[:, :, 1].flatten()[labels == 0]\n", + "bound_binding = final_data_tensor[:, :, 1].flatten()[labels == 1]\n", "\n", - "print(f\"The noise binding max is {noise_binding.max()} and the min is {noise_binding.min()}\")\n", - "print(f\"the noise min is {noise_binding.min()}\")\n", - "print(f\"the noise mean is {noise_binding.mean()} and the std is {noise_binding.std()}\")\n", - "print(f\"The signal binding max is {signal_binding.max()} and the min is {signal_binding.min()}\")\n", - "print(f\"the signal min is {signal_binding.min()}\")\n", - "print(f\"the signal mean is {signal_binding.mean()} and the std is {signal_binding.std()}\")" + "print(f\"The unbound binding max is {unbound_binding.max()} and the min is {unbound_binding.min()}\")\n", + "print(f\"the unbound min is {unbound_binding.min()}\")\n", + "print(f\"the unbound mean is {unbound_binding.mean()} and the std is {unbound_binding.std()}\")\n", + "print(f\"The bound binding max is {bound_binding.max()} and the min is {bound_binding.min()}\")\n", + "print(f\"the bound min is {bound_binding.min()}\")\n", + "print(f\"the bound mean is {bound_binding.mean()} and the std is {bound_binding.std()}\")" ] }, { @@ -414,7 +422,7 @@ "outputs": [ { "data": 
{ - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -427,8 +435,8 @@ "\n", "# Plotting\n", "plt.figure(figsize=(10, 6))\n", - "plt.hist(noise_binding, bins=30, alpha=0.5, label='Label 0', color='orange')\n", - "plt.hist(signal_binding, bins=30, alpha=0.5, label='Label 1', color='blue')\n", + "plt.hist(unbound_binding, bins=30, alpha=0.5, label='Label 0', color='orange')\n", + "plt.hist(bound_binding, bins=30, alpha=0.5, label='Label 1', color='blue')\n", "plt.xlim(0,5)\n", "plt.title('Histogram of Values in the 2nd Column')\n", "plt.xlabel('Values')\n", @@ -453,25 +461,25 @@ "name": "stdout", "output_type": "stream", "text": [ - "The noise binding max is 3.423511505126953 and the min is -3.506139039993286\n", - "the noise min is -3.506139039993286\n", - "the noise mean is 0.010617653839290142 and the std is 0.988001823425293\n", - "The signal binding max is 6.107701301574707 and the min is -6.406703948974609\n", - "the signal min is -6.406703948974609\n", - "the signal mean is -0.011303802020847797 and the std is 3.136451482772827\n" + "The unbound binding max is 3.423511505126953 and the min is -3.506139039993286\n", + "the unbound min is -3.506139039993286\n", + "the unbound mean is 0.010617653839290142 and the std is 0.988001823425293\n", + "The bound binding max is 6.107701301574707 and the min is -6.406703948974609\n", + "the bound min is -6.406703948974609\n", + "the bound mean is -0.011303802020847797 and the std is 3.136451482772827\n" ] } ], "source": [ - "noise_perturbation = final_data_tensor[:, :, 3].flatten()[labels == 0]\n", - "signal_perturbation = final_data_tensor[:, :, 3].flatten()[labels == 1]\n", + "unbound_perturbation = final_data_tensor[:, :, 3].flatten()[labels == 0]\n", + "bound_perturbation = final_data_tensor[:, :, 3].flatten()[labels == 1]\n", "\n", - "print(f\"The noise binding max is {noise_perturbation.max()} and the min is {noise_perturbation.min()}\")\n", - "print(f\"the noise min is {noise_perturbation.min()}\")\n", - "print(f\"the noise mean is {noise_perturbation.mean()} 
and the std is {noise_perturbation.std()}\")\n", - "print(f\"The signal binding max is {signal_perturbation.max()} and the min is {signal_perturbation.min()}\")\n", - "print(f\"the signal min is {signal_perturbation.min()}\")\n", - "print(f\"the signal mean is {signal_perturbation.mean()} and the std is {signal_perturbation.std()}\")" + "print(f\"The unbound binding max is {unbound_perturbation.max()} and the min is {unbound_perturbation.min()}\")\n", + "print(f\"the unbound min is {unbound_perturbation.min()}\")\n", + "print(f\"the unbound mean is {unbound_perturbation.mean()} and the std is {unbound_perturbation.std()}\")\n", + "print(f\"The bound binding max is {bound_perturbation.max()} and the min is {bound_perturbation.min()}\")\n", + "print(f\"the bound min is {bound_perturbation.min()}\")\n", + "print(f\"the bound mean is {bound_perturbation.mean()} and the std is {bound_perturbation.std()}\")" ] }, { @@ -481,7 +489,7 @@ "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -493,8 +501,8 @@ "source": [ "# Plotting\n", "plt.figure(figsize=(10, 6))\n", - "plt.hist(noise_perturbation, bins=30, alpha=0.5, label='Label 0', color='orange')\n", - "plt.hist(signal_perturbation, bins=30, alpha=0.5, label='Label 1', color='blue')\n", + "plt.hist(unbound_perturbation, bins=30, alpha=0.5, label='Label 0', color='orange')\n", + "plt.hist(bound_perturbation, bins=30, alpha=0.5, label='Label 1', color='blue')\n", "plt.title('Histogram of Values in the 2nd Column')\n", "plt.xlabel('Values')\n", "plt.ylabel('Frequency')\n", @@ -516,7 +524,7 @@ "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -568,8 +576,8 @@ "perturbation_effects_tf_influenced = generate_perturbation_effects(\n", " binding_data_tensor, \n", " max_mean_adjustment=3.0, # try 0.1, 3.0, and 10.0\n", - " signal_mean=5.0, # try 3.0, 5.0, or 10.0\n", - " noise_mean=0.0, # try adjusting this\n", + " bound_mean=5.0, # try 3.0, 5.0, or 10.0\n", + " unbound_mean=0.0, # try adjusting this\n", ")\n", "perturbation_pvalue_tf_influenced = torch.zeros_like(perturbation_effects_tf_influenced)\n", "for col_idx in range(perturbation_effects_tf_influenced.shape[1]):\n", @@ -592,12 +600,12 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -607,7 +615,7 @@ } ], "source": [ - "# Plotting. Note that the 'noise' group effects are still range from 0 to 3\n", + "# Plotting. Note that the 'unbound' group effects are still range from 0 to 3\n", "\n", "plt.figure(figsize=(10, 6))\n", "plt.scatter(final_data_tensor_tf_influenced[:, :, 1].flatten(), final_data_tensor_tf_influenced[:, :, 3].flatten().abs(), c=['orange' if x == 0 else 'blue' for x in labels])\n", @@ -622,11 +630,25 @@ "\n", "plt.show()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -640,9 +662,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.11.1" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/docs/tutorials/hyperparameter_sweep.ipynb b/docs/tutorials/hyperparameter_sweep.ipynb index 36220c1..f62dfc2 100644 --- a/docs/tutorials/hyperparameter_sweep.ipynb +++ b/docs/tutorials/hyperparameter_sweep.ipynb @@ -99,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -158,8 +158,8 @@ " data_module = SyntheticDataLoader(\n", " batch_size=batch_size,\n", " num_genes=4000,\n", - " signal_mean=3.0,\n", - " signal=[0.5] * 10,\n", + " bound_mean=3.0,\n", + " bound=[0.5] * 10,\n", " n_sample=[1, 2, 2, 4, 4],\n", " val_size=0.1,\n", " test_size=0.1,\n", @@ -208,9 +208,510 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[I 2024-05-29 13:18:03,548] A new study created in memory with name: 
CustomizableModelHyperparameterSweep3\n", + "/Users/ericjia/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/optuna/distributions.py:524: UserWarning: Choices for a categorical distribution should be a tuple of None, bool, int, float and str for persistent storage but contains [64] which is of type list.\n", + " warnings.warn(message)\n", + "/Users/ericjia/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/optuna/distributions.py:524: UserWarning: Choices for a categorical distribution should be a tuple of None, bool, int, float and str for persistent storage but contains [256] which is of type list.\n", + " warnings.warn(message)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "About to create model with the following hyperparameters:\n", + "lr: 0.01\n", + "hidden_layer_num: 1\n", + "hidden_layer_sizes: [256]\n", + "activation: Tanh\n", + "optimizer: RMSprop\n", + "L2_regularization_term: 0.1\n", + "dropout_rate: 0.5\n", + "batch_size: 32\n", + "max_epochs: 1\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:260: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " X_train, Y_train = torch.tensor(X_train, dtype=torch.float32), torch.tensor(\n", + "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:263: UserWarning: To copy construct from a tensor, it is recommended to use 
sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " X_val, Y_val = torch.tensor(X_val, dtype=torch.float32), torch.tensor(\n", + "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:266: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " X_test, Y_test = torch.tensor(X_test, dtype=torch.float32), torch.tensor(\n", + "\n", + " | Name | Type | Params\n", + "----------------------------------------------------\n", + "0 | activation | Tanh | 0 \n", + "1 | input_layer | Linear | 3.6 K \n", + "2 | hidden_layers | ModuleList | 0 \n", + "3 | output_layer | Linear | 3.3 K \n", + "4 | dropout | Dropout | 0 \n", + "5 | mae | MeanAbsoluteError | 0 \n", + "6 | SMSE | SMSE | 0 \n", + "----------------------------------------------------\n", + "6.9 K Trainable params\n", + "0 Non-trainable params\n", + "6.9 K Total params\n", + "0.028 Total estimated model params size (MB)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ericjia/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/torch/utils/data/dataloader.py:558: UserWarning: This DataLoader will create 15 worker processes in total. Our suggested max number of worker in current system is 8 (`cpuset` is not taken into account), which is smaller than what this DataLoader is going to create. 
Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.\n", + " warnings.warn(_create_warning_msg(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9874d59e855a45b09fcd3891e60fc48b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Training: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`Trainer.fit` stopped: `max_epochs=1` reached.\n", + "[I 2024-05-29 13:18:26,417] Trial 0 finished with value: 4.489274501800537 and parameters: {'lr': 0.01, 'hidden_layer_num': 1, 'activation': 'Tanh', 'optimizer': 'RMSprop', 'L2_regularization_term': 0.1, 'dropout_rate': 0.5, 'batch_size': 32, 'max_epochs': 1, 'hidden_layer_sizes_1_layers': [256]}. 
Best is trial 0 with value: 4.489274501800537.\n", + "/Users/ericjia/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/optuna/distributions.py:524: UserWarning: Choices for a categorical distribution should be a tuple of None, bool, int, float and str for persistent storage but contains [64] which is of type list.\n", + " warnings.warn(message)\n", + "/Users/ericjia/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/optuna/distributions.py:524: UserWarning: Choices for a categorical distribution should be a tuple of None, bool, int, float and str for persistent storage but contains [256] which is of type list.\n", + " warnings.warn(message)\n", + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "About to create model with the following hyperparameters:\n", + "lr: 0.01\n", + "hidden_layer_num: 1\n", + "hidden_layer_sizes: [256]\n", + "activation: LeakyReLU\n", + "optimizer: SGD\n", + "L2_regularization_term: 0.1\n", + "dropout_rate: 0.5\n", + "batch_size: 32\n", + "max_epochs: 1\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " | Name | Type | Params\n", + "----------------------------------------------------\n", + "0 | activation | LeakyReLU | 0 \n", + "1 | input_layer | Linear | 3.6 K \n", + "2 | hidden_layers | ModuleList | 0 \n", + "3 | output_layer | Linear | 3.3 K \n", + "4 | dropout | Dropout | 0 \n", + "5 | mae | MeanAbsoluteError | 0 \n", + "6 | SMSE | SMSE | 0 \n", + "----------------------------------------------------\n", + "6.9 K Trainable params\n", + "0 Non-trainable params\n", + "6.9 K Total params\n", + "0.028 Total estimated model params 
size (MB)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a3f7fa1a66da47818f9a97b47763e2c6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Training: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`Trainer.fit` stopped: `max_epochs=1` reached.\n", + "[I 2024-05-29 13:18:45,320] Trial 1 finished with value: 6.033911228179932 and parameters: {'lr': 0.01, 'hidden_layer_num': 1, 'activation': 'LeakyReLU', 'optimizer': 'SGD', 'L2_regularization_term': 0.1, 'dropout_rate': 0.5, 'batch_size': 32, 'max_epochs': 1, 'hidden_layer_sizes_1_layers': [256]}. 
Best is trial 0 with value: 4.489274501800537.\n", + "/Users/ericjia/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/optuna/distributions.py:524: UserWarning: Choices for a categorical distribution should be a tuple of None, bool, int, float and str for persistent storage but contains [64, 32] which is of type list.\n", + " warnings.warn(message)\n", + "/Users/ericjia/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/optuna/distributions.py:524: UserWarning: Choices for a categorical distribution should be a tuple of None, bool, int, float and str for persistent storage but contains [256, 64] which is of type list.\n", + " warnings.warn(message)\n", + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "About to create model with the following hyperparameters:\n", + "lr: 0.01\n", + "hidden_layer_num: 2\n", + "hidden_layer_sizes: [256, 64]\n", + "activation: ReLU\n", + "optimizer: SGD\n", + "L2_regularization_term: 0.0\n", + "dropout_rate: 0.5\n", + "batch_size: 32\n", + "max_epochs: 1\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " | Name | Type | Params\n", + "----------------------------------------------------\n", + "0 | activation | ReLU | 0 \n", + "1 | input_layer | Linear | 3.6 K \n", + "2 | hidden_layers | ModuleList | 16.4 K\n", + "3 | output_layer | Linear | 845 \n", + "4 | dropout | Dropout | 0 \n", + "5 | mae | MeanAbsoluteError | 0 \n", + "6 | SMSE | SMSE | 0 \n", + "----------------------------------------------------\n", + "20.9 K Trainable params\n", + "0 Non-trainable params\n", + "20.9 K Total params\n", + "0.084 Total estimated model 
params size (MB)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2c96e3c7274a460ebbe021e43699d992", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Training: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`Trainer.fit` stopped: `max_epochs=1` reached.\n", + "[I 2024-05-29 13:19:02,993] Trial 2 finished with value: 6.900921821594238 and parameters: {'lr': 0.01, 'hidden_layer_num': 2, 'activation': 'ReLU', 'optimizer': 'SGD', 'L2_regularization_term': 0.0, 'dropout_rate': 0.5, 'batch_size': 32, 'max_epochs': 1, 'hidden_layer_sizes_2_layers': [256, 64]}. 
Best is trial 0 with value: 4.489274501800537.\n", + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "About to create model with the following hyperparameters:\n", + "lr: 0.01\n", + "hidden_layer_num: 2\n", + "hidden_layer_sizes: [64, 32]\n", + "activation: Tanh\n", + "optimizer: Adam\n", + "L2_regularization_term: 0.1\n", + "dropout_rate: 0.0\n", + "batch_size: 32\n", + "max_epochs: 1\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " | Name | Type | Params\n", + "----------------------------------------------------\n", + "0 | activation | Tanh | 0 \n", + "1 | input_layer | Linear | 896 \n", + "2 | hidden_layers | ModuleList | 2.1 K \n", + "3 | output_layer | Linear | 429 \n", + "4 | dropout | Dropout | 0 \n", + "5 | mae | MeanAbsoluteError | 0 \n", + "6 | SMSE | SMSE | 0 \n", + "----------------------------------------------------\n", + "3.4 K Trainable params\n", + "0 Non-trainable params\n", + "3.4 K Total params\n", + "0.014 Total estimated model params size (MB)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a854f1a313d34d8192602b17986182b1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Training: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + 
"metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`Trainer.fit` stopped: `max_epochs=1` reached.\n", + "[I 2024-05-29 13:19:19,976] Trial 3 finished with value: 4.5260910987854 and parameters: {'lr': 0.01, 'hidden_layer_num': 2, 'activation': 'Tanh', 'optimizer': 'Adam', 'L2_regularization_term': 0.1, 'dropout_rate': 0.0, 'batch_size': 32, 'max_epochs': 1, 'hidden_layer_sizes_2_layers': [64, 32]}. Best is trial 0 with value: 4.489274501800537.\n", + "/Users/ericjia/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/optuna/distributions.py:524: UserWarning: Choices for a categorical distribution should be a tuple of None, bool, int, float and str for persistent storage but contains [512, 256, 128, 64, 32] which is of type list.\n", + " warnings.warn(message)\n", + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "About to create model with the following hyperparameters:\n", + "lr: 0.01\n", + "hidden_layer_num: 5\n", + "hidden_layer_sizes: [512, 256, 128, 64, 32]\n", + "activation: Tanh\n", + "optimizer: RMSprop\n", + "L2_regularization_term: 0.1\n", + "dropout_rate: 0.5\n", + "batch_size: 32\n", + "max_epochs: 1\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " | Name | Type | Params\n", + "----------------------------------------------------\n", + "0 | activation | Tanh | 0 \n", + "1 | input_layer | Linear | 7.2 K \n", + "2 | hidden_layers | ModuleList | 174 K \n", + "3 | output_layer | Linear | 429 \n", + "4 | dropout | Dropout | 0 \n", + "5 | mae | MeanAbsoluteError | 0 \n", + "6 | SMSE | SMSE | 0 \n", + 
"----------------------------------------------------\n", + "182 K Trainable params\n", + "0 Non-trainable params\n", + "182 K Total params\n", + "0.729 Total estimated model params size (MB)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5e56902ea3474fcc8f1e106e3fc4f19d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Training: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`Trainer.fit` stopped: `max_epochs=1` reached.\n", + "[I 2024-05-29 13:19:37,861] Trial 4 finished with value: 4.612905502319336 and parameters: {'lr': 0.01, 'hidden_layer_num': 5, 'activation': 'Tanh', 'optimizer': 'RMSprop', 'L2_regularization_term': 0.1, 'dropout_rate': 0.5, 'batch_size': 32, 'max_epochs': 1, 'hidden_layer_sizes_5_layers': [512, 256, 128, 64, 32]}. 
Best is trial 0 with value: 4.489274501800537.\n" + ] + } + ], "source": [ "STUDY_NAME = \"CustomizableModelHyperparameterSweep3\"\n", "NUM_TRIALS = 5 # you will need a lot more than 5 trials if you have many possible combinations of hyperparams\n", @@ -237,9 +738,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RESULTS======================================================================\n", + "Best hyperparameters: {'lr': 0.01, 'hidden_layer_num': 1, 'activation': 'Tanh', 'optimizer': 'RMSprop', 'L2_regularization_term': 0.1, 'dropout_rate': 0.5, 'batch_size': 32, 'max_epochs': 1, 'hidden_layer_sizes_1_layers': [256]}\n", + "Best loss: 4.489274501800537\n" + ] + } + ], "source": [ "print(\"RESULTS\" + (\"=\" * 70))\n", "print(f\"Best hyperparameters: {best_params}\")\n", @@ -252,11 +763,18 @@ "source": [ "And that's it! Now you could take what you found to be the best hyperparameters and train a model with them for many more epochs. 
The [Optuna Documentation](https://optuna.readthedocs.io/en/stable/) will be a helpful resource if you'd like to add more to this notebook or the hyperparam sweep functions" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -270,9 +788,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.11.1" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/docs/tutorials/lightning_crash_course.ipynb b/docs/tutorials/lightning_crash_course.ipynb index f51e3a2..5954640 100644 --- a/docs/tutorials/lightning_crash_course.ipynb +++ b/docs/tutorials/lightning_crash_course.ipynb @@ -38,9 +38,249 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "Missing logger folder: /Users/ericjia/yeastdnnexplorer/docs/tutorials/lightning_logs\n", + "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:260: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " X_train, Y_train = torch.tensor(X_train, dtype=torch.float32), torch.tensor(\n", + "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:263: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than 
torch.tensor(sourceTensor).\n", + " X_val, Y_val = torch.tensor(X_val, dtype=torch.float32), torch.tensor(\n", + "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:266: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " X_test, Y_test = torch.tensor(X_test, dtype=torch.float32), torch.tensor(\n", + "\n", + " | Name | Type | Params\n", + "----------------------------------------------\n", + "0 | mae | MeanAbsoluteError | 0 \n", + "1 | SMSE | SMSE | 0 \n", + "2 | linear1 | Linear | 110 \n", + "----------------------------------------------\n", + "110 Trainable params\n", + "0 Non-trainable params\n", + "110 Total params\n", + "0.000 Total estimated model params size (MB)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ericjia/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/torch/utils/data/dataloader.py:558: UserWarning: This DataLoader will create 15 worker processes in total. Our suggested max number of worker in current system is 8 (`cpuset` is not taken into account), which is smaller than what this DataLoader is going to create. 
Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.\n", + " warnings.warn(_create_warning_msg(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "03961a09a4b64a63b68f3cd670bdc8db", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Training: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + 
"version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`Trainer.fit` stopped: `max_epochs=10` reached.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b68c57cdd4e34f44aac1cc03849ee343", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Testing: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " Test metric DataLoader 0\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " test_mae 1.1637259721755981\n", + " test_mse 1.8661913871765137\n", + " test_smse 10.101052284240723\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + "[{'test_mse': 1.8661913871765137, 'test_mae': 1.1637259721755981, 'test_smse': 10.101052284240723}]\n" + ] + } + ], "source": [ "# define an instance of our simple linear baseline model\n", "model = 
SimpleModel(\n", @@ -54,11 +294,11 @@ "data_module = SyntheticDataLoader(\n", " batch_size=32,\n", " num_genes=3000,\n", - " signal=[0.5] * 5,\n", + " bound=[0.5] * 5,\n", " n_sample=[1, 1, 2, 2, 4],\n", " val_size=0.1,\n", " test_size=0.1,\n", - " signal_mean=3.0,\n", + " bound_mean=3.0,\n", ")\n", "\n", "# define a trainer instance\n", @@ -85,9 +325,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n" + ] + }, + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '../../data/init_analysis_data_20240409/binding/brent_nf_cc'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 23\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# we also have to define a new trainer instance, not really sure why but it seems to be necessary\u001b[39;00m\n\u001b[1;32m 17\u001b[0m trainer \u001b[38;5;241m=\u001b[39m Trainer(\n\u001b[1;32m 18\u001b[0m max_epochs\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10\u001b[39m,\n\u001b[1;32m 19\u001b[0m deterministic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 20\u001b[0m accelerator\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcpu\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;66;03m# change to \"gpu\" if you have access to one\u001b[39;00m\n\u001b[1;32m 21\u001b[0m )\n\u001b[0;32m---> 23\u001b[0m 
\u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnew_model\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreal_data_module\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 24\u001b[0m test_results \u001b[38;5;241m=\u001b[39m trainer\u001b[38;5;241m.\u001b[39mtest(new_model, real_data_module)\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28mprint\u001b[39m(test_results)\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/pytorch_lightning/trainer/trainer.py:544\u001b[0m, in \u001b[0;36mTrainer.fit\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m 542\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mstatus \u001b[38;5;241m=\u001b[39m TrainerStatus\u001b[38;5;241m.\u001b[39mRUNNING\n\u001b[1;32m 543\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 544\u001b[0m \u001b[43mcall\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_and_handle_interrupt\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 545\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_impl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mval_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdatamodule\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mckpt_path\u001b[49m\n\u001b[1;32m 546\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File 
\u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/pytorch_lightning/trainer/call.py:44\u001b[0m, in \u001b[0;36m_call_and_handle_interrupt\u001b[0;34m(trainer, trainer_fn, *args, **kwargs)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mlauncher \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 43\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mlauncher\u001b[38;5;241m.\u001b[39mlaunch(trainer_fn, \u001b[38;5;241m*\u001b[39margs, trainer\u001b[38;5;241m=\u001b[39mtrainer, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m---> 44\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtrainer_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 46\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m _TunerExitException:\n\u001b[1;32m 47\u001b[0m _call_teardown_hook(trainer)\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/pytorch_lightning/trainer/trainer.py:580\u001b[0m, in \u001b[0;36mTrainer._fit_impl\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m 573\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mfn \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 574\u001b[0m ckpt_path \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_checkpoint_connector\u001b[38;5;241m.\u001b[39m_select_ckpt_path(\n\u001b[1;32m 575\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mfn,\n\u001b[1;32m 576\u001b[0m ckpt_path,\n\u001b[1;32m 577\u001b[0m model_provided\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 578\u001b[0m model_connected\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlightning_module \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 579\u001b[0m )\n\u001b[0;32m--> 580\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mckpt_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mckpt_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 582\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mstopped\n\u001b[1;32m 583\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/pytorch_lightning/trainer/trainer.py:947\u001b[0m, in \u001b[0;36mTrainer._run\u001b[0;34m(self, model, ckpt_path)\u001b[0m\n\u001b[1;32m 944\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m__setup_profiler()\n\u001b[1;32m 946\u001b[0m 
log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: preparing data\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 947\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_data_connector\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprepare_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 949\u001b[0m call\u001b[38;5;241m.\u001b[39m_call_setup_hook(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;66;03m# allow user to set up LightningModule in accelerator environment\u001b[39;00m\n\u001b[1;32m 950\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: configuring model\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:94\u001b[0m, in \u001b[0;36m_DataConnector.prepare_data\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 92\u001b[0m dm_prepare_data_per_node \u001b[38;5;241m=\u001b[39m datamodule\u001b[38;5;241m.\u001b[39mprepare_data_per_node\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (dm_prepare_data_per_node \u001b[38;5;129;01mand\u001b[39;00m local_rank_zero) \u001b[38;5;129;01mor\u001b[39;00m (\u001b[38;5;129;01mnot\u001b[39;00m dm_prepare_data_per_node \u001b[38;5;129;01mand\u001b[39;00m global_rank_zero):\n\u001b[0;32m---> 94\u001b[0m 
\u001b[43mcall\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_lightning_datamodule_hook\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrainer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mprepare_data\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 95\u001b[0m \u001b[38;5;66;03m# handle lightning module prepare data:\u001b[39;00m\n\u001b[1;32m 96\u001b[0m \u001b[38;5;66;03m# check for prepare_data_per_node before calling lightning_module.prepare_data\u001b[39;00m\n\u001b[1;32m 97\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m lightning_module \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/pytorch_lightning/trainer/call.py:179\u001b[0m, in \u001b[0;36m_call_lightning_datamodule_hook\u001b[0;34m(trainer, hook_name, *args, **kwargs)\u001b[0m\n\u001b[1;32m 177\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcallable\u001b[39m(fn):\n\u001b[1;32m 178\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mprofile(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m[LightningDataModule]\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtrainer\u001b[38;5;241m.\u001b[39mdatamodule\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhook_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 179\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 180\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m~/yeastdnnexplorer/yeastdnnexplorer/data_loaders/real_data_loader.py:118\u001b[0m, in \u001b[0;36mRealDataLoader.prepare_data\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 106\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 107\u001b[0m \u001b[38;5;124;03mThis function reads in the binding data and perturbation data from the CSV files\u001b[39;00m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;124;03mthat we have for these datasets.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 113\u001b[0m \n\u001b[1;32m 114\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 116\u001b[0m brent_cc_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdata_dir_path, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbinding/brent_nf_cc\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 117\u001b[0m brent_nf_csv_files \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m--> 118\u001b[0m f \u001b[38;5;28;01mfor\u001b[39;00m f \u001b[38;5;129;01min\u001b[39;00m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlistdir\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbrent_cc_path\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m f\u001b[38;5;241m.\u001b[39mendswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 119\u001b[0m ]\n\u001b[1;32m 120\u001b[0m perturb_dataset_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(\n\u001b[1;32m 121\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdata_dir_path, 
\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mperturbation/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mperturbation_dataset_title\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 122\u001b[0m )\n\u001b[1;32m 123\u001b[0m perturb_dataset_csv_files \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 124\u001b[0m f \u001b[38;5;28;01mfor\u001b[39;00m f \u001b[38;5;129;01min\u001b[39;00m os\u001b[38;5;241m.\u001b[39mlistdir(perturb_dataset_path) \u001b[38;5;28;01mif\u001b[39;00m f\u001b[38;5;241m.\u001b[39mendswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 125\u001b[0m ]\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../../data/init_analysis_data_20240409/binding/brent_nf_cc'" + ] + } + ], "source": [ "# we need to redefine a new instance with the same params unless we want it to pick up where it left off\n", "new_model = SimpleModel(\n", @@ -139,9 +408,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n" + ] + } + ], "source": [ "# this will be used to save the model checkpoint that performs the best on the validation set\n", "best_model_checkpoint = ModelCheckpoint(\n", @@ -186,9 +466,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/Users/ericjia/yeastdnnexplorer/docs/tutorials/example/path/not/real.ckpt'", + "output_type": "error", + "traceback": [ + 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[5], line 6\u001b[0m\n\u001b[1;32m 3\u001b[0m path_to_checkpoint \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mexample/path/not/real.ckpt\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# note that we need to use the same model class that was used to save the checkpoint\u001b[39;00m\n\u001b[0;32m----> 6\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mSimpleModel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_from_checkpoint\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath_to_checkpoint\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;66;03m# we can load the model and continue training from where it left off\u001b[39;00m\n\u001b[1;32m 9\u001b[0m trainer\u001b[38;5;241m.\u001b[39mfit(model, data_module)\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/pytorch_lightning/utilities/model_helpers.py:125\u001b[0m, in \u001b[0;36m_restricted_classmethod_impl.__get__..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m instance \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_scripting:\n\u001b[1;32m 121\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[1;32m 122\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe classmethod 
`\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmethod\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m` cannot be called on an instance.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 123\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m Please call it on the class type and make sure the return value is used.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 124\u001b[0m )\n\u001b[0;32m--> 125\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/pytorch_lightning/core/module.py:1581\u001b[0m, in \u001b[0;36mLightningModule.load_from_checkpoint\u001b[0;34m(cls, checkpoint_path, map_location, hparams_file, strict, **kwargs)\u001b[0m\n\u001b[1;32m 1492\u001b[0m \u001b[38;5;129m@_restricted_classmethod\u001b[39m\n\u001b[1;32m 1493\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_from_checkpoint\u001b[39m(\n\u001b[1;32m 1494\u001b[0m \u001b[38;5;28mcls\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[1;32m 1500\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Self:\n\u001b[1;32m 1501\u001b[0m \u001b[38;5;250m 
\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"Primary way of loading a model from a checkpoint. When Lightning saves a checkpoint it stores the arguments\u001b[39;00m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;124;03m passed to ``__init__`` in the checkpoint under ``\"hyper_parameters\"``.\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1579\u001b[0m \n\u001b[1;32m 1580\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1581\u001b[0m loaded \u001b[38;5;241m=\u001b[39m \u001b[43m_load_from_checkpoint\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1582\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore[arg-type]\u001b[39;49;00m\n\u001b[1;32m 1583\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheckpoint_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1584\u001b[0m \u001b[43m \u001b[49m\u001b[43mmap_location\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1585\u001b[0m \u001b[43m \u001b[49m\u001b[43mhparams_file\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1586\u001b[0m \u001b[43m \u001b[49m\u001b[43mstrict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1587\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1588\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1589\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m cast(Self, loaded)\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/pytorch_lightning/core/saving.py:63\u001b[0m, in \u001b[0;36m_load_from_checkpoint\u001b[0;34m(cls, checkpoint_path, map_location, hparams_file, strict, **kwargs)\u001b[0m\n\u001b[1;32m 61\u001b[0m map_location \u001b[38;5;241m=\u001b[39m map_location \u001b[38;5;129;01mor\u001b[39;00m _default_map_location\n\u001b[1;32m 62\u001b[0m 
\u001b[38;5;28;01mwith\u001b[39;00m pl_legacy_patch():\n\u001b[0;32m---> 63\u001b[0m checkpoint \u001b[38;5;241m=\u001b[39m \u001b[43mpl_load\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcheckpoint_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_location\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmap_location\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;66;03m# convert legacy checkpoints to the new format\u001b[39;00m\n\u001b[1;32m 66\u001b[0m checkpoint \u001b[38;5;241m=\u001b[39m _pl_migrate_checkpoint(\n\u001b[1;32m 67\u001b[0m checkpoint, checkpoint_path\u001b[38;5;241m=\u001b[39m(checkpoint_path \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(checkpoint_path, (\u001b[38;5;28mstr\u001b[39m, Path)) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 68\u001b[0m )\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/lightning_fabric/utilities/cloud_io.py:56\u001b[0m, in \u001b[0;36m_load\u001b[0;34m(path_or_url, map_location)\u001b[0m\n\u001b[1;32m 51\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mhub\u001b[38;5;241m.\u001b[39mload_state_dict_from_url(\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28mstr\u001b[39m(path_or_url),\n\u001b[1;32m 53\u001b[0m map_location\u001b[38;5;241m=\u001b[39mmap_location, \u001b[38;5;66;03m# type: ignore[arg-type]\u001b[39;00m\n\u001b[1;32m 54\u001b[0m )\n\u001b[1;32m 55\u001b[0m fs \u001b[38;5;241m=\u001b[39m get_filesystem(path_or_url)\n\u001b[0;32m---> 56\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mfs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath_or_url\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m 
f:\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mload(f, map_location\u001b[38;5;241m=\u001b[39mmap_location)\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/fsspec/spec.py:1298\u001b[0m, in \u001b[0;36mAbstractFileSystem.open\u001b[0;34m(self, path, mode, block_size, cache_options, compression, **kwargs)\u001b[0m\n\u001b[1;32m 1296\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1297\u001b[0m ac \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mautocommit\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_intrans)\n\u001b[0;32m-> 1298\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_open\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1299\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1300\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1301\u001b[0m \u001b[43m \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1302\u001b[0m \u001b[43m \u001b[49m\u001b[43mautocommit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mac\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1303\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1304\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1305\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1306\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m compression \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1307\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfsspec\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompression\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m compr\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/fsspec/implementations/local.py:191\u001b[0m, in \u001b[0;36mLocalFileSystem._open\u001b[0;34m(self, path, mode, block_size, **kwargs)\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mauto_mkdir \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 190\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmakedirs(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_parent(path), exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m--> 191\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mLocalFileOpener\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/fsspec/implementations/local.py:355\u001b[0m, in \u001b[0;36mLocalFileOpener.__init__\u001b[0;34m(self, path, mode, autocommit, fs, compression, **kwargs)\u001b[0m\n\u001b[1;32m 353\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompression \u001b[38;5;241m=\u001b[39m get_compression(path, compression)\n\u001b[1;32m 354\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mblocksize \u001b[38;5;241m=\u001b[39m io\u001b[38;5;241m.\u001b[39mDEFAULT_BUFFER_SIZE\n\u001b[0;32m--> 355\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/fsspec/implementations/local.py:360\u001b[0m, in \u001b[0;36mLocalFileOpener._open\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 358\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf\u001b[38;5;241m.\u001b[39mclosed:\n\u001b[1;32m 359\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mautocommit \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmode:\n\u001b[0;32m--> 360\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpath, mode\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmode)\n\u001b[1;32m 361\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompression:\n\u001b[1;32m 362\u001b[0m compress \u001b[38;5;241m=\u001b[39m compr[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompression]\n", + 
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/Users/ericjia/yeastdnnexplorer/docs/tutorials/example/path/not/real.ckpt'" + ] + } + ], "source": [ "# Load a model from a checkpoint\n", "# We can load a model from a checkpoint like so:\n", @@ -206,11 +506,18 @@ "# we could also load the model and make predictions\n", "predictions = model(data_module.test_dataloader())" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -224,9 +531,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.11.1" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/docs/tutorials/testing_model_metrics.ipynb b/docs/tutorials/testing_model_metrics.ipynb index 493715e..8a63f5e 100644 --- a/docs/tutorials/testing_model_metrics.ipynb +++ b/docs/tutorials/testing_model_metrics.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -73,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -84,18 +84,18 @@ " using_random_seed: bool,\n", " accelerator: str,\n", " num_genes: int,\n", - " signal_mean: float,\n", + " bound_mean: float,\n", " val_size: float,\n", " test_size: float,\n", - " signal: list[float],\n", + " bound: list[float],\n", " n_sample: list[int],\n", " max_mean_adjustment: float,\n", ") -> LightningModule:\n", " data_module = SyntheticDataLoader(\n", " batch_size=batch_size,\n", " num_genes=num_genes,\n", - " signal_mean=signal_mean,\n", - " signal=signal, # 
old: [0.1, 0.15, 0.2, 0.25, 0.3],\n", + " bound_mean=bound_mean,\n", + " bound=bound, # old: [0.1, 0.15, 0.2, 0.25, 0.3],\n", " n_sample=n_sample, # sum of this is num of tfs\n", " val_size=val_size,\n", " test_size=test_size,\n", @@ -136,13 +136,1169 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "\n", + " | Name | Type | Params\n", + "----------------------------------------------\n", + "0 | mae | MeanAbsoluteError | 0 \n", + "1 | SMSE | SMSE | 0 \n", + "2 | linear1 | Linear | 110 \n", + "----------------------------------------------\n", + "110 Trainable params\n", + "0 Non-trainable params\n", + "110 Total params\n", + "0.000 Total estimated model params size (MB)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "81f0cc52e9c6419ea965fd0eed66b4e1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Training: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + 
"application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`Trainer.fit` stopped: `max_epochs=10` reached.\n" + ] + }, + { + "data": { + 
"application/vnd.jupyter.widget-view+json": { + "model_id": "6a35ccabcd0e48c28b0fd3725ee0f3a2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Testing: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " Test metric DataLoader 0\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " test_mae 0.5135628581047058\n", + " test_mse 0.416797935962677\n", + " test_smse 10.241324424743652\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "\n", + " | Name | Type | Params\n", + "----------------------------------------------\n", + "0 | mae | MeanAbsoluteError | 0 \n", + "1 | SMSE | SMSE | 0 \n", + "2 | linear1 | Linear | 110 \n", + "----------------------------------------------\n", + "110 Trainable params\n", + "0 Non-trainable params\n", + "110 Total params\n", + "0.000 Total estimated model params size (MB)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Printing test results...\n", + "[{'test_mse': 0.416797935962677, 'test_mae': 0.5135628581047058, 'test_smse': 10.241324424743652}]\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": 
"c5eeb9a7b5944d3c85683e0e0b8a31ac", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Training: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": 
{}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`Trainer.fit` stopped: `max_epochs=10` reached.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ea3a981f9c0247aba2551941ebd1127c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Testing: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " Test metric DataLoader 0\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " test_mae 0.5821905136108398\n", + " test_mse 0.5283595323562622\n", + " test_smse 10.348736763000488\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "\n", + " | Name | Type | Params\n", + "----------------------------------------------\n", + "0 | mae | MeanAbsoluteError | 0 \n", + "1 | SMSE | SMSE | 0 \n", + "2 | linear1 | Linear | 110 \n", + "----------------------------------------------\n", + "110 Trainable 
params\n", + "0 Non-trainable params\n", + "110 Total params\n", + "0.000 Total estimated model params size (MB)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Printing test results...\n", + "[{'test_mse': 0.5283595323562622, 'test_mae': 0.5821905136108398, 'test_smse': 10.348736763000488}]\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a83e317d249c478fa8a8903ed6ffbd52", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Training: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + 
"data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`Trainer.fit` stopped: `max_epochs=10` reached.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b14fcb56940f4300a3b9357a4a075ae4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Testing: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " Test metric DataLoader 0\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " test_mae 0.8307084441184998\n", + " test_mse 1.050934910774231\n", + " test_smse 
10.213595390319824\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Printing test results...\n", + "[{'test_mse': 1.050934910774231, 'test_mae': 0.8307084441184998, 'test_smse': 10.213595390319824}]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " | Name | Type | Params\n", + "----------------------------------------------\n", + "0 | mae | MeanAbsoluteError | 0 \n", + "1 | SMSE | SMSE | 0 \n", + "2 | linear1 | Linear | 110 \n", + "----------------------------------------------\n", + "110 Trainable params\n", + "0 Non-trainable params\n", + "110 Total params\n", + "0.000 Total estimated model params size (MB)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b6177aafea8a40efa7bd3e354a7fdd48", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Training: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + 
"output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`Trainer.fit` stopped: `max_epochs=10` 
reached.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ad8ed0e588954f07b698c88b7dde3b7c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Testing: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " Test metric DataLoader 0\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " test_mae 1.1858488321304321\n", + " test_mse 2.014770984649658\n", + " test_smse 10.195466995239258\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "\n", + " | Name | Type | Params\n", + "----------------------------------------------\n", + "0 | mae | MeanAbsoluteError | 0 \n", + "1 | SMSE | SMSE | 0 \n", + "2 | linear1 | Linear | 110 \n", + "----------------------------------------------\n", + "110 Trainable params\n", + "0 Non-trainable params\n", + "110 Total params\n", + "0.000 Total estimated model params size (MB)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Printing test results...\n", + "[{'test_mse': 2.014770984649658, 'test_mae': 1.1858488321304321, 'test_smse': 10.195466995239258}]\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + 
"application/vnd.jupyter.widget-view+json": { + "model_id": "a467879ee86d4a5b8d15490b21ffd6ab", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Training: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + 
"text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`Trainer.fit` stopped: `max_epochs=10` reached.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c527988542004759a3cb282abda532a9", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Testing: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " Test metric DataLoader 0\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " test_mae 2.091959238052368\n", + " test_mse 6.157958984375\n", + " test_smse 11.987293243408203\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + "Printing test results...\n", + "[{'test_mse': 6.157958984375, 'test_mae': 2.091959238052368, 'test_smse': 11.987293243408203}]\n" + ] + } + ], "source": [ - "signal_means = [0.5, 1.0, 2.0, 3.0, 5.0]\n", + "bound_means = [0.5, 1.0, 2.0, 3.0, 5.0]\n", "test_mses = []\n", - "for signal_mean in signal_means:\n", + "for bound_mean in bound_means:\n", " model, test_results = train_simple_model_with_params(\n", " batch_size=32,\n", " lr=0.01,\n", @@ -152,9 +1308,9 
@@ " num_genes=1000,\n", " val_size=0.1,\n", " test_size=0.1,\n", - " signal=[0.5] * 5,\n", + " bound=[0.5] * 5,\n", " n_sample=[1, 1, 2, 2, 4], # sum of this is num of tfs\n", - " signal_mean=signal_mean,\n", + " bound_mean=bound_mean,\n", " max_mean_adjustment=0.0\n", " )\n", " test_mses.append(test_results[0][\"test_mse\"])" @@ -169,12 +1325,12 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -184,12 +1340,12 @@ } ], "source": [ - "plt.plot(signal_means, test_mses, marker=\"o\")\n", - "plt.xlabel(\"Signal Mean\")\n", - "plt.xticks(signal_means, rotation=45)\n", + "plt.plot(bound_means, test_mses, marker=\"o\")\n", + "plt.xlabel(\"bound Mean\")\n", + "plt.xticks(bound_means, rotation=45)\n", "plt.yticks(test_mses)\n", "plt.ylabel(\"Test MSE\")\n", - "plt.title(\"Test MSE as a function of Signal Mean\")\n", + "plt.title(\"Test MSE as a function of bound Mean\")\n", "plt.show()" ] }, @@ -199,7 +1355,7 @@ "source": [ "### Experiment 2\n", "\n", - "We can run a similar experiment where we test the effect of the bound / unbound ratio (aka signal / noise ratio) on the model's MSE" + "We can run a similar experiment where we test the effect of the bound / unbound ratio (aka bound / unbound ratio) on the model's MSE" ] }, { @@ -208,10 +1364,10 @@ "metadata": {}, "outputs": [], "source": [ - "signal_noise_ratios = [0.05, 0.1, 0.25, 0.5, 0.75, 0.9]\n", + "bound_unbound_ratios = [0.05, 0.1, 0.25, 0.5, 0.75, 0.9]\n", "test_mses = []\n", "\n", - "for signal_noise_ratio in signal_noise_ratios:\n", + "for bound_unbound_ratio in bound_unbound_ratios:\n", " model, test_results = train_simple_model_with_params(\n", " batch_size=32,\n", " lr=0.01,\n", @@ -221,9 +1377,9 @@ " num_genes=1000,\n", " val_size=0.1,\n", " test_size=0.1,\n", - " signal=[signal_noise_ratio] * 5,\n", + " bound=[bound_unbound_ratio] * 5,\n", " n_sample=[1, 1, 2, 2, 4],\n", - " signal_mean=3.0,\n", + " bound_mean=3.0,\n", " max_mean_adjustment=0.0\n", " )\n", " print(test_results)\n", @@ -247,12 +1403,12 @@ } ], "source": [ - "plt.plot(signal_noise_ratios, test_mses, marker=\"o\")\n", - "plt.xlabel(\"Percentage of Data in Signal Group\")\n", + "plt.plot(bound_unbound_ratios, test_mses, marker=\"o\")\n", + "plt.xlabel(\"Percentage of Data in bound Group\")\n", "plt.ylabel(\"Test MSE\")\n", - "plt.xticks(signal_noise_ratios, rotation=45)\n", + "plt.xticks(bound_unbound_ratios, rotation=45)\n", 
"plt.yticks(test_mses)\n", - "plt.title(\"Test MSE as a function of signal/noise ratio (signal mean = 3.0)\")\n", + "plt.title(\"Test MSE as a function of bound/unbound ratio (bound mean = 3.0)\")\n", "plt.show()" ] }, @@ -277,31 +1433,31 @@ "num_genes = 3000\n", "val_size = 0.1\n", "test_size = 0.1\n", - "signal = [0.5] * 5\n", + "bound = [0.5] * 5\n", "n_sample = [1, 1, 2, 2, 4]\n", "random_state = 42\n", "\n", "# the first data loader will load a dataset with a small scale and a small bound mean\n", "small_scale_and_mean_dataloader = SyntheticDataLoader(\n", " num_genes=num_genes,\n", - " signal=signal, \n", + " bound=bound, \n", " n_sample=n_sample,\n", " val_size=val_size,\n", " test_size=test_size,\n", " random_state=random_state,\n", - " signal_mean=1.0,\n", + " bound_mean=1.0,\n", " max_mean_adjustment=1.0\n", ")\n", "\n", "# the second data loader will generate a dataset with a large scale and a large bound mean\n", "large_scale_and_mean_dataloader = SyntheticDataLoader(\n", " num_genes=num_genes,\n", - " signal=signal, \n", + " bound=bound, \n", " n_sample=n_sample,\n", " val_size=val_size,\n", " test_size=test_size,\n", " random_state=random_state,\n", - " signal_mean=10.0,\n", + " bound_mean=10.0,\n", " max_mean_adjustment=10.0\n", ")\n", "\n", @@ -331,7 +1487,7 @@ ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -345,9 +1501,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.11.1" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb b/docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb index 6167edc..a0178dc 100644 --- a/docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb +++ b/docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb @@ -65,14 
+65,14 @@ "source": [ "n_genes = 3000\n", "\n", - "signal = [0.5, 0.5, 0.5, 0.5, 0.5]\n", + "bound = [0.5, 0.5, 0.5, 0.5, 0.5]\n", "n_sample = [1, 1, 2, 2, 4]\n", "\n", "# this will be a list of length 10 with a GenePopulation object in each element\n", "gene_populations_list = []\n", - "for signal_proportion, n_draws in zip(signal, n_sample):\n", + "for bound_proportion, n_draws in zip(bound, n_sample):\n", " for _ in range(n_draws):\n", - " gene_populations_list.append(generate_gene_population(n_genes, signal_proportion))\n", + " gene_populations_list.append(generate_gene_population(n_genes, bound_proportion))\n", " \n", "# Generate binding data for each gene population\n", "binding_effect_list = [generate_binding_effects(gene_population)\n", @@ -197,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -233,21 +233,21 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Bound (signal) TFs for gene 0 are: [3, 4, 5, 6, 7, 9]\n", - "Unbound (noise) TFs for gene 0 are: [0, 1, 2, 8]\n", + "Bound (bound) TFs for gene 0 are: [3, 4, 5, 6, 7, 9]\n", + "Unbound (unbound) TFs for gene 0 are: [0, 1, 2, 8]\n", "tensor([0., 0., 0., 1., 1., 1., 1., 1., 0., 1.])\n" ] }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -258,8 +258,8 @@ ], "source": [ "x_vals = list(range(sum(n_sample)))\n", - "print(\"Bound (signal) TFs for gene \" + str(GENE_IDX) + \" are: \" + str(binding_data_tensor[GENE_IDX, :, 0].nonzero().flatten().tolist()))\n", - "print(\"Unbound (noise) TFs for gene \" + str(GENE_IDX) + \" are: \" + str((1 - binding_data_tensor[GENE_IDX, :, 0]).nonzero().flatten().tolist()))\n", + "print(\"Bound (bound) TFs for gene \" + str(GENE_IDX) + \" are: \" + str(binding_data_tensor[GENE_IDX, :, 0].nonzero().flatten().tolist()))\n", + "print(\"Unbound (unbound) TFs for gene \" + str(GENE_IDX) + \" are: \" + str((1 - binding_data_tensor[GENE_IDX, :, 0]).nonzero().flatten().tolist()))\n", "print(binding_data_tensor[GENE_IDX, :, 0])\n", "plt.figure(figsize=(10, 6))\n", "\n", @@ -298,7 +298,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -331,7 +331,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -339,8 +339,8 @@ " return SyntheticDataLoader(\n", " batch_size=32,\n", " num_genes=4000,\n", - " signal_mean=3.0,\n", - " signal=[0.5] * 5,\n", + " bound_mean=3.0,\n", + " bound=[0.5] * 5,\n", " n_sample=[1, 1, 2, 2, 4], # sum of this is num of tfs\n", " val_size=0.1,\n", " test_size=0.1,\n", @@ -383,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -399,217 +399,2120 @@ "Train models on data generated with no mean adjustment" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_module = get_data_module(0.0)\n", - "num_tfs = sum(data_module.n_sample)\n", - "\n", - "# nonlinear model\n", - "model = get_model(num_tfs)\n", - "trainer = get_trainer()\n", - "trainer.fit(model, data_module)\n", - "test_results = trainer.test(model, datamodule=data_module)\n", - "print(\"Printing test 
results...\")\n", - "print(test_results)\n", - "model_mses.append(test_results[0][\"test_mse\"])\n", - "\n", - "# linear model\n", - "linear_model = get_linear_model(num_tfs)\n", - "trainer = get_trainer()\n", - "trainer.fit(linear_model, data_module)\n", - "test_results = trainer.test(linear_model, datamodule=data_module)\n", - "print(\"Printing linear model test results\")\n", - "print(test_results)\n", - "linear_model_test_mses.append(test_results[0][\"test_mse\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Train models on data generated with normal mean adjustments" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_module = get_data_module(3.0)\n", - "num_tfs = sum(data_module.n_sample)\n", - "\n", - "# nonlinear model\n", - "model = get_model(num_tfs)\n", - "trainer = get_trainer()\n", - "trainer.fit(model, data_module)\n", - "test_results = trainer.test(model, datamodule=data_module)\n", - "print(\"Printing test results...\")\n", - "print(test_results)\n", - "model_mses.append(test_results[0][\"test_mse\"])\n", - "\n", - "# linear model\n", - "linear_model = get_linear_model(num_tfs)\n", - "trainer = get_trainer()\n", - "trainer.fit(linear_model, data_module)\n", - "test_results = trainer.test(linear_model, datamodule=data_module)\n", - "print(\"Printing linear model test results\")\n", - "print(test_results)\n", - "linear_model_test_mses.append(test_results[0][\"test_mse\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Train model on data generated with dependent mean adjustments (method 3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# define dictionary of relations between TFs (see generate_in_silico_data.ipynb for an explanation of how this dict is defined / used)\n", - "tf_relationships_dict = {\n", - " 0: [1],\n", - " 1: [8],\n", - " 2: [5, 6],\n", - " 
3: [4],\n", - " 4: [5],\n", - " 5: [9],\n", - " 6: [4],\n", - " 7: [1, 4],\n", - " 8: [6],\n", - " 9: [4],\n", - "}\n", - "\n", - "data_module = get_data_module(\n", - " 3.0, \n", - " adjustment_function=perturbation_effect_adjustment_function_with_tf_relationships, \n", - " tf_relationships_dict=tf_relationships_dict\n", - ")\n", - "num_tfs = sum(data_module.n_sample)\n", - "\n", - "print(\"Number of TFs: \", num_tfs)\n", - "\n", - "# nonlinear model\n", - "model = get_model(num_tfs)\n", - "trainer = get_trainer()\n", - "trainer.fit(model, data_module)\n", - "test_results = trainer.test(model, datamodule=data_module)\n", - "print(\"Printing test results...\")\n", - "print(test_results)\n", - "model_mses.append(test_results[0][\"test_mse\"])\n", - "\n", - "# linear model\n", - "linear_model = get_linear_model(num_tfs)\n", - "trainer = get_trainer()\n", - "trainer.fit(linear_model, data_module)\n", - "test_results = trainer.test(linear_model, datamodule=data_module)\n", - "print(\"Printing linear model test results\")\n", - "print(test_results)\n", - "linear_model_test_mses.append(test_results[0][\"test_mse\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Train models on data generated using the binary relations between TFs (method 4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tf_relationships_dict_boolean_logic = {\n", - " 0: [And(3, 4, 8), Or(3, 7), Or(1, 1)],\n", - " 1: [And(5, Or(7, 8))],\n", - " 2: [],\n", - " 3: [Or(7, 9), And(6, 7)],\n", - " 4: [And(1, 2)],\n", - " 5: [Or(0, 1, 2, 8, 9)],\n", - " 6: [And(0, Or(1, 2))],\n", - " 7: [Or(2, And(5, 6, 9))],\n", - " 8: [],\n", - " 9: [And(6, And(3, Or(0, 9)))],\n", - "}\n", - "\n", - "data_module = get_data_module(\n", - " 3.0, \n", - " adjustment_function=perturbation_effect_adjustment_function_with_tf_relationships_boolean_logic, \n", - " tf_relationships_dict=tf_relationships_dict_boolean_logic\n", - ")\n", - 
"\n", - "# nonlinear model\n", - "model = get_model(num_tfs)\n", - "trainer = get_trainer()\n", - "trainer.fit(model, data_module)\n", - "test_results = trainer.test(model, datamodule=data_module)\n", - "print(\"Printing test results...\")\n", - "print(test_results)\n", - "model_mses.append(test_results[0][\"test_mse\"])\n", - "\n", - "# linear model\n", - "linear_model = get_linear_model(num_tfs)\n", - "trainer = get_trainer()\n", - "trainer.fit(linear_model, data_module)\n", - "test_results = trainer.test(linear_model, datamodule=data_module)\n", - "print(\"Printing linear model test results\")\n", - "print(test_results)\n", - "linear_model_test_mses.append(test_results[0][\"test_mse\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can plot the results of our experiment. TODO add explantion for plot here? Probably not the right place to put it (I feel like that belongs in the presentation or something, because this notebook could be modified and the explanation wouldn't make sense)" - ] - }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:260: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " X_train, Y_train = torch.tensor(X_train, dtype=torch.float32), torch.tensor(\n", + "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:263: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather 
than torch.tensor(sourceTensor).\n", + " X_val, Y_val = torch.tensor(X_val, dtype=torch.float32), torch.tensor(\n", + "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:266: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " X_test, Y_test = torch.tensor(X_test, dtype=torch.float32), torch.tensor(\n", + "\n", + " | Name | Type | Params\n", + "----------------------------------------------------\n", + "0 | activation | LeakyReLU | 0 \n", + "1 | input_layer | Linear | 704 \n", + "2 | hidden_layers | ModuleList | 2.1 K \n", + "3 | output_layer | Linear | 330 \n", + "4 | dropout | Dropout | 0 \n", + "5 | mae | MeanAbsoluteError | 0 \n", + "6 | SMSE | SMSE | 0 \n", + "----------------------------------------------------\n", + "3.1 K Trainable params\n", + "0 Non-trainable params\n", + "3.1 K Total params\n", + "0.012 Total estimated model params size (MB)\n" + ] + }, { "data": { - "image/png": "", + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, "text/plain": [ - "
" + "Sanity Checking: | …" ] }, "metadata": {}, "output_type": "display_data" - } - ], - "source": [ - "data_gen_methods = [\"No Mean Adjustment\", \"Dependent Mean Adjustment\", \"TF Dependent Mean Adjustment\", \"TF Dependent Mean Adjust with Boolean Logic\"]\n", - "plt.figure(figsize=(10, 6))\n", - "plt.scatter(data_gen_methods, model_mses, color='blue')\n", - "plt.scatter(data_gen_methods, linear_model_test_mses, color='orange')\n", - "plt.title('Model MSE Comparison (bound mean = 3.0)')\n", - "plt.xlabel('Model')\n", - "plt.ylabel('MSE')\n", - "plt.grid(True)\n", - "plt.xticks(rotation=45, ha=\"right\")\n", - "plt.legend(['Complex (Customizable) Model', 'Linear Model'])\n", - "plt.tight_layout() # Adjust layout to make room for the rotated x-axis labels\n", - "plt.show()" - ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ericjia/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/torch/utils/data/dataloader.py:558: UserWarning: This DataLoader will create 15 worker processes in total. Our suggested max number of worker in current system is 8 (`cpuset` is not taken into account), which is smaller than what this DataLoader is going to create. 
Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.\n", + " warnings.warn(_create_warning_msg(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d02b096d74cd495494b5fcc85f4ace17", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Training: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + 
"version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`Trainer.fit` stopped: `max_epochs=10` reached.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "edd3984a91d948f19aa575e99e0c23e7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Testing: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " Test metric DataLoader 0\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " test_mae 0.9961513876914978\n", + " test_mse 1.5293521881103516\n", + " test_smse 8.054170608520508\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", 
+ "HPU available: False, using: 0 HPUs\n", + "\n", + " | Name | Type | Params\n", + "----------------------------------------------\n", + "0 | mae | MeanAbsoluteError | 0 \n", + "1 | SMSE | SMSE | 0 \n", + "2 | linear1 | Linear | 110 \n", + "----------------------------------------------\n", + "110 Trainable params\n", + "0 Non-trainable params\n", + "110 Total params\n", + "0.000 Total estimated model params size (MB)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Printing test results...\n", + "[{'test_mse': 1.5293521881103516, 'test_mae': 0.9961513876914978, 'test_smse': 8.054170608520508}]\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ericjia/Library/Caches/pypoetry/virtualenvs/yeastdnnexplorer-iu4_cpc2-py3.11/lib/python3.11/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (9) is smaller than the logging interval Trainer(log_every_n_steps=50). 
Set a lower value for log_every_n_steps if you want to see logs for the training epoch.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4aee007cf5864b73a5513a10bae06a30", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Training: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": 
{ + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`Trainer.fit` stopped: `max_epochs=10` reached.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "121686510d0c4a818c083d308800cc56", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Testing: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " Test metric DataLoader 0\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " test_mae 1.3378851413726807\n", + " test_mse 3.5256669521331787\n", + " test_smse 18.614917755126953\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + "Printing linear model test results\n", + "[{'test_mse': 3.5256669521331787, 'test_mae': 1.3378851413726807, 'test_smse': 18.614917755126953}]\n" + ] + } + ], + "source": [ + "data_module = get_data_module(0.0)\n", + "num_tfs = sum(data_module.n_sample)\n", + "\n", + "# nonlinear model\n", + "model = get_model(num_tfs)\n", + "trainer = 
get_trainer()\n", + "trainer.fit(model, data_module)\n", + "test_results = trainer.test(model, datamodule=data_module)\n", + "print(\"Printing test results...\")\n", + "print(test_results)\n", + "model_mses.append(test_results[0][\"test_mse\"])\n", + "\n", + "# linear model\n", + "linear_model = get_linear_model(num_tfs)\n", + "trainer = get_trainer()\n", + "trainer.fit(linear_model, data_module)\n", + "test_results = trainer.test(linear_model, datamodule=data_module)\n", + "print(\"Printing linear model test results\")\n", + "print(test_results)\n", + "linear_model_test_mses.append(test_results[0][\"test_mse\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train models on data generated with normal mean adjustments" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "\n", + " | Name | Type | Params\n", + "----------------------------------------------------\n", + "0 | activation | LeakyReLU | 0 \n", + "1 | input_layer | Linear | 704 \n", + "2 | hidden_layers | ModuleList | 2.1 K \n", + "3 | output_layer | Linear | 330 \n", + "4 | dropout | Dropout | 0 \n", + "5 | mae | MeanAbsoluteError | 0 \n", + "6 | SMSE | SMSE | 0 \n", + "----------------------------------------------------\n", + "3.1 K Trainable params\n", + "0 Non-trainable params\n", + "3.1 K Total params\n", + "0.012 Total estimated model params size (MB)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": 
"72376aa92d0645bf84f29376c378dff0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Training: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": 
{}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`Trainer.fit` stopped: `max_epochs=10` reached.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d887cf27ea7d48b4ae10ec19eb6c223b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Testing: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " Test metric DataLoader 0\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " test_mae 1.4062790870666504\n", + " test_mse 3.1310036182403564\n", + " test_smse 7.031686305999756\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Printing test results...\n", + "[{'test_mse': 3.1310036182403564, 'test_mae': 1.4062790870666504, 'test_smse': 7.031686305999756}]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + 
"text": [ + "\n", + " | Name | Type | Params\n", + "----------------------------------------------\n", + "0 | mae | MeanAbsoluteError | 0 \n", + "1 | SMSE | SMSE | 0 \n", + "2 | linear1 | Linear | 110 \n", + "----------------------------------------------\n", + "110 Trainable params\n", + "0 Non-trainable params\n", + "110 Total params\n", + "0.000 Total estimated model params size (MB)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2c1542f49b1349a2a04b9046d2be5805", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Training: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + 
] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`Trainer.fit` stopped: `max_epochs=10` reached.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a1a2fa38661a4675837a6d72dbd83029", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Testing: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " Test metric DataLoader 0\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " test_mae 2.076347827911377\n", + " 
test_mse 8.463006973266602\n", + " test_smse 19.093679428100586\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + "Printing linear model test results\n", + "[{'test_mse': 8.463006973266602, 'test_mae': 2.076347827911377, 'test_smse': 19.093679428100586}]\n" + ] + } + ], + "source": [ + "data_module = get_data_module(3.0)\n", + "num_tfs = sum(data_module.n_sample)\n", + "\n", + "# nonlinear model\n", + "model = get_model(num_tfs)\n", + "trainer = get_trainer()\n", + "trainer.fit(model, data_module)\n", + "test_results = trainer.test(model, datamodule=data_module)\n", + "print(\"Printing test results...\")\n", + "print(test_results)\n", + "model_mses.append(test_results[0][\"test_mse\"])\n", + "\n", + "# linear model\n", + "linear_model = get_linear_model(num_tfs)\n", + "trainer = get_trainer()\n", + "trainer.fit(linear_model, data_module)\n", + "test_results = trainer.test(linear_model, datamodule=data_module)\n", + "print(\"Printing linear model test results\")\n", + "print(test_results)\n", + "linear_model_test_mses.append(test_results[0][\"test_mse\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train model on data generated with dependent mean adjustments (method 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of TFs: 10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " | Name | Type | Params\n", + "----------------------------------------------------\n", + "0 | activation | LeakyReLU | 0 \n", + "1 | input_layer | Linear | 704 \n", + "2 | 
hidden_layers | ModuleList | 2.1 K \n", + "3 | output_layer | Linear | 330 \n", + "4 | dropout | Dropout | 0 \n", + "5 | mae | MeanAbsoluteError | 0 \n", + "6 | SMSE | SMSE | 0 \n", + "----------------------------------------------------\n", + "3.1 K Trainable params\n", + "0 Non-trainable params\n", + "3.1 K Total params\n", + "0.012 Total estimated model params size (MB)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "63b2ea19474e478a8b313f50b35c046a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Training: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + 
"metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`Trainer.fit` stopped: `max_epochs=10` reached.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fdcf8b63a1284789b141f4e178132fa1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Testing: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " Test metric DataLoader 0\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " test_mae 1.116166114807129\n", + " test_mse 
2.3565773963928223\n", + " test_smse 7.7063517570495605\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Printing test results...\n", + "[{'test_mse': 2.3565773963928223, 'test_mae': 1.116166114807129, 'test_smse': 7.7063517570495605}]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " | Name | Type | Params\n", + "----------------------------------------------\n", + "0 | mae | MeanAbsoluteError | 0 \n", + "1 | SMSE | SMSE | 0 \n", + "2 | linear1 | Linear | 110 \n", + "----------------------------------------------\n", + "110 Trainable params\n", + "0 Non-trainable params\n", + "110 Total params\n", + "0.000 Total estimated model params size (MB)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5391c299f8c04b2e862d2dc8262032de", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Training: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: 
| …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"`Trainer.fit` stopped: `max_epochs=10` reached.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7bb86cad87c84481a514913c82e5a306", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Testing: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " Test metric DataLoader 0\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " test_mae 1.30489981174469\n", + " test_mse 3.8554797172546387\n", + " test_smse 12.853811264038086\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + "Printing linear model test results\n", + "[{'test_mse': 3.8554797172546387, 'test_mae': 1.30489981174469, 'test_smse': 12.853811264038086}]\n" + ] + } + ], + "source": [ + "# define dictionary of relations between TFs (see generate_in_silico_data.ipynb for an explanation of how this dict is defined / used)\n", + "tf_relationships_dict = {\n", + " 0: [1],\n", + " 1: [8],\n", + " 2: [5, 6],\n", + " 3: [4],\n", + " 4: [5],\n", + " 5: [9],\n", + " 6: [4],\n", + " 7: [1, 4],\n", + " 8: [6],\n", + " 9: [4],\n", + "}\n", + "\n", + "data_module = get_data_module(\n", + " 3.0, \n", + " adjustment_function=perturbation_effect_adjustment_function_with_tf_relationships, \n", + " tf_relationships_dict=tf_relationships_dict\n", + ")\n", + "num_tfs = sum(data_module.n_sample)\n", + "\n", + "print(\"Number of TFs: \", num_tfs)\n", + "\n", + "# nonlinear model\n", + "model = get_model(num_tfs)\n", + "trainer = get_trainer()\n", + "trainer.fit(model, data_module)\n", + "test_results = trainer.test(model, datamodule=data_module)\n", + "print(\"Printing test results...\")\n", + 
"print(test_results)\n", + "model_mses.append(test_results[0][\"test_mse\"])\n", + "\n", + "# linear model\n", + "linear_model = get_linear_model(num_tfs)\n", + "trainer = get_trainer()\n", + "trainer.fit(linear_model, data_module)\n", + "test_results = trainer.test(linear_model, datamodule=data_module)\n", + "print(\"Printing linear model test results\")\n", + "print(test_results)\n", + "linear_model_test_mses.append(test_results[0][\"test_mse\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train models on data generated using the binary relations between TFs (method 4)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "\n", + " | Name | Type | Params\n", + "----------------------------------------------------\n", + "0 | activation | LeakyReLU | 0 \n", + "1 | input_layer | Linear | 704 \n", + "2 | hidden_layers | ModuleList | 2.1 K \n", + "3 | output_layer | Linear | 330 \n", + "4 | dropout | Dropout | 0 \n", + "5 | mae | MeanAbsoluteError | 0 \n", + "6 | SMSE | SMSE | 0 \n", + "----------------------------------------------------\n", + "3.1 K Trainable params\n", + "0 Non-trainable params\n", + "3.1 K Total params\n", + "0.012 Total estimated model params size (MB)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ea687cff710146bba3befdc793689d2e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Training: | …" + ] + }, + "metadata": {}, + "output_type": 
"display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 
0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`Trainer.fit` stopped: `max_epochs=10` reached.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "392244bf842049c3a56d74ef6662f7bd", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Testing: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " Test metric DataLoader 0\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " test_mae 1.1678574085235596\n", + " test_mse 2.400193452835083\n", + " test_smse 7.260862827301025\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Printing test results...\n", + "[{'test_mse': 2.400193452835083, 'test_mae': 1.1678574085235596, 'test_smse': 7.260862827301025}]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " | Name | Type | Params\n", + "----------------------------------------------\n", + "0 | mae | MeanAbsoluteError | 0 \n", + "1 | SMSE | SMSE | 0 
\n", + "2 | linear1 | Linear | 110 \n", + "----------------------------------------------\n", + "110 Trainable params\n", + "0 Non-trainable params\n", + "110 Total params\n", + "0.000 Total estimated model params size (MB)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ebfccf4bb873401ebb9e252080bc4593", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Training: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, 
+ "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`Trainer.fit` stopped: `max_epochs=10` reached.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e575e6a693af4040baeb6862fa70c35a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Testing: | …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " Test metric DataLoader 0\n", + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + " test_mae 1.3436788320541382\n", + " test_mse 4.040103912353516\n", + " test_smse 12.06108570098877\n", + 
"────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", + "Printing linear model test results\n", + "[{'test_mse': 4.040103912353516, 'test_mae': 1.3436788320541382, 'test_smse': 12.06108570098877}]\n" + ] + } + ], + "source": [ + "tf_relationships_dict_boolean_logic = {\n", + " 0: [And(3, 4, 8), Or(3, 7), Or(1, 1)],\n", + " 1: [And(5, Or(7, 8))],\n", + " 2: [],\n", + " 3: [Or(7, 9), And(6, 7)],\n", + " 4: [And(1, 2)],\n", + " 5: [Or(0, 1, 2, 8, 9)],\n", + " 6: [And(0, Or(1, 2))],\n", + " 7: [Or(2, And(5, 6, 9))],\n", + " 8: [],\n", + " 9: [And(6, And(3, Or(0, 9)))],\n", + "}\n", + "\n", + "data_module = get_data_module(\n", + " 3.0, \n", + " adjustment_function=perturbation_effect_adjustment_function_with_tf_relationships_boolean_logic, \n", + " tf_relationships_dict=tf_relationships_dict_boolean_logic\n", + ")\n", + "\n", + "# nonlinear model\n", + "model = get_model(num_tfs)\n", + "trainer = get_trainer()\n", + "trainer.fit(model, data_module)\n", + "test_results = trainer.test(model, datamodule=data_module)\n", + "print(\"Printing test results...\")\n", + "print(test_results)\n", + "model_mses.append(test_results[0][\"test_mse\"])\n", + "\n", + "# linear model\n", + "linear_model = get_linear_model(num_tfs)\n", + "trainer = get_trainer()\n", + "trainer.fit(linear_model, data_module)\n", + "test_results = trainer.test(linear_model, datamodule=data_module)\n", + "print(\"Printing linear model test results\")\n", + "print(test_results)\n", + "linear_model_test_mses.append(test_results[0][\"test_mse\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can plot the results of our experiment. TODO add explantion for plot here? 
Probably not the right place to put it (I feel like that belongs in the presentation or something, because this notebook could be modified and the explanation wouldn't make sense)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "data_gen_methods = [\"No Mean Adjustment\", \"Dependent Mean Adjustment\", \"TF Dependent Mean Adjustment\", \"TF Dependent Mean Adjust with Boolean Logic\"]\n", + "plt.figure(figsize=(10, 6))\n", + "plt.scatter(data_gen_methods, model_mses, color='blue')\n", + "plt.scatter(data_gen_methods, linear_model_test_mses, color='orange')\n", + "plt.title('Model MSE Comparison (bound mean = 3.0)')\n", + "plt.xlabel('Model')\n", + "plt.ylabel('MSE')\n", + "plt.grid(True)\n", + "plt.xticks(rotation=45, ha=\"right\")\n", + "plt.legend(['Complex (Customizable) Model', 'Linear Model'])\n", + "plt.tight_layout() # Adjust layout to make room for the rotated x-axis labels\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -623,9 +2526,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.11.1" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/experiments/simple_model_synthetic_data.py b/experiments/simple_model_synthetic_data.py index 021841d..f0eb4fd 100644 --- a/experiments/simple_model_synthetic_data.py +++ b/experiments/simple_model_synthetic_data.py @@ -61,7 +61,7 @@ def simple_model_synthetic_data_experiment( data_module = SyntheticDataLoader( batch_size=batch_size, num_genes=1000, - signal=[0.1, 0.15, 0.2, 0.25, 0.3], + bound=[0.1, 0.15, 0.2, 0.25, 0.3], n_sample=[1, 1, 2, 2, 4], val_size=0.1, test_size=0.1, diff --git a/pyproject.toml b/pyproject.toml index 2a7d309..f994d36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ tensorboard = "^2.16.1" torchsummary = "^1.5.1" optuna = "^3.6.0" optuna-dashboard = "^0.15.1" +jupyter = "^1.0.0" 
[tool.poetry.group.dev.dependencies] jupyter = "^1.0.0" diff --git a/yeastdnnexplorer/data_loaders/real_data_loader.py b/yeastdnnexplorer/data_loaders/real_data_loader.py index 4c914a6..0f51ab1 100644 --- a/yeastdnnexplorer/data_loaders/real_data_loader.py +++ b/yeastdnnexplorer/data_loaders/real_data_loader.py @@ -222,7 +222,7 @@ def prepare_data(self) -> None: perturbation_pvalues.values, dtype=torch.float64 ) - # note that we no longer have a signal / noise tensor + # note that we no longer have a bound / unbound tensor # (like for the synthetic data) self.final_data_tensor = torch.stack( [ diff --git a/yeastdnnexplorer/data_loaders/synthetic_data_loader.py b/yeastdnnexplorer/data_loaders/synthetic_data_loader.py index 8c53670..858cacd 100644 --- a/yeastdnnexplorer/data_loaders/synthetic_data_loader.py +++ b/yeastdnnexplorer/data_loaders/synthetic_data_loader.py @@ -27,8 +27,8 @@ def __init__( self, batch_size: int = 32, num_genes: int = 1000, - signal: list[float] = [0.1, 0.2, 0.2, 0.4, 0.5], - signal_mean: float = 3.0, + bound: list[float] = [0.1, 0.2, 0.2, 0.4, 0.5], + bound_mean: float = 3.0, n_sample: list[int] = [1, 2, 2, 4, 4], val_size: float = 0.1, test_size: float = 0.1, @@ -47,10 +47,10 @@ def __init__( :param num_genes: The number of genes in the synthetic data (this is the number of datapoints in our dataset) :type num_genes: int - :param signal: The proportion of genes in each sample group that are put in the - signal grop (i.e. have a non-zero binding effect and expression response) - :type signal: List[int] - :param n_sample: The number of samples to draw from each signal group + :param bound: The proportion of genes in each sample group that are put in the + bound group (i.e. 
have a non-zero binding effect and expression response) + :type bound: List[float] + :param n_sample: The number of samples to draw from each bound group :type n_sample: List[int] :param val_size: The proportion of the dataset to include in the validation split @@ -60,23 +60,23 @@ def __init__( :param random_state: The random seed to use for splitting the data (keep this consistent to ensure reproduceability) :type random_state: int - :param signal_mean: The mean of the signal distribution - :type signal_mean: float + :param bound_mean: The mean of the bound distribution + :type bound_mean: float :param max_mean_adjustment: The maximum mean adjustment to apply to the mean - of the signal (bound) perturbation effects + of the bound perturbation effects :type max_mean_adjustment: float - :param adjustment_function: A function that adjusts the mean of the signal + :param adjustment_function: A function that adjusts the mean of the bound (bound) perturbation effects :type adjustment_function: Callable[[torch.Tensor, float, float, float, dict[int, list[int]]], torch.Tensor] :raises TypeError: If batch_size is not an positive integer :raises TypeError: If num_genes is not an positive integer - :raises TypeError: If signal is not a list of integers or floats + :raises TypeError: If bound is not a list of integers or floats :raises TypeError: If n_sample is not a list of integers :raises TypeError: If val_size is not a float between 0 and 1 (inclusive) :raises TypeError: If test_size is not a float between 0 and 1 (inclusive) :raises TypeError: If random_state is not an integer - :raises TypeError: If signal_mean is not a float + :raises TypeError: If bound_mean is not a float :raises ValueError: If val_size + test_size is greater than 1 (i.e. 
the splits are too large) @@ -85,10 +85,10 @@ def __init__( raise TypeError("batch_size must be a positive integer") if not isinstance(num_genes, int) or num_genes < 1: raise TypeError("num_genes must be a positive integer") - if not isinstance(signal, list) or not all( - isinstance(x, (int, float)) for x in signal + if not isinstance(bound, list) or not all( + isinstance(x, (int, float)) for x in bound ): - raise TypeError("signal must be a list of integers or floats") + raise TypeError("bound must be a list of integers or floats") if not isinstance(n_sample, list) or not all( isinstance(x, int) for x in n_sample ): @@ -99,17 +99,17 @@ def __init__( raise TypeError("test_size must be a float between 0 and 1 (inclusive)") if not isinstance(random_state, int): raise TypeError("random_state must be an integer") - if not isinstance(signal_mean, float): - raise TypeError("signal_mean must be a float") + if not isinstance(bound_mean, float): + raise TypeError("bound_mean must be a float") if test_size + val_size > 1: raise ValueError("val_size + test_size must be less than or equal to 1") super().__init__() self.batch_size = batch_size self.num_genes = num_genes - self.signal_mean = signal_mean - self.signal = signal or [0.1, 0.15, 0.2, 0.25, 0.3] - self.n_sample = n_sample or [1 for _ in range(len(self.signal))] + self.bound_mean = bound_mean + self.bound = bound or [0.1, 0.15, 0.2, 0.25, 0.3] + self.n_sample = n_sample or [1 for _ in range(len(self.bound))] self.num_tfs = sum(self.n_sample) # sum of all n_sample is the number of TFs self.val_size = val_size self.test_size = test_size @@ -132,10 +132,10 @@ def prepare_data(self) -> None: performed as that is handled in the functions in generate_data.py.""" # this will be a list of length 10 with a GenePopulation object in each element gene_populations_list = [] - for signal_proportion, n_draws in zip(self.signal, self.n_sample): + for bound_proportion, n_draws in zip(self.bound, self.n_sample): for _ in range(n_draws): 
gene_populations_list.append( - generate_gene_population(self.num_genes, signal_proportion) + generate_gene_population(self.num_genes, bound_proportion) ) # Generate binding data for each gene population @@ -166,7 +166,7 @@ def prepare_data(self) -> None: if self.max_mean_adjustment > 0: perturbation_effects_list = generate_perturbation_effects( binding_data_tensor, - signal_mean=self.signal_mean, + bound_mean=self.bound_mean, tf_index=0, # unused max_mean_adjustment=self.max_mean_adjustment, adjustment_function=self.adjustment_function, @@ -188,7 +188,7 @@ def prepare_data(self) -> None: perturbation_effects_list = [ generate_perturbation_effects( binding_data_tensor[:, tf_index, :].unsqueeze(1), - signal_mean=self.signal_mean, + bound_mean=self.bound_mean, tf_index=0, # unused ) for tf_index in range(sum(self.n_sample)) diff --git a/yeastdnnexplorer/probability_models/generate_data.py b/yeastdnnexplorer/probability_models/generate_data.py index f6d8d4b..b0d2047 100644 --- a/yeastdnnexplorer/probability_models/generate_data.py +++ b/yeastdnnexplorer/probability_models/generate_data.py @@ -39,39 +39,39 @@ def __repr__(self): def generate_gene_population( - total: int = 1000, signal_group: float = 0.3 + total: int = 1000, bound_group: float = 0.3 ) -> GenePopulation: """ Generate two sets of genes, one of which will be considered genes which show a - signal, and the other which does not. The return is a one dimensional boolean tensor - where a value of '0' means that the gene at that index is part of the noise group - and a '1' means the gene at that index is part of the signal group. The length of - the tensor is the number of genes in this simulated organism. + bound, and the other which does not. The return is a one dimensional boolean tensor + where a value of '0' means that the gene at that index is part of the unbound group + and a '1' means the gene at that index is part of the bound group. 
The length of the + tensor is the number of genes in this simulated organism. :param total: The total number of genes. defaults to 1000 :type total: int, optional - :param signal_group: The proportion of genes in the signal group. defaults to 0.3 - :type signal_group: float, optional + :param bound_group: The proportion of genes in the bound group. defaults to 0.3 + :type bound_group: float, optional :return: A one dimensional tensor of boolean values where the set of indices with a - value of '1' are the signal group and the set of indices with a value of '0' are - the noise group. + value of '1' are the bound group and the set of indices with a value of '0' are + the unbound group. :rtype: GenePopulation :raises TypeError: if total is not an integer - :raises ValueError: If signal_group is not between 0 and 1 + :raises ValueError: If bound_group is not between 0 and 1 """ if not isinstance(total, int): raise TypeError("total must be an integer") - if not 0 <= signal_group <= 1: - raise ValueError("signal_group must be between 0 and 1") + if not 0 <= bound_group <= 1: + raise ValueError("bound_group must be between 0 and 1") - signal_group_size = int(total * signal_group) - logger.info("Generating %s genes with signal", signal_group_size) + bound_group_size = int(total * bound_group) + logger.info("Generating %s genes with bound", bound_group_size) labels = torch.cat( ( - torch.ones(signal_group_size, dtype=torch.bool), - torch.zeros(total - signal_group_size, dtype=torch.bool), + torch.ones(bound_group_size, dtype=torch.bool), + torch.zeros(total - bound_group_size, dtype=torch.bool), ) )[torch.randperm(total)] @@ -81,15 +81,15 @@ def generate_gene_population( def generate_binding_effects( gene_population: GenePopulation, background_hops_range: tuple[int, int] = (1, 100), - noise_experiment_hops_range: tuple[int, int] = (0, 1), - signal_experiment_hops_range: tuple[int, int] = (1, 6), + unbound_experiment_hops_range: tuple[int, int] = (0, 1), + 
bound_experiment_hops_range: tuple[int, int] = (1, 6), total_background_hops: int = 1000, total_experiment_hops: int = 76, pseudocount: float = 1e-10, ) -> torch.Tensor: """ Generate enrichment effects for genes using vectorized operations, based on their - signal designation, with separate experiment hops ranges for noise and signal genes. + bound designation, with separate experiment hops ranges for unbound and bound genes. Note that the default values are a scaled down version of actual data. See also https://github.com/cmatKhan/callingCardsTools/blob/main/callingcardstools/PeakCalling/yeast/enrichment.py @@ -99,12 +99,12 @@ def generate_binding_effects( :param background_hops_range: The range of hops for background genes. Defaults to (1, 100) :type background_hops_range: Tuple[int, int], optional - :param noise_experiment_hops_range: The range of hops for noise genes. Defaults to - (0, 1) - :type noise_experiment_hops_range: Tuple[int, int], optional - :param signal_experiment_hops_range: The range of hops for signal genes. Defaults to + :param unbound_experiment_hops_range: The range of hops for unbound genes. Defaults + to (0, 1) + :type unbound_experiment_hops_range: Tuple[int, int], optional + :param bound_experiment_hops_range: The range of hops for bound genes. Defaults to (1, 6) - :type signal_experiment_hops_range: Tuple[int, int], optional + :type bound_experiment_hops_range: Tuple[int, int], optional :param total_background_hops: The total number of background hops. Defaults to 1000 :type total_background_hops: int, optional :param total_experiment_hops: The total number of experiment hops. 
Defaults to 76 @@ -118,11 +118,11 @@ def generate_binding_effects( :raises TypeError: If total_experiment_hops is not an integer :raises TypeError: If pseudocount is not a float :raises TypeError: If background_hops_range is not a tuple - :raises TypeError: If noise_experiment_hops_range is not a tuple - :raises TypeError: If signal_experiment_hops_range is not a tuple + :raises TypeError: If unbound_experiment_hops_range is not a tuple + :raises TypeError: If bound_experiment_hops_range is not a tuple :raises ValueError: If background_hops_range is not a tuple of length 2 - :raises ValueError: If noise_experiment_hops_range is not a tuple of length 2 - :raises ValueError: If signal_experiment_hops_range is not a tuple of length 2 + :raises ValueError: If unbound_experiment_hops_range is not a tuple of length 2 + :raises ValueError: If bound_experiment_hops_range is not a tuple of length 2 """ # NOTE: torch intervals are half open on the right, so we add 1 to the @@ -139,8 +139,8 @@ def generate_binding_effects( raise TypeError("pseudocount must be a float") for arg, tup in { "background_hops_range": background_hops_range, - "noise_experiment_hops_range": noise_experiment_hops_range, - "signal_experiment_hops_range": signal_experiment_hops_range, + "unbound_experiment_hops_range": unbound_experiment_hops_range, + "bound_experiment_hops_range": bound_experiment_hops_range, }.items(): if not isinstance(tup, tuple): raise TypeError(f"{arg} must be a tuple") @@ -156,22 +156,22 @@ def generate_binding_effects( size=(gene_population.labels.shape[0],), ) - # Generate experiment hops noise genes - noise_experiment_hops = torch.randint( - low=noise_experiment_hops_range[0], - high=noise_experiment_hops_range[1] + 1, + # Generate experiment hops unbound genes + unbound_experiment_hops = torch.randint( + low=unbound_experiment_hops_range[0], + high=unbound_experiment_hops_range[1] + 1, size=(gene_population.labels.shape[0],), ) - # Generate experiment hops signal genes - 
signal_experiment_hops = torch.randint( - low=signal_experiment_hops_range[0], - high=signal_experiment_hops_range[1] + 1, + # Generate experiment hops bound genes + bound_experiment_hops = torch.randint( + low=bound_experiment_hops_range[0], + high=bound_experiment_hops_range[1] + 1, size=(gene_population.labels.shape[0],), ) - # Use signal designation to select appropriate experiment hops + # Use bound designation to select appropriate experiment hops experiment_hops = torch.where( - gene_population.labels == 1, signal_experiment_hops, noise_experiment_hops + gene_population.labels == 1, bound_experiment_hops, unbound_experiment_hops ) # Calculate enrichment for all genes @@ -230,8 +230,8 @@ def generate_pvalues( def default_perturbation_effect_adjustment_function( binding_enrichment_data: torch.Tensor, - signal_mean: float, - noise_mean: float, + bound_mean: float, + unbound_mean: float, max_adjustment: float, **kwargs, ) -> torch.Tensor: @@ -246,10 +246,10 @@ def default_perturbation_effect_adjustment_function( dimensions [n_genes, n_tfs, 3] where the entries in the third dimension are a matrix with columns [label, enrichment, pvalue]. :type binding_enrichment_data: torch.Tensor - :param signal_mean: The mean for signal genes. - :type signal_mean: float - :param noise_mean: The mean for noise genes. - :type noise_mean: float + :param bound_mean: The mean for bound genes. + :type bound_mean: float + :param unbound_mean: The mean for unbound genes. + :type unbound_mean: float :param max_adjustment: The maximum adjustment to the base mean based on enrichment. :type max_adjustment: float :param tf_relationships: Unused in this function. 
It is only here to match the @@ -259,37 +259,39 @@ def default_perturbation_effect_adjustment_function( :rtype: torch.Tensor """ - # Extract signal/noise labels and enrichment scores - signal_labels = binding_enrichment_data[:, :, 0] + # Extract bound/unbound labels and enrichment scores + bound_labels = binding_enrichment_data[:, :, 0] enrichment_scores = binding_enrichment_data[:, :, 1] adjusted_mean_matrix = torch.where( - signal_labels == 1, enrichment_scores, torch.zeros_like(enrichment_scores) + bound_labels == 1, enrichment_scores, torch.zeros_like(enrichment_scores) ) - for gene_idx in range(signal_labels.shape[0]): - for tf_index in range(signal_labels.shape[1]): - if signal_labels[gene_idx, tf_index] == 1: - # draw a random value between 0 and 1 to use to control - # magnitude of adjustment - adjustment_multiplier = torch.rand(1) + for gene_idx in range(bound_labels.shape[0]): + for tf_index in range(bound_labels.shape[1]): + if bound_labels[gene_idx, tf_index] == 1: + # divide its enrichment score by the maximum magnitude possible to + # create an adjustment multipler that scales with increasing enrichment + adjustment_multiplier = enrichment_scores[gene_idx, tf_index] / abs( + enrichment_scores.max() + ) # randomly adjust the gene by some portion of the max adjustment - adjusted_mean_matrix[gene_idx, tf_index] = signal_mean + ( + adjusted_mean_matrix[gene_idx, tf_index] = bound_mean + ( adjustment_multiplier * max_adjustment ) else: # related tfs are not all bound, so set the enrichment - # score to noise mean - adjusted_mean_matrix[gene_idx, tf_index] = noise_mean + # score to unbound mean + adjusted_mean_matrix[gene_idx, tf_index] = unbound_mean return adjusted_mean_matrix def perturbation_effect_adjustment_function_with_tf_relationships_boolean_logic( binding_enrichment_data: torch.Tensor, - signal_mean: float, - noise_mean: float, + bound_mean: float, + unbound_mean: float, max_adjustment: float, tf_relationships: dict[int, list[Relation]], ) -> 
torch.Tensor: @@ -307,10 +309,10 @@ def perturbation_effect_adjustment_function_with_tf_relationships_boolean_logic( dimensions [n_genes, n_tfs, 3] where the entries in the third dimension are a matrix with columns [label, enrichment, pvalue]. :type binding_enrichment_data: torch.Tensor - :param signal_mean: The mean for signal genes. - :type signal_mean: float - :param noise_mean: The mean for noise genes. - :type noise_mean: float + :param bound_mean: The mean for bound genes. + :type bound_mean: float + :param unbound_mean: The mean for unbound genes. + :type unbound_mean: float :param max_adjustment: The maximum adjustment to the base mean based on enrichment. :type max_adjustment: float :param tf_relationships: A dictionary where the keys are TF indices and the values @@ -354,22 +356,22 @@ def perturbation_effect_adjustment_function_with_tf_relationships_boolean_logic( the binding_data tensor passed into the function" ) - # Extract signal/noise labels and enrichment scores - signal_labels = binding_enrichment_data[:, :, 0] # shape: (num_genes, num_tfs) + # Extract bound/unbound labels and enrichment scores + bound_labels = binding_enrichment_data[:, :, 0] # shape: (num_genes, num_tfs) enrichment_scores = binding_enrichment_data[:, :, 1] # shape: (num_genes, num_tfs) # we set all unbound scores to 0, then we will go through and also set any - # bound scores to noise_mean if the related boolean statements are not satisfied + # bound scores to unbound_mean if the related boolean statements are not satisfied adjusted_mean_matrix = torch.where( - signal_labels == 1, enrichment_scores, torch.zeros_like(enrichment_scores) + bound_labels == 1, enrichment_scores, torch.zeros_like(enrichment_scores) ) # shape: (num_genes, num_tfs) - for gene_idx in range(signal_labels.shape[0]): + for gene_idx in range(bound_labels.shape[0]): for tf_index, relations in tf_relationships.items(): # check if all relations (boolean relationships) # associated with TFs are satisfied - if 
signal_labels[gene_idx, tf_index] == 1 and all( - relation.evaluate(signal_labels[gene_idx].tolist()) + if bound_labels[gene_idx, tf_index] == 1 and all( + relation.evaluate(bound_labels[gene_idx].tolist()) for relation in relations ): # draw a random value between 0 and 1 to use to @@ -377,20 +379,21 @@ def perturbation_effect_adjustment_function_with_tf_relationships_boolean_logic( adjustment_multiplier = torch.rand(1) # randomly adjust the gene by some portion of the max adjustment - adjusted_mean_matrix[gene_idx, tf_index] = signal_mean + ( + adjusted_mean_matrix[gene_idx, tf_index] = bound_mean + ( adjustment_multiplier * max_adjustment ) else: - # related tfs are not all bound, set the enrichment score to noise mean - adjusted_mean_matrix[gene_idx, tf_index] = noise_mean + # related tfs are not all bound, set the enrichment score to unbound + # mean + adjusted_mean_matrix[gene_idx, tf_index] = unbound_mean return adjusted_mean_matrix # shape (num_genes, num_tfs) def perturbation_effect_adjustment_function_with_tf_relationships( binding_enrichment_data: torch.Tensor, - signal_mean: float, - noise_mean: float, + bound_mean: float, + unbound_mean: float, max_adjustment: float, tf_relationships: dict[int, list[int]], ) -> torch.Tensor: @@ -405,10 +408,10 @@ def perturbation_effect_adjustment_function_with_tf_relationships( dimensions [n_genes, n_tfs, 3] where the entries in the third dimension are a matrix with columns [label, enrichment, pvalue]. :type binding_enrichment_data: torch.Tensor - :param signal_mean: The mean for signal genes. - :type signal_mean: float - :param noise_mean: The mean for noise genes. - :type noise_mean: float + :param bound_mean: The mean for bound genes. + :type bound_mean: float + :param unbound_mean: The mean for unbound genes. + :type unbound_mean: float :param max_adjustment: The maximum adjustment to the base mean based on enrichment. 
:type max_adjustment: float :param tf_relationships: A dictionary where the keys are the indices of the TFs and @@ -451,32 +454,33 @@ def perturbation_effect_adjustment_function_with_tf_relationships( binding_data tensor passed into the function" ) - # Extract signal/noise labels and enrichment scores - signal_labels = binding_enrichment_data[:, :, 0] # shape: (num_genes, num_tfs) + # Extract bound/unbound labels and enrichment scores + bound_labels = binding_enrichment_data[:, :, 0] # shape: (num_genes, num_tfs) enrichment_scores = binding_enrichment_data[:, :, 1] # shape: (num_genes, num_tfs) # we set all unbound scores to 0, then we will go through and also - # set any bound scores to noise_mean if the related tfs are not also bound + # set any bound scores to unbound_mean if the related tfs are not also bound adjusted_mean_matrix = torch.where( - signal_labels == 1, enrichment_scores, torch.zeros_like(enrichment_scores) + bound_labels == 1, enrichment_scores, torch.zeros_like(enrichment_scores) ) # shape: (num_genes, num_tfs) - for gene_idx in range(signal_labels.shape[0]): + for gene_idx in range(bound_labels.shape[0]): for tf_index, related_tfs in tf_relationships.items(): - if signal_labels[gene_idx, tf_index] == 1 and torch.all( - signal_labels[gene_idx, related_tfs] == 1 + if bound_labels[gene_idx, tf_index] == 1 and torch.all( + bound_labels[gene_idx, related_tfs] == 1 ): # draw a random value between 0 and 1 to use to # control magnitude of adjustment adjustment_multiplier = torch.rand(1) # randomly adjust the gene by some portion of the max adjustment - adjusted_mean_matrix[gene_idx, tf_index] = signal_mean + ( + adjusted_mean_matrix[gene_idx, tf_index] = bound_mean + ( adjustment_multiplier * max_adjustment ) else: - # related tfs are not all bound, set the enrichment score to noise mean - adjusted_mean_matrix[gene_idx, tf_index] = noise_mean + # related tfs are not all bound, set the enrichment score to unbound + # mean + 
adjusted_mean_matrix[gene_idx, tf_index] = unbound_mean return adjusted_mean_matrix # shape (num_genes, num_tfs) @@ -484,10 +488,10 @@ def perturbation_effect_adjustment_function_with_tf_relationships( def generate_perturbation_effects( binding_data: torch.Tensor, tf_index: int | None = None, - noise_mean: float = 0.0, - noise_std: float = 1.0, - signal_mean: float = 3.0, - signal_std: float = 1.0, + unbound_mean: float = 0.0, + unbound_std: float = 1.0, + bound_mean: float = 3.0, + bound_std: float = 1.0, max_mean_adjustment: float = 0.0, adjustment_function: Callable[ [torch.Tensor, float, float, float], torch.Tensor @@ -512,14 +516,14 @@ def generate_perturbation_effects( are adjusting the means (ie only used if max_mean_adjustment == 0). Defaults to None :type tf_index: int - :param noise_mean: The mean for noise genes. Defaults to 0.0 - :type noise_mean: float, optional - :param noise_std: The standard deviation for noise genes. Defaults to 1.0 - :type noise_std: float, optional - :param signal_mean: The mean for signal genes. Defaults to 3.0 - :type signal_mean: float, optional - :param signal_std: The standard deviation for signal genes. Defaults to 1.0 - :type signal_std: float, optional + :param unbound_mean: The mean for unbound genes. Defaults to 0.0 + :type unbound_mean: float, optional + :param unbound_std: The standard deviation for unbound genes. Defaults to 1.0 + :type unbound_std: float, optional + :param bound_mean: The mean for bound genes. Defaults to 3.0 + :type bound_mean: float, optional + :param bound_std: The standard deviation for bound genes. Defaults to 1.0 + :type bound_std: float, optional :param max_mean_adjustment: The maximum adjustment to the base mean based on enrichment. 
Defaults to 0.0 :type max_mean_adjustment: float, optional @@ -529,7 +533,7 @@ def generate_perturbation_effects( :raises ValueError: If binding_data is not a 3D tensor with the third dimension having a length of 3 - :raises ValueError: If noise_mean, noise_std, signal_mean, signal_std, + :raises ValueError: If unbound_mean, unbound_std, bound_mean, bound_std, or max_mean_adjustment are not floats """ @@ -545,10 +549,10 @@ def generate_perturbation_effects( # check the rest of the inputs if not all( isinstance(i, float) - for i in (noise_mean, noise_std, signal_mean, signal_std, max_mean_adjustment) + for i in (unbound_mean, unbound_std, bound_mean, bound_std, max_mean_adjustment) ): raise ValueError( - "noise_mean, noise_std, signal_mean, signal_std, " + "unbound_mean, unbound_std, bound_mean, bound_std, " "and max_mean_adjustment must be floats" ) # check the Callable signature @@ -556,14 +560,14 @@ def generate_perturbation_effects( i in inspect.signature(adjustment_function).parameters for i in ( "binding_enrichment_data", - "signal_mean", - "noise_mean", + "bound_mean", + "unbound_mean", "max_adjustment", ) ): raise ValueError( "adjustment_function must have the signature " - "(binding_enrichment_data, signal_mean, noise_mean, max_adjustment)" + "(binding_enrichment_data, bound_mean, unbound_mean, max_adjustment)" ) # Initialize an effects tensor for all genes @@ -578,16 +582,16 @@ def generate_perturbation_effects( device=binding_data.device) * 2 - 1 # fmt: on - # Apply adjustments to the base mean for the signal genes, if necessary + # Apply adjustments to the base mean for the bound genes, if necessary if max_mean_adjustment > 0 and adjustment_function is not None: # Assuming adjustment_function returns a vector of means for each gene. - # Signal genes that meet the criteria for adjustment will be affected by + # bound genes that meet the criteria for adjustment will be affected by # the status of the TFs. 
What TFs affect a given gene must be specified by # the adjustment_function() adjusted_means = adjustment_function( binding_data, - signal_mean, - noise_mean, + bound_mean, + unbound_mean, max_mean_adjustment, **kwargs, ) @@ -595,27 +599,25 @@ def generate_perturbation_effects( # add adjustments, ensuring they respect the original sign if adjusted_means.ndim == 1: effects = signs * torch.abs( - torch.normal(mean=adjusted_means, std=signal_std) + torch.normal(mean=adjusted_means, std=bound_std) ) else: effects = torch.zeros_like(adjusted_means) for col_idx in range(effects.size(1)): effects[:, col_idx] = signs * torch.abs( - torch.normal(mean=adjusted_means[:, col_idx], std=signal_std) + torch.normal(mean=adjusted_means[:, col_idx], std=bound_std) ) else: - signal_mask = binding_data[:, tf_index, 0] == 1 + bound_mask = binding_data[:, tf_index, 0] == 1 - # Generate effects based on the noise and signal means, applying the sign - effects[~signal_mask] = signs[~signal_mask] * torch.abs( + # Generate effects based on the unbound and bound means, applying the sign + effects[~bound_mask] = signs[~bound_mask] * torch.abs( torch.normal( - mean=noise_mean, std=noise_std, size=(torch.sum(~signal_mask),) + mean=unbound_mean, std=unbound_std, size=(torch.sum(~bound_mask),) ) ) - effects[signal_mask] = signs[signal_mask] * torch.abs( - torch.normal( - mean=signal_mean, std=signal_std, size=(torch.sum(signal_mask),) - ) + effects[bound_mask] = signs[bound_mask] * torch.abs( + torch.normal(mean=bound_mean, std=bound_std, size=(torch.sum(bound_mask),)) ) return effects diff --git a/yeastdnnexplorer/tests/probability_models/test_generate_data.py b/yeastdnnexplorer/tests/probability_models/test_generate_data.py index 00f4d4f..81195c2 100644 --- a/yeastdnnexplorer/tests/probability_models/test_generate_data.py +++ b/yeastdnnexplorer/tests/probability_models/test_generate_data.py @@ -13,10 +13,10 @@ def test_generate_gene_population(): total_genes = 1000 - signal_ratio = 0.3 - 
signal_group_size = int(total_genes * signal_ratio) + bound_ratio = 0.3 + bound_group_size = int(total_genes * bound_ratio) - gene_population = generate_gene_population(total_genes, signal_ratio) + gene_population = generate_gene_population(total_genes, bound_ratio) # Check if the output is a 1D tensor assert gene_population.labels.ndim == 1 @@ -24,10 +24,10 @@ def test_generate_gene_population(): # Check if the output has the correct shape assert gene_population.labels.shape == torch.Size([total_genes]) - # Check if the second column contains the correct number of signal - # and non-signal genes - assert torch.sum(gene_population.labels) == signal_group_size - assert torch.sum(gene_population.labels == 0) == total_genes - signal_group_size + # Check if the second column contains the correct number of bound + # and non-bound genes + assert torch.sum(gene_population.labels) == bound_group_size + assert torch.sum(gene_population.labels == 0) == total_genes - bound_group_size # Additional tests could include checking the datatype of the tensor elements assert gene_population.labels.dtype == torch.bool @@ -37,7 +37,7 @@ def test_generate_binding_effects_success(): # set torch seed torch.manual_seed(42) # Create a mock GenePopulation with some genes - # labeled as signal and others as noise + # labeled as bound and others as unbound gene_population = GenePopulation(torch.tensor([1, 0, 1, 0], dtype=torch.bool)) # Call generate_binding_effects with valid arguments enrichment = generate_binding_effects(gene_population) @@ -84,7 +84,7 @@ def test_generate_pvalues_invalid_input(): def test_generate_perturbation_effects_with_and_without_adjustment(): torch.manual_seed(42) # Create mock binding data with the first - # column indicating signal (1) or noise (0), + # column indicating bound (1) or unbound (0), # the second column indicates the enrichment, and the third the p-value. # Add an extra dimension for TFs -- the function requires a 3D tensor. 
binding_data = torch.tensor( @@ -99,77 +99,77 @@ def test_generate_perturbation_effects_with_and_without_adjustment(): ) # Add TF dimension # Specify means and standard deviations - noise_mean = 0.0 - noise_std = 1.0 - signal_mean = 4.0 - signal_std = 1.0 + unbound_mean = 0.0 + unbound_std = 1.0 + bound_mean = 4.0 + bound_std = 1.0 # First, test without mean adjustment effects_without_adjustment = generate_perturbation_effects( binding_data=binding_data, tf_index=0, - noise_mean=noise_mean, - noise_std=noise_std, - signal_mean=signal_mean, - signal_std=signal_std, + unbound_mean=unbound_mean, + unbound_std=unbound_std, + bound_mean=bound_mean, + bound_std=bound_std, max_mean_adjustment=0.0, # No adjustment ) - # Extract masks for signal and noise genes based on labels - signal_mask = binding_data[:, :, 0].squeeze() == 1 - noise_mask = binding_data[:, :, 0].squeeze() == 0 + # Extract masks for bound and unbound genes based on labels + bound_mask = binding_data[:, :, 0].squeeze() == 1 + unbound_mask = binding_data[:, :, 0].squeeze() == 0 # Assert the effects tensor is of the correct shape assert effects_without_adjustment.shape[0] == binding_data.shape[0] assert torch.isclose( - torch.abs(effects_without_adjustment[signal_mask]).mean(), - torch.tensor(signal_mean), - atol=signal_std, + torch.abs(effects_without_adjustment[bound_mask]).mean(), + torch.tensor(bound_mean), + atol=bound_std, ) assert torch.isclose( - torch.abs(effects_without_adjustment[~signal_mask]).mean(), - torch.tensor(noise_mean), - atol=noise_std, + torch.abs(effects_without_adjustment[~bound_mask]).mean(), + torch.tensor(unbound_mean), + atol=unbound_std, ) assert torch.isclose( - torch.abs(effects_without_adjustment[signal_mask]).std(), - torch.tensor(signal_std), - atol=signal_std, + torch.abs(effects_without_adjustment[bound_mask]).std(), + torch.tensor(bound_std), + atol=bound_std, ) assert torch.isclose( - torch.abs(effects_without_adjustment[~signal_mask]).std(), - torch.tensor(noise_std), 
- atol=noise_std, + torch.abs(effects_without_adjustment[~bound_mask]).std(), + torch.tensor(unbound_std), + atol=unbound_std, ) # Test with mean adjustment effects_with_adjustment = generate_perturbation_effects( binding_data=binding_data, tf_index=0, - noise_mean=noise_mean, - noise_std=noise_std, - signal_mean=signal_mean, - signal_std=signal_std, + unbound_mean=unbound_mean, + unbound_std=unbound_std, + bound_mean=bound_mean, + bound_std=bound_std, max_mean_adjustment=4.0, # Applying adjustment ) - # Assert that signal genes with adjustments have a mean effect greater than + # Assert that bound genes with adjustments have a mean effect greater than # the base mean assert ( - torch.abs(effects_with_adjustment[signal_mask]).mean() - > torch.abs(effects_without_adjustment[signal_mask]).mean() + torch.abs(effects_with_adjustment[bound_mask]).mean() + > torch.abs(effects_without_adjustment[bound_mask]).mean() ) - # Assert that the mean effect for noise genes remains close to the noise mean + # Assert that the mean effect for unbound genes remains close to the unbound mean assert torch.isclose( - torch.abs(effects_with_adjustment[noise_mask]).mean(), - torch.tensor(noise_mean), - atol=noise_std, + torch.abs(effects_with_adjustment[unbound_mask]).mean(), + torch.tensor(unbound_mean), + atol=unbound_std, ) - # and that the noise standard deviation remains close to the noise std + # and that the unbound standard deviation remains close to the unbound std assert torch.isclose( - torch.abs(effects_with_adjustment[noise_mask]).std(), - torch.tensor(noise_std), - atol=noise_std, + torch.abs(effects_with_adjustment[unbound_mask]).std(), + torch.tensor(unbound_std), + atol=unbound_std, ) From 53ecf9f89688566254b280a908dc852fa86ea1f9 Mon Sep 17 00:00:00 2001 From: ejiawustl Date: Fri, 7 Jun 2024 14:26:19 -0700 Subject: [PATCH 2/7] added new file util.py and new test suite and updated notebook --- docs/tutorials/generate_in_silico_data.ipynb | 38 +- 
..._and_testing_data_generation_methods.ipynb | 3140 ++++++++++++++--- .../probability_models/generate_data.py | 20 +- 3 files changed, 2717 insertions(+), 481 deletions(-) diff --git a/docs/tutorials/generate_in_silico_data.ipynb b/docs/tutorials/generate_in_silico_data.ipynb index 14dd53c..8e739f6 100644 --- a/docs/tutorials/generate_in_silico_data.ipynb +++ b/docs/tutorials/generate_in_silico_data.ipynb @@ -11,15 +11,7 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Matplotlib is building the font cache; this may take a moment.\n" - ] - } - ], + "outputs": [], "source": [ "from yeastdnnexplorer.probability_models.relation_classes import And, Or\n", "from yeastdnnexplorer.probability_models.generate_data import (generate_gene_population, \n", @@ -360,7 +352,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "bound/nosie ratio is correct: True\n" + "bound/noise ratio is correct: True\n" ] } ], @@ -372,7 +364,7 @@ " dtype=torch.long),\n", " atol=tolerance)\n", "\n", - "print(f\"bound/nosie ratio is correct: {are_equal.all()}\")" + "print(f\"bound/noise ratio is correct: {are_equal.all()}\")" ] }, { @@ -412,7 +404,15 @@ "print(f\"the unbound mean is {unbound_binding.mean()} and the std is {unbound_binding.std()}\")\n", "print(f\"The bound binding max is {bound_binding.max()} and the min is {bound_binding.min()}\")\n", "print(f\"the bound min is {bound_binding.min()}\")\n", - "print(f\"the bound mean is {bound_binding.mean()} and the std is {bound_binding.std()}\")" + "print(f\"the bound mean is {bound_binding.mean()} and the std is {bound_binding.std()}\")\n", + "\n", + "#this is the output before EJ change to adjustment mean\n", + "# The unbound binding max is 13.157892227172852 and the min is 0.0\n", + "# the unbound min is 0.0\n", + "# the unbound mean is 0.3589712679386139 and the std is 1.1559306383132935\n", + "# The bound binding max is 78.94734954833984 
and the min is 0.1315789520740509\n", + "# the bound min is 0.1315789520740509\n", + "# the bound mean is 2.4840002059936523 and the std is 6.374814510345459" ] }, { @@ -479,7 +479,15 @@ "print(f\"the unbound mean is {unbound_perturbation.mean()} and the std is {unbound_perturbation.std()}\")\n", "print(f\"The bound binding max is {bound_perturbation.max()} and the min is {bound_perturbation.min()}\")\n", "print(f\"the bound min is {bound_perturbation.min()}\")\n", - "print(f\"the bound mean is {bound_perturbation.mean()} and the std is {bound_perturbation.std()}\")" + "print(f\"the bound mean is {bound_perturbation.mean()} and the std is {bound_perturbation.std()}\")\n", + "\n", + "#pre change data\n", + "# The unbound binding max is 3.423511505126953 and the min is -3.506139039993286\n", + "# the unbound min is -3.506139039993286\n", + "# the unbound mean is 0.010617653839290142 and the std is 0.988001823425293\n", + "# The bound binding max is 6.107701301574707 and the min is -6.406703948974609\n", + "# the bound min is -6.406703948974609\n", + "# the bound mean is -0.011303802020847797 and the std is 3.136451482772827" ] }, { @@ -600,12 +608,12 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] diff --git a/docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb b/docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb index a0178dc..88347bb 100644 --- a/docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb +++ b/docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -59,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -197,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -233,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -247,7 +247,7 @@ }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -271,6 +271,11 @@ "plt.title('Pertubation Effects for Gene ' + str(GENE_IDX) + ' with Different Adjustment Functions (averaged across 100 trials)')\n", "plt.xlabel('TF Index')\n", "plt.ylabel('Perturbation Effect Val')\n", + "\n", + "#added to compare this to previous graph, REMOVE LATER\n", + "plt.ylim(0,9)\n", + "\n", + "\n", "plt.xticks(x_vals)\n", "plt.grid(True)\n", "plt.legend(['No Mean Adjustment', 'Normal (non-dependent) Mean Adjust', 'Dependent Mean Adjustment', 'Boolean Logic Adjustment'])\n", @@ -304,7 +309,7 @@ "source": [ "# define checkpoints and loggers\n", "best_model_checkpoint = ModelCheckpoint(\n", - " monitor=\"val_mse\",\n", + " monitor=\"val_explained_variance\",\n", " mode=\"min\",\n", " filename=\"best-model-{epoch:02d}-{val_loss:.2f}\",\n", " save_top_k=1,\n", @@ -388,20 +393,84 @@ "outputs": [], "source": [ "# These lists will store the test results for different models and data generation methods\n", - "model_mses = []\n", - "linear_model_test_mses = []" + "model_ves = []\n", + "linear_model_test_ves = []" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from sklearn.metrics import explained_variance_score\n", + "\n", + "data_module = get_data_module(0.0)\n", + "num_tfs = sum(data_module.n_sample)\n", + "model_ves = [] # List to store explained variance for the non-linear model\n", + "linear_model_test_ves = [] # List to store explained variance for the linear model\n", + "\n", + "def calculate_explained_variance(test_results, data_module, model):\n", + " predictions = []\n", + " targets = []\n", + "\n", + " model.eval() # Set the model to evaluation mode\n", + "\n", + " with torch.no_grad(): # Disable gradient calculation\n", + " for batch in data_module.test_dataloader():\n", + " # Assuming your data is in the format (x, y)\n", + " x, y = batch\n", + " outputs = model(x)\n", + " predictions.append(outputs)\n", + " targets.append(y)\n", + " mse = 
torch.nn.functional.mse_loss(torch.tensor(predictions), torch.tensor(targets)).item()\n", + " var_y = torch.var(torch.tensor(targets)).item() \n", + " explained_variance = 1 - (mse / var_y)\n", + " return explained_variance \n", + "\n", + "# # Function to calculate explained variance from test results\n", + "# def calculate_explained_variance(test_results, data_module, model):\n", + "# \"\"\"\n", + "# Calculates the explained variance score using PyTorch and scikit-learn.\n", + "\n", + "# Args:\n", + "# test_results: The results dictionary from the trainer.test() function.\n", + "# data_module: The data module containing the test dataloader.\n", + "# model: The trained PyTorch model.\n", + "\n", + "# Returns:\n", + "# float: The explained variance score.\n", + "# \"\"\"\n", + "# predictions = []\n", + "# targets = []\n", + "\n", + "# model.eval() # Set the model to evaluation mode\n", + "\n", + "# with torch.no_grad(): # Disable gradient calculation\n", + "# for batch in data_module.test_dataloader():\n", + "# # Assuming your data is in the format (x, y)\n", + "# x, y = batch\n", + "# outputs = model(x)\n", + "# predictions.append(outputs)\n", + "# targets.append(y)\n", + "\n", + "# predictions = torch.cat(predictions, dim=0).numpy() # Concatenate predictions\n", + "# targets = torch.cat(targets, dim=0).numpy() # Concatenate targets\n", + "\n", + "# return explained_variance_score(targets, predictions)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Train models on data generated with no mean adjustment" + "# **Train models on data generated with no mean adjustment**" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -412,12 +481,6 @@ "TPU available: False, using: 0 TPU cores\n", "IPU available: False, using: 0 IPUs\n", "HPU available: False, using: 0 HPUs\n", - "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:260: UserWarning: To copy construct from 
a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", - " X_train, Y_train = torch.tensor(X_train, dtype=torch.float32), torch.tensor(\n", - "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:263: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", - " X_val, Y_val = torch.tensor(X_val, dtype=torch.float32), torch.tensor(\n", - "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:266: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", - " X_test, Y_test = torch.tensor(X_test, dtype=torch.float32), torch.tensor(\n", "\n", " | Name | Type | Params\n", "----------------------------------------------------\n", @@ -443,29 +506,21 @@ "version_minor": 0 }, "text/plain": [ - "Sanity Checking: | …" + "Sanity Checking: | | 0/? 
[00:00 6\u001b[0m explained_variance \u001b[38;5;241m=\u001b[39m \u001b[43mcalculate_explained_variance\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtest_results\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata_module\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 7\u001b[0m model_ves\u001b[38;5;241m.\u001b[39mappend(explained_variance) \u001b[38;5;66;03m# Append explained variance to the list\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPrinting test results...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "Cell \u001b[0;32mIn[30], line 23\u001b[0m, in \u001b[0;36mcalculate_explained_variance\u001b[0;34m(test_results, data_module, model)\u001b[0m\n\u001b[1;32m 21\u001b[0m predictions\u001b[38;5;241m.\u001b[39mappend(outputs)\n\u001b[1;32m 22\u001b[0m targets\u001b[38;5;241m.\u001b[39mappend(y)\n\u001b[0;32m---> 23\u001b[0m mse \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mnn\u001b[38;5;241m.\u001b[39mfunctional\u001b[38;5;241m.\u001b[39mmse_loss(\u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtensor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpredictions\u001b[49m\u001b[43m)\u001b[49m, torch\u001b[38;5;241m.\u001b[39mtensor(targets))\u001b[38;5;241m.\u001b[39mitem()\n\u001b[1;32m 24\u001b[0m var_y \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mvar(torch\u001b[38;5;241m.\u001b[39mtensor(targets))\u001b[38;5;241m.\u001b[39mitem() \n\u001b[1;32m 25\u001b[0m explained_variance \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;241m-\u001b[39m (mse \u001b[38;5;241m/\u001b[39m var_y)\n", + "\u001b[0;31mValueError\u001b[0m: only one element tensors can be converted to Python scalars" + ] + } + ], + "source": [ + "# --- Nonlinear Model ---\n", + "model = get_model(num_tfs)\n", + "trainer = get_trainer()\n", + "trainer.fit(model, data_module)\n", + "test_results = 
trainer.test(model, datamodule=data_module)\n", + "explained_variance = calculate_explained_variance(test_results, data_module, model)\n", + "model_ves.append(explained_variance) # Append explained variance to the list\n", + "print(\"Printing test results...\")\n", + "print(test_results)\n", + "print(\"Printing explained variance\")\n", + "print(explained_variance)\n", + "\n", + "\n", + "# --- Linear Model ---\n", + "linear_model = get_linear_model(num_tfs)\n", + "trainer = get_trainer()\n", + "trainer.fit(linear_model, data_module)\n", + "test_results = trainer.test(linear_model, datamodule=data_module)\n", + "explained_variance = calculate_explained_variance(test_results, data_module, linear_model)\n", + "linear_model_test_ves.append(explained_variance) # Append explained variance to the list\n", + "print(\"Printing linear model test results\")\n", + "print(test_results)\n", + "print(\"Printing linear model explained variance\")\n", + "print(explained_variance)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "#NOTE: replaced this kernel with the two above to implement the explained variance\n", + "\n", + "\n", + "# data_module = get_data_module(0.0)\n", + "# num_tfs = sum(data_module.n_sample)\n", + "\n", + "# # nonlinear model\n", + "# model = get_model(num_tfs)\n", + "# trainer = get_trainer()\n", + "# trainer.fit(model, data_module)\n", + "# test_results = trainer.test(model, datamodule=data_module)\n", + "# print(\"Printing test results...\")\n", + "# print(test_results)\n", + "# model_ves.append(test_results[0][\"test_ve\"])\n", + "\n", + "# # linear model\n", + "# linear_model = get_linear_model(num_tfs)\n", + "# trainer = get_trainer()\n", + "# trainer.fit(linear_model, data_module)\n", + "# test_results = trainer.test(linear_model, datamodule=data_module)\n", + "# print(\"Printing linear model test results\")\n", + "# print(test_results)\n", + "# 
linear_model_test_ves.append(test_results[0][\"test_ve\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# **Train models on data generated with normal mean adjustments**" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ { "name": "stderr", "output_type": "stream", @@ -654,24 +792,20 @@ "IPU available: False, using: 0 IPUs\n", "HPU available: False, using: 0 HPUs\n", "\n", - " | Name | Type | Params\n", - "----------------------------------------------\n", - "0 | mae | MeanAbsoluteError | 0 \n", - "1 | SMSE | SMSE | 0 \n", - "2 | linear1 | Linear | 110 \n", - "----------------------------------------------\n", - "110 Trainable params\n", + " | Name | Type | Params\n", + "----------------------------------------------------\n", + "0 | activation | LeakyReLU | 0 \n", + "1 | input_layer | Linear | 704 \n", + "2 | hidden_layers | ModuleList | 2.1 K \n", + "3 | output_layer | Linear | 330 \n", + "4 | dropout | Dropout | 0 \n", + "5 | mae | MeanAbsoluteError | 0 \n", + "6 | SMSE | SMSE | 0 \n", + "----------------------------------------------------\n", + "3.1 K Trainable params\n", "0 Non-trainable params\n", - "110 Total params\n", - "0.000 Total estimated model params size (MB)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Printing test results...\n", - "[{'test_mse': 1.5293521881103516, 'test_mae': 0.9961513876914978, 'test_smse': 8.054170608520508}]\n" + "3.1 K Total params\n", + "0.012 Total estimated model params size (MB)\n" ] }, { @@ -682,28 +816,21 @@ "version_minor": 0 }, "text/plain": [ - "Sanity Checking: | …" + "Sanity Checking: | | 0/? 
[00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "data_gen_methods = [\"No Mean Adjustment\", \"Dependent Mean Adjustment\", \"TF Dependent Mean Adjustment\", \"TF Dependent Mean Adjust with Boolean Logic\"]\n", + "plt.figure(figsize=(10, 6))\n", + "plt.scatter(data_gen_methods, model_ves, color='blue')\n", + "plt.scatter(data_gen_methods, linear_model_test_ves, color='orange')\n", + "plt.title('Model VE Comparison (bound mean = 3.0)')\n", + "plt.xlabel('Model')\n", + "plt.ylabel('VE')\n", + "plt.grid(True)\n", + "plt.xticks(rotation=45, ha=\"right\")\n", + "plt.legend(['Complex (Customizable) Model', 'Linear Model'])\n", + "plt.tight_layout() # Adjust layout to make room for the rotated x-axis labels\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Seed set to 42\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bound (1) and Unbound (0) Labels for gene 0:\n", + "tensor([0., 0., 0., 1., 1., 1., 1., 1., 0., 1.])\n", + "iteration 5 completed\n", + "iteration 10 completed\n", + "iteration 15 completed\n", + "iteration 20 completed\n", + "iteration 25 completed\n", + "iteration 30 completed\n", + "iteration 35 completed\n", + "iteration 40 completed\n", + "iteration 45 completed\n", + "iteration 50 completed\n" + ] + } + ], + "source": [ + "# imports\n", + "from yeastdnnexplorer.probability_models.generate_data import (\n", + " generate_gene_population, \n", + " generate_binding_effects, \n", + " generate_pvalues, \n", + " generate_perturbation_effects\n", + ")\n", + "\n", + "from yeastdnnexplorer.probability_models.util import (\n", + " calculate_explained_variance\n", + ")\n", + "\n", + "import torch\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from 
yeastdnnexplorer.probability_models.relation_classes import Relation, And, Or\n", + "from yeastdnnexplorer.probability_models.generate_data import (\n", + " default_perturbation_effect_adjustment_function,\n", + " perturbation_effect_adjustment_function_with_tf_relationships,\n", + " perturbation_effect_adjustment_function_with_tf_relationships_boolean_logic\n", + ")\n", + "\n", + "from pytorch_lightning import Trainer, seed_everything\n", + "from torch.utils.data import DataLoader, TensorDataset\n", + "from sklearn.metrics import explained_variance_score\n", + "\n", + "from yeastdnnexplorer.data_loaders.synthetic_data_loader import SyntheticDataLoader\n", + "from yeastdnnexplorer.ml_models.simple_model import SimpleModel\n", + "from yeastdnnexplorer.ml_models.customizable_model import CustomizableModel\n", + "\n", + "seed_everything(42)\n", + "\n", + "n_genes = 3000\n", + "bound = [0.5, 0.5, 0.5, 0.5, 0.5]\n", + "n_sample = [1, 1, 2, 2, 4]\n", + "\n", + "# Generate gene populations\n", + "gene_populations_list = []\n", + "for bound_proportion, n_draws in zip(bound, n_sample):\n", + " for _ in range(n_draws):\n", + " gene_populations_list.append(generate_gene_population(n_genes, bound_proportion))\n", + " \n", + "# Generate binding data for each gene population\n", + "binding_effect_list = [generate_binding_effects(gene_population) for gene_population in gene_populations_list]\n", + "\n", + "# Calculate p-values for binding data\n", + "binding_pvalue_list = [generate_pvalues(binding_data) for binding_data in binding_effect_list]\n", + "\n", + "# Combine binding data into a tensor\n", + "binding_data_combined = [torch.stack((gene_population.labels, binding_effect, binding_pval), dim=1)\n", + " for gene_population, binding_effect, binding_pval in zip(gene_populations_list, binding_effect_list, binding_pvalue_list)]\n", + "binding_data_tensor = torch.stack(binding_data_combined, dim=1)\n", + "\n", + "# TF relationships\n", + "tf_relationships = {\n", + " 0: [1],\n", + 
" 1: [8],\n", + " 2: [5, 6],\n", + " 3: [4],\n", + " 4: [5],\n", + " 5: [9],\n", + " 6: [4],\n", + " 7: [1, 4],\n", + " 8: [6],\n", + " 9: [4],\n", + "}\n", + "\n", + "tf_relationships_dict_boolean_logic = {\n", + " 0: [And(3, 4, 8), Or(3, 7), Or(1, 1)],\n", + " 1: [And(5, Or(7, 8))],\n", + " 2: [],\n", + " 3: [Or(7, 9), And(6, 7)],\n", + " 4: [And(1, 2)],\n", + " 5: [Or(0, 1, 2, 8, 9)],\n", + " 6: [And(0, Or(1, 2))],\n", + " 7: [Or(2, And(5, 6, 9))],\n", + " 8: [],\n", + " 9: [And(6, And(3, Or(0, 9)))],\n", + "}\n", + "\n", + "def experiment(n_iterations=10, GENE_IDX=0):\n", + " print(\"Bound (1) and Unbound (0) Labels for gene \" + str(GENE_IDX) + \":\")\n", + " print(binding_data_tensor[GENE_IDX, :, 0])\n", + "\n", + " num_tfs = sum(n_sample)\n", + " \n", + " no_mean_adjustment_scores = torch.zeros(num_tfs)\n", + " normal_mean_adjustment_scores = torch.zeros(num_tfs)\n", + " dep_mean_adjustment_scores = torch.zeros(num_tfs)\n", + " boolean_logic_scores = torch.zeros(num_tfs)\n", + "\n", + " for i in range(n_iterations):\n", + " # Method 1: Generate perturbation effects without mean adjustment\n", + " perturbation_effects_list_no_mean_adjustment = [generate_perturbation_effects(binding_data_tensor[:, tf_index, :].unsqueeze(1), tf_index=0) \n", + " for tf_index in range(num_tfs)]\n", + " perturbation_effects_list_no_mean_adjustment = torch.stack(perturbation_effects_list_no_mean_adjustment, dim=1)\n", + "\n", + " # Method 2: Generate perturbation effects with normal mean adjustment\n", + " perturbation_effects_list_normal_mean_adjustment = generate_perturbation_effects(\n", + " binding_data_tensor, \n", + " max_mean_adjustment=10.0\n", + " )\n", + "\n", + " # Method 3: Generate perturbation effects with dependent mean adjustment\n", + " perturbation_effects_list_dep_mean_adjustment = generate_perturbation_effects(\n", + " binding_data_tensor, \n", + " tf_relationships=tf_relationships,\n", + " 
adjustment_function=perturbation_effect_adjustment_function_with_tf_relationships,\n", + " max_mean_adjustment=10.0,\n", + " )\n", + " \n", + " # Method 4: Generate perturbation effects with binary relations between the TFs\n", + " perturbation_effects_list_boolean_logic = generate_perturbation_effects(\n", + " binding_data_tensor, \n", + " adjustment_function=perturbation_effect_adjustment_function_with_tf_relationships_boolean_logic,\n", + " tf_relationships=tf_relationships_dict_boolean_logic,\n", + " max_mean_adjustment=10.0,\n", + " )\n", + "\n", + " no_mean_adjustment_scores += abs(perturbation_effects_list_no_mean_adjustment[GENE_IDX, :])\n", + " normal_mean_adjustment_scores += abs(perturbation_effects_list_normal_mean_adjustment[GENE_IDX, :])\n", + " dep_mean_adjustment_scores += abs(perturbation_effects_list_dep_mean_adjustment[GENE_IDX, :])\n", + " boolean_logic_scores += abs(perturbation_effects_list_boolean_logic[GENE_IDX, :])\n", + "\n", + " if (i + 1) % 5 == 0:\n", + " print(f\"iteration {i+1} completed\")\n", + " \n", + " no_mean_adjustment_scores /= n_iterations\n", + " normal_mean_adjustment_scores /= n_iterations\n", + " dep_mean_adjustment_scores /= n_iterations\n", + " boolean_logic_scores /= n_iterations\n", + " \n", + " return no_mean_adjustment_scores, normal_mean_adjustment_scores, dep_mean_adjustment_scores, boolean_logic_scores\n", + "\n", + "GENE_IDX = 0\n", + "experiment_results = experiment(n_iterations=50, GENE_IDX=GENE_IDX)\n", + "\n", + "def get_data_module(max_mean_adjustment, adjustment_function=default_perturbation_effect_adjustment_function, tf_relationships_dict={}):\n", + " return SyntheticDataLoader(\n", + " batch_size=32,\n", + " num_genes=4000,\n", + " bound_mean=3.0,\n", + " bound=[0.5] * 5,\n", + " n_sample=[1, 1, 2, 2, 4],\n", + " val_size=0.1,\n", + " test_size=0.1,\n", + " random_state=42,\n", + " max_mean_adjustment=max_mean_adjustment,\n", + " adjustment_function=adjustment_function,\n", + " 
tf_relationships=tf_relationships_dict,\n", + " )\n", + "\n", + "def get_model(num_tfs):\n", + " return CustomizableModel(\n", + " input_dim=num_tfs,\n", + " output_dim=num_tfs,\n", + " lr=0.01,\n", + " hidden_layer_num=2,\n", + " hidden_layer_sizes=[64, 32],\n", + " activation=\"LeakyReLU\",\n", + " optimizer=\"RMSprop\",\n", + " L2_regularization_term=0.0,\n", + " dropout_rate=0.0,\n", + " )\n", + "\n", + "def get_linear_model(num_tfs):\n", + " return SimpleModel(\n", + " input_dim=num_tfs,\n", + " output_dim=num_tfs,\n", + " lr=0.01\n", + " )\n", + "\n", + "def get_trainer():\n", + " return Trainer(\n", + " max_epochs=10,\n", + " deterministic=True,\n", + " accelerator=\"cpu\",\n", + " )\n", + "\n", + "model_ves = []\n", + "linear_model_test_ves = []" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:260: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " X_train, Y_train = torch.tensor(X_train, dtype=torch.float32), torch.tensor(\n", + "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:263: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " X_val, Y_val = torch.tensor(X_val, dtype=torch.float32), torch.tensor(\n", + "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:266: UserWarning: To copy construct from a tensor, it is 
recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " X_test, Y_test = torch.tensor(X_test, dtype=torch.float32), torch.tensor(\n", + "\n", + " | Name | Type | Params\n", + "----------------------------------------------------\n", + "0 | activation | LeakyReLU | 0 \n", + "1 | input_layer | Linear | 704 \n", + "2 | hidden_layers | ModuleList | 2.1 K \n", + "3 | output_layer | Linear | 330 \n", + "4 | dropout | Dropout | 0 \n", + "5 | mae | MeanAbsoluteError | 0 \n", + "6 | SMSE | SMSE | 0 \n", + "----------------------------------------------------\n", + "3.1 K Trainable params\n", + "0 Non-trainable params\n", + "3.1 K Total params\n", + "0.012 Total estimated model params size (MB)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | | 0/? [00:00" ] @@ -2490,15 +4712,15 @@ "source": [ "data_gen_methods = [\"No Mean Adjustment\", \"Dependent Mean Adjustment\", \"TF Dependent Mean Adjustment\", \"TF Dependent Mean Adjust with Boolean Logic\"]\n", "plt.figure(figsize=(10, 6))\n", - "plt.scatter(data_gen_methods, model_mses, color='blue')\n", - "plt.scatter(data_gen_methods, linear_model_test_mses, color='orange')\n", - "plt.title('Model MSE Comparison (bound mean = 3.0)')\n", + "plt.scatter(data_gen_methods, model_ves, color='blue')\n", + "plt.scatter(data_gen_methods, linear_model_test_ves, color='orange')\n", + "plt.title('Model VE Comparison (bound mean = 3.0)')\n", "plt.xlabel('Model')\n", - "plt.ylabel('MSE')\n", + "plt.ylabel('Variance Explained')\n", "plt.grid(True)\n", "plt.xticks(rotation=45, ha=\"right\")\n", "plt.legend(['Complex (Customizable) Model', 'Linear Model'])\n", - "plt.tight_layout() # Adjust layout to make room for the rotated x-axis labels\n", + "plt.tight_layout()\n", "plt.show()" ] }, diff --git 
a/yeastdnnexplorer/probability_models/generate_data.py b/yeastdnnexplorer/probability_models/generate_data.py index b0d2047..6f00e90 100644 --- a/yeastdnnexplorer/probability_models/generate_data.py +++ b/yeastdnnexplorer/probability_models/generate_data.py @@ -273,7 +273,7 @@ def default_perturbation_effect_adjustment_function( # divide its enrichment score by the maximum magnitude possible to # create an adjustment multipler that scales with increasing enrichment adjustment_multiplier = enrichment_scores[gene_idx, tf_index] / abs( - enrichment_scores.max() + enrichment_scores.max() * 10 ) # randomly adjust the gene by some portion of the max adjustment @@ -374,9 +374,12 @@ def perturbation_effect_adjustment_function_with_tf_relationships_boolean_logic( relation.evaluate(bound_labels[gene_idx].tolist()) for relation in relations ): - # draw a random value between 0 and 1 to use to - # control magnitude of adjustment - adjustment_multiplier = torch.rand(1) + # OLD: adjustment_multiplier = torch.rand(1) + # divide its enrichment score by the maximum magnitude possible to + # create an adjustment multipler that scales with increasing enrichment + adjustment_multiplier = enrichment_scores[gene_idx, tf_index] / abs( + enrichment_scores.max() * 10 + ) # randomly adjust the gene by some portion of the max adjustment adjusted_mean_matrix[gene_idx, tf_index] = bound_mean + ( @@ -469,9 +472,12 @@ def perturbation_effect_adjustment_function_with_tf_relationships( if bound_labels[gene_idx, tf_index] == 1 and torch.all( bound_labels[gene_idx, related_tfs] == 1 ): - # draw a random value between 0 and 1 to use to - # control magnitude of adjustment - adjustment_multiplier = torch.rand(1) + # OLD: adjustment_multiplier = torch.rand(1) + # divide its enrichment score by the maximum magnitude possible to + # create an adjustment multipler that scales with increasing enrichment + adjustment_multiplier = enrichment_scores[gene_idx, tf_index] / abs( + enrichment_scores.max() * 10 + ) 
# randomly adjust the gene by some portion of the max adjustment adjusted_mean_matrix[gene_idx, tf_index] = bound_mean + ( From e133afa5e2e2b07e03e1bb4058417e065e7f843e Mon Sep 17 00:00:00 2001 From: Chase Mateusiak Date: Mon, 10 Jun 2024 17:54:50 -0500 Subject: [PATCH 3/7] Update pyproject.toml This is already in the dev dependencies. I forgot to go over that. To add 'production' dependencies with python, you add to the default dependencies section with just: ``` poetry add ``` You can also add dependencies to a group, eg: ``` poetry add --group dev ``` See https://python-poetry.org/docs/cli/#options-4 That way, you can control what dependencies get installed. For a typical user, I don't think we'll want to install jupyter in the environment. They should have jupyter in their environment, and then install yeastdnnexplorer into it. --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f994d36..2a7d309 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,6 @@ tensorboard = "^2.16.1" torchsummary = "^1.5.1" optuna = "^3.6.0" optuna-dashboard = "^0.15.1" -jupyter = "^1.0.0" [tool.poetry.group.dev.dependencies] jupyter = "^1.0.0" From 84fa6eb646dbd1d6782c26be16164c362c2baff8 Mon Sep 17 00:00:00 2001 From: ejiawustl Date: Mon, 22 Jul 2024 14:58:24 -0700 Subject: [PATCH 4/7] parameterizing the max_adjustment value and adding the calculate_variance_explained function and test suite --- .../probability_models/generate_data.py | 4 +- yeastdnnexplorer/probability_models/util.py | 50 +++++++++++ .../tests/probability_models/test_util.py | 89 +++++++++++++++++++ 3 files changed, 141 insertions(+), 2 deletions(-) create mode 100644 yeastdnnexplorer/probability_models/util.py create mode 100644 yeastdnnexplorer/tests/probability_models/test_util.py diff --git a/yeastdnnexplorer/probability_models/generate_data.py b/yeastdnnexplorer/probability_models/generate_data.py index 6f00e90..a0b6b40 100644 --- 
a/yeastdnnexplorer/probability_models/generate_data.py +++ b/yeastdnnexplorer/probability_models/generate_data.py @@ -378,7 +378,7 @@ def perturbation_effect_adjustment_function_with_tf_relationships_boolean_logic( # divide its enrichment score by the maximum magnitude possible to # create an adjustment multipler that scales with increasing enrichment adjustment_multiplier = enrichment_scores[gene_idx, tf_index] / abs( - enrichment_scores.max() * 10 + enrichment_scores.max() ) # randomly adjust the gene by some portion of the max adjustment @@ -476,7 +476,7 @@ def perturbation_effect_adjustment_function_with_tf_relationships( # divide its enrichment score by the maximum magnitude possible to # create an adjustment multipler that scales with increasing enrichment adjustment_multiplier = enrichment_scores[gene_idx, tf_index] / abs( - enrichment_scores.max() * 10 + enrichment_scores.max() ) # randomly adjust the gene by some portion of the max adjustment diff --git a/yeastdnnexplorer/probability_models/util.py b/yeastdnnexplorer/probability_models/util.py new file mode 100644 index 0000000..8bf32c9 --- /dev/null +++ b/yeastdnnexplorer/probability_models/util.py @@ -0,0 +1,50 @@ +import logging +from collections.abc import Callable + +import torch + +from yeastdnnexplorer.probability_models.relation_classes import Relation + +logger = logging.getLogger(__name__) + +from sklearn.metrics import explained_variance_score + + +def calculate_explained_variance(model, data_module): + """ + Calculates the explained variance score of the model's predictions on the test + dataset + + Parameters: + - model: The trained model to predict with + - data_module: The data module for the test data loader + + Returns: + - explained_variance: should be a float between 0 and 1 (but could be negative) + """ + + predictions = [] + targets = [] + + # Set the model to evaluation mode to disable dropout, + # batch normalization, etc. 
+ model.eval() + + # Disable gradient calculation to save memory and computation + # Iterate over the test data batches, get the input features (x) and true targets + # (y) from the batch, make predictions using the model, add everything to both lists + + with torch.no_grad(): + for batch in data_module.test_dataloader(): + x, y = batch + outputs = model(x) + predictions.append(outputs) + targets.append(y) + + # Concatenate all batch predictions and targets into single tensors + # They should be numpy arrays in order for explained_variance_score to work properly + predictions = torch.cat(predictions, dim=0).cpu().numpy() + targets = torch.cat(targets, dim=0).cpu().numpy() + + # Calculate and return the explained variance score + return explained_variance_score(targets, predictions) diff --git a/yeastdnnexplorer/tests/probability_models/test_util.py b/yeastdnnexplorer/tests/probability_models/test_util.py new file mode 100644 index 0000000..9c1a1be --- /dev/null +++ b/yeastdnnexplorer/tests/probability_models/test_util.py @@ -0,0 +1,89 @@ +import pytest +import torch +from sklearn.metrics import explained_variance_score +from yeastdnnexplorer.probability_models.util import calculate_explained_variance e + + +# Create a test model directly +def create_test_model(): + model = torch.nn.Linear(10, 1) + return model + + +# Create a specific model with known weights for testing +def create_specific_model(): + model = torch.nn.Linear(1, 1) + with torch.no_grad(): + model.weight.fill_(2.0) + model.bias.fill_(1.0) + return model + + +# Create test data directly +def create_test_data(): + x = torch.randn(100, 10) + y = torch.randn(100, 1) + dataset = torch.utils.data.TensorDataset(x, y) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=10) + return dataloader + + +# Create specific test data for testing +# This data uses the linear relationship: y = 2x + 1 +# You can modify this if needed to make new tests in the future +def create_specific_data(): + x = 
torch.arange(0, 10, dtype=torch.float32).unsqueeze(1) + y = 2 * x + 1 + dataset = torch.utils.data.TensorDataset(x, y) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=10) + return dataloader + +class DataModule: + def test_dataloader(self): + return dataloader + +def test_calculate_explained_variance(): + # Create test data + dataloader = create_test_data() + + # Create a test model and set it to evaluation mode to prevent model from udpating + model = create_test_model() + model.eval() + + # Create a mock data module structure + data_module = DataModule() + + # Calculate explained variance using the function + explained_variance = calculate_explained_variance(model, data_module) + + # Assert that the explained variance is a float and between 0-1 (it could be neg) + assert isinstance(explained_variance, float) + assert 0 <= explained_variance <= 1 + + +def test_specific_model_explained_variance(): + # Create specific test data based on the functions at the top + dataloader = create_specific_data() + + # Create a specific model with known weights and set it to evaluation mode + model = create_specific_model() + model.eval() + + # Create a mock data module structure + data_module = DataModule() + + # Calculate explained variance using the function + explained_variance = calculate_explained_variance(model, data_module) + + # Expected explained variance is 1 since the model should perfectly fit the line + expected_explained_variance = 1.0 + + # Asset that the calculated explained variance is a float and equal to 1.0 + assert isinstance(explained_variance, float), "Explained variance should be a float" + assert ( + explained_variance == expected_explained_variance + ), f"Explained variance should be {expected_explained_variance}" + + +if __name__ == "__main__": + pytest.main([__file__]) From be595137555678d7eb45e6b8649e5075b3260c3f Mon Sep 17 00:00:00 2001 From: ejiawustl Date: Tue, 23 Jul 2024 10:22:10 -0700 Subject: [PATCH 5/7] removing the function and 
test suite for calculating the variance explained and adding the function to the visualizing_and_testing_data_generation_methods notebook --- ..._and_testing_data_generation_methods.ipynb | 431 ++++++++++++++---- yeastdnnexplorer/probability_models/util.py | 50 -- .../tests/probability_models/test_util.py | 89 ---- 3 files changed, 333 insertions(+), 237 deletions(-) delete mode 100644 yeastdnnexplorer/probability_models/util.py delete mode 100644 yeastdnnexplorer/tests/probability_models/test_util.py diff --git a/docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb b/docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb index 88347bb..1a320ad 100644 --- a/docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb +++ b/docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -29,6 +29,8 @@ "\n", "import torch\n", "import matplotlib.pyplot as plt\n", + "from sklearn.metrics import explained_variance_score\n", + "from typing import Dict, Any\n", "\n", "from yeastdnnexplorer.probability_models.relation_classes import Relation, And, Or\n", "from yeastdnnexplorer.probability_models.generate_data import (\n", @@ -303,7 +305,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -336,7 +338,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -383,12 +385,42 @@ " accelerator=\"cpu\",\n", " # callbacks=[best_model_checkpoint, periodic_checkpoint],\n", " # logger=[tb_logger, csv_logger],\n", - " )" + " )\n", + "\n", + "def calculate_explained_variance(test_results: Dict[str, Any], data_module: Any, model: torch.nn.Module):\n", + " \"\"\"\n", + " Calculates the explained variance score using PyTorch and scikit-learn.\n", + "\n", + " Params:\n", + 
" test_results (Dict[str, Any]): The results dictionary from the trainer.test() function.\n", + " data_module (Any): The data module containing the test dataloader.\n", + " model (torch.nn.Module): The trained PyTorch neural network model.\n", + "\n", + " Returns:\n", + " float: The explained variance score.\n", + " \"\"\"\n", + " predictions = []\n", + " targets = []\n", + "\n", + " model.eval() # Set the model to evaluation mode\n", + "\n", + " with torch.no_grad(): # Disable gradient calculation\n", + " for batch in data_module.test_dataloader():\n", + " # Assuming your data is in the format (x, y)\n", + " x, y = batch\n", + " outputs = model(x)\n", + " predictions.append(outputs)\n", + " targets.append(y)\n", + "\n", + " predictions = torch.cat(predictions, dim=0).numpy() # Concatenate predictions\n", + " targets = torch.cat(targets, dim=0).numpy() # Concatenate targets\n", + "\n", + " return explained_variance_score(targets, predictions)" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -399,66 +431,14 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "import torch\n", - "from sklearn.metrics import explained_variance_score\n", - "\n", "data_module = get_data_module(0.0)\n", "num_tfs = sum(data_module.n_sample)\n", "model_ves = [] # List to store explained variance for the non-linear model\n", - "linear_model_test_ves = [] # List to store explained variance for the linear model\n", - "\n", - "def calculate_explained_variance(test_results, data_module, model):\n", - " predictions = []\n", - " targets = []\n", - "\n", - " model.eval() # Set the model to evaluation mode\n", - "\n", - " with torch.no_grad(): # Disable gradient calculation\n", - " for batch in data_module.test_dataloader():\n", - " # Assuming your data is in the format (x, y)\n", - " x, y = batch\n", - " outputs = model(x)\n", - " 
predictions.append(outputs)\n", - " targets.append(y)\n", - " mse = torch.nn.functional.mse_loss(torch.tensor(predictions), torch.tensor(targets)).item()\n", - " var_y = torch.var(torch.tensor(targets)).item() \n", - " explained_variance = 1 - (mse / var_y)\n", - " return explained_variance \n", - "\n", - "# # Function to calculate explained variance from test results\n", - "# def calculate_explained_variance(test_results, data_module, model):\n", - "# \"\"\"\n", - "# Calculates the explained variance score using PyTorch and scikit-learn.\n", - "\n", - "# Args:\n", - "# test_results: The results dictionary from the trainer.test() function.\n", - "# data_module: The data module containing the test dataloader.\n", - "# model: The trained PyTorch model.\n", - "\n", - "# Returns:\n", - "# float: The explained variance score.\n", - "# \"\"\"\n", - "# predictions = []\n", - "# targets = []\n", - "\n", - "# model.eval() # Set the model to evaluation mode\n", - "\n", - "# with torch.no_grad(): # Disable gradient calculation\n", - "# for batch in data_module.test_dataloader():\n", - "# # Assuming your data is in the format (x, y)\n", - "# x, y = batch\n", - "# outputs = model(x)\n", - "# predictions.append(outputs)\n", - "# targets.append(y)\n", - "\n", - "# predictions = torch.cat(predictions, dim=0).numpy() # Concatenate predictions\n", - "# targets = torch.cat(targets, dim=0).numpy() # Concatenate targets\n", - "\n", - "# return explained_variance_score(targets, predictions)" + "linear_model_test_ves = [] # List to store explained variance for the linear model" ] }, { @@ -470,7 +450,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -481,6 +461,13 @@ "TPU available: False, using: 0 TPU cores\n", "IPU available: False, using: 0 IPUs\n", "HPU available: False, using: 0 HPUs\n", + "Missing logger folder: /Users/ericjia/yeastdnnexplorer/docs/tutorials/lightning_logs\n", + 
"/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:260: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " X_train, Y_train = torch.tensor(X_train, dtype=torch.float32), torch.tensor(\n", + "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:263: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " X_val, Y_val = torch.tensor(X_val, dtype=torch.float32), torch.tensor(\n", + "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:266: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " X_test, Y_test = torch.tensor(X_test, dtype=torch.float32), torch.tensor(\n", "\n", " | Name | Type | Params\n", "----------------------------------------------------\n", @@ -501,26 +488,34 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "", + "model_id": "d1244d8d5c6342d9b9070d36b4cf635b", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Sanity Checking: | | 0/? 
[00:00 6\u001b[0m explained_variance \u001b[38;5;241m=\u001b[39m \u001b[43mcalculate_explained_variance\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtest_results\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata_module\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 7\u001b[0m model_ves\u001b[38;5;241m.\u001b[39mappend(explained_variance) \u001b[38;5;66;03m# Append explained variance to the list\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPrinting test results...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "Cell \u001b[0;32mIn[30], line 23\u001b[0m, in \u001b[0;36mcalculate_explained_variance\u001b[0;34m(test_results, data_module, model)\u001b[0m\n\u001b[1;32m 21\u001b[0m predictions\u001b[38;5;241m.\u001b[39mappend(outputs)\n\u001b[1;32m 22\u001b[0m targets\u001b[38;5;241m.\u001b[39mappend(y)\n\u001b[0;32m---> 23\u001b[0m mse \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mnn\u001b[38;5;241m.\u001b[39mfunctional\u001b[38;5;241m.\u001b[39mmse_loss(\u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtensor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpredictions\u001b[49m\u001b[43m)\u001b[49m, torch\u001b[38;5;241m.\u001b[39mtensor(targets))\u001b[38;5;241m.\u001b[39mitem()\n\u001b[1;32m 24\u001b[0m var_y \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mvar(torch\u001b[38;5;241m.\u001b[39mtensor(targets))\u001b[38;5;241m.\u001b[39mitem() \n\u001b[1;32m 25\u001b[0m explained_variance \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;241m-\u001b[39m (mse \u001b[38;5;241m/\u001b[39m var_y)\n", - "\u001b[0;31mValueError\u001b[0m: only one element tensors can be converted to Python scalars" + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", 
+ "HPU available: False, using: 0 HPUs\n", + "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:260: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " X_train, Y_train = torch.tensor(X_train, dtype=torch.float32), torch.tensor(\n", + "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:263: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " X_val, Y_val = torch.tensor(X_val, dtype=torch.float32), torch.tensor(\n", + "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:266: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " X_test, Y_test = torch.tensor(X_test, dtype=torch.float32), torch.tensor(\n", + "\n", + " | Name | Type | Params\n", + "----------------------------------------------\n", + "0 | mae | MeanAbsoluteError | 0 \n", + "1 | SMSE | SMSE | 0 \n", + "2 | linear1 | Linear | 110 \n", + "----------------------------------------------\n", + "110 Trainable params\n", + "0 Non-trainable params\n", + "110 Total params\n", + "0.000 Total estimated model params size (MB)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Printing test results...\n", + "[{'test_mse': 1.4547666311264038, 'test_mae': 0.9667437672615051, 'test_smse': 7.648471832275391}]\n", + "Printing explained variance\n", + "0.24665151238441468\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f08e2062f52b4c13a1f4c021f481018c", + "version_major": 2, + "version_minor": 0 + }, 
+ "text/plain": [ + "Sanity Checking: | | 0/? [00:00 Date: Mon, 5 Aug 2024 13:03:31 -0700 Subject: [PATCH 6/7] Added docstrings and typehinting, removed unnecessary work and added exposition to graphs and methods --- ..._and_testing_data_generation_methods.ipynb | 2891 ++--------------- 1 file changed, 278 insertions(+), 2613 deletions(-) diff --git a/docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb b/docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb index 88347bb..fee8e14 100644 --- a/docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb +++ b/docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb @@ -17,2589 +17,7 @@ }, { "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "# imports\n", - "from yeastdnnexplorer.probability_models.generate_data import (generate_gene_population, \n", - " generate_binding_effects,\n", - " generate_pvalues,\n", - " generate_perturbation_effects)\n", - "\n", - "import torch\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from yeastdnnexplorer.probability_models.relation_classes import Relation, And, Or\n", - "from yeastdnnexplorer.probability_models.generate_data import (\n", - " default_perturbation_effect_adjustment_function,\n", - " perturbation_effect_adjustment_function_with_tf_relationships,\n", - " perturbation_effect_adjustment_function_with_tf_relationships_boolean_logic\n", - ")\n", - "\n", - "from pytorch_lightning import Trainer, LightningModule, seed_everything\n", - "from pytorch_lightning.callbacks import ModelCheckpoint\n", - "from pytorch_lightning.loggers import CSVLogger, TensorBoardLogger\n", - "from torchsummary import summary\n", - "\n", - "from yeastdnnexplorer.data_loaders.synthetic_data_loader import SyntheticDataLoader\n", - "from yeastdnnexplorer.ml_models.simple_model import SimpleModel\n", - "from yeastdnnexplorer.ml_models.customizable_model import CustomizableModel\n", - "\n", - 
"torch.manual_seed(42) # For CPU\n", - "torch.cuda.manual_seed_all(42) # For all CUDA devices" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Generating the binding data will be the same as always, see `generate_in_silico_data.ipynb`" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "n_genes = 3000\n", - "\n", - "bound = [0.5, 0.5, 0.5, 0.5, 0.5]\n", - "n_sample = [1, 1, 2, 2, 4]\n", - "\n", - "# this will be a list of length 10 with a GenePopulation object in each element\n", - "gene_populations_list = []\n", - "for bound_proportion, n_draws in zip(bound, n_sample):\n", - " for _ in range(n_draws):\n", - " gene_populations_list.append(generate_gene_population(n_genes, bound_proportion))\n", - " \n", - "# Generate binding data for each gene population\n", - "binding_effect_list = [generate_binding_effects(gene_population)\n", - " for gene_population in gene_populations_list]\n", - "\n", - "# Calculate p-values for binding data\n", - "binding_pvalue_list = [generate_pvalues(binding_data) for binding_data in binding_effect_list]\n", - "\n", - "binding_data_combined = [torch.stack((gene_population.labels, binding_effect, binding_pval), dim=1)\n", - " for gene_population, binding_effect, binding_pval\n", - " in zip (gene_populations_list, binding_effect_list, binding_pvalue_list)]\n", - "\n", - "# Stack along a new dimension (dim=1) to create a tensor of shape [num_genes, num_TFs, 3]\n", - "binding_data_tensor = torch.stack(binding_data_combined, dim=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we define our experiment, this function will return the average perturbation effects (across n_iterations iterations) for each TF for a specific gene for each of the 4 data generation method we have at our disposal. 
Due to the randomness in the generated data, we need to find the averages over a number of iterations to get the true common values.\n", - "\n", - "We also need to define dictionaries of TF relationships for our third and fourth methods of generating perturbation data, see `generate_in_silico_data.ipynb` for an explanation of what these represent and how they are used / structured. The documentation in `generate_data.py` may be helpful as well." - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [], - "source": [ - "tf_relationships = {\n", - " 0: [1],\n", - " 1: [8],\n", - " 2: [5, 6],\n", - " 3: [4],\n", - " 4: [5],\n", - " 5: [9],\n", - " 6: [4],\n", - " 7: [1, 4],\n", - " 8: [6],\n", - " 9: [4],\n", - "}\n", - "\n", - "tf_relationships_dict_boolean_logic = {\n", - " 0: [And(3, 4, 8), Or(3, 7), Or(1, 1)],\n", - " 1: [And(5, Or(7, 8))],\n", - " 2: [],\n", - " 3: [Or(7, 9), And(6, 7)],\n", - " 4: [And(1, 2)],\n", - " 5: [Or(0, 1, 2, 8, 9)],\n", - " 6: [And(0, Or(1, 2))],\n", - " 7: [Or(2, And(5, 6, 9))],\n", - " 8: [],\n", - " 9: [And(6, And(3, Or(0, 9)))],\n", - "}\n", - "\n", - "def experiment(n_iterations = 10, GENE_IDX = 0):\n", - " print(\"Bound (1) and Unbound (0) Labels for gene \" + str(GENE_IDX) + \":\")\n", - " print(binding_data_tensor[GENE_IDX, :, 0])\n", - "\n", - " num_tfs = sum(n_sample)\n", - " \n", - " no_mean_adjustment_scores = torch.zeros(num_tfs)\n", - " normal_mean_adjustment_scores = torch.zeros(num_tfs)\n", - " dep_mean_adjustment_scores = torch.zeros(num_tfs)\n", - " boolean_logic_scores = torch.zeros(num_tfs)\n", - "\n", - " # we generate perturbation effects for each TF on each iteration and then add them to the running totals\n", - " for i in range(n_iterations):\n", - " # Method 1: Generate perturbation effects without mean adjustment\n", - " perturbation_effects_list_no_mean_adjustment = [generate_perturbation_effects(binding_data_tensor[:, tf_index, :].unsqueeze(1), tf_index=0) \n", - " for 
tf_index in range(sum(n_sample))]\n", - " perturbation_effects_list_no_mean_adjustment = torch.stack(perturbation_effects_list_no_mean_adjustment, dim=1)\n", - "\n", - " # Method 2: Generate perturbation effects with normal mean adjustment\n", - " perturbation_effects_list_normal_mean_adjustment = generate_perturbation_effects(\n", - " binding_data_tensor, \n", - " max_mean_adjustment=10.0\n", - " )\n", - "\n", - " # Method 3: Generate perturbation effects with dependent mean adjustment\n", - " perturbation_effects_list_dep_mean_adjustment = generate_perturbation_effects(\n", - " binding_data_tensor, \n", - " tf_relationships=tf_relationships,\n", - " adjustment_function=perturbation_effect_adjustment_function_with_tf_relationships,\n", - " max_mean_adjustment=10.0,\n", - " )\n", - " \n", - " # Method 4: Generate perturbation effects with binary relations between the TFs\n", - " perturbation_effects_list_boolean_logic = generate_perturbation_effects(\n", - " binding_data_tensor, \n", - " adjustment_function=perturbation_effect_adjustment_function_with_tf_relationships_boolean_logic,\n", - " tf_relationships=tf_relationships_dict_boolean_logic,\n", - " max_mean_adjustment=10.0,\n", - " )\n", - "\n", - " # take absolute values since we only care about the magnitude of the effects\n", - " no_mean_adjustment_scores += abs(perturbation_effects_list_no_mean_adjustment[GENE_IDX, :])\n", - " normal_mean_adjustment_scores += abs(perturbation_effects_list_normal_mean_adjustment[GENE_IDX, :])\n", - " dep_mean_adjustment_scores += abs(perturbation_effects_list_dep_mean_adjustment[GENE_IDX, :])\n", - " boolean_logic_scores += abs(perturbation_effects_list_boolean_logic[GENE_IDX, :])\n", - "\n", - " if (i + 1) % 5 == 0:\n", - " print(f\"iteration {i+1} completed\")\n", - " \n", - " # divide by the number of iterations to get the averages\n", - " no_mean_adjustment_scores /= n_iterations\n", - " normal_mean_adjustment_scores /= n_iterations\n", - " dep_mean_adjustment_scores /= 
n_iterations\n", - " boolean_logic_scores /= n_iterations\n", - " \n", - " return no_mean_adjustment_scores, normal_mean_adjustment_scores, dep_mean_adjustment_scores, boolean_logic_scores" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can run the experiment for n_iterations, I find that you should iterate at least 30 times, but closer to 100 is most ideal. This could take 1-5 minutes depending on your computer." - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bound (1) and Unbound (0) Labels for gene 0:\n", - "tensor([0., 0., 0., 1., 1., 1., 1., 1., 0., 1.])\n", - "iteration 5 completed\n", - "iteration 10 completed\n", - "iteration 15 completed\n", - "iteration 20 completed\n", - "iteration 25 completed\n", - "iteration 30 completed\n", - "iteration 35 completed\n", - "iteration 40 completed\n", - "iteration 45 completed\n", - "iteration 50 completed\n" - ] - } - ], - "source": [ - "GENE_IDX = 0\n", - "experiment_results = experiment(n_iterations=50, GENE_IDX=GENE_IDX)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We now plot our results." - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bound (bound) TFs for gene 0 are: [3, 4, 5, 6, 7, 9]\n", - "Unbound (unbound) TFs for gene 0 are: [0, 1, 2, 8]\n", - "tensor([0., 0., 0., 1., 1., 1., 1., 1., 0., 1.])\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "x_vals = list(range(sum(n_sample)))\n", - "print(\"Bound (bound) TFs for gene \" + str(GENE_IDX) + \" are: \" + str(binding_data_tensor[GENE_IDX, :, 0].nonzero().flatten().tolist()))\n", - "print(\"Unbound (unbound) TFs for gene \" + str(GENE_IDX) + \" are: \" + str((1 - binding_data_tensor[GENE_IDX, :, 0]).nonzero().flatten().tolist()))\n", - "print(binding_data_tensor[GENE_IDX, :, 0])\n", - "plt.figure(figsize=(10, 6))\n", - "\n", - "# Plot each set of experiment results with a different color\n", - "colors = ['red', 'green', 'blue', 'orange']\n", - "for index, results in enumerate(experiment_results):\n", - " plt.scatter(x_vals, results, color=colors[index])\n", - "\n", - "plt.title('Pertubation Effects for Gene ' + str(GENE_IDX) + ' with Different Adjustment Functions (averaged across 100 trials)')\n", - "plt.xlabel('TF Index')\n", - "plt.ylabel('Perturbation Effect Val')\n", - "\n", - "#added to compare this to previous graph, REMOVE LATER\n", - "plt.ylim(0,9)\n", - "\n", - "\n", - "plt.xticks(x_vals)\n", - "plt.grid(True)\n", - "plt.legend(['No Mean Adjustment', 'Normal (non-dependent) Mean Adjust', 'Dependent Mean Adjustment', 'Boolean Logic Adjustment'])\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Recall that for the dependent mean adjustment, the TF in question must be bound and all of the TFs in its dependency array (in the tf_relationships dictionary) must be bound as well. 
This is why we do not adjust the mean for TF 7 despite it being bound, it depends on TF 1 and TF 4 both being bound, and TF1 is not bound.\n", - "\n", - "Similarly, for the boolean logic adjustment, we do not adjust the mean for 6 despite it being bound because it depends on (TF0 && (TF1 || TF2)) being bound, and none of those 3 TFs are bound to the gene we are studying.\n", - "\n", - "Note that if you change GENE_IDX, the random seed, or any of the relationship dictionaris that this explanation will no longer apply to the data you are seeing in the plot." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training models on data generated from the 4 different methods\n", - "In the next experiment, we will be training the exact same model on data generated from each of these 4 methods. We will also train a simple linear model on all four methods to use as a baseline to compare to. Other than the method used to generate the data, everything else will be held the same." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# define checkpoints and loggers\n", - "best_model_checkpoint = ModelCheckpoint(\n", - " monitor=\"val_explained_variance\",\n", - " mode=\"min\",\n", - " filename=\"best-model-{epoch:02d}-{val_loss:.2f}\",\n", - " save_top_k=1,\n", - ")\n", - "\n", - "# Callback to save checkpoints every 5 epochs, regardless of performance\n", - "periodic_checkpoint = ModelCheckpoint(\n", - " filename=\"periodic-{epoch:02d}\",\n", - " every_n_epochs=2,\n", - " save_top_k=-1, # Setting -1 saves all checkpoints\n", - ")\n", - "\n", - "# define loggers for the model\n", - "tb_logger = TensorBoardLogger(\"logs/tensorboard_logs\")\n", - "csv_logger = CSVLogger(\"logs/csv_logs\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We define a few helper functions to run our experiment. 
We make helper functions for things that will mostly be the same across each training loop so that we don't have to keep redefining them." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "def get_data_module(max_mean_adjustment, adjustment_function = default_perturbation_effect_adjustment_function, tf_relationships_dict = {}):\n", - " return SyntheticDataLoader(\n", - " batch_size=32,\n", - " num_genes=4000,\n", - " bound_mean=3.0,\n", - " bound=[0.5] * 5,\n", - " n_sample=[1, 1, 2, 2, 4], # sum of this is num of tfs\n", - " val_size=0.1,\n", - " test_size=0.1,\n", - " random_state=42,\n", - " max_mean_adjustment=max_mean_adjustment,\n", - " adjustment_function=adjustment_function,\n", - " tf_relationships=tf_relationships_dict,\n", - " )\n", - "\n", - "def get_model(num_tfs):\n", - " return CustomizableModel(\n", - " input_dim=num_tfs,\n", - " output_dim=num_tfs,\n", - " lr=0.01,\n", - " hidden_layer_num=2,\n", - " hidden_layer_sizes=[64, 32],\n", - " activation=\"LeakyReLU\",\n", - " optimizer=\"RMSprop\",\n", - " L2_regularization_term=0.0,\n", - " dropout_rate=0.0,\n", - " )\n", - "\n", - "def get_linear_model(num_tfs):\n", - " return SimpleModel(\n", - " input_dim=num_tfs,\n", - " output_dim=num_tfs,\n", - " lr=0.01\n", - " )\n", - "\n", - "def get_trainer():\n", - " # uncomment callbacks or logggers if you would like checkpoints / logs\n", - " return Trainer(\n", - " max_epochs=10,\n", - " deterministic=True,\n", - " accelerator=\"cpu\",\n", - " # callbacks=[best_model_checkpoint, periodic_checkpoint],\n", - " # logger=[tb_logger, csv_logger],\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# These lists will store the test results for different models and data generation methods\n", - "model_ves = []\n", - "linear_model_test_ves = []" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - 
"outputs": [], - "source": [ - "import torch\n", - "from sklearn.metrics import explained_variance_score\n", - "\n", - "data_module = get_data_module(0.0)\n", - "num_tfs = sum(data_module.n_sample)\n", - "model_ves = [] # List to store explained variance for the non-linear model\n", - "linear_model_test_ves = [] # List to store explained variance for the linear model\n", - "\n", - "def calculate_explained_variance(test_results, data_module, model):\n", - " predictions = []\n", - " targets = []\n", - "\n", - " model.eval() # Set the model to evaluation mode\n", - "\n", - " with torch.no_grad(): # Disable gradient calculation\n", - " for batch in data_module.test_dataloader():\n", - " # Assuming your data is in the format (x, y)\n", - " x, y = batch\n", - " outputs = model(x)\n", - " predictions.append(outputs)\n", - " targets.append(y)\n", - " mse = torch.nn.functional.mse_loss(torch.tensor(predictions), torch.tensor(targets)).item()\n", - " var_y = torch.var(torch.tensor(targets)).item() \n", - " explained_variance = 1 - (mse / var_y)\n", - " return explained_variance \n", - "\n", - "# # Function to calculate explained variance from test results\n", - "# def calculate_explained_variance(test_results, data_module, model):\n", - "# \"\"\"\n", - "# Calculates the explained variance score using PyTorch and scikit-learn.\n", - "\n", - "# Args:\n", - "# test_results: The results dictionary from the trainer.test() function.\n", - "# data_module: The data module containing the test dataloader.\n", - "# model: The trained PyTorch model.\n", - "\n", - "# Returns:\n", - "# float: The explained variance score.\n", - "# \"\"\"\n", - "# predictions = []\n", - "# targets = []\n", - "\n", - "# model.eval() # Set the model to evaluation mode\n", - "\n", - "# with torch.no_grad(): # Disable gradient calculation\n", - "# for batch in data_module.test_dataloader():\n", - "# # Assuming your data is in the format (x, y)\n", - "# x, y = batch\n", - "# outputs = model(x)\n", - "# 
predictions.append(outputs)\n", - "# targets.append(y)\n", - "\n", - "# predictions = torch.cat(predictions, dim=0).numpy() # Concatenate predictions\n", - "# targets = torch.cat(targets, dim=0).numpy() # Concatenate targets\n", - "\n", - "# return explained_variance_score(targets, predictions)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# **Train models on data generated with no mean adjustment**" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "GPU available: False, used: False\n", - "TPU available: False, using: 0 TPU cores\n", - "IPU available: False, using: 0 IPUs\n", - "HPU available: False, using: 0 HPUs\n", - "\n", - " | Name | Type | Params\n", - "----------------------------------------------------\n", - "0 | activation | LeakyReLU | 0 \n", - "1 | input_layer | Linear | 704 \n", - "2 | hidden_layers | ModuleList | 2.1 K \n", - "3 | output_layer | Linear | 330 \n", - "4 | dropout | Dropout | 0 \n", - "5 | mae | MeanAbsoluteError | 0 \n", - "6 | SMSE | SMSE | 0 \n", - "----------------------------------------------------\n", - "3.1 K Trainable params\n", - "0 Non-trainable params\n", - "3.1 K Total params\n", - "0.012 Total estimated model params size (MB)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Sanity Checking: | | 0/? 
[00:00 6\u001b[0m explained_variance \u001b[38;5;241m=\u001b[39m \u001b[43mcalculate_explained_variance\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtest_results\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata_module\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 7\u001b[0m model_ves\u001b[38;5;241m.\u001b[39mappend(explained_variance) \u001b[38;5;66;03m# Append explained variance to the list\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPrinting test results...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "Cell \u001b[0;32mIn[30], line 23\u001b[0m, in \u001b[0;36mcalculate_explained_variance\u001b[0;34m(test_results, data_module, model)\u001b[0m\n\u001b[1;32m 21\u001b[0m predictions\u001b[38;5;241m.\u001b[39mappend(outputs)\n\u001b[1;32m 22\u001b[0m targets\u001b[38;5;241m.\u001b[39mappend(y)\n\u001b[0;32m---> 23\u001b[0m mse \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mnn\u001b[38;5;241m.\u001b[39mfunctional\u001b[38;5;241m.\u001b[39mmse_loss(\u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtensor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpredictions\u001b[49m\u001b[43m)\u001b[49m, torch\u001b[38;5;241m.\u001b[39mtensor(targets))\u001b[38;5;241m.\u001b[39mitem()\n\u001b[1;32m 24\u001b[0m var_y \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mvar(torch\u001b[38;5;241m.\u001b[39mtensor(targets))\u001b[38;5;241m.\u001b[39mitem() \n\u001b[1;32m 25\u001b[0m explained_variance \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;241m-\u001b[39m (mse \u001b[38;5;241m/\u001b[39m var_y)\n", - "\u001b[0;31mValueError\u001b[0m: only one element tensors can be converted to Python scalars" - ] - } - ], - "source": [ - "# --- Nonlinear Model ---\n", - "model = get_model(num_tfs)\n", - "trainer = get_trainer()\n", - "trainer.fit(model, data_module)\n", - "test_results = 
trainer.test(model, datamodule=data_module)\n", - "explained_variance = calculate_explained_variance(test_results, data_module, model)\n", - "model_ves.append(explained_variance) # Append explained variance to the list\n", - "print(\"Printing test results...\")\n", - "print(test_results)\n", - "print(\"Printing explained variance\")\n", - "print(explained_variance)\n", - "\n", - "\n", - "# --- Linear Model ---\n", - "linear_model = get_linear_model(num_tfs)\n", - "trainer = get_trainer()\n", - "trainer.fit(linear_model, data_module)\n", - "test_results = trainer.test(linear_model, datamodule=data_module)\n", - "explained_variance = calculate_explained_variance(test_results, data_module, linear_model)\n", - "linear_model_test_ves.append(explained_variance) # Append explained variance to the list\n", - "print(\"Printing linear model test results\")\n", - "print(test_results)\n", - "print(\"Printing linear model explained variance\")\n", - "print(explained_variance)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "#NOTE: replaced this kernel with the two above to implement the explained variance\n", - "\n", - "\n", - "# data_module = get_data_module(0.0)\n", - "# num_tfs = sum(data_module.n_sample)\n", - "\n", - "# # nonlinear model\n", - "# model = get_model(num_tfs)\n", - "# trainer = get_trainer()\n", - "# trainer.fit(model, data_module)\n", - "# test_results = trainer.test(model, datamodule=data_module)\n", - "# print(\"Printing test results...\")\n", - "# print(test_results)\n", - "# model_ves.append(test_results[0][\"test_ve\"])\n", - "\n", - "# # linear model\n", - "# linear_model = get_linear_model(num_tfs)\n", - "# trainer = get_trainer()\n", - "# trainer.fit(linear_model, data_module)\n", - "# test_results = trainer.test(linear_model, datamodule=data_module)\n", - "# print(\"Printing linear model test results\")\n", - "# print(test_results)\n", - "# 
linear_model_test_ves.append(test_results[0][\"test_ve\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# **Train models on data generated with normal mean adjustments**" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "GPU available: False, used: False\n", - "TPU available: False, using: 0 TPU cores\n", - "IPU available: False, using: 0 IPUs\n", - "HPU available: False, using: 0 HPUs\n", - "\n", - " | Name | Type | Params\n", - "----------------------------------------------------\n", - "0 | activation | LeakyReLU | 0 \n", - "1 | input_layer | Linear | 704 \n", - "2 | hidden_layers | ModuleList | 2.1 K \n", - "3 | output_layer | Linear | 330 \n", - "4 | dropout | Dropout | 0 \n", - "5 | mae | MeanAbsoluteError | 0 \n", - "6 | SMSE | SMSE | 0 \n", - "----------------------------------------------------\n", - "3.1 K Trainable params\n", - "0 Non-trainable params\n", - "3.1 K Total params\n", - "0.012 Total estimated model params size (MB)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Sanity Checking: | | 0/? 
[00:00" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "data_gen_methods = [\"No Mean Adjustment\", \"Dependent Mean Adjustment\", \"TF Dependent Mean Adjustment\", \"TF Dependent Mean Adjust with Boolean Logic\"]\n", - "plt.figure(figsize=(10, 6))\n", - "plt.scatter(data_gen_methods, model_ves, color='blue')\n", - "plt.scatter(data_gen_methods, linear_model_test_ves, color='orange')\n", - "plt.title('Model VE Comparison (bound mean = 3.0)')\n", - "plt.xlabel('Model')\n", - "plt.ylabel('VE')\n", - "plt.grid(True)\n", - "plt.xticks(rotation=45, ha=\"right\")\n", - "plt.legend(['Complex (Customizable) Model', 'Linear Model'])\n", - "plt.tight_layout() # Adjust layout to make room for the rotated x-axis labels\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 38, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -2610,22 +28,14 @@ ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bound (1) and Unbound (0) Labels for gene 0:\n", - "tensor([0., 0., 0., 1., 1., 1., 1., 1., 0., 1.])\n", - "iteration 5 completed\n", - "iteration 10 completed\n", - "iteration 15 completed\n", - "iteration 20 completed\n", - "iteration 25 completed\n", - "iteration 30 completed\n", - "iteration 35 completed\n", - "iteration 40 completed\n", - "iteration 45 completed\n", - "iteration 50 completed\n" - ] + "data": { + "text/plain": [ + "42" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -2658,9 +68,24 @@ "from yeastdnnexplorer.data_loaders.synthetic_data_loader import SyntheticDataLoader\n", "from yeastdnnexplorer.ml_models.simple_model import SimpleModel\n", "from yeastdnnexplorer.ml_models.customizable_model import CustomizableModel\n", + "from typing import Tuple, List, Dict, Union\n", "\n", - "seed_everything(42)\n", - "\n", + 
"seed_everything(42)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generating the binding data will be the same as always, see `generate_in_silico_data.ipynb`" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ "n_genes = 3000\n", "bound = [0.5, 0.5, 0.5, 0.5, 0.5]\n", "n_sample = [1, 1, 2, 2, 4]\n", @@ -2680,8 +105,24 @@ "# Combine binding data into a tensor\n", "binding_data_combined = [torch.stack((gene_population.labels, binding_effect, binding_pval), dim=1)\n", " for gene_population, binding_effect, binding_pval in zip(gene_populations_list, binding_effect_list, binding_pvalue_list)]\n", - "binding_data_tensor = torch.stack(binding_data_combined, dim=1)\n", + "binding_data_tensor = torch.stack(binding_data_combined, dim=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we define our experiment, this function will return the average perturbation effects (across n_iterations iterations) for each TF for a specific gene for each of the 4 data generation method we have at our disposal. Due to the randomness in the generated data, we need to find the averages over a number of iterations to get the true common values.\n", "\n", + "We also need to define dictionaries of TF relationships for our third and fourth methods of generating perturbation data, see generate_in_silico_data.ipynb for an explanation of what these represent and how they are used / structured. The documentation in generate_data.py may be helpful as well." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ "# TF relationships\n", "tf_relationships = {\n", " 0: [1],\n", @@ -2709,7 +150,18 @@ " 9: [And(6, And(3, Or(0, 9)))],\n", "}\n", "\n", - "def experiment(n_iterations=10, GENE_IDX=0):\n", + "def experiment(n_iterations: int = 10, GENE_IDX: int = 0) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:\n", + " \"\"\"\n", + " Conducts an experiment by generating perturbation effects for a specific gene over multiple iterations\n", + " using different methods and averaging the results.\n", + "\n", + " Params:\n", + " n_iterations (int): Number of iterations to perform.\n", + " GENE_IDX (int): Index of the gene to analyze.\n", + "\n", + " Returns:\n", + " Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: Averaged perturbation effects scores for each method.\n", + " \"\"\"\n", " print(\"Bound (1) and Unbound (0) Labels for gene \" + str(GENE_IDX) + \":\")\n", " print(binding_data_tensor[GENE_IDX, :, 0])\n", "\n", @@ -2761,12 +213,128 @@ " dep_mean_adjustment_scores /= n_iterations\n", " boolean_logic_scores /= n_iterations\n", " \n", - " return no_mean_adjustment_scores, normal_mean_adjustment_scores, dep_mean_adjustment_scores, boolean_logic_scores\n", - "\n", + " return no_mean_adjustment_scores, normal_mean_adjustment_scores, dep_mean_adjustment_scores, boolean_logic_scores\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bound (1) and Unbound (0) Labels for gene 0:\n", + "tensor([0., 0., 0., 1., 1., 1., 1., 1., 0., 1.])\n", + "iteration 5 completed\n", + "iteration 10 completed\n", + "iteration 15 completed\n", + "iteration 20 completed\n", + "iteration 25 completed\n", + "iteration 30 completed\n", + "iteration 35 completed\n", + "iteration 40 completed\n", + "iteration 45 completed\n", + "iteration 50 completed\n" + ] + } 
+ ], + "source": [ "GENE_IDX = 0\n", - "experiment_results = experiment(n_iterations=50, GENE_IDX=GENE_IDX)\n", + "experiment_results = experiment(n_iterations=50, GENE_IDX=GENE_IDX)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we plot our results." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bound (bound) TFs for gene 0 are: [3, 4, 5, 6, 7, 9]\n", + "Unbound (unbound) TFs for gene 0 are: [0, 1, 2, 8]\n", + "tensor([0., 0., 0., 1., 1., 1., 1., 1., 0., 1.])\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "x_vals = list(range(sum(n_sample)))\n", + "print(\"Bound (bound) TFs for gene \" + str(GENE_IDX) + \" are: \" + str(binding_data_tensor[GENE_IDX, :, 0].nonzero().flatten().tolist()))\n", + "print(\"Unbound (unbound) TFs for gene \" + str(GENE_IDX) + \" are: \" + str((1 - binding_data_tensor[GENE_IDX, :, 0]).nonzero().flatten().tolist()))\n", + "print(binding_data_tensor[GENE_IDX, :, 0])\n", + "plt.figure(figsize=(10, 6))\n", + "\n", + "# Plot each set of experiment results with a different color\n", + "colors = ['red', 'green', 'blue', 'orange']\n", + "for index, results in enumerate(experiment_results):\n", + " plt.scatter(x_vals, results, color=colors[index])\n", + "\n", + "plt.title('Pertubation Effects for Gene ' + str(GENE_IDX) + ' with Different Adjustment Functions (averaged across 100 trials)')\n", + "plt.xlabel('TF Index')\n", + "plt.ylabel('Perturbation Effect Val')\n", + "plt.xticks(x_vals)\n", + "plt.grid(True)\n", + "plt.legend(['No Mean Adjustment', 'Normal (non-dependent) Mean Adjust', 'Dependent Mean Adjustment', 'Boolean Logic Adjustment'])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Recall that for the dependent mean adjustment, the TF in question must be bound and all of the TFs in its dependency array (in the tf_relationships dictionary) must be bound as well. 
This is why we do not adjust the mean for TF 7 despite it being bound, it depends on TF 1 and TF 4 both being bound, and TF1 is not bound.\n", "\n", - "def get_data_module(max_mean_adjustment, adjustment_function=default_perturbation_effect_adjustment_function, tf_relationships_dict={}):\n", + "Similarly, for the boolean logic adjustment, we do not adjust the mean for 6 despite it being bound because it depends on (TF0 && (TF1 || TF2)) being bound, and none of those 3 TFs are bound to the gene we are studying.\n", + "\n", + "Note that if you change GENE_IDX, the random seed, or any of the relationship dictionaris that this explanation will no longer apply to the data you are seeing in the plot." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training models on data generated from the 4 different methods\n", + "In the next experiment, we will be training the exact same model on data generated from each of these 4 methods. We will also train a simple linear model on all four methods to use as a baseline to compare to. Other than the method used to generate the data, everything else will be held the same. We define a few helper functions to run our experiment. We make helper functions for things that will mostly be the same across each training loop so that we don't have to keep redefining them." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_data_module(max_mean_adjustment: float, adjustment_function=default_perturbation_effect_adjustment_function, tf_relationships_dict: Dict[str, Union[List[int], float]] = {}) -> SyntheticDataLoader:\n", + " \"\"\"\n", + " Creates a data loader module for synthetic data.\n", + "\n", + " Params:\n", + " max_mean_adjustment (float): Maximum mean adjustment value.\n", + " adjustment_function (callable): Function to adjust perturbation effects.\n", + " tf_relationships_dict (Dict[str, Union[List[int], float]]): Dictionary of transcription factor relationships.\n", + "\n", + " Returns:\n", + " SyntheticDataLoader: Configured data loader for synthetic data.\n", + " \"\"\"\n", " return SyntheticDataLoader(\n", " batch_size=32,\n", " num_genes=4000,\n", @@ -2781,7 +349,16 @@ " tf_relationships=tf_relationships_dict,\n", " )\n", "\n", - "def get_model(num_tfs):\n", + "def get_model(num_tfs: int) -> CustomizableModel:\n", + " \"\"\"\n", + " Creates a customizable model.\n", + "\n", + " Params:\n", + " num_tfs (int): Number of transcription factors.\n", + "\n", + " Returns:\n", + " CustomizableModel: Configured model.\n", + " \"\"\"\n", " return CustomizableModel(\n", " input_dim=num_tfs,\n", " output_dim=num_tfs,\n", @@ -2794,24 +371,48 @@ " dropout_rate=0.0,\n", " )\n", "\n", - "def get_linear_model(num_tfs):\n", + "def get_linear_model(num_tfs: int) -> SimpleModel:\n", + " \"\"\"\n", + " Creates a simple linear model.\n", + "\n", + " Params:\n", + " num_tfs (int): Number of transcription factors.\n", + "\n", + " Returns:\n", + " SimpleModel: Configured linear model.\n", + " \"\"\"\n", " return SimpleModel(\n", " input_dim=num_tfs,\n", " output_dim=num_tfs,\n", " lr=0.01\n", " )\n", "\n", - "def get_trainer():\n", + "def get_trainer() -> Trainer:\n", + " \"\"\"\n", + " Creates a trainer for model training.\n", + "\n", + " Returns:\n", + " Trainer: 
Configured trainer.\n", + " \"\"\"\n", " return Trainer(\n", " max_epochs=10,\n", " deterministic=True,\n", " accelerator=\"cpu\",\n", " )\n", "\n", + "# These lists will store the test results for different models and data generation methods\n", "model_ves = []\n", "linear_model_test_ves = []" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# **Train models on data generated with no mean adjustment**\n", + "We will first compare the models performances on data generated without any mean adjustments. This is the most simple dataset we will generate, and serves as a good starting point for the models." + ] + }, { "cell_type": "code", "execution_count": 18, @@ -3273,6 +874,21 @@ "print(\"Linear Model Explained Variance:\", explained_variance_linear)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The explained variance for the linear model is quite small compared to the complex, customizable model which yielded a significantly larger positive explained variance. This suggests that the customizable model is able to better fit to the generated data in this condition. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# **Train models on data generated with normal mean adjustments**\n", + "Now, let us perform the same comparison but using this condition, with a normal mean adjustment of 3." + ] + }, { "cell_type": "code", "execution_count": 19, @@ -3738,6 +1354,21 @@ "print(\"Linear Model Explained Variance (Method 2):\", explained_variance_linear)\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once again, a similar explained variance metric was obtained using both models, suggesting that the customizable model performs substantially better than the simple linear model based on the generated data. We will continue to explore whether this relationship holds across all 4 conditions." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# **Train model on data generated with dependent mean adjustments (method 3)**\n", + "Now we are implementing a dataset that contains dependent mean adjustments as shown below, with a mean adjustment of 3 if the TF meets the criteria defined by the dictionary." + ] + }, { "cell_type": "code", "execution_count": 20, @@ -4221,6 +1852,21 @@ "#Linear Model Explained Variance (Method 3): -0.0013749837875366212\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The simple linear model's explained variance is the lowest it has been out of the three conditions so far, and it appears once again that the customizable model obtains a better, larger explained variance compared to the simple linear model even when implementing dependencies among TFs. This would make sense as the added layer of complexity makes it more difficult for the simple linear model to make an accurate prediction. Lastly, it would be interesting to consider how the models will perform on data including more complex dependencies that involve binary relations." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# **Train models on data generated using the binary relations between TFs (method 4)**\n", + "Similar to the previous condition, we are implementing dependencies between TFs. However, the following dictionary contains simple logic that makes these dependencies far more complex. For example, in order for transcription factor 4 to be perturbed based on the dictionary below, both TFs 1 and 2 need to be considered perturbed in order for this TF to be perturbed as well. Adding this additional layer of complexity will be an interesting challenge: let us see how the two models perform here." 
+ ] + }, { "cell_type": "code", "execution_count": 21, @@ -4693,6 +2339,20 @@ "#Linear Model Explained Variance (Method 4): -0.013428604602813721" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once again, our customizable model outperforms the simple linear model in terms of obtaining a higher explained variance. What is interesting across all of these conditions explored so far is that the simple linear model has obtained a negative explained variance in each condition. This may be of further interest and could use more research to better determine exactly why this is occurring based on the generated data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can plot the results across each of the 4 conditions tested above to visualize how the simple linear model and the complex, customizable model perform compared to one another." + ] + }, { "cell_type": "code", "execution_count": 24, @@ -4725,10 +2385,15 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we can clearly see that across the 4 conditions, the customizable model acheives a significantly higher positive explained variance compared to the simple linear model, which is good for us because it helps to confirm that the customizable model we are using is able to better utilize the data to produce accurate predictions, resulting in a higher explained variance compared to the simple linear model. 
" + ] + }, + { + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [] } ], From 5d7ffa499d70ee657e0768206bd649f5dd11fce8 Mon Sep 17 00:00:00 2001 From: ejiawustl Date: Thu, 8 Aug 2024 10:00:03 -0700 Subject: [PATCH 7/7] updated notebook to use sphinx docstrings, added headings and subheadings and improved exposition --- ..._and_testing_data_generation_methods.ipynb | 4490 +---------------- 1 file changed, 280 insertions(+), 4210 deletions(-) diff --git a/docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb b/docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb index d277732..b54cfb7 100644 --- a/docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb +++ b/docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb @@ -4,6 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "# **Visualizing and Testing Data Generation Methods**\n", "In this notebook, we will run an experiment to display the average perturbation effect values that we generate with the 4 different methods we have for perturbation effect generation (other than the method for generating the perturbation effect values, we will be holding everything else the same). 
\n", "\n", "Recall that we have 4 methods for generating perturbation effect data (see `generate_in_silico_data.ipynb` for more information on these):\n", @@ -17,15 +18,35 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Seed set to 42\n" + ] + }, + { + "data": { + "text/plain": [ + "42" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# imports\n", - "from yeastdnnexplorer.probability_models.generate_data import (generate_gene_population, \n", - " generate_binding_effects,\n", - " generate_pvalues,\n", - " generate_perturbation_effects)\n", + "from yeastdnnexplorer.probability_models.generate_data import (\n", + " generate_gene_population, \n", + " generate_binding_effects, \n", + " generate_pvalues, \n", + " generate_perturbation_effects\n", + ")\n", "\n", "import torch\n", "import matplotlib.pyplot as plt\n", @@ -50,6 +71,13 @@ "torch.cuda.manual_seed_all(42) # For all CUDA devices" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **Generating the Data**" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -59,7 +87,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -100,11 +128,12 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "tf_relationships = {\n", + "# TF relationships\n", + "tf_relationships_dict = {\n", " 0: [1],\n", " 1: [8],\n", " 2: [5, 6],\n", @@ -117,6 +146,8 @@ " 9: [4],\n", "}\n", "\n", + "# TF relationships that incorporate boolean logic; this is more complex than\n", + "# the simple relationships above as it implements \"and\" and \"or\" operations\n", "tf_relationships_dict_boolean_logic = {\n", " 0: [And(3, 4, 8), Or(3, 7), Or(1, 1)],\n", " 1: [And(5, Or(7, 8))],\n", @@ 
-134,13 +165,14 @@ " \"\"\"\n", " Conducts an experiment by generating perturbation effects for a specific gene over multiple iterations\n", " using different methods and averaging the results.\n", + " \n", + " :param n_iterations: Number of iterations to perform.\n", + " :type n_iterations: int\n", + " :param GENE_IDX: Index of the gene to analyze.\n", + " :type GENE_IDX: int\n", "\n", - " Params:\n", - " n_iterations (int): Number of iterations to perform.\n", - " GENE_IDX (int): Index of the gene to analyze.\n", - "\n", - " Returns:\n", - " Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: Averaged perturbation effects scores for each method.\n", + " :returns: A tuple containing averaged perturbation effects scores for each method.\n", + " :rtype: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]\n", " \"\"\"\n", " print(\"Bound (1) and Unbound (0) Labels for gene \" + str(GENE_IDX) + \":\")\n", " print(binding_data_tensor[GENE_IDX, :, 0])\n", @@ -199,16 +231,9 @@ " return no_mean_adjustment_scores, normal_mean_adjustment_scores, dep_mean_adjustment_scores, boolean_logic_scores" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can run the experiment for n_iterations, I find that you should iterate at least 30 times, but closer to 100 is most ideal. This could take 1-5 minutes depending on your computer." - ] - }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -239,12 +264,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We now plot our results." + "## **Visualizing the Results**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we plot our results." 
] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -297,6 +329,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "The x-axis labels represent the corresponding TFs whose perturbation effect values are being plotted on the y-axis. The color of each plotted point indicates which of the four data generation methods it was derived from. For example, based on the legend included in the graph, a red point was generated using no mean adjustment. This graph allows us to visualize the perturbation effects for the same TF under a variety of conditions.\n", + "\n", "Recall that for the dependent mean adjustment, the TF in question must be bound and all of the TFs in its dependency array (in the tf_relationships dictionary) must be bound as well. This is why we do not adjust the mean for TF 7 despite it being bound, it depends on TF 1 and TF 4 both being bound, and TF1 is not bound.\n", "\n", "Similarly, for the boolean logic adjustment, we do not adjust the mean for 6 despite it being bound because it depends on (TF0 && (TF1 || TF2)) being bound, and none of those 3 TFs are bound to the gene we are studying.\n", @@ -308,8 +342,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Training models on data generated from the 4 different methods\n", - "In the next experiment, we will be training the exact same model on data generated from each of these 4 methods. We will also train a simple linear model on all four methods to use as a baseline to compare to. Other than the method used to generate the data, everything else will be held the same." + "## **Training models on data generated from the 4 different methods**\n", + "In the next experiment, we will be training the exact same model on data generated from each of these 4 methods. We will also train a simple linear model on all four methods to use as a baseline to compare to. 
Other than the method used to generate the data, everything else will be held the same. We define a few helper functions to run our experiment. We make helper functions for things that will mostly be the same across each training loop so that we don't have to keep redefining them." ] }, { @@ -326,13 +360,15 @@ " save_top_k=1,\n", ")\n", "\n", - " Params:\n", - " max_mean_adjustment (float): Maximum mean adjustment value.\n", - " adjustment_function (callable): Function to adjust perturbation effects.\n", - " tf_relationships_dict (Dict[str, Union[List[int], float]]): Dictionary of transcription factor relationships.\n", + " :param max_mean_adjustment: Maximum mean adjustment value.\n", + " :type max_mean_adjustment: float\n", + " :param adjustment_function: Function to adjust perturbation effects.\n", + " :type adjustment_function: callable\n", + " :param tf_relationships_dict: Dictionary of transcription factor relationships.\n", + " :type tf_relationships_dict: Dict[str, Union[List[int], float]]\n", "\n", - " Returns:\n", - " SyntheticDataLoader: Configured data loader for synthetic data.\n", + " :returns: Configured data loader for synthetic data.\n", + " :rtype: SyntheticDataLoader\n", " \"\"\"\n", " return SyntheticDataLoader(\n", " batch_size=32,\n", @@ -352,11 +388,11 @@ " \"\"\"\n", " Creates a customizable model.\n", "\n", - " Params:\n", - " num_tfs (int): Number of transcription factors.\n", + " :param num_tfs: Number of transcription factors.\n", + " :type num_tfs: int\n", "\n", - " Returns:\n", - " CustomizableModel: Configured model.\n", + " :returns: Configured model.\n", + " :rtype: CustomizableModel\n", " \"\"\"\n", " return CustomizableModel(\n", " input_dim=num_tfs,\n", @@ -374,11 +410,11 @@ " \"\"\"\n", " Creates a simple linear model.\n", "\n", - " Params:\n", - " num_tfs (int): Number of transcription factors.\n", + " :param num_tfs: Number of transcription factors.\n", + " :type num_tfs: int\n", "\n", - " Returns:\n", - " SimpleModel: 
Configured linear model.\n", + " :returns: Configured linear model.\n", + " :rtype: SimpleModel\n", " \"\"\"\n", " return SimpleModel(\n", " input_dim=num_tfs,\n", @@ -390,24 +426,56 @@ " \"\"\"\n", " Creates a trainer for model training.\n", "\n", - " Returns:\n", - " Trainer: Configured trainer.\n", + " :returns: Configured trainer.\n", + " :rtype: Trainer\n", " \"\"\"\n", " return Trainer(\n", " max_epochs=10,\n", " deterministic=True,\n", " accelerator=\"cpu\",\n", - " # callbacks=[best_model_checkpoint, periodic_checkpoint],\n", - " # logger=[tb_logger, csv_logger],\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ + " # The following are turned false to reduce the output in the training cells below. You can toggle them to true to see\n", + " # a model summary and training progress if desired \n", + " logger=False, \n", + " enable_progress_bar=False, \n", + " enable_model_summary=False, \n", + " enable_checkpointing=False \n", + " )\n", + "\n", + "def calculate_explained_variance( \n", + " model: torch.nn.Module, data_module: DataLoader\n", + ") -> float:\n", + " \"\"\"\n", + " Calculates the explained variance of a model's predictions on a test dataset.\n", + "\n", + " :param test_results: List of test results containing the expected outcomes.\n", + " :type test_results: List[Union[float, int]]\n", + " :param data_module: Data loader for the test dataset.\n", + " :type data_module: DataLoader\n", + " :param model: The model to evaluate.\n", + " :type model: torch.nn.Module\n", + "\n", + " :returns: The explained variance of the model's predictions.\n", + " :rtype: float\n", + " \"\"\"\n", + " predictions = []\n", + " targets = []\n", + "\n", + " model.eval() # Set the model to evaluation mode\n", + " \n", + " with torch.no_grad(): # Disable gradient calculation\n", + " for batch in data_module.test_dataloader():\n", + " x, y = batch\n", + " outputs = model(x).cpu().numpy()\n", + " 
predictions.extend(outputs)\n", + " targets.extend(y.cpu().numpy())\n", + " \n", + " # Use scikit-learn to calculate explained variance\n", + " if len(targets) > 0:\n", + " explained_variance = explained_variance_score(targets, predictions)\n", + " return explained_variance\n", + " else:\n", + " return None\n", + "\n", "# These lists will store the test results for different models and data generation methods\n", "model_ves = []\n", "linear_model_test_ves = []" @@ -481,4213 +549,196 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# **Train models on data generated with no mean adjustment**" + "### **1) Train models on data generated with no mean adjustment**\n", + "We will first compare the models performances on data generated without any mean adjustments. This is the most simple dataset we will generate, and serves as a good starting point for the models." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "import logging\n", + "\n", + "# Suppress specific warnings\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning, message=\".*torch.tensor.*\")\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning, message=\".*DataLoader.*\")\n", + "logging.getLogger(\"pytorch_lightning\").setLevel(logging.ERROR)" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 8, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "GPU available: False, used: False\n", - "TPU available: False, using: 0 TPU cores\n", - "IPU available: False, using: 0 IPUs\n", - "HPU available: False, using: 0 HPUs\n", - "\n", - " | Name | Type | Params\n", - "----------------------------------------------------\n", - "0 | activation | LeakyReLU | 0 \n", - "1 | input_layer | Linear | 704 \n", - "2 | hidden_layers | ModuleList | 2.1 K \n", - "3 | output_layer | Linear | 330 \n", - "4 | dropout | Dropout | 0 \n", - "5 | mae | 
MeanAbsoluteError | 0 \n", - "6 | SMSE | SMSE | 0 \n", - "----------------------------------------------------\n", - "3.1 K Trainable params\n", - "0 Non-trainable params\n", - "3.1 K Total params\n", - "0.012 Total estimated model params size (MB)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Sanity Checking: | | 0/? [00:00 6\u001b[0m explained_variance \u001b[38;5;241m=\u001b[39m \u001b[43mcalculate_explained_variance\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtest_results\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata_module\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 7\u001b[0m model_ves\u001b[38;5;241m.\u001b[39mappend(explained_variance) \u001b[38;5;66;03m# Append explained variance to the list\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPrinting test results...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "Cell \u001b[0;32mIn[30], line 23\u001b[0m, in \u001b[0;36mcalculate_explained_variance\u001b[0;34m(test_results, data_module, model)\u001b[0m\n\u001b[1;32m 21\u001b[0m predictions\u001b[38;5;241m.\u001b[39mappend(outputs)\n\u001b[1;32m 22\u001b[0m targets\u001b[38;5;241m.\u001b[39mappend(y)\n\u001b[0;32m---> 23\u001b[0m mse \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mnn\u001b[38;5;241m.\u001b[39mfunctional\u001b[38;5;241m.\u001b[39mmse_loss(\u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtensor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpredictions\u001b[49m\u001b[43m)\u001b[49m, torch\u001b[38;5;241m.\u001b[39mtensor(targets))\u001b[38;5;241m.\u001b[39mitem()\n\u001b[1;32m 24\u001b[0m var_y \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mvar(torch\u001b[38;5;241m.\u001b[39mtensor(targets))\u001b[38;5;241m.\u001b[39mitem() \n\u001b[1;32m 
25\u001b[0m explained_variance \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;241m-\u001b[39m (mse \u001b[38;5;241m/\u001b[39m var_y)\n", - "\u001b[0;31mValueError\u001b[0m: only one element tensors can be converted to Python scalars" + "Nonlinear Model Explained Variance: 0.24550879001617432\n", + "Linear Model Explained Variance: -0.00506981611251831\n" ] } ], "source": [ + "# Initialize data module\n", + "data_module = get_data_module(0.0)\n", + "num_tfs = sum(data_module.n_sample)\n", + "\n", "# --- Nonlinear Model ---\n", "model = get_model(num_tfs)\n", "trainer = get_trainer()\n", "trainer.fit(model, data_module)\n", - "test_results = trainer.test(model, datamodule=data_module)\n", - "explained_variance = calculate_explained_variance(test_results, data_module, model)\n", - "model_ves.append(explained_variance) # Append explained variance to the list\n", - "print(\"Printing test results...\")\n", - "print(test_results)\n", - "print(\"Printing explained variance\")\n", - "print(explained_variance)\n", - "\n", + "explained_variance = calculate_explained_variance(model, data_module)\n", + "model_ves.append(explained_variance)\n", + "print(\"Nonlinear Model Explained Variance:\", explained_variance)\n", "\n", "# --- Linear Model ---\n", "linear_model = get_linear_model(num_tfs)\n", "trainer = get_trainer()\n", "trainer.fit(linear_model, data_module)\n", - "test_results = trainer.test(linear_model, datamodule=data_module)\n", - "explained_variance = calculate_explained_variance(test_results, data_module, linear_model)\n", - "linear_model_test_ves.append(explained_variance) # Append explained variance to the list\n", - "print(\"Printing linear model test results\")\n", - "print(test_results)\n", - "print(\"Printing linear model explained variance\")\n", - "print(explained_variance)" + "explained_variance_linear = calculate_explained_variance(linear_model, data_module)\n", + "linear_model_test_ves.append(explained_variance_linear)\n", + "\n", + 
"print(\"Linear Model Explained Variance:\", explained_variance_linear)" ] }, { - "cell_type": "code", - "execution_count": 17, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "#NOTE: replaced this kernel with the two above to implement the explained variance\n", - "\n", - "\n", - "# data_module = get_data_module(0.0)\n", - "# num_tfs = sum(data_module.n_sample)\n", - "\n", - "# # nonlinear model\n", - "# model = get_model(num_tfs)\n", - "# trainer = get_trainer()\n", - "# trainer.fit(model, data_module)\n", - "# test_results = trainer.test(model, datamodule=data_module)\n", - "# print(\"Printing test results...\")\n", - "# print(test_results)\n", - "# model_ves.append(test_results[0][\"test_ve\"])\n", - "\n", - "# # linear model\n", - "# linear_model = get_linear_model(num_tfs)\n", - "# trainer = get_trainer()\n", - "# trainer.fit(linear_model, data_module)\n", - "# test_results = trainer.test(linear_model, datamodule=data_module)\n", - "# print(\"Printing linear model test results\")\n", - "# print(test_results)\n", - "# linear_model_test_ves.append(test_results[0][\"test_ve\"])" + "The explained variance for the linear model is surprisingly sightly negative in contrast to the nonlinear, customizable model which yielded a significantly larger positive explained variance. This suggests that the customizable model is able to better account for the distribution of the generated data with no mean adjustments, yielding a significantly higher explained variance. It is interesting to consider whether the same relationship will be observed in the next few conditions as the data generation methods becoome increasingly more complex. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# **Train models on data generated with normal mean adjustments**" + "### **2) Train models on data generated with normal mean adjustments**\n", + "Now, let us perform the same comparison but using this condition, with a normal mean adjustment of 3." 
] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 9, "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "GPU available: False, used: False\n", - "TPU available: False, using: 0 TPU cores\n", - "IPU available: False, using: 0 IPUs\n", - "HPU available: False, using: 0 HPUs\n", - "\n", - " | Name | Type | Params\n", - "----------------------------------------------------\n", - "0 | activation | LeakyReLU | 0 \n", - "1 | input_layer | Linear | 704 \n", - "2 | hidden_layers | ModuleList | 2.1 K \n", - "3 | output_layer | Linear | 330 \n", - "4 | dropout | Dropout | 0 \n", - "5 | mae | MeanAbsoluteError | 0 \n", - "6 | SMSE | SMSE | 0 \n", - "----------------------------------------------------\n", - "3.1 K Trainable params\n", - "0 Non-trainable params\n", - "3.1 K Total params\n", - "0.012 Total estimated model params size (MB)\n" + "Nonlinear Model Explained Variance (Method 2): 0.2549255728721619\n", + "Linear Model Explained Variance (Method 2): 0.07210595607757568\n" ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Sanity Checking: | | 0/? 
[00:00" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "data_gen_methods = [\"No Mean Adjustment\", \"Dependent Mean Adjustment\", \"TF Dependent Mean Adjustment\", \"TF Dependent Mean Adjust with Boolean Logic\"]\n", - "plt.figure(figsize=(10, 6))\n", - "plt.scatter(data_gen_methods, model_ves, color='blue')\n", - "plt.scatter(data_gen_methods, linear_model_test_ves, color='orange')\n", - "plt.title('Model VE Comparison (bound mean = 3.0)')\n", - "plt.xlabel('Model')\n", - "plt.ylabel('VE')\n", - "plt.grid(True)\n", - "plt.xticks(rotation=45, ha=\"right\")\n", - "plt.legend(['Complex (Customizable) Model', 'Linear Model'])\n", - "plt.tight_layout() # Adjust layout to make room for the rotated x-axis labels\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Seed set to 42\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bound (1) and Unbound (0) Labels for gene 0:\n", - "tensor([0., 0., 0., 1., 1., 1., 1., 1., 0., 1.])\n", - "iteration 5 completed\n", - "iteration 10 completed\n", - "iteration 15 completed\n", - "iteration 20 completed\n", - "iteration 25 completed\n", - "iteration 30 completed\n", - "iteration 35 completed\n", - "iteration 40 completed\n", - "iteration 45 completed\n", - "iteration 50 completed\n" - ] - } - ], - "source": [ - "# imports\n", - "from yeastdnnexplorer.probability_models.generate_data import (\n", - " generate_gene_population, \n", - " generate_binding_effects, \n", - " generate_pvalues, \n", - " generate_perturbation_effects\n", - ")\n", - "\n", - "from yeastdnnexplorer.probability_models.util import (\n", - " calculate_explained_variance\n", - ")\n", - "\n", - "import torch\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from 
yeastdnnexplorer.probability_models.relation_classes import Relation, And, Or\n", - "from yeastdnnexplorer.probability_models.generate_data import (\n", - " default_perturbation_effect_adjustment_function,\n", - " perturbation_effect_adjustment_function_with_tf_relationships,\n", - " perturbation_effect_adjustment_function_with_tf_relationships_boolean_logic\n", - ")\n", - "\n", - "from pytorch_lightning import Trainer, seed_everything\n", - "from torch.utils.data import DataLoader, TensorDataset\n", - "from sklearn.metrics import explained_variance_score\n", - "\n", - "from yeastdnnexplorer.data_loaders.synthetic_data_loader import SyntheticDataLoader\n", - "from yeastdnnexplorer.ml_models.simple_model import SimpleModel\n", - "from yeastdnnexplorer.ml_models.customizable_model import CustomizableModel\n", - "\n", - "seed_everything(42)\n", - "\n", - "n_genes = 3000\n", - "bound = [0.5, 0.5, 0.5, 0.5, 0.5]\n", - "n_sample = [1, 1, 2, 2, 4]\n", - "\n", - "# Generate gene populations\n", - "gene_populations_list = []\n", - "for bound_proportion, n_draws in zip(bound, n_sample):\n", - " for _ in range(n_draws):\n", - " gene_populations_list.append(generate_gene_population(n_genes, bound_proportion))\n", - " \n", - "# Generate binding data for each gene population\n", - "binding_effect_list = [generate_binding_effects(gene_population) for gene_population in gene_populations_list]\n", - "\n", - "# Calculate p-values for binding data\n", - "binding_pvalue_list = [generate_pvalues(binding_data) for binding_data in binding_effect_list]\n", - "\n", - "# Combine binding data into a tensor\n", - "binding_data_combined = [torch.stack((gene_population.labels, binding_effect, binding_pval), dim=1)\n", - " for gene_population, binding_effect, binding_pval in zip(gene_populations_list, binding_effect_list, binding_pvalue_list)]\n", - "binding_data_tensor = torch.stack(binding_data_combined, dim=1)\n", - "\n", - "# TF relationships\n", - "tf_relationships = {\n", - " 0: [1],\n", - 
" 1: [8],\n", - " 2: [5, 6],\n", - " 3: [4],\n", - " 4: [5],\n", - " 5: [9],\n", - " 6: [4],\n", - " 7: [1, 4],\n", - " 8: [6],\n", - " 9: [4],\n", - "}\n", - "\n", - "tf_relationships_dict_boolean_logic = {\n", - " 0: [And(3, 4, 8), Or(3, 7), Or(1, 1)],\n", - " 1: [And(5, Or(7, 8))],\n", - " 2: [],\n", - " 3: [Or(7, 9), And(6, 7)],\n", - " 4: [And(1, 2)],\n", - " 5: [Or(0, 1, 2, 8, 9)],\n", - " 6: [And(0, Or(1, 2))],\n", - " 7: [Or(2, And(5, 6, 9))],\n", - " 8: [],\n", - " 9: [And(6, And(3, Or(0, 9)))],\n", - "}\n", - "\n", - "def experiment(n_iterations=10, GENE_IDX=0):\n", - " print(\"Bound (1) and Unbound (0) Labels for gene \" + str(GENE_IDX) + \":\")\n", - " print(binding_data_tensor[GENE_IDX, :, 0])\n", - "\n", - " num_tfs = sum(n_sample)\n", - " \n", - " no_mean_adjustment_scores = torch.zeros(num_tfs)\n", - " normal_mean_adjustment_scores = torch.zeros(num_tfs)\n", - " dep_mean_adjustment_scores = torch.zeros(num_tfs)\n", - " boolean_logic_scores = torch.zeros(num_tfs)\n", - "\n", - " for i in range(n_iterations):\n", - " # Method 1: Generate perturbation effects without mean adjustment\n", - " perturbation_effects_list_no_mean_adjustment = [generate_perturbation_effects(binding_data_tensor[:, tf_index, :].unsqueeze(1), tf_index=0) \n", - " for tf_index in range(num_tfs)]\n", - " perturbation_effects_list_no_mean_adjustment = torch.stack(perturbation_effects_list_no_mean_adjustment, dim=1)\n", - "\n", - " # Method 2: Generate perturbation effects with normal mean adjustment\n", - " perturbation_effects_list_normal_mean_adjustment = generate_perturbation_effects(\n", - " binding_data_tensor, \n", - " max_mean_adjustment=10.0\n", - " )\n", - "\n", - " # Method 3: Generate perturbation effects with dependent mean adjustment\n", - " perturbation_effects_list_dep_mean_adjustment = generate_perturbation_effects(\n", - " binding_data_tensor, \n", - " tf_relationships=tf_relationships,\n", - " 
adjustment_function=perturbation_effect_adjustment_function_with_tf_relationships,\n", - " max_mean_adjustment=10.0,\n", - " )\n", - " \n", - " # Method 4: Generate perturbation effects with binary relations between the TFs\n", - " perturbation_effects_list_boolean_logic = generate_perturbation_effects(\n", - " binding_data_tensor, \n", - " adjustment_function=perturbation_effect_adjustment_function_with_tf_relationships_boolean_logic,\n", - " tf_relationships=tf_relationships_dict_boolean_logic,\n", - " max_mean_adjustment=10.0,\n", - " )\n", - "\n", - " no_mean_adjustment_scores += abs(perturbation_effects_list_no_mean_adjustment[GENE_IDX, :])\n", - " normal_mean_adjustment_scores += abs(perturbation_effects_list_normal_mean_adjustment[GENE_IDX, :])\n", - " dep_mean_adjustment_scores += abs(perturbation_effects_list_dep_mean_adjustment[GENE_IDX, :])\n", - " boolean_logic_scores += abs(perturbation_effects_list_boolean_logic[GENE_IDX, :])\n", - "\n", - " if (i + 1) % 5 == 0:\n", - " print(f\"iteration {i+1} completed\")\n", - " \n", - " no_mean_adjustment_scores /= n_iterations\n", - " normal_mean_adjustment_scores /= n_iterations\n", - " dep_mean_adjustment_scores /= n_iterations\n", - " boolean_logic_scores /= n_iterations\n", - " \n", - " return no_mean_adjustment_scores, normal_mean_adjustment_scores, dep_mean_adjustment_scores, boolean_logic_scores\n", - "\n", - "GENE_IDX = 0\n", - "experiment_results = experiment(n_iterations=50, GENE_IDX=GENE_IDX)\n", - "\n", - "def get_data_module(max_mean_adjustment, adjustment_function=default_perturbation_effect_adjustment_function, tf_relationships_dict={}):\n", - " return SyntheticDataLoader(\n", - " batch_size=32,\n", - " num_genes=4000,\n", - " bound_mean=3.0,\n", - " bound=[0.5] * 5,\n", - " n_sample=[1, 1, 2, 2, 4],\n", - " val_size=0.1,\n", - " test_size=0.1,\n", - " random_state=42,\n", - " max_mean_adjustment=max_mean_adjustment,\n", - " adjustment_function=adjustment_function,\n", - " 
tf_relationships=tf_relationships_dict,\n", - " )\n", - "\n", - "def get_model(num_tfs):\n", - " return CustomizableModel(\n", - " input_dim=num_tfs,\n", - " output_dim=num_tfs,\n", - " lr=0.01,\n", - " hidden_layer_num=2,\n", - " hidden_layer_sizes=[64, 32],\n", - " activation=\"LeakyReLU\",\n", - " optimizer=\"RMSprop\",\n", - " L2_regularization_term=0.0,\n", - " dropout_rate=0.0,\n", - " )\n", - "\n", - "def get_linear_model(num_tfs):\n", - " return SimpleModel(\n", - " input_dim=num_tfs,\n", - " output_dim=num_tfs,\n", - " lr=0.01\n", - " )\n", - "\n", - "def get_trainer():\n", - " return Trainer(\n", - " max_epochs=10,\n", - " deterministic=True,\n", - " accelerator=\"cpu\",\n", - " )\n", - "\n", - "model_ves = []\n", - "linear_model_test_ves = []" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "GPU available: False, used: False\n", - "TPU available: False, using: 0 TPU cores\n", - "IPU available: False, using: 0 IPUs\n", - "HPU available: False, using: 0 HPUs\n", - "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:260: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", - " X_train, Y_train = torch.tensor(X_train, dtype=torch.float32), torch.tensor(\n", - "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:263: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", - " X_val, Y_val = torch.tensor(X_val, dtype=torch.float32), torch.tensor(\n", - "/Users/ericjia/yeastdnnexplorer/yeastdnnexplorer/data_loaders/synthetic_data_loader.py:266: UserWarning: To copy construct from a tensor, it is 
recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", - " X_test, Y_test = torch.tensor(X_test, dtype=torch.float32), torch.tensor(\n", - "\n", - " | Name | Type | Params\n", - "----------------------------------------------------\n", - "0 | activation | LeakyReLU | 0 \n", - "1 | input_layer | Linear | 704 \n", - "2 | hidden_layers | ModuleList | 2.1 K \n", - "3 | output_layer | Linear | 330 \n", - "4 | dropout | Dropout | 0 \n", - "5 | mae | MeanAbsoluteError | 0 \n", - "6 | SMSE | SMSE | 0 \n", - "----------------------------------------------------\n", - "3.1 K Trainable params\n", - "0 Non-trainable params\n", - "3.1 K Total params\n", - "0.012 Total estimated model params size (MB)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Sanity Checking: | | 0/? [00:00" ] @@ -4741,11 +811,11 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "The x-axis labels the method in which the data was generated according to the 4 options above. The y-axis represents the corresponding variance explained attained by these models. Each point represents the variance explained achieved after generating the data based on the x-axis, and the color of the point represents which model architecture was trained on the data resulting in the specificed explained variance. 
Now, we can clearly see that across the 4 conditions, the nonlinear, customizable model achieves a significantly higher positive explained variance compared to the simple linear model, which is good because it helps to confirm that the nonlinear model we are using is able to train on the data and better account for the distribution of the data, resulting in a higher explained variance compared to the simple linear model. " + ] } ], "metadata": {