-
Notifications
You must be signed in to change notification settings - Fork 47
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
282 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
{ | ||
"authors": "Hongwei Du, Hong Wang (original code by Hongwei Du)", | ||
"algorithm": "LGDCNN", | ||
"algorithm_long": "Rational Design of Deep Learning Networks Based on Fusion Strategy for Improved Materials Property Predictions. See github page for more information: https://github.com/dhw059/DeepModelFusion.", | ||
"bibtex_refs": "@article{UnderReview, author = {Hongwei Du, Hong Wang}, title = {Rational Design of Deep Learning Networks Based on Fusion Strategy for Improved Materials Property Predictions}, journal = {Journal of Chemical Theory and Computation}, volume = {}, number = {}, pages = {}, year = {2024}, doi = {Under Review}, URL = {}, eprint = {}}", | ||
"requirements": "See GitHub page for LGDCNN.", | ||
"notes": "" | ||
} |
274 changes: 274 additions & 0 deletions
274
benchmarks/matbench_v0.1_LGDCNN_v1.0_composition/matbench_notebook.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,274 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from matbench.bench import MatbenchBenchmark\n", | ||
"import os\n", | ||
"import numpy as np\n", | ||
"import pandas as pd\n", | ||
"import torch\n", | ||
"from sklearn.metrics import roc_auc_score \n", | ||
"from lgdcnn.fusion_lstm_dcnn import LGDCNN\n", | ||
"from lgdcnn.train import Model\n", | ||
"from lgdcnn.utils.get_compute_device import get_compute_device\n", | ||
"\n", | ||
"compute_device = get_compute_device(prefer_last=False)\n", | ||
"RNG_SEED = 42\n", | ||
"torch.manual_seed(RNG_SEED)\n", | ||
"np.random.seed(RNG_SEED)\n", | ||
"model_name = \"L-G-DCNN-matbench\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# %%\n", | ||
"def get_model(data_dir,model_name, mat_prop, i, classification=False, batch_size=None,\n", | ||
" transfer=None, verbose=True):\n", | ||
" # Get the TorchedLGDCNN architecture loaded\n", | ||
" model = Model(LGDCNN(compute_device=compute_device).to(compute_device),\n", | ||
" model_name=f'{mat_prop}{i}', verbose=verbose)\n", | ||
"\n", | ||
" # Train network starting at pretrained weights\n", | ||
" if transfer is not None:\n", | ||
" model.load_network(f'{transfer}.pth')\n", | ||
" model.model_name = f'{mat_prop}'\n", | ||
"\n", | ||
" # Apply BCEWithLogitsLoss to model output if binary classification is True\n", | ||
" if classification:\n", | ||
" model.classification = True\n", | ||
"\n", | ||
" # Get the datafiles you will learn from\n", | ||
" train_data = f'{data_dir}/{mat_prop}/train.csv'\n", | ||
" val_data = f'{data_dir}/{mat_prop}/val.csv'\n", | ||
"\n", | ||
" # Load the train and validation data before fitting the network\n", | ||
" data_size = pd.read_csv(train_data).shape[0]\n", | ||
" batch_size = 2**round(np.log2(data_size)-4)\n", | ||
" if batch_size < 2**7:\n", | ||
" batch_size = 2**7\n", | ||
" if batch_size > 2**12:\n", | ||
" batch_size = 2**12\n", | ||
" \n", | ||
" model.load_data(train_data, batch_size=batch_size//2, train=True)\n", | ||
" print(f'training with batchsize {model.batch_size} '\n", | ||
" f'(2**{np.log2(model.batch_size):0.3f})')\n", | ||
" model.load_data(val_data, batch_size=batch_size//2)\n", | ||
"\n", | ||
" # Set the number of epochs, decide if you want a loss curve to be plotted\n", | ||
" model.fit(epochs=300, losscurve=False)\n", | ||
"\n", | ||
" # Save the network (saved as f\"{model_name}.pth\")\n", | ||
" model.save_network(model_name)\n", | ||
" return model\n", | ||
"\n", | ||
"def load_model(data_dir, model_name, mat_prop, i, classification, file_name, verbose=True):\n", | ||
" # Load up a saved network.\n", | ||
" model = Model(LGDCNN(compute_device=compute_device).to(compute_device),\n", | ||
" model_name=f'{mat_prop}{i}', verbose=verbose)\n", | ||
" model.load_network(model_name, f'{mat_prop}{i}.pth')\n", | ||
"\n", | ||
" # Check if classifcation task\n", | ||
" if classification:\n", | ||
" model.classification = True\n", | ||
" # Load the data you want to predict with\n", | ||
" data = f'{data_dir}/{mat_prop}/{file_name}'\n", | ||
" # data is reloaded to model.data_loader\n", | ||
" model.load_data(data, batch_size=2**9)\n", | ||
" return model\n", | ||
"\n", | ||
"def get_results(model):\n", | ||
" output = model.predict(model.data_loader) # predict the data saved here\n", | ||
" return model, output\n", | ||
"\n", | ||
"def to_csv(output, save_name):\n", | ||
" # parse output and save to csv\n", | ||
" act, pred, formulae, uncertainty = output\n", | ||
" df = pd.DataFrame([formulae, act, pred, uncertainty]).T\n", | ||
" # df.columns = ['composition', 'target', 'pred-0', 'uncertainty']\n", | ||
" df.columns = ['formula', 'actual', 'predicted', 'uncertainty']\n", | ||
" save_path = 'matbench_predictions/'\n", | ||
" os.makedirs(save_path, exist_ok=True)\n", | ||
" df.to_csv(f'{save_path}/{save_name}', index_label='Index')\n", | ||
" \n", | ||
"\n", | ||
"def save_results(data_dir, model_name,mat_prop, fold, classification, file_name, ):\n", | ||
" model = load_model(data_dir, model_name,mat_prop, fold, classification, file_name = 'test.csv' )\n", | ||
" model, output = get_results(model)\n", | ||
" \n", | ||
" # Get appropriate metrics for saving to csv\n", | ||
" if model.classification:\n", | ||
" auc = roc_auc_score(output[0], output[1])\n", | ||
" print(f'\\n{mat_prop} ROC AUC: {auc:0.3f}')\n", | ||
" else:\n", | ||
" mae = np.abs(output[0] - output[1]).mean()\n", | ||
" print(f'\\n{mat_prop} mae: {mae:0.3g}')\n", | ||
"\n", | ||
" # save predictions to a csv\n", | ||
" fname = f'{mat_prop}_{file_name.replace(\".csv\", \"\")}_output{fold}.csv'\n", | ||
" to_csv(output, fname)\n", | ||
" return model, output" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"\n", | ||
"#condesne_formula takes a material and returns the chemical formula in the correct format for LGDCNN\n", | ||
"def condense_formula(mat):\n", | ||
" if isinstance(mat, str):\n", | ||
" return mat\n", | ||
" else:\n", | ||
" return mat.formula.replace(' ', '')\n", | ||
"\n", | ||
"#change_input runs condesne_formula on all the input data used for training\n", | ||
"def change_input(train_inputs):\n", | ||
" inputs = []\n", | ||
" for input in train_inputs:\n", | ||
" inputs.append(condense_formula(input))\n", | ||
" return inputs\n", | ||
"\n", | ||
"#make_df creates a data frame containing the train inputs and outputs for LGDCNN\n", | ||
"def make_df(train_inputs, train_outputs):\n", | ||
" input_df = pd.DataFrame({'formula': train_inputs, 'target': train_outputs})\n", | ||
" return input_df\n", | ||
"\n", | ||
"#make_df_test creates a data frame containing the test inputs for LGDCNN\n", | ||
"def make_df_test(test_inputs, test_outputs):\n", | ||
" test_df = pd.DataFrame({'formula' : test_inputs, 'target': test_outputs})\n", | ||
" # test_df['target'] = np.nan\n", | ||
" return test_df\n", | ||
"\n", | ||
"#split_train_val splits the training data into two sets: training and validation\n", | ||
"def split_train_val(df):\n", | ||
" df = df.sample(frac = 1.0, random_state = 7)\n", | ||
" val_df = df.sample(frac = 0.1, random_state = 7)\n", | ||
" train_df = df.drop(val_df.index) \n", | ||
" print(train_df.shape, val_df.shape) \n", | ||
" return train_df, val_df" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"subset = [\"matbench_jdft2d\", \"matbench_steels\", \n", | ||
" \"matbench_perovskites\", \"matbench_expt_gap\",\n", | ||
" \"matbench_phonons\", \"matbench_dielectric\", \n", | ||
" \"matbench_log_gvrh\", \"matbench_log_kvrh\",\n", | ||
" \"matbench_mp_gap\", \"matbench_mp_e_form\"]\n", | ||
"\n", | ||
"mb = MatbenchBenchmark(autoload=False, subset=subset)\n", | ||
"data_dir = 'data/matbench_temp'\n", | ||
"os.makedirs(data_dir, exist_ok= True)\n", | ||
"\n", | ||
"results_dict = {}\n", | ||
"\n", | ||
"for task in mb.tasks:\n", | ||
" task.load()\n", | ||
" mat_prop = task.dataset_name\n", | ||
" os.makedirs(f'{data_dir}/{mat_prop}', exist_ok= True)\n", | ||
" for fold in task.folds:\n", | ||
" train_inputs, train_outputs = task.get_train_and_val_data(fold)\n", | ||
" test_inputs , test_outputs = task.get_test_data(fold, include_target=True)\n", | ||
"\n", | ||
" #Preparing the inputs data for LGDCNN\n", | ||
" inputs = change_input(train_inputs)\n", | ||
" df = make_df(inputs, train_outputs)\n", | ||
"\n", | ||
" #Creating the training and validation sets\n", | ||
" train_df, val_df = split_train_val(df)\n", | ||
" train_df.to_csv(f'{data_dir}/{mat_prop}/train.csv')\n", | ||
" val_df.to_csv(f'{data_dir}/{mat_prop}/val.csv')\n", | ||
"\n", | ||
" #Getting and preparing the testing data\n", | ||
" test_inputs_formula = change_input(test_inputs)\n", | ||
" test_df = make_df_test(test_inputs_formula, test_outputs)\n", | ||
" test_df.to_csv(f'{data_dir}/{mat_prop}/test.csv')\n", | ||
"\n", | ||
" #Training LGDCNN\n", | ||
" model = get_model(data_dir, model_name, mat_prop, fold, classification = False, verbose = True, )\n", | ||
" \n", | ||
" model_test, output = save_results(data_dir, model_name,mat_prop, fold, classification = False,\n", | ||
" file_name='test.csv',)\n", | ||
" \n", | ||
" # Recording our data!\n", | ||
" predictions = output[1]\n", | ||
" task.record(fold, predictions)\n", | ||
"\n", | ||
"# Saving our results\n", | ||
"mb.to_file(\"LGDCNN_\"+mat_prop +\".json\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import gzip\n", | ||
"import shutil\n", | ||
"\n", | ||
"def compress_json(input_file, output_file):\n", | ||
" with open(input_file, 'rb') as f_in:\n", | ||
" with gzip.open(output_file, 'wb') as f_out:\n", | ||
" shutil.copyfileobj(f_in, f_out)\n", | ||
"\n", | ||
"compress_json('results.json', 'results.json.gz')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "pytorch", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.17" | ||
}, | ||
"orig_nbformat": 4, | ||
"vscode": { | ||
"interpreter": { | ||
"hash": "7dd5d76405b906035e1d1a24c7f24088f68ab8fc773386bbbd9b8e7c7c6d48a3" | ||
} | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Binary file not shown.