Commit

Add files via upload
dhw059 authored Apr 10, 2024
1 parent 830d919 commit 545623f
Showing 3 changed files with 282 additions and 0 deletions.
8 changes: 8 additions & 0 deletions benchmarks/matbench_v0.1_LGDCNN_v1.0_composition/info.json
@@ -0,0 +1,8 @@
{
"authors": "Hongwei Du, Hong Wang (original code by Hongwei Du)",
"algorithm": "LGDCNN",
"algorithm_long": "Rational Design of Deep Learning Networks Based on Fusion Strategy for Improved Materials Property Predictions. See github page for more information: https://github.com/dhw059/DeepModelFusion.",
"bibtex_refs": "@article{UnderReview, author = {Hongwei Du, Hong Wang}, title = {Rational Design of Deep Learning Networks Based on Fusion Strategy for Improved Materials Property Predictions}, journal = {Journal of Chemical Theory and Computation}, volume = {}, number = {}, pages = {}, year = {2024}, doi = {Under Review}, URL = {}, eprint = {}}",
"requirements": "See GitHub page for LGDCNN.",
"notes": ""
}
@@ -0,0 +1,274 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from matbench.bench import MatbenchBenchmark\n",
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
"import torch\n",
"from sklearn.metrics import roc_auc_score \n",
"from lgdcnn.fusion_lstm_dcnn import LGDCNN\n",
"from lgdcnn.train import Model\n",
"from lgdcnn.utils.get_compute_device import get_compute_device\n",
"\n",
"compute_device = get_compute_device(prefer_last=False)\n",
"RNG_SEED = 42\n",
"torch.manual_seed(RNG_SEED)\n",
"np.random.seed(RNG_SEED)\n",
"model_name = \"L-G-DCNN-matbench\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# %%\n",
"def get_model(data_dir,model_name, mat_prop, i, classification=False, batch_size=None,\n",
" transfer=None, verbose=True):\n",
" # Get the TorchedLGDCNN architecture loaded\n",
" model = Model(LGDCNN(compute_device=compute_device).to(compute_device),\n",
" model_name=f'{mat_prop}{i}', verbose=verbose)\n",
"\n",
" # Train network starting at pretrained weights\n",
" if transfer is not None:\n",
" model.load_network(f'{transfer}.pth')\n",
" model.model_name = f'{mat_prop}'\n",
"\n",
" # Apply BCEWithLogitsLoss to model output if binary classification is True\n",
" if classification:\n",
" model.classification = True\n",
"\n",
" # Get the datafiles you will learn from\n",
" train_data = f'{data_dir}/{mat_prop}/train.csv'\n",
" val_data = f'{data_dir}/{mat_prop}/val.csv'\n",
"\n",
" # Load the train and validation data before fitting the network\n",
" data_size = pd.read_csv(train_data).shape[0]\n",
" batch_size = 2**round(np.log2(data_size)-4)\n",
" if batch_size < 2**7:\n",
" batch_size = 2**7\n",
" if batch_size > 2**12:\n",
" batch_size = 2**12\n",
" \n",
" model.load_data(train_data, batch_size=batch_size//2, train=True)\n",
" print(f'training with batchsize {model.batch_size} '\n",
" f'(2**{np.log2(model.batch_size):0.3f})')\n",
" model.load_data(val_data, batch_size=batch_size//2)\n",
"\n",
" # Set the number of epochs, decide if you want a loss curve to be plotted\n",
" model.fit(epochs=300, losscurve=False)\n",
"\n",
" # Save the network (saved as f\"{model_name}.pth\")\n",
" model.save_network(model_name)\n",
" return model\n",
"\n",
"def load_model(data_dir, model_name, mat_prop, i, classification, file_name, verbose=True):\n",
" # Load up a saved network.\n",
" model = Model(LGDCNN(compute_device=compute_device).to(compute_device),\n",
" model_name=f'{mat_prop}{i}', verbose=verbose)\n",
" model.load_network(model_name, f'{mat_prop}{i}.pth')\n",
"\n",
" # Check if classifcation task\n",
" if classification:\n",
" model.classification = True\n",
" # Load the data you want to predict with\n",
" data = f'{data_dir}/{mat_prop}/{file_name}'\n",
" # data is reloaded to model.data_loader\n",
" model.load_data(data, batch_size=2**9)\n",
" return model\n",
"\n",
"def get_results(model):\n",
" output = model.predict(model.data_loader) # predict the data saved here\n",
" return model, output\n",
"\n",
"def to_csv(output, save_name):\n",
" # parse output and save to csv\n",
" act, pred, formulae, uncertainty = output\n",
" df = pd.DataFrame([formulae, act, pred, uncertainty]).T\n",
" # df.columns = ['composition', 'target', 'pred-0', 'uncertainty']\n",
" df.columns = ['formula', 'actual', 'predicted', 'uncertainty']\n",
" save_path = 'matbench_predictions/'\n",
" os.makedirs(save_path, exist_ok=True)\n",
" df.to_csv(f'{save_path}/{save_name}', index_label='Index')\n",
" \n",
"\n",
"def save_results(data_dir, model_name,mat_prop, fold, classification, file_name, ):\n",
" model = load_model(data_dir, model_name,mat_prop, fold, classification, file_name = 'test.csv' )\n",
" model, output = get_results(model)\n",
" \n",
" # Get appropriate metrics for saving to csv\n",
" if model.classification:\n",
" auc = roc_auc_score(output[0], output[1])\n",
" print(f'\\n{mat_prop} ROC AUC: {auc:0.3f}')\n",
" else:\n",
" mae = np.abs(output[0] - output[1]).mean()\n",
" print(f'\\n{mat_prop} mae: {mae:0.3g}')\n",
"\n",
" # save predictions to a csv\n",
" fname = f'{mat_prop}_{file_name.replace(\".csv\", \"\")}_output{fold}.csv'\n",
" to_csv(output, fname)\n",
" return model, output"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"\n",
"#condesne_formula takes a material and returns the chemical formula in the correct format for LGDCNN\n",
"def condense_formula(mat):\n",
" if isinstance(mat, str):\n",
" return mat\n",
" else:\n",
" return mat.formula.replace(' ', '')\n",
"\n",
"#change_input runs condesne_formula on all the input data used for training\n",
"def change_input(train_inputs):\n",
" inputs = []\n",
" for input in train_inputs:\n",
" inputs.append(condense_formula(input))\n",
" return inputs\n",
"\n",
"#make_df creates a data frame containing the train inputs and outputs for LGDCNN\n",
"def make_df(train_inputs, train_outputs):\n",
" input_df = pd.DataFrame({'formula': train_inputs, 'target': train_outputs})\n",
" return input_df\n",
"\n",
"#make_df_test creates a data frame containing the test inputs for LGDCNN\n",
"def make_df_test(test_inputs, test_outputs):\n",
" test_df = pd.DataFrame({'formula' : test_inputs, 'target': test_outputs})\n",
" # test_df['target'] = np.nan\n",
" return test_df\n",
"\n",
"#split_train_val splits the training data into two sets: training and validation\n",
"def split_train_val(df):\n",
" df = df.sample(frac = 1.0, random_state = 7)\n",
" val_df = df.sample(frac = 0.1, random_state = 7)\n",
" train_df = df.drop(val_df.index) \n",
" print(train_df.shape, val_df.shape) \n",
" return train_df, val_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"subset = [\"matbench_jdft2d\", \"matbench_steels\", \n",
" \"matbench_perovskites\", \"matbench_expt_gap\",\n",
" \"matbench_phonons\", \"matbench_dielectric\", \n",
" \"matbench_log_gvrh\", \"matbench_log_kvrh\",\n",
" \"matbench_mp_gap\", \"matbench_mp_e_form\"]\n",
"\n",
"mb = MatbenchBenchmark(autoload=False, subset=subset)\n",
"data_dir = 'data/matbench_temp'\n",
"os.makedirs(data_dir, exist_ok= True)\n",
"\n",
"results_dict = {}\n",
"\n",
"for task in mb.tasks:\n",
" task.load()\n",
" mat_prop = task.dataset_name\n",
" os.makedirs(f'{data_dir}/{mat_prop}', exist_ok= True)\n",
" for fold in task.folds:\n",
" train_inputs, train_outputs = task.get_train_and_val_data(fold)\n",
" test_inputs , test_outputs = task.get_test_data(fold, include_target=True)\n",
"\n",
" #Preparing the inputs data for LGDCNN\n",
" inputs = change_input(train_inputs)\n",
" df = make_df(inputs, train_outputs)\n",
"\n",
" #Creating the training and validation sets\n",
" train_df, val_df = split_train_val(df)\n",
" train_df.to_csv(f'{data_dir}/{mat_prop}/train.csv')\n",
" val_df.to_csv(f'{data_dir}/{mat_prop}/val.csv')\n",
"\n",
" #Getting and preparing the testing data\n",
" test_inputs_formula = change_input(test_inputs)\n",
" test_df = make_df_test(test_inputs_formula, test_outputs)\n",
" test_df.to_csv(f'{data_dir}/{mat_prop}/test.csv')\n",
"\n",
" #Training LGDCNN\n",
" model = get_model(data_dir, model_name, mat_prop, fold, classification = False, verbose = True, )\n",
" \n",
" model_test, output = save_results(data_dir, model_name,mat_prop, fold, classification = False,\n",
" file_name='test.csv',)\n",
" \n",
" # Recording our data!\n",
" predictions = output[1]\n",
" task.record(fold, predictions)\n",
"\n",
"# Saving our results\n",
"mb.to_file(\"LGDCNN_\"+mat_prop +\".json\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import gzip\n",
"import shutil\n",
"\n",
"def compress_json(input_file, output_file):\n",
" with open(input_file, 'rb') as f_in:\n",
" with gzip.open(output_file, 'wb') as f_out:\n",
" shutil.copyfileobj(f_in, f_out)\n",
"\n",
"compress_json('results.json', 'results.json.gz')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "pytorch",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.17"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "7dd5d76405b906035e1d1a24c7f24088f68ab8fc773386bbbd9b8e7c7c6d48a3"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Binary file not shown.
