Add files via upload

materialsproject · Apr 10, 2024 · 545623f · 545623f
1 parent 830d919
commit 545623f
Show file tree

Hide file tree

Showing 3 changed files with 282 additions and 0 deletions.
diff --git a/benchmarks/matbench_v0.1_LGDCNN_v1.0_composition/info.json b/benchmarks/matbench_v0.1_LGDCNN_v1.0_composition/info.json
@@ -0,0 +1,8 @@
+{
+    "authors": "Hongwei Du, Hong Wang (original code by Hongwei Du)",
+    "algorithm": "LGDCNN",
+    "algorithm_long": "Rational Design of Deep Learning Networks Based on Fusion Strategy for Improved Materials Property Predictions. See github page for more information: https://github.com/dhw059/DeepModelFusion.",
+    "bibtex_refs": "@article{UnderReview, author = {Hongwei Du and Hong Wang}, title = {Rational Design of Deep Learning Networks Based on Fusion Strategy for Improved Materials Property Predictions}, journal = {Journal of Chemical Theory and Computation}, volume = {}, number = {}, pages = {}, year = {2024}, doi = {Under Review}, URL = {}, eprint = {}}",
+    "requirements": "See GitHub page for LGDCNN.",
+    "notes": ""
+}
diff --git a/benchmarks/matbench_v0.1_LGDCNN_v1.0_composition/matbench_notebook.ipynb b/benchmarks/matbench_v0.1_LGDCNN_v1.0_composition/matbench_notebook.ipynb
@@ -0,0 +1,274 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from matbench.bench import MatbenchBenchmark\n",
+    "import os\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import torch\n",
+    "from sklearn.metrics import roc_auc_score  \n",
+    "from lgdcnn.fusion_lstm_dcnn import LGDCNN\n",
+    "from lgdcnn.train import Model\n",
+    "from lgdcnn.utils.get_compute_device import get_compute_device\n",
+    "\n",
+    "# Device handed to LGDCNN(...) and .to(...) whenever a network is built below.\n",
+    "# NOTE(review): the meaning of prefer_last is defined in lgdcnn.utils - confirm there.\n",
+    "compute_device = get_compute_device(prefer_last=False)\n",
+    "# Seed the torch and numpy global RNGs with the same fixed value.\n",
+    "RNG_SEED = 42\n",
+    "torch.manual_seed(RNG_SEED)\n",
+    "np.random.seed(RNG_SEED)\n",
+    "# Name passed to Model.save_network / Model.load_network by the helpers in the next cell.\n",
+    "model_name = \"L-G-DCNN-matbench\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%\n",
+    "def get_model(data_dir, model_name, mat_prop, i, classification=False,\n",
+    "              batch_size=None, transfer=None, verbose=True):\n",
+    "    \"\"\"Train an LGDCNN network for one property/fold and save it.\n",
+    "\n",
+    "    Reads ``{data_dir}/{mat_prop}/train.csv`` and ``val.csv``, fits for 300\n",
+    "    epochs and saves the network via ``model.save_network(model_name)``.\n",
+    "\n",
+    "    Args:\n",
+    "        data_dir: Directory holding one sub-directory per property.\n",
+    "        model_name: Name handed to ``save_network``.\n",
+    "        mat_prop: Property (dataset) name; selects the data sub-directory.\n",
+    "        i: Fold identifier, appended to ``mat_prop`` to name the Model.\n",
+    "        classification: When True, sets ``model.classification = True``.\n",
+    "        batch_size: Batch size for both loaders. When None (default) it is\n",
+    "            derived from the number of training rows:\n",
+    "            2**round(log2(rows) - 4), clamped to [2**7, 2**12], then halved.\n",
+    "            An explicit value is used as given (it used to be overwritten\n",
+    "            by the heuristic and therefore ignored).\n",
+    "        transfer: Optional checkpoint stem; ``{transfer}.pth`` is loaded\n",
+    "            before training and the Model is renamed to ``mat_prop``.\n",
+    "        verbose: Forwarded to ``Model``.\n",
+    "\n",
+    "    Returns:\n",
+    "        The fitted ``Model``.\n",
+    "    \"\"\"\n",
+    "    # Build the network on the selected device and wrap it in the trainer\n",
+    "    model = Model(LGDCNN(compute_device=compute_device).to(compute_device),\n",
+    "                  model_name=f'{mat_prop}{i}', verbose=verbose)\n",
+    "\n",
+    "    # Optionally start from previously saved weights\n",
+    "    if transfer is not None:\n",
+    "        model.load_network(f'{transfer}.pth')\n",
+    "        model.model_name = f'{mat_prop}'\n",
+    "\n",
+    "    # Flag the model as a classifier when requested\n",
+    "    if classification:\n",
+    "        model.classification = True\n",
+    "\n",
+    "    # Data files this fold learns from\n",
+    "    train_data = f'{data_dir}/{mat_prop}/train.csv'\n",
+    "    val_data = f'{data_dir}/{mat_prop}/val.csv'\n",
+    "\n",
+    "    # Only fall back to the size-based heuristic when the caller did not\n",
+    "    # choose a batch size; this also avoids reading the CSV just to count rows.\n",
+    "    if batch_size is None:\n",
+    "        data_size = pd.read_csv(train_data).shape[0]\n",
+    "        batch_size = 2**round(np.log2(data_size)-4)\n",
+    "        batch_size = min(max(batch_size, 2**7), 2**12)\n",
+    "        batch_size = batch_size // 2\n",
+    "\n",
+    "    model.load_data(train_data, batch_size=batch_size, train=True)\n",
+    "    print(f'training with batchsize {model.batch_size} '\n",
+    "          f'(2**{np.log2(model.batch_size):0.3f})')\n",
+    "    model.load_data(val_data, batch_size=batch_size)\n",
+    "\n",
+    "    # Fixed 300 epochs, no loss-curve plot\n",
+    "    model.fit(epochs=300, losscurve=False)\n",
+    "\n",
+    "    # NOTE(review): presumably written as f\"{model_name}.pth\" - confirm in lgdcnn.train\n",
+    "    model.save_network(model_name)\n",
+    "    return model\n",
+    "\n",
+    "def load_model(data_dir, model_name, mat_prop, i, classification, file_name, verbose=True):\n",
+    "    \"\"\"Rebuild an LGDCNN Model, restore saved weights and attach a data file.\n",
+    "\n",
+    "    The Model is named ``{mat_prop}{i}``; weights are restored with\n",
+    "    ``load_network(model_name, '{mat_prop}{i}.pth')`` and the file\n",
+    "    ``{data_dir}/{mat_prop}/{file_name}`` is loaded with a batch size of 2**9.\n",
+    "\n",
+    "    Returns:\n",
+    "        The ``Model`` with the data attached (see ``model.data_loader``).\n",
+    "    \"\"\"\n",
+    "    run_name = f'{mat_prop}{i}'\n",
+    "    network = LGDCNN(compute_device=compute_device).to(compute_device)\n",
+    "    model = Model(network, model_name=run_name, verbose=verbose)\n",
+    "    # NOTE(review): get_model calls load_network with a single path argument;\n",
+    "    # confirm this two-argument form against lgdcnn.train.Model.load_network.\n",
+    "    model.load_network(model_name, f'{run_name}.pth')\n",
+    "\n",
+    "    # Flag the model as a classifier when requested\n",
+    "    if classification:\n",
+    "        model.classification = True\n",
+    "\n",
+    "    # The data to predict on is (re)loaded into model.data_loader\n",
+    "    data_path = f'{data_dir}/{mat_prop}/{file_name}'\n",
+    "    model.load_data(data_path, batch_size=2**9)\n",
+    "    return model\n",
+    "\n",
+    "def get_results(model):\n",
+    "    \"\"\"Predict on the data currently attached to ``model``; return ``(model, output)``.\"\"\"\n",
+    "    return model, model.predict(model.data_loader)\n",
+    "\n",
+    "def to_csv(output, save_name):\n",
+    "    \"\"\"Write a prediction tuple to ``matbench_predictions/<save_name>``.\n",
+    "\n",
+    "    ``output`` is unpacked as ``(actual, predicted, formulae, uncertainty)``;\n",
+    "    the CSV columns are written in the order formula, actual, predicted,\n",
+    "    uncertainty, with the row index labelled ``Index``.\n",
+    "    \"\"\"\n",
+    "    act, pred, formulae, uncertainty = output\n",
+    "    # One row per field, labelled, then transposed into one column per field\n",
+    "    table = pd.DataFrame([formulae, act, pred, uncertainty],\n",
+    "                         index=['formula', 'actual', 'predicted', 'uncertainty']).T\n",
+    "    out_dir = 'matbench_predictions/'\n",
+    "    os.makedirs(out_dir, exist_ok=True)\n",
+    "    table.to_csv(f'{out_dir}/{save_name}', index_label='Index')\n",
+    "    \n",
+    "\n",
+    "def save_results(data_dir, model_name, mat_prop, fold, classification, file_name):\n",
+    "    \"\"\"Predict on one data file with a saved model, print a metric, save a CSV.\n",
+    "\n",
+    "    Args:\n",
+    "        data_dir: Directory holding one sub-directory per property.\n",
+    "        model_name: Forwarded to ``load_model``.\n",
+    "        mat_prop: Property (dataset) name.\n",
+    "        fold: Fold identifier; used for the model name and the CSV name.\n",
+    "        classification: Forwarded to ``load_model``.\n",
+    "        file_name: CSV inside ``{data_dir}/{mat_prop}`` to predict on. It is\n",
+    "            now actually used for loading; previously ``'test.csv'`` was\n",
+    "            hard-coded there while this name only labelled the output file.\n",
+    "\n",
+    "    Returns:\n",
+    "        ``(model, output)`` where ``output[0]`` holds the targets and\n",
+    "        ``output[1]`` the predictions.\n",
+    "    \"\"\"\n",
+    "    model = load_model(data_dir, model_name, mat_prop, fold, classification,\n",
+    "                       file_name=file_name)\n",
+    "    model, output = get_results(model)\n",
+    "\n",
+    "    # Report ROC AUC for classifiers, mean absolute error otherwise\n",
+    "    if model.classification:\n",
+    "        auc = roc_auc_score(output[0], output[1])\n",
+    "        print(f'\\n{mat_prop} ROC AUC: {auc:0.3f}')\n",
+    "    else:\n",
+    "        mae = np.abs(output[0] - output[1]).mean()\n",
+    "        print(f'\\n{mat_prop} mae: {mae:0.3g}')\n",
+    "\n",
+    "    # Save predictions to a csv named after the property, data file and fold\n",
+    "    fname = f'{mat_prop}_{file_name.replace(\".csv\", \"\")}_output{fold}.csv'\n",
+    "    to_csv(output, fname)\n",
+    "    return model, output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# condense_formula takes a material and returns its chemical formula as a string without spaces\n",
+    "def condense_formula(mat):\n",
+    "    \"\"\"Return ``mat`` unchanged if it is a str, else ``mat.formula`` with spaces removed.\"\"\"\n",
+    "    if not isinstance(mat, str):\n",
+    "        return mat.formula.replace(' ', '')\n",
+    "    return mat\n",
+    "\n",
+    "# change_input runs condense_formula on every entry of the given inputs\n",
+    "def change_input(train_inputs):\n",
+    "    \"\"\"Return a list with ``condense_formula`` applied to each item of ``train_inputs``.\"\"\"\n",
+    "    # Comprehension instead of an append loop; also avoids shadowing the builtin input()\n",
+    "    return [condense_formula(material) for material in train_inputs]\n",
+    "\n",
+    "# make_df builds the two-column frame (formula, target) written out as LGDCNN training data\n",
+    "def make_df(train_inputs, train_outputs):\n",
+    "    \"\"\"Return a DataFrame with columns ``formula`` and ``target``.\"\"\"\n",
+    "    columns = {'formula': train_inputs, 'target': train_outputs}\n",
+    "    return pd.DataFrame(columns)\n",
+    "\n",
+    "# make_df_test builds the frame of test inputs and their targets for LGDCNN\n",
+    "def make_df_test(test_inputs, test_outputs):\n",
+    "    \"\"\"Return a DataFrame with columns ``formula`` and ``target`` for the test split.\"\"\"\n",
+    "    return pd.DataFrame(data={'formula': test_inputs, 'target': test_outputs})\n",
+    "\n",
+    "# split_train_val splits the training data into two sets: training and validation\n",
+    "def split_train_val(df):\n",
+    "    \"\"\"Shuffle ``df`` and hold out 10% of its rows as a validation set.\n",
+    "\n",
+    "    Both the shuffle and the hold-out draw use ``random_state=7``. The shapes\n",
+    "    of the two parts are printed.\n",
+    "\n",
+    "    Returns:\n",
+    "        ``(train_df, val_df)``; the input frame itself is not modified.\n",
+    "    \"\"\"\n",
+    "    shuffled = df.sample(frac=1.0, random_state=7)\n",
+    "    val_df = shuffled.sample(frac=0.1, random_state=7)\n",
+    "    # Everything that was not drawn for validation is used for training\n",
+    "    train_df = shuffled.drop(index=val_df.index)\n",
+    "    print(train_df.shape, val_df.shape)\n",
+    "    return train_df, val_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Matbench tasks to benchmark\n",
+    "subset = [\"matbench_jdft2d\", \"matbench_steels\", \n",
+    "          \"matbench_perovskites\", \"matbench_expt_gap\",\n",
+    "          \"matbench_phonons\", \"matbench_dielectric\", \n",
+    "          \"matbench_log_gvrh\", \"matbench_log_kvrh\",\n",
+    "          \"matbench_mp_gap\", \"matbench_mp_e_form\"]\n",
+    "\n",
+    "mb = MatbenchBenchmark(autoload=False, subset=subset)\n",
+    "# Scratch directory for the per-fold train/val/test CSV files\n",
+    "data_dir = 'data/matbench_temp'\n",
+    "os.makedirs(data_dir, exist_ok= True)\n",
+    "\n",
+    "results_dict = {}\n",
+    "\n",
+    "for task in mb.tasks:\n",
+    "    task.load()\n",
+    "    mat_prop = task.dataset_name\n",
+    "    os.makedirs(f'{data_dir}/{mat_prop}', exist_ok= True)\n",
+    "    for fold in task.folds:\n",
+    "        train_inputs, train_outputs = task.get_train_and_val_data(fold)\n",
+    "        test_inputs , test_outputs = task.get_test_data(fold, include_target=True)\n",
+    "\n",
+    "        # Preparing the input data for LGDCNN (formula strings + targets)\n",
+    "        inputs = change_input(train_inputs)\n",
+    "        df = make_df(inputs, train_outputs)\n",
+    "\n",
+    "        # Creating the training and validation sets; the files are\n",
+    "        # overwritten on every fold\n",
+    "        train_df, val_df = split_train_val(df)\n",
+    "        train_df.to_csv(f'{data_dir}/{mat_prop}/train.csv')\n",
+    "        val_df.to_csv(f'{data_dir}/{mat_prop}/val.csv')\n",
+    "\n",
+    "        # Getting and preparing the testing data\n",
+    "        test_inputs_formula = change_input(test_inputs)\n",
+    "        test_df = make_df_test(test_inputs_formula, test_outputs)\n",
+    "        test_df.to_csv(f'{data_dir}/{mat_prop}/test.csv')\n",
+    "\n",
+    "        # Training LGDCNN on this fold\n",
+    "        model = get_model(data_dir, model_name, mat_prop, fold, classification = False, verbose = True, )\n",
+    "        \n",
+    "        # Reload the saved network and predict on the held-out test file\n",
+    "        model_test, output = save_results(data_dir, model_name,mat_prop, fold, classification = False,\n",
+    "                                     file_name='test.csv',)\n",
+    "        \n",
+    "        # Recording our data! output[1] holds the predictions\n",
+    "        predictions = output[1]\n",
+    "        task.record(fold, predictions)\n",
+    "\n",
+    "# Saving our results. The file is named results.json because the compression\n",
+    "# cell further down reads 'results.json'; the previous name was derived from\n",
+    "# whichever task happened to run last, so that cell never found its input.\n",
+    "mb.to_file(\"results.json\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gzip\n",
+    "import shutil\n",
+    "\n",
+    "def compress_json(input_file, output_file):\n",
+    "    \"\"\"Gzip-compress the bytes of ``input_file`` into ``output_file``.\"\"\"\n",
+    "    with open(input_file, 'rb') as raw, gzip.open(output_file, 'wb') as packed:\n",
+    "        shutil.copyfileobj(raw, packed)\n",
+    "\n",
+    "# Gzip 'results.json' (must already exist in the working directory) into 'results.json.gz'.\n",
+    "compress_json('results.json', 'results.json.gz')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "pytorch",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.17"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "7dd5d76405b906035e1d1a24c7f24088f68ab8fc773386bbbd9b8e7c7c6d48a3"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/benchmarks/matbench_v0.1_LGDCNN_v1.0_composition/results.json.gz b/benchmarks/matbench_v0.1_LGDCNN_v1.0_composition/results.json.gz