From d12b3d9772fd1a52873f8d81af395f7cc7e25166 Mon Sep 17 00:00:00 2001
From: Maike Vahl <m.vahl@tu-braunschweig.de>
Date: Tue, 3 Oct 2023 08:41:24 +0000
Subject: [PATCH] Upload New File

---
 QSRR_MLR_GPR_notebook.ipynb | 474 ++++++++++++++++++++++++++++++++++++
 1 file changed, 474 insertions(+)
 create mode 100644 QSRR_MLR_GPR_notebook.ipynb

diff --git a/QSRR_MLR_GPR_notebook.ipynb b/QSRR_MLR_GPR_notebook.ipynb
new file mode 100644
index 0000000..b71978b
--- /dev/null
+++ b/QSRR_MLR_GPR_notebook.ipynb
@@ -0,0 +1,474 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "5d630471",
+   "metadata": {},
+   "source": [
+    "# Notebook to perform multivariate linear regression (MLR) and Gaussian Process Regression (GPR)\n",
+    "\n",
+    "A regression class is provided. By initializing, two/three settings need to be specified:\n",
+    "1) Standardization True/False: Whether the data is standardized before the regression task\\\n",
+    "2) Choice for linear model: linBay=True -> BayesianRidge module, linBay=False -> LinearRegression module\n",
+    "\n",
+    "If GPR is used by setting gp=true, a specification whether the optimization of hyperparameters needs to be done (o_model=false) or if optimized hyperparameters are provided. Please comment the right keywords in the regression class/regression function out."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "e4d6a6e5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas     as pd\n",
+    "import numpy      as np\n",
+    "import pickle     as pkl\n",
+    "import GPy\n",
+    "\n",
+    "from sklearn.linear_model    import LinearRegression, BayesianRidge\n",
+    "from sklearn.preprocessing   import PolynomialFeatures, StandardScaler"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6f9e97a4",
+   "metadata": {},
+   "source": [
+    "### Data preparation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "42aaafae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# read in pandas dataframe for structures\n",
+    "df       = pd.read_pickle('QC_and_descriptor_dataframe.pkl')     # M=3570 molecules\n",
+    "df_ref   = pd.read_pickle('QC_and_descriptor_dataframe_ref.pkl') # K=27 molecules\n",
+    "\n",
+    "df_noRef = df.drop(index=df_ref.index.values).copy()     # M-K=L=3543 molecules"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "5416826e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# read in pandas dataframe for models\n",
+    "model_df = pd.read_pickle('QSRR_MLR_GPR_model_coefficients_new.pkl')\n",
+    "\n",
+    "# get optimized hyperparameters of GPR model Y for descriptor X, exemplary for C_FG\n",
+    "model_GPR_X_LU = model_df.C_FG.values[-2]\n",
+    "model_GPR_X_HO = model_df.C_FG.values[-3]\n",
+    "\n",
+    "model_A_GPR_X = model_df.C_FG.values[-1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "8378b6c1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get descriptor values from dataframes, exemplary for C_FG\n",
+    "X_raw     = np.stack(df_noRef.C_FG.values) # L=3543\n",
+    "X_ref_raw = np.stack(df_ref.C_FG.values)   # K=27"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "a50f3bbe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get HOMO and LUMO energies from dataframes\n",
+    "y_LU     = df_noRef.ELUMO.values # L=3545\n",
+    "y_HO     = df_noRef.EHOMO.values # L=3545\n",
+    "\n",
+    "y_ref_LU = df_ref.ELUMO.values   # K=27\n",
+    "y_ref_HO = df_ref.EHOMO.values   # K=27"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "ab5e62b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get experimental E parameters for K=27 reference molecules\n",
+    "E = df_ref.E2012.values.astype(float)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "addc2d78",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# data preparation\n",
+    "X     = PolynomialFeatures(1).fit_transform(X_raw)\n",
+    "X_ref = PolynomialFeatures(1).fit_transform(X_ref_raw)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f8515261",
+   "metadata": {},
+   "source": [
+    "### Regression class"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "f0032879",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class regression:\n",
+    "    \n",
+    "        def __init__(self, HO_ref, LU_ref, std=True, linBay=False):\n",
+    "                \n",
+    "            self.LU_ref = LU_ref\n",
+    "            self.HO_ref = HO_ref\n",
+    "            \n",
+    "            self.std    = std\n",
+    "            self.linBay = linBay\n",
+    "            \n",
+    "            # prepare for eq 6 (main text)\n",
+    "            set_scaler = np.append(self.HO_ref.reshape(-1,1), self.LU_ref.reshape(-1,1), axis=1)\n",
+    "            set_scaler = PolynomialFeatures(2).fit_transform(set_scaler)\n",
+    "            set_scaler = np.append(set_scaler, (1 / (self.LU_ref - self.HO_ref)).reshape(-1,1), axis=1)\n",
+    "            \n",
+    "            self.CDFT_scaler = StandardScaler().fit(set_scaler[:,1:])\n",
+    "\n",
+    "    #-----------------------------------------------------------------------------#\n",
+    "         \n",
+    "        def RMSE(self, f, y):\n",
+    "            \n",
+    "            '''Calculating the root-mean-square error (RMSE).'''\n",
+    "            \n",
+    "            return np.sqrt(np.mean((y.flatten() - f.flatten())**2))\n",
+    "        \n",
+    "    #-----------------------------------------------------------------------------#\n",
+    "        \n",
+    "        def r2(self, f, y):\n",
+    "            \n",
+    "            '''Calculating the squared correlation coefficient.'''\n",
+    "            \n",
+    "            return np.corrcoef(f.flatten(), y.flatten())[0,1]**2\n",
+    "            \n",
+    "    #-----------------------------------------------------------------------------#\n",
+    "        \n",
+    "        def R2(self, f, y):\n",
+    "            \n",
+    "            '''Calculating coefficient of determination R^2 values.'''\n",
+    "            \n",
+    "            return (1 - (self.RMSE(f, y)**2/(self.RMSE(np.mean(y), y)**2)))\n",
+    "        \n",
+    "    #-----------------------------------------------------------------------------#    \n",
+    "    \n",
+    "        def cdft_features(self, HO, LU):\n",
+    "            \n",
+    "            '''Preparing frontier molecular orbital energies for eq 6 (main text).'''\n",
+    "        \n",
+    "            X_cdft_full = np.append(HO.reshape(-1,1), LU.reshape(-1,1), axis=1)\n",
+    "            X_cdft_full = PolynomialFeatures(2).fit_transform(X_cdft_full)\n",
+    "            X_cdft_full = np.append(X_cdft_full, (1 / (LU - HO)).reshape(-1,1), axis=1)\n",
+    "            \n",
+    "            if self.std == True:\n",
+    "                print('std=True')\n",
+    "                X_cdft_full = X_cdft_full[:,1:]\n",
+    "            \n",
+    "                X_cdft_full = self.CDFT_scaler.transform(X_cdft_full)\n",
+    "                X_cdft_full = PolynomialFeatures(1).fit_transform(X_cdft_full)\n",
+    "            \n",
+    "            return X_cdft_full\n",
+    "                \n",
+    "    #-----------------------------------------------------------------------------#\n",
+    "    \n",
+    "        def regression(self, X, y, o_model=False, gp=False):\n",
+    "            \n",
+    "            '''Performing the MLR/GPR task. Returning the model, the predictions, \n",
+    "            and values for coefficient of determination and RMSE.\n",
+    "            o_model is a list and contains the optimized hyperparameters.'''\n",
+    "            \n",
+    "            if gp:\n",
+    "                \n",
+    "                kernel = GPy.kern.Matern32(input_dim=X.shape[1])\n",
+    "                model  = GPy.models.GPRegression(X, y.reshape(-1,1), kernel, normalizer=True)\n",
+    "                \n",
+    "                #model optimization with o_model=False\n",
+    "#                 model.optimize_restarts(num_restarts=5, parallel=True)\n",
+    "\n",
+    "                #optimized hyperparameter are provided\n",
+    "                model.kern.variance           = o_model[0]\n",
+    "                model.kern.lengthscale        = o_model[1] \n",
+    "                model.Gaussian_noise.variance = o_model[2]\n",
+    "                            \n",
+    "            elif self.linBay == True:\n",
+    "                model = BayesianRidge(fit_intercept=False, lambda_init=1e-3).fit(X, y) \n",
+    "            else:\n",
+    "                model = LinearRegression(fit_intercept=False).fit(X, y) \n",
+    "                \n",
+    "            if gp: \n",
+    "                y_pred = model.predict(X)[0].flatten()\n",
+    "            else:\n",
+    "                y_pred = model.predict(X)            \n",
+    "                  \n",
+    "            R2     = self.R2(y_pred, y)     \n",
+    "            RMSE   = self.RMSE(y_pred, y)         \n",
+    "            \n",
+    "            return model, y_pred, R2, RMSE\n",
+    "                 \n",
+    "    #-----------------------------------------------------------------------------#"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "57e11442",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# build class regression with specified input\n",
+    "self = regression(y_ref_HO, y_ref_LU, std=True, linBay=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "aa4eca68",
+   "metadata": {},
+   "source": [
+    "## The second step (QMP to $E$) \n",
+    "\n",
+    "Reference MLR (rMLR)\\\n",
+    "Final settings: std=True, linBay=False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "6c3926ca",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "std=True\n"
+     ]
+    }
+   ],
+   "source": [
+    "# rMLR model trained on K=27 molecules:\n",
+    "X_rMLR = self.cdft_features(y_ref_HO, y_ref_LU)\n",
+    "model_rMLR, y_pred_rMLR, R2_rMLR, RMSE_rMLR = self.regression(X_rMLR, E)\n",
+    "rMLR_coefs = model_rMLR.coef_.copy()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e2337912",
+   "metadata": {},
+   "source": [
+    "## The first step (structure to QMP) \n",
+    "\n",
+    "Path A and path B:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b23ae7a2",
+   "metadata": {},
+   "source": [
+    "### Path A"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "6f93524f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "std=True\n"
+     ]
+    }
+   ],
+   "source": [
+    "# calculation of E for non-reference values\n",
+    "# prepare QC values (for eq 6, main text)\n",
+    "X_cdft_QC = self.cdft_features(y_HO, y_LU)\n",
+    "E_ML_QC   = model_rMLR.predict(X_cdft_QC)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "4b30ce63",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# path A: set up MLR model\n",
+    "model_A,  y_pred_A,  R2_A,  RMSE_A = self.regression(X,  E_ML_QC)\n",
+    "\n",
+    "# path A (MLR): predictions for training (L=3543) and test (K=27) data\n",
+    "pred_A      = model_A.predict(X)\n",
+    "pred_ref_A  = model_A.predict(X_ref)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "bcab064a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# path A: set up GPR model\n",
+    "model_A_G,  y_pred_A_G,  R2_A_G,  RMSE_A_G = self.regression(X,  E_ML_QC, o_model=model_A_GPR_X, gp=True)\n",
+    "\n",
+    "# path A (GPR): predictions for training (L=3543) and test (K=27) data\n",
+    "pred_A_G      = model_A_G.predict(X)[0]\n",
+    "pred_ref_A_G  = model_A_G.predict(X_ref)[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8ac8ace1",
+   "metadata": {},
+   "source": [
+    "### Path B"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "20056317",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# path B: set up MLR models\n",
+    "model_LU, y_pred_LU, R2_LU, RMSE_LU = self.regression(X, y_LU)\n",
+    "model_HO, y_pred_HO, R2_HO, RMSE_HO = self.regression(X, y_HO)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "7b072dad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# path B: set up GPR models\n",
+    "model_LU_G, y_pred_LU_G, R2_LU_G, RMSE_LU_G = self.regression(X, y_LU, o_model=model_GPR_X_LU, gp=True)\n",
+    "model_HO_G, y_pred_HO_G, R2_HO_G, RMSE_HO_G = self.regression(X, y_HO, o_model=model_GPR_X_LU, gp=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "id": "aa0e1a12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# path B: MLR predictions of LUMO and HOMO energies for K=27 molecules \n",
+    "pred_ref_LU = model_LU.predict(X_ref)\n",
+    "pred_ref_HO = model_HO.predict(X_ref)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "id": "a8603208",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# path B: GPR predictions of LUMO and HOMO energies for K=27 molecules \n",
+    "pred_ref_LU_G = model_LU_G.predict(X_ref)[0]\n",
+    "pred_ref_HO_G = model_HO_G.predict(X_ref)[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "id": "b1e857e2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "std=True\n"
+     ]
+    }
+   ],
+   "source": [
+    "# path B (MLR): predictions of E with rMLR\n",
+    "X_cdft_ML_B = self.cdft_features(pred_ref_HO, pred_ref_LU)\n",
+    "E_ML_B      = model_rMLR.predict(X_cdft_ML_B)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "ea2b7934",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "std=True\n"
+     ]
+    }
+   ],
+   "source": [
+    "# path B (GPR): predictions of E with rMLR\n",
+    "X_cdft_ML_B_G = self.cdft_features(pred_ref_HO_G, pred_ref_LU_G)\n",
+    "E_ML_B_G      = model_rMLR.predict(X_cdft_ML_B_G)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "de5964ad",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
-- 
GitLab