Upload New File

23eeb785 · Jonny Proppe · 13073c8e · 23eeb785
Commit 23eeb785 authored 1 year ago by Jonny Proppe
--- a/QSRR_MLR_notebook.ipynb
+++ b/QSRR_MLR_notebook.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "5d630471",
+   "metadata": {},
+   "source": [
+    "# Notebook to perform multivariate linear regression (MLR)\n",
+    "\n",
+    "A regression class is provided. When it is initialized, two settings need to be specified:\n",
+    "1) Standardization: std=True/False -> whether the data are standardized before the regression task\\\n",
+    "2) Choice of linear model: linBay=True -> BayesianRidge module, linBay=False -> LinearRegression module"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e4d6a6e5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas     as pd\n",
+    "import numpy      as np\n",
+    "import pickle     as pkl\n",
+    "\n",
+    "from sklearn.linear_model    import LinearRegression, BayesianRidge\n",
+    "from sklearn.preprocessing   import PolynomialFeatures, StandardScaler"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6f9e97a4",
+   "metadata": {},
+   "source": [
+    "### Data preparation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "42aaafae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# read in pandas dataframe\n",
+    "df       = pd.read_pickle('QC_and_descriptor_dataframe.pkl')     # M=3570 molecules\n",
+    "df_ref   = pd.read_pickle('QC_and_descriptor_dataframe_ref.pkl') # K=27 molecules\n",
+    "\n",
+    "df_noRef = df.drop(index=df_ref.index.values).copy()     # M-K=L=3543 molecules"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8378b6c1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get descriptor values from dataframes, exemplary for C_FG\n",
+    "X_raw     = np.stack(df_noRef.C_FG.values) # L=3543\n",
+    "X_ref_raw = np.stack(df_ref.C_FG.values)   # K=27"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a50f3bbe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get HOMO and LUMO energies from dataframes\n",
+    "y_LU     = df_noRef.ELUMO.values # L=3543\n",
+    "y_HO     = df_noRef.EHOMO.values # L=3543\n",
+    "\n",
+    "y_ref_LU = df_ref.ELUMO.values   # K=27\n",
+    "y_ref_HO = df_ref.EHOMO.values   # K=27"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab5e62b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get experimental E parameters for K=27 reference molecules\n",
+    "E = df_ref.E2012.values.astype(float)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "addc2d78",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# data preparation\n",
+    "X     = PolynomialFeatures(1).fit_transform(StandardScaler().fit_transform(X_raw))\n",
+    "X_ref = PolynomialFeatures(1).fit_transform(StandardScaler().fit_transform(X_ref_raw))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f8515261",
+   "metadata": {},
+   "source": [
+    "### Regression class"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f0032879",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class regression:\n",
+    "    \n",
+    "        def __init__(self, HO_ref, LU_ref, std=True, linBay=False):\n",
+    "                \n",
+    "            self.LU_ref = LU_ref\n",
+    "            self.HO_ref = HO_ref\n",
+    "            \n",
+    "            self.std    = std\n",
+    "            self.linBay = linBay\n",
+    "            \n",
+    "            # prepare for eq 6 (main text)\n",
+    "            set_scaler = np.append(self.HO_ref.reshape(-1,1), self.LU_ref.reshape(-1,1), axis=1)\n",
+    "            set_scaler = PolynomialFeatures(2).fit_transform(set_scaler)\n",
+    "            set_scaler = np.append(set_scaler, (1 / (self.LU_ref - self.HO_ref)).reshape(-1,1), axis=1)\n",
+    "            \n",
+    "            self.CDFT_scaler = StandardScaler().fit(set_scaler[:,1:])\n",
+    "\n",
+    "    #-----------------------------------------------------------------------------#\n",
+    "         \n",
+    "        def RMSE(self, f, y):\n",
+    "            \n",
+    "            '''Calculating the root-mean-square error (RMSE).'''\n",
+    "            \n",
+    "            return np.sqrt(np.mean((y.flatten() - f.flatten())**2))\n",
+    "        \n",
+    "    #-----------------------------------------------------------------------------#\n",
+    "        \n",
+    "        def r2(self, f, y):\n",
+    "            \n",
+    "            '''Calculating the squared correlation coefficient.'''\n",
+    "            \n",
+    "            return np.corrcoef(f.flatten(), y.flatten())[0,1]**2\n",
+    "            \n",
+    "    #-----------------------------------------------------------------------------#\n",
+    "        \n",
+    "        def R2(self, f, y):\n",
+    "            \n",
+    "            '''Calculating coefficient of determination R^2 values.'''\n",
+    "            \n",
+    "            return (1 - (self.RMSE(f, y)**2/(self.RMSE(np.mean(y), y)**2)))\n",
+    "        \n",
+    "    #-----------------------------------------------------------------------------#    \n",
+    "    \n",
+    "        def cdft_features(self, HO, LU):\n",
+    "            \n",
+    "            '''Preparing frontier molecular orbital energies for eq 6 (main text).'''\n",
+    "        \n",
+    "            X_cdft_full = np.append(HO.reshape(-1,1), LU.reshape(-1,1), axis=1)\n",
+    "            X_cdft_full = PolynomialFeatures(2).fit_transform(X_cdft_full)\n",
+    "            X_cdft_full = np.append(X_cdft_full, (1 / (LU - HO)).reshape(-1,1), axis=1)\n",
+    "            \n",
+    "            if self.std == True:\n",
+    "                print('std=True')\n",
+    "                X_cdft_full = X_cdft_full[:,1:]\n",
+    "            \n",
+    "                X_cdft_full = self.CDFT_scaler.transform(X_cdft_full)\n",
+    "                X_cdft_full = StandardScaler().fit_transform(X_cdft_full)\n",
+    "                X_cdft_full = PolynomialFeatures(1).fit_transform(X_cdft_full)\n",
+    "            \n",
+    "            return X_cdft_full\n",
+    "                \n",
+    "    #-----------------------------------------------------------------------------#\n",
+    "    \n",
+    "        def regression(self, X, y):\n",
+    "            \n",
+    "            '''Performing the MLR task. Returning the model, the predictions, \n",
+    "            and values for coefficient of determination and RMSE.'''\n",
+    "            \n",
+    "            if self.linBay == True:\n",
+    "                model = BayesianRidge(fit_intercept=False, lambda_init=1e-3).fit(X, y) \n",
+    "                print(model)\n",
+    "            else:\n",
+    "                model = LinearRegression(fit_intercept=False).fit(X, y) \n",
+    "                print(model)\n",
+    "            \n",
+    "            y_pred = model.predict(X)            \n",
+    "            R2     = self.R2(y_pred, y)     \n",
+    "            RMSE   = self.RMSE(y_pred, y)         \n",
+    "            \n",
+    "            return model, y_pred, R2, RMSE\n",
+    "                 \n",
+    "    #-----------------------------------------------------------------------------#"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "57e11442",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# build class regression with specified input\n",
+    "self = regression(y_ref_HO, y_ref_LU, std=True, linBay=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "aa4eca68",
+   "metadata": {},
+   "source": [
+    "## The second step (QMP to $E$) \n",
+    "\n",
+    "Reference MLR (rMLR)\\\n",
+    "Final settings: std=True, linBay=False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6c3926ca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# rMLR model trained on K=27 molecules:\n",
+    "X_rMLR = self.cdft_features(y_ref_HO, y_ref_LU)\n",
+    "model_rMLR, y_pred_rMLR, R2_rMLR, RMSE_rMLR = self.regression(X_rMLR, E)\n",
+    "rMLR_coefs = model_rMLR.coef_.copy()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e2337912",
+   "metadata": {},
+   "source": [
+    "## The first step (structure to QMP) \n",
+    "\n",
+    "Path A and path B:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b23ae7a2",
+   "metadata": {},
+   "source": [
+    "### Path A"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6f93524f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# calculation of E for non-reference values\n",
+    "# prepare QC values (for eq 6, main text)\n",
+    "X_cdft_QC = self.cdft_features(y_HO, y_LU)\n",
+    "E_ML_QC   = model_rMLR.predict(X_cdft_QC)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4b30ce63",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# path A: set up MLR model\n",
+    "model_A,  y_pred_A,  R2_A,  RMSE_A = self.regression(X,  E_ML_QC)\n",
+    "\n",
+    "# path A: predictions for training (L=3543) and test (K=27) data\n",
+    "pred_A      = model_A.predict(X)\n",
+    "pred_ref_A  = model_A.predict(X_ref)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8ac8ace1",
+   "metadata": {},
+   "source": [
+    "### Path B"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "20056317",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# path B: set up MLR models\n",
+    "model_LU, y_pred_LU, R2_LU, RMSE_LU = self.regression(X, y_LU)\n",
+    "model_HO, y_pred_HO, R2_HO, RMSE_HO = self.regression(X, y_HO)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aa0e1a12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# path B: predictions of LUMO and HOMO energies for K=27 molecules \n",
+    "pred_ref_LU = model_LU.predict(X_ref)\n",
+    "pred_ref_HO = model_HO.predict(X_ref)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1e857e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# path B: predictions of E with rMLR\n",
+    "X_cdft_ML_B = self.cdft_features(pred_ref_HO, pred_ref_LU)\n",
+    "E_ML_B      = model_rMLR.predict(X_cdft_ML_B)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
+%% Cell type:markdown id:5d630471 tags:
+
+# Notebook to perform multivariate linear regression (MLR)
+
+A regression class is provided. When it is initialized, two settings need to be specified:
+1) Standardization: std=True/False -> whether the data are standardized before the regression task\
+2) Choice of linear model: linBay=True -> BayesianRidge module, linBay=False -> LinearRegression module
+
+%% Cell type:code id:e4d6a6e5 tags:
+
+``` python
+import pandas     as pd
+import numpy      as np
+import pickle     as pkl
+
+from sklearn.linear_model    import LinearRegression, BayesianRidge
+from sklearn.preprocessing   import PolynomialFeatures, StandardScaler
+```
+
+%% Cell type:markdown id:6f9e97a4 tags:
+
+### Data preparation
+
+%% Cell type:code id:42aaafae tags:
+
+``` python
+# Read the pickled pandas dataframes: the full data set and the reference subset.
+# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
+df       = pd.read_pickle('QC_and_descriptor_dataframe.pkl')     # M=3570 molecules
+df_ref   = pd.read_pickle('QC_and_descriptor_dataframe_ref.pkl') # K=27 molecules
+
+# Drop the reference rows (matched by index label) to obtain the non-reference set.
+df_noRef = df.drop(index=df_ref.index.values).copy()     # M-K=L=3543 molecules
+```
+
+%% Cell type:code id:8378b6c1 tags:
+
+``` python
+# Get descriptor values from the dataframes, using column C_FG as an example.
+# np.stack joins the per-row entries along a new leading axis; presumably each
+# entry is a 1-D descriptor vector of equal length - TODO confirm.
+X_raw     = np.stack(df_noRef.C_FG.values) # L=3543
+X_ref_raw = np.stack(df_ref.C_FG.values)   # K=27
+```
+
+%% Cell type:code id:a50f3bbe tags:
+
+``` python
+# get HOMO and LUMO energies from dataframes
+y_LU     = df_noRef.ELUMO.values # L=3543
+y_HO     = df_noRef.EHOMO.values # L=3543
+
+y_ref_LU = df_ref.ELUMO.values   # K=27
+y_ref_HO = df_ref.EHOMO.values   # K=27
+```
+
+%% Cell type:code id:ab5e62b2 tags:
+
+``` python
+# Get experimental E parameters for the K=27 reference molecules.
+# The explicit cast makes E a float array regardless of how column E2012 is stored.
+E = df_ref.E2012.values.astype(float)
+```
+
+%% Cell type:code id:addc2d78 tags:
+
+``` python
+# Data preparation: standardize every descriptor column, then let
+# PolynomialFeatures(1) prepend a constant column, since the models below are
+# fit with fit_intercept=False.
+# NOTE(review): X_ref is standardized with a scaler fit on X_ref_raw itself, not
+# with the scaler fit on X_raw, although models trained on X are later applied
+# to X_ref - confirm that this is intended.
+X     = PolynomialFeatures(1).fit_transform(StandardScaler().fit_transform(X_raw))
+X_ref = PolynomialFeatures(1).fit_transform(StandardScaler().fit_transform(X_ref_raw))
+```
+
+%% Cell type:markdown id:f8515261 tags:
+
+### Regression class
+
+%% Cell type:code id:f0032879 tags:
+
+``` python
+class regression:
+
+        '''Helper for the multivariate linear regression (MLR) workflow.
+
+        Bundles the error metrics (RMSE, r2, R2), the construction of the
+        feature matrix for eq 6 (main text) from HOMO/LUMO energies, and the
+        fit of a linear model (LinearRegression or BayesianRidge).
+        '''
+
+        def __init__(self, HO_ref, LU_ref, std=True, linBay=False):
+
+            '''Store the settings and fit a scaler on the reference features.
+
+            HO_ref, LU_ref : numpy arrays with the HOMO and LUMO energies of
+                             the reference molecules (same length)
+            std            : if True, cdft_features() standardizes its output
+            linBay         : if True, regression() fits BayesianRidge,
+                             otherwise LinearRegression
+            '''
+
+            self.LU_ref = LU_ref
+            self.HO_ref = HO_ref
+
+            self.std    = std
+            self.linBay = linBay
+
+            # prepare for eq 6 (main text): same columns as in cdft_features(),
+            # i.e. [1, HO, LU, HO^2, HO*LU, LU^2] from PolynomialFeatures(2)
+            # plus the appended inverse gap 1/(LU-HO)
+            set_scaler = np.append(self.HO_ref.reshape(-1,1), self.LU_ref.reshape(-1,1), axis=1)
+            set_scaler = PolynomialFeatures(2).fit_transform(set_scaler)
+            set_scaler = np.append(set_scaler, (1 / (self.LU_ref - self.HO_ref)).reshape(-1,1), axis=1)
+
+            # fit the scaler on all columns except the leading constant column
+            self.CDFT_scaler = StandardScaler().fit(set_scaler[:,1:])
+
+    #-----------------------------------------------------------------------------#
+
+        def RMSE(self, f, y):
+
+            '''Return the root-mean-square error (RMSE) between f and y.
+
+            Both arguments are flattened first, so they must provide
+            .flatten() (numpy arrays or numpy scalars).
+            '''
+
+            return np.sqrt(np.mean((y.flatten() - f.flatten())**2))
+
+    #-----------------------------------------------------------------------------#
+
+        def r2(self, f, y):
+
+            '''Return the squared correlation coefficient of f and y
+            (off-diagonal element of np.corrcoef, squared).'''
+
+            return np.corrcoef(f.flatten(), y.flatten())[0,1]**2
+
+    #-----------------------------------------------------------------------------#
+
+        def R2(self, f, y):
+
+            '''Return the coefficient of determination R^2 of predictions f
+            for targets y, computed as 1 - MSE(f, y) / MSE(mean(y), y).
+
+            NOTE(review): the denominator is zero when y is constant.
+            '''
+
+            return (1 - (self.RMSE(f, y)**2/(self.RMSE(np.mean(y), y)**2)))
+
+    #-----------------------------------------------------------------------------#
+
+        def cdft_features(self, HO, LU):
+
+            '''Build the feature matrix for eq 6 (main text) from frontier
+            molecular orbital energies.
+
+            HO, LU : numpy arrays of HOMO and LUMO energies (same length)
+
+            Returns a 2-D array with the columns
+            [1, HO, LU, HO^2, HO*LU, LU^2, 1/(LU-HO)]; if self.std is True the
+            six non-constant columns are standardized and the constant column
+            is re-attached in front.
+
+            NOTE(review): 1/(LU-HO) is undefined where LU equals HO.
+            '''
+
+            X_cdft_full = np.append(HO.reshape(-1,1), LU.reshape(-1,1), axis=1)
+            X_cdft_full = PolynomialFeatures(2).fit_transform(X_cdft_full)
+            X_cdft_full = np.append(X_cdft_full, (1 / (LU - HO)).reshape(-1,1), axis=1)
+
+            if self.std == True:
+                print('std=True')
+                # drop the leading constant column before scaling
+                X_cdft_full = X_cdft_full[:,1:]
+
+                # NOTE(review): the fresh StandardScaler below is fit on the
+                # data passed in, which re-centers and re-scales every column
+                # with the statistics of the current input. This appears to
+                # cancel the preceding transform with the reference scaler, so
+                # inputs other than the reference set are scaled by their own
+                # statistics - confirm which of the two steps is intended.
+                X_cdft_full = self.CDFT_scaler.transform(X_cdft_full)
+                X_cdft_full = StandardScaler().fit_transform(X_cdft_full)
+                # re-attach a constant column (models are fit without intercept)
+                X_cdft_full = PolynomialFeatures(1).fit_transform(X_cdft_full)
+
+            return X_cdft_full
+
+    #-----------------------------------------------------------------------------#
+
+        def regression(self, X, y):
+
+            '''Perform the MLR task on feature matrix X and targets y.
+
+            Fits BayesianRidge if self.linBay is True, otherwise
+            LinearRegression; both without intercept, so X has to contain its
+            own constant column. The fitted model is printed.
+
+            Returns (model, y_pred, R2, RMSE), where y_pred, R2 and RMSE are
+            evaluated on the same X and y that were used for fitting.
+            '''
+
+            if self.linBay == True:
+                model = BayesianRidge(fit_intercept=False, lambda_init=1e-3).fit(X, y)
+                print(model)
+            else:
+                model = LinearRegression(fit_intercept=False).fit(X, y)
+                print(model)
+
+            # in-sample predictions and metrics
+            y_pred = model.predict(X)
+            R2     = self.R2(y_pred, y)
+            RMSE   = self.RMSE(y_pred, y)
+
+            return model, y_pred, R2, RMSE
+
+    #-----------------------------------------------------------------------------#
+```
+
+%% Cell type:code id:57e11442 tags:
+
+``` python
+# Build the regression object from the reference HOMO/LUMO energies (HOMO first).
+# NOTE(review): the instance is bound to the global name `self`, which all later
+# cells use; a descriptive name would be clearer, but renaming it requires
+# updating every later cell.
+self = regression(y_ref_HO, y_ref_LU, std=True, linBay=False)
+```
+
+%% Cell type:markdown id:aa4eca68 tags:
+
+## The second step (QMP to $E$)
+
+Reference MLR (rMLR)\
+Final settings: std=True, linBay=False
+
+%% Cell type:code id:6c3926ca tags:
+
+``` python
+# rMLR model trained on K=27 molecules:
+# features from the reference HOMO/LUMO energies, targets are the experimental E values
+X_rMLR = self.cdft_features(y_ref_HO, y_ref_LU)
+model_rMLR, y_pred_rMLR, R2_rMLR, RMSE_rMLR = self.regression(X_rMLR, E)
+# keep a copy of the fitted coefficients
+rMLR_coefs = model_rMLR.coef_.copy()
+```
+
+%% Cell type:markdown id:e2337912 tags:
+
+## The first step (structure to QMP)
+
+Path A and path B:
+
+%% Cell type:markdown id:b23ae7a2 tags:
+
+### Path A
+
+%% Cell type:code id:6f93524f tags:
+
+``` python
+# calculation of E for the non-reference molecules
+# prepare QC values (for eq 6, main text) and predict E with the rMLR model
+X_cdft_QC = self.cdft_features(y_HO, y_LU)
+E_ML_QC   = model_rMLR.predict(X_cdft_QC)
+```
+
+%% Cell type:code id:4b30ce63 tags:
+
+``` python
+# path A: set up MLR model (descriptors X -> E values predicted by rMLR)
+model_A,  y_pred_A,  R2_A,  RMSE_A = self.regression(X,  E_ML_QC)
+
+# path A: predictions for training (L=3543) and test (K=27) data
+# (pred_A repeats the in-sample prediction already returned as y_pred_A)
+pred_A      = model_A.predict(X)
+pred_ref_A  = model_A.predict(X_ref)
+```
+
+%% Cell type:markdown id:8ac8ace1 tags:
+
+### Path B
+
+%% Cell type:code id:20056317 tags:
+
+``` python
+# path B: set up MLR models, one per orbital energy, both fit on the same
+# descriptor matrix X
+model_LU, y_pred_LU, R2_LU, RMSE_LU = self.regression(X, y_LU)
+model_HO, y_pred_HO, R2_HO, RMSE_HO = self.regression(X, y_HO)
+```
+
+%% Cell type:code id:aa0e1a12 tags:
+
+``` python
+# path B: predictions of LUMO and HOMO energies for K=27 molecules
+# (models fit on X are applied to the reference descriptors X_ref)
+pred_ref_LU = model_LU.predict(X_ref)
+pred_ref_HO = model_HO.predict(X_ref)
+```
+
+%% Cell type:code id:b1e857e2 tags:
+
+``` python
+# path B: predictions of E with rMLR, using the predicted (not the QC)
+# HOMO/LUMO energies of the reference molecules as input for eq 6 (main text)
+X_cdft_ML_B = self.cdft_features(pred_ref_HO, pred_ref_LU)
+E_ML_B      = model_rMLR.predict(X_cdft_ML_B)
+```