Delete QSRR_MLR_notebook.ipynb

b105fbf4 · Maike Vahl · b6d23409 · b6d23409
Commit b105fbf4 authored 1 year ago by Maike Vahl
--- a/QSRR_MLR_notebook.ipynb
+++ b/QSRR_MLR_notebook.ipynb
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "5d630471",
-   "metadata": {},
-   "source": [
-    "# Notebook to perform multivariate linear regression (MLR)\n",
-    "\n",
-    "A regression class is provided. By initializing, two settings need to be specified:\n",
-    "1) Standardization True/False: Whether the data is standardized before the regression task\\\n",
-    "2) Choice for linear model: linBay=True -> BayesianRidge module, linBay=False -> LinearRegression module"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e4d6a6e5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas     as pd\n",
-    "import numpy      as np\n",
-    "import pickle     as pkl\n",
-    "\n",
-    "from sklearn.linear_model    import LinearRegression, BayesianRidge\n",
-    "from sklearn.preprocessing   import PolynomialFeatures, StandardScaler"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6f9e97a4",
-   "metadata": {},
-   "source": [
-    "### Data preparation"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "42aaafae",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# read in pandas dataframe\n",
-    "df       = pd.read_pickle('QC_and_descriptor_dataframe.pkl')     # M=3570 molecules\n",
-    "df_ref   = pd.read_pickle('QC_and_descriptor_dataframe_ref.pkl') # K=27 molecules\n",
-    "\n",
-    "df_noRef = df.drop(index=df_ref.index.values).copy()     # M-K=L=3543 molecules"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8378b6c1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# get descriptor values from dataframes, exemplary for C_FG\n",
-    "X_raw     = np.stack(df_noRef.C_FG.values) # L=3543\n",
-    "X_ref_raw = np.stack(df_ref.C_FG.values)   # K=27"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a50f3bbe",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# get HOMO and LUMO energies from dataframes\n",
-    "y_LU     = df_noRef.ELUMO.values # L=3545\n",
-    "y_HO     = df_noRef.EHOMO.values # L=3545\n",
-    "\n",
-    "y_ref_LU = df_ref.ELUMO.values   # K=27\n",
-    "y_ref_HO = df_ref.EHOMO.values   # K=27"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ab5e62b2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# get experimental E parameters for K=27 reference molecules\n",
-    "E = df_ref.E2012.values.astype(float)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "addc2d78",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# data preparation\n",
-    "X     = PolynomialFeatures(1).fit_transform(StandardScaler().fit_transform(X_raw))\n",
-    "X_ref = PolynomialFeatures(1).fit_transform(StandardScaler().fit_transform(X_ref_raw))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f8515261",
-   "metadata": {},
-   "source": [
-    "### Regression class"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f0032879",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "class regression:\n",
-    "    \n",
-    "        def __init__(self, HO_ref, LU_ref, std=True, linBay=False):\n",
-    "                \n",
-    "            self.LU_ref = LU_ref\n",
-    "            self.HO_ref = HO_ref\n",
-    "            \n",
-    "            self.std    = std\n",
-    "            self.linBay = linBay\n",
-    "            \n",
-    "            # prepare for eq 6 (main text)\n",
-    "            set_scaler = np.append(self.HO_ref.reshape(-1,1), self.LU_ref.reshape(-1,1), axis=1)\n",
-    "            set_scaler = PolynomialFeatures(2).fit_transform(set_scaler)\n",
-    "            set_scaler = np.append(set_scaler, (1 / (self.LU_ref - self.HO_ref)).reshape(-1,1), axis=1)\n",
-    "            \n",
-    "            self.CDFT_scaler = StandardScaler().fit(set_scaler[:,1:])\n",
-    "\n",
-    "    #-----------------------------------------------------------------------------#\n",
-    "         \n",
-    "        def RMSE(self, f, y):\n",
-    "            \n",
-    "            '''Calculating the root-mean-square error (RMSE).'''\n",
-    "            \n",
-    "            return np.sqrt(np.mean((y.flatten() - f.flatten())**2))\n",
-    "        \n",
-    "    #-----------------------------------------------------------------------------#\n",
-    "        \n",
-    "        def r2(self, f, y):\n",
-    "            \n",
-    "            '''Calculating the squared correlation coefficient.'''\n",
-    "            \n",
-    "            return np.corrcoef(f.flatten(), y.flatten())[0,1]**2\n",
-    "            \n",
-    "    #-----------------------------------------------------------------------------#\n",
-    "        \n",
-    "        def R2(self, f, y):\n",
-    "            \n",
-    "            '''Calculating coefficient of determination R^2 values.'''\n",
-    "            \n",
-    "            return (1 - (self.RMSE(f, y)**2/(self.RMSE(np.mean(y), y)**2)))\n",
-    "        \n",
-    "    #-----------------------------------------------------------------------------#    \n",
-    "    \n",
-    "        def cdft_features(self, HO, LU):\n",
-    "            \n",
-    "            '''Preparing frontier molecular orbital energies for eq 6 (main text).'''\n",
-    "        \n",
-    "            X_cdft_full = np.append(HO.reshape(-1,1), LU.reshape(-1,1), axis=1)\n",
-    "            X_cdft_full = PolynomialFeatures(2).fit_transform(X_cdft_full)\n",
-    "            X_cdft_full = np.append(X_cdft_full, (1 / (LU - HO)).reshape(-1,1), axis=1)\n",
-    "            \n",
-    "            if self.std == True:\n",
-    "                print('std=True')\n",
-    "                X_cdft_full = X_cdft_full[:,1:]\n",
-    "            \n",
-    "                X_cdft_full = self.CDFT_scaler.transform(X_cdft_full)\n",
-    "                X_cdft_full = StandardScaler().fit_transform(X_cdft_full)\n",
-    "                X_cdft_full = PolynomialFeatures(1).fit_transform(X_cdft_full)\n",
-    "            \n",
-    "            return X_cdft_full\n",
-    "                \n",
-    "    #-----------------------------------------------------------------------------#\n",
-    "    \n",
-    "        def regression(self, X, y):\n",
-    "            \n",
-    "            '''Performing the MLR task. Returning the model, the predictions, \n",
-    "            and values for coefficient of determination and RMSE.'''\n",
-    "            \n",
-    "            if self.linBay == True:\n",
-    "                model = BayesianRidge(fit_intercept=False, lambda_init=1e-3).fit(X, y) \n",
-    "                print(model)\n",
-    "            else:\n",
-    "                model = LinearRegression(fit_intercept=False).fit(X, y) \n",
-    "                print(model)\n",
-    "            \n",
-    "            y_pred = model.predict(X)            \n",
-    "            R2     = self.R2(y_pred, y)     \n",
-    "            RMSE   = self.RMSE(y_pred, y)         \n",
-    "            \n",
-    "            return model, y_pred, R2, RMSE\n",
-    "                 \n",
-    "    #-----------------------------------------------------------------------------#"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "57e11442",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# build class regression with specified input\n",
-    "self = regression(y_ref_HO, y_ref_LU, std=True, linBay=False)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "aa4eca68",
-   "metadata": {},
-   "source": [
-    "## The second step (QMP to $E$) \n",
-    "\n",
-    "Reference MLR (rMLR)\\\n",
-    "Final settings: std=True, linBay=False"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6c3926ca",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# rMLR model trained on K=27 molecules:\n",
-    "X_rMLR = self.cdft_features(y_ref_HO, y_ref_LU)\n",
-    "model_rMLR, y_pred_rMLR, R2_rMLR, RMSE_rMLR = self.regression(X_rMLR, E)\n",
-    "rMLR_coefs = model_rMLR.coef_.copy()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "e2337912",
-   "metadata": {},
-   "source": [
-    "## The first step (structure to QMP) \n",
-    "\n",
-    "Path A and path B:"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "b23ae7a2",
-   "metadata": {},
-   "source": [
-    "### Path A"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6f93524f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# calculation of E for non-reference values\n",
-    "# prepare QC values (for eq 6, main text)\n",
-    "X_cdft_QC = self.cdft_features(y_HO, y_LU)\n",
-    "E_ML_QC   = model_rMLR.predict(X_cdft_QC)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4b30ce63",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# path A: set up MLR model\n",
-    "model_A,  y_pred_A,  R2_A,  RMSE_A = self.regression(X,  E_ML_QC)\n",
-    "\n",
-    "# path A: predictions for training (L=3543) and test (K=27) data\n",
-    "pred_A      = model_A.predict(X)\n",
-    "pred_ref_A  = model_A.predict(X_ref)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8ac8ace1",
-   "metadata": {},
-   "source": [
-    "### Path B"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "20056317",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# path B: set up MLR models\n",
-    "model_LU, y_pred_LU, R2_LU, RMSE_LU = self.regression(X, y_LU)\n",
-    "model_HO, y_pred_HO, R2_HO, RMSE_HO = self.regression(X, y_HO)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "aa0e1a12",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# path B: predictions of LUMO and HOMO energies for K=27 molecules \n",
-    "pred_ref_LU = model_LU.predict(X_ref)\n",
-    "pred_ref_HO = model_HO.predict(X_ref)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b1e857e2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# path B: predictions of E with rMLR\n",
-    "X_cdft_ML_B = self.cdft_features(pred_ref_HO, pred_ref_LU)\n",
-    "E_ML_B      = model_rMLR.predict(X_cdft_ML_B)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.7"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
-%% Cell type:markdown id:5d630471 tags:
-# Notebook to perform multivariate linear regression (MLR)
-A regression class is provided. By initializing, two settings need to be specified:
-1) Standardization True/False: Whether the data is standardized before the regression task\
-2) Choice for linear model: linBay=True -> BayesianRidge module, linBay=False -> LinearRegression module
-%% Cell type:code id:e4d6a6e5 tags:
-``` python
-import pandas     as pd
-import numpy      as np
-import pickle     as pkl
-from sklearn.linear_model    import LinearRegression, BayesianRidge
-from sklearn.preprocessing   import PolynomialFeatures, StandardScaler
-```
-%% Cell type:markdown id:6f9e97a4 tags:
-### Data preparation
-%% Cell type:code id:42aaafae tags:
-``` python
-# read in pandas dataframe
-df       = pd.read_pickle('QC_and_descriptor_dataframe.pkl')     # M=3570 molecules
-df_ref   = pd.read_pickle('QC_and_descriptor_dataframe_ref.pkl') # K=27 molecules
-df_noRef = df.drop(index=df_ref.index.values).copy()     # M-K=L=3543 molecules
-```
-%% Cell type:code id:8378b6c1 tags:
-``` python
-# get descriptor values from dataframes, exemplary for C_FG
-X_raw     = np.stack(df_noRef.C_FG.values) # L=3543
-X_ref_raw = np.stack(df_ref.C_FG.values)   # K=27
-```
-%% Cell type:code id:a50f3bbe tags:
-``` python
-# get HOMO and LUMO energies from dataframes
-y_LU     = df_noRef.ELUMO.values # L=3545
-y_HO     = df_noRef.EHOMO.values # L=3545
-y_ref_LU = df_ref.ELUMO.values   # K=27
-y_ref_HO = df_ref.EHOMO.values   # K=27
-```
-%% Cell type:code id:ab5e62b2 tags:
-``` python
-# get experimental E parameters for K=27 reference molecules
-E = df_ref.E2012.values.astype(float)
-```
-%% Cell type:code id:addc2d78 tags:
-``` python
-# data preparation
-X     = PolynomialFeatures(1).fit_transform(StandardScaler().fit_transform(X_raw))
-X_ref = PolynomialFeatures(1).fit_transform(StandardScaler().fit_transform(X_ref_raw))
-```
-%% Cell type:markdown id:f8515261 tags:
-### Regression class
-%% Cell type:code id:f0032879 tags:
-``` python
-class regression:
-        def __init__(self, HO_ref, LU_ref, std=True, linBay=False):
-            self.LU_ref = LU_ref
-            self.HO_ref = HO_ref
-            self.std    = std
-            self.linBay = linBay
-            # prepare for eq 6 (main text)
-            set_scaler = np.append(self.HO_ref.reshape(-1,1), self.LU_ref.reshape(-1,1), axis=1)
-            set_scaler = PolynomialFeatures(2).fit_transform(set_scaler)
-            set_scaler = np.append(set_scaler, (1 / (self.LU_ref - self.HO_ref)).reshape(-1,1), axis=1)
-            self.CDFT_scaler = StandardScaler().fit(set_scaler[:,1:])
-    #-----------------------------------------------------------------------------#
-        def RMSE(self, f, y):
-            '''Calculating the root-mean-square error (RMSE).'''
-            return np.sqrt(np.mean((y.flatten() - f.flatten())**2))
-    #-----------------------------------------------------------------------------#
-        def r2(self, f, y):
-            '''Calculating the squared correlation coefficient.'''
-            return np.corrcoef(f.flatten(), y.flatten())[0,1]**2
-    #-----------------------------------------------------------------------------#
-        def R2(self, f, y):
-            '''Calculating coefficient of determination R^2 values.'''
-            return (1 - (self.RMSE(f, y)**2/(self.RMSE(np.mean(y), y)**2)))
-    #-----------------------------------------------------------------------------#
-        def cdft_features(self, HO, LU):
-            '''Preparing frontier molecular orbital energies for eq 6 (main text).'''
-            X_cdft_full = np.append(HO.reshape(-1,1), LU.reshape(-1,1), axis=1)
-            X_cdft_full = PolynomialFeatures(2).fit_transform(X_cdft_full)
-            X_cdft_full = np.append(X_cdft_full, (1 / (LU - HO)).reshape(-1,1), axis=1)
-            if self.std == True:
-                print('std=True')
-                X_cdft_full = X_cdft_full[:,1:]
-                X_cdft_full = self.CDFT_scaler.transform(X_cdft_full)
-                X_cdft_full = StandardScaler().fit_transform(X_cdft_full)
-                X_cdft_full = PolynomialFeatures(1).fit_transform(X_cdft_full)
-            return X_cdft_full
-    #-----------------------------------------------------------------------------#
-        def regression(self, X, y):
-            '''Performing the MLR task. Returning the model, the predictions,
-            and values for coefficient of determination and RMSE.'''
-            if self.linBay == True:
-                model = BayesianRidge(fit_intercept=False, lambda_init=1e-3).fit(X, y)
-                print(model)
-            else:
-                model = LinearRegression(fit_intercept=False).fit(X, y)
-                print(model)
-            y_pred = model.predict(X)
-            R2     = self.R2(y_pred, y)
-            RMSE   = self.RMSE(y_pred, y)
-            return model, y_pred, R2, RMSE
-    #-----------------------------------------------------------------------------#
-```
-%% Cell type:code id:57e11442 tags:
-``` python
-# build class regression with specified input
-self = regression(y_ref_HO, y_ref_LU, std=True, linBay=False)
-```
-%% Cell type:markdown id:aa4eca68 tags:
-## The second step (QMP to $E$)
-Reference MLR (rMLR)\
-Final settings: std=True, linBay=False
-%% Cell type:code id:6c3926ca tags:
-``` python
-# rMLR model trained on K=27 molecules:
-X_rMLR = self.cdft_features(y_ref_HO, y_ref_LU)
-model_rMLR, y_pred_rMLR, R2_rMLR, RMSE_rMLR = self.regression(X_rMLR, E)
-rMLR_coefs = model_rMLR.coef_.copy()
-```
-%% Cell type:markdown id:e2337912 tags:
-## The first step (structure to QMP)
-Path A and path B:
-%% Cell type:markdown id:b23ae7a2 tags:
-### Path A
-%% Cell type:code id:6f93524f tags:
-``` python
-# calculation of E for non-reference values
-# prepare QC values (for eq 6, main text)
-X_cdft_QC = self.cdft_features(y_HO, y_LU)
-E_ML_QC   = model_rMLR.predict(X_cdft_QC)
-```
-%% Cell type:code id:4b30ce63 tags:
-``` python
-# path A: set up MLR model
-model_A,  y_pred_A,  R2_A,  RMSE_A = self.regression(X,  E_ML_QC)
-# path A: predictions for training (L=3543) and test (K=27) data
-pred_A      = model_A.predict(X)
-pred_ref_A  = model_A.predict(X_ref)
-```
-%% Cell type:markdown id:8ac8ace1 tags:
-### Path B
-%% Cell type:code id:20056317 tags:
-``` python
-# path B: set up MLR models
-model_LU, y_pred_LU, R2_LU, RMSE_LU = self.regression(X, y_LU)
-model_HO, y_pred_HO, R2_HO, RMSE_HO = self.regression(X, y_HO)
-```
-%% Cell type:code id:aa0e1a12 tags:
-``` python
-# path B: predictions of LUMO and HOMO energies for K=27 molecules
-pred_ref_LU = model_LU.predict(X_ref)
-pred_ref_HO = model_HO.predict(X_ref)
-```
-%% Cell type:code id:b1e857e2 tags:
-``` python
-# path B: predictions of E with rMLR
-X_cdft_ML_B = self.cdft_features(pred_ref_HO, pred_ref_LU)
-E_ML_B      = model_rMLR.predict(X_cdft_ML_B)
-```