From b105fbf4f36dc7510627a2cc5aa22512e0ccce05 Mon Sep 17 00:00:00 2001
From: Maike Vahl <m.vahl@tu-braunschweig.de>
Date: Tue, 3 Oct 2023 08:44:02 +0000
Subject: [PATCH] Delete QSRR_MLR_notebook.ipynb

---
 QSRR_MLR_notebook.ipynb | 348 ----------------------------------------
 1 file changed, 348 deletions(-)
 delete mode 100644 QSRR_MLR_notebook.ipynb

diff --git a/QSRR_MLR_notebook.ipynb b/QSRR_MLR_notebook.ipynb
deleted file mode 100644
index 3c3df2f..0000000
--- a/QSRR_MLR_notebook.ipynb
+++ /dev/null
@@ -1,348 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "5d630471",
-   "metadata": {},
-   "source": [
-    "# Notebook to perform multivariate linear regression (MLR)\n",
-    "\n",
-    "A regression class is provided. By initializing, two settings need to be specified:\n",
-    "1) Standardization True/False: Whether the data is standardized before the regression task\\\n",
-    "2) Choice for linear model: linBay=True -> BayesianRidge module, linBay=False -> LinearRegression module"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e4d6a6e5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas     as pd\n",
-    "import numpy      as np\n",
-    "import pickle     as pkl\n",
-    "\n",
-    "from sklearn.linear_model    import LinearRegression, BayesianRidge\n",
-    "from sklearn.preprocessing   import PolynomialFeatures, StandardScaler"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6f9e97a4",
-   "metadata": {},
-   "source": [
-    "### Data preparation"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "42aaafae",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# read in pandas dataframe\n",
-    "df       = pd.read_pickle('QC_and_descriptor_dataframe.pkl')     # M=3570 molecules\n",
-    "df_ref   = pd.read_pickle('QC_and_descriptor_dataframe_ref.pkl') # K=27 molecules\n",
-    "\n",
-    "df_noRef = df.drop(index=df_ref.index.values).copy()     # M-K=L=3543 molecules"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8378b6c1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# get descriptor values from dataframes, exemplary for C_FG\n",
-    "X_raw     = np.stack(df_noRef.C_FG.values) # L=3543\n",
-    "X_ref_raw = np.stack(df_ref.C_FG.values)   # K=27"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a50f3bbe",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# get HOMO and LUMO energies from dataframes\n",
-    "y_LU     = df_noRef.ELUMO.values # L=3545\n",
-    "y_HO     = df_noRef.EHOMO.values # L=3545\n",
-    "\n",
-    "y_ref_LU = df_ref.ELUMO.values   # K=27\n",
-    "y_ref_HO = df_ref.EHOMO.values   # K=27"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ab5e62b2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# get experimental E parameters for K=27 reference molecules\n",
-    "E = df_ref.E2012.values.astype(float)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "addc2d78",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# data preparation\n",
-    "X     = PolynomialFeatures(1).fit_transform(StandardScaler().fit_transform(X_raw))\n",
-    "X_ref = PolynomialFeatures(1).fit_transform(StandardScaler().fit_transform(X_ref_raw))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f8515261",
-   "metadata": {},
-   "source": [
-    "### Regression class"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f0032879",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "class regression:\n",
-    "    \n",
-    "        def __init__(self, HO_ref, LU_ref, std=True, linBay=False):\n",
-    "                \n",
-    "            self.LU_ref = LU_ref\n",
-    "            self.HO_ref = HO_ref\n",
-    "            \n",
-    "            self.std    = std\n",
-    "            self.linBay = linBay\n",
-    "            \n",
-    "            # prepare for eq 6 (main text)\n",
-    "            set_scaler = np.append(self.HO_ref.reshape(-1,1), self.LU_ref.reshape(-1,1), axis=1)\n",
-    "            set_scaler = PolynomialFeatures(2).fit_transform(set_scaler)\n",
-    "            set_scaler = np.append(set_scaler, (1 / (self.LU_ref - self.HO_ref)).reshape(-1,1), axis=1)\n",
-    "            \n",
-    "            self.CDFT_scaler = StandardScaler().fit(set_scaler[:,1:])\n",
-    "\n",
-    "    #-----------------------------------------------------------------------------#\n",
-    "         \n",
-    "        def RMSE(self, f, y):\n",
-    "            \n",
-    "            '''Calculating the root-mean-square error (RMSE).'''\n",
-    "            \n",
-    "            return np.sqrt(np.mean((y.flatten() - f.flatten())**2))\n",
-    "        \n",
-    "    #-----------------------------------------------------------------------------#\n",
-    "        \n",
-    "        def r2(self, f, y):\n",
-    "            \n",
-    "            '''Calculating the squared correlation coefficient.'''\n",
-    "            \n",
-    "            return np.corrcoef(f.flatten(), y.flatten())[0,1]**2\n",
-    "            \n",
-    "    #-----------------------------------------------------------------------------#\n",
-    "        \n",
-    "        def R2(self, f, y):\n",
-    "            \n",
-    "            '''Calculating coefficient of determination R^2 values.'''\n",
-    "            \n",
-    "            return (1 - (self.RMSE(f, y)**2/(self.RMSE(np.mean(y), y)**2)))\n",
-    "        \n",
-    "    #-----------------------------------------------------------------------------#    \n",
-    "    \n",
-    "        def cdft_features(self, HO, LU):\n",
-    "            \n",
-    "            '''Preparing frontier molecular orbital energies for eq 6 (main text).'''\n",
-    "        \n",
-    "            X_cdft_full = np.append(HO.reshape(-1,1), LU.reshape(-1,1), axis=1)\n",
-    "            X_cdft_full = PolynomialFeatures(2).fit_transform(X_cdft_full)\n",
-    "            X_cdft_full = np.append(X_cdft_full, (1 / (LU - HO)).reshape(-1,1), axis=1)\n",
-    "            \n",
-    "            if self.std == True:\n",
-    "                print('std=True')\n",
-    "                X_cdft_full = X_cdft_full[:,1:]\n",
-    "            \n",
-    "                X_cdft_full = self.CDFT_scaler.transform(X_cdft_full)\n",
-    "                X_cdft_full = StandardScaler().fit_transform(X_cdft_full)\n",
-    "                X_cdft_full = PolynomialFeatures(1).fit_transform(X_cdft_full)\n",
-    "            \n",
-    "            return X_cdft_full\n",
-    "                \n",
-    "    #-----------------------------------------------------------------------------#\n",
-    "    \n",
-    "        def regression(self, X, y):\n",
-    "            \n",
-    "            '''Performing the MLR task. Returning the model, the predictions, \n",
-    "            and values for coefficient of determination and RMSE.'''\n",
-    "            \n",
-    "            if self.linBay == True:\n",
-    "                model = BayesianRidge(fit_intercept=False, lambda_init=1e-3).fit(X, y) \n",
-    "                print(model)\n",
-    "            else:\n",
-    "                model = LinearRegression(fit_intercept=False).fit(X, y) \n",
-    "                print(model)\n",
-    "            \n",
-    "            y_pred = model.predict(X)            \n",
-    "            R2     = self.R2(y_pred, y)     \n",
-    "            RMSE   = self.RMSE(y_pred, y)         \n",
-    "            \n",
-    "            return model, y_pred, R2, RMSE\n",
-    "                 \n",
-    "    #-----------------------------------------------------------------------------#"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "57e11442",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# build class regression with specified input\n",
-    "self = regression(y_ref_HO, y_ref_LU, std=True, linBay=False)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "aa4eca68",
-   "metadata": {},
-   "source": [
-    "## The second step (QMP to $E$) \n",
-    "\n",
-    "Reference MLR (rMLR)\\\n",
-    "Final settings: std=True, linBay=False"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6c3926ca",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# rMLR model trained on K=27 molecules:\n",
-    "X_rMLR = self.cdft_features(y_ref_HO, y_ref_LU)\n",
-    "model_rMLR, y_pred_rMLR, R2_rMLR, RMSE_rMLR = self.regression(X_rMLR, E)\n",
-    "rMLR_coefs = model_rMLR.coef_.copy()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "e2337912",
-   "metadata": {},
-   "source": [
-    "## The first step (structure to QMP) \n",
-    "\n",
-    "Path A and path B:"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "b23ae7a2",
-   "metadata": {},
-   "source": [
-    "### Path A"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6f93524f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# calculation of E for non-reference values\n",
-    "# prepare QC values (for eq 6, main text)\n",
-    "X_cdft_QC = self.cdft_features(y_HO, y_LU)\n",
-    "E_ML_QC   = model_rMLR.predict(X_cdft_QC)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4b30ce63",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# path A: set up MLR model\n",
-    "model_A,  y_pred_A,  R2_A,  RMSE_A = self.regression(X,  E_ML_QC)\n",
-    "\n",
-    "# path A: predictions for training (L=3543) and test (K=27) data\n",
-    "pred_A      = model_A.predict(X)\n",
-    "pred_ref_A  = model_A.predict(X_ref)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8ac8ace1",
-   "metadata": {},
-   "source": [
-    "### Path B"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "20056317",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# path B: set up MLR models\n",
-    "model_LU, y_pred_LU, R2_LU, RMSE_LU = self.regression(X, y_LU)\n",
-    "model_HO, y_pred_HO, R2_HO, RMSE_HO = self.regression(X, y_HO)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "aa0e1a12",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# path B: predictions of LUMO and HOMO energies for K=27 molecules \n",
-    "pred_ref_LU = model_LU.predict(X_ref)\n",
-    "pred_ref_HO = model_HO.predict(X_ref)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b1e857e2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# path B: predictions of E with rMLR\n",
-    "X_cdft_ML_B = self.cdft_features(pred_ref_HO, pred_ref_LU)\n",
-    "E_ML_B      = model_rMLR.predict(X_cdft_ML_B)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.7"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
-- 
GitLab