From b105fbf4f36dc7510627a2cc5aa22512e0ccce05 Mon Sep 17 00:00:00 2001 From: Maike Vahl <m.vahl@tu-braunschweig.de> Date: Tue, 3 Oct 2023 08:44:02 +0000 Subject: [PATCH] Delete QSRR_MLR_notebook.ipynb --- QSRR_MLR_notebook.ipynb | 348 ---------------------------------------- 1 file changed, 348 deletions(-) delete mode 100644 QSRR_MLR_notebook.ipynb diff --git a/QSRR_MLR_notebook.ipynb b/QSRR_MLR_notebook.ipynb deleted file mode 100644 index 3c3df2f..0000000 --- a/QSRR_MLR_notebook.ipynb +++ /dev/null @@ -1,348 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "5d630471", - "metadata": {}, - "source": [ - "# Notebook to perform multivariate linear regression (MLR)\n", - "\n", - "A regression class is provided. By initializing, two settings need to be specified:\n", - "1) Standardization True/False: Whether the data is standardized before the regression task\\\n", - "2) Choice for linear model: linBay=True -> BayesianRidge module, linBay=False -> LinearRegression module" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e4d6a6e5", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import pickle as pkl\n", - "\n", - "from sklearn.linear_model import LinearRegression, BayesianRidge\n", - "from sklearn.preprocessing import PolynomialFeatures, StandardScaler" - ] - }, - { - "cell_type": "markdown", - "id": "6f9e97a4", - "metadata": {}, - "source": [ - "### Data preparation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "42aaafae", - "metadata": {}, - "outputs": [], - "source": [ - "# read in pandas dataframe\n", - "df = pd.read_pickle('QC_and_descriptor_dataframe.pkl') # M=3570 molecules\n", - "df_ref = pd.read_pickle('QC_and_descriptor_dataframe_ref.pkl') # K=27 molecules\n", - "\n", - "df_noRef = df.drop(index=df_ref.index.values).copy() # M-K=L=3543 molecules" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8378b6c1", - "metadata": {}, - "outputs": [], - "source": [ - "# get descriptor values from dataframes, exemplary for C_FG\n", - "X_raw = np.stack(df_noRef.C_FG.values) # L=3543\n", - "X_ref_raw = np.stack(df_ref.C_FG.values) # K=27" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a50f3bbe", - "metadata": {}, - "outputs": [], - "source": [ - "# get HOMO and LUMO energies from dataframes\n", - "y_LU = df_noRef.ELUMO.values # L=3545\n", - "y_HO = df_noRef.EHOMO.values # L=3545\n", - "\n", - "y_ref_LU = df_ref.ELUMO.values # K=27\n", - "y_ref_HO = df_ref.EHOMO.values # K=27" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ab5e62b2", - "metadata": {}, - "outputs": [], - "source": [ - "# get experimental E parameters for K=27 reference molecules\n", - "E = df_ref.E2012.values.astype(float)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "addc2d78", - "metadata": {}, - "outputs": [], - "source": [ - "# data preparation\n", - "X = PolynomialFeatures(1).fit_transform(StandardScaler().fit_transform(X_raw))\n", - "X_ref = PolynomialFeatures(1).fit_transform(StandardScaler().fit_transform(X_ref_raw))" - ] - }, - { - "cell_type": "markdown", - "id": "f8515261", - "metadata": {}, - "source": [ - "### Regression class" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f0032879", - "metadata": {}, - "outputs": [], - "source": [ - "class regression:\n", - " \n", - " def __init__(self, HO_ref, LU_ref, std=True, linBay=False):\n", - " \n", - " self.LU_ref = LU_ref\n", - " self.HO_ref = HO_ref\n", - " \n", - " self.std = std\n", - " self.linBay = linBay\n", - " \n", - " # prepare for eq 6 (main text)\n", - " set_scaler = np.append(self.HO_ref.reshape(-1,1), self.LU_ref.reshape(-1,1), axis=1)\n", - " set_scaler = PolynomialFeatures(2).fit_transform(set_scaler)\n", - " set_scaler = np.append(set_scaler, (1 / (self.LU_ref - self.HO_ref)).reshape(-1,1), axis=1)\n", - " \n", - " self.CDFT_scaler = StandardScaler().fit(set_scaler[:,1:])\n", - "\n", - " #-----------------------------------------------------------------------------#\n", - " \n", - " def RMSE(self, f, y):\n", - " \n", - " '''Calculating the root-mean-square error (RMSE).'''\n", - " \n", - " return np.sqrt(np.mean((y.flatten() - f.flatten())**2))\n", - " \n", - " #-----------------------------------------------------------------------------#\n", - " \n", - " def r2(self, f, y):\n", - " \n", - " '''Calculating the squared correlation coefficient.'''\n", - " \n", - " return np.corrcoef(f.flatten(), y.flatten())[0,1]**2\n", - " \n", - " #-----------------------------------------------------------------------------#\n", - " \n", - " def R2(self, f, y):\n", - " \n", - " '''Calculating coefficient of determination R^2 values.'''\n", - " \n", - " return (1 - (self.RMSE(f, y)**2/(self.RMSE(np.mean(y), y)**2)))\n", - " \n", - " #-----------------------------------------------------------------------------# \n", - " \n", - " def cdft_features(self, HO, LU):\n", - " \n", - " '''Preparing frontier molecular orbital energies for eq 6 (main text).'''\n", - " \n", - " X_cdft_full = np.append(HO.reshape(-1,1), LU.reshape(-1,1), axis=1)\n", - " X_cdft_full = PolynomialFeatures(2).fit_transform(X_cdft_full)\n", - " X_cdft_full = np.append(X_cdft_full, (1 / (LU - HO)).reshape(-1,1), axis=1)\n", - " \n", - " if self.std == True:\n", - " print('std=True')\n", - " X_cdft_full = X_cdft_full[:,1:]\n", - " \n", - " X_cdft_full = self.CDFT_scaler.transform(X_cdft_full)\n", - " X_cdft_full = StandardScaler().fit_transform(X_cdft_full)\n", - " X_cdft_full = PolynomialFeatures(1).fit_transform(X_cdft_full)\n", - " \n", - " return X_cdft_full\n", - " \n", - " #-----------------------------------------------------------------------------#\n", - " \n", - " def regression(self, X, y):\n", - " \n", - " '''Performing the MLR task. Returning the model, the predictions, \n", - " and values for coefficient of determination and RMSE.'''\n", - " \n", - " if self.linBay == True:\n", - " model = BayesianRidge(fit_intercept=False, lambda_init=1e-3).fit(X, y) \n", - " print(model)\n", - " else:\n", - " model = LinearRegression(fit_intercept=False).fit(X, y) \n", - " print(model)\n", - " \n", - " y_pred = model.predict(X) \n", - " R2 = self.R2(y_pred, y) \n", - " RMSE = self.RMSE(y_pred, y) \n", - " \n", - " return model, y_pred, R2, RMSE\n", - " \n", - " #-----------------------------------------------------------------------------#" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "57e11442", - "metadata": {}, - "outputs": [], - "source": [ - "# build class regression with specified input\n", - "self = regression(y_ref_HO, y_ref_LU, std=True, linBay=False)" - ] - }, - { - "cell_type": "markdown", - "id": "aa4eca68", - "metadata": {}, - "source": [ - "## The second step (QMP to $E$) \n", - "\n", - "Reference MLR (rMLR)\\\n", - "Final settings: std=True, linBay=False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6c3926ca", - "metadata": {}, - "outputs": [], - "source": [ - "# rMLR model trained on K=27 molecules:\n", - "X_rMLR = self.cdft_features(y_ref_HO, y_ref_LU)\n", - "model_rMLR, y_pred_rMLR, R2_rMLR, RMSE_rMLR = self.regression(X_rMLR, E)\n", - "rMLR_coefs = model_rMLR.coef_.copy()" - ] - }, - { - "cell_type": "markdown", - "id": "e2337912", - "metadata": {}, - "source": [ - "## The first step (structure to QMP) \n", - "\n", - "Path A and path B:" - ] - }, - { - "cell_type": "markdown", - "id": "b23ae7a2", - "metadata": {}, - "source": [ - "### Path A" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6f93524f", - "metadata": {}, - "outputs": [], - "source": [ - "# calculation of E for non-reference values\n", - "# prepare QC values (for eq 6, main text)\n", - "X_cdft_QC = self.cdft_features(y_HO, y_LU)\n", - "E_ML_QC = model_rMLR.predict(X_cdft_QC)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4b30ce63", - "metadata": {}, - "outputs": [], - "source": [ - "# path A: set up MLR model\n", - "model_A, y_pred_A, R2_A, RMSE_A = self.regression(X, E_ML_QC)\n", - "\n", - "# path A: predictions for training (L=3543) and test (K=27) data\n", - "pred_A = model_A.predict(X)\n", - "pred_ref_A = model_A.predict(X_ref)" - ] - }, - { - "cell_type": "markdown", - "id": "8ac8ace1", - "metadata": {}, - "source": [ - "### Path B" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20056317", - "metadata": {}, - "outputs": [], - "source": [ - "# path B: set up MLR models\n", - "model_LU, y_pred_LU, R2_LU, RMSE_LU = self.regression(X, y_LU)\n", - "model_HO, y_pred_HO, R2_HO, RMSE_HO = self.regression(X, y_HO)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aa0e1a12", - "metadata": {}, - "outputs": [], - "source": [ - "# path B: predictions of LUMO and HOMO energies for K=27 molecules \n", - "pred_ref_LU = model_LU.predict(X_ref)\n", - "pred_ref_HO = model_HO.predict(X_ref)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b1e857e2", - "metadata": {}, - "outputs": [], - "source": [ - "# path B: predictions of E with rMLR\n", - "X_cdft_ML_B = self.cdft_features(pred_ref_HO, pred_ref_LU)\n", - "E_ML_B = model_rMLR.predict(X_cdft_ML_B)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} -- GitLab