From d12b3d9772fd1a52873f8d81af395f7cc7e25166 Mon Sep 17 00:00:00 2001 From: Maike Vahl <m.vahl@tu-braunschweig.de> Date: Tue, 3 Oct 2023 08:41:24 +0000 Subject: [PATCH] Upload New File --- QSRR_MLR_GPR_notebook.ipynb | 474 ++++++++++++++++++++++++++++++++++++ 1 file changed, 474 insertions(+) create mode 100644 QSRR_MLR_GPR_notebook.ipynb diff --git a/QSRR_MLR_GPR_notebook.ipynb b/QSRR_MLR_GPR_notebook.ipynb new file mode 100644 index 0000000..b71978b --- /dev/null +++ b/QSRR_MLR_GPR_notebook.ipynb @@ -0,0 +1,474 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5d630471", + "metadata": {}, + "source": [ + "# Notebook to perform multivariate linear regression (MLR) and Gaussian Process Regression (GPR)\n", + "\n", + "A regression class is provided. By initializing, two/three settings need to be specified:\n", + "1) Standardization True/False: Whether the data is standardized before the regression task\\\n", + "2) Choice for linear model: linBay=True -> BayesianRidge module, linBay=False -> LinearRegression module\n", + "\n", + "If GPR is used by setting gp=true, a specification whether the optimization of hyperparameters needs to be done (o_model=false) or if optimized hyperparameters are provided. Please comment the right keywords in the regression class/regression function out." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "e4d6a6e5", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import pickle as pkl\n", + "import GPy\n", + "\n", + "from sklearn.linear_model import LinearRegression, BayesianRidge\n", + "from sklearn.preprocessing import PolynomialFeatures, StandardScaler" + ] + }, + { + "cell_type": "markdown", + "id": "6f9e97a4", + "metadata": {}, + "source": [ + "### Data preparation" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "42aaafae", + "metadata": {}, + "outputs": [], + "source": [ + "# read in pandas dataframe for structures\n", + "df = pd.read_pickle('QC_and_descriptor_dataframe.pkl') # M=3570 molecules\n", + "df_ref = pd.read_pickle('QC_and_descriptor_dataframe_ref.pkl') # K=27 molecules\n", + "\n", + "df_noRef = df.drop(index=df_ref.index.values).copy() # M-K=L=3543 molecules" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "5416826e", + "metadata": {}, + "outputs": [], + "source": [ + "# read in pandas dataframe for models\n", + "model_df = pd.read_pickle('QSRR_MLR_GPR_model_coefficients_new.pkl')\n", + "\n", + "# get optimized hyperparameters of GPR model Y for descriptor X, exemplary for C_FG\n", + "model_GPR_X_LU = model_df.C_FG.values[-2]\n", + "model_GPR_X_HO = model_df.C_FG.values[-3]\n", + "\n", + "model_A_GPR_X = model_df.C_FG.values[-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "8378b6c1", + "metadata": {}, + "outputs": [], + "source": [ + "# get descriptor values from dataframes, exemplary for C_FG\n", + "X_raw = np.stack(df_noRef.C_FG.values) # L=3543\n", + "X_ref_raw = np.stack(df_ref.C_FG.values) # K=27" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "a50f3bbe", + "metadata": {}, + "outputs": [], + "source": [ + "# get HOMO and LUMO energies from dataframes\n", + "y_LU = df_noRef.ELUMO.values # L=3545\n", + "y_HO = df_noRef.EHOMO.values # L=3545\n", + "\n", + "y_ref_LU = df_ref.ELUMO.values # K=27\n", + "y_ref_HO = df_ref.EHOMO.values # K=27" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "ab5e62b2", + "metadata": {}, + "outputs": [], + "source": [ + "# get experimental E parameters for K=27 reference molecules\n", + "E = df_ref.E2012.values.astype(float)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "addc2d78", + "metadata": {}, + "outputs": [], + "source": [ + "# data preparation\n", + "X = PolynomialFeatures(1).fit_transform(X_raw)\n", + "X_ref = PolynomialFeatures(1).fit_transform(X_ref_raw)" + ] + }, + { + "cell_type": "markdown", + "id": "f8515261", + "metadata": {}, + "source": [ + "### Regression class" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "f0032879", + "metadata": {}, + "outputs": [], + "source": [ + "class regression:\n", + " \n", + " def __init__(self, HO_ref, LU_ref, std=True, linBay=False):\n", + " \n", + " self.LU_ref = LU_ref\n", + " self.HO_ref = HO_ref\n", + " \n", + " self.std = std\n", + " self.linBay = linBay\n", + " \n", + " # prepare for eq 6 (main text)\n", + " set_scaler = np.append(self.HO_ref.reshape(-1,1), self.LU_ref.reshape(-1,1), axis=1)\n", + " set_scaler = PolynomialFeatures(2).fit_transform(set_scaler)\n", + " set_scaler = np.append(set_scaler, (1 / (self.LU_ref - self.HO_ref)).reshape(-1,1), axis=1)\n", + " \n", + " self.CDFT_scaler = StandardScaler().fit(set_scaler[:,1:])\n", + "\n", + " #-----------------------------------------------------------------------------#\n", + " \n", + " def RMSE(self, f, y):\n", + " \n", + " '''Calculating the root-mean-square error (RMSE).'''\n", + " \n", + " return np.sqrt(np.mean((y.flatten() - f.flatten())**2))\n", + " \n", + " #-----------------------------------------------------------------------------#\n", + " \n", + " def r2(self, f, y):\n", + " \n", + " '''Calculating the squared correlation coefficient.'''\n", + " \n", + " return np.corrcoef(f.flatten(), y.flatten())[0,1]**2\n", + " \n", + " #-----------------------------------------------------------------------------#\n", + " \n", + " def R2(self, f, y):\n", + " \n", + " '''Calculating coefficient of determination R^2 values.'''\n", + " \n", + " return (1 - (self.RMSE(f, y)**2/(self.RMSE(np.mean(y), y)**2)))\n", + " \n", + " #-----------------------------------------------------------------------------# \n", + " \n", + " def cdft_features(self, HO, LU):\n", + " \n", + " '''Preparing frontier molecular orbital energies for eq 6 (main text).'''\n", + " \n", + " X_cdft_full = np.append(HO.reshape(-1,1), LU.reshape(-1,1), axis=1)\n", + " X_cdft_full = PolynomialFeatures(2).fit_transform(X_cdft_full)\n", + " X_cdft_full = np.append(X_cdft_full, (1 / (LU - HO)).reshape(-1,1), axis=1)\n", + " \n", + " if self.std == True:\n", + " print('std=True')\n", + " X_cdft_full = X_cdft_full[:,1:]\n", + " \n", + " X_cdft_full = self.CDFT_scaler.transform(X_cdft_full)\n", + " X_cdft_full = PolynomialFeatures(1).fit_transform(X_cdft_full)\n", + " \n", + " return X_cdft_full\n", + " \n", + " #-----------------------------------------------------------------------------#\n", + " \n", + " def regression(self, X, y, o_model=False, gp=False):\n", + " \n", + " '''Performing the MLR/GPR task. Returning the model, the predictions, \n", + " and values for coefficient of determination and RMSE.\n", + " o_model is a list and contains the optimized hyperparameters.'''\n", + " \n", + " if gp:\n", + " \n", + " kernel = GPy.kern.Matern32(input_dim=X.shape[1])\n", + " model = GPy.models.GPRegression(X, y.reshape(-1,1), kernel, normalizer=True)\n", + " \n", + " #model optimization with o_model=False\n", + "# model.optimize_restarts(num_restarts=5, parallel=True)\n", + "\n", + " #optimized hyperparameter are provided\n", + " model.kern.variance = o_model[0]\n", + " model.kern.lengthscale = o_model[1] \n", + " model.Gaussian_noise.variance = o_model[2]\n", + " \n", + " elif self.linBay == True:\n", + " model = BayesianRidge(fit_intercept=False, lambda_init=1e-3).fit(X, y) \n", + " else:\n", + " model = LinearRegression(fit_intercept=False).fit(X, y) \n", + " \n", + " if gp: \n", + " y_pred = model.predict(X)[0].flatten()\n", + " else:\n", + " y_pred = model.predict(X) \n", + " \n", + " R2 = self.R2(y_pred, y) \n", + " RMSE = self.RMSE(y_pred, y) \n", + " \n", + " return model, y_pred, R2, RMSE\n", + " \n", + " #-----------------------------------------------------------------------------#" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "57e11442", + "metadata": {}, + "outputs": [], + "source": [ + "# build class regression with specified input\n", + "self = regression(y_ref_HO, y_ref_LU, std=True, linBay=False)" + ] + }, + { + "cell_type": "markdown", + "id": "aa4eca68", + "metadata": {}, + "source": [ + "## The second step (QMP to $E$) \n", + "\n", + "Reference MLR (rMLR)\\\n", + "Final settings: std=True, linBay=False" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "6c3926ca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "std=True\n" + ] + } + ], + "source": [ + "# rMLR model trained on K=27 molecules:\n", + "X_rMLR = self.cdft_features(y_ref_HO, y_ref_LU)\n", + "model_rMLR, y_pred_rMLR, R2_rMLR, RMSE_rMLR = self.regression(X_rMLR, E)\n", + "rMLR_coefs = model_rMLR.coef_.copy()" + ] + }, + { + "cell_type": "markdown", + "id": "e2337912", + "metadata": {}, + "source": [ + "## The first step (structure to QMP) \n", + "\n", + "Path A and path B:" + ] + }, + { + "cell_type": "markdown", + "id": "b23ae7a2", + "metadata": {}, + "source": [ + "### Path A" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "6f93524f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "std=True\n" + ] + } + ], + "source": [ + "# calculation of E for non-reference values\n", + "# prepare QC values (for eq 6, main text)\n", + "X_cdft_QC = self.cdft_features(y_HO, y_LU)\n", + "E_ML_QC = model_rMLR.predict(X_cdft_QC)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "4b30ce63", + "metadata": {}, + "outputs": [], + "source": [ + "# path A: set up MLR model\n", + "model_A, y_pred_A, R2_A, RMSE_A = self.regression(X, E_ML_QC)\n", + "\n", + "# path A (MLR): predictions for training (L=3543) and test (K=27) data\n", + "pred_A = model_A.predict(X)\n", + "pred_ref_A = model_A.predict(X_ref)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "bcab064a", + "metadata": {}, + "outputs": [], + "source": [ + "# path A: set up GPR model\n", + "model_A_G, y_pred_A_G, R2_A_G, RMSE_A_G = self.regression(X, E_ML_QC, o_model=model_A_GPR_X, gp=True)\n", + "\n", + "# path A (GPR): predictions for training (L=3543) and test (K=27) data\n", + "pred_A_G = model_A_G.predict(X)[0]\n", + "pred_ref_A_G = model_A_G.predict(X_ref)[0]" + ] + }, + { + "cell_type": "markdown", + "id": "8ac8ace1", + "metadata": {}, + "source": [ + "### Path B" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "20056317", + "metadata": {}, + "outputs": [], + "source": [ + "# path B: set up MLR models\n", + "model_LU, y_pred_LU, R2_LU, RMSE_LU = self.regression(X, y_LU)\n", + "model_HO, y_pred_HO, R2_HO, RMSE_HO = self.regression(X, y_HO)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "7b072dad", + "metadata": {}, + "outputs": [], + "source": [ + "# path B: set up GPR models\n", + "model_LU_G, y_pred_LU_G, R2_LU_G, RMSE_LU_G = self.regression(X, y_LU, o_model=model_GPR_X_LU, gp=True)\n", + "model_HO_G, y_pred_HO_G, R2_HO_G, RMSE_HO_G = self.regression(X, y_HO, o_model=model_GPR_X_LU, gp=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "aa0e1a12", + "metadata": {}, + "outputs": [], + "source": [ + "# path B: MLR predictions of LUMO and HOMO energies for K=27 molecules \n", + "pred_ref_LU = model_LU.predict(X_ref)\n", + "pred_ref_HO = model_HO.predict(X_ref)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "a8603208", + "metadata": {}, + "outputs": [], + "source": [ + "# path B: GPR predictions of LUMO and HOMO energies for K=27 molecules \n", + "pred_ref_LU_G = model_LU_G.predict(X_ref)[0]\n", + "pred_ref_HO_G = model_HO_G.predict(X_ref)[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "b1e857e2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "std=True\n" + ] + } + ], + "source": [ + "# path B (MLR): predictions of E with rMLR\n", + "X_cdft_ML_B = self.cdft_features(pred_ref_HO, pred_ref_LU)\n", + "E_ML_B = model_rMLR.predict(X_cdft_ML_B)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "ea2b7934", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "std=True\n" + ] + } + ], + "source": [ + "# path B (GPR): predictions of E with rMLR\n", + "X_cdft_ML_B_G = self.cdft_features(pred_ref_HO_G, pred_ref_LU_G)\n", + "E_ML_B_G = model_rMLR.predict(X_cdft_ML_B_G)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de5964ad", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- GitLab