Skip to content
Snippets Groups Projects
Commit b105fbf4 authored by Maike Vahl's avatar Maike Vahl
Browse files

Delete QSRR_MLR_notebook.ipynb

parent b6d23409
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id:5d630471 tags:
# Notebook to perform multivariate linear regression (MLR)
A regression class is provided. By initializing, two settings need to be specified:
1) Standardization True/False: Whether the data is standardized before the regression task\
2) Choice for linear model: linBay=True -> BayesianRidge module, linBay=False -> LinearRegression module
%% Cell type:code id:e4d6a6e5 tags:
``` python
import pandas as pd
import numpy as np
import pickle as pkl
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
```
%% Cell type:markdown id:6f9e97a4 tags:
### Data preparation
%% Cell type:code id:42aaafae tags:
``` python
# read in pandas dataframe
df = pd.read_pickle('QC_and_descriptor_dataframe.pkl') # M=3570 molecules
df_ref = pd.read_pickle('QC_and_descriptor_dataframe_ref.pkl') # K=27 molecules
df_noRef = df.drop(index=df_ref.index.values).copy() # M-K=L=3543 molecules
```
%% Cell type:code id:8378b6c1 tags:
``` python
# get descriptor values from dataframes, exemplary for C_FG
X_raw = np.stack(df_noRef.C_FG.values) # L=3543
X_ref_raw = np.stack(df_ref.C_FG.values) # K=27
```
%% Cell type:code id:a50f3bbe tags:
``` python
# get HOMO and LUMO energies from dataframes
y_LU = df_noRef.ELUMO.values # L=3545
y_HO = df_noRef.EHOMO.values # L=3545
y_ref_LU = df_ref.ELUMO.values # K=27
y_ref_HO = df_ref.EHOMO.values # K=27
```
%% Cell type:code id:ab5e62b2 tags:
``` python
# get experimental E parameters for K=27 reference molecules
E = df_ref.E2012.values.astype(float)
```
%% Cell type:code id:addc2d78 tags:
``` python
# data preparation
X = PolynomialFeatures(1).fit_transform(StandardScaler().fit_transform(X_raw))
X_ref = PolynomialFeatures(1).fit_transform(StandardScaler().fit_transform(X_ref_raw))
```
%% Cell type:markdown id:f8515261 tags:
### Regression class
%% Cell type:code id:f0032879 tags:
``` python
class regression:
def __init__(self, HO_ref, LU_ref, std=True, linBay=False):
self.LU_ref = LU_ref
self.HO_ref = HO_ref
self.std = std
self.linBay = linBay
# prepare for eq 6 (main text)
set_scaler = np.append(self.HO_ref.reshape(-1,1), self.LU_ref.reshape(-1,1), axis=1)
set_scaler = PolynomialFeatures(2).fit_transform(set_scaler)
set_scaler = np.append(set_scaler, (1 / (self.LU_ref - self.HO_ref)).reshape(-1,1), axis=1)
self.CDFT_scaler = StandardScaler().fit(set_scaler[:,1:])
#-----------------------------------------------------------------------------#
def RMSE(self, f, y):
'''Calculating the root-mean-square error (RMSE).'''
return np.sqrt(np.mean((y.flatten() - f.flatten())**2))
#-----------------------------------------------------------------------------#
def r2(self, f, y):
'''Calculating the squared correlation coefficient.'''
return np.corrcoef(f.flatten(), y.flatten())[0,1]**2
#-----------------------------------------------------------------------------#
def R2(self, f, y):
'''Calculating coefficient of determination R^2 values.'''
return (1 - (self.RMSE(f, y)**2/(self.RMSE(np.mean(y), y)**2)))
#-----------------------------------------------------------------------------#
def cdft_features(self, HO, LU):
'''Preparing frontier molecular orbital energies for eq 6 (main text).'''
X_cdft_full = np.append(HO.reshape(-1,1), LU.reshape(-1,1), axis=1)
X_cdft_full = PolynomialFeatures(2).fit_transform(X_cdft_full)
X_cdft_full = np.append(X_cdft_full, (1 / (LU - HO)).reshape(-1,1), axis=1)
if self.std == True:
print('std=True')
X_cdft_full = X_cdft_full[:,1:]
X_cdft_full = self.CDFT_scaler.transform(X_cdft_full)
X_cdft_full = StandardScaler().fit_transform(X_cdft_full)
X_cdft_full = PolynomialFeatures(1).fit_transform(X_cdft_full)
return X_cdft_full
#-----------------------------------------------------------------------------#
def regression(self, X, y):
'''Performing the MLR task. Returning the model, the predictions,
and values for coefficient of determination and RMSE.'''
if self.linBay == True:
model = BayesianRidge(fit_intercept=False, lambda_init=1e-3).fit(X, y)
print(model)
else:
model = LinearRegression(fit_intercept=False).fit(X, y)
print(model)
y_pred = model.predict(X)
R2 = self.R2(y_pred, y)
RMSE = self.RMSE(y_pred, y)
return model, y_pred, R2, RMSE
#-----------------------------------------------------------------------------#
```
%% Cell type:code id:57e11442 tags:
``` python
# build class regression with specified input
self = regression(y_ref_HO, y_ref_LU, std=True, linBay=False)
```
%% Cell type:markdown id:aa4eca68 tags:
## The second step (QMP to $E$)
Reference MLR (rMLR)\
Final settings: std=True, linBay=False
%% Cell type:code id:6c3926ca tags:
``` python
# rMLR model trained on K=27 molecules:
X_rMLR = self.cdft_features(y_ref_HO, y_ref_LU)
model_rMLR, y_pred_rMLR, R2_rMLR, RMSE_rMLR = self.regression(X_rMLR, E)
rMLR_coefs = model_rMLR.coef_.copy()
```
%% Cell type:markdown id:e2337912 tags:
## The first step (structure to QMP)
Path A and path B:
%% Cell type:markdown id:b23ae7a2 tags:
### Path A
%% Cell type:code id:6f93524f tags:
``` python
# calculation of E for non-reference values
# prepare QC values (for eq 6, main text)
X_cdft_QC = self.cdft_features(y_HO, y_LU)
E_ML_QC = model_rMLR.predict(X_cdft_QC)
```
%% Cell type:code id:4b30ce63 tags:
``` python
# path A: set up MLR model
model_A, y_pred_A, R2_A, RMSE_A = self.regression(X, E_ML_QC)
# path A: predictions for training (L=3543) and test (K=27) data
pred_A = model_A.predict(X)
pred_ref_A = model_A.predict(X_ref)
```
%% Cell type:markdown id:8ac8ace1 tags:
### Path B
%% Cell type:code id:20056317 tags:
``` python
# path B: set up MLR models
model_LU, y_pred_LU, R2_LU, RMSE_LU = self.regression(X, y_LU)
model_HO, y_pred_HO, R2_HO, RMSE_HO = self.regression(X, y_HO)
```
%% Cell type:code id:aa0e1a12 tags:
``` python
# path B: predictions of LUMO and HOMO energies for K=27 molecules
pred_ref_LU = model_LU.predict(X_ref)
pred_ref_HO = model_HO.predict(X_ref)
```
%% Cell type:code id:b1e857e2 tags:
``` python
# path B: predictions of E with rMLR
X_cdft_ML_B = self.cdft_features(pred_ref_HO, pred_ref_LU)
E_ML_B = model_rMLR.predict(X_cdft_ML_B)
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment