From a5b49b51c6e51a3a859e98dcaab99ad70738adb0 Mon Sep 17 00:00:00 2001 From: Jonny Proppe <j.proppe@tu-braunschweig.de> Date: Mon, 26 Jun 2023 12:47:47 +0000 Subject: [PATCH] Upload New File --- ...tural_descriptor_generation_from_xyz.ipynb | 997 ++++++++++++++++++ 1 file changed, 997 insertions(+) create mode 100644 QSRR_structural_descriptor_generation_from_xyz.ipynb diff --git a/QSRR_structural_descriptor_generation_from_xyz.ipynb b/QSRR_structural_descriptor_generation_from_xyz.ipynb new file mode 100644 index 0000000..bea7904 --- /dev/null +++ b/QSRR_structural_descriptor_generation_from_xyz.ipynb @@ -0,0 +1,997 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Structural descriptor generation from xyz files\n", + "\n", + "This notebook contains the technical requirements and python code for the generation of the three structural descriptors applied in the main text (same order):\n", + "1) Counting descriptor C$_\\mathrm{FG}$\\\n", + "2) The original $F_\\mathrm{2B}^1$ descriptor\\\n", + "3) $F_{\\mathrm{2B}}^{\\mathrm{split}}$ descriptor\n", + "\n", + "Data is read in and preprocessed for the subsequent descriptor calculations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from ase.io import read\n", + "from numpy import linalg as LA\n", + "from itertools import chain" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def read_input(input_name):\n", + " \n", + " '''Read in input file and return line by line.'''\n", + " \n", + " with open(input_name) as f:\n", + " data = [line.strip().split() for line in f.readlines() if line.strip()]\n", + " \n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# specification of input files\n", + "input_dir = 'xyz_files' # directory with structure data\n", + "input_files = 'names.txt' # list of structures names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# read in data\n", + "data = read_input(input_files)\n", + "names = []\n", + "var_n = []\n", + "\n", + "for i in data:\n", + " names.append(input_dir+'/'+i[0]+'.xyz')\n", + " var_n.append(i[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# read (ase) module \n", + "mols = []\n", + "\n", + "for i in range(len(names)):\n", + " var_n[i] = read(names[i])\n", + " mols.append(var_n[i])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "-----------------------------------" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Counting descriptor C$_\\mathrm{FG}$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def ntp_para(s): \n", + " \n", + " '''Dictionary for para positions and index.'''\n", + " \n", + " keys = { \n", + " 'Cl' : 0,\n", + " 'F' : 1,\n", + " 'Me' : 2,\n", + " 'OMe' : 3,\n", + " 'OPh' : 4,\n", + " 'NMe2' : 5,\n", + " 'NPh2' : 6,\n", + " 'NMePh' : 7,\n", + " 'NMeCF3': 8,\n", + " 'NPhCF3': 9,\n", + " 'CF3' : 10,\n", + " 'pyr' : 11,\n", + " 'mor' : 12,\n", + " }\n", + " return keys[s]\n", + "\n", + "#-----------------------------------------------------------------------------#\n", + "\n", + "def ntp_meta(s): \n", + " \n", + " '''Dictionary for para positions and index.'''\n", + " \n", + " keys = { \n", + " 'Cl' : 0,\n", + " 'F' : 1,\n", + " }\n", + " return keys[s]\n", + "\n", + "#-----------------------------------------------------------------------------#\n", + "\n", + "def get_s(fname):\n", + " \n", + " '''Count number of meta and para substituents in a molecule.'''\n", + " \n", + " name = fname.split('.')[0] # discard extension (if applicable)\n", + " ring1, ring2 = name.split('__') # separate rings\n", + " meta = []\n", + " para = []\n", + " \n", + " for ring in (ring1, ring2):\n", + " r = ring.split('_')\n", + " ct = 0\n", + " for s in r: \n", + " if ct % 2 == 0: # even: meta; odd: para\n", + " meta.append(s)\n", + " else:\n", + " para.append(s)\n", + " ct += 1\n", + " \n", + " return meta, para\n", + "\n", + "#-----------------------------------------------------------------------------#\n", + "\n", + "def get_counting(fname):\n", + " \n", + " '''This function calculates the counting descriptor based on the definition given \n", + " in the main document.'''\n", + " \n", + " nbmeta = 2 # number of blocks for meta positions\n", + " nbpara = 13 # number of blocks for para positions\n", + " nbmetameta = int(nbmeta * (nbmeta + 1) / 2) \n", + " nbparameta = nbpara * nbmeta\n", + " vec = np.zeros(nbmeta + nbpara + nbmetameta + nbparameta, dtype=int)\n", + " meta, para = get_s(fname)\n", + " \n", + " for m in range(4): # meta\n", + " if meta[m] == 'H': continue\n", + " vec[ntp_meta(meta[m])] += 1\n", + " if m % 2 ==0:\n", + " if meta[m+1] == 'H': continue\n", + " vec[nbmeta + nbpara+ntp_meta(meta[m])+ntp_meta(meta[m+1])]=+1 \n", + " \n", + " for p in range(2): # para\n", + " if para[p] == 'H': continue\n", + " vec[nbmeta + ntp_para(para[p])] += 1\n", + " for m in range(p*2,2+p*2): \n", + " if meta[m] == 'H': continue\n", + " vec[nbmeta + nbpara + nbmetameta+ntp_meta(meta[m])*nbpara+ntp_para(para[p])]+=1 \n", + " \n", + " return vec" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#example\n", + "get_counting('Cl_mor_F__Cl_NMeCF3_Cl')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# calculates counting descriptor for complete data set\n", + "counting_FG =[]\n", + "\n", + "for name in names:\n", + " counting_FG.append(get_counting(name.split('/')[1].split('.')[0])) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "counting_FG" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "-----------------------------------" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# $F_\\mathrm{2B}^1$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class F2B1:\n", + " \n", + " def __init__(self, mols):\n", + " \n", + " self.mols = mols\n", + " \n", + " charges = []\n", + " \n", + " for mol in self.mols:\n", + " charges += mol.get_atomic_numbers().tolist()\n", + " \n", + " self.A = np.unique(charges)\n", + " \n", + " #-----------------------------------------------------------------------------#\n", + "\n", + " def gen_f2b(self):\n", + " \n", + " self.f2b = []\n", + " \n", + " S2B = []\n", + " \n", + " for i in self.A:\n", + " for j in self.A:\n", + " if i <= j:\n", + " S2B.append([i,j])\n", + " \n", + " S2B = np.asarray(S2B)\n", + " \n", + " combinations = S2B.shape[0]\n", + " \n", + " for mol in self.mols:\n", + " \n", + " F2B = np.zeros(1 * combinations)\n", + " n_atoms = mol.get_global_number_of_atoms()\n", + " \n", + " for i in range(n_atoms):\n", + " for j in range(i+1, n_atoms):\n", + " R_ij = np.linalg.norm(mol.get_positions()[i] - mol.get_positions()[j])\n", + " Z_i = mol.get_atomic_numbers()[i]\n", + " Z_j = mol.get_atomic_numbers()[j]\n", + " if Z_i <= Z_j:\n", + " ij = np.argwhere((S2B[:,0] == Z_i) & (S2B[:,1] == Z_j)).item()\n", + " else:\n", + " ij = np.argwhere((S2B[:,0] == Z_j) & (S2B[:,1] == Z_i)).item()\n", + " for m in range(1):\n", + " F2B[1 * ij + m] += R_ij**(-(m + 1))\n", + " \n", + " self.f2b.append(F2B)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# build F2B1 class\n", + "X_F2B1 = F2B1(mols)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# generate F2B1 descriptor \n", + "X_F2B1.gen_f2b() " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_F2B1.f2b" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "-----------------------------------" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# $F_{\\mathrm{2B}}^{\\mathrm{split}}$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# dictionary of substituents and their respective number of atoms \n", + "ligands = {\"H\": 1 ,\n", + " \"Cl\": 1 ,\n", + " \"F\": 1 ,\n", + " \"Me\": 4 ,\n", + " \"OMe\": 5 ,\n", + " \"OPh\": 12,\n", + " \"NMe2\": 9 ,\n", + " \"NPh2\": 23,\n", + " \"NMePh\": 16,\n", + " \"NMeCF3\": 12,\n", + " \"NPhCF3\": 19,\n", + " \"CF3\": 4 ,\n", + " \"pyr\": 13,\n", + " \"mor\": 14}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class F2B_split:\n", + " \n", + " def __init__(self, mols, paths, ligands):\n", + " \n", + " self.mols = mols\n", + " self.paths = paths\n", + " self.ligands = ligands\n", + " \n", + " self.get_struc_info()\n", + " self.get_carbons()\n", + " self.get_res()\n", + " \n", + " #Get unique elements in Q and R lists\n", + " self.Ccat_e_unique = ['C']\n", + " self.CPh_e_unique = ['C']\n", + " self.Q_e_unique = list(np.unique(self.Q_e_all))\n", + " self.R_e_unique = list(np.unique(self.R_e_all))\n", + " \n", + " #-----------------------------------------------------------------------------#\n", + " \n", + " def get_struc_info(self):\n", + " \n", + " self.charges = []\n", + " self.num_atoms = []\n", + " self.elements = []\n", + " self.coordinates = []\n", + " self.names = []\n", + " \n", + " \n", + " for i in range(len(self.mols)):\n", + " self.charges += self.mols[i].get_atomic_numbers().tolist()\n", + " self.num_atoms.append(self.mols[i].get_global_number_of_atoms())\n", + " self.elements.append(self.mols[i].get_chemical_symbols())\n", + " self.coordinates.append(self.mols[i].get_positions())\n", + " self.names.append(self.paths[i].split('/')[1].split('.xyz')[0])\n", + "\n", + " #-----------------------------------------------------------------------------#\n", + " \n", + " def get_carbons(self):\n", + " \n", + " '''This function reads out the different carbon atoms, the carbocation, phenyl ring carbons \n", + " (writes element list as well) from the output of read_xyz(). The xyz file follows the same scheme\n", + " for each considered structure. (Generated via the Structure_generator.py).'''\n", + " \n", + " self.Ccat = []\n", + " self.CPh = []\n", + " self.Ccat_e = []\n", + " self.CPh_e = []\n", + " \n", + " for i in range(len(self.names)):\n", + " CPh_temp = []\n", + " CPh_e_temp = []\n", + " self.Ccat.append(self.coordinates[i][0])\n", + " self.Ccat_e.append('C')\n", + " for entry in range(len(self.coordinates[i][1:18])):\n", + " if not self.elements[i][1:18][entry] == \"H\": #sort out hydrogen atoms\n", + " CPh_temp.append(list(self.coordinates[i][1:18][entry]))\n", + " CPh_e_temp.append('C')\n", + " self.CPh.append(CPh_temp)\n", + " self.CPh_e.append(CPh_e_temp)\n", + "\n", + " #-----------------------------------------------------------------------------#\n", + " \n", + " def get_res(self):\n", + " \n", + " '''This function reads out the different elements and xyz coordinates for the meta and para \n", + " substituents and sorts out hydrogen atoms. The name of the structure is used together with \n", + " the dictionary ligand for this purpose. The name of each structure can be read: \n", + " meta11_para1_meta12__meta21_para2_meta22.xyz (first three elements for first ring..). \n", + " Q means meta, R means para.'''\n", + " \n", + " self.Q = []\n", + " self.R = []\n", + " self.Q_e = []\n", + " self.R_e = []\n", + " \n", + " self.Q_all = []\n", + " self.R_all = []\n", + " self.Q_e_all = []\n", + " self.R_e_all = []\n", + " \n", + " for name in range(len(self.names)):\n", + " \n", + " ring1 = self.names[name].split('__')[0]\n", + " ring2 = self.names[name].split('__')[1]\n", + " \n", + " #get ligand information for each position\n", + " m11 = ring1.split('_')[0]\n", + " p1 = ring1.split('_')[1]\n", + " m12 = ring1.split('_')[2]\n", + " m21 = ring2.split('_')[0]\n", + " p2 = ring2.split('_')[1]\n", + " m22 = ring2.split('_')[2]\n", + " \n", + " i = 18 #position in xyz file\n", + " \n", + " #1) get element at specific position\n", + " #2) get coordinates for this position\n", + " #3) add number of ligand atoms to i to get to the next position\n", + "\n", + " Q11_e = self.elements[name][i:i+self.ligands[m11]]\n", + " Q11 = self.coordinates[name][i:i+self.ligands[m11]]\n", + " i += self.ligands[m11]\n", + " Q12_e = self.elements[name][i:i+self.ligands[m12]]\n", + " Q12 = self.coordinates[name][i:i+self.ligands[m12]]\n", + " i += self.ligands[m12]\n", + " Q21_e = self.elements[name][i:i+self.ligands[m21]]\n", + " Q21 = self.coordinates[name][i:i+self.ligands[m21]]\n", + " i += self.ligands[m21]\n", + " Q22_e = self.elements[name][i:i+self.ligands[m22]]\n", + " Q22 = self.coordinates[name][i:i+self.ligands[m22]]\n", + " i += self.ligands[m22]\n", + " R1_e = self.elements[name][i:i+self.ligands[p1]]\n", + " R1 = self.coordinates[name][i:i+self.ligands[p1]] \n", + " i += self.ligands[p1]\n", + " R2_e = self.elements[name][i:i+self.ligands[p2]]\n", + " R2 = self.coordinates[name][i:i+self.ligands[p2]]\n", + " \n", + " q = [Q11, Q12, Q21, Q22]\n", + " r = [R1, R2]\n", + " q_e = [Q11_e, Q12_e, Q21_e, Q22_e]\n", + " r_e = [R1_e, R2_e]\n", + " \n", + " q_ = []\n", + " r_ = []\n", + " q_e_ = []\n", + " r_e_ = []\n", + " \n", + " # sort out H atoms\n", + " for entry in range(len(q_e)):\n", + " if len(q_e[entry]) > 1: #if functional group is molecular\n", + " q__ = []\n", + " q_e__ = []\n", + " for atom in range(len(q_e[entry])):\n", + " if not q_e[entry][atom] == \"H\":\n", + " q__.append(q[entry][atom])\n", + " q_e__temp.append(q_e[entry][atom])\n", + " self.Q_all.append(q[entry][atom])\n", + " self.Q_e_all.append(q_e[entry][atom])\n", + " q_.append(q__)\n", + " q_e_.append(q_e__)\n", + " else: #if functional group is atomic\n", + " if not q_e[entry][0] == \"H\":\n", + " q_.append(q[entry])\n", + " q_e_.append(q_e[entry])\n", + " self.Q_all.append(q[entry][0])\n", + " self.Q_e_all.append(q_e[entry][0])\n", + " else:\n", + " q_.append([])\n", + " q_e_.append([])\n", + " self.Q.append(q_)\n", + " self.Q_e.append(q_e_)\n", + " \n", + " for entry in range(len(r_e)):\n", + " if len(r_e[entry]) > 1: #if functional group is molecular\n", + " r__ = []\n", + " r_e__ = []\n", + " for atom in range(len(r_e[entry])):\n", + " if not r_e[entry][atom] == \"H\":\n", + " r__.append(r[entry][atom])\n", + " r_e__.append(r_e[entry][atom])\n", + " self.R_all.append(r[entry][atom])\n", + " self.R_e_all.append(r_e[entry][atom])\n", + " r_.append(r__)\n", + " r_e_.append(r_e__)\n", + " else: #if functional group is atomic\n", + " if not r_e[entry][0] == \"H\":\n", + " r_.append(r[entry])\n", + " r_e_.append(r_e[entry])\n", + " self.R_all.append(r[entry][0])\n", + " self.R_e_all.append(r_e[entry][0])\n", + " else:\n", + " r_.append([])\n", + " r_e_.append([])\n", + " \n", + " self.R.append(r_)\n", + " self.R_e.append(r_e_)\n", + " \n", + " #-----------------------------------------------------------------------------#\n", + " \n", + " def get_2B_combis(self, X_e, Y_e):\n", + " \n", + " '''The different element combinations for the desired interaction group (for all but QR) can be \n", + " found with this function. X_e_unique and Y_e_unique contain the elements in the interaction group\n", + " (in a unique fashion, f.e. only one entry for C in a Ph ring at para).'''\n", + " \n", + " struc_X_dist = []\n", + " X_e_dist_inv = {}\n", + "\n", + " for mol in range(len(self.names)):\n", + " X_dist = []\n", + " #for the first loop:\n", + " if mol == 0:\n", + " for i in np.arange(len(X_e)):\n", + " for j in np.arange(len(Y_e)):\n", + " #no double counting\n", + " if j<i:\n", + " continue\n", + " else:\n", + " #append element combination to dictionary and empty list for later distance assignement\n", + " #only first loop, because the dictionary only needs to be written once\n", + " X_dist.append([])\n", + " X_e_dist_inv[X_e[i]+Y_e[j]]=len(X_dist)-1\n", + " X_e_dist_inv[Y_e[j]+X_e[i]]=len(X_dist)-1\n", + " else:\n", + " for i in np.arange(len(X_e)):\n", + " for j in np.arange(len(Y_e)):\n", + " if j<i:\n", + " continue\n", + " else:\n", + " X_dist.append([])\n", + " struc_X_dist.append(X_dist)\n", + " \n", + " return struc_X_dist, X_e_dist_inv\n", + "\n", + " #-----------------------------------------------------------------------------#\n", + " \n", + " def get_2B_combis_QR(self, X_e, Y_e):\n", + " \n", + " '''Specific case of the get_2B_combinations functions for the interaction group QR, \n", + " because the elements F and Cl are the only ones, that are placed in meta position. \n", + " Therefore, a differentiation between metaCl-paraF and metaF-paraCl interactions is \n", + " necessary. This is done in this function. Overall the aim and workflow is similar to \n", + " the original function.'''\n", + " \n", + " struc_X_dist = []\n", + " X_e_dist_inv = {}\n", + "\n", + " for mol in range(len(self.names)):\n", + " X_dist = []\n", + " if mol == 0:\n", + " for i in np.arange(len(X_e)): #meta positions\n", + " for j in np.arange(len(Y_e)): #para positions\n", + " if X_e[i]+Y_e[j] == 'ClF':\n", + " X_dist.append([])\n", + " X_e_dist_inv[str('QR')+X_e[i]+Y_e[j]]=len(X_dist)-1\n", + " if X_e[i]+Y_e[j] == 'FCl':\n", + " X_dist.append([])\n", + " X_e_dist_inv[str('RQ')+Y_e[j]+X_e[i]]=len(X_dist)-1\n", + " else:\n", + " if not X_e[i]+Y_e[j] == 'ClF' or X_e[i]+Y_e[j] == 'FCl':\n", + " X_dist.append([])\n", + " X_e_dist_inv[X_e[i]+Y_e[j]]=len(X_dist)-1\n", + " X_e_dist_inv[Y_e[j]+X_e[i]]=len(X_dist)-1 \n", + " \n", + " else:\n", + " for i in np.arange(len(X_e)):\n", + " for j in np.arange(len(Y_e)):\n", + " X_dist.append([])\n", + " struc_X_dist.append(X_dist)\n", + " \n", + " return struc_X_dist, X_e_dist_inv\n", + " \n", + " #-----------------------------------------------------------------------------#\n", + "\n", + " def calc_dist(self, v1, v2):\n", + " \n", + " '''Function to calculate the distance between two given vectors.'''\n", + " \n", + " return LA.norm(np.array(v1)-np.array(v2))\n", + "\n", + " #-----------------------------------------------------------------------------#\n", + "\n", + " def calc_R_inv(self, R_real_XY):\n", + "\n", + " '''Function to invert the given distances.'''\n", + " \n", + " R_inv_XY = []\n", + " \n", + " for mol in range(len(R_real_XY)):\n", + " if R_real_XY[mol] == []:\n", + " R_inv_XY.append([])\n", + " else:\n", + " R_inv_XY.append(np.array(R_real_XY[mol])**(-1)) \n", + " \n", + " return R_inv_XY\n", + "\n", + " #-----------------------------------------------------------------------------#\n", + "\n", + " def assign_dist(self, X_Y_e, R_inv_XY, dict_name, list_name):\n", + " \n", + " '''Function to assign the inverted distance to the right position in the descriptor. This is done by\n", + " using the dictionary written in the first step in the get_2B_combiantions function.'''\n", + " \n", + " for mol in range(len(X_Y_e)): # number of structures\n", + " for combi in range(len(X_Y_e[mol])): # number of distances\n", + " idx_dist_ls = dict_name[X_Y_e[mol][combi]]\n", + " list_name[mol][idx_dist_ls].append(R_inv_XY[mol][combi])\n", + " \n", + " return list_name\n", + " \n", + " #-----------------------------------------------------------------------------#\n", + " \n", + " def get_CcatCPh_dist(self, Ccat, Y, Ccat_e, Y_e, dict_name, list_name):\n", + " \n", + " '''For carbocation to phenyl (aromatic ring) carbons, this function calculates the inverse distances\n", + " for the different element combinations present in each structure. It returns the list, which includes\n", + " the descriptor entries in the right dimension.'''\n", + "\n", + " Rinv_Ccat_Y = []\n", + " Ccat_Y_e = []\n", + " \n", + " for struc in range(len(Y)): #number of structures\n", + " Rinv_Ccat_Y_temp = []\n", + " Ccat_Y_e_temp = []\n", + " for c in range(len(Y[struc])): #number of different element combination for specific structure\n", + " #here: c == C, only one element-element combination: C-C\n", + " Rinv_Ccat_Y_temp.append(self.calc_dist(Ccat[struc],Y[struc][c])**(-1))\n", + " Ccat_Y_e_temp.append(Ccat_e[struc]+Y_e[struc][c])\n", + " \n", + " Rinv_Ccat_Y.append(Rinv_Ccat_Y_temp)\n", + " Ccat_Y_e.append(Ccat_Y_e_temp)\n", + " \n", + " list_name = self.assign_dist(Ccat_Y_e, Rinv_Ccat_Y, dict_name, list_name)\n", + " \n", + " return list_name\n", + "\n", + " #-----------------------------------------------------------------------------#\n", + " \n", + " def get_CcatY_dist(self, Ccat, Y, Ccat_e, Y_e, dict_name, list_name):\n", + " \n", + " '''For carbocation to Y, this function calculates the inverse distances for the different element \n", + " combinations present in each structure. It returns the list, which includes the descriptor entries \n", + " in the right dimension.''' \n", + " \n", + " Rreal_Ccat_Y = []\n", + " Ccat_Y_e = []\n", + " \n", + " for struc in range(len(Y)):\n", + " Rreal_Ccat_Y_temp = []\n", + " Ccat_Y_e_temp = []\n", + " for y1 in range(len(Y[struc])): # go through Y1 ligands in each structure\n", + " for y2 in range(len(Y[struc][y1])): # go through Y2 atoms in each ligand in each structure\n", + " Rreal_Ccat_Y_temp.append(self.calc_dist(Ccat[struc],Y[struc][y1][y2]))\n", + " Ccat_Y_e_temp.append(Ccat_e[struc]+Y_e[struc][y1][y2])\n", + " \n", + " Rreal_Ccat_Y.append(Rreal_Ccat_Y_temp)\n", + " Ccat_Y_e.append(Ccat_Y_e_temp)\n", + " \n", + " Rinv_Ccat_Y = self.calc_R_inv(Rreal_Ccat_Y)\n", + " list_name = self.assign_dist(Ccat_Y_e, Rinv_Ccat_Y, dict_name, list_name)\n", + " \n", + " return list_name\n", + "\n", + " #-----------------------------------------------------------------------------#\n", + " \n", + " def get_CPhCPh_dist(self, X, Y, X_e, Y_e, dict_name, list_name):\n", + " \n", + " '''For phenyl (aromatic ring) carbons to phenyl (aromatic ring) carbons, this function calculates \n", + " the inverse distances for the different element combinations present in each structure. It returns \n", + " the list, which includes the descriptor entries in the right dimension.'''\n", + " \n", + " Rreal_X_Y = []\n", + " Rinv_X_Y = []\n", + " X_Y_e = []\n", + " \n", + " names_high_dist = []\n", + " \n", + " for struc in range(len(X)):\n", + " Rreal_X_Y_temp = []\n", + " X_Y_e_temp = []\n", + " for c1 in range(len(X[struc])):\n", + " for c2 in range(len(Y[struc])):\n", + " #Attention! Do not double account!\n", + " if c2 < c1:\n", + " continue\n", + " else:\n", + " #No self interaction, R==0.0\n", + " if self.calc_dist(X[struc][c1],Y[struc][c2]) != 0.0:\n", + " Rreal_X_Y_temp.append(self.calc_dist(X[struc][c1],Y[struc][c2]))\n", + " X_Y_e_temp.append(X_e[struc][c1]+Y_e[struc][c2])\n", + " \n", + " Rreal_X_Y.append(Rreal_X_Y_temp)\n", + " X_Y_e.append(X_Y_e_temp)\n", + " \n", + " Rinv_X_Y = np.array(Rreal_X_Y)**(-1)\n", + " list_name = self.assign_dist(X_Y_e, Rinv_X_Y, dict_name, list_name)\n", + " \n", + " return list_name\n", + "\n", + " #-----------------------------------------------------------------------------#\n", + "\n", + " def get_CPhY_dist(self, CPh, Y, CPh_e, Y_e, dict_name, list_name):\n", + " \n", + " '''For phenyl (aromatic ring) carbons to Y, this function calculates the inverse distances for the \n", + " different element combinations present in each structure. It returns the list, which includes the \n", + " descriptor entries in the right dimension.'''\n", + " \n", + " Rreal_CPh_Y = []\n", + " CPh_Y_e = []\n", + " \n", + " for struc in range(len(CPh)):\n", + " Rreal_CPh_Y_temp = []\n", + " CPh_Y_e_temp = []\n", + " for c1 in range(len(CPh[struc])): # carbon atom in aromatic rings in structure\n", + " for y1 in range(len(Y[struc])): # ligand in structure\n", + " for y2 in range(len(Y[struc][y1])): # atom in ligand in structure\n", + " Rreal_CPh_Y_temp.append(self.calc_dist(CPh[struc][c1],Y[struc][y1][y2]))\n", + " CPh_Y_e_temp.append(CPh_e[struc][c1]+Y_e[struc][y1][y2])\n", + " Rreal_CPh_Y.append(Rreal_CPh_Y_temp)\n", + " CPh_Y_e.append(CPh_Y_e_temp)\n", + " \n", + " Rinv_CPh_Y = self.calc_R_inv(Rreal_CPh_Y)\n", + " list_name = self.assign_dist(CPh_Y_e, Rinv_CPh_Y, dict_name, list_name) \n", + " \n", + " return list_name\n", + " \n", + " #-----------------------------------------------------------------------------#\n", + "\n", + " def get_XY_dist(self, X, Y, X_e, Y_e, dict_name, list_name):\n", + " \n", + " '''For X to Y, this function calculates the inverse distances for the different element combinations\n", + " present in each structure. It returns the list, which includes the descriptor entries in the right \n", + " dimension. Attention! As in the special case for get_2B_combinations, the meta/para F and Cl \n", + " differentiation needs to be considered again!''' \n", + " \n", + " Rreal_X_Y = []\n", + " X_Y_e = []\n", + " \n", + " for struc in range(len(X)):\n", + " Rreal_X_Y_temp = []\n", + " X_Y_e_temp = []\n", + " for x1 in range(len(X[struc])): # ligand x1 in structure\n", + " for x2 in range(len(X[struc][x1])): # atom x2 of ligand x1 in structure\n", + " for y1 in range(len(Y[struc])): # ligand y1 in structure\n", + " for y2 in range(len(Y[struc][y1])): # atom y2 of ligand y1 in structure\n", + " Rreal_X_Y_temp.append(self.calc_dist(X[struc][x1][x2],Y[struc][y1][y2]))\n", + " if X_e[struc][x1][x2]+Y_e[struc][y1][y2] == 'ClF':\n", + " X_Y_e_temp.append(str('QR')+X_e[struc][x1][x2]+Y_e[struc][y1][y2])\n", + " if X_e[struc][x1][x2]+Y_e[struc][y1][y2] == 'FCl':\n", + " X_Y_e_temp.append(str('RQ')+Y_e[struc][y1][y2]+X_e[struc][x1][x2])\n", + " else:\n", + " if not X_e[struc][x1][x2]+Y_e[struc][y1][y2] == 'ClF' or X_e[struc][x1][x2]+Y_e[struc][y1][y2] == 'FCl':\n", + " X_Y_e_temp.append(X_e[struc][x1][x2]+Y_e[struc][y1][y2])\n", + " \n", + " Rreal_X_Y.append(Rreal_X_Y_temp)\n", + " X_Y_e.append(X_Y_e_temp)\n", + " \n", + " Rinv_X_Y = self.calc_R_inv(Rreal_X_Y)\n", + " list_name = self.assign_dist(X_Y_e, Rinv_X_Y, dict_name, list_name)\n", + " \n", + " return list_name\n", + " \n", + " #-----------------------------------------------------------------------------#\n", + "\n", + " def get_XX_dist(self, X, Y, X_e, Y_e, dict_name, list_name):\n", + " \n", + " '''For X to X, this function calculates the inverse distances for the different element combinations\n", + " present in each structure. It returns the list, which includes the descriptor entries in the right \n", + " dimension.'''\n", + " \n", + " Rreal_X_Y = []\n", + " X_Y_e = []\n", + "\n", + " for struc in range(len(X)):\n", + " Rreal_X_Y_temp = []\n", + " X_Y_e_temp = []\n", + " for x1 in range(len(X[struc])):\n", + " for x2 in range(len(X[struc][x1])):\n", + " for y1 in range(len(Y[struc])):\n", + " for y2 in range(len(Y[struc][y1])):\n", + " if x1 < y1:\n", + " continue\n", + " else:\n", + " #Attention: no self interaction!\n", + " if self.calc_dist(X[struc][x1][x2],Y[struc][y1][y2]) != 0.0:\n", + " Rreal_X_Y_temp.append(self.calc_dist(X[struc][x1][x2],Y[struc][y1][y2]))\n", + " X_Y_e_temp.append(X_e[struc][x1][x2]+Y_e[struc][y1][y2])\n", + " Rreal_X_Y.append(Rreal_X_Y_temp)\n", + " X_Y_e.append(X_Y_e_temp)\n", + "\n", + " Rinv_X_Y = self.calc_R_inv(Rreal_X_Y) \n", + " list_name = self.assign_dist(X_Y_e, Rinv_X_Y, dict_name, list_name)\n", + "\n", + " return list_name\n", + " \n", + " #-----------------------------------------------------------------------------#\n", + "\n", + " def sum_Rinv(self, XY_dist):\n", + "\n", + " '''Calculating the sum of all inverse distances for each dimension in the descriptor for\n", + " each structure.'''\n", + " \n", + " sum_XY_all = []\n", + " \n", + " for struc in range(len(XY_dist)):\n", + " sum_XY = []\n", + " for i in range(len(XY_dist[struc])):\n", + " if len(XY_dist[struc]) == 1:\n", + " sum_XY.append(np.sum(XY_dist[struc]).astype(float))\n", + " else:\n", + " sum_XY.append(np.sum(XY_dist[struc][i]).astype(float)) \n", + " sum_XY_all.append(sum_XY)\n", + " \n", + " return sum_XY_all" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# build F2B_split class\n", + "self = F2B_split(mols, names, ligands)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# for each interaction group: get 2B combinations\n", + "CcatCPh_all, CcatCPh_e_dist = self.get_2B_combis( self.Ccat_e_unique, self.CPh_e_unique) #Ccat-CPh 2B combinations\n", + "CcatQ_all, CcatQ_e_dist = self.get_2B_combis( self.Ccat_e_unique, self.Q_e_unique) #Ccat-Q 2B combinations\n", + "CcatR_all, CcatR_e_dist = self.get_2B_combis( self.Ccat_e_unique, self.R_e_unique) #Ccat-R 2B combinations\n", + "CPhCPh_all, CPhCPh_e_dist = self.get_2B_combis( self.CPh_e_unique, self.CPh_e_unique) #CPh-CPh 2B combinations\n", + "CPhQ_all, CPhQ_e_dist = self.get_2B_combis( self.CPh_e_unique, self.Q_e_unique) #CPh-Q 2B combinations\n", + "CPhR_all, CPhR_e_dist = self.get_2B_combis( self.CPh_e_unique, self.R_e_unique) #CPh-R 2B combinations\n", + "QR_all, QR_e_dist = self.get_2B_combis_QR(self.Q_e_unique, self.R_e_unique) #Q-R 2B combinations\n", + "QQ_all, QQ_e_dist = self.get_2B_combis( self.Q_e_unique, self.Q_e_unique) #Q-Q 2B combinations\n", + "RR_all, RR_e_dist = self.get_2B_combis( self.R_e_unique, self.R_e_unique) #R-R 2B combinations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# calculate the inverse distance list (assigned to the right descriptor dimension)\n", + "CcatCPh_dist_w = self.get_CcatCPh_dist(self.Ccat, self.CPh, self.Ccat_e, self.CPh_e, CcatCPh_e_dist, CcatCPh_all)\n", + "CcatQ_dist_w = self.get_CcatY_dist( self.Ccat, self.Q, self.Ccat_e, self.Q_e, CcatQ_e_dist, CcatQ_all)\n", + "CcatR_dist_w = self.get_CcatY_dist( self.Ccat, self.R, self.Ccat_e, self.R_e, CcatR_e_dist, CcatR_all)\n", + "CPhCPh_dist_w = self.get_CPhCPh_dist( self.CPh, self.CPh, self.CPh_e, self.CPh_e, CPhCPh_e_dist, CPhCPh_all)\n", + "CPhQ_dist_w = self.get_CPhY_dist( self.CPh, self.Q, self.CPh_e, self.Q_e, CPhQ_e_dist, CPhQ_all)\n", + "CPhR_dist_w = self.get_CPhY_dist( self.CPh, self.R, self.CPh_e, self.R_e, CPhR_e_dist, CPhR_all)\n", + "QR_dist_w = self.get_XY_dist( self.Q, self.R, self.Q_e, self.R_e, QR_e_dist, QR_all)\n", + "QQ_dist_w = self.get_XX_dist( self.Q, self.Q, self.Q_e, self.Q_e, QQ_e_dist, QQ_all)\n", + "RR_dist_w = self.get_XX_dist( self.R, self.R, self.R_e, self.R_e, RR_e_dist, RR_all)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get sums for each structure in each descriptor dimension\n", + "sum_CcatCPh = self.sum_Rinv(CcatCPh_dist_w)\n", + "sum_CcatQ = self.sum_Rinv(CcatQ_dist_w)\n", + "sum_CcatR = self.sum_Rinv(CcatR_dist_w)\n", + "sum_CPhCPh = self.sum_Rinv(CPhCPh_dist_w)\n", + "sum_CPhQ = self.sum_Rinv(CPhQ_dist_w)\n", + "sum_CPhR = self.sum_Rinv(CPhR_dist_w)\n", + "sum_QQ = self.sum_Rinv(QQ_dist_w)\n", + "sum_RR = self.sum_Rinv(RR_dist_w)\n", + "sum_QR = self.sum_Rinv(QR_dist_w)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# build F2Bsplit descriptor\n", + "F2Bsplit = []\n", + "\n", + "for mol in range(len(self.names)):\n", + " F2Bsplit.append(np.array(sum_CcatCPh[mol]\n", + " +sum_CcatQ[mol]\n", + " +sum_CcatR[mol]\n", + " +sum_CPhCPh[mol]\n", + " +sum_CPhQ[mol]\n", + " +sum_CPhR[mol] \n", + " +sum_QQ[mol] \n", + " +sum_RR[mol] \n", + " +sum_QR[mol]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "F2Bsplit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} -- GitLab