IFP script added, others still required.
This commit is contained in:
commit
731466dc9a
799
create_IFP_datasets_v2_dense.py
Normal file
799
create_IFP_datasets_v2_dense.py
Normal file
@ -0,0 +1,799 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed May 26 20:27:23 2021
|
||||||
|
|
||||||
|
@author: Jonathan
|
||||||
|
|
||||||
|
Note: the "concatenation" of poses is not an actual concatenation! It's just a sum of the per-pose fingerprints (my bad).
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
pd.options.mode.chained_assignment = None # default='warn'
|
||||||
|
import oddt
|
||||||
|
import oddt.interactions
|
||||||
|
import oddt.fingerprints
|
||||||
|
import oddt.scoring.descriptors.binana
|
||||||
|
import oddt.docking.AutodockVina
|
||||||
|
import rdkit
|
||||||
|
from oddt.scoring.functions.RFScore import rfscore
|
||||||
|
from oddt.scoring.functions.NNScore import nnscore
|
||||||
|
from oddt.scoring.functions.PLECscore import PLECscore
|
||||||
|
import pickle
|
||||||
|
import re
|
||||||
|
from rdkit import Chem
|
||||||
|
import sys
|
||||||
|
import gc
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
def find_number(text, c):
    '''Return every run of digits in ``text`` that directly follows the marker ``c``.

    The marker is matched literally: it is regex-escaped first, so a marker
    such as ``'.smi:'`` only matches a real dot and not "any character".

    :param text: string to search
    :param c: literal marker that must immediately precede the digits
    :return: list of digit strings (not converted to int), in order of appearance
    '''
    return re.findall(re.escape(c) + r'(\d+)', text)
|
||||||
|
|
||||||
|
# Definitions created by Olivier Béquignon (Adding them to add more information to the dataset)
|
||||||
|
def get_pocket(klifs_id):
    '''Obtain the 85 residue binding pocket structure from the specified KLIFS structure ID

    The pocket is downloaded from the KLIFS web API as mol2 text and parsed
    with the ODDT toolkit.

    :param klifs_id: KLIFS structure ID
    :return: ODDT protein structure of the binding pocket (``protein`` flag set to True)
    '''
    url = f'https://klifs.net/api_v2/structure_get_pocket?structure_ID={klifs_id}'
    # The context manager closes the HTTP connection even if reading fails.
    with urllib.request.urlopen(url) as response:
        data = response.read().decode()
    pocket = oddt.toolkit.readstring('mol2', data)
    # Mark the molecule as a protein so ODDT treats it as the receptor side.
    pocket.protein = True
    return pocket
|
||||||
|
|
||||||
|
def get_ligand(klifs_id):
    '''Obtain the ligand structure from the specified KLIFS structure ID

    The ligand is downloaded from the KLIFS web API as mol2 text and parsed
    with the ODDT toolkit.

    :param klifs_id: KLIFS structure ID
    :return: ODDT ligand structure, or None when the API returns an empty body
    '''
    url = f'https://klifs.net/api_v2/structure_get_ligand?structure_ID={klifs_id}'
    # The context manager closes the HTTP connection even if reading fails.
    with urllib.request.urlopen(url) as response:
        data = response.read().decode()
    if not data:
        # Empty reply: nothing to parse, signal "no ligand" explicitly.
        return None
    return oddt.toolkit.readstring('mol2', data)
|
||||||
|
|
||||||
|
def get_pocket_IFP(klifs_id: int=None,
                   pocket=None,
                   ligand=None):
    '''Obtain the interaction fingerprint of the ligand and protein pocket residues.

    :param klifs_id: KLIFS structure ID (ignored if pocket and ligand provided)
    :param pocket: ODDT pocket. If None, the KLIFS pocket is used
    :param ligand: ODDT ligand. If None, the KLIFS ligand is used

    :return: Binary IFP, as returned by ``oddt.fingerprints.InteractionFingerprint``
    :raises ValueError: if pocket and/or ligand is missing and no KLIFS ID is
        given to fetch it, or if KLIFS has no ligand for the given ID
    '''
    # A KLIFS ID is needed as soon as *either* structure has to be downloaded,
    # not only when both are missing.
    if klifs_id is None and (pocket is None or ligand is None):
        raise ValueError('Must at least provide a KLIFS ID if any of pocket and/or ligand is/are missing')
    if ligand is None:
        ligand = get_ligand(klifs_id)
        if ligand is None:
            # get_ligand returns None on an empty API reply; fail clearly
            # here instead of passing None into ODDT.
            raise ValueError(f'No ligand available for KLIFS structure ID {klifs_id}')
    if pocket is None:
        pocket = get_pocket(klifs_id)
    return oddt.fingerprints.InteractionFingerprint(ligand, pocket)
|
||||||
|
|
||||||
|
def to_dense_fp(bits, size):
    '''Expand the text form of a sparse fingerprint into a dense list.

    :param bits: string such as ``'{3: 1, 17: 2}'`` (index: value pairs
        separated by ``', '`` and wrapped in one opening and one closing character)
    :param size: length of the dense fingerprint to return
    :return: list of ``size`` ints; positions not listed in ``bits`` are 0
    '''
    body = bits[1:-1]
    if not body.strip():
        # Empty fingerprint ('{}'): no pairs to parse, everything is zero.
        return [0] * size
    counts = dict(tuple(map(int, pair.split(': '))) for pair in body.split(', '))
    return [counts.get(i, 0) for i in range(size)]
|
||||||
|
|
||||||
|
def to_sparse_fp(X):
    '''Convert the text form of a dense fingerprint into a sparse dict.

    :param X: string such as ``'[0, 2, 0, 1]'`` (ints separated by ``', '``
        and wrapped in one opening and one closing character)
    :return: dict mapping position -> value for every value greater than 0
    '''
    body = X[1:-1]
    if not body.strip():
        # Empty fingerprint ('[]'): nothing to parse.
        return {}
    return {i: x for i, x in enumerate(map(int, body.split(', '))) if x > 0}
|
||||||
|
|
||||||
|
# Batching parameters; in the visible part of the file they are only
# referenced by a commented-out loop header.
n_in_range = 0
n_per_step = 1

# The script is invoked as: create_IFP_datasets_v2_dense.py <var_from> <var_to>
# where [var_from, var_to) is the row range of the structure CSV to process.
if len(sys.argv) < 3:
    sys.exit(f'usage: {sys.argv[0]} <var_from> <var_to>')
var_from = sys.argv[1]
var_to = sys.argv[2]
next_from = int(var_from) + 1  # not used in the visible part of the file

print(var_from, var_to)
# NOTE: the bounds are still strings here, so they are compared textually.
if var_from == var_to:
    print("var_from and var_to are the same, exiting!")
    # sys.exit() instead of the site-provided exit(), which is meant for
    # interactive use and is not guaranteed to exist.
    sys.exit()
|
||||||
|
|
||||||
|
# Restore progress from a previous run. The config file is written further
# down, right before the script re-executes itself, with one value per line:
# file_number, count, group_number, var_from, var_to.
try:
    with open(f'config_{var_from}_{var_to}.txt', 'r') as f:
        config_file = f.readlines()
    # Parse every value before assigning anything, so a malformed file cannot
    # leave the state half-updated. int() tolerates the trailing newline.
    resumed_state = (
        int(config_file[0]),
        int(config_file[2]),
        int(config_file[3]),
        int(config_file[4]),
    )
except (OSError, IndexError, ValueError):
    # Missing, unreadable, truncated or non-numeric config: start from scratch.
    print("No config file. Count is -1")
    file_number = 0
    count =-1
    group_number = 0
else:
    # After a successful load var_from / var_to are ints (they are strings
    # when taken straight from the command line).
    file_number, group_number, var_from, var_to = resumed_state
    # The stored count (second line) is deliberately not restored; presumably
    # because the result dataframes are recreated empty on every start.
    count = -1
    print("File number and count", file_number, count)
|
||||||
|
|
||||||
|
# Input tables, read in this order:
#   csv_dataframe          - one row per kinase structure (filename, group,
#                            kinase_ID, structure_ID, uniprot are read below)
#   csv_active_compounds   - active compounds, used for the pchembl lookup
#   csv_inactive_compounds - inactive compounds, used for the pchembl lookup
csv_dataframe, csv_active_compounds, csv_inactive_compounds = map(
    pd.read_csv,
    (
        '../KLIFS_kinase_structure_data_selection_subselection_np.csv',
        '../../my_rp1_compounds_kinases/uniprot_kinase_actives/uniprot_kinase_actives.csv',
        '../../my_rp1_compounds_kinases/uniprot_kinase_inactives/uniprot_kinase_inactives.csv',
    ),
)
|
||||||
|
|
||||||
|
def _empty_fingerprint_frame(fingerprint):
    '''Build an empty result frame for one fingerprint type.

    Columns are created one by one, in a fixed order, by assigning an empty
    string to each of them on a fresh ``pd.DataFrame()``.

    :param fingerprint: name of the fingerprint column; the summed-poses
        column is named ``Conc_<fingerprint>``
    :return: DataFrame without rows holding the 15 result columns
    '''
    frame = pd.DataFrame()
    for column in (
        "Type",
        fingerprint,
        f"Conc_{fingerprint}",
        "VinaScore",
        "RFv1",
        "RFv2",
        "RFv3",
        "nn_score",
        "plec_score",
        "SMILES",
        "pchembl_value_Mean",
        "pchembl_value_Median",
        "protein",
        "compound",
        "decoy_group",
    ):
        frame[column] = ''
    return frame


# One result table per fingerprint flavour; rows are appended in the main loop.
dataframe_SIFP = _empty_fingerprint_frame("KLIFS_pocket_IFP")
dataframe_PLEC = _empty_fingerprint_frame("PLEC")
dataframe_SPLIF_ECFP2 = _empty_fingerprint_frame("SPLIF_ECFP2")
dataframe_SPLIF_ECFP4 = _empty_fingerprint_frame("SPLIF_ECFP4")
dataframe_SPLIF_ECFP6 = _empty_fingerprint_frame("SPLIF_ECFP6")

# Collects notes about structures that could not be loaded normally.
ListFaultyStructures = []
|
||||||
|
|
||||||
|
# Iterate through my csv and select for each structure the folder with actives and the folder with inactives. Then calculate fingerprints.
|
||||||
|
# for i in range(len(csv_dataframe["filename"])):
|
||||||
|
# for i in range(0+n_in_range*n_per_step, n_per_step+n_in_range*n_per_step):
|
||||||
|
for i in range(int(var_from), int(var_to)):
|
||||||
|
filename = csv_dataframe["filename"][i]
|
||||||
|
group = csv_dataframe["group"][i]
|
||||||
|
kinase_ID = csv_dataframe["kinase_ID"][i]
|
||||||
|
structure_ID = csv_dataframe["structure_ID"][i]
|
||||||
|
uniprot = csv_dataframe["uniprot"][i]
|
||||||
|
klifs_id = csv_dataframe["structure_ID"][i]
|
||||||
|
print(filename, group, kinase_ID, structure_ID, uniprot)
|
||||||
|
os.system(f'mkdir ../IFP_datasets/SIFP_v2/{filename[:-5]}')
|
||||||
|
os.system(f'mkdir ../IFP_datasets/PLEC_v2/{filename[:-5]}')
|
||||||
|
os.system(f'mkdir ../IFP_datasets/SPLIF_ECFP2_v2/{filename[:-5]}')
|
||||||
|
os.system(f'mkdir ../IFP_datasets/SPLIF_ECFP4_v2/{filename[:-5]}')
|
||||||
|
os.system(f'mkdir ../IFP_datasets/SPLIF_ECFP6_v2/{filename[:-5]}')
|
||||||
|
|
||||||
|
try:
|
||||||
|
protein = ''
|
||||||
|
try:
|
||||||
|
os.system(f'obabel -ipdbqt ../../my_rp1_compounds_kinases/selected_mol2structures/{group}_pdbqt/{filename[:-5]}.pdbqt -opdb -O sparse_{filename[:-5]}.pdb')
|
||||||
|
protein = next(oddt.toolkit.readfile('pdb', f'sparse_{filename[:-5]}.pdb'))
|
||||||
|
proteinname = f'../../my_rp1_compounds_kinases/selected_mol2structures/{group}_pdbqt/{filename[:-5]}.pdbqt'
|
||||||
|
print("loaded in structure on 1st try")
|
||||||
|
except:
|
||||||
|
print("On except..")
|
||||||
|
ListFaultyStructures.append("Unsanitized! ",filename)
|
||||||
|
protein = next(oddt.toolkit.readfile('pdb', f'sparse_{filename[:-5]}.pdb', sanitize=False))
|
||||||
|
print("Unsanitized! ", filename)
|
||||||
|
print("loaded in protein succesfully!")
|
||||||
|
protein.protein = True
|
||||||
|
|
||||||
|
rf1 = rfscore.load(version=1)
|
||||||
|
with open('pickles/rf1.pickle', 'wb') as f:
|
||||||
|
pickle.dump(rf1, f)
|
||||||
|
f.close()
|
||||||
|
rf2 = rfscore.load(version=2)
|
||||||
|
with open('pickles/rf2.pickle', 'wb') as f:
|
||||||
|
pickle.dump(rf2, f)
|
||||||
|
f.close()
|
||||||
|
rf3 = rfscore.load(version=3)
|
||||||
|
with open('pickles/rf3.pickle', 'wb') as f:
|
||||||
|
pickle.dump(rf3, f)
|
||||||
|
f.close()
|
||||||
|
nn = nnscore.load()
|
||||||
|
with open('pickles/nn.pickle', 'wb') as f:
|
||||||
|
pickle.dump(nn, f)
|
||||||
|
f.close()
|
||||||
|
plecscore = PLECscore.load()
|
||||||
|
with open('pickles/plecscore.pickle', 'wb') as f:
|
||||||
|
pickle.dump(plecscore, f)
|
||||||
|
f.close()
|
||||||
|
|
||||||
|
rf1.set_protein(protein)
|
||||||
|
rf2.set_protein(protein)
|
||||||
|
rf3.set_protein(protein)
|
||||||
|
nn.set_protein(protein)
|
||||||
|
plecscore.set_protein(protein)
|
||||||
|
|
||||||
|
# KLIFS retrieving pocket for SIFP
|
||||||
|
pocket = get_pocket(klifs_id)
|
||||||
|
|
||||||
|
try:
|
||||||
|
ligand_location_actives = "../../docking/"+group+"/"+uniprot+"/"+filename[:-5]+"/actives/"
|
||||||
|
ligand_location_inactives = "../../docking/"+group+"/"+uniprot+"/"+filename[:-5]+"/inactives/"
|
||||||
|
decoy_location_1000 = "../../docking/"+group+"/"+uniprot+"/"+filename[:-5]+"/decoys/"
|
||||||
|
decoy_location_1625 = "../../docking/"+group+"/"+uniprot+"/"+filename[:-5]+"/decoys1625/"
|
||||||
|
run_number = 0
|
||||||
|
number_of_actives = len(os.listdir(ligand_location_actives))
|
||||||
|
number_of_inactives = len(os.listdir(ligand_location_inactives))
|
||||||
|
grouplist = ["AGC","Atypical","CAMK","CK1","CMGC","STE","TK","TKL"]
|
||||||
|
|
||||||
|
for j in sorted(os.listdir(ligand_location_actives))[file_number:]:
|
||||||
|
print("Actives ", j, number_of_actives)
|
||||||
|
print(file_number, number_of_actives)
|
||||||
|
if file_number >= number_of_actives:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
file_number += 1
|
||||||
|
run_number += 1
|
||||||
|
group_number = 0
|
||||||
|
ligandname = f'{ligand_location_actives}{j}'
|
||||||
|
|
||||||
|
# ligand = next(oddt.toolkit.readfile('pdbqt', ligand_location_actives+j))
|
||||||
|
os.system(f'obabel -ipdbqt {ligand_location_actives}{j} -osdf -O sparse_{filename[:-5]}.sdf')
|
||||||
|
ligand = next(oddt.toolkit.readfile('sdf',f'sparse_{filename[:-5]}.sdf'))
|
||||||
|
Type = "active"
|
||||||
|
|
||||||
|
# IFP = oddt.fingerprints.InteractionFingerprint(ligand, protein)
|
||||||
|
# SIFP = oddt.fingerprints.SimpleInteractionFingerprint(ligand,protein)
|
||||||
|
KLIFS_pocket_IFP = get_pocket_IFP(klifs_id=klifs_id, pocket=pocket, ligand=ligand)
|
||||||
|
Conc_KLIFS_pocket_IFP = KLIFS_pocket_IFP
|
||||||
|
KLIFS_pocket_IFP = ''.join(str(list(KLIFS_pocket_IFP)))
|
||||||
|
|
||||||
|
PLEC = oddt.fingerprints.PLEC(ligand, protein, depth_ligand=2, depth_protein=4, distance_cutoff=4.5, size=16384, count_bits=True, sparse=False, ignore_hoh=True)
|
||||||
|
Conc_PLEC = PLEC
|
||||||
|
PLEC = ''.join(str(list(PLEC)))
|
||||||
|
|
||||||
|
SPLIF_ECFP2 = oddt.fingerprints.SPLIF(ligand, protein, depth=1, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP2['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP2 = SPLIF_ECFP2
|
||||||
|
SPLIF_ECFP2 = ''.join(str(list(SPLIF_ECFP2)))
|
||||||
|
|
||||||
|
SPLIF_ECFP4 = oddt.fingerprints.SPLIF(ligand, protein, depth=2, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP4 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP4['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP4 = SPLIF_ECFP4
|
||||||
|
SPLIF_ECFP4 = ''.join(str(list(SPLIF_ECFP4)))
|
||||||
|
|
||||||
|
SPLIF_ECFP6 = oddt.fingerprints.SPLIF(ligand, protein, depth=3, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP6 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP6['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP6 = SPLIF_ECFP6
|
||||||
|
SPLIF_ECFP6 = ''.join(str(list(SPLIF_ECFP6)))
|
||||||
|
|
||||||
|
RFv1 = rf1.predict(ligand)
|
||||||
|
RFv2 = rf2.predict(ligand)
|
||||||
|
RFv3 = rf3.predict(ligand)
|
||||||
|
plec_score = plecscore.predict(ligand)
|
||||||
|
nn_score = nn.predict([ligand])
|
||||||
|
VinaScore = ligand.data
|
||||||
|
|
||||||
|
# This part is for concatenating all 5 poses
|
||||||
|
try:
|
||||||
|
# ligand = list(oddt.toolkit.readfile('pdbqt', ligand_location_actives+j))
|
||||||
|
ligand = list(oddt.toolkit.readfile('sdf', f'sparse_{filename[:-5]}.sdf'))
|
||||||
|
for l in [x for x in range(len(ligand)) if x != 0]:
|
||||||
|
KLIFS_pocket_IFP_v2 = get_pocket_IFP(klifs_id=klifs_id, pocket=pocket, ligand=ligand[l])
|
||||||
|
Conc_KLIFS_pocket_IFP += KLIFS_pocket_IFP_v2
|
||||||
|
|
||||||
|
PLEC_v2 = oddt.fingerprints.PLEC(ligand[l], protein, depth_ligand=2, depth_protein=4, distance_cutoff=4.5, size=16384, count_bits=True, sparse=False, ignore_hoh=True)
|
||||||
|
Conc_PLEC += PLEC_v2
|
||||||
|
|
||||||
|
SPLIF_ECFP2_v2 = oddt.fingerprints.SPLIF(ligand[l], protein, depth=1, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP2_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP2_v2['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP2 += SPLIF_ECFP2_v2
|
||||||
|
|
||||||
|
SPLIF_ECFP4_v2 = oddt.fingerprints.SPLIF(ligand[l], protein, depth=2, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP4_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP4_v2['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP4 += SPLIF_ECFP4_v2
|
||||||
|
|
||||||
|
SPLIF_ECFP6_v2 = oddt.fingerprints.SPLIF(ligand[l], protein, depth=3, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP6_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP6_v2['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP6 += SPLIF_ECFP6_v2
|
||||||
|
Conc_KLIFS_pocket_IFP = ''.join(str(list(Conc_KLIFS_pocket_IFP)))
|
||||||
|
Conc_PLEC = ''.join(str(list(Conc_PLEC)))
|
||||||
|
Conc_SPLIF_ECFP2 = ''.join(str(list(Conc_SPLIF_ECFP2)))
|
||||||
|
Conc_SPLIF_ECFP4 = ''.join(str(list(Conc_SPLIF_ECFP4)))
|
||||||
|
Conc_SPLIF_ECFP6 = ''.join(str(list(Conc_SPLIF_ECFP6)))
|
||||||
|
print("Concatenated poses succesfully! Number of poses: "+str(l))
|
||||||
|
except:
|
||||||
|
print("Concat error!")
|
||||||
|
pass
|
||||||
|
|
||||||
|
#Dense to sparse for storage
|
||||||
|
# PLEC = PLEC.apply(to_sparse_fp)
|
||||||
|
# Conc_PLEC = oddt.fingerprints.dense_to_sparse(Conc_PLEC)
|
||||||
|
# print(len(PLEC))
|
||||||
|
# SPLIF_ECFP2 = oddt.fingerprints.dense_to_sparse(SPLIF_ECFP2, size=4096)
|
||||||
|
# Conc_SPLIF_ECFP2 = oddt.fingerprints.dense_to_sparse(Conc_SPLIF_ECFP2, size=4096)
|
||||||
|
# SPLIF_ECFP4 = oddt.fingerprints.dense_to_sparse(SPLIF_ECFP4, size=4096)
|
||||||
|
# Conc_SPLIF_ECFP4 = oddt.fingerprints.dense_to_sparse(Conc_SPLIF_ECFP4, size=4096)
|
||||||
|
# SPLIF_ECFP6 = oddt.fingerprints.dense_to_sparse(SPLIF_ECFP6, size=4096)
|
||||||
|
# Conc_SPLIF_ECFP6 = oddt.fingerprints.dense_to_sparse(Conc_SPLIF_ECFP6, size=4096)
|
||||||
|
|
||||||
|
#Find and calculate SMILES
|
||||||
|
with open(ligand_location_actives+j,'r') as f:
|
||||||
|
pdbqt_file = f.read()
|
||||||
|
f.close()
|
||||||
|
number_from_file = find_number(pdbqt_file, '.smi:')
|
||||||
|
print(number_from_file)
|
||||||
|
target_smiles = f'../../my_rp1_compounds_kinases/uniprot_kinase_{Type}s/{group}/{uniprot}_{Type}s.smi'
|
||||||
|
with open(target_smiles, 'r') as f:
|
||||||
|
smiles_file = f.readlines()
|
||||||
|
f.close()
|
||||||
|
SMILES = str(smiles_file[int(number_from_file[0])-1]).replace("\n","")
|
||||||
|
print(SMILES)
|
||||||
|
|
||||||
|
for q in range(len(csv_active_compounds)):
|
||||||
|
if csv_active_compounds["standardised_smiles"][q] == SMILES and csv_active_compounds["accession"][q] == uniprot:
|
||||||
|
pchembl_mean = csv_active_compounds["pchembl_value_Mean"][q]
|
||||||
|
pchembl_median = csv_active_compounds["pchembl_value_Median"][q]
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
pchembl_mean = ''
|
||||||
|
pchembl_median = ''
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Conc_PLEC,Conc_SPLIF_ECFP2,Conc_SPLIF_ECFP4,Conc_SPLIF_ECFP6,PLEC_v2,SPLIF_ECFP2_v2,SPLIF_ECFP4_v2,SPLIF_ECFP6_v2 ='','','','','','','',''
|
||||||
|
|
||||||
|
count += 1
|
||||||
|
dataframe_SIFP,dataframe_SIFP["KLIFS_pocket_IFP"][count],dataframe_SIFP["Conc_KLIFS_pocket_IFP"][count],dataframe_SIFP['VinaScore'][count],dataframe_SIFP['Type'][count],dataframe_SIFP["RFv1"][count],dataframe_SIFP["RFv2"][count],dataframe_SIFP["RFv3"][count],dataframe_SIFP["nn_score"][count],dataframe_SIFP["plec_score"][count],dataframe_SIFP["SMILES"][count],dataframe_SIFP["pchembl_value_Mean"][count],dataframe_SIFP["pchembl_value_Median"][count],dataframe_SIFP["protein"][count],dataframe_SIFP["compound"][count] = dataframe_SIFP.append(csv_dataframe.iloc[i],ignore_index=True),KLIFS_pocket_IFP,Conc_KLIFS_pocket_IFP,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname
|
||||||
|
dataframe_PLEC,dataframe_PLEC["PLEC"][count],dataframe_PLEC["Conc_PLEC"][count],dataframe_PLEC['VinaScore'][count],dataframe_PLEC['Type'][count],dataframe_PLEC["RFv1"][count],dataframe_PLEC["RFv2"][count],dataframe_PLEC["RFv3"][count],dataframe_PLEC["nn_score"][count],dataframe_PLEC["plec_score"][count],dataframe_PLEC["SMILES"][count],dataframe_PLEC["pchembl_value_Mean"][count],dataframe_PLEC["pchembl_value_Median"][count],dataframe_PLEC["protein"][count],dataframe_PLEC["compound"][count] = dataframe_PLEC.append(csv_dataframe.iloc[i],ignore_index=True),PLEC,Conc_PLEC,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname
|
||||||
|
dataframe_SPLIF_ECFP2,dataframe_SPLIF_ECFP2["SPLIF_ECFP2"][count],dataframe_SPLIF_ECFP2["Conc_SPLIF_ECFP2"][count],dataframe_SPLIF_ECFP2['VinaScore'][count],dataframe_SPLIF_ECFP2['Type'][count],dataframe_SPLIF_ECFP2["RFv1"][count],dataframe_SPLIF_ECFP2["RFv2"][count],dataframe_SPLIF_ECFP2["RFv3"][count],dataframe_SPLIF_ECFP2["nn_score"][count],dataframe_SPLIF_ECFP2["plec_score"][count],dataframe_SPLIF_ECFP2["SMILES"][count],dataframe_SPLIF_ECFP2["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP2["pchembl_value_Median"][count],dataframe_SPLIF_ECFP2["protein"][count],dataframe_SPLIF_ECFP2["compound"][count] = dataframe_SPLIF_ECFP2.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP2,Conc_SPLIF_ECFP2,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname
|
||||||
|
dataframe_SPLIF_ECFP4,dataframe_SPLIF_ECFP4["SPLIF_ECFP4"][count],dataframe_SPLIF_ECFP4["Conc_SPLIF_ECFP4"][count],dataframe_SPLIF_ECFP4['VinaScore'][count],dataframe_SPLIF_ECFP4['Type'][count],dataframe_SPLIF_ECFP4["RFv1"][count],dataframe_SPLIF_ECFP4["RFv2"][count],dataframe_SPLIF_ECFP4["RFv3"][count],dataframe_SPLIF_ECFP4["nn_score"][count],dataframe_SPLIF_ECFP2["plec_score"][count],dataframe_SPLIF_ECFP4["SMILES"][count],dataframe_SPLIF_ECFP4["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP4["pchembl_value_Median"][count],dataframe_SPLIF_ECFP4["protein"][count],dataframe_SPLIF_ECFP4["compound"][count] = dataframe_SPLIF_ECFP4.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP4,Conc_SPLIF_ECFP4,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname
|
||||||
|
dataframe_SPLIF_ECFP6,dataframe_SPLIF_ECFP6["SPLIF_ECFP6"][count],dataframe_SPLIF_ECFP6["Conc_SPLIF_ECFP6"][count],dataframe_SPLIF_ECFP6['VinaScore'][count],dataframe_SPLIF_ECFP6['Type'][count],dataframe_SPLIF_ECFP6["RFv1"][count],dataframe_SPLIF_ECFP6["RFv2"][count],dataframe_SPLIF_ECFP6["RFv3"][count],dataframe_SPLIF_ECFP6["nn_score"][count],dataframe_SPLIF_ECFP2["plec_score"][count],dataframe_SPLIF_ECFP6["SMILES"][count],dataframe_SPLIF_ECFP6["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP6["pchembl_value_Median"][count],dataframe_SPLIF_ECFP6["protein"][count],dataframe_SPLIF_ECFP6["compound"][count] = dataframe_SPLIF_ECFP6.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP6,Conc_SPLIF_ECFP6,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname
|
||||||
|
print("Done appending to dataframe! Number appended", count, int(run_number))
|
||||||
|
|
||||||
|
if int(run_number) > 375:
|
||||||
|
print("Going dark to preserve memory..")
|
||||||
|
with open(f'config_{var_from}_{var_to}.txt', 'w') as f:
|
||||||
|
to_write = str(file_number)+"\n"+str(count)+"\n"+str(group_number)+"\n"+str(var_from)+"\n"+str(var_to)
|
||||||
|
print(to_write)
|
||||||
|
f.write(to_write)
|
||||||
|
f.close()
|
||||||
|
dataframe_SIFP.to_csv(f'../IFP_datasets/SIFP_v2/{filename[:-5]}/dataframe_SIFP_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_PLEC.to_csv(f'../IFP_datasets/PLEC_v2/{filename[:-5]}/dataframe_PLEC_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP2.to_csv(f'../IFP_datasets/SPLIF_ECFP2_v2/{filename[:-5]}/dataframe_SPLIF_ECFP2_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP4.to_csv(f'../IFP_datasets/SPLIF_ECFP4_v2/{filename[:-5]}/dataframe_SPLIF_ECFP4_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP6.to_csv(f'../IFP_datasets/SPLIF_ECFP6_v2/{filename[:-5]}/dataframe_SPLIF_ECFP6_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
os.execl(sys.executable, 'python', f'./create_IFP_datasets_v2_dense.py', str(var_from), str(var_to))
|
||||||
|
else:
|
||||||
|
pass
|
||||||
|
gc.collect()
|
||||||
|
|
||||||
|
# del(ligand,SMILES,RFv1,RFv2,RFv3,PLEC,SPLIF_ECFP2,SPLIF_ECFP4,SPLIF_ECFP6,Conc_PLEC,Conc_SPLIF_ECFP2,Conc_SPLIF_ECFP4,Conc_SPLIF_ECFP6,VinaScore,PLEC_v2,SPLIF_ECFP2_v2,SPLIF_ECFP4_v2,SPLIF_ECFP6_v2)
|
||||||
|
|
||||||
|
# Other descriptors I could use later on?
|
||||||
|
# protein_atoms, ligand_atoms, strict = oddt.interactions.hbonds(protein, ligand)
|
||||||
|
# print(protein_atoms['resname'])
|
||||||
|
# wut = oddt.interactions.close_contacts(protein_atoms, ligand_atoms, cutoff=4, x_column='coords', y_column='coords')
|
||||||
|
except:
|
||||||
|
print("Actives. Errors for some reason..")
|
||||||
|
print("Going to inactives.. ")
|
||||||
|
for k in sorted(os.listdir(ligand_location_inactives))[(file_number-number_of_actives):]:
|
||||||
|
print("Inactives ", k, number_of_inactives)
|
||||||
|
if int(file_number) >= (int(number_of_actives)+int(number_of_inactives)):
|
||||||
|
print("passing! len actives + actives is: ", str(number_of_actives), str(number_of_inactives))
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
file_number += 1
|
||||||
|
run_number += 1
|
||||||
|
group_number = 0
|
||||||
|
ligandname = f'{ligand_location_inactives}{k}'
|
||||||
|
|
||||||
|
ligand = ''
|
||||||
|
Type = "inactive"
|
||||||
|
# ligand = next(oddt.toolkit.readfile('pdbqt', ligand_location_inactives+k))
|
||||||
|
os.system(f'obabel -ipdbqt {ligand_location_inactives}{k} -osdf -O sparse_{filename[:-5]}.sdf')
|
||||||
|
ligand = next(oddt.toolkit.readfile('sdf',f'sparse_{filename[:-5]}.sdf'))
|
||||||
|
|
||||||
|
KLIFS_pocket_IFP = get_pocket_IFP(klifs_id=klifs_id, pocket=pocket, ligand=ligand)
|
||||||
|
Conc_KLIFS_pocket_IFP = KLIFS_pocket_IFP
|
||||||
|
KLIFS_pocket_IFP = ''.join(str(list(KLIFS_pocket_IFP)))
|
||||||
|
|
||||||
|
PLEC = oddt.fingerprints.PLEC(ligand, protein, depth_ligand=2, depth_protein=4, distance_cutoff=4.5, size=16384, count_bits=True, sparse=False, ignore_hoh=True)
|
||||||
|
Conc_PLEC = PLEC
|
||||||
|
PLEC = ''.join(str(list(PLEC)))
|
||||||
|
|
||||||
|
SPLIF_ECFP2 = oddt.fingerprints.SPLIF(ligand, protein, depth=1, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP2['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP2 = SPLIF_ECFP2
|
||||||
|
SPLIF_ECFP2 = ''.join(str(list(SPLIF_ECFP2)))
|
||||||
|
|
||||||
|
SPLIF_ECFP4 = oddt.fingerprints.SPLIF(ligand, protein, depth=2, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP4 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP4['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP4 = SPLIF_ECFP4
|
||||||
|
SPLIF_ECFP4 = ''.join(str(list(SPLIF_ECFP4)))
|
||||||
|
|
||||||
|
SPLIF_ECFP6 = oddt.fingerprints.SPLIF(ligand, protein, depth=3, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP6 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP6['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP6 = SPLIF_ECFP6
|
||||||
|
SPLIF_ECFP6 = ''.join(str(list(SPLIF_ECFP6)))
|
||||||
|
|
||||||
|
RFv1 = rf1.predict(ligand)
|
||||||
|
RFv2 = rf2.predict(ligand)
|
||||||
|
RFv3 = rf3.predict(ligand)
|
||||||
|
plec_score = plecscore.predict(ligand)
|
||||||
|
nn_score = nn.predict([ligand])
|
||||||
|
VinaScore = ligand.data
|
||||||
|
|
||||||
|
# This part is for concatenating all 5 poses
|
||||||
|
try:
|
||||||
|
# ligand = list(oddt.toolkit.readfile('pdbqt', ligand_location_inactives+k))
|
||||||
|
ligand = list(oddt.toolkit.readfile('sdf',f'sparse_{filename[:-5]}.sdf'))
|
||||||
|
for m in [x for x in range(len(ligand)) if x != 0]:
|
||||||
|
KLIFS_pocket_IFP_v2 = get_pocket_IFP(klifs_id=klifs_id, pocket=pocket, ligand=ligand[m])
|
||||||
|
Conc_KLIFS_pocket_IFP += KLIFS_pocket_IFP_v2
|
||||||
|
|
||||||
|
PLEC_v2 = oddt.fingerprints.PLEC(ligand[m], protein, depth_ligand=2, depth_protein=4, distance_cutoff=4.5, size=16384, count_bits=True, sparse=False, ignore_hoh=True)
|
||||||
|
Conc_PLEC += PLEC_v2
|
||||||
|
|
||||||
|
SPLIF_ECFP2_v2 = oddt.fingerprints.SPLIF(ligand[m], protein, depth=1, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP2_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP2_v2['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP2 += SPLIF_ECFP2_v2
|
||||||
|
|
||||||
|
SPLIF_ECFP4_v2 = oddt.fingerprints.SPLIF(ligand[m], protein, depth=2, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP4_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP4_v2['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP4 += SPLIF_ECFP4_v2
|
||||||
|
|
||||||
|
SPLIF_ECFP6_v2 = oddt.fingerprints.SPLIF(ligand[m], protein, depth=3, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP6_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP6_v2['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP6 += SPLIF_ECFP6_v2
|
||||||
|
Conc_KLIFS_pocket_IFP = ''.join(str(list(Conc_KLIFS_pocket_IFP)))
|
||||||
|
Conc_PLEC = ''.join(str(list(Conc_PLEC)))
|
||||||
|
Conc_SPLIF_ECFP2 = ''.join(str(list(Conc_SPLIF_ECFP2)))
|
||||||
|
Conc_SPLIF_ECFP4 = ''.join(str(list(Conc_SPLIF_ECFP4)))
|
||||||
|
Conc_SPLIF_ECFP6 = ''.join(str(list(Conc_SPLIF_ECFP6)))
|
||||||
|
print("Concatenated poses succesfully! Number of poses: "+str(m))
|
||||||
|
except:
|
||||||
|
print("Inactives. This file might not have 5 poses?")
|
||||||
|
|
||||||
|
#Find and calculate SMILES
|
||||||
|
with open(ligand_location_inactives+k,'r') as f:
|
||||||
|
pdbqt_file = f.read()
|
||||||
|
f.close()
|
||||||
|
number_from_file = find_number(pdbqt_file, '.smi:')
|
||||||
|
print(number_from_file)
|
||||||
|
target_smiles = f'../../my_rp1_compounds_kinases/uniprot_kinase_{Type}s/{group}/{uniprot}_{Type}s.smi'
|
||||||
|
with open(target_smiles, 'r') as f:
|
||||||
|
smiles_file = f.readlines()
|
||||||
|
f.close()
|
||||||
|
SMILES = str(smiles_file[int(number_from_file[0])-1]).replace("\n","")
|
||||||
|
print(SMILES)
|
||||||
|
|
||||||
|
for r in range(len(csv_inactive_compounds)):
|
||||||
|
if csv_inactive_compounds["standardised_smiles"][r] == SMILES and csv_inactive_compounds["accession"][r] == uniprot:
|
||||||
|
pchembl_mean = csv_inactive_compounds["pchembl_value_Mean"][r]
|
||||||
|
pchembl_median = csv_inactive_compounds["pchembl_value_Median"][r]
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
pchembl_mean = ''
|
||||||
|
pchembl_median = ''
|
||||||
|
pass
|
||||||
|
|
||||||
|
count += 1
|
||||||
|
dataframe_SIFP,dataframe_SIFP["KLIFS_pocket_IFP"][count],dataframe_SIFP["Conc_KLIFS_pocket_IFP"][count],dataframe_SIFP['VinaScore'][count],dataframe_SIFP['Type'][count],dataframe_SIFP["RFv1"][count],dataframe_SIFP["RFv2"][count],dataframe_SIFP["RFv3"][count],dataframe_SIFP["nn_score"][count],dataframe_SIFP["plec_score"][count],dataframe_SIFP["SMILES"][count],dataframe_SIFP["pchembl_value_Mean"][count],dataframe_SIFP["pchembl_value_Median"][count],dataframe_SIFP["protein"][count],dataframe_SIFP["compound"][count] = dataframe_SIFP.append(csv_dataframe.iloc[i],ignore_index=True),KLIFS_pocket_IFP,Conc_KLIFS_pocket_IFP,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname
|
||||||
|
dataframe_PLEC,dataframe_PLEC["PLEC"][count],dataframe_PLEC["Conc_PLEC"][count],dataframe_PLEC['VinaScore'][count],dataframe_PLEC['Type'][count],dataframe_PLEC["RFv1"][count],dataframe_PLEC["RFv2"][count],dataframe_PLEC["RFv3"][count],dataframe_PLEC["nn_score"][count],dataframe_PLEC["plec_score"][count],dataframe_PLEC["SMILES"][count],dataframe_PLEC["pchembl_value_Mean"][count],dataframe_PLEC["pchembl_value_Median"][count],dataframe_PLEC["protein"][count],dataframe_PLEC["compound"][count] = dataframe_PLEC.append(csv_dataframe.iloc[i],ignore_index=True),PLEC,Conc_PLEC,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname
|
||||||
|
dataframe_SPLIF_ECFP2,dataframe_SPLIF_ECFP2["SPLIF_ECFP2"][count],dataframe_SPLIF_ECFP2["Conc_SPLIF_ECFP2"][count],dataframe_SPLIF_ECFP2['VinaScore'][count],dataframe_SPLIF_ECFP2['Type'][count],dataframe_SPLIF_ECFP2["RFv1"][count],dataframe_SPLIF_ECFP2["RFv2"][count],dataframe_SPLIF_ECFP2["RFv3"][count],dataframe_SPLIF_ECFP2["nn_score"][count],dataframe_SPLIF_ECFP2["plec_score"][count],dataframe_SPLIF_ECFP2["SMILES"][count],dataframe_SPLIF_ECFP2["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP2["pchembl_value_Median"][count],dataframe_SPLIF_ECFP2["protein"][count],dataframe_SPLIF_ECFP2["compound"][count] = dataframe_SPLIF_ECFP2.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP2,Conc_SPLIF_ECFP2,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname
|
||||||
|
# Append the template row, then fill in this compound's values. Targets are
# assigned left to right, so the first target rebinds the frame to the appended
# copy before the cell targets index into it. Every cell target must therefore
# be dataframe_SPLIF_ECFP4 (plec_score previously went to the ECFP2 frame).
# NOTE(review): assumes this frame has a "plec_score" column like the others - confirm where it is created.
dataframe_SPLIF_ECFP4,dataframe_SPLIF_ECFP4["SPLIF_ECFP4"][count],dataframe_SPLIF_ECFP4["Conc_SPLIF_ECFP4"][count],dataframe_SPLIF_ECFP4['VinaScore'][count],dataframe_SPLIF_ECFP4['Type'][count],dataframe_SPLIF_ECFP4["RFv1"][count],dataframe_SPLIF_ECFP4["RFv2"][count],dataframe_SPLIF_ECFP4["RFv3"][count],dataframe_SPLIF_ECFP4["nn_score"][count],dataframe_SPLIF_ECFP4["plec_score"][count],dataframe_SPLIF_ECFP4["SMILES"][count],dataframe_SPLIF_ECFP4["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP4["pchembl_value_Median"][count],dataframe_SPLIF_ECFP4["protein"][count],dataframe_SPLIF_ECFP4["compound"][count] = dataframe_SPLIF_ECFP4.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP4,Conc_SPLIF_ECFP4,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname
|
||||||
|
# Append the template row, then fill in this compound's values. Targets are
# assigned left to right, so the first target rebinds the frame to the appended
# copy before the cell targets index into it. Every cell target must therefore
# be dataframe_SPLIF_ECFP6 (plec_score previously went to the ECFP2 frame).
# NOTE(review): assumes this frame has a "plec_score" column like the others - confirm where it is created.
dataframe_SPLIF_ECFP6,dataframe_SPLIF_ECFP6["SPLIF_ECFP6"][count],dataframe_SPLIF_ECFP6["Conc_SPLIF_ECFP6"][count],dataframe_SPLIF_ECFP6['VinaScore'][count],dataframe_SPLIF_ECFP6['Type'][count],dataframe_SPLIF_ECFP6["RFv1"][count],dataframe_SPLIF_ECFP6["RFv2"][count],dataframe_SPLIF_ECFP6["RFv3"][count],dataframe_SPLIF_ECFP6["nn_score"][count],dataframe_SPLIF_ECFP6["plec_score"][count],dataframe_SPLIF_ECFP6["SMILES"][count],dataframe_SPLIF_ECFP6["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP6["pchembl_value_Median"][count],dataframe_SPLIF_ECFP6["protein"][count],dataframe_SPLIF_ECFP6["compound"][count] = dataframe_SPLIF_ECFP6.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP6,Conc_SPLIF_ECFP6,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname
|
||||||
|
print("Done appending to dataframe! Number appended", count, int(run_number))
|
||||||
|
|
||||||
|
gc.collect()
|
||||||
|
if int(run_number) > 375:
|
||||||
|
print("Going dark to preserve memory..")
|
||||||
|
with open(f'config_{var_from}_{var_to}.txt', 'w') as f:
|
||||||
|
to_write = str(file_number)+"\n"+str(count)+"\n"+str(group_number)+"\n"+str(var_from)+"\n"+str(var_to)
|
||||||
|
print(to_write)
|
||||||
|
f.write(to_write)
|
||||||
|
f.close()
|
||||||
|
dataframe_SIFP.to_csv(f'../IFP_datasets/SIFP_v2/{filename[:-5]}/dataframe_SIFP_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_PLEC.to_csv(f'../IFP_datasets/PLEC_v2/{filename[:-5]}/dataframe_PLEC_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP2.to_csv(f'../IFP_datasets/SPLIF_ECFP2_v2/{filename[:-5]}/dataframe_SPLIF_ECFP2_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP4.to_csv(f'../IFP_datasets/SPLIF_ECFP4_v2/{filename[:-5]}/dataframe_SPLIF_ECFP4_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP6.to_csv(f'../IFP_datasets/SPLIF_ECFP6_v2/{filename[:-5]}/dataframe_SPLIF_ECFP6_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
os.execl(sys.executable, 'python', f'./create_IFP_datasets_v2_dense.py', str(var_from), str(var_to))
|
||||||
|
else:
|
||||||
|
pass
|
||||||
|
|
||||||
|
except:
|
||||||
|
print("No pdbqt files available maybe?")
|
||||||
|
|
||||||
|
if int(count) == -1:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
print("Going dark for the decoys..")
|
||||||
|
with open(f'config_{var_from}_{var_to}.txt', 'w') as f:
|
||||||
|
to_write = str(file_number)+"\n"+str(count)+"\n"+str(group_number)+"\n"+str(var_from)+"\n"+str(var_to)
|
||||||
|
print(to_write)
|
||||||
|
f.write(to_write)
|
||||||
|
f.close()
|
||||||
|
dataframe_SIFP.to_csv(f'../IFP_datasets/SIFP_v2/{filename[:-5]}/dataframe_SIFP_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_PLEC.to_csv(f'../IFP_datasets/PLEC_v2/{filename[:-5]}/dataframe_PLEC_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP2.to_csv(f'../IFP_datasets/SPLIF_ECFP2_v2/{filename[:-5]}/dataframe_SPLIF_ECFP2_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP4.to_csv(f'../IFP_datasets/SPLIF_ECFP4_v2/{filename[:-5]}/dataframe_SPLIF_ECFP4_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP6.to_csv(f'../IFP_datasets/SPLIF_ECFP6_v2/{filename[:-5]}/dataframe_SPLIF_ECFP6_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
os.execl(sys.executable, 'python', f'./create_IFP_datasets_v2_dense.py', str(var_from), str(var_to))
|
||||||
|
|
||||||
|
# Decoys for loop here
|
||||||
|
decoy_number = 0
|
||||||
|
for n in range(group_number, len(grouplist)):
|
||||||
|
print("On decoys!")
|
||||||
|
|
||||||
|
if int(decoy_number) >= 125:
|
||||||
|
print("Going dark to preserve memory..")
|
||||||
|
if int(group_number) > 8:
|
||||||
|
next_from = int(var_from) + 1
|
||||||
|
if next_from == var_to:
|
||||||
|
print("DONE WITH RUN")
|
||||||
|
sys.exit()
|
||||||
|
os.execl(sys.executable, 'python', f'./create_IFP_datasets_v2_dense.py', str("blub"))
|
||||||
|
file_number, count, group_number = 0,-1,0
|
||||||
|
with open(f'config_{next_from}_{var_to}.txt', 'w') as f:
|
||||||
|
to_write = str(file_number)+"\n"+str(count)+"\n"+str(group_number)+"\n"+str(next_from)+"\n"+str(var_to)
|
||||||
|
print(to_write)
|
||||||
|
f.write(to_write)
|
||||||
|
f.close()
|
||||||
|
# Checkpoint each frame under its own dataset directory and file name, tagged
# with file_number, matching the other checkpoint blocks in this script.
# Previously the SIFP frame was written to the PLEC path and the PLEC file
# carried no file_number suffix.
dataframe_SIFP.to_csv(f'../IFP_datasets/SIFP_v2/{filename[:-5]}/dataframe_SIFP_dense_{filename[:-5]}_{file_number}.csv')
dataframe_PLEC.to_csv(f'../IFP_datasets/PLEC_v2/{filename[:-5]}/dataframe_PLEC_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP2.to_csv(f'../IFP_datasets/SPLIF_ECFP2_v2/{filename[:-5]}/dataframe_SPLIF_ECFP2_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP4.to_csv(f'../IFP_datasets/SPLIF_ECFP4_v2/{filename[:-5]}/dataframe_SPLIF_ECFP4_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP6.to_csv(f'../IFP_datasets/SPLIF_ECFP6_v2/{filename[:-5]}/dataframe_SPLIF_ECFP6_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
os.execl(sys.executable, 'python', f'./create_IFP_datasets_v2_dense.py', str(var_from), str(var_to))
|
||||||
|
with open(f'config_{var_from}_{var_to}.txt', 'w') as f:
|
||||||
|
to_write = str(file_number)+"\n"+str(count)+"\n"+str(group_number)+"\n"+str(var_from)+"\n"+str(var_to)
|
||||||
|
print(to_write)
|
||||||
|
f.write(to_write)
|
||||||
|
f.close()
|
||||||
|
# Checkpoint each frame under its own dataset directory and file name, tagged
# with file_number, matching the other checkpoint blocks in this script.
# Previously the SIFP frame was written to the PLEC path and the PLEC file
# carried no file_number suffix.
dataframe_SIFP.to_csv(f'../IFP_datasets/SIFP_v2/{filename[:-5]}/dataframe_SIFP_dense_{filename[:-5]}_{file_number}.csv')
dataframe_PLEC.to_csv(f'../IFP_datasets/PLEC_v2/{filename[:-5]}/dataframe_PLEC_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP2.to_csv(f'../IFP_datasets/SPLIF_ECFP2_v2/{filename[:-5]}/dataframe_SPLIF_ECFP2_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP4.to_csv(f'../IFP_datasets/SPLIF_ECFP4_v2/{filename[:-5]}/dataframe_SPLIF_ECFP4_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP6.to_csv(f'../IFP_datasets/SPLIF_ECFP6_v2/{filename[:-5]}/dataframe_SPLIF_ECFP6_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
os.execl(sys.executable, 'python', f'./create_IFP_datasets_v2_dense.py', str(var_from), str(var_to))
|
||||||
|
else:
|
||||||
|
group_number += 1
|
||||||
|
pass
|
||||||
|
|
||||||
|
for o in sorted(os.listdir(decoy_location_1000+grouplist[n])):
|
||||||
|
print("Decoys. Decoynumber - group_number", o, group_number)
|
||||||
|
try:
|
||||||
|
file_number += 1
|
||||||
|
run_number += 1
|
||||||
|
decoy_number += 1
|
||||||
|
ligandname = f'{decoy_location_1000}{grouplist[n]}/{o}'
|
||||||
|
decoy_group = str(grouplist[n])
|
||||||
|
|
||||||
|
ligand = ''
|
||||||
|
Type = "decoy"
|
||||||
|
# ligand = next(oddt.toolkit.readfile('pdbqt', decoy_location_1000+grouplist[n]+"/"+o))
|
||||||
|
os.system(f'obabel -ipdbqt {decoy_location_1000}{grouplist[n]}/{o} -osdf -O sparse_{filename[:-5]}.sdf')
|
||||||
|
ligand = next(oddt.toolkit.readfile('sdf', f'sparse_{filename[:-5]}.sdf'))
|
||||||
|
|
||||||
|
KLIFS_pocket_IFP = get_pocket_IFP(klifs_id=klifs_id, pocket=pocket, ligand=ligand)
|
||||||
|
Conc_KLIFS_pocket_IFP = KLIFS_pocket_IFP
|
||||||
|
KLIFS_pocket_IFP = ''.join(str(list(KLIFS_pocket_IFP)))
|
||||||
|
|
||||||
|
PLEC = oddt.fingerprints.PLEC(ligand, protein, depth_ligand=2, depth_protein=4, distance_cutoff=4.5, size=16384, count_bits=True, sparse=False, ignore_hoh=True)
|
||||||
|
Conc_PLEC = PLEC
|
||||||
|
PLEC = ''.join(str(list(PLEC)))
|
||||||
|
|
||||||
|
SPLIF_ECFP2 = oddt.fingerprints.SPLIF(ligand, protein, depth=1, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP2['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP2 = SPLIF_ECFP2
|
||||||
|
SPLIF_ECFP2 = ''.join(str(list(SPLIF_ECFP2)))
|
||||||
|
|
||||||
|
SPLIF_ECFP4 = oddt.fingerprints.SPLIF(ligand, protein, depth=2, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP4 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP4['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP4 = SPLIF_ECFP4
|
||||||
|
SPLIF_ECFP4 = ''.join(str(list(SPLIF_ECFP4)))
|
||||||
|
|
||||||
|
SPLIF_ECFP6 = oddt.fingerprints.SPLIF(ligand, protein, depth=3, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP6 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP6['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP6 = SPLIF_ECFP6
|
||||||
|
SPLIF_ECFP6 = ''.join(str(list(SPLIF_ECFP6)))
|
||||||
|
|
||||||
|
RFv1 = rf1.predict(ligand)
|
||||||
|
RFv2 = rf2.predict(ligand)
|
||||||
|
RFv3 = rf3.predict(ligand)
|
||||||
|
plec_score = plecscore.predict(ligand)
|
||||||
|
nn_score = nn.predict([ligand])
|
||||||
|
VinaScore = ligand.data
|
||||||
|
|
||||||
|
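# Accumulate the fingerprints of the remaining docked poses onto the first pose's fingerprint (element-wise sum via +=, not a true concatenation)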
|
||||||
|
try:
|
||||||
|
# ligand = list(oddt.toolkit.readfile('pdbqt', decoy_location_1000+grouplist[n]+"/"+o))
|
||||||
|
ligand = list(oddt.toolkit.readfile('sdf', f'sparse_{filename[:-5]}.sdf'))
|
||||||
|
for p in [x for x in range(len(ligand)) if x != 0]:
|
||||||
|
KLIFS_pocket_IFP_v2 = get_pocket_IFP(klifs_id=klifs_id, pocket=pocket, ligand=ligand[p])
|
||||||
|
Conc_KLIFS_pocket_IFP += KLIFS_pocket_IFP_v2
|
||||||
|
|
||||||
|
PLEC_v2 = oddt.fingerprints.PLEC(ligand[p], protein, depth_ligand=2, depth_protein=4, distance_cutoff=4.5, size=16384, count_bits=True, sparse=False, ignore_hoh=True)
|
||||||
|
Conc_PLEC += PLEC_v2
|
||||||
|
|
||||||
|
SPLIF_ECFP2_v2 = oddt.fingerprints.SPLIF(ligand[p], protein, depth=1, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP2_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP2_v2['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP2 += SPLIF_ECFP2_v2
|
||||||
|
|
||||||
|
SPLIF_ECFP4_v2 = oddt.fingerprints.SPLIF(ligand[p], protein, depth=2, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP4_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP4_v2['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP4 += SPLIF_ECFP4_v2
|
||||||
|
|
||||||
|
SPLIF_ECFP6_v2 = oddt.fingerprints.SPLIF(ligand[p], protein, depth=3, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP6_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP6_v2['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP6 += SPLIF_ECFP6_v2
|
||||||
|
Conc_KLIFS_pocket_IFP = ''.join(str(list(Conc_KLIFS_pocket_IFP)))
|
||||||
|
Conc_PLEC = ''.join(str(list(Conc_PLEC)))
|
||||||
|
Conc_SPLIF_ECFP2 = ''.join(str(list(Conc_SPLIF_ECFP2)))
|
||||||
|
Conc_SPLIF_ECFP4 = ''.join(str(list(Conc_SPLIF_ECFP4)))
|
||||||
|
Conc_SPLIF_ECFP6 = ''.join(str(list(Conc_SPLIF_ECFP6)))
|
||||||
|
# Report the pose count from the list itself: the loop index is only the last
# pose's 0-based position and is unset (or stale) when the file holds one pose.
print("Concatenated poses succesfully! Number of poses: "+str(len(ligand)))
|
||||||
|
|
||||||
|
except:
|
||||||
|
print("Decoys. This file might not have any poses?")
|
||||||
|
|
||||||
|
#Dense to sparse for storage
|
||||||
|
# PLEC = oddt.fingerprints.dense_to_sparse(PLEC, size=16384)
|
||||||
|
# Conc_PLEC = oddt.fingerprints.dense_to_sparse(Conc_PLEC, size=16384)
|
||||||
|
# SPLIF_ECFP2 = oddt.fingerprints.dense_to_sparse(SPLIF_ECFP2, size=4096)
|
||||||
|
# Conc_SPLIF_ECFP2 = oddt.fingerprints.dense_to_sparse(Conc_SPLIF_ECFP2, size=4096)
|
||||||
|
# SPLIF_ECFP4 = oddt.fingerprints.dense_to_sparse(SPLIF_ECFP4, size=4096)
|
||||||
|
# Conc_SPLIF_ECFP4 = oddt.fingerprints.dense_to_sparse(Conc_SPLIF_ECFP4, size=4096)
|
||||||
|
# SPLIF_ECFP6 = oddt.fingerprints.dense_to_sparse(SPLIF_ECFP6, size=4096)
|
||||||
|
# Conc_SPLIF_ECFP6 = oddt.fingerprints.dense_to_sparse(Conc_SPLIF_ECFP6, size=4096)
|
||||||
|
|
||||||
|
# This doesn't (yet) work for the decoys!
|
||||||
|
# with open(decoy_location_1000+grouplist[n]+j,'r') as f:
|
||||||
|
# pdbqt_file = f.read()
|
||||||
|
# f.close()
|
||||||
|
# number_from_file = find_number(pdbqt_file, '.smi:')
|
||||||
|
# target_smiles = f'../my_rp1_compounds_kinases/{Type}s/1000/{group}_125.smi'
|
||||||
|
# with open(target_smiles, 'r') as f:
|
||||||
|
# smiles_file = f.readlines()
|
||||||
|
# f.close()
|
||||||
|
# SMILES = str(smiles_file[int(number_from_file[0])]).replace("\n","")
|
||||||
|
SMILES,pchembl_mean,pchembl_median = '','',''
|
||||||
|
print("On dataframe!")
|
||||||
|
|
||||||
|
count += 1
|
||||||
|
dataframe_SIFP,dataframe_SIFP["KLIFS_pocket_IFP"][count],dataframe_SIFP["Conc_KLIFS_pocket_IFP"][count],dataframe_SIFP['VinaScore'][count],dataframe_SIFP['Type'][count],dataframe_SIFP["RFv1"][count],dataframe_SIFP["RFv2"][count],dataframe_SIFP["RFv3"][count],dataframe_SIFP["nn_score"][count],dataframe_SIFP["plec_score"][count],dataframe_SIFP["SMILES"][count],dataframe_SIFP["pchembl_value_Mean"][count],dataframe_SIFP["pchembl_value_Median"][count],dataframe_SIFP["protein"][count],dataframe_SIFP["compound"][count],dataframe_SIFP["decoy_group"][count] = dataframe_SIFP.append(csv_dataframe.iloc[i],ignore_index=True),KLIFS_pocket_IFP,Conc_KLIFS_pocket_IFP,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname,decoy_group
|
||||||
|
dataframe_PLEC,dataframe_PLEC["PLEC"][count],dataframe_PLEC["Conc_PLEC"][count],dataframe_PLEC['VinaScore'][count],dataframe_PLEC['Type'][count],dataframe_PLEC["RFv1"][count],dataframe_PLEC["RFv2"][count],dataframe_PLEC["RFv3"][count],dataframe_PLEC["nn_score"][count],dataframe_PLEC["plec_score"][count],dataframe_PLEC["SMILES"][count],dataframe_PLEC["pchembl_value_Mean"][count],dataframe_PLEC["pchembl_value_Median"][count],dataframe_PLEC["protein"][count],dataframe_PLEC["compound"][count],dataframe_PLEC["decoy_group"][count] = dataframe_PLEC.append(csv_dataframe.iloc[i],ignore_index=True),PLEC,Conc_PLEC,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname,decoy_group
|
||||||
|
dataframe_SPLIF_ECFP2,dataframe_SPLIF_ECFP2["SPLIF_ECFP2"][count],dataframe_SPLIF_ECFP2["Conc_SPLIF_ECFP2"][count],dataframe_SPLIF_ECFP2['VinaScore'][count],dataframe_SPLIF_ECFP2['Type'][count],dataframe_SPLIF_ECFP2["RFv1"][count],dataframe_SPLIF_ECFP2["RFv2"][count],dataframe_SPLIF_ECFP2["RFv3"][count],dataframe_SPLIF_ECFP2["nn_score"][count],dataframe_SPLIF_ECFP2["plec_score"][count],dataframe_SPLIF_ECFP2["SMILES"][count],dataframe_SPLIF_ECFP2["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP2["pchembl_value_Median"][count],dataframe_SPLIF_ECFP2["protein"][count],dataframe_SPLIF_ECFP2["compound"][count],dataframe_SPLIF_ECFP2["decoy_group"][count] = dataframe_SPLIF_ECFP2.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP2,Conc_SPLIF_ECFP2,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname,decoy_group
|
||||||
|
# Append the template row, then fill in this decoy's values. Targets are
# assigned left to right, so the first target rebinds the frame to the appended
# copy before the cell targets index into it. Every cell target must therefore
# be dataframe_SPLIF_ECFP4 (plec_score and decoy_group previously went to the
# ECFP2 frame).
# NOTE(review): assumes this frame has "plec_score" and "decoy_group" columns like the others - confirm where it is created.
dataframe_SPLIF_ECFP4,dataframe_SPLIF_ECFP4["SPLIF_ECFP4"][count],dataframe_SPLIF_ECFP4["Conc_SPLIF_ECFP4"][count],dataframe_SPLIF_ECFP4['VinaScore'][count],dataframe_SPLIF_ECFP4['Type'][count],dataframe_SPLIF_ECFP4["RFv1"][count],dataframe_SPLIF_ECFP4["RFv2"][count],dataframe_SPLIF_ECFP4["RFv3"][count],dataframe_SPLIF_ECFP4["nn_score"][count],dataframe_SPLIF_ECFP4["plec_score"][count],dataframe_SPLIF_ECFP4["SMILES"][count],dataframe_SPLIF_ECFP4["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP4["pchembl_value_Median"][count],dataframe_SPLIF_ECFP4["protein"][count],dataframe_SPLIF_ECFP4["compound"][count],dataframe_SPLIF_ECFP4["decoy_group"][count] = dataframe_SPLIF_ECFP4.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP4,Conc_SPLIF_ECFP4,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname,decoy_group
|
||||||
|
# Append the template row, then fill in this decoy's values. Targets are
# assigned left to right, so the first target rebinds the frame to the appended
# copy before the cell targets index into it. Every cell target must therefore
# be dataframe_SPLIF_ECFP6 (plec_score and decoy_group previously went to the
# ECFP2 frame).
# NOTE(review): assumes this frame has "plec_score" and "decoy_group" columns like the others - confirm where it is created.
dataframe_SPLIF_ECFP6,dataframe_SPLIF_ECFP6["SPLIF_ECFP6"][count],dataframe_SPLIF_ECFP6["Conc_SPLIF_ECFP6"][count],dataframe_SPLIF_ECFP6['VinaScore'][count],dataframe_SPLIF_ECFP6['Type'][count],dataframe_SPLIF_ECFP6["RFv1"][count],dataframe_SPLIF_ECFP6["RFv2"][count],dataframe_SPLIF_ECFP6["RFv3"][count],dataframe_SPLIF_ECFP6["nn_score"][count],dataframe_SPLIF_ECFP6["plec_score"][count],dataframe_SPLIF_ECFP6["SMILES"][count],dataframe_SPLIF_ECFP6["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP6["pchembl_value_Median"][count],dataframe_SPLIF_ECFP6["protein"][count],dataframe_SPLIF_ECFP6["compound"][count],dataframe_SPLIF_ECFP6["decoy_group"][count] = dataframe_SPLIF_ECFP6.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP6,Conc_SPLIF_ECFP6,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname,decoy_group
|
||||||
|
print("Done appending to dataframe! Number appended", count, int(run_number))
|
||||||
|
|
||||||
|
gc.collect()
|
||||||
|
except:
|
||||||
|
print("No pdbqt files available maybe?")
|
||||||
|
|
||||||
|
# Decoys of 1625, IF available
|
||||||
|
try:
|
||||||
|
for o in sorted(os.listdir(decoy_location_1625+grouplist[n])):
|
||||||
|
print("Decoys. Decoynumber - group_number", o, group_number)
|
||||||
|
try:
|
||||||
|
# file_number += 1
|
||||||
|
# run_number += 1
|
||||||
|
# decoy_number += 1
|
||||||
|
ligandname = f'{decoy_location_1625}{grouplist[n]}/{o}'
|
||||||
|
decoy_group = str(grouplist[n])
|
||||||
|
|
||||||
|
ligand = ''
|
||||||
|
Type = "decoy_v2"
|
||||||
|
os.system(f'obabel -ipdbqt {decoy_location_1625}{grouplist[n]}/{o} -osdf -O sparse_{filename[:-5]}.sdf')
|
||||||
|
ligand = next(oddt.toolkit.readfile('sdf', f'sparse_{filename[:-5]}.sdf'))
|
||||||
|
|
||||||
|
KLIFS_pocket_IFP = get_pocket_IFP(klifs_id=klifs_id, pocket=pocket, ligand=ligand)
|
||||||
|
Conc_KLIFS_pocket_IFP = KLIFS_pocket_IFP
|
||||||
|
KLIFS_pocket_IFP = ''.join(str(list(KLIFS_pocket_IFP)))
|
||||||
|
|
||||||
|
PLEC = oddt.fingerprints.PLEC(ligand, protein, depth_ligand=2, depth_protein=4, distance_cutoff=4.5, size=16384, count_bits=True, sparse=False, ignore_hoh=True)
|
||||||
|
Conc_PLEC = PLEC
|
||||||
|
PLEC = ''.join(str(list(PLEC)))
|
||||||
|
|
||||||
|
SPLIF_ECFP2 = oddt.fingerprints.SPLIF(ligand, protein, depth=1, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP2['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP2 = SPLIF_ECFP2
|
||||||
|
SPLIF_ECFP2 = ''.join(str(list(SPLIF_ECFP2)))
|
||||||
|
|
||||||
|
SPLIF_ECFP4 = oddt.fingerprints.SPLIF(ligand, protein, depth=2, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP4 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP4['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP4 = SPLIF_ECFP4
|
||||||
|
SPLIF_ECFP4 = ''.join(str(list(SPLIF_ECFP4)))
|
||||||
|
|
||||||
|
SPLIF_ECFP6 = oddt.fingerprints.SPLIF(ligand, protein, depth=3, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP6 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP6['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP6 = SPLIF_ECFP6
|
||||||
|
SPLIF_ECFP6 = ''.join(str(list(SPLIF_ECFP6)))
|
||||||
|
|
||||||
|
RFv1 = rf1.predict(ligand)
|
||||||
|
RFv2 = rf2.predict(ligand)
|
||||||
|
RFv3 = rf3.predict(ligand)
|
||||||
|
plec_score = plecscore.predict(ligand)
|
||||||
|
nn_score = nn.predict([ligand])
|
||||||
|
VinaScore = ligand.data
|
||||||
|
|
||||||
|
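# Accumulate the fingerprints of the remaining docked poses onto the first pose's fingerprint (element-wise sum via +=, not a true concatenation)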
|
||||||
|
try:
|
||||||
|
# ligand = list(oddt.toolkit.readfile('pdbqt', decoy_location_1000+grouplist[n]+"/"+o))
|
||||||
|
ligand = list(oddt.toolkit.readfile('sdf', f'sparse_{filename[:-5]}.sdf'))
|
||||||
|
for p in [x for x in range(len(ligand)) if x != 0]:
|
||||||
|
KLIFS_pocket_IFP_v2 = get_pocket_IFP(klifs_id=klifs_id, pocket=pocket, ligand=ligand[p])
|
||||||
|
Conc_KLIFS_pocket_IFP += KLIFS_pocket_IFP_v2
|
||||||
|
|
||||||
|
PLEC_v2 = oddt.fingerprints.PLEC(ligand[p], protein, depth_ligand=2, depth_protein=4, distance_cutoff=4.5, size=16384, count_bits=True, sparse=False, ignore_hoh=True)
|
||||||
|
Conc_PLEC += PLEC_v2
|
||||||
|
|
||||||
|
SPLIF_ECFP2_v2 = oddt.fingerprints.SPLIF(ligand[p], protein, depth=1, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP2_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP2_v2['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP2 += SPLIF_ECFP2_v2
|
||||||
|
|
||||||
|
SPLIF_ECFP4_v2 = oddt.fingerprints.SPLIF(ligand[p], protein, depth=2, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP4_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP4_v2['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP4 += SPLIF_ECFP4_v2
|
||||||
|
|
||||||
|
SPLIF_ECFP6_v2 = oddt.fingerprints.SPLIF(ligand[p], protein, depth=3, size=4096, distance_cutoff=4.5)
|
||||||
|
SPLIF_ECFP6_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP6_v2['hash'], size=4096)
|
||||||
|
Conc_SPLIF_ECFP6 += SPLIF_ECFP6_v2
|
||||||
|
Conc_KLIFS_pocket_IFP = ''.join(str(list(Conc_KLIFS_pocket_IFP)))
|
||||||
|
Conc_PLEC = ''.join(str(list(Conc_PLEC)))
|
||||||
|
Conc_SPLIF_ECFP2 = ''.join(str(list(Conc_SPLIF_ECFP2)))
|
||||||
|
Conc_SPLIF_ECFP4 = ''.join(str(list(Conc_SPLIF_ECFP4)))
|
||||||
|
Conc_SPLIF_ECFP6 = ''.join(str(list(Conc_SPLIF_ECFP6)))
|
||||||
|
# Report the pose count from the list itself: the loop index is only the last
# pose's 0-based position and is unset (or stale) when the file holds one pose.
print("Concatenated poses succesfully! Number of poses: "+str(len(ligand)))
|
||||||
|
|
||||||
|
except:
|
||||||
|
print("Decoys. This file might not have any poses?")
|
||||||
|
|
||||||
|
SMILES,pchembl_mean,pchembl_median = '','',''
|
||||||
|
print("On dataframe!")
|
||||||
|
|
||||||
|
count += 1
|
||||||
|
dataframe_SIFP,dataframe_SIFP["KLIFS_pocket_IFP"][count],dataframe_SIFP["Conc_KLIFS_pocket_IFP"][count],dataframe_SIFP['VinaScore'][count],dataframe_SIFP['Type'][count],dataframe_SIFP["RFv1"][count],dataframe_SIFP["RFv2"][count],dataframe_SIFP["RFv3"][count],dataframe_SIFP["nn_score"][count],dataframe_SIFP["plec_score"][count],dataframe_SIFP["SMILES"][count],dataframe_SIFP["pchembl_value_Mean"][count],dataframe_SIFP["pchembl_value_Median"][count],dataframe_SIFP["protein"][count],dataframe_SIFP["compound"][count],dataframe_SIFP["decoy_group"][count] = dataframe_SIFP.append(csv_dataframe.iloc[i],ignore_index=True),KLIFS_pocket_IFP,Conc_KLIFS_pocket_IFP,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname,decoy_group
|
||||||
|
dataframe_PLEC,dataframe_PLEC["PLEC"][count],dataframe_PLEC["Conc_PLEC"][count],dataframe_PLEC['VinaScore'][count],dataframe_PLEC['Type'][count],dataframe_PLEC["RFv1"][count],dataframe_PLEC["RFv2"][count],dataframe_PLEC["RFv3"][count],dataframe_PLEC["nn_score"][count],dataframe_PLEC["plec_score"][count],dataframe_PLEC["SMILES"][count],dataframe_PLEC["pchembl_value_Mean"][count],dataframe_PLEC["pchembl_value_Median"][count],dataframe_PLEC["protein"][count],dataframe_PLEC["compound"][count],dataframe_PLEC["decoy_group"][count] = dataframe_PLEC.append(csv_dataframe.iloc[i],ignore_index=True),PLEC,Conc_PLEC,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname,decoy_group
|
||||||
|
dataframe_SPLIF_ECFP2,dataframe_SPLIF_ECFP2["SPLIF_ECFP2"][count],dataframe_SPLIF_ECFP2["Conc_SPLIF_ECFP2"][count],dataframe_SPLIF_ECFP2['VinaScore'][count],dataframe_SPLIF_ECFP2['Type'][count],dataframe_SPLIF_ECFP2["RFv1"][count],dataframe_SPLIF_ECFP2["RFv2"][count],dataframe_SPLIF_ECFP2["RFv3"][count],dataframe_SPLIF_ECFP2["nn_score"][count],dataframe_SPLIF_ECFP2["plec_score"][count],dataframe_SPLIF_ECFP2["SMILES"][count],dataframe_SPLIF_ECFP2["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP2["pchembl_value_Median"][count],dataframe_SPLIF_ECFP2["protein"][count],dataframe_SPLIF_ECFP2["compound"][count],dataframe_SPLIF_ECFP2["decoy_group"][count] = dataframe_SPLIF_ECFP2.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP2,Conc_SPLIF_ECFP2,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname,decoy_group
|
||||||
|
# Append the template row, then fill in this decoy's values. Targets are
# assigned left to right, so the first target rebinds the frame to the appended
# copy before the cell targets index into it. Every cell target must therefore
# be dataframe_SPLIF_ECFP4 (plec_score and decoy_group previously went to the
# ECFP2 frame).
# NOTE(review): assumes this frame has "plec_score" and "decoy_group" columns like the others - confirm where it is created.
dataframe_SPLIF_ECFP4,dataframe_SPLIF_ECFP4["SPLIF_ECFP4"][count],dataframe_SPLIF_ECFP4["Conc_SPLIF_ECFP4"][count],dataframe_SPLIF_ECFP4['VinaScore'][count],dataframe_SPLIF_ECFP4['Type'][count],dataframe_SPLIF_ECFP4["RFv1"][count],dataframe_SPLIF_ECFP4["RFv2"][count],dataframe_SPLIF_ECFP4["RFv3"][count],dataframe_SPLIF_ECFP4["nn_score"][count],dataframe_SPLIF_ECFP4["plec_score"][count],dataframe_SPLIF_ECFP4["SMILES"][count],dataframe_SPLIF_ECFP4["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP4["pchembl_value_Median"][count],dataframe_SPLIF_ECFP4["protein"][count],dataframe_SPLIF_ECFP4["compound"][count],dataframe_SPLIF_ECFP4["decoy_group"][count] = dataframe_SPLIF_ECFP4.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP4,Conc_SPLIF_ECFP4,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname,decoy_group
|
||||||
|
# Append the template row, then fill in this decoy's values. Targets are
# assigned left to right, so the first target rebinds the frame to the appended
# copy before the cell targets index into it. Every cell target must therefore
# be dataframe_SPLIF_ECFP6 (plec_score and decoy_group previously went to the
# ECFP2 frame).
# NOTE(review): assumes this frame has "plec_score" and "decoy_group" columns like the others - confirm where it is created.
dataframe_SPLIF_ECFP6,dataframe_SPLIF_ECFP6["SPLIF_ECFP6"][count],dataframe_SPLIF_ECFP6["Conc_SPLIF_ECFP6"][count],dataframe_SPLIF_ECFP6['VinaScore'][count],dataframe_SPLIF_ECFP6['Type'][count],dataframe_SPLIF_ECFP6["RFv1"][count],dataframe_SPLIF_ECFP6["RFv2"][count],dataframe_SPLIF_ECFP6["RFv3"][count],dataframe_SPLIF_ECFP6["nn_score"][count],dataframe_SPLIF_ECFP6["plec_score"][count],dataframe_SPLIF_ECFP6["SMILES"][count],dataframe_SPLIF_ECFP6["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP6["pchembl_value_Median"][count],dataframe_SPLIF_ECFP6["protein"][count],dataframe_SPLIF_ECFP6["compound"][count],dataframe_SPLIF_ECFP6["decoy_group"][count] = dataframe_SPLIF_ECFP6.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP6,Conc_SPLIF_ECFP6,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname,decoy_group
|
||||||
|
print("Done appending to dataframe! Number appended", count, int(run_number))
|
||||||
|
except:
|
||||||
|
print("No pdbqt files available maybe?")
|
||||||
|
|
||||||
|
|
||||||
|
except:
|
||||||
|
print("No second decoy files available")
|
||||||
|
|
||||||
|
if int(group_number) > 8:
|
||||||
|
var_from = next_from
|
||||||
|
else:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if int(decoy_number) >= 125:
|
||||||
|
print("Going dark to preserve memory..")
|
||||||
|
with open(f'config_{var_from}_{var_to}.txt', 'w') as f:
|
||||||
|
to_write = str(file_number)+"\n"+str(count)+"\n"+str(group_number)+"\n"+str(var_from)+"\n"+str(var_to)
|
||||||
|
print(to_write)
|
||||||
|
f.write(to_write)
|
||||||
|
f.close()
|
||||||
|
dataframe_SIFP.to_csv(f'../IFP_datasets/SIFP_v2/{filename[:-5]}/dataframe_SIFP_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_PLEC.to_csv(f'../IFP_datasets/PLEC_v2/{filename[:-5]}/dataframe_PLEC_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP2.to_csv(f'../IFP_datasets/SPLIF_ECFP2_v2/{filename[:-5]}/dataframe_SPLIF_ECFP2_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP4.to_csv(f'../IFP_datasets/SPLIF_ECFP4_v2/{filename[:-5]}/dataframe_SPLIF_ECFP4_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP6.to_csv(f'../IFP_datasets/SPLIF_ECFP6_v2/{filename[:-5]}/dataframe_SPLIF_ECFP6_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
os.execl(sys.executable, 'python', f'./create_IFP_datasets_v2_dense.py', str(var_from), str(var_to))
|
||||||
|
else:
|
||||||
|
pass
|
||||||
|
except:
|
||||||
|
print("There is a time to run and a time to error out")
|
||||||
|
# ListFaultyStructures.append(filename)
|
||||||
|
quit("No valid structure to calculate on")
|
||||||
|
except:
|
||||||
|
print("Maybe haven't downloaded ", filename,"moving on to next structure!")
|
||||||
|
ListFaultyStructures.append(filename)
|
||||||
|
next_from = int(var_from) + 1
|
||||||
|
if next_from == var_to:
|
||||||
|
print("DONE WITH RUN")
|
||||||
|
sys.exit()
|
||||||
|
os.execl(sys.executable, 'python', f'./create_IFP_datasets_v2_dense.py', str("blub"))
|
||||||
|
file_number, count, group_number = 0,-1,0
|
||||||
|
with open(f'config_{next_from}_{var_to}.txt', 'w') as f:
|
||||||
|
to_write = str(file_number)+"\n"+str(count)+"\n"+str(group_number)+"\n"+str(next_from)+"\n"+str(var_to)
|
||||||
|
print(to_write)
|
||||||
|
f.write(to_write)
|
||||||
|
f.close()
|
||||||
|
dataframe_SIFP.to_csv(f'../IFP_datasets/SIFP_v2/{filename[:-5]}/dataframe_SIFP_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_PLEC.to_csv(f'../IFP_datasets/PLEC_v2/{filename[:-5]}/dataframe_PLEC_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP2.to_csv(f'../IFP_datasets/SPLIF_ECFP2_v2/{filename[:-5]}/dataframe_SPLIF_ECFP2_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP4.to_csv(f'../IFP_datasets/SPLIF_ECFP4_v2/{filename[:-5]}/dataframe_SPLIF_ECFP4_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
dataframe_SPLIF_ECFP6.to_csv(f'../IFP_datasets/SPLIF_ECFP6_v2/{filename[:-5]}/dataframe_SPLIF_ECFP6_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
os.execl(sys.executable, 'python', f'./create_IFP_datasets_v2_dense.py', str(next_from), str(var_to))
|
||||||
|
|
||||||
|
print("Done with run!")
|
||||||
|
# dataframe_SIFP.to_csv(f'../IFP_datasets/SIFP_v2/{filename[:-5]}/dataframe_SIFP_dense_{filename[:-5]}_{file_number}.csv')
|
||||||
|
# dataframe_PLEC.to_csv(f'../IFP_datasets/PLEC_v2/{filename[:-5]}/dataframe_PLEC_dense_{filename[:-5]}.csv')
|
||||||
|
# dataframe_SPLIF_ECFP2.to_csv(f'../IFP_datasets/SPLIF_ECFP2_v2/{filename[:-5]}/dataframe_SPLIF_ECFP2_dense_{filename[:-5]}.csv')
|
||||||
|
# dataframe_SPLIF_ECFP4.to_csv(f'../IFP_datasets/SPLIF_ECFP4_v2/{filename[:-5]}/dataframe_SPLIF_ECFP4_dense_{filename[:-5]}.csv')
|
||||||
|
# dataframe_SPLIF_ECFP6.to_csv(f'../IFP_datasets/SPLIF_ECFP6_v2/{filename[:-5]}/dataframe_SPLIF_ECFP6_dense_{filename[:-5]}.csv')
|
||||||
|
# with open(f'../IFP_datasets/FaultyStructures_dense_{filename[:-5]}.txt', "w") as f:
|
||||||
|
# f.writelines(ListFaultyStructures)
|
||||||
|
# f.close()
|
||||||
|
# rdkit.SimDivFilters.rdSimDivPickers.MaxMinPicker()
|
Loading…
Reference in New Issue
Block a user