# -*- coding: utf-8 -*-
# Git patch header that introduced this file (kept for provenance):
#   From 731466dc9a786a1a24e951530d88f29516bd3fed Mon Sep 17 00:00:00 2001
#   From: Jonathan Herrewijnen
#   Date: Mon, 31 Oct 2022 14:42:35 +0100
#   Subject: [PATCH] IFP script added, others still required.
#   create_IFP_datasets_v2_dense.py | 799 insertions(+)
#   new file mode 100644, index 0000000..141d0c3
"""
Created on Wed May 26 20:27:23 2021

@author: Jonathan

The concatenation is not actual concatenation! It's just a sum up.. (my bad)
"""
import os
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import oddt
import oddt.interactions
import oddt.fingerprints
import oddt.scoring.descriptors.binana
import oddt.docking.AutodockVina
import rdkit
from oddt.scoring.functions.RFScore import rfscore
from oddt.scoring.functions.NNScore import nnscore
from oddt.scoring.functions.PLECscore import PLECscore
import pickle
import re
from rdkit import Chem
import sys
import gc
import urllib.request


def find_number(text, c):
    '''Return every run of digits in ``text`` that directly follows ``c``.

    :param text: string to search
    :param c: prefix placed in front of the captured digits. It is
        interpolated into the regular expression unescaped, so regex
        metacharacters in ``c`` keep their special meaning (``'.smi:'``
        matches any character followed by ``smi:``).
    :return: list of digit strings, in order of appearance (may be empty)
    '''
    return re.findall(r'%s(\d+)' % c, text)


# Definitions created by Olivier Béquignon (Adding them to add more information to the dataset)
def get_pocket(klifs_id):
    '''Obtain the 85 residue binding pocket structure from the specified KLIFS structure ID

    :param klifs_id: KLIFS structure ID
    :return: ODDT protein structure of the binding pocket
    '''
    url = f'https://klifs.net/api_v2/structure_get_pocket?structure_ID={klifs_id}'
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(url) as response:
        data = response.read().decode()
    pocket = oddt.toolkit.readstring('mol2', data)
    pocket.protein = True
    return pocket


def get_ligand(klifs_id):
    '''Obtain the ligand structure from the specified KLIFS structure ID

    :param klifs_id: KLIFS structure ID
    :return: ODDT ligand structure, or None when KLIFS returns an empty body
    '''
    url = f'https://klifs.net/api_v2/structure_get_ligand?structure_ID={klifs_id}'
    with urllib.request.urlopen(url) as response:
        data = response.read().decode()
    if not data:
        return None
    return oddt.toolkit.readstring('mol2', data)


def get_pocket_IFP(klifs_id: int=None,
                   pocket: oddt.toolkit.readfile=None,
                   ligand: oddt.toolkit.readfile=None):
    '''Obtain the interaction fingerprint of the ligand and protein pocket residues.

    :param klifs_id: KLIFS structure ID (ignored if pocket and ligand provided)
    :param pocket: ODDT pocket. If None, the KLIFS pocket is used
    :param ligand: ODDT ligand. If None, the KLIFS ligand is used

    :return: Binary IFP
    :raises ValueError: if a structure has to be fetched but no KLIFS ID was
        given, or if KLIFS has no ligand for the given ID
    '''
    # NOTE(review): the parameter annotations reference the ``readfile``
    # function rather than a molecule class; they are kept unchanged here.
    # A KLIFS ID is required as soon as *either* structure must be downloaded,
    # not only when both are missing.
    if klifs_id is None and (pocket is None or ligand is None):
        raise ValueError('Must at least provide a KLIFS ID if any of pocket and/or ligand is/are missing')
    if ligand is None:
        ligand = get_ligand(klifs_id)
        if ligand is None:
            raise ValueError(f'KLIFS returned no ligand for structure ID {klifs_id}')
    if pocket is None:
        pocket = get_pocket(klifs_id)
    return oddt.fingerprints.InteractionFingerprint(ligand, pocket)


def to_dense_fp(bits, size):
    '''Expand the string form of a sparse fingerprint into a dense list.

    :param bits: text such as ``"{3: 1, 17: 2}"``; the first and last
        characters are dropped and the rest is parsed as ``index: value``
        integer pairs separated by ``", "``
    :param size: length of the returned list
    :return: list of ``size`` ints, 0 wherever no value was stored
    '''
    body = bits[1:-1].strip()
    if not body:
        # Empty fingerprint ("{}"): nothing to parse, every position is 0.
        return [0] * size
    X = dict(list(map(int, x.split(': '))) for x in body.split(', '))
    return [X.get(i, 0) for i in range(size)]


def to_sparse_fp(X):
    '''Collapse the string form of a dense fingerprint into a sparse dict.

    :param X: text such as ``"[0, 2, 0, 1]"``; the first and last characters
        are dropped and the rest is parsed as ``", "``-separated integers
    :return: dict mapping position -> value for every value greater than 0
    '''
    body = X[1:-1].strip()
    if not body:
        # Empty fingerprint ("[]").
        return {}
    return {i: x for i, x in enumerate(map(int, body.split(', '))) if x > 0}


def _empty_result_frame(fingerprint_column):
    '''Create an empty result frame for one fingerprint type.

    :param fingerprint_column: name of the fingerprint column; a second
        column ``Conc_<fingerprint_column>`` is created next to it
    :return: DataFrame without rows holding the 15 result columns
    '''
    frame = pd.DataFrame()
    for column in ("Type", fingerprint_column, f"Conc_{fingerprint_column}",
                   "VinaScore", "RFv1", "RFv2", "RFv3", "nn_score",
                   "plec_score", "SMILES", "pchembl_value_Mean",
                   "pchembl_value_Median", "protein", "compound",
                   "decoy_group"):
        frame[column] = ''
    return frame


n_in_range = 0
n_per_step = 1
if len(sys.argv) < 3:
    sys.exit(f"usage: {sys.argv[0]} <var_from> <var_to>")
var_from = sys.argv[1]
var_to = sys.argv[2]
next_from = int(var_from) + 1

print(var_from, var_to)
if var_from == var_to:
    print("var_from and var_to are the same, exiting!")
    sys.exit()

# Resume state written by an earlier run of this script, if there is one.
# File layout, one value per line: file_number, count, group_number,
# var_from, var_to.
try:
    with open(f'config_{var_from}_{var_to}.txt', 'r') as f:
        config_file = f.readlines()
    # Parse everything before rebinding any global, so a truncated or corrupt
    # file cannot leave var_from/var_to half-converted.
    saved_file_number = int(config_file[0])
    saved_group_number = int(config_file[2])
    saved_var_from = int(config_file[3])
    saved_var_to = int(config_file[4])
except (OSError, IndexError, ValueError):
    print("No config file. Count is -1")
    file_number = 0
    count = -1
    group_number = 0
else:
    file_number = saved_file_number
    # The stored count (second line) is ignored on purpose: the row counter
    # always restarts at -1 (presumably because the result frames below start
    # out empty on every run).
    count = -1
    group_number = saved_group_number
    var_from = saved_var_from
    var_to = saved_var_to
    print("File number and count", file_number, count)

# Load in dataset and add empty columns for the Interaction Fingerprints
csv_dataframe = pd.read_csv('../KLIFS_kinase_structure_data_selection_subselection_np.csv')
csv_active_compounds = pd.read_csv('../../my_rp1_compounds_kinases/uniprot_kinase_actives/uniprot_kinase_actives.csv')
csv_inactive_compounds = pd.read_csv('../../my_rp1_compounds_kinases/uniprot_kinase_inactives/uniprot_kinase_inactives.csv')

dataframe_SIFP = _empty_result_frame("KLIFS_pocket_IFP")
dataframe_PLEC = _empty_result_frame("PLEC")
dataframe_SPLIF_ECFP2 = _empty_result_frame("SPLIF_ECFP2")
dataframe_SPLIF_ECFP4 = _empty_result_frame("SPLIF_ECFP4")
dataframe_SPLIF_ECFP6 = _empty_result_frame("SPLIF_ECFP6")

ListFaultyStructures = []

# Iterate through my csv and select for each structure the folder with actives and the folder with inactives. Then calculate fingerprints.
+# for i in range(len(csv_dataframe["filename"])): +# for i in range(0+n_in_range*n_per_step, n_per_step+n_in_range*n_per_step): +for i in range(int(var_from), int(var_to)): + filename = csv_dataframe["filename"][i] + group = csv_dataframe["group"][i] + kinase_ID = csv_dataframe["kinase_ID"][i] + structure_ID = csv_dataframe["structure_ID"][i] + uniprot = csv_dataframe["uniprot"][i] + klifs_id = csv_dataframe["structure_ID"][i] + print(filename, group, kinase_ID, structure_ID, uniprot) + os.system(f'mkdir ../IFP_datasets/SIFP_v2/{filename[:-5]}') + os.system(f'mkdir ../IFP_datasets/PLEC_v2/{filename[:-5]}') + os.system(f'mkdir ../IFP_datasets/SPLIF_ECFP2_v2/{filename[:-5]}') + os.system(f'mkdir ../IFP_datasets/SPLIF_ECFP4_v2/{filename[:-5]}') + os.system(f'mkdir ../IFP_datasets/SPLIF_ECFP6_v2/{filename[:-5]}') + + try: + protein = '' + try: + os.system(f'obabel -ipdbqt ../../my_rp1_compounds_kinases/selected_mol2structures/{group}_pdbqt/{filename[:-5]}.pdbqt -opdb -O sparse_{filename[:-5]}.pdb') + protein = next(oddt.toolkit.readfile('pdb', f'sparse_{filename[:-5]}.pdb')) + proteinname = f'../../my_rp1_compounds_kinases/selected_mol2structures/{group}_pdbqt/{filename[:-5]}.pdbqt' + print("loaded in structure on 1st try") + except: + print("On except..") + ListFaultyStructures.append("Unsanitized! ",filename) + protein = next(oddt.toolkit.readfile('pdb', f'sparse_{filename[:-5]}.pdb', sanitize=False)) + print("Unsanitized! 
", filename) + print("loaded in protein succesfully!") + protein.protein = True + + rf1 = rfscore.load(version=1) + with open('pickles/rf1.pickle', 'wb') as f: + pickle.dump(rf1, f) + f.close() + rf2 = rfscore.load(version=2) + with open('pickles/rf2.pickle', 'wb') as f: + pickle.dump(rf2, f) + f.close() + rf3 = rfscore.load(version=3) + with open('pickles/rf3.pickle', 'wb') as f: + pickle.dump(rf3, f) + f.close() + nn = nnscore.load() + with open('pickles/nn.pickle', 'wb') as f: + pickle.dump(nn, f) + f.close() + plecscore = PLECscore.load() + with open('pickles/plecscore.pickle', 'wb') as f: + pickle.dump(plecscore, f) + f.close() + + rf1.set_protein(protein) + rf2.set_protein(protein) + rf3.set_protein(protein) + nn.set_protein(protein) + plecscore.set_protein(protein) + + # KLIFS retrieving pocket for SIFP + pocket = get_pocket(klifs_id) + + try: + ligand_location_actives = "../../docking/"+group+"/"+uniprot+"/"+filename[:-5]+"/actives/" + ligand_location_inactives = "../../docking/"+group+"/"+uniprot+"/"+filename[:-5]+"/inactives/" + decoy_location_1000 = "../../docking/"+group+"/"+uniprot+"/"+filename[:-5]+"/decoys/" + decoy_location_1625 = "../../docking/"+group+"/"+uniprot+"/"+filename[:-5]+"/decoys1625/" + run_number = 0 + number_of_actives = len(os.listdir(ligand_location_actives)) + number_of_inactives = len(os.listdir(ligand_location_inactives)) + grouplist = ["AGC","Atypical","CAMK","CK1","CMGC","STE","TK","TKL"] + + for j in sorted(os.listdir(ligand_location_actives))[file_number:]: + print("Actives ", j, number_of_actives) + print(file_number, number_of_actives) + if file_number >= number_of_actives: + pass + else: + try: + file_number += 1 + run_number += 1 + group_number = 0 + ligandname = f'{ligand_location_actives}{j}' + + # ligand = next(oddt.toolkit.readfile('pdbqt', ligand_location_actives+j)) + os.system(f'obabel -ipdbqt {ligand_location_actives}{j} -osdf -O sparse_{filename[:-5]}.sdf') + ligand = 
next(oddt.toolkit.readfile('sdf',f'sparse_{filename[:-5]}.sdf')) + Type = "active" + + # IFP = oddt.fingerprints.InteractionFingerprint(ligand, protein) + # SIFP = oddt.fingerprints.SimpleInteractionFingerprint(ligand,protein) + KLIFS_pocket_IFP = get_pocket_IFP(klifs_id=klifs_id, pocket=pocket, ligand=ligand) + Conc_KLIFS_pocket_IFP = KLIFS_pocket_IFP + KLIFS_pocket_IFP = ''.join(str(list(KLIFS_pocket_IFP))) + + PLEC = oddt.fingerprints.PLEC(ligand, protein, depth_ligand=2, depth_protein=4, distance_cutoff=4.5, size=16384, count_bits=True, sparse=False, ignore_hoh=True) + Conc_PLEC = PLEC + PLEC = ''.join(str(list(PLEC))) + + SPLIF_ECFP2 = oddt.fingerprints.SPLIF(ligand, protein, depth=1, size=4096, distance_cutoff=4.5) + SPLIF_ECFP2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP2['hash'], size=4096) + Conc_SPLIF_ECFP2 = SPLIF_ECFP2 + SPLIF_ECFP2 = ''.join(str(list(SPLIF_ECFP2))) + + SPLIF_ECFP4 = oddt.fingerprints.SPLIF(ligand, protein, depth=2, size=4096, distance_cutoff=4.5) + SPLIF_ECFP4 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP4['hash'], size=4096) + Conc_SPLIF_ECFP4 = SPLIF_ECFP4 + SPLIF_ECFP4 = ''.join(str(list(SPLIF_ECFP4))) + + SPLIF_ECFP6 = oddt.fingerprints.SPLIF(ligand, protein, depth=3, size=4096, distance_cutoff=4.5) + SPLIF_ECFP6 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP6['hash'], size=4096) + Conc_SPLIF_ECFP6 = SPLIF_ECFP6 + SPLIF_ECFP6 = ''.join(str(list(SPLIF_ECFP6))) + + RFv1 = rf1.predict(ligand) + RFv2 = rf2.predict(ligand) + RFv3 = rf3.predict(ligand) + plec_score = plecscore.predict(ligand) + nn_score = nn.predict([ligand]) + VinaScore = ligand.data + + # This part is for concatenating all 5 poses + try: + # ligand = list(oddt.toolkit.readfile('pdbqt', ligand_location_actives+j)) + ligand = list(oddt.toolkit.readfile('sdf', f'sparse_{filename[:-5]}.sdf')) + for l in [x for x in range(len(ligand)) if x != 0]: + KLIFS_pocket_IFP_v2 = get_pocket_IFP(klifs_id=klifs_id, pocket=pocket, ligand=ligand[l]) + Conc_KLIFS_pocket_IFP += 
KLIFS_pocket_IFP_v2 + + PLEC_v2 = oddt.fingerprints.PLEC(ligand[l], protein, depth_ligand=2, depth_protein=4, distance_cutoff=4.5, size=16384, count_bits=True, sparse=False, ignore_hoh=True) + Conc_PLEC += PLEC_v2 + + SPLIF_ECFP2_v2 = oddt.fingerprints.SPLIF(ligand[l], protein, depth=1, size=4096, distance_cutoff=4.5) + SPLIF_ECFP2_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP2_v2['hash'], size=4096) + Conc_SPLIF_ECFP2 += SPLIF_ECFP2_v2 + + SPLIF_ECFP4_v2 = oddt.fingerprints.SPLIF(ligand[l], protein, depth=2, size=4096, distance_cutoff=4.5) + SPLIF_ECFP4_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP4_v2['hash'], size=4096) + Conc_SPLIF_ECFP4 += SPLIF_ECFP4_v2 + + SPLIF_ECFP6_v2 = oddt.fingerprints.SPLIF(ligand[l], protein, depth=3, size=4096, distance_cutoff=4.5) + SPLIF_ECFP6_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP6_v2['hash'], size=4096) + Conc_SPLIF_ECFP6 += SPLIF_ECFP6_v2 + Conc_KLIFS_pocket_IFP = ''.join(str(list(Conc_KLIFS_pocket_IFP))) + Conc_PLEC = ''.join(str(list(Conc_PLEC))) + Conc_SPLIF_ECFP2 = ''.join(str(list(Conc_SPLIF_ECFP2))) + Conc_SPLIF_ECFP4 = ''.join(str(list(Conc_SPLIF_ECFP4))) + Conc_SPLIF_ECFP6 = ''.join(str(list(Conc_SPLIF_ECFP6))) + print("Concatenated poses succesfully! 
Number of poses: "+str(l)) + except: + print("Concat error!") + pass + + #Dense to sparse for storage + # PLEC = PLEC.apply(to_sparse_fp) + # Conc_PLEC = oddt.fingerprints.dense_to_sparse(Conc_PLEC) + # print(len(PLEC)) + # SPLIF_ECFP2 = oddt.fingerprints.dense_to_sparse(SPLIF_ECFP2, size=4096) + # Conc_SPLIF_ECFP2 = oddt.fingerprints.dense_to_sparse(Conc_SPLIF_ECFP2, size=4096) + # SPLIF_ECFP4 = oddt.fingerprints.dense_to_sparse(SPLIF_ECFP4, size=4096) + # Conc_SPLIF_ECFP4 = oddt.fingerprints.dense_to_sparse(Conc_SPLIF_ECFP4, size=4096) + # SPLIF_ECFP6 = oddt.fingerprints.dense_to_sparse(SPLIF_ECFP6, size=4096) + # Conc_SPLIF_ECFP6 = oddt.fingerprints.dense_to_sparse(Conc_SPLIF_ECFP6, size=4096) + + #Find and calculate SMILES + with open(ligand_location_actives+j,'r') as f: + pdbqt_file = f.read() + f.close() + number_from_file = find_number(pdbqt_file, '.smi:') + print(number_from_file) + target_smiles = f'../../my_rp1_compounds_kinases/uniprot_kinase_{Type}s/{group}/{uniprot}_{Type}s.smi' + with open(target_smiles, 'r') as f: + smiles_file = f.readlines() + f.close() + SMILES = str(smiles_file[int(number_from_file[0])-1]).replace("\n","") + print(SMILES) + + for q in range(len(csv_active_compounds)): + if csv_active_compounds["standardised_smiles"][q] == SMILES and csv_active_compounds["accession"][q] == uniprot: + pchembl_mean = csv_active_compounds["pchembl_value_Mean"][q] + pchembl_median = csv_active_compounds["pchembl_value_Median"][q] + break + else: + pchembl_mean = '' + pchembl_median = '' + pass + + # Conc_PLEC,Conc_SPLIF_ECFP2,Conc_SPLIF_ECFP4,Conc_SPLIF_ECFP6,PLEC_v2,SPLIF_ECFP2_v2,SPLIF_ECFP4_v2,SPLIF_ECFP6_v2 ='','','','','','','','' + + count += 1 + 
dataframe_SIFP,dataframe_SIFP["KLIFS_pocket_IFP"][count],dataframe_SIFP["Conc_KLIFS_pocket_IFP"][count],dataframe_SIFP['VinaScore'][count],dataframe_SIFP['Type'][count],dataframe_SIFP["RFv1"][count],dataframe_SIFP["RFv2"][count],dataframe_SIFP["RFv3"][count],dataframe_SIFP["nn_score"][count],dataframe_SIFP["plec_score"][count],dataframe_SIFP["SMILES"][count],dataframe_SIFP["pchembl_value_Mean"][count],dataframe_SIFP["pchembl_value_Median"][count],dataframe_SIFP["protein"][count],dataframe_SIFP["compound"][count] = dataframe_SIFP.append(csv_dataframe.iloc[i],ignore_index=True),KLIFS_pocket_IFP,Conc_KLIFS_pocket_IFP,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname + dataframe_PLEC,dataframe_PLEC["PLEC"][count],dataframe_PLEC["Conc_PLEC"][count],dataframe_PLEC['VinaScore'][count],dataframe_PLEC['Type'][count],dataframe_PLEC["RFv1"][count],dataframe_PLEC["RFv2"][count],dataframe_PLEC["RFv3"][count],dataframe_PLEC["nn_score"][count],dataframe_PLEC["plec_score"][count],dataframe_PLEC["SMILES"][count],dataframe_PLEC["pchembl_value_Mean"][count],dataframe_PLEC["pchembl_value_Median"][count],dataframe_PLEC["protein"][count],dataframe_PLEC["compound"][count] = dataframe_PLEC.append(csv_dataframe.iloc[i],ignore_index=True),PLEC,Conc_PLEC,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname + 
dataframe_SPLIF_ECFP2,dataframe_SPLIF_ECFP2["SPLIF_ECFP2"][count],dataframe_SPLIF_ECFP2["Conc_SPLIF_ECFP2"][count],dataframe_SPLIF_ECFP2['VinaScore'][count],dataframe_SPLIF_ECFP2['Type'][count],dataframe_SPLIF_ECFP2["RFv1"][count],dataframe_SPLIF_ECFP2["RFv2"][count],dataframe_SPLIF_ECFP2["RFv3"][count],dataframe_SPLIF_ECFP2["nn_score"][count],dataframe_SPLIF_ECFP2["plec_score"][count],dataframe_SPLIF_ECFP2["SMILES"][count],dataframe_SPLIF_ECFP2["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP2["pchembl_value_Median"][count],dataframe_SPLIF_ECFP2["protein"][count],dataframe_SPLIF_ECFP2["compound"][count] = dataframe_SPLIF_ECFP2.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP2,Conc_SPLIF_ECFP2,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname + dataframe_SPLIF_ECFP4,dataframe_SPLIF_ECFP4["SPLIF_ECFP4"][count],dataframe_SPLIF_ECFP4["Conc_SPLIF_ECFP4"][count],dataframe_SPLIF_ECFP4['VinaScore'][count],dataframe_SPLIF_ECFP4['Type'][count],dataframe_SPLIF_ECFP4["RFv1"][count],dataframe_SPLIF_ECFP4["RFv2"][count],dataframe_SPLIF_ECFP4["RFv3"][count],dataframe_SPLIF_ECFP4["nn_score"][count],dataframe_SPLIF_ECFP2["plec_score"][count],dataframe_SPLIF_ECFP4["SMILES"][count],dataframe_SPLIF_ECFP4["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP4["pchembl_value_Median"][count],dataframe_SPLIF_ECFP4["protein"][count],dataframe_SPLIF_ECFP4["compound"][count] = dataframe_SPLIF_ECFP4.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP4,Conc_SPLIF_ECFP4,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname + 
dataframe_SPLIF_ECFP6,dataframe_SPLIF_ECFP6["SPLIF_ECFP6"][count],dataframe_SPLIF_ECFP6["Conc_SPLIF_ECFP6"][count],dataframe_SPLIF_ECFP6['VinaScore'][count],dataframe_SPLIF_ECFP6['Type'][count],dataframe_SPLIF_ECFP6["RFv1"][count],dataframe_SPLIF_ECFP6["RFv2"][count],dataframe_SPLIF_ECFP6["RFv3"][count],dataframe_SPLIF_ECFP6["nn_score"][count],dataframe_SPLIF_ECFP2["plec_score"][count],dataframe_SPLIF_ECFP6["SMILES"][count],dataframe_SPLIF_ECFP6["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP6["pchembl_value_Median"][count],dataframe_SPLIF_ECFP6["protein"][count],dataframe_SPLIF_ECFP6["compound"][count] = dataframe_SPLIF_ECFP6.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP6,Conc_SPLIF_ECFP6,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname + print("Done appending to dataframe! Number appended", count, int(run_number)) + + if int(run_number) > 375: + print("Going dark to preserve memory..") + with open(f'config_{var_from}_{var_to}.txt', 'w') as f: + to_write = str(file_number)+"\n"+str(count)+"\n"+str(group_number)+"\n"+str(var_from)+"\n"+str(var_to) + print(to_write) + f.write(to_write) + f.close() + dataframe_SIFP.to_csv(f'../IFP_datasets/SIFP_v2/{filename[:-5]}/dataframe_SIFP_dense_{filename[:-5]}_{file_number}.csv') + dataframe_PLEC.to_csv(f'../IFP_datasets/PLEC_v2/{filename[:-5]}/dataframe_PLEC_dense_{filename[:-5]}_{file_number}.csv') + dataframe_SPLIF_ECFP2.to_csv(f'../IFP_datasets/SPLIF_ECFP2_v2/{filename[:-5]}/dataframe_SPLIF_ECFP2_dense_{filename[:-5]}_{file_number}.csv') + dataframe_SPLIF_ECFP4.to_csv(f'../IFP_datasets/SPLIF_ECFP4_v2/{filename[:-5]}/dataframe_SPLIF_ECFP4_dense_{filename[:-5]}_{file_number}.csv') + dataframe_SPLIF_ECFP6.to_csv(f'../IFP_datasets/SPLIF_ECFP6_v2/{filename[:-5]}/dataframe_SPLIF_ECFP6_dense_{filename[:-5]}_{file_number}.csv') + os.execl(sys.executable, 'python', f'./create_IFP_datasets_v2_dense.py', str(var_from), str(var_to)) + else: + 
pass + gc.collect() + + # del(ligand,SMILES,RFv1,RFv2,RFv3,PLEC,SPLIF_ECFP2,SPLIF_ECFP4,SPLIF_ECFP6,Conc_PLEC,Conc_SPLIF_ECFP2,Conc_SPLIF_ECFP4,Conc_SPLIF_ECFP6,VinaScore,PLEC_v2,SPLIF_ECFP2_v2,SPLIF_ECFP4_v2,SPLIF_ECFP6_v2) + + # Other descriptors I could use later on? + # protein_atoms, ligand_atoms, strict = oddt.interactions.hbonds(protein, ligand) + # print(protein_atoms['resname']) + # wut = oddt.interactions.close_contacts(protein_atoms, ligand_atoms, cutoff=4, x_column='coords', y_column='coords') + except: + print("Actives. Errors for some reason..") + print("Going to inactives.. ") + for k in sorted(os.listdir(ligand_location_inactives))[(file_number-number_of_actives):]: + print("Inactives ", k, number_of_inactives) + if int(file_number) >= (int(number_of_actives)+int(number_of_inactives)): + print("passing! len actives + actives is: ", str(number_of_actives), str(number_of_inactives)) + pass + else: + try: + file_number += 1 + run_number += 1 + group_number = 0 + ligandname = f'{ligand_location_inactives}{k}' + + ligand = '' + Type = "inactive" + # ligand = next(oddt.toolkit.readfile('pdbqt', ligand_location_inactives+k)) + os.system(f'obabel -ipdbqt {ligand_location_inactives}{k} -osdf -O sparse_{filename[:-5]}.sdf') + ligand = next(oddt.toolkit.readfile('sdf',f'sparse_{filename[:-5]}.sdf')) + + KLIFS_pocket_IFP = get_pocket_IFP(klifs_id=klifs_id, pocket=pocket, ligand=ligand) + Conc_KLIFS_pocket_IFP = KLIFS_pocket_IFP + KLIFS_pocket_IFP = ''.join(str(list(KLIFS_pocket_IFP))) + + PLEC = oddt.fingerprints.PLEC(ligand, protein, depth_ligand=2, depth_protein=4, distance_cutoff=4.5, size=16384, count_bits=True, sparse=False, ignore_hoh=True) + Conc_PLEC = PLEC + PLEC = ''.join(str(list(PLEC))) + + SPLIF_ECFP2 = oddt.fingerprints.SPLIF(ligand, protein, depth=1, size=4096, distance_cutoff=4.5) + SPLIF_ECFP2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP2['hash'], size=4096) + Conc_SPLIF_ECFP2 = SPLIF_ECFP2 + SPLIF_ECFP2 = ''.join(str(list(SPLIF_ECFP2))) + + 
SPLIF_ECFP4 = oddt.fingerprints.SPLIF(ligand, protein, depth=2, size=4096, distance_cutoff=4.5) + SPLIF_ECFP4 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP4['hash'], size=4096) + Conc_SPLIF_ECFP4 = SPLIF_ECFP4 + SPLIF_ECFP4 = ''.join(str(list(SPLIF_ECFP4))) + + SPLIF_ECFP6 = oddt.fingerprints.SPLIF(ligand, protein, depth=3, size=4096, distance_cutoff=4.5) + SPLIF_ECFP6 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP6['hash'], size=4096) + Conc_SPLIF_ECFP6 = SPLIF_ECFP6 + SPLIF_ECFP6 = ''.join(str(list(SPLIF_ECFP6))) + + RFv1 = rf1.predict(ligand) + RFv2 = rf2.predict(ligand) + RFv3 = rf3.predict(ligand) + plec_score = plecscore.predict(ligand) + nn_score = nn.predict([ligand]) + VinaScore = ligand.data + + # This part is for concatenating all 5 poses + try: + # ligand = list(oddt.toolkit.readfile('pdbqt', ligand_location_inactives+k)) + ligand = list(oddt.toolkit.readfile('sdf',f'sparse_{filename[:-5]}.sdf')) + for m in [x for x in range(len(ligand)) if x != 0]: + KLIFS_pocket_IFP_v2 = get_pocket_IFP(klifs_id=klifs_id, pocket=pocket, ligand=ligand[m]) + Conc_KLIFS_pocket_IFP += KLIFS_pocket_IFP_v2 + + PLEC_v2 = oddt.fingerprints.PLEC(ligand[m], protein, depth_ligand=2, depth_protein=4, distance_cutoff=4.5, size=16384, count_bits=True, sparse=False, ignore_hoh=True) + Conc_PLEC += PLEC_v2 + + SPLIF_ECFP2_v2 = oddt.fingerprints.SPLIF(ligand[m], protein, depth=1, size=4096, distance_cutoff=4.5) + SPLIF_ECFP2_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP2_v2['hash'], size=4096) + Conc_SPLIF_ECFP2 += SPLIF_ECFP2_v2 + + SPLIF_ECFP4_v2 = oddt.fingerprints.SPLIF(ligand[m], protein, depth=2, size=4096, distance_cutoff=4.5) + SPLIF_ECFP4_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP4_v2['hash'], size=4096) + Conc_SPLIF_ECFP4 += SPLIF_ECFP4_v2 + + SPLIF_ECFP6_v2 = oddt.fingerprints.SPLIF(ligand[m], protein, depth=3, size=4096, distance_cutoff=4.5) + SPLIF_ECFP6_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP6_v2['hash'], size=4096) + Conc_SPLIF_ECFP6 += 
SPLIF_ECFP6_v2 + Conc_KLIFS_pocket_IFP = ''.join(str(list(Conc_KLIFS_pocket_IFP))) + Conc_PLEC = ''.join(str(list(Conc_PLEC))) + Conc_SPLIF_ECFP2 = ''.join(str(list(Conc_SPLIF_ECFP2))) + Conc_SPLIF_ECFP4 = ''.join(str(list(Conc_SPLIF_ECFP4))) + Conc_SPLIF_ECFP6 = ''.join(str(list(Conc_SPLIF_ECFP6))) + print("Concatenated poses succesfully! Number of poses: "+str(m)) + except: + print("Inactives. This file might not have 5 poses?") + + #Find and calculate SMILES + with open(ligand_location_inactives+k,'r') as f: + pdbqt_file = f.read() + f.close() + number_from_file = find_number(pdbqt_file, '.smi:') + print(number_from_file) + target_smiles = f'../../my_rp1_compounds_kinases/uniprot_kinase_{Type}s/{group}/{uniprot}_{Type}s.smi' + with open(target_smiles, 'r') as f: + smiles_file = f.readlines() + f.close() + SMILES = str(smiles_file[int(number_from_file[0])-1]).replace("\n","") + print(SMILES) + + for r in range(len(csv_inactive_compounds)): + if csv_inactive_compounds["standardised_smiles"][r] == SMILES and csv_inactive_compounds["accession"][r] == uniprot: + pchembl_mean = csv_inactive_compounds["pchembl_value_Mean"][r] + pchembl_median = csv_inactive_compounds["pchembl_value_Median"][r] + break + else: + pchembl_mean = '' + pchembl_median = '' + pass + + count += 1 + dataframe_SIFP,dataframe_SIFP["KLIFS_pocket_IFP"][count],dataframe_SIFP["Conc_KLIFS_pocket_IFP"][count],dataframe_SIFP['VinaScore'][count],dataframe_SIFP['Type'][count],dataframe_SIFP["RFv1"][count],dataframe_SIFP["RFv2"][count],dataframe_SIFP["RFv3"][count],dataframe_SIFP["nn_score"][count],dataframe_SIFP["plec_score"][count],dataframe_SIFP["SMILES"][count],dataframe_SIFP["pchembl_value_Mean"][count],dataframe_SIFP["pchembl_value_Median"][count],dataframe_SIFP["protein"][count],dataframe_SIFP["compound"][count] = 
dataframe_SIFP.append(csv_dataframe.iloc[i],ignore_index=True),KLIFS_pocket_IFP,Conc_KLIFS_pocket_IFP,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname + dataframe_PLEC,dataframe_PLEC["PLEC"][count],dataframe_PLEC["Conc_PLEC"][count],dataframe_PLEC['VinaScore'][count],dataframe_PLEC['Type'][count],dataframe_PLEC["RFv1"][count],dataframe_PLEC["RFv2"][count],dataframe_PLEC["RFv3"][count],dataframe_PLEC["nn_score"][count],dataframe_PLEC["plec_score"][count],dataframe_PLEC["SMILES"][count],dataframe_PLEC["pchembl_value_Mean"][count],dataframe_PLEC["pchembl_value_Median"][count],dataframe_PLEC["protein"][count],dataframe_PLEC["compound"][count] = dataframe_PLEC.append(csv_dataframe.iloc[i],ignore_index=True),PLEC,Conc_PLEC,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname + dataframe_SPLIF_ECFP2,dataframe_SPLIF_ECFP2["SPLIF_ECFP2"][count],dataframe_SPLIF_ECFP2["Conc_SPLIF_ECFP2"][count],dataframe_SPLIF_ECFP2['VinaScore'][count],dataframe_SPLIF_ECFP2['Type'][count],dataframe_SPLIF_ECFP2["RFv1"][count],dataframe_SPLIF_ECFP2["RFv2"][count],dataframe_SPLIF_ECFP2["RFv3"][count],dataframe_SPLIF_ECFP2["nn_score"][count],dataframe_SPLIF_ECFP2["plec_score"][count],dataframe_SPLIF_ECFP2["SMILES"][count],dataframe_SPLIF_ECFP2["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP2["pchembl_value_Median"][count],dataframe_SPLIF_ECFP2["protein"][count],dataframe_SPLIF_ECFP2["compound"][count] = dataframe_SPLIF_ECFP2.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP2,Conc_SPLIF_ECFP2,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname + 
dataframe_SPLIF_ECFP4,dataframe_SPLIF_ECFP4["SPLIF_ECFP4"][count],dataframe_SPLIF_ECFP4["Conc_SPLIF_ECFP4"][count],dataframe_SPLIF_ECFP4['VinaScore'][count],dataframe_SPLIF_ECFP4['Type'][count],dataframe_SPLIF_ECFP4["RFv1"][count],dataframe_SPLIF_ECFP4["RFv2"][count],dataframe_SPLIF_ECFP4["RFv3"][count],dataframe_SPLIF_ECFP4["nn_score"][count],dataframe_SPLIF_ECFP2["plec_score"][count],dataframe_SPLIF_ECFP4["SMILES"][count],dataframe_SPLIF_ECFP4["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP4["pchembl_value_Median"][count],dataframe_SPLIF_ECFP4["protein"][count],dataframe_SPLIF_ECFP4["compound"][count] = dataframe_SPLIF_ECFP4.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP4,Conc_SPLIF_ECFP4,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname + dataframe_SPLIF_ECFP6,dataframe_SPLIF_ECFP6["SPLIF_ECFP6"][count],dataframe_SPLIF_ECFP6["Conc_SPLIF_ECFP6"][count],dataframe_SPLIF_ECFP6['VinaScore'][count],dataframe_SPLIF_ECFP6['Type'][count],dataframe_SPLIF_ECFP6["RFv1"][count],dataframe_SPLIF_ECFP6["RFv2"][count],dataframe_SPLIF_ECFP6["RFv3"][count],dataframe_SPLIF_ECFP6["nn_score"][count],dataframe_SPLIF_ECFP2["plec_score"][count],dataframe_SPLIF_ECFP6["SMILES"][count],dataframe_SPLIF_ECFP6["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP6["pchembl_value_Median"][count],dataframe_SPLIF_ECFP6["protein"][count],dataframe_SPLIF_ECFP6["compound"][count] = dataframe_SPLIF_ECFP6.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP6,Conc_SPLIF_ECFP6,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname + print("Done appending to dataframe! 
Number appended", count, int(run_number)) + + gc.collect() + if int(run_number) > 375: + print("Going dark to preserve memory..") + with open(f'config_{var_from}_{var_to}.txt', 'w') as f: + to_write = str(file_number)+"\n"+str(count)+"\n"+str(group_number)+"\n"+str(var_from)+"\n"+str(var_to) + print(to_write) + f.write(to_write) + f.close() + dataframe_SIFP.to_csv(f'../IFP_datasets/SIFP_v2/{filename[:-5]}/dataframe_SIFP_dense_{filename[:-5]}_{file_number}.csv') + dataframe_PLEC.to_csv(f'../IFP_datasets/PLEC_v2/{filename[:-5]}/dataframe_PLEC_dense_{filename[:-5]}_{file_number}.csv') + dataframe_SPLIF_ECFP2.to_csv(f'../IFP_datasets/SPLIF_ECFP2_v2/{filename[:-5]}/dataframe_SPLIF_ECFP2_dense_{filename[:-5]}_{file_number}.csv') + dataframe_SPLIF_ECFP4.to_csv(f'../IFP_datasets/SPLIF_ECFP4_v2/{filename[:-5]}/dataframe_SPLIF_ECFP4_dense_{filename[:-5]}_{file_number}.csv') + dataframe_SPLIF_ECFP6.to_csv(f'../IFP_datasets/SPLIF_ECFP6_v2/{filename[:-5]}/dataframe_SPLIF_ECFP6_dense_{filename[:-5]}_{file_number}.csv') + os.execl(sys.executable, 'python', f'./create_IFP_datasets_v2_dense.py', str(var_from), str(var_to)) + else: + pass + + except: + print("No pdbqt files available maybe?") + + if int(count) == -1: + pass + else: + print("Going dark for the decoys..") + with open(f'config_{var_from}_{var_to}.txt', 'w') as f: + to_write = str(file_number)+"\n"+str(count)+"\n"+str(group_number)+"\n"+str(var_from)+"\n"+str(var_to) + print(to_write) + f.write(to_write) + f.close() + dataframe_SIFP.to_csv(f'../IFP_datasets/SIFP_v2/{filename[:-5]}/dataframe_SIFP_dense_{filename[:-5]}_{file_number}.csv') + dataframe_PLEC.to_csv(f'../IFP_datasets/PLEC_v2/{filename[:-5]}/dataframe_PLEC_dense_{filename[:-5]}_{file_number}.csv') + dataframe_SPLIF_ECFP2.to_csv(f'../IFP_datasets/SPLIF_ECFP2_v2/{filename[:-5]}/dataframe_SPLIF_ECFP2_dense_{filename[:-5]}_{file_number}.csv') + 
dataframe_SPLIF_ECFP4.to_csv(f'../IFP_datasets/SPLIF_ECFP4_v2/{filename[:-5]}/dataframe_SPLIF_ECFP4_dense_{filename[:-5]}_{file_number}.csv') + dataframe_SPLIF_ECFP6.to_csv(f'../IFP_datasets/SPLIF_ECFP6_v2/{filename[:-5]}/dataframe_SPLIF_ECFP6_dense_{filename[:-5]}_{file_number}.csv') + os.execl(sys.executable, 'python', f'./create_IFP_datasets_v2_dense.py', str(var_from), str(var_to)) + + # Decoys for loop here + decoy_number = 0 + for n in range(group_number, len(grouplist)): + print("On decoys!") + + if int(decoy_number) >= 125: + print("Going dark to preserve memory..") + if int(group_number) > 8: + next_from = int(var_from) + 1 + if next_from == var_to: + print("DONE WITH RUN") + sys.exit() + os.execl(sys.executable, 'python', f'./create_IFP_datasets_v2_dense.py', str("blub")) + file_number, count, group_number = 0,-1,0 + with open(f'config_{next_from}_{var_to}.txt', 'w') as f: + to_write = str(file_number)+"\n"+str(count)+"\n"+str(group_number)+"\n"+str(next_from)+"\n"+str(var_to) + print(to_write) + f.write(to_write) + f.close() + dataframe_SIFP.to_csv(f'../IFP_datasets/PLEC_v2/{filename[:-5]}/dataframe_PLEC_dense_{filename[:-5]}_{file_number}.csv') + dataframe_PLEC.to_csv(f'../IFP_datasets/PLEC_v2/{filename[:-5]}/dataframe_PLEC_dense_{filename[:-5]}.csv') + dataframe_SPLIF_ECFP2.to_csv(f'../IFP_datasets/SPLIF_ECFP2_v2/{filename[:-5]}/dataframe_SPLIF_ECFP2_dense_{filename[:-5]}_{file_number}.csv') + dataframe_SPLIF_ECFP4.to_csv(f'../IFP_datasets/SPLIF_ECFP4_v2/{filename[:-5]}/dataframe_SPLIF_ECFP4_dense_{filename[:-5]}_{file_number}.csv') + dataframe_SPLIF_ECFP6.to_csv(f'../IFP_datasets/SPLIF_ECFP6_v2/{filename[:-5]}/dataframe_SPLIF_ECFP6_dense_{filename[:-5]}_{file_number}.csv') + os.execl(sys.executable, 'python', f'./create_IFP_datasets_v2_dense.py', str(var_from), str(var_to)) + with open(f'config_{var_from}_{var_to}.txt', 'w') as f: + to_write = str(file_number)+"\n"+str(count)+"\n"+str(group_number)+"\n"+str(var_from)+"\n"+str(var_to) + print(to_write) 
+ f.write(to_write) + f.close() + dataframe_SIFP.to_csv(f'../IFP_datasets/PLEC_v2/{filename[:-5]}/dataframe_PLEC_dense_{filename[:-5]}_{file_number}.csv') + dataframe_PLEC.to_csv(f'../IFP_datasets/PLEC_v2/{filename[:-5]}/dataframe_PLEC_dense_{filename[:-5]}.csv') + dataframe_SPLIF_ECFP2.to_csv(f'../IFP_datasets/SPLIF_ECFP2_v2/{filename[:-5]}/dataframe_SPLIF_ECFP2_dense_{filename[:-5]}_{file_number}.csv') + dataframe_SPLIF_ECFP4.to_csv(f'../IFP_datasets/SPLIF_ECFP4_v2/{filename[:-5]}/dataframe_SPLIF_ECFP4_dense_{filename[:-5]}_{file_number}.csv') + dataframe_SPLIF_ECFP6.to_csv(f'../IFP_datasets/SPLIF_ECFP6_v2/{filename[:-5]}/dataframe_SPLIF_ECFP6_dense_{filename[:-5]}_{file_number}.csv') + os.execl(sys.executable, 'python', f'./create_IFP_datasets_v2_dense.py', str(var_from), str(var_to)) + else: + group_number += 1 + pass + + for o in sorted(os.listdir(decoy_location_1000+grouplist[n])): + print("Decoys. Decoynumber - group_number", o, group_number) + try: + file_number += 1 + run_number += 1 + decoy_number += 1 + ligandname = f'{decoy_location_1000}{grouplist[n]}/{o}' + decoy_group = str(grouplist[n]) + + ligand = '' + Type = "decoy" + # ligand = next(oddt.toolkit.readfile('pdbqt', decoy_location_1000+grouplist[n]+"/"+o)) + os.system(f'obabel -ipdbqt {decoy_location_1000}{grouplist[n]}/{o} -osdf -O sparse_{filename[:-5]}.sdf') + ligand = next(oddt.toolkit.readfile('sdf', f'sparse_{filename[:-5]}.sdf')) + + KLIFS_pocket_IFP = get_pocket_IFP(klifs_id=klifs_id, pocket=pocket, ligand=ligand) + Conc_KLIFS_pocket_IFP = KLIFS_pocket_IFP + KLIFS_pocket_IFP = ''.join(str(list(KLIFS_pocket_IFP))) + + PLEC = oddt.fingerprints.PLEC(ligand, protein, depth_ligand=2, depth_protein=4, distance_cutoff=4.5, size=16384, count_bits=True, sparse=False, ignore_hoh=True) + Conc_PLEC = PLEC + PLEC = ''.join(str(list(PLEC))) + + SPLIF_ECFP2 = oddt.fingerprints.SPLIF(ligand, protein, depth=1, size=4096, distance_cutoff=4.5) + SPLIF_ECFP2 = 
oddt.fingerprints.sparse_to_dense(SPLIF_ECFP2['hash'], size=4096) + Conc_SPLIF_ECFP2 = SPLIF_ECFP2 + SPLIF_ECFP2 = ''.join(str(list(SPLIF_ECFP2))) + + SPLIF_ECFP4 = oddt.fingerprints.SPLIF(ligand, protein, depth=2, size=4096, distance_cutoff=4.5) + SPLIF_ECFP4 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP4['hash'], size=4096) + Conc_SPLIF_ECFP4 = SPLIF_ECFP4 + SPLIF_ECFP4 = ''.join(str(list(SPLIF_ECFP4))) + + SPLIF_ECFP6 = oddt.fingerprints.SPLIF(ligand, protein, depth=3, size=4096, distance_cutoff=4.5) + SPLIF_ECFP6 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP6['hash'], size=4096) + Conc_SPLIF_ECFP6 = SPLIF_ECFP6 + SPLIF_ECFP6 = ''.join(str(list(SPLIF_ECFP6))) + + RFv1 = rf1.predict(ligand) + RFv2 = rf2.predict(ligand) + RFv3 = rf3.predict(ligand) + plec_score = plecscore.predict(ligand) + nn_score = nn.predict([ligand]) + VinaScore = ligand.data + + # This part is for concatenating all 5 poses + try: + # ligand = list(oddt.toolkit.readfile('pdbqt', decoy_location_1000+grouplist[n]+"/"+o)) + ligand = list(oddt.toolkit.readfile('sdf', f'sparse_{filename[:-5]}.sdf')) + for p in [x for x in range(len(ligand)) if x != 0]: + KLIFS_pocket_IFP_v2 = get_pocket_IFP(klifs_id=klifs_id, pocket=pocket, ligand=ligand[p]) + Conc_KLIFS_pocket_IFP += KLIFS_pocket_IFP_v2 + + PLEC_v2 = oddt.fingerprints.PLEC(ligand[p], protein, depth_ligand=2, depth_protein=4, distance_cutoff=4.5, size=16384, count_bits=True, sparse=False, ignore_hoh=True) + Conc_PLEC += PLEC_v2 + + SPLIF_ECFP2_v2 = oddt.fingerprints.SPLIF(ligand[p], protein, depth=1, size=4096, distance_cutoff=4.5) + SPLIF_ECFP2_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP2_v2['hash'], size=4096) + Conc_SPLIF_ECFP2 += SPLIF_ECFP2_v2 + + SPLIF_ECFP4_v2 = oddt.fingerprints.SPLIF(ligand[p], protein, depth=2, size=4096, distance_cutoff=4.5) + SPLIF_ECFP4_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP4_v2['hash'], size=4096) + Conc_SPLIF_ECFP4 += SPLIF_ECFP4_v2 + + SPLIF_ECFP6_v2 = oddt.fingerprints.SPLIF(ligand[p], protein, 
depth=3, size=4096, distance_cutoff=4.5) + SPLIF_ECFP6_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP6_v2['hash'], size=4096) + Conc_SPLIF_ECFP6 += SPLIF_ECFP6_v2 + Conc_KLIFS_pocket_IFP = ''.join(str(list(Conc_KLIFS_pocket_IFP))) + Conc_PLEC = ''.join(str(list(Conc_PLEC))) + Conc_SPLIF_ECFP2 = ''.join(str(list(Conc_SPLIF_ECFP2))) + Conc_SPLIF_ECFP4 = ''.join(str(list(Conc_SPLIF_ECFP4))) + Conc_SPLIF_ECFP6 = ''.join(str(list(Conc_SPLIF_ECFP6))) + print("Concatenated poses succesfully! Number of poses: "+str(p)) + + except: + print("Decoys. This file might not have any poses?") + + #Dense to sparse for storage + # PLEC = oddt.fingerprints.dense_to_sparse(PLEC, size=16384) + # Conc_PLEC = oddt.fingerprints.dense_to_sparse(Conc_PLEC, size=16384) + # SPLIF_ECFP2 = oddt.fingerprints.dense_to_sparse(SPLIF_ECFP2, size=4096) + # Conc_SPLIF_ECFP2 = oddt.fingerprints.dense_to_sparse(Conc_SPLIF_ECFP2, size=4096) + # SPLIF_ECFP4 = oddt.fingerprints.dense_to_sparse(SPLIF_ECFP4, size=4096) + # Conc_SPLIF_ECFP4 = oddt.fingerprints.dense_to_sparse(Conc_SPLIF_ECFP4, size=4096) + # SPLIF_ECFP6 = oddt.fingerprints.dense_to_sparse(SPLIF_ECFP6, size=4096) + # Conc_SPLIF_ECFP6 = oddt.fingerprints.dense_to_sparse(Conc_SPLIF_ECFP6, size=4096) + + # This doesn't (yet) work for the decoys! 
+ # with open(decoy_location_1000+grouplist[n]+j,'r') as f: + # pdbqt_file = f.read() + # f.close() + # number_from_file = find_number(pdbqt_file, '.smi:') + # target_smiles = f'../my_rp1_compounds_kinases/{Type}s/1000/{group}_125.smi' + # with open(target_smiles, 'r') as f: + # smiles_file = f.readlines() + # f.close() + # SMILES = str(smiles_file[int(number_from_file[0])]).replace("\n","") + SMILES,pchembl_mean,pchembl_median = '','','' + print("On dataframe!") + + count += 1 + dataframe_SIFP,dataframe_SIFP["KLIFS_pocket_IFP"][count],dataframe_SIFP["Conc_KLIFS_pocket_IFP"][count],dataframe_SIFP['VinaScore'][count],dataframe_SIFP['Type'][count],dataframe_SIFP["RFv1"][count],dataframe_SIFP["RFv2"][count],dataframe_SIFP["RFv3"][count],dataframe_SIFP["nn_score"][count],dataframe_SIFP["plec_score"][count],dataframe_SIFP["SMILES"][count],dataframe_SIFP["pchembl_value_Mean"][count],dataframe_SIFP["pchembl_value_Median"][count],dataframe_SIFP["protein"][count],dataframe_SIFP["compound"][count],dataframe_SIFP["decoy_group"][count] = dataframe_SIFP.append(csv_dataframe.iloc[i],ignore_index=True),KLIFS_pocket_IFP,Conc_KLIFS_pocket_IFP,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname,decoy_group + dataframe_PLEC,dataframe_PLEC["PLEC"][count],dataframe_PLEC["Conc_PLEC"][count],dataframe_PLEC['VinaScore'][count],dataframe_PLEC['Type'][count],dataframe_PLEC["RFv1"][count],dataframe_PLEC["RFv2"][count],dataframe_PLEC["RFv3"][count],dataframe_PLEC["nn_score"][count],dataframe_PLEC["plec_score"][count],dataframe_PLEC["SMILES"][count],dataframe_PLEC["pchembl_value_Mean"][count],dataframe_PLEC["pchembl_value_Median"][count],dataframe_PLEC["protein"][count],dataframe_PLEC["compound"][count],dataframe_PLEC["decoy_group"][count] = 
dataframe_PLEC.append(csv_dataframe.iloc[i],ignore_index=True),PLEC,Conc_PLEC,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname,decoy_group + dataframe_SPLIF_ECFP2,dataframe_SPLIF_ECFP2["SPLIF_ECFP2"][count],dataframe_SPLIF_ECFP2["Conc_SPLIF_ECFP2"][count],dataframe_SPLIF_ECFP2['VinaScore'][count],dataframe_SPLIF_ECFP2['Type'][count],dataframe_SPLIF_ECFP2["RFv1"][count],dataframe_SPLIF_ECFP2["RFv2"][count],dataframe_SPLIF_ECFP2["RFv3"][count],dataframe_SPLIF_ECFP2["nn_score"][count],dataframe_SPLIF_ECFP2["plec_score"][count],dataframe_SPLIF_ECFP2["SMILES"][count],dataframe_SPLIF_ECFP2["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP2["pchembl_value_Median"][count],dataframe_SPLIF_ECFP2["protein"][count],dataframe_SPLIF_ECFP2["compound"][count],dataframe_SPLIF_ECFP2["decoy_group"][count] = dataframe_SPLIF_ECFP2.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP2,Conc_SPLIF_ECFP2,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname,decoy_group + dataframe_SPLIF_ECFP4,dataframe_SPLIF_ECFP4["SPLIF_ECFP4"][count],dataframe_SPLIF_ECFP4["Conc_SPLIF_ECFP4"][count],dataframe_SPLIF_ECFP4['VinaScore'][count],dataframe_SPLIF_ECFP4['Type'][count],dataframe_SPLIF_ECFP4["RFv1"][count],dataframe_SPLIF_ECFP4["RFv2"][count],dataframe_SPLIF_ECFP4["RFv3"][count],dataframe_SPLIF_ECFP4["nn_score"][count],dataframe_SPLIF_ECFP2["plec_score"][count],dataframe_SPLIF_ECFP4["SMILES"][count],dataframe_SPLIF_ECFP4["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP4["pchembl_value_Median"][count],dataframe_SPLIF_ECFP4["protein"][count],dataframe_SPLIF_ECFP4["compound"][count],dataframe_SPLIF_ECFP2["decoy_group"][count] = 
dataframe_SPLIF_ECFP4.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP4,Conc_SPLIF_ECFP4,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname,decoy_group + dataframe_SPLIF_ECFP6,dataframe_SPLIF_ECFP6["SPLIF_ECFP6"][count],dataframe_SPLIF_ECFP6["Conc_SPLIF_ECFP6"][count],dataframe_SPLIF_ECFP6['VinaScore'][count],dataframe_SPLIF_ECFP6['Type'][count],dataframe_SPLIF_ECFP6["RFv1"][count],dataframe_SPLIF_ECFP6["RFv2"][count],dataframe_SPLIF_ECFP6["RFv3"][count],dataframe_SPLIF_ECFP6["nn_score"][count],dataframe_SPLIF_ECFP2["plec_score"][count],dataframe_SPLIF_ECFP6["SMILES"][count],dataframe_SPLIF_ECFP6["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP6["pchembl_value_Median"][count],dataframe_SPLIF_ECFP6["protein"][count],dataframe_SPLIF_ECFP6["compound"][count],dataframe_SPLIF_ECFP2["decoy_group"][count] = dataframe_SPLIF_ECFP6.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP6,Conc_SPLIF_ECFP6,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname,decoy_group + print("Done appending to dataframe! Number appended", count, int(run_number)) + + gc.collect() + except: + print("No pdbqt files available maybe?") + + # Decoys of 1625, IF available + try: + for o in sorted(os.listdir(decoy_location_1625+grouplist[n])): + print("Decoys. 
Decoynumber - group_number", o, group_number) + try: + # file_number += 1 + # run_number += 1 + # decoy_number += 1 + ligandname = f'{decoy_location_1625}{grouplist[n]}/{o}' + decoy_group = str(grouplist[n]) + + ligand = '' + Type = "decoy_v2" + os.system(f'obabel -ipdbqt {decoy_location_1625}{grouplist[n]}/{o} -osdf -O sparse_{filename[:-5]}.sdf') + ligand = next(oddt.toolkit.readfile('sdf', f'sparse_{filename[:-5]}.sdf')) + + KLIFS_pocket_IFP = get_pocket_IFP(klifs_id=klifs_id, pocket=pocket, ligand=ligand) + Conc_KLIFS_pocket_IFP = KLIFS_pocket_IFP + KLIFS_pocket_IFP = ''.join(str(list(KLIFS_pocket_IFP))) + + PLEC = oddt.fingerprints.PLEC(ligand, protein, depth_ligand=2, depth_protein=4, distance_cutoff=4.5, size=16384, count_bits=True, sparse=False, ignore_hoh=True) + Conc_PLEC = PLEC + PLEC = ''.join(str(list(PLEC))) + + SPLIF_ECFP2 = oddt.fingerprints.SPLIF(ligand, protein, depth=1, size=4096, distance_cutoff=4.5) + SPLIF_ECFP2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP2['hash'], size=4096) + Conc_SPLIF_ECFP2 = SPLIF_ECFP2 + SPLIF_ECFP2 = ''.join(str(list(SPLIF_ECFP2))) + + SPLIF_ECFP4 = oddt.fingerprints.SPLIF(ligand, protein, depth=2, size=4096, distance_cutoff=4.5) + SPLIF_ECFP4 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP4['hash'], size=4096) + Conc_SPLIF_ECFP4 = SPLIF_ECFP4 + SPLIF_ECFP4 = ''.join(str(list(SPLIF_ECFP4))) + + SPLIF_ECFP6 = oddt.fingerprints.SPLIF(ligand, protein, depth=3, size=4096, distance_cutoff=4.5) + SPLIF_ECFP6 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP6['hash'], size=4096) + Conc_SPLIF_ECFP6 = SPLIF_ECFP6 + SPLIF_ECFP6 = ''.join(str(list(SPLIF_ECFP6))) + + RFv1 = rf1.predict(ligand) + RFv2 = rf2.predict(ligand) + RFv3 = rf3.predict(ligand) + plec_score = plecscore.predict(ligand) + nn_score = nn.predict([ligand]) + VinaScore = ligand.data + + # This part is for concatenating all 5 poses + try: + # ligand = list(oddt.toolkit.readfile('pdbqt', decoy_location_1000+grouplist[n]+"/"+o)) + ligand = 
list(oddt.toolkit.readfile('sdf', f'sparse_{filename[:-5]}.sdf')) + for p in [x for x in range(len(ligand)) if x != 0]: + KLIFS_pocket_IFP_v2 = get_pocket_IFP(klifs_id=klifs_id, pocket=pocket, ligand=ligand[p]) + Conc_KLIFS_pocket_IFP += KLIFS_pocket_IFP_v2 + + PLEC_v2 = oddt.fingerprints.PLEC(ligand[p], protein, depth_ligand=2, depth_protein=4, distance_cutoff=4.5, size=16384, count_bits=True, sparse=False, ignore_hoh=True) + Conc_PLEC += PLEC_v2 + + SPLIF_ECFP2_v2 = oddt.fingerprints.SPLIF(ligand[p], protein, depth=1, size=4096, distance_cutoff=4.5) + SPLIF_ECFP2_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP2_v2['hash'], size=4096) + Conc_SPLIF_ECFP2 += SPLIF_ECFP2_v2 + + SPLIF_ECFP4_v2 = oddt.fingerprints.SPLIF(ligand[p], protein, depth=2, size=4096, distance_cutoff=4.5) + SPLIF_ECFP4_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP4_v2['hash'], size=4096) + Conc_SPLIF_ECFP4 += SPLIF_ECFP4_v2 + + SPLIF_ECFP6_v2 = oddt.fingerprints.SPLIF(ligand[p], protein, depth=3, size=4096, distance_cutoff=4.5) + SPLIF_ECFP6_v2 = oddt.fingerprints.sparse_to_dense(SPLIF_ECFP6_v2['hash'], size=4096) + Conc_SPLIF_ECFP6 += SPLIF_ECFP6_v2 + Conc_KLIFS_pocket_IFP = ''.join(str(list(Conc_KLIFS_pocket_IFP))) + Conc_PLEC = ''.join(str(list(Conc_PLEC))) + Conc_SPLIF_ECFP2 = ''.join(str(list(Conc_SPLIF_ECFP2))) + Conc_SPLIF_ECFP4 = ''.join(str(list(Conc_SPLIF_ECFP4))) + Conc_SPLIF_ECFP6 = ''.join(str(list(Conc_SPLIF_ECFP6))) + print("Concatenated poses succesfully! Number of poses: "+str(p)) + + except: + print("Decoys. 
This file might not have any poses?") + + SMILES,pchembl_mean,pchembl_median = '','','' + print("On dataframe!") + + count += 1 + dataframe_SIFP,dataframe_SIFP["KLIFS_pocket_IFP"][count],dataframe_SIFP["Conc_KLIFS_pocket_IFP"][count],dataframe_SIFP['VinaScore'][count],dataframe_SIFP['Type'][count],dataframe_SIFP["RFv1"][count],dataframe_SIFP["RFv2"][count],dataframe_SIFP["RFv3"][count],dataframe_SIFP["nn_score"][count],dataframe_SIFP["plec_score"][count],dataframe_SIFP["SMILES"][count],dataframe_SIFP["pchembl_value_Mean"][count],dataframe_SIFP["pchembl_value_Median"][count],dataframe_SIFP["protein"][count],dataframe_SIFP["compound"][count],dataframe_SIFP["decoy_group"][count] = dataframe_SIFP.append(csv_dataframe.iloc[i],ignore_index=True),KLIFS_pocket_IFP,Conc_KLIFS_pocket_IFP,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname,decoy_group + dataframe_PLEC,dataframe_PLEC["PLEC"][count],dataframe_PLEC["Conc_PLEC"][count],dataframe_PLEC['VinaScore'][count],dataframe_PLEC['Type'][count],dataframe_PLEC["RFv1"][count],dataframe_PLEC["RFv2"][count],dataframe_PLEC["RFv3"][count],dataframe_PLEC["nn_score"][count],dataframe_PLEC["plec_score"][count],dataframe_PLEC["SMILES"][count],dataframe_PLEC["pchembl_value_Mean"][count],dataframe_PLEC["pchembl_value_Median"][count],dataframe_PLEC["protein"][count],dataframe_PLEC["compound"][count],dataframe_PLEC["decoy_group"][count] = dataframe_PLEC.append(csv_dataframe.iloc[i],ignore_index=True),PLEC,Conc_PLEC,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname,decoy_group + 
dataframe_SPLIF_ECFP2,dataframe_SPLIF_ECFP2["SPLIF_ECFP2"][count],dataframe_SPLIF_ECFP2["Conc_SPLIF_ECFP2"][count],dataframe_SPLIF_ECFP2['VinaScore'][count],dataframe_SPLIF_ECFP2['Type'][count],dataframe_SPLIF_ECFP2["RFv1"][count],dataframe_SPLIF_ECFP2["RFv2"][count],dataframe_SPLIF_ECFP2["RFv3"][count],dataframe_SPLIF_ECFP2["nn_score"][count],dataframe_SPLIF_ECFP2["plec_score"][count],dataframe_SPLIF_ECFP2["SMILES"][count],dataframe_SPLIF_ECFP2["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP2["pchembl_value_Median"][count],dataframe_SPLIF_ECFP2["protein"][count],dataframe_SPLIF_ECFP2["compound"][count],dataframe_SPLIF_ECFP2["decoy_group"][count] = dataframe_SPLIF_ECFP2.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP2,Conc_SPLIF_ECFP2,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname,decoy_group + dataframe_SPLIF_ECFP4,dataframe_SPLIF_ECFP4["SPLIF_ECFP4"][count],dataframe_SPLIF_ECFP4["Conc_SPLIF_ECFP4"][count],dataframe_SPLIF_ECFP4['VinaScore'][count],dataframe_SPLIF_ECFP4['Type'][count],dataframe_SPLIF_ECFP4["RFv1"][count],dataframe_SPLIF_ECFP4["RFv2"][count],dataframe_SPLIF_ECFP4["RFv3"][count],dataframe_SPLIF_ECFP4["nn_score"][count],dataframe_SPLIF_ECFP2["plec_score"][count],dataframe_SPLIF_ECFP4["SMILES"][count],dataframe_SPLIF_ECFP4["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP4["pchembl_value_Median"][count],dataframe_SPLIF_ECFP4["protein"][count],dataframe_SPLIF_ECFP4["compound"][count],dataframe_SPLIF_ECFP2["decoy_group"][count] = dataframe_SPLIF_ECFP4.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP4,Conc_SPLIF_ECFP4,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname,decoy_group + 
dataframe_SPLIF_ECFP6,dataframe_SPLIF_ECFP6["SPLIF_ECFP6"][count],dataframe_SPLIF_ECFP6["Conc_SPLIF_ECFP6"][count],dataframe_SPLIF_ECFP6['VinaScore'][count],dataframe_SPLIF_ECFP6['Type'][count],dataframe_SPLIF_ECFP6["RFv1"][count],dataframe_SPLIF_ECFP6["RFv2"][count],dataframe_SPLIF_ECFP6["RFv3"][count],dataframe_SPLIF_ECFP6["nn_score"][count],dataframe_SPLIF_ECFP2["plec_score"][count],dataframe_SPLIF_ECFP6["SMILES"][count],dataframe_SPLIF_ECFP6["pchembl_value_Mean"][count],dataframe_SPLIF_ECFP6["pchembl_value_Median"][count],dataframe_SPLIF_ECFP6["protein"][count],dataframe_SPLIF_ECFP6["compound"][count],dataframe_SPLIF_ECFP2["decoy_group"][count] = dataframe_SPLIF_ECFP6.append(csv_dataframe.iloc[i],ignore_index=True),SPLIF_ECFP6,Conc_SPLIF_ECFP6,VinaScore['vina_affinity'],Type,RFv1,RFv2,RFv3,nn_score,plec_score,SMILES,pchembl_mean,pchembl_median,proteinname,ligandname,decoy_group + print("Done appending to dataframe! Number appended", count, int(run_number)) + except: + print("No pdbqt files available maybe?") + + + except: + print("No second decoy files available") + + if int(group_number) > 8: + var_from = next_from + else: + pass + + if int(decoy_number) >= 125: + print("Going dark to preserve memory..") + with open(f'config_{var_from}_{var_to}.txt', 'w') as f: + to_write = str(file_number)+"\n"+str(count)+"\n"+str(group_number)+"\n"+str(var_from)+"\n"+str(var_to) + print(to_write) + f.write(to_write) + f.close() + dataframe_SIFP.to_csv(f'../IFP_datasets/SIFP_v2/{filename[:-5]}/dataframe_SIFP_dense_{filename[:-5]}_{file_number}.csv') + dataframe_PLEC.to_csv(f'../IFP_datasets/PLEC_v2/{filename[:-5]}/dataframe_PLEC_dense_{filename[:-5]}_{file_number}.csv') + dataframe_SPLIF_ECFP2.to_csv(f'../IFP_datasets/SPLIF_ECFP2_v2/{filename[:-5]}/dataframe_SPLIF_ECFP2_dense_{filename[:-5]}_{file_number}.csv') + dataframe_SPLIF_ECFP4.to_csv(f'../IFP_datasets/SPLIF_ECFP4_v2/{filename[:-5]}/dataframe_SPLIF_ECFP4_dense_{filename[:-5]}_{file_number}.csv') + 
dataframe_SPLIF_ECFP6.to_csv(f'../IFP_datasets/SPLIF_ECFP6_v2/{filename[:-5]}/dataframe_SPLIF_ECFP6_dense_{filename[:-5]}_{file_number}.csv') + os.execl(sys.executable, 'python', f'./create_IFP_datasets_v2_dense.py', str(var_from), str(var_to)) + else: + pass + except: + print("There is a time to run and a time to error out") + # ListFaultyStructures.append(filename) + quit("No valid structure to calculate on") + except: + print("Maybe haven't downloaded ", filename,"moving on to next structure!") + ListFaultyStructures.append(filename) + next_from = int(var_from) + 1 + if next_from == var_to: + print("DONE WITH RUN") + sys.exit() + os.execl(sys.executable, 'python', f'./create_IFP_datasets_v2_dense.py', str("blub")) + file_number, count, group_number = 0,-1,0 + with open(f'config_{next_from}_{var_to}.txt', 'w') as f: + to_write = str(file_number)+"\n"+str(count)+"\n"+str(group_number)+"\n"+str(next_from)+"\n"+str(var_to) + print(to_write) + f.write(to_write) + f.close() + dataframe_SIFP.to_csv(f'../IFP_datasets/SIFP_v2/{filename[:-5]}/dataframe_SIFP_dense_{filename[:-5]}_{file_number}.csv') + dataframe_PLEC.to_csv(f'../IFP_datasets/PLEC_v2/{filename[:-5]}/dataframe_PLEC_dense_{filename[:-5]}_{file_number}.csv') + dataframe_SPLIF_ECFP2.to_csv(f'../IFP_datasets/SPLIF_ECFP2_v2/{filename[:-5]}/dataframe_SPLIF_ECFP2_dense_{filename[:-5]}_{file_number}.csv') + dataframe_SPLIF_ECFP4.to_csv(f'../IFP_datasets/SPLIF_ECFP4_v2/{filename[:-5]}/dataframe_SPLIF_ECFP4_dense_{filename[:-5]}_{file_number}.csv') + dataframe_SPLIF_ECFP6.to_csv(f'../IFP_datasets/SPLIF_ECFP6_v2/{filename[:-5]}/dataframe_SPLIF_ECFP6_dense_{filename[:-5]}_{file_number}.csv') + os.execl(sys.executable, 'python', f'./create_IFP_datasets_v2_dense.py', str(next_from), str(var_to)) + +print("Done with run!") +# dataframe_SIFP.to_csv(f'../IFP_datasets/SIFP_v2/{filename[:-5]}/dataframe_SIFP_dense_{filename[:-5]}_{file_number}.csv') +# 
dataframe_PLEC.to_csv(f'../IFP_datasets/PLEC_v2/{filename[:-5]}/dataframe_PLEC_dense_{filename[:-5]}.csv') +# dataframe_SPLIF_ECFP2.to_csv(f'../IFP_datasets/SPLIF_ECFP2_v2/{filename[:-5]}/dataframe_SPLIF_ECFP2_dense_{filename[:-5]}.csv') +# dataframe_SPLIF_ECFP4.to_csv(f'../IFP_datasets/SPLIF_ECFP4_v2/{filename[:-5]}/dataframe_SPLIF_ECFP4_dense_{filename[:-5]}.csv') +# dataframe_SPLIF_ECFP6.to_csv(f'../IFP_datasets/SPLIF_ECFP6_v2/{filename[:-5]}/dataframe_SPLIF_ECFP6_dense_{filename[:-5]}.csv') +# with open(f'../IFP_datasets/FaultyStructures_dense_{filename[:-5]}.txt', "w") as f: +# f.writelines(ListFaultyStructures) +# f.close() +# rdkit.SimDivFilters.rdSimDivPickers.MaxMinPicker()