"""HADDOCK3 modules to score models."""
import pandas as pd
from haddock.core.typing import FilePath, Path, Any
from haddock.modules.base_cns_module import BaseCNSModule
from haddock.modules import BaseHaddockModule, PDBFile


class ScoringModule(BaseHaddockModule):
    """Parent class for Scoring modules."""

    def output(
            self,
            output_fname: FilePath,
            sep: str = "\t",
            ascending_sort: bool = True,
            ) -> None:
        r"""Save the output in a comprehensive table.

        Parameters
        ----------
        output_fname : FilePath
            Path to the file where to write the scoring data.
        sep : str, optional
            Character used as separator in the file, by default "\t".
        ascending_sort : bool, optional
            Whether the data should be sorted in ascending order,
            by default True.
        """
        # Gather the scoring data of every output model
        sc_data = []
        for pdb in self.output_models:
            sc_data.append([pdb.file_name, pdb.ori_name, pdb.md5, pdb.score])
        # Convert to a pandas DataFrame and sort by score
        df_columns = ["structure", "original_name", "md5", "score"]
        df_sc = pd.DataFrame(sc_data, columns=df_columns)
        df_sc_sorted = df_sc.sort_values(by="score", ascending=ascending_sort)
        # Write to disk
        df_sc_sorted.to_csv(output_fname,
                            sep=sep,
                            index=False,
                            na_rep="None",
                            float_format="%.3f")
        return
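
# Usage sketch (illustration only, not part of this module): a concrete
# scoring module that inherits from ScoringModule calls ``output`` once
# ``self.output_models`` holds the scored PDBFile objects. The file name
# "example_scoring.tsv" below is an arbitrary placeholder.
#
#     self.output_models = scored_models            # list of PDBFile objects
#     self.output("example_scoring.tsv", sep="\t")
#
# This writes a tab-separated table with the columns ``structure``,
# ``original_name``, ``md5`` and ``score``, sorted by ``score``
# (ascending by default).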


class CNSScoringModule(BaseCNSModule, ScoringModule):
    """Parent class for CNS Scoring modules."""

    def per_interface_output(
            self,
            output_fname: FilePath,
            sep: str = "\t",
            ascending_sort: bool = True,
            ) -> None:
        r"""Generate per-interface scoring TSV output files.

        Parameters
        ----------
        output_fname : FilePath
            Path to the file where to write the scoring data.
        sep : str, optional
            Character used as separator in the file, by default "\t".
        ascending_sort : bool, optional
            Whether the data should be sorted in ascending order,
            by default True.
        """
        # Retrieve the interface data of every model
        set_interfaces: list[str] = []
        pdb_interfaces_scores: dict[tuple[Any, Any, Any], dict[str, dict[str, float]]] = {}  # noqa : E501
        # Loop over the models to recover their interfaces
        for pdb in self.output_models:
            # Skip models whose file does not exist
            if not Path(pdb.file_name).exists():
                continue
            interfaces_scores = self.read_per_interface_scores(pdb)
            reversed_interfaces_scores = {}
            # Build the list of interfaces
            for interface, scores in interfaces_scores.items():
                # Check if the reverse chain order is already registered
                split_inter = interface.split('_')
                reverse_interface = f"{split_inter[1]}_{split_inter[0]}"
                if reverse_interface in set_interfaces:
                    reversed_interfaces_scores[reverse_interface] = scores
                # Register the interface if not yet present
                if interface not in set_interfaces:
                    set_interfaces.append(interface)
            # Combine with the reversed interface scores
            interfaces_scores.update(reversed_interfaces_scores)
            # Hold the data
            pdbkey = (pdb.file_name, pdb.ori_name, pdb.md5)
            pdb_interfaces_scores[pdbkey] = interfaces_scores
        # Preset the output file basename and extension
        output_file = Path(output_fname)
        output_bn = output_file.stem
        output_ext = ''.join(output_file.suffixes)
        # Write a separate file for each interface
        for interface in set_interfaces:
            # Gather the data points
            sc_data = []
            for pdbkey, interfaces_scores in pdb_interfaces_scores.items():
                if interface not in interfaces_scores.keys():
                    continue
                interface_scores = interfaces_scores[interface]
                score = interface_scores['HADDOCKscore']
                sc_data.append([pdbkey[0], pdbkey[1], pdbkey[2], score])
            # Check that the list is not empty
            if len(sc_data) == 0:
                continue
            # Convert to a pandas DataFrame and sort by score
            df_columns = ["structure", "original_name", "md5", "score"]
            df_sc = pd.DataFrame(sc_data, columns=df_columns)
            df_sc_sorted = df_sc.sort_values(
                by="score",
                ascending=ascending_sort,
                )
            # Generate the output filename
            interface_output_fname = f"{output_bn}_{interface}{output_ext}"
            # Write to disk
            df_sc_sorted.to_csv(
                interface_output_fname,
                sep=sep,
                index=False,
                na_rep="None",
                float_format="%.3f",
                )
        return
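
    # Illustration (not part of the module): with output_fname="scoring.tsv"
    # and models whose headers report the interfaces A_B and A_C,
    # ``per_interface_output`` writes one file per interface, here
    # "scoring_A_B.tsv" and "scoring_A_C.tsv", each with the same columns as
    # ``ScoringModule.output`` but holding the per-interface HADDOCK score.
    # A model that lists the reversed chain order (e.g. B_A) is still
    # included in the file of the interface registered first (A_B).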

    @staticmethod
    def read_per_interface_scores(pdb: PDBFile) -> dict[str, dict[str, float]]:
        """Read a PDB file and parse its per-interface scores.

        Parameters
        ----------
        pdb : PDBFile
            A PDBFile object.

        Returns
        -------
        interfaces_scores : dict[str, dict[str, float]]
            Dictionary holding the per-interface scores.
        """
        header = None
        interfaces_scores: dict[str, dict[str, float]] = {}
        with open(pdb.file_name, 'r') as filin:
            for line in filin:
                if line.startswith('REMARK Interface'):
                    s_ = line.strip().split()[2:]
                    # Extract the header
                    if not header:
                        header = s_
                    # Extract the data
                    else:
                        chain1 = s_[header.index('Chain1')]
                        chain2 = s_[header.index('Chain2')]
                        haddockscore = float(s_[header.index('HADDOCKscore')])
                        evdw = float(s_[header.index('Evdw')])
                        eelec = float(s_[header.index('Eelec')])
                        edesol = float(s_[header.index('Edesol')])
                        bsa = float(s_[header.index('BSA')])
                        # Combine the two chains into a single key
                        chains_key = f"{chain1}_{chain2}"
                        # Hold the data
                        interfaces_scores[chains_key] = {
                            'HADDOCKscore': haddockscore,
                            'Evdw': evdw,
                            'Eelec': eelec,
                            'Edesol': edesol,
                            'BSA': bsa,
                            }
        return interfaces_scores
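
# Format sketch of the PDB header lines that ``read_per_interface_scores``
# expects (layout inferred from the parsing code above; the values are
# made-up examples):
#
#     REMARK Interface Chain1 Chain2 HADDOCKscore Evdw Eelec Edesol BSA
#     REMARK Interface A B -95.432 -40.1 -250.3 10.2 1500.5
#     REMARK Interface A C -12.345 -5.6 -30.1 2.0 450.7
#
# Parsing such a header returns a nested dictionary keyed by "Chain1_Chain2":
#
#     {"A_B": {"HADDOCKscore": -95.432, "Evdw": -40.1, "Eelec": -250.3,
#              "Edesol": 10.2, "BSA": 1500.5},
#      "A_C": {"HADDOCKscore": -12.345, "Evdw": -5.6, "Eelec": -30.1,
#              "Edesol": 2.0, "BSA": 450.7}}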