"""Set of functions related to the selection of top clusters."""
import os
import math
from pathlib import Path
from haddock.core.typing import Union
from haddock.libs.libontology import PDBFile
[docs]
def select_top_clusts_models(
    sortby: str,
    models_to_select: list[PDBFile],
    top_clusters: int,
    top_models: Union[int, float],
) -> tuple[list[PDBFile], list[str]]:
    """Select the best models of the best clusters.

    Parameters
    ----------
    sortby : str
        How to order clusters: by `size` (largest cluster first); any
        other value orders clusters by ascending cluster rank.
    models_to_select : list[PDBFile]
        List of input models on which selection must be performed.
    top_clusters : int
        Number of best clusters to take into account.
    top_models : Union[int, float]
        Number of best models in each cluster to take into account.
        `nan` means "no limit": every model of each selected cluster is
        exported. Any other float is truncated to an integer.

    Returns
    -------
    models_to_export : list[PDBFile]
        List of PDBfiles to export. Their `clt_rank` and
        `clt_model_rank` attributes are updated in place.
    notes : list[str]
        List of notes to be printed.
    """
    notes: list[str] = []
    by_clusters = map_clusters_models(models_to_select)

    # Get cluster order
    if sortby == "size":
        cluster_rankings = size_clust_order(by_clusters)
    else:
        cluster_rankings = rank_clust_order(by_clusters)

    # Restrict to the requested number of clusters (if fewer are wanted
    # than available) and report which ones are kept
    if top_clusters >= len(cluster_rankings):
        selection_header = "Selecting all clusters: "
    else:
        cluster_rankings = cluster_rankings[:top_clusters]
        selection_header = f"Selecting top {top_clusters} clusters: "
    notes.append(selection_header + ",".join(map(str, cluster_rankings)))

    # Resolve the per-cluster limit once. `nan` means unlimited; any other
    # value is converted to int because slice indices must be integers
    # (a plain float such as 5.0 would raise TypeError when slicing).
    keep_all_models = math.isnan(top_models)
    max_models = None if keep_all_models else int(top_models)

    # Initiate set of selected models to export
    models_to_export: list[PDBFile] = []
    for clt_rank in cluster_rankings:
        # Sort models by model rank
        clt_mdls, note = sort_models(by_clusters[clt_rank])
        if note:
            notes.append(note)
        # Set new ranks to models
        for mdl_rank, pdb in enumerate(clt_mdls, start=1):
            pdb.clt_rank = clt_rank
            pdb.clt_model_rank = mdl_rank
        # No limit set: export the whole cluster (no per-model note)
        if keep_all_models:
            models_to_export.extend(clt_mdls)
            continue
        # Otherwise export only the first `max_models` models
        for pdb in clt_mdls[:max_models]:
            notes.append(
                f" {pdb.file_name} "
                f"> cluster_{pdb.clt_rank}_"
                f"model_{pdb.clt_model_rank}.pdb"
            )
            models_to_export.append(pdb)
    return models_to_export, notes
[docs]
def sort_models(
    models: list[PDBFile]
) -> tuple[list[PDBFile], Union[None, str]]:
    """Order the models of a cluster by their in-cluster rank.

    Parameters
    ----------
    models : list[PDBFile]
        List of input models on which ordering must be performed.

    Returns
    -------
    sorted_mdls : list[PDBFile]
        New list of models ordered by ascending `clt_model_rank`, or the
        input `models` object itself when the ranks cannot be compared.
    note : Union[None, str]
        Message describing the fallback to input order, `None` when the
        sorting succeeded.
    """
    def _model_rank(pdb):
        return pdb.clt_model_rank

    try:
        # Sort a copy so the caller's list is never reordered
        ranked = list(models)
        ranked.sort(key=_model_rank)
    except TypeError:
        # Raised when ranks are not mutually comparable (e.g. None values)
        return models, 'model rank unavailable, falling back to input order'
    return ranked, None
[docs]
def rank_clust_order(
    by_clusters: dict[int, list[PDBFile]],
) -> list[int]:
    """Order clusters by ascending cluster rank.

    Parameters
    ----------
    by_clusters : dict[int, list[PDBFile]]
        Models grouped by cluster; keys are the cluster ranks.

    Returns
    -------
    cluster_rankings : list[int]
        Keys of `by_clusters` sorted in ascending order.
    """
    ordered_ranks = list(by_clusters)
    ordered_ranks.sort()
    return ordered_ranks
[docs]
def size_clust_order(
    by_clusters: dict[int, list[PDBFile]],
) -> list[int]:
    """Order clusters by decreasing number of models.

    Parameters
    ----------
    by_clusters : dict[int, list[PDBFile]]
        Models grouped by cluster; keys are the cluster ranks.

    Returns
    -------
    cluster_rankings : list[int]
        Keys of `by_clusters`, largest cluster first. Clusters holding
        the same number of models keep their order of appearance in
        `by_clusters`.
    """
    def _cluster_size(clt_id):
        return len(by_clusters[clt_id])

    ordered_ranks = list(by_clusters)
    # The sort is stable, so ties keep the dictionary order
    ordered_ranks.sort(key=_cluster_size, reverse=True)
    return ordered_ranks
[docs]
def map_clusters_models(models: list[PDBFile]) -> dict[int, list[PDBFile]]:
    """Group models by clusters.

    Parameters
    ----------
    models : list[PDBFile]
        List of PDBfiles models to be grouped.

    Returns
    -------
    by_clusters : dict[int, list[PDBFile]]
        Mapping from each `clt_rank` value found in `models` to the list
        of models carrying it, in their order of appearance in `models`.
    """
    # One empty bucket per distinct cluster rank
    distinct_ranks = {mdl.clt_rank for mdl in models}
    grouped: dict[int, list[PDBFile]] = {
        clt_id: [] for clt_id in distinct_ranks
    }
    # Dispatch every model into the bucket of its cluster
    for mdl in models:
        grouped[mdl.clt_rank].append(mdl)
    return grouped
[docs]
def write_selected_models(
    output_path: Union[str, Path],
    models: list[PDBFile],
    module_path: Union[str, Path],
) -> list[PDBFile]:
    """Dump selected models and new names in a file.

    For every model, a line is written to the tab-separated file at
    `output_path`, the content of the model file is copied into the
    current working directory under its new cluster-based name, and the
    model attributes are updated to point to that copy.

    Parameters
    ----------
    output_path : Union[str, Path]
        Name of the file to create.
    models : list[PDBFile]
        List of PDBfiles of selected models.
    module_path : Union[str, Path]
        Path of the module. Only its last component is used, to build
        the new relative path of each model.

    Returns
    -------
    models : list[PDBFile]
        Updated list of selected models (same objects, modified in
        place).
    """
    # Values shared by all models: computed once, outside of the loop
    module_dirname = Path(module_path).name
    workdir = str(Path(".").resolve())

    # dump the models to disk and change their attributes
    with open(output_path, 'w') as fh:
        # The file is opened in text mode, which already translates "\n"
        # into the platform line separator; writing `os.linesep` here
        # would produce "\r\r\n" line endings on Windows.
        fh.write("rel_path\tori_name\tcluster_name\tmd5\n")
        for model in models:
            name = (
                f"cluster_{model.clt_rank}_model"
                f"_{model.clt_model_rank}.pdb"
            )
            # writing name
            fh.write(
                f"{model.rel_path}\t"
                f"{model.ori_name}\t"
                f"{name}\t"
                f"{model.md5}\n"
            )
            # copy the model into the current working directory
            Path(name).write_text(model.rel_path.read_text())
            # changing attributes
            model.ori_name = model.file_name
            model.file_name = name
            model.full_name = name
            model.rel_path = Path('..', module_dirname, name)
            model.path = workdir
    return models