Source code for haddock.modules

"""HADDOCK3 modules."""

import re

from abc import ABC, abstractmethod
from contextlib import contextmanager, suppress
from copy import deepcopy
from functools import partial
from os import linesep
from pathlib import Path

from haddock import EmptyPath, log, modules_defaults_path
from haddock.core.defaults import MODULE_IO_FILE, INTERACTIVE_RE_SUFFIX
from haddock.core.exceptions import ConfigurationError
from haddock.core.typing import (
    Any,
    Container,
    FilePath,
    Generator,
    Literal,
    Optional,
    ParamDict,
    Union,
)
from haddock.gear import config
from haddock.gear.clean_steps import clean_output
from haddock.gear.known_cns_errors import find_all_cns_errors
from haddock.gear.parameters import config_mandatory_general_parameters
from haddock.gear.yaml2cfg import read_from_yaml_config, find_incompatible_parameters
from haddock.libs.libhpc import HPCScheduler
from haddock.libs.libio import folder_exists, working_directory
from haddock.libs.libmpi import MPIScheduler
from haddock.libs.libontology import ModuleIO, PDBFile
from haddock.libs.libparallel import Scheduler
from haddock.libs.libtimer import log_time
from haddock.libs.libutil import recursive_dict_update


# Folder that contains this file; the module categories are its sub-folders.
modules_folder = Path(__file__).resolve().parent

# NOTE: despite its name, this is a *glob* pattern (not a regular expression):
# one ASCII letter followed by anything, i.e. names not starting with "_"/".".
_folder_match_regex = "[a-zA-Z]*/"
modules_category = {
    module.name: category.name
    for category in modules_folder.glob(_folder_match_regex)
    # `Path.glob` only honours the trailing "/" (directories only) from
    # Python 3.11 onwards; the explicit checks keep regular files out of the
    # index on older interpreters as well.
    if category.is_dir()
    for module in category.glob(_folder_match_regex)
    if module.is_dir()
}
"""Indexes each module in its specific category. Keys are the module names
(the names of the module folders), values are the names of their categories.
Categories are the modules parent folders."""

modules_names = set(modules_category)

# Category names listed in a fixed order.
# NOTE(review): presumably these are the category folder names found under
# `modules_folder`, ordered as they are expected to appear in a workflow --
# confirm against the code that consumes this list.
category_hierarchy = [
    "topology",
    "sampling",
    "refinement",
    "scoring",
    "analysis",
    "extras",
]

# This dictionary defines non-mandatory general parameters that can be defined
# as global parameters, thus affecting all modules, or, instead, can be defined
# per module, where the module definition overwrites the global definition.
# Not all modules will use these parameters. It is the responsibility of the
# module to extract the parameters it needs.
# The values are read from the file at `modules_defaults_path`.
# NOTE(review): an earlier comment located this file at modules/defaults.cfg,
# but it is loaded with `read_from_yaml_config`, so it is presumably a YAML
# file -- confirm the actual path.
non_mandatory_general_parameters_defaults = read_from_yaml_config(
    modules_defaults_path
)  # noqa : E501

# Result of `find_incompatible_parameters` for the same defaults file.
# NOTE(review): presumably describes parameters that cannot be used together;
# verify against `haddock.gear.yaml2cfg`.
incompatible_defaults_params = find_incompatible_parameters(modules_defaults_path)

# Maps a configuration file extension (as returned by `Path.suffix`) to the
# function used to read a file of that type.
config_readers = {
    ".yaml": read_from_yaml_config,
    ".cfg": config.load,
}

# One alternative per known module: "<digits>_<module name>".
_step_folder_regex = tuple(
    f"[0-9]+_{mod_name}" for mod_name in modules_category
)
# All alternatives joined into a single capturing group.
step_folder_regex = f"({'|'.join(_step_folder_regex)})"
"""
String for regular expression to match module folders in a run directory.

It will match folders with a numeric prefix followed by underscore ("_")
followed by the name of a module.

Example: https://regex101.com/r/roHls9/1
"""

step_folder_regex_re = re.compile(step_folder_regex)
"""
Compiled regular expression from :py:const:`step_folder_regex`.

It will match folders with a numeric prefix followed by underscore ("_")
followed by the name of a module.

Example: https://regex101.com/r/roHls9/1
"""


@contextmanager
def _not_valid_config() -> Generator[None, None, None]:
    """
    Translate a `KeyError` raised in the managed block.

    Any `KeyError` escaping the ``with`` body is re-raised as a
    `ConfigurationError` stating that the configuration file extension is
    not supported and listing the keys of `config_readers`. Other exceptions
    propagate untouched.
    """
    try:
        yield
    except KeyError as err:
        supported = ", ".join(config_readers.keys())
        raise ConfigurationError(
            "The configuration file extension is not supported. "
            f"Supported types are {supported}."
        ) from err

class BaseHaddockModule(ABC):
    """HADDOCK3 module's base class."""

    name: str

    def __init__(self, order: int, path: Path, params_fname: FilePath) -> None:
        """
        HADDOCK3 modules base class.

        Parameters
        ----------
        order : int
            Index of the module step. When ``0`` no previous step is looked
            up; otherwise the input/output record of the step with index
            ``order - 1`` is loaded.
        path : pathlib.Path
            Folder of this step. The module runs with this folder as working
            directory and looks for sibling step folders in its parent.
        params_fname : str or pathlib.Path
            Path to the configuration file containing the initial module
            parameters. Its extension must be one of the keys of
            `config_readers`.

        Raises
        ------
        ConfigurationError
            If a `KeyError` is raised while selecting or running the reader
            for `params_fname` (e.g. unsupported extension).
        """
        self.order = order
        self.path = path
        self.previous_io = self._load_previous_io()

        # instantiate module's parameters
        # NOTE: the attribute name below is misspelled ("origignal"); it is
        # kept as is because code outside this class may reference it.
        self._origignal_config_file = params_fname
        with _not_valid_config():
            extension = Path(params_fname).suffix
            self._original_params = config_readers[extension](params_fname)

        self._params: ParamDict = {}
        self.update_params(update_from_cfg_file=params_fname)

    @property
    def params(self) -> ParamDict:
        """Configuration parameters."""  # noqa: D401
        return self._params

    def reset_params(self) -> None:
        """Reset parameters to the ones used to instantiate the class."""
        self._params.clear()
        self.update_params(**self._original_params)

    def update_params(
        self,
        update_from_cfg_file: Optional[FilePath] = None,
        **params: Any,
    ) -> None:
        """
        Update the modules parameters.

        Add/update to the current modules parameters the ones given in
        the function call. If you want to entirely replace the modules
        parameters to their default values use the `reset_params()`
        method.

        Update takes places recursively, that is, nested dictionaries
        will be updated accordingly.

        To update the current config with the parameters defined in an
        HADDOCK3 configuration file use the `update_from_cfg_file`
        parameter.

        To update from a JSON file, first load the JSON into a
        dictionary and unpack the dictionary to the function call.

        Raises
        ------
        TypeError
            If both `update_from_cfg_file` and keyword parameters are given.
        ConfigurationError
            If a `KeyError` is raised while reading `update_from_cfg_file`.
        FileNotFoundError
            If a non-empty ``*_fname`` parameter points to a missing file.

        Examples
        --------
        >>> m.update_params(param1=value1, param2=value2)

        >>> m.update_params(**param_dict)

        >>> m.update_params(update_from_cfg_file=path_to_file)

        # if you wish to start from scratch
        >>> m.reset_params()
        >>> m.update_params(...)
        """
        if update_from_cfg_file and params:
            _msg = (
                "You can not provide both `update_from_cfg_file` "
                "and key arguments."
            )
            raise TypeError(_msg)

        if update_from_cfg_file:
            with _not_valid_config():
                extension = Path(update_from_cfg_file).suffix
                params = config_readers[extension](update_from_cfg_file)

        # the updating order is relevant: general defaults first, then the
        # parameters already set on the module, then the new ones.
        _n = recursive_dict_update(
            non_mandatory_general_parameters_defaults, self._params
        )
        self._params = recursive_dict_update(_n, params)
        self._fill_emptypaths()
        self._confirm_fnames_exist()

    def save_config(self, path: FilePath) -> None:
        """
        Save current parameters to a HADDOCK3 config file.

        The mandatory and non-mandatory general parameters are left out and
        the remaining ones are saved under a section named after the module.
        """
        # creates this dictionary for the config to have the module name
        # key in brackets, for example:
        #
        # [topoaa]
        # ...
        ignore = config_mandatory_general_parameters.union(
            non_mandatory_general_parameters_defaults
        )
        params = deepcopy(self.params)
        # Drop every general parameter independently: a key that is absent
        # must not prevent the remaining ones from being removed.
        for key in ignore:
            params.pop(key, None)

        config.save({self.name: params}, path)

    def add_parent_to_paths(self) -> None:
        """
        Add parent path to paths.

        Every non-empty, non-absolute ``*_fname`` parameter is replaced by
        ``Path("..", value)``.
        """
        # convert paths to relative by appending parent
        for key, value in self.params.items():
            if value and key.endswith("_fname") and not Path(value).is_absolute():
                # replacing the value of an existing key is safe while
                # iterating, the dictionary size does not change.
                self.params[key] = Path("..", value)

    @abstractmethod
    def _run(self) -> None: ...

    def run(self, **params: Any) -> None:
        """
        Execute the module.

        Updates the parameters with `params`, prefixes relative file
        parameters with the parent folder and calls `_run()` with the module
        folder as working directory.
        """
        log.info(f"Running [{self.name}] module")

        self.update_params(**params)
        self.add_parent_to_paths()

        with working_directory(self.path):
            self._run()

        log.info(f"Module [{self.name}] finished.")

    def clean_output(self) -> None:
        """
        Clean module output folder.

        See Also
        --------
        :py:func:`haddock.gear.clean_steps.clean_output`
        """
        with log_time("cleaning output files took"):
            # inside a method the bare name resolves to the module-level
            # function, not to this method.
            clean_output(self.path, self.params["ncores"])

    @classmethod
    @abstractmethod
    def confirm_installation(cls) -> None:
        """
        Confirm the third-party software needed for the module is installed.

        HADDOCK3's own modules should just return.
        """
        return

    def export_io_models(self, faulty_tolerance: float = 0.0) -> None:
        """
        Export input/output to the ModuleIO interface.

        Modules that do not perform any operation on PDB files should have
        input = output.

        This function implements a common interface for all modules.

        Parameters
        ----------
        faulty_tolerance : float, default 0
            The percentage of missing output allowed. If 20 is given, an
            error is raised when more than 20% of the expected output is
            missing (not saved to disk).

        Raises
        ------
        RuntimeError
            Through `finish_with_error`, when the percentage of missing
            output is above `faulty_tolerance`.
        """
        self.output_models: Union[list[PDBFile], dict[int, PDBFile]]
        # NOTE: `assert` statements are skipped when Python runs with `-O`.
        assert self.output_models, "`self.output_models` cannot be empty."

        io = ModuleIO()
        # add the input models
        io.add(self.previous_io.output, "i")
        # add the output models
        io.add(self.output_models, "o")
        # Removes un-generated outputs and compute percentage of ungenerated
        faulty = io.check_faulty()
        # Save outputs
        io.save()
        # Check if number of generated outputs is under the tolerance threshold
        if faulty > faulty_tolerance:
            _msg = (
                f"{faulty:.2f}% of output was not generated for this module "
                f"and tolerance was set to {faulty_tolerance:.2f}%."
            )
            # Try to detect CNS errors
            if detected_errors := find_all_cns_errors(self.path):
                _msg += linesep
                for error in detected_errors.values():
                    _msg += f'{str(error["error"])}{linesep}'
            # Show final error message
            self.finish_with_error(_msg)

    def finish_with_error(self, reason: object = "Module has failed.") -> None:
        """
        Finish with error message.

        Raises
        ------
        RuntimeError
            Always. When `reason` is an exception it is chained as the cause,
            otherwise `reason` is used as the error message.
        """
        if isinstance(reason, Exception):
            raise RuntimeError("Module has failed.") from reason
        raise RuntimeError(reason)

    def _load_previous_io(
        self,
        filename: FilePath = MODULE_IO_FILE,
    ) -> ModuleIO:
        """
        Load the input/output record of the previous step.

        Also sets `self._num_of_input_molecules`: ``0`` for the first step
        (``order == 0``), otherwise the number of entries in the loaded
        record's ``output``.
        """
        if self.order == 0:
            self._num_of_input_molecules = 0
            return ModuleIO()

        io = ModuleIO()
        previous_io = Path(self.previous_path(), filename)
        # a missing file leaves the record as created by `ModuleIO()`
        if previous_io.is_file():
            io.load(previous_io)
        self._num_of_input_molecules = len(io.output)
        return io

    def previous_path(self) -> Path:
        """
        Give the path from the previous calculation.

        Falls back to the module's own path when no step folder with index
        ``order - 1`` is found.
        """
        previous = get_module_steps_folders(self.path.resolve().parent)

        try:
            # NOTE(review): `last_step_folder` returns the folder *name* as a
            # `str`, while the fallback and the annotation are `Path` --
            # confirm what callers expect before harmonising.
            return self.last_step_folder(previous, self.order - 1)
        except IndexError:
            return self.path

    @staticmethod
    def last_step_folder(folders: list[str], index: int) -> str:
        """
        Retrieve last step folder.

        Parameters
        ----------
        folders : list of str
            Step folder names of the form ``<index>_<name>[_<suffix>]``.
        index : int
            Index of the step to retrieve.

        Returns
        -------
        str
            The first folder with the requested index whose last
            underscore-separated token is not `INTERACTIVE_RE_SUFFIX`; if
            all of them carry that suffix, the first one.

        Raises
        ------
        IndexError
            If no folder has the requested index.
        """
        with_ind = [
            folder for folder in folders
            if int(folder.split("_")[0]) == index
        ]
        # No matching index
        if not with_ind:
            raise IndexError(f"No step folder with index {index}.")
        # One or several matching indexes: prefer the first folder that is
        # not flagged with the interactive suffix.
        for folder in with_ind:
            if folder.split("_")[-1] != INTERACTIVE_RE_SUFFIX:
                return folder
        return with_ind[0]

    def log(self, msg: str, level: str = "info") -> None:
        """
        Log a message with a common header.

        Currently the header is the [MODULE NAME] in square brackets.

        Parameters
        ----------
        msg : str
            The log message.

        level : str
            The level log: 'debug', 'info', ...
            Defaults to 'info'.
        """
        getattr(log, level)(f"[{self.name}] {msg}")

    def _confirm_fnames_exist(self) -> None:
        """Raise `FileNotFoundError` if a non-empty ``*_fname`` is missing."""
        for param, value in self._params.items():
            if param.endswith("_fname") and value:
                if not Path(value).exists():
                    raise FileNotFoundError(f"File not found: {str(value)!r}")

    def _fill_emptypaths(self) -> None:
        """Fill empty paths."""
        # iterate over a snapshot of the items while assigning values
        for param, value in list(self._params.items()):
            if param.endswith("_fname") and not value:
                self._params[param] = EmptyPath()
# Type alias with the engine names: the same three strings that `get_engine`
# recognises as `mode`.
EngineMode = Literal["batch", "local", "mpi"]
def get_engine(
    mode: str,
    params: dict[Any, Any],
) -> partial[Union[HPCScheduler, Scheduler, MPIScheduler]]:
    """
    Create an engine to run the jobs.

    Parameters
    ----------
    mode : str
        The type of engine to create: "batch", "local" or "mpi".

    params : dict
        A dictionary containing parameters for the engine.
        `get_engine` will retrieve from `params` only those parameters
        needed by the selected engine and ignore the others.

    Returns
    -------
    functools.partial
        The scheduler class of the selected engine with its engine-specific
        keyword arguments already bound.

    Raises
    ------
    ValueError
        If `mode` is not one of the known engines.
    KeyError
        If `params` lacks a key required by the selected engine.
    """
    # a bit of a factory pattern here
    # this might end up in another module but for now its fine here
    if mode == "batch":
        hpc_options = {
            "target_queue": params["queue"],
            "queue_limit": params["queue_limit"],
            "concat": params["concat"],
        }
        return partial(HPCScheduler, **hpc_options)  # type: ignore

    if mode == "local":
        local_options = {
            "ncores": params["ncores"],
            "max_cpus": params["max_cpus"],
        }
        return partial(Scheduler, **local_options)  # type: ignore

    if mode == "mpi":
        return partial(MPIScheduler, ncores=params["ncores"])  # type: ignore

    known_engines = ", ".join(("batch", "local", "mpi"))
    raise ValueError(
        f"Scheduler `mode` {mode!r} not recognized. "
        f"Available options are {known_engines}"
    )
def get_module_steps_folders(
    folder: FilePath,
    modules: Optional[Container[int]] = None,
) -> list[str]:
    """
    Return a sorted list of the step folders in a running directory.

    Example
    -------
    Consider the folder structure:

    run_dir/
        0_topoaa/
        1_rigidbody/
        2_caprieval/
        3_bad_module_name/
        data/

    >>> get_module_steps_folders("run_dir")
    ["0_topoaa", "1_rigidbody", "2_caprieval"]

    Parameters
    ----------
    folder : str or Path
        Path to the run directory, or to the folder containing the step
        folders.

    modules : container of int, optional
        When given (and not empty), only the steps whose index is in
        `modules` and whose second underscore-separated token is a known
        module name are returned.

    Returns
    -------
    list of str
        List containing strings with the names of the step folders, sorted
        by their numeric index.
    """
    # The pattern is applied with `match`, i.e. anchored at the start of the
    # name. A name that merely *contains* a step-like pattern (for example
    # "old_1_rigidbody") is not a step folder, and its leading token could
    # not be converted to an integer by the sort key below.
    candidates = [
        entry.name
        for entry in Path(folder).iterdir()
        if entry.is_dir() and step_folder_regex_re.match(entry.name)
    ]
    steps = sorted(candidates, key=lambda name: int(name.split("_")[0]))

    if modules:
        steps = [
            st
            for st in steps
            if int(st.split("_")[0]) in modules
            and st.split("_")[1] in modules_names
        ]

    return steps
def is_step_folder(path: FilePath) -> bool:
    """
    Assess whether a folder is a possible step folder.

    The folder is considered a step folder if its name is made of exactly
    two underscore-separated parts: a zero or positive integer index followed
    by the name of a module.

    Parameters
    ----------
    path : str or pathlib.Path
        The path to the folder. It is first handed to `folder_exists`.

    Returns
    -------
    bool
        Whether the folder is a step folder or not.
    """
    step_dir = Path(path)
    folder_exists(step_dir)
    index, *remainder = step_dir.name.split("_")
    return (
        len(remainder) == 1
        and index.isdigit()
        and remainder[0] in modules_category
    )