r"""
HADDOCK3 benchmark submission daemon.
For more information read our benchmark tutorial at `docs/benchmark.tut`
in HADDOCK3 repository site: https://github.com/haddocking/haddock3::
(_) L|J
(") |
/_\--|
_/\ / |
_W_ |
Usage::
haddock3-dmn -h
haddock3-dmn <benchmark folder> --job-limit <num> [OPTIONS]
"""
import argparse
import os
import subprocess
import sys
import time
from pathlib import Path
from haddock.core.typing import ArgumentParser, Callable, Namespace, Optional
workload_manager_launch = {
"slurm": "sbatch",
"torque": "qsub",
}
"""options for the different job queue systems supported"""
# prepares client arguments
ap = argparse.ArgumentParser(
prog="HADDOCK3 benchmark submission daemon.",
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
ap.add_argument(
"benchmark_path",
help="Path to the benchmark folder as prepared by `haddock3-bm` interface.",
type=Path,
)
ap.add_argument(
"--job-limit",
dest="job_limit",
help="How many jobs should run at the same time. Default: 10",
default=10,
type=int,
)
ap.add_argument(
"--job-sys",
dest="manager",
help="The system where the jobs will be run. Default `slurm`.",
choices=tuple(workload_manager_launch.keys()),
default="slurm",
)
ap.add_argument(
"--restart",
help="Restart the RUNNING jobs. DONE jobs won't be touched.",
action="store_true",
)
ap.add_argument(
"--sort-first",
dest="sort_first",
help=(
"Sort jobs by size in ascending order. If not given jobs are order by "
"size in descending order: the biggest first."
),
action="store_true",
)
def _ap() -> ArgumentParser:
return ap
class Job:
"""
Job task.
Controls the status of each job.
Parameters
----------
job_f : pathlib.Path
The path to the job file.
launch_command : str
The command to launch the job. For example `sbatch`.
"""
def __init__(self, job_f: Path, launch_command: str) -> None:
self.job_filename = job_f
self.launch_command = launch_command
        # derive the run folder path from the job file path
job_stem = job_f.stem
self.job_run_folder = Path(job_f.parents[1], f"run-{job_stem}")
self.check_done = Path(self.job_run_folder, "DONE")
self.check_running = Path(self.job_run_folder, "RUNNING")
self.check_available = Path(self.job_run_folder, "AVAILABLE")
self.check_fail = Path(self.job_run_folder, "FAIL")
        self.status: Optional[str] = None
self.status_files = [
self.check_done,
self.check_running,
self.check_fail,
self.check_available,
]
def get_status(self) -> Optional[str]:
"""
Get job status.
The job status depends on the present of files: `AVAILABLE`,
`RUNNING`, `DONE`, and `FAIL` created by `haddock-bm` jobs.
Status is assigned to `self.status` and returned.
"""
        for _file in self.status_files:
            if _file.exists():
                self.status = _file.stem
                break

        return self.status
def submit(self) -> None:
"""
Submit job.
Run command `$launch_command $job_filename`.
"""
subprocess.run(cmds := [self.launch_command, str(self.job_filename)])
print("Job sent: ", cmds)
def restart(self) -> None:
"""
Restart the status of the job to `AVAILABLE`.
Does this by removing all status files and creating the file
`AVAILABLE`.
"""
self.check_done.unlink(missing_ok=True)
self.check_running.unlink(missing_ok=True)
self.check_fail.unlink(missing_ok=True)
self.check_available.touch(exist_ok=True)
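
# Illustrative usage of `Job` (hypothetical paths; assumes the folder layout
# created by `haddock3-bm`):
#
#   >>> job = Job(Path("bm/t1/jobs/t1.job"), "sbatch")  # doctest: +SKIP
#   >>> job.get_status()  # checks run-t1/{DONE,RUNNING,FAIL,AVAILABLE}
#   'AVAILABLE'
#   >>> job.submit()  # runs: sbatch bm/t1/jobs/t1.job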
def get_current_jobs(grep: str = "BM5") -> int:
"""
Get current number of jobs for which job-name has the `grep` word.
List of jobs is retrieve using the command `qstat`.
Parameters
----------
grep : str
The string to search job-names for.
Return
------
int
The number of jobs with the word `grep` in their name.
"""
concurrent_cmd = "qstat -a | awk '{print $4}' | grep " + grep + " | wc -l"
p = subprocess.Popen(
concurrent_cmd,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
)
output = p.communicate()[0]
njobs = int(output.decode("utf-8"))
print(f"Found {njobs} in the queue.")
return njobs
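
# A rough pure-Python sketch of the shell pipeline above (an assumption-laden
# illustration: it presumes `qstat -a` output with the job name in the 4th
# whitespace-separated column):
#
#   out = subprocess.run(["qstat", "-a"], capture_output=True, text=True).stdout
#   njobs = sum(
#       1
#       for line in out.splitlines()
#       if len(line.split()) >= 4 and "BM5" in line.split()[3]
#   )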
def calc_size(job_path: Path) -> int:
"""
Calculate the size of the job.
Expects the job file to be in a folder structure as defined by
`haddock3-bm`.
The size of the jobs is defined by the number of carbon-alpha lines
in the `target.pdb` file.
"""
    target_pdb = Path(job_path.parents[1], "input", "target.pdb")
    lines = target_pdb.read_text().splitlines()
    # count lines whose atom-name columns (13-16 in the PDB format) read "CA"
    size = sum(1 for line in lines if line[11:16].strip() == "CA")
    return size
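
# For reference, a line that counts toward the size looks like this
# (illustrative PDB record; the atom name `CA` sits in columns 13-16):
#
#   ATOM      2  CA  MET A   1      38.198  19.590  12.647  1.00 20.00           C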
def filter_by_status(job_list: list[Job], status: str = "AVAILABLE") -> list[Job]:
"""
Filter jobs by their status.
Only jobs with `status` are accepted.
Parameters
----------
job_list : list of Job objects.
Returns
-------
list
The list with the `Job`s with matching `status`.
"""
jobs = [j for j in job_list if j.get_status() == status]
print(f"Number of {status} jobs: {len(jobs)}.")
return jobs
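
# Illustrative calls (assuming `jobs` is a list of `Job` objects):
#
#   >>> available = filter_by_status(jobs)  # doctest: +SKIP
#   >>> running = filter_by_status(jobs, status="RUNNING")  # doctest: +SKIP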
# command-line client helper functions
# load_args, cli, maincli
def load_args(ap: ArgumentParser) -> Namespace:
"""Load argument parser args."""
return ap.parse_args()
def cli(ap: ArgumentParser, main: Callable[..., None]) -> None:
"""Command-line interface entry point."""
cmd = load_args(ap)
main(**vars(cmd))
def maincli() -> None:
"""Execute main client."""
cli(ap, main)
def main(
benchmark_path: Path,
job_limit: int = 10,
manager: str = "slurm",
restart: bool = False,
sort_first: bool = False,
) -> None:
"""
Execute the benchmark daemon.
The parameters defined here are the same as defined in the client
arguments.
    This is the main function of the client. If you want to run the
    daemon without using the command line, and instead import its
    functionality and set it up from another Python script, you
    should import this function.
>>> from haddock.clis.cli_dmn import main
Parameters
----------
benchmark_path : pathlib.Path
The path of the benchmark folder as created by the `haddock3-bm`
interface.
job_limit : int
The max number of jobs to send to the queue.
manager : str
A key to the `workload_manager_launch` dictionary. Selects the queue
management system.
restart : bool
Whether to restart the `RUNNING` jobs that might have been halted
in previous daemon runs. Defaults to False.
    sort_first : bool
        Whether to sort jobs by their size in ascending order; that is,
        the smallest jobs first. Defaults to False: the biggest first.
"""
    # list all the job files in the benchmark_path folder
    job_list = list(benchmark_path.glob("*/jobs/*.job"))

    # break if no jobs are found
    if not job_list:
        sys.exit(f"+ ERROR! No jobs found in folder: {str(benchmark_path)!r}")

    # sort the job list
    job_list.sort(key=calc_size, reverse=not sort_first)

    # create the job objects according to the queue managing system
    _jobsys = workload_manager_launch[manager]
    jobs = [Job(j, _jobsys) for j in job_list]
# restart previously (halted) `RUNNING` jobs - if selected.
if restart:
running_jobs = filter_by_status(jobs, status="RUNNING")
for _job in running_jobs:
_job.restart()
# Lists the available jobs (those with status `AVAILABLE`)
available_jobs = filter_by_status(jobs)
    # run the daemon loop, only while there are available jobs :-)
    while available_jobs:

        # get the number of empty queue slots according to the
        # job limit parameter
        #
        # the value is clamped at 0 to avoid negative numbers in case
        # a job had been submitted manually
        empty_slots = max(0, job_limit - get_current_jobs())
        print("empty slots: ", empty_slots)

        # send jobs while there are empty slots; slicing avoids an
        # IndexError when there are more slots than available jobs
        for job in available_jobs[:empty_slots]:
            job.submit()
            time.sleep(5)
# chill before repeating the process.
print("chilling for 120 seconds...")
time.sleep(120)
# refreshes the available_jobs list
available_jobs = filter_by_status(jobs)
# done
return
if __name__ == "__main__":
sys.exit(maincli()) # type: ignore