Source code for haddock.libs.libio

"""Lib I/O."""
import contextlib
import glob
import gzip
import os
import re
import stat
import tarfile
from functools import partial
from multiprocessing import Pool
from pathlib import Path

import yaml

from haddock import log
from haddock.core.typing import (
    Any,
    Callable,
    FilePath,
    Generator,
    Iterable,
    Mapping,
    Optional,
    Union,
    )
from haddock.libs.libontology import PDBFile
from haddock.libs.libutil import sort_numbered_paths


def clean_suffix(ext: str) -> str:
    """
    Remove the prefix dot of an extension if it exists.

    Parameters
    ----------
    ext : str
        The extension string.

    Examples
    --------
    >>> clean_suffix('.pdb')
    'pdb'

    >>> clean_suffix('pdb')
    'pdb'
    """
    return ext.lstrip(r".")


def dot_suffix(ext: str) -> str:
    """
    Add the dot prefix to an extension if missing.

    Parameters
    ----------
    ext : str
        The extension string.

    Examples
    --------
    >>> dot_suffix('.pdb')
    '.pdb'

    >>> dot_suffix('pdb')
    '.pdb'
    """
    return "." + clean_suffix(ext)


def read_lines(func: Callable[..., Any]) -> Callable[..., Any]:
    """
    Open the file and read lines for the decorated function.

    Send to the decorated function the lines of the file in the form
    of a list.
    """
    def wrapper(fpath: FilePath, *args: Any, **kwargs: Any) -> Any:
        lines = Path(fpath).read_text().split(os.linesep)
        return func(lines, *args, **kwargs)

    # manual wrapping for displaying documentation properly
    wrapper.original = func  # type: ignore
    wrapper.__doc__ = func.__doc__
    wrapper.__name__ = func.__name__
    return wrapper


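# Usage sketch of the decorator (`count_atom_lines` is a hypothetical
# function, not part of the library):
#
#     @read_lines
#     def count_atom_lines(lines):
#         return sum(1 for line in lines if line.startswith("ATOM"))
#
#     n = count_atom_lines("structure.pdb")  # called with a path

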
def read_from_yaml(yaml_file: FilePath) -> dict[Any, Any]:
    """
    Read a YAML file to a dictionary.

    Used internally to read HADDOCK3's default configuration files.

    Parameters
    ----------
    yaml_file : str or Path
        Path to the YAML file.

    Returns
    -------
    dict
        Always returns a dictionary.
        Returns an empty dictionary if `yaml_file` is empty.
    """
    # Check that this yaml file does not contain duplicated parameters
    check_yaml_duplicated_parameters(yaml_file)
    # Load the yaml file using the yaml lib
    with open(yaml_file, "r") as fin:
        ycfg = yaml.safe_load(fin)
    # ycfg is None if yaml_file is empty;
    # return an empty dictionary to comply with the HADDOCK workflow
    if ycfg is None:
        return {}
    assert isinstance(ycfg, dict), type(ycfg)
    return ycfg


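# Usage sketch (hypothetical file name):
#
#     params = read_from_yaml("defaults.yaml")
#     # an empty file yields {}, never None

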
def check_yaml_duplicated_parameters(yaml_fpath: str) -> None:
    """
    Make sure the provided yaml file does not contain duplicated parameters.

    Parameters
    ----------
    yaml_fpath : str
        Path to a yaml file.
    """
    # Build regular expression
    # Note: understand the behavior here -> https://regex101.com/r/AaFHp4/1
    yaml_param_regex = re.compile("^(([A-Za-z0-9]_?)+):")
    # Read content as lines
    with open(yaml_fpath, 'r') as filin:
        yaml_content = filin.readlines()
    parsed_param_names: dict[str, int] = {}
    # Loop over lines
    for i, line in enumerate(yaml_content, start=1):
        # Check if this line defines a new parameter
        if (match := yaml_param_regex.search(line)):
            # Point at the parameter name
            param_name = match.group(1)
            # Make sure this parameter has not yet been used
            assert param_name not in parsed_param_names.keys(), (
                f"Parameter '{param_name}' in {yaml_fpath} has duplicates: "
                f"l.{parsed_param_names[param_name]} and l.{i}"
                )
            # Hold the line where this parameter appears,
            # to help in case of duplication
            parsed_param_names[param_name] = i


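# For instance, a defaults file containing the following raises an
# AssertionError naming both offending lines (hypothetical content):
#
#     tolerance: 5
#     tolerance: 10

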
def open_files_to_lines(*files: FilePath) -> list[list[str]]:
    """
    Open files to lines.

    New-lines are stripped.

    Returns
    -------
    list of lists of strings
        The lines of the files.
        Input order is maintained.
    """
    f_paths = map(Path, files)
    return [f.read_text().split(os.linesep) for f in f_paths]


def save_lines_to_files(
        files: Iterable[FilePath],
        lines: Iterable[Iterable[str]],
        ) -> None:
    """
    Save a list of lists of lines to files.

    The first list of strings in `lines` will be saved in the first
    file of `files`, and so on.

    Lines are saved using the `pathlib.Path.write_text` function.

    Parameters
    ----------
    files : list
        The list of file names to save.

    lines : list of lists of str
        A list containing lists of lines that are the file contents.
        Must be synched with `files`.
    """
    for file_, content in zip(files, lines):
        Path(file_).write_text(os.linesep.join(content) + os.linesep)

    return


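# Round-trip sketch with open_files_to_lines (hypothetical names):
#
#     save_lines_to_files(["a.txt", "b.txt"], [["foo"], ["bar", "baz"]])
#     lines_a, lines_b = open_files_to_lines("a.txt", "b.txt")

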
def add_suffix_to_files(
        files: Iterable[FilePath],
        suffix: str,
        ) -> Generator[Path, None, None]:
    """
    Add a suffix to file paths.

    Yields
    ------
    pathlib.Path objects
        Exhausts when `files` exhausts.
    """
    for file_ in files:
        p = Path(file_)
        folder = p.parent
        psuffix = p.suffix
        name = p.stem + suffix + psuffix
        path = Path(folder, name)
        yield path


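# Example: the suffix lands between the stem and the extension.
#
#     list(add_suffix_to_files(["mod.pdb"], "_1"))
#     # -> [PosixPath('mod_1.pdb')]

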
def write_dic_to_file(
        data_dict: Mapping[Any, Any],
        output_fname: FilePath,
        info_header: str = "",
        sep: str = "\t",
        ) -> None:
    """
    Create a table from a dictionary.

    Parameters
    ----------
    data_dict : dict
        Dictionary to write.

    output_fname : str or Path
        Name of the output file.

    info_header : str
        Header to write before the data.

    sep : str
        Column separator. Defaults to a tab.
    """
    header = sep.join(data_dict.keys())

    if info_header:
        header = info_header + os.linesep + header

    with open(output_fname, "w") as out_fh:
        out_fh.write(header + os.linesep)
        row_l: list[str] = []
        for element in data_dict:
            value = data_dict[element]
            if isinstance(value, Path):
                row_l.append(str(value))
            elif isinstance(value, PDBFile):
                row_l.append(str(value.rel_path))
            elif isinstance(value, (int, str)):
                row_l.append(f"{value}")
            elif value is None:
                row_l.append("-")
            else:
                row_l.append(f"{value:.3f}")
        out_fh.write(sep.join(row_l) + os.linesep)


def write_nested_dic_to_file(
        data_dict: Mapping[Any, Any],
        output_fname: FilePath,
        info_header: str = "",
        sep: str = "\t",
        ) -> None:
    """
    Create a table from a nested dictionary.

    Parameters
    ----------
    data_dict : dict
        Dictionary to write.

    output_fname : str or Path
        Name of the output file.

    info_header : str
        Header to write before the data.

    sep : str
        Column separator. Defaults to a tab.

    Notes
    -----
    This function is used to write nested dictionaries of the form
    ``{int: {key: value}}``; the outer ``int`` key is discarded.
    """
    first_key = list(data_dict.keys())[0]
    header = sep.join(data_dict[first_key].keys())

    if info_header:
        header = info_header + os.linesep + header

    with open(output_fname, "w") as out_fh:
        out_fh.write(header + os.linesep)
        for row in data_dict:
            row_l: list[str] = []
            for element in data_dict[row]:
                value = data_dict[row][element]
                if isinstance(value, Path):
                    row_l.append(str(value))
                elif isinstance(value, PDBFile):
                    row_l.append(str(value.rel_path))
                elif isinstance(value, (int, str)):
                    row_l.append(f"{value}")
                elif value is None:
                    row_l.append("-")
                else:
                    row_l.append(f"{value:.3f}")
            out_fh.write(sep.join(row_l) + os.linesep)


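# Usage sketch (hypothetical data): each inner dict becomes one row;
# the outer integer keys are discarded.
#
#     data = {
#         1: {"model": Path("mod_1.pdb"), "score": -120.731},
#         2: {"model": Path("mod_2.pdb"), "score": None},
#         }
#     write_nested_dic_to_file(data, "scores.tsv")

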
# thanks to @brianjimenez
@contextlib.contextmanager
def working_directory(path: FilePath) -> Generator[None, None, None]:
    """Change the working directory and return to the previous one on exit."""
    prev_cwd = Path.cwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(prev_cwd)


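# Usage sketch (hypothetical folder):
#
#     with working_directory("run1/0_topoaa"):
#         ...  # cwd is run1/0_topoaa here
#     # the previous cwd is restored, even if the block raised

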
def compress_files_ext(
        path: FilePath,
        ext: str,
        ncores: int = 1,
        **kwargs: Any,
        ) -> bool:
    """
    Compress all files with the same extension in a folder to `.gz`.

    Does not archive the files in TAR; only compresses the files
    individually.

    Parameters
    ----------
    path : str or :external:py:class:`pathlib.Path`
        The folder containing the files.

    ext : str
        The extension of the files.

    ncores : int
        The number of cores to use for parallel compression.

    **kwargs : anything
        Arguments passed to :py:func:`gzip_files`.

    Returns
    -------
    bool
        ``True`` if files with ``ext`` were found and the compressed
        `.gz` files were created.

        ``False`` if no files with ``ext`` were found and, hence, the
        `.gz` files were not created.
    """
    files = glob_folder(path, ext)
    gzip_ready = partial(gzip_files, **kwargs)
    if files:
        with Pool(ncores) as pool:
            imap = pool.imap_unordered(gzip_ready, files)
            for _ in imap:
                pass
        return True
    return False


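# Usage sketch (hypothetical folder): gzip every ".pdb" file on four
# cores, removing the originals (kwargs are forwarded to gzip_files).
#
#     compress_files_ext("run1/1_rigidbody", "pdb", ncores=4,
#                        remove_original=True)

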
def gzip_files(
        file_: FilePath,
        block_size: Optional[int] = None,
        compresslevel: int = 9,
        remove_original: bool = False,
        ) -> None:
    """
    Gzip a file.

    Parameters
    ----------
    file_ : str or :external:py:class:`pathlib.Path`
        The path to the file to compress.

    block_size : int
        The block size to treat per cycle. Defaults to 200 MB
        (2 * 10**8 bytes).

    compresslevel : int
        The compress level. Defaults to 9.

    remove_original : bool
        Whether to delete the original file after compression.
        Defaults to ``False``.
    """
    if block_size is None:
        block_size = 2 * 10**8

    gfile = str(file_) + ".gz"
    with open(file_, "rb") as fin, gzip.open(
            gfile, mode="wb", compresslevel=compresslevel) as gout:
        content = fin.read(block_size)  # read the first block
        while content:
            gout.write(content)
            content = fin.read(block_size)

    if remove_original:
        Path(file_).unlink()


def archive_files_ext(path: FilePath, ext: str, compresslevel: int = 9) -> bool:
    """
    Archive all files with the same extension in a folder.

    Parameters
    ----------
    path : str or :external:py:class:`pathlib.Path`
        The folder containing the files.

    ext : str
        The extension of the files.

    compresslevel : int
        The compression level.

    Returns
    -------
    bool
        ``True`` if files with ``ext`` were found and the `.tgz`
        archive was created.

        ``False`` if no files with ``ext`` were found and, hence, the
        archive was not created.
    """
    ext = clean_suffix(ext)
    files = glob_folder(path, ext)

    if files:
        with tarfile.open(
                Path(path, f"{ext}.tgz"),
                mode="w:gz",
                compresslevel=compresslevel,
                ) as tarout:
            for file_ in files:
                tarout.add(file_, arcname=file_.name)

        return True
    return False


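# Usage sketch (hypothetical folder): bundles all ".out" files of the
# folder into an "out.tgz" archive inside that same folder.
#
#     archive_files_ext("run1/1_rigidbody", ".out")

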
def glob_folder(folder: FilePath, ext: str) -> list[Path]:
    """
    List files with extension `ext` in `folder`.

    Does NOT perform a recursive search.

    Parameters
    ----------
    folder : str
        The path to the folder to investigate.

    ext : str
        The file extension. Can be with or without the dot ``.``
        prefix.

    Returns
    -------
    list of Path objects
        SORTED list of matching results.
    """
    ext = f"*{dot_suffix(ext)}"
    files = glob.glob(str(Path(folder, ext)))
    return sort_numbered_paths(*(Path(file) for file in files))


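# Example: both spellings of the extension are accepted.
#
#     glob_folder("run1", "pdb")
#     glob_folder("run1", ".pdb")  # same result

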
def remove_files_with_ext(folder: FilePath, ext: str) -> None:
    """
    Remove files with ``ext`` in a folder.

    Parameters
    ----------
    folder : str
        The path to the folder.

    ext : str
        The extension of the files to delete. Can be with or without
        the dot ``.`` prefix.
    """
    files = sort_numbered_paths(*glob_folder(folder, ext))
    # if there are no files, the for loop won't run.
    for file_ in files:
        log.debug(f"removing: {file_}")
        file_.unlink()


def folder_exists(
        path: FilePath,
        exception: type[Exception] = ValueError,
        emsg: str = "The folder {!r} does not exist or is not a folder.",
        ) -> Path:
    """
    Assert that a folder exists.

    Parameters
    ----------
    path : str or pathlib.Path
        The path to the folder.

    exception : Exception
        The Exception to raise in case `path` is not a folder or does
        not exist.

    emsg : str
        The error message to give to `exception`. May accept
        formatting to pass `path`.

    Returns
    -------
    pathlib.Path
        The Path representation of the input ``path`` if the condition
        is true.

    Raises
    ------
    Exception
        The given ``exception`` if the folder does not exist or is not
        a folder.
    """
    p = Path(path)

    valid = [p.exists, p.is_dir]

    if all(f() for f in valid):
        return p

    # don't change to f-strings, .format has a purpose
    raise exception(emsg.format(str(path)))


def file_exists(
        path: FilePath,
        exception: type[Exception] = ValueError,
        emsg: str = "`path` is not a file or does not exist",
        ) -> Path:
    """
    Assert that a file exists.

    Parameters
    ----------
    path : str or pathlib.Path
        The file path.

    exception : Exception
        The Exception to raise in case `path` is not a file or does
        not exist.

    emsg : str
        The error message to give to `exception`. May accept
        formatting to pass `path`.

    Returns
    -------
    pathlib.Path
        The Path representation of the input ``path`` if the condition
        is true.

    Raises
    ------
    Exception
        The given ``exception`` if the file does not exist or is not
        a file.
    """
    p = Path(path)

    valid = [p.exists, p.is_file]

    if all(f() for f in valid):
        return p

    # don't change to f-strings, .format has a purpose
    raise exception(emsg.format(str(path)))


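# Usage sketch with a custom exception (hypothetical path):
#
#     config = file_exists(
#         "run.cfg",
#         exception=FileNotFoundError,
#         emsg="Missing configuration file {!r}.",
#         )

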
def pdb_path_exists(pdb_path: Path) -> tuple[bool, Optional[str]]:
    """
    Check if a pdb path exists.

    If it does not, check for the existence of a gzipped pdb file and
    inform the user that the file is gzipped.

    Parameters
    ----------
    pdb_path : pathlib.Path
        Path to the pdb.

    Returns
    -------
    exists : bool
        True if the file exists.

    msg : str or None
        The error message.
    """
    exists, msg = True, None
    if not pdb_path.exists():
        msg = f"PDB file {pdb_path} not found."
        gz_pdb_path = pdb_path.with_suffix(pdb_path.suffix + ".gz")
        if gz_pdb_path.exists():
            msg += f" A compressed file ({gz_pdb_path}) exists though."
            msg += " Use haddock3-unpack to unpack the run."
        exists = False
    return exists, msg


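# Usage sketch (hypothetical path):
#
#     exists, msg = pdb_path_exists(Path("mod_1.pdb"))
#     if not exists:
#         log.error(msg)

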
def get_perm(fname: FilePath) -> int:
    """Get the permissions of a file."""
    # https://stackoverflow.com/questions/6874970
    return stat.S_IMODE(os.lstat(fname)[stat.ST_MODE])


def make_writeable_recursive(path: FilePath) -> None:
    """
    Add write permission to a folder, its subfolders, and their files.

    Parameters
    ----------
    path : str or Path
        The path to which to add write permissions.
    """
    # https://stackoverflow.com/questions/6874970
    for root, dirs, files in os.walk(path, topdown=False):
        for dir_ in (os.path.join(root, d) for d in dirs):
            os.chmod(dir_, get_perm(dir_) | stat.S_IWUSR)
        for file_ in (os.path.join(root, f) for f in files):
            os.chmod(file_, get_perm(file_) | stat.S_IWUSR)


def extract_files_flat(
        tar_path: Union[str, FilePath],
        dest_path: Union[str, FilePath],
        ) -> None:
    """
    Extract files from a tarball to a destination folder.

    The directory structure inside the tarball is flattened.

    Parameters
    ----------
    tar_path : str or Path
        The path to the tarball file.

    dest_path : str or Path
        The path to the destination folder.
    """
    with tarfile.open(tar_path, "r:gz") as tar:
        for member in tar.getmembers():
            # Extract only files (skip directories)
            if member.isfile():
                # Modify the member name to remove the directory
                # structure
                member.name = os.path.basename(member.name)
                tar.extract(member, dest_path)
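

# Usage sketch (hypothetical names): a file stored as "pdb/mod_1.pdb"
# inside the tarball lands directly in "unpacked/" as "mod_1.pdb".
#
#     extract_files_flat("pdb.tgz", "unpacked")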