Source code for haddock.gear.preprocessing

# TODO:
# 1. single_ions may contain PO4 - verify
# 2. the single_ions_elements map seems not needed
"""
Process input PDB files to ensure compatibility with HADDOCK3.

This module checks and modifies PDB files for compatibility with
HADDOCK3. There are three types of checks/modifications:

1. Performed on each PDB line-by-line, in the same fashion as ``pdb-tools``.
   In fact, this step mostly uses the ``pdb-tools`` package.
2. Performed on each PDB as a whole.
3. Performed on all PDBs together.

Main functions
--------------

* :py:func:`process_pdbs`
* :py:func:`read_additional_residues`

Corrections performed on 1)
---------------------------

The following actions are performed sequentially over all PDBs:

#. from ``pdb-tools``: ``pdb_keepcoord``
#. from ``pdb-tools``: ``pdb_tidy`` with ``strict=True``
#. from ``pdb-tools``: ``pdb_element``
#. from ``pdb-tools``: ``pdb_selaltloc``
#. from ``pdb-tools``: ``pdb_occ`` with ``occupancy=1.00``
#. replace ``MSE`` to ``MET``
#. replace ``HSD`` to ``HIS``
#. replace ``HSE`` to ``HIS``
#. replace ``HID`` to ``HIS``
#. replace ``HIE`` to ``HIS``
#. add_charges_to_ions, see :py:func:`add_charges_to_ions`
#. convert ``ATOM`` to ``HETATM`` for those atoms that should be ``HETATM``.
   Considers the additional residues provided by the user.
   See :py:func:`convert_ATOM_to_HETATM`.
#. convert ``HETATM`` to ``ATOM`` for those atoms that should be ``ATOM``.
   See :py:func:`convert_HETATM_to_ATOM`.
#. from ``pdb-tools``: ``pdb_fixinsert``, with ``option_list=[]``.
#. remove unsupported ``HETATM``. Considers residues provided by the user.
#. remove unsupported ``ATOM``. Considers residues provided by the user.
#. from ``pdb-tools``: ``pdb_reatom``, start from ``1``.
#. from ``pdb-tools``: ``pdb_tidy`` (non-strict)

Corrections performed on 2)
---------------------------

The following actions are performed sequentially for each PDB:

* :py:func:`models_should_have_the_same_labels`
* :py:func:`solve_no_chainID_no_segID`
* :py:func:`homogenize_chains`

Read the documentation of the above functions for details what they do.

Corrections performed on 3)
---------------------------

The following actions are performed to all PDBs together:

* :py:func:`correct_equal_chain_segids`

Read the documentation of the above functions for details what they do.

When it happens
---------------

The PDB processing step is performed by default when reading the input
molecules and copying them to the `data/` folder inside the run
directory. When PDBs are processed, a copy of the original input PDBs is
also stored in the `data/` folder.

To deactivate this initial PDB processing, set ``skip_preprocess = False``
in the general parameters of the configuration file.

Additional information
----------------------

If you are a developer and want to read more about the history of this
preprocessing module, visit:

https://github.com/haddocking/haddock3/projects/16
"""
import io
import itertools as it
import re
import string
from functools import partial, wraps
from os import linesep
from pathlib import Path

from pdbtools import (
    pdb_chain,
    pdb_chainxseg,
    pdb_element,
    pdb_fixinsert,
    pdb_keepcoord,
    pdb_occ,
    pdb_reatom,
    pdb_rplresname,
    pdb_segxchain,
    pdb_selaltloc,
    pdb_shiftres,
    pdb_tidy,
)

from haddock import log
from haddock.core.exceptions import HaddockError
from haddock.core.supported_molecules import (
    supported_ATOM,
    supported_HETATM,
    supported_non_ions,
    supported_single_ions_atoms_map,
    supported_single_ions_resnames_map,
)
from haddock.core.typing import (
    Any,
    Callable,
    Container,
    Generator,
    Iterable,
    LineIterSource,
    Optional,
    Union,
)
from haddock.libs.libfunc import chainf
from haddock.libs.libio import read_lines
from haddock.libs.libpdb import (
    format_atom_name,
    read_chainids,
    read_segids,
    slc_charge,
    slc_element,
    slc_name,
    slc_resname,
)


# Pool of single-character identifiers available for chain IDs and segIDs:
# the upper-case letters come first, followed by the lower-case ones.
_ascii_letters = [*string.ascii_uppercase, *string.ascii_lowercase]
# Endless iterator over the pool above; it starts again once exhausted.
_CHAINS = it.cycle(_ascii_letters)


class ModelsDifferError(HaddockError):
    """
    Signal that the MODELs of a PDB differ in their atom labels.

    Raised by :py:func:`models_should_have_the_same_labels`.
    """
def _report(log_msg: str) -> Callable[..., Any]:
    """
    Add report functionality to the function (decorator).

    Functions decorated with `_report` log the difference between the
    input and the output. Decorated functions gain an additional boolean
    parameter `report` to activate or deactivate the report
    functionality; defaults to ``False``.

    Note that a generator decorated with ``_report`` no longer behaves
    as a generator if ``report=True`` is given. Instead, it returns a
    list from the exhausted generator.

    **Important:** Do NOT use ``_report`` with infinite generators,
    such as ``itertools.cycle``.

    Parameters
    ----------
    log_msg : str
        Title of the log entry. It is formatted with the positional
        arguments and the keyword-argument values given to the decorated
        function (the lines themselves excluded).
    """

    def decorator(function: Callable[..., Any]) -> Callable[..., Any]:
        @wraps(function)
        def wrapper(
            lines: Iterable[Any], *args: Any, report: bool = False, **kwargs: Any
        ) -> Any:
            # If report=False, maintain the original behaviour
            if not report:
                return function(lines, *args, **kwargs)

            in_lines = list(lines)
            result = list(function(in_lines, *args, **kwargs))

            # Here we could use sets to increase speed, but for the size
            # of the systems, we can actually use lists and get a sorted
            # result by default. difflib from the standard library was
            # tried and found too slow.
            additions = [line for line in result if line not in in_lines]
            deletions = [line for line in in_lines if line not in result]

            la = len(additions)
            ld = len(deletions)

            add_lines = linesep.join(f"+ {line}" for line in additions)
            del_lines = linesep.join(f"- {line}" for line in deletions)

            log_msg_ = log_msg.format(*args, *kwargs.values())
            extended_log = (
                f"[{log_msg_}] + {la} - {ld} lines",
                add_lines,
                del_lines,
            )

            log.info(linesep.join(extended_log))
            return result

        return wrapper

    return decorator


def _open_or_give(inputdata: Iterable[LineIterSource]) -> list[list[str]]:
    """
    Adapt input to the functions.

    Used in py:func:`process_pdbs`.

    Homogenizes input by:

    * removing new line characters at the end of the line
    * removing empty lines

    Parameters
    ----------
    inputdata : list
        A **flat** list where in each index it can contain:

        * file objects
        * paths to files
        * strings representing paths
        * lists or tuples of lines

        The above types can be mixed in the input list.

        Files are read to lines in a list. Line separators are stripped.
        Do not provide nested lists with lists containing paths inside
        lists.

    Returns
    -------
    list of list of strings
        Each sublist has the contents of the input in the same order.

    Raises
    ------
    TypeError
        In any other circumstances.
    """

    def get_line(lines: Iterable[str]) -> list[str]:
        """Strip line terminators and ignore empty lines."""
        # Strip BEFORE filtering: lines coming from ``readlines()`` still
        # carry their terminator, so a blank line is "\n" (truthy) and
        # would otherwise survive as an empty string.
        # Both "\r" and "\n" are stripped so CRLF content is handled on
        # every platform.
        stripped = (line.rstrip("\r\n") for line in lines)
        return [line for line in stripped if line]

    lines: list[list[str]] = []
    for idata in inputdata:
        if isinstance(idata, (Path, str)):
            # ``read_text`` opens the file in text mode, which translates
            # every line ending to "\n"; splitting on ``os.linesep`` would
            # fail to split anything where ``linesep`` is "\r\n".
            lines.append(get_line(Path(idata).read_text().split("\n")))

        elif isinstance(idata, io.TextIOBase):
            lines.append(get_line(idata.readlines()))

        elif isinstance(idata, (list, tuple)):
            lines.append(get_line(idata))

        else:
            emsg = f"Unexpected type in `inputdata`: {type(idata)}"
            raise TypeError(emsg)

    return lines


@read_lines
def read_additional_residues(
    lines: Iterable[str], *ignore: Any, **everything: Any
) -> tuple[str, ...]:
    """
    Read additional residues listed in a ``*.top`` filename.

    Expects new residues to be defined as::

        RESIdue XXX
        RESI XXX
        residue XXX

    where, XXX is the new residue name.

    Does not read ATOM or charge information. Reads only the residue
    name.

    Examples
    --------
    Read directly the file:

    >>> read_additional_residues(fpath)

    Read the lines instead:

    >>> lines = Path(fpath).read_text().split(os.linesep)
    >>> read_additional_residues.original(lines)

    Parameters
    ----------
    fpath : str or pathlib.Path
        The path to the file.

    lines : list of lines
        You can also use this function in the form of
        ``read_additional_residues.original(...)`` and directly give it
        a list containing the lines of the file.

    Returns
    -------
    tuple
        A tuple with the new identified residues names.
    """
    # https://regex101.com/r/1H44kO/1
    res_regex = re.compile(r"^(RESIdue|residue|RESI) ([A-Z0-9]{1,3}).*$")

    residues: list[str] = []
    for line in map(str.strip, lines):
        name = res_regex.findall(line)
        if name:
            # findall returns [(keyword, residue_name)]; keep the name only
            residues.append(name[0][1])

    return tuple(residues)
def process_pdbs(
    *inputdata: LineIterSource,
    dry: bool = False,
    user_supported_residues: Optional[Iterable[str]] = None,
) -> list[list[str]]:
    """
    Process PDB file contents for compatibility with HADDOCK3.

    Parameters
    ----------
    inputdata : list of (str, path, list of str [lines], file handler)
        A **flat** list where in each index it can contain:

        * file objects
        * paths to files
        * strings representing paths
        * lists or tuples of lines

        The above types can be mixed in the input list.

        Files are read to lines in a list. Line separators are stripped.
        Do not provide nested lists with lists containing paths inside
        lists.

    dry : bool
        Passed as ``report=dry`` (through ``chainf``) to the line-by-line
        and to the whole-PDB processing steps, which are decorated with
        ``_report``. The processed lines are computed and returned
        regardless of this value.

    user_supported_residues : list, tuple, or set
        The new residues that are allowed. Any iterable is accepted,
        including one-shot iterators.

    Returns
    -------
    list of (list of str)
        The corrected (processed) PDB content in the same order as
        ``inputdata``.
    """
    structures = _open_or_give(inputdata)

    # The user residues are needed by two different steps. Materialize
    # them once so that a one-shot iterable (e.g. a generator) is not
    # already exhausted when the second step reads it.
    user_residues: set[str] = set(user_supported_residues or ())

    # these are the processing or checking functions that should (if needed)
    # modify the input PDB and return the corrected lines.
    # Follows the same style as for pdb-tools.
    # these functions yield line-by-line.
    line_by_line_processing_steps = [
        wrep_pdb_keepcoord,  # also discards ANISOU
        # tidy is important before some other corrections
        wrep_pdb_tidy_strict,
        wrep_pdb_element,
        wrep_pdb_selaltloc,
        partial(wrep_pdb_occ, occupancy=1.00),
        replace_MSE_to_MET,
        replace_HSD_to_HIS,
        replace_HSE_to_HIS,
        replace_HID_to_HIS,
        replace_HIE_to_HIS,
        add_charges_to_ions,
        partial(
            convert_ATOM_to_HETATM,
            residues=set.union(supported_HETATM, user_residues),
        ),
        convert_HETATM_to_ATOM,
        partial(wrep_pdb_fixinsert, option_list=[]),
        partial(remove_unsupported_hetatm, user_defined=user_residues),
        partial(remove_unsupported_atom),
        # partial(wrep_pdb_shiftres, shifting_factor=0),
        partial(wrep_pdb_reatom, starting_value=1),
        wrep_pdb_tidy,
        wrep_rstrip,
    ]

    # these functions take the whole PDB content, evaluate it, and
    # modify it if needed.
    whole_pdb_processing_steps = [
        models_should_have_the_same_labels,
        solve_no_chainID_no_segID,
        homogenize_chains,
    ]

    # these functions take all structures combined, evaluate them
    # together, and modify them if needed.
    processed_combined_steps = [
        correct_equal_chain_segids,
    ]

    # START THE ACTUAL PROCESSING

    # individual processing (line-by-line)
    result_1 = [
        list(chainf(structure, *line_by_line_processing_steps, report=dry))
        for structure in structures
    ]

    # whole structure processing
    result_2 = [
        list(chainf(structure, *whole_pdb_processing_steps, report=dry))
        for structure in result_1
    ]

    # combined processing
    final_result = chainf(result_2, *processed_combined_steps)

    return final_result
# Functions operating line-by-line

# Make the pdb-tools functions reportable: each wrapper accepts the extra
# ``report`` keyword added by ``_report``, and the string is the title used
# in the corresponding log entry.
wrep_pdb_chain = _report("pdb_chain")(pdb_chain.run)
wrep_pdb_chainxseg = _report("pdb_chainxseg")(pdb_chainxseg.run)
wrep_pdb_element = _report("pdb_element")(pdb_element.run)
wrep_pdb_fixinsert = _report("pdb_fixinsert")(pdb_fixinsert.run)
wrep_pdb_keepcoord = _report("pdb_keepcoord")(pdb_keepcoord.run)
wrep_pdb_occ = _report("pdb_occ")(pdb_occ.run)
wrep_pdb_reatom = _report("pdb_reatom")(pdb_reatom.run)
wrep_pdb_shiftres = _report("pdb_shiftres")(pdb_shiftres.run)
wrep_pdb_rplresname = _report("pdb_rplresname")(pdb_rplresname.run)
wrep_pdb_segxchain = _report("pdb_segxchain")(pdb_segxchain.run)
wrep_pdb_selaltloc = _report("pdb_selaltloc")(pdb_selaltloc.run)
wrep_pdb_tidy = _report("pdb_tidy")(pdb_tidy.run)
wrep_pdb_tidy_strict = _report("pdb_tidy")(partial(pdb_tidy.run, strict=True))
# strips the line separator from the end of every line
wrep_rstrip = _report("str.rstrip")(
    partial(map, lambda x: x.rstrip(linesep))
)  # noqa: E501
@_report("Replacing HETATM to ATOM for residue {!r}")
def replace_HETATM_to_ATOM(
    fhandler: Iterable[str], res: str
) -> Generator[str, None, None]:
    """
    Replace record `HETATM` to `ATOM` for `res`.

    Do not alter other lines.

    Parameters
    ----------
    fhandler : file handler or list of lines
        List-like of file lines. Consumes over a ``for`` loop.

    res : str
        Residue name to match for the substitution.

    Yields
    ------
    str
        Yield line-by-line.
    """
    for line in fhandler:
        if line.startswith("HETATM") and line[slc_resname].strip() == res:
            # The record name fills the first 6 columns of a PDB line.
            # "ATOM" is padded to that width so every following column
            # keeps its position.
            yield "ATOM  " + line[6:]
        else:
            yield line
@_report("Replace residue ATOM/HETATM {!r} to ATOM {!r}")
def replace_residue(
    fhandler: Iterable[str], resin: str, resout: str
) -> Generator[str, None, None]:
    """
    Rename a residue, turning its ``HETATM`` records into ``ATOM``.

    Lines that do not belong to ``resin`` are passed along untouched.

    Parameters
    ----------
    fhandler : file handler or list of lines
        List-like of file lines. Consumes over a ``for`` loop.

    resin : str
        Name of the residue to look for.

    resout : str
        Name given to the residues that matched ``resin``.

    Yields
    ------
    str
        Yield line-by-line.

    See Also
    --------
    * :py:func:`replace_HETATM_to_ATOM`
    * ``pdb_rplresname`` from ``pdb-tools``
    """
    # first stage: HETATM -> ATOM for the residue of interest
    as_atom_records = replace_HETATM_to_ATOM(fhandler, res=resin)
    # second stage: rename the residue itself
    yield from pdb_rplresname.run(
        as_atom_records,
        name_from=resin,
        name_to=resout,
    )
# Ready-made residue replacements used during the line-by-line processing.
# Each one is :py:func:`replace_residue` with ``resin``/``resout`` filled in.

replace_MSE_to_MET = partial(
    replace_residue,
    resin="MSE",
    resout="MET",
)
"""
Rename ``MSE`` residues to ``MET``.

See Also
--------
* :py:func:`replace_residue`
"""

replace_HSD_to_HIS = partial(
    replace_residue,
    resin="HSD",
    resout="HIS",
)
"""
Rename ``HSD`` residues to ``HIS``.

See Also
--------
* :py:func:`replace_residue`
"""

replace_HSE_to_HIS = partial(
    replace_residue,
    resin="HSE",
    resout="HIS",
)
"""
Rename ``HSE`` residues to ``HIS``.

See Also
--------
* :py:func:`replace_residue`
"""

replace_HID_to_HIS = partial(
    replace_residue,
    resin="HID",
    resout="HIS",
)
"""
Rename ``HID`` residues to ``HIS``.

See Also
--------
* :py:func:`replace_residue`
"""

replace_HIE_to_HIS = partial(
    replace_residue,
    resin="HIE",
    resout="HIS",
)
"""
Rename ``HIE`` residues to ``HIS``.

See Also
--------
* :py:func:`replace_residue`
"""
@_report("Remove unsupported molecules")
def remove_unsupported_molecules(
    lines: Iterable[str],
    haddock3_defined: Optional[set[str]] = None,
    user_defined: Optional[set[str]] = None,
    line_startswith: Union[str, tuple[str, ...]] = ("ATOM", "HETATM"),
) -> Generator[str, None, None]:
    """
    Drop the lines of residues that HADDOCK3 does not support.

    This is the generic implementation: the caller decides which
    residues are supported by passing them in ``haddock3_defined`` and
    ``user_defined``. Among the lines selected by ``line_startswith``,
    only those whose residue name is in one of the two sets are kept.
    Every other line is yielded unmodified.

    Parameters
    ----------
    lines : list or list-like
        Lines of the PDB file. They are consumed over a ``for`` loop;
        mind it if you use a generator.

    haddock3_defined : set
        Set of residues supported by HADDOCK3. Defaults to ``None``,
        which is treated as an empty set.

    user_defined : set
        An additional set of allowed residues given by the user.
        Defaults to ``None``, which is treated as an empty set.

    line_startswith : str or tuple
        The lines to consider. Defaults to ``("ATOM", "HETATM")``.

    Yields
    ------
    line : str
        Line-by-line. Lines for residues not supported are *not* yielded.

    See Also
    --------
    Other functions use this function to create context.

    * :py:func:`remove_unsupported_atom`
    * :py:func:`remove_unsupported_hetatm`
    """
    allowed = set.union(haddock3_defined or set(), user_defined or set())

    for line in lines:
        is_candidate = line.startswith(line_startswith)
        if is_candidate and line[slc_resname].strip() not in allowed:
            # TODO: find a way to report the residues dropped here
            continue
        yield line
# Specializations of `remove_unsupported_molecules`, one per record type.

remove_unsupported_hetatm = partial(
    remove_unsupported_molecules,
    haddock3_defined=supported_HETATM,
    line_startswith="HETATM",
)
"""
Drop ``HETATM`` lines whose residue is not supported.

This is :py:func:`remove_unsupported_molecules` with ``haddock3_defined``
set to ``supported_HETATM`` and ``line_startswith`` set to ``"HETATM"``.

See Also
--------
* :py:func:`remove_unsupported_atom`
"""

remove_unsupported_atom = partial(
    remove_unsupported_molecules,
    haddock3_defined=supported_ATOM,
    line_startswith="ATOM",
)
"""
Drop ``ATOM`` lines whose residue is not supported.

This is :py:func:`remove_unsupported_molecules` with ``haddock3_defined``
set to ``supported_ATOM`` and ``line_startswith`` set to ``"ATOM"``.

See Also
--------
* :py:func:`remove_unsupported_hetatm`
"""
@_report("Add charges to ions.")
def add_charges_to_ions(fhandler: Iterable[str]) -> Generator[str, None, None]:
    """
    Add charges to ions according to HADDOCK3 specifications.

    For every ``ATOM``, ``ANISOU`` or ``HETATM`` line whose residue is
    not in ``supported_non_ions``:

    1. If the atom name ends with a digit, the line is rebuilt from the
       atom name.
    2. Otherwise, if the residue name ends with a digit, the line is
       rebuilt from the residue name.
    3. Otherwise, if the element is defined and the charge ends with a
       digit, the line is rebuilt from element and charge.
    4. If none of the fields carries charge information, the three
       corrections above are tried in order (atom, residue name,
       element/charge) and the first that succeeds is used.

    Whenever the selected correction(s) cannot be applied (for example,
    the name is not a known ion), the line is yielded unmodified.

    The result might need manual edit in case the atom has an
    unconventional charge.

    Parameters
    ----------
    fhandler : file-handler, list, or list-like
        Lines of the PDB file. This function will consume lines over a
        ``for`` loop; mind it if you use a generator.

    Yields
    ------
    line : str
        Line-by-line: modified ion lines and any other line.
    """
    # correction functions, by order of preference, used when no field
    # carries explicit charge information.
    ion_correction_cases = (
        _process_ion_case_atom,
        _process_ion_case_resname,
        _process_ion_case_element_charge,
    )

    for line in fhandler:
        if not line.startswith(("ATOM", "ANISOU", "HETATM")):
            yield line  # lines that are not atoms
            continue

        # get values
        atom = line[slc_name].strip()  # max 4 chars
        resname = line[slc_resname].strip()  # max 3 chars
        element = line[slc_element].strip()  # max 2 chars
        charge = line[slc_charge].strip()  # max 2 chars

        if resname in supported_non_ions:
            yield line
            continue

        # Which of the above fields has information on the charge?
        # If more than one have information on the charge, we give
        # the preference to the atom, resname, and then charge.
        # ``[-1:]`` is used instead of ``[-1]`` so that blank fields give
        # an empty string instead of raising IndexError.
        candidates: tuple[Callable[[str], str], ...]
        if atom[-1:].isdigit():
            candidates = (_process_ion_case_atom,)
        elif resname[-1:].isdigit():
            candidates = (_process_ion_case_resname,)
        elif element and charge[-1:].isdigit():
            candidates = (_process_ion_case_element_charge,)
        else:
            # none of the fields has information on the charge
            candidates = ion_correction_cases

        for func in candidates:
            # Only the correction itself is guarded; the ``yield`` is kept
            # outside of the ``try``. The helpers signal "does not apply"
            # through several exception types (map lookups, explicit
            # ValueError), hence the broad ``except``.
            try:
                new_line = func(line)
            except Exception:
                # test the next function
                continue
            yield new_line
            break  # done
        else:
            # No correction applies: the line does not concern a known
            # ion. Leave it for the later steps that remove unsupported
            # residues, instead of aborting the whole processing.
            yield line
def _process_ion_case_resname(line: str) -> str:
    """
    Process ion information based on resnames.

    case 1: charge is correctly defined in resname, for example, ZN2.
    In this case, ignore other fields and write ion information from
    scratch even if it's already correct.

    Parameters
    ----------
    line : str
        A PDB coordinate line.

    Returns
    -------
    str
        The line rebuilt up to, and including, the charge columns.

    Raises
    ------
    KeyError
        If the residue name is not in
        ``supported_single_ions_resnames_map``.
    """
    resname = line[slc_resname].strip()  # max 3 chars
    ion = supported_single_ions_resnames_map[resname]
    new_atom = ion.atoms[0]
    new_element = ion.elements[0]
    # The charge field is two columns wide; use a two-character blank
    # when the atom name carries no charge so the record keeps its width.
    charge = new_atom[-2:] if len(new_atom) > 2 else "  "

    new_line = (
        line[:12]
        + format_atom_name(new_atom, new_element)
        + line[16]
        + resname.rjust(3, " ")
        + line[20:76]
        + new_element.rjust(2, " ")
        + charge
    )
    return new_line


def _process_ion_case_atom(line: str) -> str:
    """
    Process ion information based on atom names.

    case 2: charge is correctly defined in atom name; ignore other
    fields and write them from scratch even if they are already correct.

    Parameters
    ----------
    line : str
        A PDB coordinate line.

    Returns
    -------
    str
        The line rebuilt up to, and including, the charge columns.

    Raises
    ------
    ValueError
        If the element of the line is ``C``.
    KeyError
        If the atom name is not in ``supported_single_ions_atoms_map``.
    """
    element = line[slc_element].strip()  # max 2 chars
    if element == "C":
        # element C can have atom name CA which conflicts carbon alpha
        # and calcium
        raise ValueError("Element is 'C', does not apply to this case.")

    atom = line[slc_name].strip()  # max 4 chars
    ion = supported_single_ions_atoms_map[atom]
    new_resname = ion.resname
    new_element = ion.elements[0]
    # two-character blank: the charge field is two columns wide
    charge = atom[-2:] if len(atom) > 2 else "  "

    new_line = (
        line[:12]
        + format_atom_name(atom, new_element)
        + line[16]
        + new_resname.rjust(3, " ")
        + line[20:76]
        + new_element.rjust(2, " ")
        + charge
    )
    return new_line


def _process_ion_case_element_charge(line: str) -> str:
    """
    Process ion information based on element and charge.

    case 3: charge is correctly defined in the element and charge
    columns; the atom name is built by joining both, and the other
    fields are written from scratch even if they are already correct.

    Parameters
    ----------
    line : str
        A PDB coordinate line.

    Returns
    -------
    str
        The line rebuilt up to, and including, the charge columns.

    Raises
    ------
    KeyError
        If element + charge is not in ``supported_single_ions_atoms_map``.
    """
    element = line[slc_element].strip()  # max 2 chars
    charge = line[slc_charge].strip()  # max 2 chars
    atom = element + charge
    new_resname = supported_single_ions_atoms_map[atom].resname

    new_line = (
        line[:12]
        + format_atom_name(atom, element)
        + line[16]
        + new_resname.rjust(3, " ")
        + line[20:76]
        + element.rjust(2, " ")
        + charge
    )
    return new_line
@_report("Convert record: {!r} to {!r}.")
def convert_record(
    fhandler: Iterable[str],
    record: str,
    other_record: str,
    residues: Container[str],
) -> Generator[str, None, None]:
    """
    Convert one record to another for specified residues.

    For example, replace ``ATOM`` by ``HETATM`` for specific residues.
    Lines of other records, or of residues not in ``residues``, are
    yielded unmodified.

    Parameters
    ----------
    fhandler : list-like
        Contains lines of file.

    record : str
        The PDB RECORD to match; for example, ``ATOM`` or ``HETATM``.

    other_record : str
        The PDB RECORD to replace with; for example, ``ATOM`` or
        ``HETATM``. Names shorter than six characters are padded with
        spaces on the right.

    residues : list, tuple, or set
        List of residues to replace the record.

    Yields
    ------
    str
        Yield line-by-line.
    """
    # The record name fills the first 6 columns of the line. Padding the
    # replacement to that width keeps every following column in place,
    # whatever the length of the given name ("ATOM" vs "HETATM").
    new_record = other_record.ljust(6)

    for line in fhandler:
        if line.startswith(record):
            resname = line[slc_resname].strip()
            if resname in residues:
                yield new_record + line[6:]
                continue
        yield line
convert_ATOM_to_HETATM = partial(
    convert_record,
    record="ATOM",
    other_record="HETATM",
    residues=supported_HETATM,
)
"""
Convert ``ATOM`` to ``HETATM`` for HADDOCK3 supported ``HETATM``.

See Also
--------
* :py:data:`haddock.core.supported_molecules.supported_HETATM`
"""

# ``other_record`` is "ATOM" followed by two spaces: the record name must
# fill the same six columns as the "HETATM" it replaces.
convert_HETATM_to_ATOM = partial(
    convert_record,
    record="HETATM",
    other_record="ATOM  ",
    residues=supported_ATOM,
)
"""
Convert ``HETATM`` to ``ATOM`` for HADDOCK3 supported ``ATOM``.

See Also
--------
* :py:data:`haddock.core.supported_molecules.supported_ATOM`
"""


# Functions operating in the whole PDB
@_report("Solving chain/seg ID issues.")
def solve_no_chainID_no_segID(lines: Iterable[str]) -> Iterable[str]:
    """
    Make chainIDs and segIDs consistent with each other.

    * only chainIDs present: they are copied over the segIDs;
    * only segIDs present: they are copied over the chainIDs;
    * none present: the next character of the module-level pool
      (``_CHAINS``) is set as chainID and copied over the segID;
    * both present but different: chainIDs are copied over the segIDs;
    * both present and equal: nothing is done.

    Parameters
    ----------
    lines : list of str
        The lines of a PDB file. They are read more than once, so this
        should not be a one-shot iterator.

    Returns
    -------
    list
        With new lines. Or the input ones if no modification was made.
    """
    chainids = read_chainids(lines)
    segids = read_segids(lines)

    if chainids:
        if segids and chainids == segids:
            # both defined and in agreement: nothing to do
            return lines
        # segIDs are missing or disagree: chains take priority
        fixed = pdb_chainxseg.run(lines)
    elif segids:
        fixed = pdb_segxchain.run(lines)
    else:
        with_chain = pdb_chain.run(lines, next(_CHAINS))
        fixed = pdb_chainxseg.run(with_chain)

    return list(fixed)
# TODO: needs to become an option.
# Change chain ID and shift the residues of the other chains.
# Also needs to be in sync with the restraints.
# Maybe not automatic... maybe used as a CLI.
# Maybe this is a place to have a CHECK instead of an autoprocess.
@_report("Homogenizes chains")
def homogenize_chains(lines: list[str]) -> list[str]:
    """
    Give a single chainID to the whole PDB.

    When more than one distinct chain identifier is found, the first
    one read is applied to every line, and the chainIDs are then copied
    to the segIDs. Otherwise the input is returned as is.

    Parameters
    ----------
    lines : list of str
        The lines of a PDB file.

    Returns
    -------
    list
        The modified lines.
    """
    chainids = read_chainids(lines)
    distinct_chains = set(chainids)

    if len(distinct_chains) <= 1:
        return lines

    apply_first_chain = partial(pdb_chain.run, chain_id=chainids[0])
    homogenized = chainf(lines, apply_first_chain, pdb_chainxseg.run)
    return list(homogenized)
# Functions operating in all PDBs at once
def correct_equal_chain_segids(structures: list[list[str]]) -> list[list[str]]:
    """
    Give a new chainID to structures that repeat the chainIDs of another.

    The first structure with a given set of chainIDs is kept as is. Any
    later structure with the same chainIDs receives a character that is
    not used by any of the input structures, taken in sorted order from
    the module-level pool of letters. The new chainID is copied to the
    segID as well.

    Parameters
    ----------
    structures : list of lists of str
        The input data.

    Returns
    -------
    list of lists of str
        The new structures.

    Raises
    ------
    AssertionError
        If the number of output structures differs from the input.
    """
    # every chain ID present in the input PDBs
    used_ids = set(it.chain.from_iterable(read_chainids(s) for s in structures))

    # endless supply of the characters that are still free
    spare_ids = it.cycle(sorted(set(_ascii_letters) - used_ids))

    seen: list[Iterable[str]] = []
    new_structures: list[list[str]] = []

    for lines in structures:
        current = read_chainids(lines)

        if current not in seen:
            # first time these chain IDs show up: keep the structure
            seen.append(current)
            new_structures.append(lines)
            continue

        # repeated chain IDs: rename the chain, then sync the segID
        relabel = partial(pdb_chain.run, chain_id=next(spare_ids))
        relabelled = list(chainf(lines, relabel, pdb_chainxseg.run))
        new_structures.append(relabelled or lines)

    if len(new_structures) != len(structures):
        raise AssertionError("Number of lines differ. This is a bug!")

    return new_structures
# this is bad
@_report("Check models are the same")
def models_should_have_the_same_labels(lines: Iterable[str]) -> Iterable[str]:
    """
    Check that every MODEL of the PDB has the same atom labels.

    For each MODEL, the set of labels (columns 13 to 27 of the ``ATOM``
    and ``HETATM`` lines) is collected. The set of every MODEL is then
    compared with the set of the first one. Input without any ``MODEL``
    line is returned right away.

    Parameters
    ----------
    lines : list of strings.
        List containing the lines of the PDB file. Must NOT be a
        generator, because the lines are iterated more than once.

    Returns
    -------
    list
        The original ``lines`` in case no errors are found.

    Raises
    ------
    ModelsDifferError
        In case MODELS differ. Reports on which models differ.
    """
    # nothing to compare if the file has no MODEL records
    if not any(line.startswith("MODEL") for line in lines):
        return lines

    # labels of each model, indexed by the model number
    models: dict[Optional[int], set[str]] = {}
    labels: set[str] = set()
    model_id: Optional[int] = None

    for line in lines:
        if line.startswith("MODEL"):
            if model_id is not None:
                # close the previous model before starting a new one
                models[model_id] = labels
                labels = set()
            model_id = int(line[10:14])
        elif line.startswith(("ATOM", "HETATM")):
            labels.add(line[12:27])

    # the last model is closed by the end of the input
    models[model_id] = labels

    # compare every model against the first one
    first_key, *other_keys = models
    reference = models[first_key]
    for model_num in other_keys:
        if models[model_num] != reference:
            emsg = f"Labels in MODEL {model_num} differ from MODEL {first_key}."
            raise ModelsDifferError(emsg)

    return lines