Source code for haddock.libs.librestraints

from pathlib import Path
import logging
import itertools
import re
import random
import sys

from Bio.PDB import PDBParser, NeighborSearch
from freesasa import Classifier, structureFromBioPDB, calc

# Scaling factors for relative ASA
# Calculated using extended ALA-X-ALA peptides
# Taken from NACCESS
# Taken from NACCESS
REL_ASA = {
    'total':
        {
            'ALA': 107.95,
            'CYS': 134.28,
            'ASP': 140.39,
            'GLU': 172.25,
            'PHE': 199.48,
            'GLY': 80.10,
            'HIS': 182.88,
            'ILE': 175.12,
            'LYS': 200.81,
            'LEU': 178.63,
            'MET': 194.15,
            'ASN': 143.94,
            'PRO': 136.13,
            'GLN': 178.50,
            'ARG': 238.76,
            'SER': 116.50,
            'THR': 139.27,
            'VAL': 151.44,
            'TRP': 249.36,
            'TYR': 212.76,
            'ASH': 140.39,
            'DDZ': 107.95,
            'GLH': 172.25,
            'CYM': 134.28,
            'CSP': 134.28,
            'CYF': 134.28,
            'CYC': 134.28,
            'CFE': 134.28,
            'NEP': 182.88,
            'ALY': 200.81,
            'MLZ': 200.81,
            'MLY': 200.81,
            'M3L': 200.81,
            'HYP': 136.13,
            'SEP': 116.50,
            'TOP': 139.27,
            'TYP': 212.76,
            'PTR': 212.76,
            'TYS': 212.76,
            'PNS': 116.50,
            },
    'bb':
        {
            'ALA': 38.54,
            'CYS': 37.53,
            'ASP': 37.70,
            'GLU': 37.51,
            'PHE': 35.37,
            'GLY': 47.77,
            'HIS': 35.80,
            'ILE': 37.16,
            'LYS': 37.51,
            'LEU': 37.51,
            'MET': 37.51,
            'ASN': 37.70,
            'PRO': 16.23,
            'GLN': 37.51,
            'ARG': 37.51,
            'SER': 38.40,
            'THR': 37.57,
            'VAL': 37.16,
            'TRP': 38.10,
            'TYR': 35.38,
            'ASH': 37.70,
            'DDZ': 38.54,
            'GLH': 37.51,
            'CYM': 37.53,
            'CYC': 37.53,
            'CSP': 37.53,
            'CYF': 37.53,
            'CFE': 37.53,
            'NEP': 35.80,
            'ALY': 37.51,
            'MLZ': 37.51,
            'MLY': 37.51,
            'M3L': 37.51,
            'HYP': 16.23,
            'SEP': 38.40,
            'TOP': 37.57,
            'TYP': 35.38,
            'PTR': 35.38,
            'TYS': 35.38,
            'PNS': 38.40,
            },
    'sc':
        {
            'ALA': 69.41,
            'CYS': 96.75,
            'ASP': 102.69,
            'GLU': 134.74,
            'PHE': 164.11,
            'GLY': 32.33,
            'HIS': 147.08,
            'ILE': 137.96,
            'LYS': 163.30,
            'LEU': 141.12,
            'MET': 156.64,
            'ASN': 106.24,
            'PRO': 119.90,
            'GLN': 140.99,
            'ARG': 201.25,
            'SER': 78.11,
            'THR': 101.70,
            'VAL': 114.28,
            'TRP': 211.26,
            'TYR': 177.38,
            'ASH': 102.69,
            'DDZ': 69.41,
            'GLH': 134.74,
            'CYM': 96.75,
            'CYC': 96.75,
            'CSP': 96.75,
            'CYF': 96.75,
            'CFE': 96.75,
            'NEP': 147.08,
            'ALY': 163.30,
            'MLZ': 163.30,
            'MLY': 163.30,
            'M3L': 163.30,
            'HYP': 119.90,
            'SEP': 78.11,
            'TOP': 101.70,
            'TYP': 177.38,
            'PTR': 177.38,
            'TYS': 177.38,
            'PNS': 78.11,
            }
    }
DEFAULT_PROBE_RADIUS = 1.4


[docs]
def get_surface_resids(structure, cutoff=15):
    """
    Calls freesasa using its Python API and returns
    per-residue accessibilities.
    """
    asa_data, rsa_data, rel_main_chain, rel_side_chain = {}, {}, {}, {}
    _rsa = REL_ASA['total']
    _rsa_bb = REL_ASA['bb']
    _rsa_sc = REL_ASA['sc']

    #classifier = Classifier(config_path)
    classifier = Classifier()

    struct = structureFromBioPDB(structure, classifier, )
    result = calc(struct)

    # iterate over all atoms to get SASA and residue name
    for idx in range(struct.nAtoms()):
        atname = struct.atomName(idx).strip()
        resname = struct.residueName(idx)
        resid = int(struct.residueNumber(idx))
        chain = struct.chainLabel(idx)
        at_uid = (chain, resname, resid, atname)
        res_uid = (chain, resname, resid)

        asa = result.atomArea(idx)
        asa_data[at_uid] = asa
        # add asa to residue
        rsa_data[res_uid] = rsa_data.get(res_uid, 0) + asa

        if atname in ('C', 'N', 'O'):
            rel_main_chain[res_uid] = rel_main_chain.get(res_uid, 0) + asa
        else:
            rel_side_chain[res_uid] = rel_side_chain.get(res_uid, 0) + asa

    # convert total asa ro relative asa
    rsa_data.update((res_uid, asa / _rsa[res_uid[1]]) for res_uid, asa in rsa_data.items())
    rel_main_chain.update((res_uid, asa / _rsa_bb[res_uid[1]] * 100) for res_uid, asa in rel_main_chain.items())
    rel_side_chain.update((res_uid, asa / _rsa_sc[res_uid[1]] * 100) for res_uid, asa in rel_side_chain.items())

    # We format to fit the pipeline
    resid_access = {}
    for res_uid, access in rel_main_chain.items():
        resid_access[res_uid[2]] = {'side_chain_rel': rel_side_chain.get(res_uid), 'main_chain_rel': access}
    surface_resids = [r for r, v in resid_access.items() if v['side_chain_rel'] >= cutoff or
                      v['main_chain_rel'] >= cutoff]
    return surface_resids




[docs]
def parse_actpass_file(actpass_file):
    """Parse actpass file
    
    Parameters
    ----------
    actpass_file: str or Path
        path to actpass_file

    Returns
    -------
    active: list
        list of active residues
    passive: list
        list of passive residues
    """
    
    if Path(actpass_file).exists() is False:
        raise Exception(f"actpass file {actpass_file} does not exist.")

    lines = open(actpass_file, "r").readlines()
    nlines = len(lines)
    if nlines != 2:
        raise Exception(f"actpass file {actpass_file} does not have two lines (counted {nlines})")
    active, passive = [[int(x) for x in line.split()] for line in lines]
    return active, passive




[docs]
def active_passive_to_ambig(active1, passive1, active2, passive2, segid1='A', segid2='B'):
    """Convert active and passive residues to Ambiguous Interaction Restraints

    Parameters
    ----------
    active1 : list
        List of active residue numbers of the first segid

    passive1 : list
        List of passive residue numbers of the first segid

    passive2 : list
        List of passive residue numbers of the second segid

    active2 : list
        List of active residue numbers of the second segid
    
    active2 : list
        List of passive residue numbers of the second segid

    segid1 : string
        Segid to use for the first model

    segid2 : string
        Segid to use for the second model

    """

    all1 = active1 + passive1
    all2 = active2 + passive2

    for resi1 in active1:
        print('assign (resi {:d} and segid {:s})'.format(resi1, segid1))
        print('(')
        c = 0
        for resi2 in all2:
            print('       (resi {:d} and segid {:s})'.format(resi2, segid2))
            c += 1
            if c != len(all2):
                print('        or')

        print(') 2.0 2.0 0.0\n')
            
    for resi2 in active2:
        print('assign (resi {:d} and segid {:s})'.format(resi2, segid2))
        print('(')
        c = 0
        for resi1 in all1:
            print('       (resi {:d} and segid {:s})'.format(resi1, segid1))
            c += 1
            if c != len(all1):
                print('        or')

        print(') 2.0 2.0 0.0\n')



# Functions/Methods


[docs]
def calc_euclidean(i, j):
	return ((j[0]-i[0])**2 + (j[1]-i[1])**2 + (j[2]-i[2])**2)**0.5




[docs]
def read_structure(pdbf, exclude=None):
	"""
	Reads a PDB file and returns a list of parsed atoms
	"""
	_atoms = {'CA', 'P'}  # Alpha-Carbon (Prot), Backbone Phosphorous (DNA)
	_altloc = {' ', 'A'}

	if not exclude:
		exclude = set()
	else:
		exclude = set(exclude)

	res_list = []
	with open(pdbf, 'r') as pdb_handle:
		for line in pdb_handle:
			field = line[0:4]
			if field != 'ATOM':
				continue

			aname = line[12:16].strip()
			chain = line[21] if line[21].strip() else line[72:76].strip()  # chain ID or segID
			if chain not in exclude and aname in _atoms and line[16] in _altloc:
				resi = int(line[22:26])
				coords = (float(line[30:38]), float(line[38:46]), float(line[46:54]))
				res_list.append((chain, resi, aname, coords))

	if not res_list:
		logging.critical('[!] PDB File seems empty or no CA/P atoms found: {0}'.format(pdbf))
		sys.exit(1)

	return res_list




[docs]
def get_bodies(atom_lst, prot_threshold=4.0, dna_threshold=7.5):
	"""
	Determines gaps in an atom list following simple distance based criteria.
	Returns continuous fragments.
	"""

	bodies = []

	threshold = {'CA': prot_threshold, 'P': dna_threshold}

	body_start = 0
	i = None
	for i, atom in enumerate(atom_lst[1:], start=1):
		p_atom = atom_lst[i-1]

		chain, resi, aname, xyz = atom
		p_chain, p_resi, p_aname, p_xyz = p_atom

		if (chain == p_chain) and (aname == p_aname):  # Internal Gap
			d_xyz = calc_euclidean(xyz, p_xyz)
			if d_xyz >= threshold[aname]:
				logging.debug('[+++] (Internal) Body: {0}:{1}'.format(body_start, i-1))
				bodies.append((body_start, i-1))
				body_start = i  # Set new beginning

		elif (chain != p_chain) or (aname != p_aname):  # Different molecules/types
			logging.debug('[+++] Body: {0}:{1}'.format(body_start, i-1))
			bodies.append((body_start, i-1))
			body_start = i  # Set new beginning

	if not bodies:  # Single continuous molecule
		bodies.append((0, len(atom_lst)))
	else:
		logging.debug('[+++] Body: {0}:{1}'.format(body_start, i))
		bodies.append((body_start, i))  # Last body

	logging.info('[++] Found {0} bodies'.format(len(bodies)))

	return bodies




[docs]
def build_restraints(bodies):
	"""
	Generates distance restraints to maintain the relative
	orientation of the different bodies during the simulations.

	Generates two unique restraints per pair of bodies.

	Each restraint is created using two random atoms on each body
	and using their exact euclidean distance as target distance.
	"""

	def pick_residues(body, max_trials=10):
		# Pick two random residues in each body
		# Make sure they are far apart from each other
		n_trials = 0
		while 1:
			try:
				res_i, res_ii = random.sample(body, 2)
			except ValueError:
				# Likely, sample size is 1
				logging.warning('[!] One-sized body found. This may lead to problems..')
				return body[0], body[0]

			logging.debug('[+++] Trial {0}: {1} & {2}'.format(n_trials, res_i, res_ii))
			if abs(res_i - res_ii) > 3:
				logging.info('[++] Picked residues {0} & {1}'.format(res_i, res_ii))
				return res_i, res_ii
			n_trials += 1
			if n_trials == max_trials:
				msg = '[!] Could not pick two unique distant residues in body after {0} tries'
				logging.info(msg.format(max_trials))
				return res_i, res_ii

	restraints = []

	n_bodies = range(len(bodies))
	combinations = itertools.combinations(n_bodies, 2)

	for pair_bodies in combinations:
		body_i, body_j = pair_bodies
		logging.debug('[+++] Restraining body {0} to body {1}'.format(body_i, body_j))

		st_body_i, en_body_i = bodies[body_i]
		st_body_j, en_body_j = bodies[body_j]
		res_i, res_ii = pick_residues(range(st_body_i, en_body_i+1))
		res_j, res_jj = pick_residues(range(st_body_j, en_body_j+1))

		logging.info('[++] Created restraint: {0}:{1} <--> {2}:{3}'.format(body_i, res_i, body_j, res_j))
		restraints.append((res_i, res_j))
		logging.info('[++] Created restraint: {0}:{1} <--> {2}:{3}'.format(body_i, res_ii, body_j, res_jj))
		restraints.append((res_ii, res_jj))

	return restraints




[docs]
def generate_tbl(atom_lst, restraints):
	"""
	Makes a list of TBL-formatted restraints.

    Parameters
    ----------
    atom_lst : list
        List of atoms in the form (chain, resi, aname, coords)
    
    restraints : list
        List of restraints in the form (res_i, res_j)
	"""

	for r in restraints:
		i, j = r
		atom_i, atom_j = atom_lst[i], atom_lst[j]
		dist_ij = calc_euclidean(atom_i[3], atom_j[3])

		tbl = "assign (segid {0[0]} and resi {0[1]} and name {0[2]}) ".format(atom_i)
		tbl += "(segid {0[0]} and resi {0[1]} and name {0[2]}) ".format(atom_j)
		tbl += "{0:3.3f} 0.0 0.0".format(dist_ij)
		print(tbl)




[docs]
def check_parenthesis(file):
    open_parenthesis = re.compile('[(]')
    close_parenthesis = re.compile('[)]')
    quotation_marks = re.compile('[\"]')
    opened = 0
    closed = 0
    quote = 0
    for _match in open_parenthesis.finditer(file):
        opened += 1
    for _match in close_parenthesis.finditer(file):
        closed += 1
    for _match in quotation_marks.finditer(file):
        quote += 1
    if opened != closed:
        raise Exception("Problem with TBL file parentheses ({:d} opening for {:d} "
                        "closing parentheses)".format(opened, closed))
    if quote % 2 != 0:
        raise Exception("Problem with TBL file, odd number of quotation marks "
                        "({:d} quotation marks)".format(quote))




[docs]
def validate_tbldata(restraints, pcs=False):
    # List of selection keywords
    selectors = ('name', 'resn', 'atom', 'resi', 'attr', 'segi', 'chem', 'id',
                 'byres', 'not')
    connectors = ('and', 'or', 'not', 'byres')
    output = ""
    parentmatch = re.compile('[()]')
    # Global mode is activated outside any assign statement
    mode = "global"
    # Remove any carriage return/new line caracters
    lines = restraints.replace('\r', '').split("\n")
    # Line number
    lnr = 0
    # Temporary line storage for future output
    tmp_output = None
    tmp = None
    level = None
    s = None
    selections = None
    postselections = None
    numbers = None
    types = None
    lastassign = None
    for line in lines:
        lnr += 1
        # Take everything which is before putative "!" (comment) caracter
        if line.find("!") > -1:
            line = line[:line.find("!")]
        # Remove whitespaces and merge with previous line if in OR statement
        # for AIR restraints
        if mode != "format":
            line = line.strip()
        else:
            line = tmp + line.strip()
            mode = "postassign"
        # End of line
        if not len(line):
            continue
        # Check if all "" are closed
        if line.count('"') % 2 > 0:
            raise Exception('Unclosed " at line {:d} for line: {}'.format(lnr, line))
        if mode in ("global", "postglobal"):
            # Start of an assign statement
            if line.lower().startswith("assi"):
                if mode == "postglobal":
                    output += "\n!\n"
                    output += tmp_output
                mode = "assign"
                selections = []
                # assign is the only character of the line,check following lines
                if line.find("(") == -1:
                    continue
                # Reset temporary buffer
                tmp_output = None
            # Check for "OR" restraint
            elif mode == "postglobal" and line.lower().startswith("or"):
                mode = "postassign"
                selections = []
                line = line[len("or"):]
                # Case where "OR" statement is the only one on the line
                # (rest of selection at the next line)
                if line == "":
                    continue
            # We are not treating an assign params (postglobal) or at the
            # end (global) and no "OR" restraint is present -> ERROR
            else:
                raise Exception("Invalid TBL file: Unknown statement (line {:d}): {}".format(lnr, line))
        # We check if the selection is made over two lines
        # (thanks to "segid" keyword)
        if mode == "postassign":
            if line.count("segid") == 1:
                mode = "format"
                tmp = line
                continue

        matched = True
        while matched:
            matched = False
            # We are looking for parenthesis as start of the selections
            if mode in ("assign", "postassign"):
                # Ambiguous restraint for two different pairs of atoms
                # (ex: THR 1 B <-> ALA 2 A OR GLY 2 B <-> ASP 10 A )
                pos = line.find("(")
                if pos != -1:
                    matched = True
                    line = line[pos + 1:]
                    # We look for an "OR" selection
                    if mode == "postassign":
                        mode = "postsel"
                    else:
                        mode = "sel"
                    lastassign = lnr
                    s = ""
                    level = 1
            # Get the structural selections
            if mode in ("sel", "postsel"):
                # Detect opening and closing parenthesis
                for match in parentmatch.finditer(line):
                    if match.group() == "(":
                        level += 1
                    else:
                        level -= 1
                    # End of the parenthesis content
                    if level == 0:
                        if mode == "postsel":
                            mode = "postassign"
                        else:
                            mode = "assign"
                        matched = True
                        # print l[:match.start()]
                        # Get back the parentheses content and check
                        # for selection keywords
                        idx_connectors = []
                        sel = line[:match.start()]
                        # We can directly check for the 1st keyword presence
                        if len(sel) > 0:
                            syntax_ok = False
                            for s in selectors:
                                if sel.strip().startswith(s):
                                    syntax_ok = True
                            if not syntax_ok and not \
                                    sel.strip().startswith('byres') and not \
                                    sel.strip().startswith('not'):
                                raise Exception("1) Missing or wrong keyword in-term {} (stopped at line {:d})".
                                                format(sel, lnr))
                        # Get indexes of connectors (AND, OR, etc.)
                        for c in connectors:
                            idx_connectors.append([m.start()+len(c) for m in
                                                   re.finditer(c, sel)])
                        # Flatten the list
                        tmp_list = [item for sublist in idx_connectors for
                                    item in sublist]
                        idx_connectors = tmp_list
                        # Check that each connector is followed by a keyword
                        for i in idx_connectors:
                            new_sel = sel[i:].strip().strip("(").strip()
                            if len(new_sel) > 0:
                                syntax_ok = False
                                for s in selectors:
                                    if new_sel.startswith(s):
                                        syntax_ok = True
                                        break
                                if not syntax_ok:
                                    raise Exception("Missing or wrong keyword in-term {} (stopped at line {:d})".
                                                    format(sel, lnr))
                        s += sel
                        selections.append(s)
                        s = None
                        # Get the rest of the line
                        line = line[match.end():]
                        # Go to the process of the selection
                        break
                # No parenthesis, we get the whole line
                if level > 0:
                    if line != "":
                        s += line + "\n\t"
                    else:
                        # To avoid multiple blank lines
                        if repr(s) == "''":
                            # print repr(l), repr(s), selections
                            s += "\n\t"
        # TBD
        if mode in ("sel", "postsel"):
            continue
        # Selection parsed for "OR" line
        if mode == "postassign":
            if len(selections) != postselections:
                raise Exception("Invalid TBL file: wrong number of selections: in-term {:d}, cross-term {:d} "
                                "(stopped at line {:d})".format(postselections, len(selections), lnr))
            tmp_output += " or"
            for s in selections:
                tmp_output += "\t({})\n".format(s)
            # We let the possibility for other "OR"
            mode = "postglobal"
        if len(line) == 0:
            continue
        # Define the distance restraints type to adapt the parsing
        if mode == "assign":
            mode = "numbers"
            if pcs:
                if len(selections) == 5:
                    types = (" {:.3f}", " {:.3f}")
                else:
                    raise Exception("Invalid TBL file: wrong number of selections (must be 5 in PCS mode)")
            else:
                if len(selections) == 2:
                    types = (" {:.3f}", " {:.3f}", " {:.3f}")
                elif len(selections) == 4:
                    types = (" {:.3f}", " {:.3f}", " {:.3f}", " {:d}")
                elif len(selections) == 5:
                    raise Exception("Invalid TBL file: wrong number of selections (can be 5 only in PCS mode)")
                elif len(selections) == 6:
                    types = (" {:.3f}", " {:.3f}")
                else:
                    check_parenthesis(restraints)
                    raise Exception("Invalid TBL file: wrong number of selections (must be 2, 4 or 6)")
            postselections = len(selections)
            numbers = []
        # Distance restraints parsing
        if mode == "numbers":
            ll = line.split()
            for num in ll:
                if len(numbers) == len(types):
                    break
                numbers.append(float(num))
            if len(numbers) == len(types):
                tmp_output = "assign "
                for s in selections:
                    tmp_output += "\t({})\n".format(s)
                tmp_output = tmp_output[:-len("\n")]
                for n, t in zip(numbers, types):
                    tmp_output += t.format(n)
                tmp_output += "\n"
                mode = "postglobal"
    # "OR" lines have been parsed, we store the selections
    if mode == "postglobal":
        output += "!\n"
        output += tmp_output
        mode = "global"
    # If mode is not back to global, something has not been processed properly
    if mode != "global":
        raise Exception("Invalid TBL file: Malformed ASSIGN statement (line {:d}), use --quick to check for "
                        "putative syntax issues".format(lastassign))
    if not len(output.strip()):
        raise Exception("Invalid or empty TBL file")

    # Remove extra lines before each assign statement
    output = output.replace("\n\n", "\n")
    # Remove extra line at the beginning of the file
    if output.startswith("\n"):
        output = output.replace("\n", "", 1)
    return output



[docs]
def passive_from_active_raw(structure, active, chain_id=None, surface=None, radius=6.5):
    """Get the passive residues.
    
    Parameters
    ----------

    structure : str
        path to the PDB file

    active : list
        List of active residues

    chain_id : str
        Chain ID
    
    surface : list
       List of surface residues.

    radius : float
        Radius from active residues
    """

    # Parse the PDB file
    if Path(structure).exists():
        p = PDBParser(QUIET=True)
        s = p.get_structure('pdb', structure)
    else:
        raise FileNotFoundError('File not found: {0}'.format(structure))

    try:
        if chain_id:
            atom_list = [a for a in s[0][chain_id].get_atoms()]
        else:
            atom_list = [a for a in s[0].get_atoms()]
    except KeyError as e:
        raise KeyError('Chain {0} does not exist in the PDB file {1}, please enter a proper chain id'.
              format(chain_id, structure)) from e

    act_atoms = [a.get_coord() for a in atom_list if a.parent.id[1] in active]

    try:
        if not surface:
            surface = get_surface_resids(s)
    except Exception as e:
        raise Exception("There was an error while calculating surface residues: {}".format(e)) from e

    ns = NeighborSearch(atom_list)
    neighbors = []
    for a in act_atoms:
        neighbors.append(ns.search(a, radius, "R"))  # HADDOCK used 6.5A as default

    passive_list = set()
    for n in neighbors:
        for r in n:
            passive_list.add(r.id[1])
    tmp = passive_list & set(surface)
    passive_list = tmp - set(active)
    return sorted(passive_list)