from pathlib import Path
import logging
import itertools
import re
import random
import sys
from Bio.PDB import PDBParser, NeighborSearch
from freesasa import Classifier, structureFromBioPDB, calc
# Scaling factors for relative ASA
# Calculated using extended ALA-X-ALA peptides
# Taken from NACCESS
REL_ASA = {
'total':
{
'ALA': 107.95,
'CYS': 134.28,
'ASP': 140.39,
'GLU': 172.25,
'PHE': 199.48,
'GLY': 80.10,
'HIS': 182.88,
'ILE': 175.12,
'LYS': 200.81,
'LEU': 178.63,
'MET': 194.15,
'ASN': 143.94,
'PRO': 136.13,
'GLN': 178.50,
'ARG': 238.76,
'SER': 116.50,
'THR': 139.27,
'VAL': 151.44,
'TRP': 249.36,
'TYR': 212.76,
'ASH': 140.39,
'DDZ': 107.95,
'GLH': 172.25,
'CYM': 134.28,
'CSP': 134.28,
'CYF': 134.28,
'CYC': 134.28,
'CFE': 134.28,
'NEP': 182.88,
'ALY': 200.81,
'MLZ': 200.81,
'MLY': 200.81,
'M3L': 200.81,
'HYP': 136.13,
'SEP': 116.50,
'TOP': 139.27,
'TYP': 212.76,
'PTR': 212.76,
'TYS': 212.76,
'PNS': 116.50,
},
'bb':
{
'ALA': 38.54,
'CYS': 37.53,
'ASP': 37.70,
'GLU': 37.51,
'PHE': 35.37,
'GLY': 47.77,
'HIS': 35.80,
'ILE': 37.16,
'LYS': 37.51,
'LEU': 37.51,
'MET': 37.51,
'ASN': 37.70,
'PRO': 16.23,
'GLN': 37.51,
'ARG': 37.51,
'SER': 38.40,
'THR': 37.57,
'VAL': 37.16,
'TRP': 38.10,
'TYR': 35.38,
'ASH': 37.70,
'DDZ': 38.54,
'GLH': 37.51,
'CYM': 37.53,
'CYC': 37.53,
'CSP': 37.53,
'CYF': 37.53,
'CFE': 37.53,
'NEP': 35.80,
'ALY': 37.51,
'MLZ': 37.51,
'MLY': 37.51,
'M3L': 37.51,
'HYP': 16.23,
'SEP': 38.40,
'TOP': 37.57,
'TYP': 35.38,
'PTR': 35.38,
'TYS': 35.38,
'PNS': 38.40,
},
'sc':
{
'ALA': 69.41,
'CYS': 96.75,
'ASP': 102.69,
'GLU': 134.74,
'PHE': 164.11,
'GLY': 32.33,
'HIS': 147.08,
'ILE': 137.96,
'LYS': 163.30,
'LEU': 141.12,
'MET': 156.64,
'ASN': 106.24,
'PRO': 119.90,
'GLN': 140.99,
'ARG': 201.25,
'SER': 78.11,
'THR': 101.70,
'VAL': 114.28,
'TRP': 211.26,
'TYR': 177.38,
'ASH': 102.69,
'DDZ': 69.41,
'GLH': 134.74,
'CYM': 96.75,
'CYC': 96.75,
'CSP': 96.75,
'CYF': 96.75,
'CFE': 96.75,
'NEP': 147.08,
'ALY': 163.30,
'MLZ': 163.30,
'MLY': 163.30,
'M3L': 163.30,
'HYP': 119.90,
'SEP': 78.11,
'TOP': 101.70,
'TYP': 177.38,
'PTR': 177.38,
'TYS': 177.38,
'PNS': 78.11,
}
}
DEFAULT_PROBE_RADIUS = 1.4
def get_surface_resids(structure, cutoff=15):
"""
    Calls freesasa using its Python API and returns the residues whose
    relative side-chain or main-chain accessibility is above the cutoff (%).
"""
asa_data, rsa_data, rel_main_chain, rel_side_chain = {}, {}, {}, {}
_rsa = REL_ASA['total']
_rsa_bb = REL_ASA['bb']
_rsa_sc = REL_ASA['sc']
#classifier = Classifier(config_path)
classifier = Classifier()
    struct = structureFromBioPDB(structure, classifier)
result = calc(struct)
# iterate over all atoms to get SASA and residue name
for idx in range(struct.nAtoms()):
atname = struct.atomName(idx).strip()
resname = struct.residueName(idx)
resid = int(struct.residueNumber(idx))
chain = struct.chainLabel(idx)
at_uid = (chain, resname, resid, atname)
res_uid = (chain, resname, resid)
asa = result.atomArea(idx)
asa_data[at_uid] = asa
# add asa to residue
rsa_data[res_uid] = rsa_data.get(res_uid, 0) + asa
if atname in ('C', 'N', 'O'):
rel_main_chain[res_uid] = rel_main_chain.get(res_uid, 0) + asa
else:
rel_side_chain[res_uid] = rel_side_chain.get(res_uid, 0) + asa
    # convert total asa to relative asa
rsa_data.update((res_uid, asa / _rsa[res_uid[1]]) for res_uid, asa in rsa_data.items())
rel_main_chain.update((res_uid, asa / _rsa_bb[res_uid[1]] * 100) for res_uid, asa in rel_main_chain.items())
rel_side_chain.update((res_uid, asa / _rsa_sc[res_uid[1]] * 100) for res_uid, asa in rel_side_chain.items())
# We format to fit the pipeline
resid_access = {}
for res_uid, access in rel_main_chain.items():
resid_access[res_uid[2]] = {'side_chain_rel': rel_side_chain.get(res_uid), 'main_chain_rel': access}
surface_resids = [r for r, v in resid_access.items() if v['side_chain_rel'] >= cutoff or
v['main_chain_rel'] >= cutoff]
return surface_resids
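# Usage sketch (assumes a hypothetical local "complex.pdb" parsed with Bio.PDB):
#   from Bio.PDB import PDBParser
#   model = PDBParser(QUIET=True).get_structure('pdb', 'complex.pdb')
#   surface = get_surface_resids(model, cutoff=15)
#   # -> residue numbers whose relative main- or side-chain ASA is >= 15%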
def parse_actpass_file(actpass_file):
"""Parse actpass file
Parameters
----------
actpass_file: str or Path
path to actpass_file
Returns
-------
active: list
list of active residues
passive: list
list of passive residues
"""
    if not Path(actpass_file).exists():
        raise Exception(f"actpass file {actpass_file} does not exist.")
    with open(actpass_file, "r") as f:
        lines = f.readlines()
nlines = len(lines)
if nlines != 2:
raise Exception(f"actpass file {actpass_file} does not have two lines (counted {nlines})")
active, passive = [[int(x) for x in line.split()] for line in lines]
return active, passive
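# Usage sketch, assuming a hypothetical "protein.actpass" file with exactly two
# lines (active residue numbers on the first line, passive on the second):
#   active, passive = parse_actpass_file("protein.actpass")
#   # e.g. a file containing "1 2 3" and "4 5 6" gives active=[1, 2, 3], passive=[4, 5, 6]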
def active_passive_to_ambig(active1, passive1, active2, passive2, segid1='A', segid2='B'):
"""Convert active and passive residues to Ambiguous Interaction Restraints
Parameters
----------
active1 : list
List of active residue numbers of the first segid
passive1 : list
List of passive residue numbers of the first segid
    active2 : list
        List of active residue numbers of the second segid
    passive2 : list
        List of passive residue numbers of the second segid
segid1 : string
Segid to use for the first model
segid2 : string
Segid to use for the second model
"""
all1 = active1 + passive1
all2 = active2 + passive2
for resi1 in active1:
print('assign (resi {:d} and segid {:s})'.format(resi1, segid1))
print('(')
c = 0
for resi2 in all2:
print(' (resi {:d} and segid {:s})'.format(resi2, segid2))
c += 1
if c != len(all2):
print(' or')
print(') 2.0 2.0 0.0\n')
for resi2 in active2:
print('assign (resi {:d} and segid {:s})'.format(resi2, segid2))
print('(')
c = 0
for resi1 in all1:
print(' (resi {:d} and segid {:s})'.format(resi1, segid1))
c += 1
if c != len(all1):
print(' or')
print(') 2.0 2.0 0.0\n')
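# Usage sketch (hypothetical residue numbers; prints CNS-formatted AIRs to stdout):
#   active_passive_to_ambig(active1=[10, 12], passive1=[11],
#                           active2=[45], passive2=[44, 46],
#                           segid1='A', segid2='B')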
# Functions/Methods
def calc_euclidean(i, j):
return ((j[0]-i[0])**2 + (j[1]-i[1])**2 + (j[2]-i[2])**2)**0.5
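# Example: calc_euclidean((0.0, 0.0, 0.0), (3.0, 4.0, 0.0)) -> 5.0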
def read_structure(pdbf, exclude=None):
"""
Reads a PDB file and returns a list of parsed atoms
"""
    _atoms = {'CA', 'P'}  # Alpha-Carbon (Prot), Backbone Phosphorus (DNA)
_altloc = {' ', 'A'}
if not exclude:
exclude = set()
else:
exclude = set(exclude)
res_list = []
with open(pdbf, 'r') as pdb_handle:
for line in pdb_handle:
field = line[0:4]
if field != 'ATOM':
continue
aname = line[12:16].strip()
chain = line[21] if line[21].strip() else line[72:76].strip() # chain ID or segID
if chain not in exclude and aname in _atoms and line[16] in _altloc:
resi = int(line[22:26])
coords = (float(line[30:38]), float(line[38:46]), float(line[46:54]))
res_list.append((chain, resi, aname, coords))
if not res_list:
logging.critical('[!] PDB File seems empty or no CA/P atoms found: {0}'.format(pdbf))
sys.exit(1)
return res_list
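# Usage sketch (assumes a hypothetical local "complex.pdb"; chain 'C' excluded):
#   atoms = read_structure("complex.pdb", exclude=['C'])
#   # each entry is (chain, resi, atom_name, (x, y, z)) for CA/P atoms only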
def get_bodies(atom_lst, prot_threshold=4.0, dna_threshold=7.5):
"""
    Determines gaps in an atom list following simple distance-based criteria.
Returns continuous fragments.
"""
bodies = []
threshold = {'CA': prot_threshold, 'P': dna_threshold}
body_start = 0
i = None
for i, atom in enumerate(atom_lst[1:], start=1):
p_atom = atom_lst[i-1]
chain, resi, aname, xyz = atom
p_chain, p_resi, p_aname, p_xyz = p_atom
if (chain == p_chain) and (aname == p_aname): # Internal Gap
d_xyz = calc_euclidean(xyz, p_xyz)
if d_xyz >= threshold[aname]:
logging.debug('[+++] (Internal) Body: {0}:{1}'.format(body_start, i-1))
bodies.append((body_start, i-1))
body_start = i # Set new beginning
elif (chain != p_chain) or (aname != p_aname): # Different molecules/types
logging.debug('[+++] Body: {0}:{1}'.format(body_start, i-1))
bodies.append((body_start, i-1))
body_start = i # Set new beginning
if not bodies: # Single continuous molecule
        bodies.append((0, len(atom_lst) - 1))  # inclusive end index, as in the multi-body case
else:
logging.debug('[+++] Body: {0}:{1}'.format(body_start, i))
bodies.append((body_start, i)) # Last body
logging.info('[++] Found {0} bodies'.format(len(bodies)))
return bodies
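# Usage sketch (continues from read_structure above; returns inclusive index
# ranges into the atom list, e.g. [(0, 120), (121, 245)] for a two-body system):
#   bodies = get_bodies(read_structure("complex.pdb"))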
def build_restraints(bodies):
"""
Generates distance restraints to maintain the relative
orientation of the different bodies during the simulations.
Generates two unique restraints per pair of bodies.
Each restraint is created using two random atoms on each body
and using their exact euclidean distance as target distance.
"""
def pick_residues(body, max_trials=10):
# Pick two random residues in each body
# Make sure they are far apart from each other
n_trials = 0
while 1:
try:
res_i, res_ii = random.sample(body, 2)
except ValueError:
# Likely, sample size is 1
logging.warning('[!] One-sized body found. This may lead to problems..')
return body[0], body[0]
logging.debug('[+++] Trial {0}: {1} & {2}'.format(n_trials, res_i, res_ii))
if abs(res_i - res_ii) > 3:
logging.info('[++] Picked residues {0} & {1}'.format(res_i, res_ii))
return res_i, res_ii
n_trials += 1
if n_trials == max_trials:
msg = '[!] Could not pick two unique distant residues in body after {0} tries'
logging.info(msg.format(max_trials))
return res_i, res_ii
restraints = []
n_bodies = range(len(bodies))
combinations = itertools.combinations(n_bodies, 2)
for pair_bodies in combinations:
body_i, body_j = pair_bodies
logging.debug('[+++] Restraining body {0} to body {1}'.format(body_i, body_j))
st_body_i, en_body_i = bodies[body_i]
st_body_j, en_body_j = bodies[body_j]
res_i, res_ii = pick_residues(range(st_body_i, en_body_i+1))
res_j, res_jj = pick_residues(range(st_body_j, en_body_j+1))
logging.info('[++] Created restraint: {0}:{1} <--> {2}:{3}'.format(body_i, res_i, body_j, res_j))
restraints.append((res_i, res_j))
logging.info('[++] Created restraint: {0}:{1} <--> {2}:{3}'.format(body_i, res_ii, body_j, res_jj))
restraints.append((res_ii, res_jj))
return restraints
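# Usage sketch (continues from get_bodies above; atom picks are random, so the
# exact pairs vary between runs):
#   restraints = build_restraints(bodies)
#   # -> two (i, j) atom-index pairs per pair of bodies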
def generate_tbl(atom_lst, restraints):
"""
Makes a list of TBL-formatted restraints.
Parameters
----------
atom_lst : list
List of atoms in the form (chain, resi, aname, coords)
    restraints : list
        List of restraints as pairs of indices (i, j) into atom_lst
"""
for r in restraints:
i, j = r
atom_i, atom_j = atom_lst[i], atom_lst[j]
dist_ij = calc_euclidean(atom_i[3], atom_j[3])
tbl = "assign (segid {0[0]} and resi {0[1]} and name {0[2]}) ".format(atom_i)
tbl += "(segid {0[0]} and resi {0[1]} and name {0[2]}) ".format(atom_j)
tbl += "{0:3.3f} 0.0 0.0".format(dist_ij)
print(tbl)
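# Usage sketch of the full restrain-bodies pipeline (hypothetical "complex.pdb";
# prints TBL-formatted distance restraints to stdout):
#   atoms = read_structure("complex.pdb")
#   generate_tbl(atoms, build_restraints(get_bodies(atoms)))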
def check_parenthesis(file):
open_parenthesis = re.compile('[(]')
close_parenthesis = re.compile('[)]')
quotation_marks = re.compile('[\"]')
opened = 0
closed = 0
quote = 0
for _match in open_parenthesis.finditer(file):
opened += 1
for _match in close_parenthesis.finditer(file):
closed += 1
for _match in quotation_marks.finditer(file):
quote += 1
if opened != closed:
raise Exception("Problem with TBL file parentheses ({:d} opening for {:d} "
"closing parentheses)".format(opened, closed))
if quote % 2 != 0:
raise Exception("Problem with TBL file, odd number of quotation marks "
"({:d} quotation marks)".format(quote))
def validate_tbldata(restraints, pcs=False):
# List of selection keywords
selectors = ('name', 'resn', 'atom', 'resi', 'attr', 'segi', 'chem', 'id',
'byres', 'not')
connectors = ('and', 'or', 'not', 'byres')
output = ""
parentmatch = re.compile('[()]')
# Global mode is activated outside any assign statement
mode = "global"
    # Remove any carriage return/newline characters
lines = restraints.replace('\r', '').split("\n")
# Line number
lnr = 0
# Temporary line storage for future output
tmp_output = None
tmp = None
level = None
s = None
selections = None
postselections = None
numbers = None
types = None
lastassign = None
for line in lines:
lnr += 1
        # Keep only what precedes a possible "!" (comment) character
if line.find("!") > -1:
line = line[:line.find("!")]
# Remove whitespaces and merge with previous line if in OR statement
# for AIR restraints
if mode != "format":
line = line.strip()
else:
line = tmp + line.strip()
mode = "postassign"
        # Skip empty lines
if not len(line):
continue
# Check if all "" are closed
if line.count('"') % 2 > 0:
raise Exception('Unclosed " at line {:d} for line: {}'.format(lnr, line))
if mode in ("global", "postglobal"):
# Start of an assign statement
if line.lower().startswith("assi"):
if mode == "postglobal":
output += "\n!\n"
output += tmp_output
mode = "assign"
selections = []
                # "assign" is the only word on the line, check the following lines
if line.find("(") == -1:
continue
# Reset temporary buffer
tmp_output = None
# Check for "OR" restraint
elif mode == "postglobal" and line.lower().startswith("or"):
mode = "postassign"
selections = []
line = line[len("or"):]
# Case where "OR" statement is the only one on the line
# (rest of selection at the next line)
if line == "":
continue
            # Neither a new assign statement nor an "OR" continuation
            # while outside an assign block -> ERROR
else:
raise Exception("Invalid TBL file: Unknown statement (line {:d}): {}".format(lnr, line))
# We check if the selection is made over two lines
# (thanks to "segid" keyword)
if mode == "postassign":
if line.count("segid") == 1:
mode = "format"
tmp = line
continue
matched = True
while matched:
matched = False
# We are looking for parenthesis as start of the selections
if mode in ("assign", "postassign"):
# Ambiguous restraint for two different pairs of atoms
# (ex: THR 1 B <-> ALA 2 A OR GLY 2 B <-> ASP 10 A )
pos = line.find("(")
if pos != -1:
matched = True
line = line[pos + 1:]
# We look for an "OR" selection
if mode == "postassign":
mode = "postsel"
else:
mode = "sel"
lastassign = lnr
s = ""
level = 1
# Get the structural selections
if mode in ("sel", "postsel"):
# Detect opening and closing parenthesis
for match in parentmatch.finditer(line):
if match.group() == "(":
level += 1
else:
level -= 1
# End of the parenthesis content
if level == 0:
if mode == "postsel":
mode = "postassign"
else:
mode = "assign"
matched = True
# print l[:match.start()]
# Get back the parentheses content and check
# for selection keywords
idx_connectors = []
sel = line[:match.start()]
# We can directly check for the 1st keyword presence
if len(sel) > 0:
syntax_ok = False
for s in selectors:
if sel.strip().startswith(s):
syntax_ok = True
if not syntax_ok and not \
sel.strip().startswith('byres') and not \
sel.strip().startswith('not'):
raise Exception("1) Missing or wrong keyword in-term {} (stopped at line {:d})".
format(sel, lnr))
# Get indexes of connectors (AND, OR, etc.)
for c in connectors:
idx_connectors.append([m.start()+len(c) for m in
re.finditer(c, sel)])
# Flatten the list
tmp_list = [item for sublist in idx_connectors for
item in sublist]
idx_connectors = tmp_list
# Check that each connector is followed by a keyword
for i in idx_connectors:
new_sel = sel[i:].strip().strip("(").strip()
if len(new_sel) > 0:
syntax_ok = False
for s in selectors:
if new_sel.startswith(s):
syntax_ok = True
break
if not syntax_ok:
raise Exception("Missing or wrong keyword in-term {} (stopped at line {:d})".
format(sel, lnr))
s += sel
selections.append(s)
s = None
# Get the rest of the line
line = line[match.end():]
# Go to the process of the selection
break
# No parenthesis, we get the whole line
if level > 0:
if line != "":
s += line + "\n\t"
else:
# To avoid multiple blank lines
if repr(s) == "''":
# print repr(l), repr(s), selections
s += "\n\t"
# TBD
if mode in ("sel", "postsel"):
continue
# Selection parsed for "OR" line
if mode == "postassign":
if len(selections) != postselections:
raise Exception("Invalid TBL file: wrong number of selections: in-term {:d}, cross-term {:d} "
"(stopped at line {:d})".format(postselections, len(selections), lnr))
tmp_output += " or"
for s in selections:
tmp_output += "\t({})\n".format(s)
# We let the possibility for other "OR"
mode = "postglobal"
if len(line) == 0:
continue
# Define the distance restraints type to adapt the parsing
if mode == "assign":
mode = "numbers"
if pcs:
if len(selections) == 5:
types = (" {:.3f}", " {:.3f}")
else:
raise Exception("Invalid TBL file: wrong number of selections (must be 5 in PCS mode)")
else:
if len(selections) == 2:
types = (" {:.3f}", " {:.3f}", " {:.3f}")
elif len(selections) == 4:
types = (" {:.3f}", " {:.3f}", " {:.3f}", " {:d}")
elif len(selections) == 5:
raise Exception("Invalid TBL file: wrong number of selections (can be 5 only in PCS mode)")
elif len(selections) == 6:
types = (" {:.3f}", " {:.3f}")
else:
check_parenthesis(restraints)
raise Exception("Invalid TBL file: wrong number of selections (must be 2, 4 or 6)")
postselections = len(selections)
numbers = []
# Distance restraints parsing
if mode == "numbers":
ll = line.split()
for num in ll:
if len(numbers) == len(types):
break
numbers.append(float(num))
if len(numbers) == len(types):
tmp_output = "assign "
for s in selections:
tmp_output += "\t({})\n".format(s)
tmp_output = tmp_output[:-len("\n")]
for n, t in zip(numbers, types):
tmp_output += t.format(n)
tmp_output += "\n"
mode = "postglobal"
# "OR" lines have been parsed, we store the selections
if mode == "postglobal":
output += "!\n"
output += tmp_output
mode = "global"
# If mode is not back to global, something has not been processed properly
if mode != "global":
raise Exception("Invalid TBL file: Malformed ASSIGN statement (line {:d}), use --quick to check for "
"putative syntax issues".format(lastassign))
if not len(output.strip()):
raise Exception("Invalid or empty TBL file")
# Remove extra lines before each assign statement
output = output.replace("\n\n", "\n")
# Remove extra line at the beginning of the file
if output.startswith("\n"):
output = output.replace("\n", "", 1)
return output
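# Usage sketch (hypothetical "ambig.tbl"; returns a cleaned-up copy of the restraints):
#   raw = Path("ambig.tbl").read_text()
#   cleaned = validate_tbldata(raw)                # distance restraints (2, 4 or 6 selections)
#   cleaned_pcs = validate_tbldata(raw, pcs=True)  # PCS restraints (5 selections per assign)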
def passive_from_active_raw(structure, active, chain_id=None, surface=None, radius=6.5):
"""Get the passive residues.
Parameters
----------
structure : str
path to the PDB file
active : list
List of active residues
chain_id : str
Chain ID
surface : list
List of surface residues.
radius : float
Radius from active residues
"""
# Parse the PDB file
if Path(structure).exists():
p = PDBParser(QUIET=True)
s = p.get_structure('pdb', structure)
else:
raise FileNotFoundError('File not found: {0}'.format(structure))
try:
if chain_id:
atom_list = [a for a in s[0][chain_id].get_atoms()]
else:
atom_list = [a for a in s[0].get_atoms()]
except KeyError as e:
raise KeyError('Chain {0} does not exist in the PDB file {1}, please enter a proper chain id'.
format(chain_id, structure)) from e
act_atoms = [a.get_coord() for a in atom_list if a.parent.id[1] in active]
try:
if not surface:
surface = get_surface_resids(s)
except Exception as e:
raise Exception("There was an error while calculating surface residues: {}".format(e)) from e
ns = NeighborSearch(atom_list)
neighbors = []
for a in act_atoms:
neighbors.append(ns.search(a, radius, "R")) # HADDOCK used 6.5A as default
passive_list = set()
for n in neighbors:
for r in n:
passive_list.add(r.id[1])
tmp = passive_list & set(surface)
passive_list = tmp - set(active)
return sorted(passive_list)
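# Usage sketch (hypothetical "complex.pdb" with chain 'A' and active residues 10, 12, 45):
#   passive = passive_from_active_raw("complex.pdb", active=[10, 12, 45], chain_id='A')
#   # -> sorted surface residues within 6.5 A of the active ones, minus the active set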