Skip to content

Commit

Permalink
Add support for reading chainID info from prmtop amber topologies. (M…
Browse files Browse the repository at this point in the history
…DAnalysis#4007)

* Add chainID support for prmtop amber topologies (although default tleap does not 
   add chainID information, parmed does). 

   Parses the optional `%FLAG RESIDUE_CHAINID` in the prmtop.

   The PRMTOP format definition (most recent from 2004 https://ambermd.org/prmtop.pdf) states
   "it is allowed to extend the PRMTOP format"  so we consider  RESIDUE_CHAINID an optional 
   extension. If not present, the file is still parsed and no errors/warnings are raised.
* Add  tests for chainIDs
* updated docs
* updated CHANGELOG
  • Loading branch information
pgbarletta authored Sep 3, 2023
1 parent c2e6df9 commit 2acd594
Show file tree
Hide file tree
Showing 7 changed files with 476 additions and 39 deletions.
3 changes: 2 additions & 1 deletion package/CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ The rules for this file:
* release numbers follow "Semantic Versioning" http://semver.org

------------------------------------------------------------------------------
??/??/?? IAlibay, ianmkenney, PicoCentauri
??/??/?? IAlibay, ianmkenney, PicoCentauri, pgbarletta

* 2.7.0

Expand All @@ -24,6 +24,7 @@ Fixes
Enhancements
* Added a warning about charge neutrality to the documentation of
`DielectricConstant` (Issue #4262, PR #4263)
* Add support for reading chainID info from prmtop amber topologies (PR #4007)

Changes
* The `mda-xdrlib` module is now a core dependency of MDAnalysis
Expand Down
104 changes: 89 additions & 15 deletions package/MDAnalysis/topology/TOPParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
:mod:`MDAnalysis.topology.guessers` for more details.
.. _`PARM parameter/topology file specification`:
http://ambermd.org/formats.html#topology
https://ambermd.org/FileFormats.php#topo.cntrl
Classes
-------
Expand All @@ -93,7 +93,7 @@

from .tables import Z2SYMB
from ..lib.util import openany, FORTRANReader
from .base import TopologyReaderBase
from .base import TopologyReaderBase, change_squash
from ..core.topology import Topology
from ..core.topologyattrs import (
Atomnames,
Expand All @@ -106,6 +106,7 @@
Resids,
Resnums,
Segids,
ChainIDs,
AtomAttr,
Bonds,
Angles,
Expand Down Expand Up @@ -140,13 +141,20 @@ class TOPParser(TopologyReaderBase):
- Bonds
- Angles
- Dihedrals (inc. impropers)
- ChainIDs (from %RESIDUE_CHAINID)
- Segids (from %RESIDUE_CHAINID)
The format is defined in `PARM parameter/topology file
specification`_. The reader tries to detect if it is a newer
(AMBER 12?) file format by looking for the flag "ATOMIC_NUMBER".
.. _`PARM parameter/topology file specification`:
http://ambermd.org/formats.html#topology
https://ambermd.org/FileFormats.php#topo.cntrl
Additionally, the RESIDUE_CHAINID non-standard flag is supported. This
can be added with the `addPDB`_ command from parmed:
.. _`addPDB`: https://parmed.github.io/ParmEd/html/parmed.html#addpdb
Notes
-----
Expand All @@ -162,6 +170,8 @@ class TOPParser(TopologyReaderBase):
warns users that chamber-style topologies are not currently supported
.. versionchanged:: 2.0.0
no longer guesses elements if missing
.. versionchanged:: 2.7.0
gets Segments and chainIDs from flag RESIDUE_CHAINID, when present
"""
format = ['TOP', 'PRMTOP', 'PARM7']

Expand All @@ -188,7 +198,8 @@ def parse(self, **kwargs):
"ANGLES_INC_HYDROGEN": (4, 10, self.parse_bonded, "angh", 4),
"ANGLES_WITHOUT_HYDROGEN": (4, 10, self.parse_bonded, "anga", 5),
"DIHEDRALS_INC_HYDROGEN": (5, 10, self.parse_bonded, "dihh", 6),
"DIHEDRALS_WITHOUT_HYDROGEN": (5, 10, self.parse_bonded, "diha", 7)
"DIHEDRALS_WITHOUT_HYDROGEN": (5, 10, self.parse_bonded, "diha", 7),
"RESIDUE_CHAINID": (1, 20, self.parse_chainids, "segids", 11),
}

attrs = {} # empty dict for attrs that we'll fill
Expand Down Expand Up @@ -298,12 +309,36 @@ def next_getter():
attrs['atomids'] = Atomids(np.arange(n_atoms) + 1)
attrs['resids'] = Resids(np.arange(n_res) + 1)
attrs['resnums'] = Resnums(np.arange(n_res) + 1)
attrs['segids'] = Segids(np.array(['SYSTEM'], dtype=object))

top = Topology(n_atoms, n_res, 1,
attrs=list(attrs.values()),
atom_resindex=residx,
residue_segindex=None)
# Amber's 'RESIDUE_CHAINID' is a by-residue attribute, turn it into
# a by-atom attribute when present. See PR #4007.
if "segids" in attrs and len(attrs["segids"]) == n_res:
segidx, (segids,) = change_squash((attrs["segids"],), (attrs["segids"],))
chainids = [attrs["segids"][r] for r in residx]

attrs["segids"] = Segids(segids)
attrs["ChainIDs"] = ChainIDs(chainids)
n_segs = len(segids)
else:
if "segids" in attrs:
msg = (
f"Number of residues ({n_res}) does not match number of "
f"%RESIDUE_CHAINID ({len(attrs['segids'])}). Skipping section."
)
logger.warning(msg)
warnings.warn(msg)
attrs["segids"] = Segids(np.array(["SYSTEM"], dtype=object))
segidx = None
n_segs = 1

top = Topology(
n_atoms,
n_res,
n_segs,
attrs=list(attrs.values()),
atom_resindex=residx,
residue_segindex=segidx,
)

return top

Expand Down Expand Up @@ -537,10 +572,12 @@ def parse_bonded(self, num_per_record, numlines):
Note
----
For the bond/angle sections of parm7 files, the atom numbers are set to
coordinate array index values. As detailed in
http://ambermd.org/formats.html to recover the actual atom number, one
coordinate array index values. As detailed in `the specification`_,
to recover the actual atom number, one
should divide the values by 3 and add 1. Here, since we want to satisfy
zero-indexing, we only divide by 3.
.. _`the specification`: https://ambermd.org/FileFormats.php#topo.cntrl
"""
fields = self.parsesection_mapper(numlines, lambda x: int(x) // 3)
section = self.parse_chunks(fields, num_per_record)
Expand All @@ -563,9 +600,24 @@ def parsesection_mapper(self, numlines, mapper):
A list of all entries in a given parm7 section
"""
section = []
y = next(self.topfile).strip("%FORMAT(")
y.strip(")")
x = FORTRANReader(y)

def get_fmt(file):
""" Skips '%COMMENT' lines until it gets the FORMAT specification
for the section."""
line = next(file)
if line[:7] == "%FORMAT":
return line[8:].split(")")[0]
elif line[:8] == "%COMMENT":
return get_fmt(file)
else:
raise ValueError(
"Invalid header line. Does not begin with either %FLAG, %FORMAT "
f"nor %COMMENT:\n{line}"
)

# There may be %COMMENT lines between %FLAG and %FORMAT statements. Skip them.
fmt = get_fmt(self.topfile)
x = FORTRANReader(fmt)
for i in range(numlines):
l = next(self.topfile)
for j in range(len(x.entries)):
Expand Down Expand Up @@ -596,7 +648,7 @@ def parse_dihedrals(self, diha, dihh):
Note
----
As detailed in http://ambermd.org/formats.html, the dihedral sections
As detailed in `the specification`_, the dihedral sections
of parm7 files contain information about both conventional dihedrals
and impropers. The following must be accounted for:
1) If the fourth atom in a dihedral entry is given a negative value,
Expand All @@ -605,6 +657,8 @@ def parse_dihedrals(self, diha, dihh):
this indicates that it 1-4 NB interactions are ignored for this
dihedrals. This could be due to the dihedral within a ring, or if it is
part of a multi-term dihedral definition or if it is an improper.
.. _`the specification`: https://ambermd.org/FileFormats.php#topo.cntrl
"""
improp = []
dihed = []
Expand All @@ -620,3 +674,23 @@ def parse_dihedrals(self, diha, dihh):
dihedrals = Dihedrals(dihed)
impropers = Impropers(improp)
return dihedrals, impropers

def parse_chainids(self, num_per_record: int, numlines: int):
"""Extracts the chainID of each residue
Parameters
----------
num_per_record : int
The number of entries for each record in section (unused input)
numlines : int
The number of lines to be parsed in current section
Returns
-------
attr : numpy array
A numpy array containing the chainID of each residue as defined in
the parm7 file
"""
vals = self.parsesection_mapper(numlines, lambda x: x)
attr = np.array(vals)
return attr
144 changes: 144 additions & 0 deletions testsuite/MDAnalysisTests/data/Amber/ace_mbondi3.error4.parm7
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
%VERSION VERSION_STAMP = V0001.000 DATE = 09/08/18 15:36:17
%FLAG TITLE
%FORMAT(20a4)
ACE
%FLAG POINTERS
%FORMAT(10I8)
6 4 3 2 6 1 9 0 0 0
16 1 2 1 0 3 3 3 4 0
0 0 0 0 0 0 0 0 6 0
0
%FLAG ATOM_NAME
%FORMAT(20a4)
HH31CH3 HH32HH33C O
%FLAG CHARGE
%COMMENT
%BAD LINE
%COMMENT
%FORMAT(5E16.8)
2.04636429E+00 -6.67300626E+00 2.04636429E+00 2.04636429E+00 1.08823576E+01
-1.03484442E+01
%FLAG ATOMIC_NUMBER
%FORMAT(10I8)
1 6 1 1 6 8
%FLAG MASS
%FORMAT(5E16.8)
1.00800000E+00 1.20100000E+01 1.00800000E+00 1.00800000E+00 1.20100000E+01
1.60000000E+01
%FLAG ATOM_TYPE_INDEX
%FORMAT(10I8)
1 2 1 1 3 4
%FLAG NUMBER_EXCLUDED_ATOMS
%FORMAT(10I8)
5 4 3 2 1 1
%FLAG NONBONDED_PARM_INDEX
%FORMAT(10I8)
1 2 4 7 2 3 5 8 4 5
6 9 7 8 9 10
%FLAG RESIDUE_LABEL
%FORMAT(20a4)
ACE
%FLAG RESIDUE_POINTER
%FORMAT(10I8)
1
%FLAG BOND_FORCE_CONSTANT
%FORMAT(5E16.8)
5.70000000E+02 3.40000000E+02 3.17000000E+02
%FLAG BOND_EQUIL_VALUE
%FORMAT(5E16.8)
1.22900000E+00 1.09000000E+00 1.52200000E+00
%FLAG ANGLE_FORCE_CONSTANT
%FORMAT(5E16.8)
5.00000000E+01 3.50000000E+01 8.00000000E+01
%FLAG ANGLE_EQUIL_VALUE
%FORMAT(5E16.8)
1.91113635E+00 1.91113635E+00 2.10137732E+00
%FLAG DIHEDRAL_FORCE_CONSTANT
%FORMAT(5E16.8)
8.00000000E-01 0.00000000E+00 8.00000000E-02
%FLAG DIHEDRAL_PERIODICITY
%FORMAT(5E16.8)
1.00000000E+00 2.00000000E+00 3.00000000E+00
%FLAG DIHEDRAL_PHASE
%FORMAT(5E16.8)
0.00000000E+00 0.00000000E+00 3.14159400E+00
%FLAG SCEE_SCALE_FACTOR
%FORMAT(5E16.8)
1.20000000E+00 1.20000000E+00 1.20000000E+00
%FLAG SCNB_SCALE_FACTOR
%FORMAT(5E16.8)
2.00000000E+00 2.00000000E+00 2.00000000E+00
%FLAG SOLTY
%FORMAT(5E16.8)
0.00000000E+00 0.00000000E+00 0.00000000E+00 0.00000000E+00
%FLAG LENNARD_JONES_ACOEF
%FORMAT(5E16.8)
7.51607703E+03 9.71708117E+04 1.04308023E+06 8.61541883E+04 9.24822270E+05
8.19971662E+05 5.44261042E+04 6.47841731E+05 5.74393458E+05 3.79876399E+05
%FLAG LENNARD_JONES_BCOEF
%FORMAT(5E16.8)
2.17257828E+01 1.26919150E+02 6.75612247E+02 1.12529845E+02 5.99015525E+02
5.31102864E+02 1.11805549E+02 6.26720080E+02 5.55666448E+02 5.64885984E+02
%FLAG BONDS_INC_HYDROGEN
%FORMAT(10I8)
3 6 2 3 9 2 0 3 2
%FLAG BONDS_WITHOUT_HYDROGEN
%FORMAT(10I8)
12 15 1 3 12 3
%FLAG ANGLES_INC_HYDROGEN
%FORMAT(10I8)
9 3 12 1 6 3 9 2 6 3
12 1 0 3 6 2 0 3 9 2
0 3 12 1
%FLAG ANGLES_WITHOUT_HYDROGEN
%FORMAT(10I8)
3 12 15 3
%FLAG DIHEDRALS_INC_HYDROGEN
%FORMAT(10I8)
9 3 12 15 1 9 3 -12 15 2
9 3 -12 15 3 6 3 12 15 1
6 3 -12 15 2 6 3 -12 15 3
0 3 12 15 1 0 3 -12 15 2
0 3 -12 15 3
%FLAG DIHEDRALS_WITHOUT_HYDROGEN
%FORMAT(10I8)

%FLAG EXCLUDED_ATOMS_LIST
%FORMAT(10I8)
2 3 4 5 6 3 4 5 6 4
5 6 5 6 6 0
%FLAG HBOND_ACOEF
%FORMAT(5E16.8)

%FLAG HBOND_BCOEF
%FORMAT(5E16.8)

%FLAG HBCUT
%FORMAT(5E16.8)

%FLAG AMBER_ATOM_TYPE
%FORMAT(20a4)
HC CT HC HC C O
%FLAG TREE_CHAIN_CLASSIFICATION
%FORMAT(20a4)
M M E E M E
%FLAG JOIN_ARRAY
%FORMAT(10I8)
0 0 0 0 0 0
%FLAG IROTAT
%FORMAT(10I8)
0 0 0 0 0 0
%FLAG RADIUS_SET
%FORMAT(1a80)
ArgH and AspGluO modified Bondi2 radii (mbondi3)
%FLAG RADII
%FORMAT(5E16.8)
1.20000000E+00 1.70000000E+00 1.20000000E+00 1.20000000E+00 1.70000000E+00
1.50000000E+00
%FLAG SCREEN
%FORMAT(5E16.8)
8.50000000E-01 7.20000000E-01 8.50000000E-01 8.50000000E-01 7.20000000E-01
8.50000000E-01
%FLAG IPOL
%FORMAT(1I8)
0
Binary file not shown.
Binary file not shown.
Loading

0 comments on commit 2acd594

Please sign in to comment.