Source code for pwem.convert.atom_struct

# **************************************************************************
# *
# * Authors:     Roberto Marabini (roberto@cnb.csic.es)
# *              Marta Martinez (mmmtnez@cnb.csic.es)
# *
# * Unidad de  Bioinformatica of Centro Nacional de Biotecnologia , CSIC
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 3 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# * 02111-1307  USA
# *
# *  All comments concerning this program package may be sent to the
# *  e-mail address 'scipion@cnb.csic.es'
# *
# **************************************************************************
#
# cif to pdb conversion is based on  cif2pdb.py by Spencer Bliven
# <spencer.bliven@gmail.com>
#
# see http://biopython.org/DIST/docs/tutorial/Tutorial.html for a description
# on the object "structure" and other Bio.xxxx modules

from Bio.PDB.Dice import ChainSelector
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB import PDBIO, MMCIFIO
from Bio.PDB.mmcifio import mmcif_order
from Bio.PDB import Superimposer
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from Bio.PDB import Entity
# PDBList is a biopython function but it not longer works
# so we fix it and now is in this module
from .pdb_list import PDBList
from collections import OrderedDict
from Bio.PDB.Polypeptide import is_aa
from Bio.PDB.Polypeptide import three_to_one
from Bio.Seq import Seq
from pwem import Plugin, MAXIT_HOME
from pwem.convert.transformations import translation_from_matrix
import mmap
import re
import os
import numpy
import pyworkflow.utils as pwutils
import shutil

from pwem.objects.data import Alphabet



SECTION = '_scipion_attributes'
NAME, RECIP, SPEC, VALUE = SECTION + '.name', SECTION + '.recipient', SECTION + '.specifier', SECTION + '.value'

[docs]class OutOfChainsError(Exception):
    pass


[docs]class scipionMMCIFIO(MMCIFIO):

    # this class was needed before including maxit
    # as converter.
    # TODO: Delete it after a more extensive testing period
    # DATE mar feb 18 17:48:57 CET 2020

    def _save_dict_delete(self, out_file):
        # Form dictionary where key is first part of mmCIF key and value is list
        # of corresponding second parts
        key_lists = {}
        for key in self.dic:
            if key == "data_":
                data_val = self.dic[key]
            else:
                s = re.split(r"\.", key)
                if len(s) == 2:
                    if s[0] in key_lists:
                        key_lists[s[0]].append(s[1])
                    else:
                        key_lists[s[0]] = [s[1]]
                else:
                    raise ValueError("Invalid key in mmCIF dictionary: " + key)

        # Re-order lists if an order has been specified
        # Not all elements from the specified order are necessarily present
        for key, key_list in key_lists.items():
            # P3
            # for key, key_list in list(key_lists.items()):
            if key in mmcif_order:
                inds = []
                for i in key_list:
                    try:
                        inds.append(mmcif_order[key].index(i))
                    # Unrecognised key - add at end
                    except ValueError:
                        inds.append(len(mmcif_order[key]))
                key_lists[key] = [k for _, k in sorted(zip(inds, key_list))]

        # Write out top data_ line
        if data_val:
            out_file.write("data_" + data_val + "\n#\n")
            # ESCRIBIR POLYSEQTABLE
            out_file.write("""loop_
_entity_poly_seq.entity_id
_entity_poly_seq.num
_entity_poly_seq.mon_id
_entity_poly_seq.hetero\n#\n""")
            total_seqs = []
            counter_chain = 1
            for model in self.structure:
                for chain in model:
                    min_val = int(chain.get_unpacked_list()[0].id[1])
                    # max_val = min_val + len(chain.get_unpacked_list())
                    #                    not_are = range(1, min_val-1)
                    #                    for i in range(min_val)[:-1]:
                    #                        not_are.append(i + 1)
                    counter = 1
                    if min_val > 1:
                        for counter in range(1, min_val):
                            if len(chain.get_unpacked_list()[0].resname.strip()) == 3:
                                total_seqs.append((counter_chain, str(counter), "XAA", "n"))
                            elif len(chain.get_unpacked_list()[0].resname.strip()) == 2:
                                total_seqs.append((counter_chain, str(counter), "DX", "n"))
                            elif len(chain.get_unpacked_list()[0].resname.strip()) == 1:
                                total_seqs.append((counter_chain, str(counter), "X", "n"))
                            counter += 1
                    for residue in chain:
                        if len(chain.get_unpacked_list()[0].resname.strip()) == 3 and \
                                is_aa(residue.get_resname(), standard=True):  # aminoacids
                            total_seqs.append((counter_chain, residue.id[1], residue.get_resname(), "n"))
                        elif len(chain.get_unpacked_list()[0].resname.strip()) == 2 and \
                                residue.get_resname()[1] in ['A', 'C', 'G', 'T']:
                            total_seqs.append((counter_chain, counter, residue.get_resname(), "n"))
                        elif len(chain.get_unpacked_list()[0].resname.strip()) == 1 and \
                                residue.get_resname() in ['A', 'C', 'G', 'U']:
                            total_seqs.append((counter_chain, counter, residue.get_resname(), "n"))
                        counter += 1
                    counter_chain += 1
            for item in total_seqs:
                item = list(item)
                out_file.write("""%s %s %s  %s\n""" %
                               (str(item[0]), item[1], item[2], item[3]))

        for key, key_list in key_lists.items():
            #  for key, key_list in list(key_lists.items()):
            # Pick a sample mmCIF value, which can be a list or a single value
            sample_val = self.dic[key + "." + key_list[0]]
            n_vals = len(sample_val)
            # Check the mmCIF dictionary has consistent list sizes
            for i in key_list:
                val = self.dic[key + "." + i]
                if (isinstance(sample_val, list) and (isinstance(val, str) or len(val) != n_vals)) or (
                        isinstance(sample_val, str) and isinstance(val, list)):
                    raise ValueError("Inconsistent list sizes in mmCIF dictionary: " + key + "." + i)
            # If the value is a single value, write as key-value pairs
            if isinstance(sample_val, str):
                m = 0
                # Find the maximum key length
                for i in key_list:
                    if len(i) > m:
                        m = len(i)
                for i in key_list:
                    out_file.write(
                        "{k: <{width}}".format(k=key + "." + i, width=len(key) + m + 4) + self._format_mmcif_col(
                            self.dic[key + "." + i], len(self.dic[key + "." + i])) + "\n")
            # If the value is a list, write as keys then a value table
            elif isinstance(sample_val, list):
                out_file.write("loop_\n")
                col_widths = {}
                # Write keys and find max widths for each set of values
                for i in key_list:
                    out_file.write(key + "." + i + "\n")
                    col_widths[i] = 0
                    for val in self.dic[key + "." + i]:
                        len_val = len(val)
                        # If the value requires quoting it will add 2 characters
                        if self._requires_quote(val) and not self._requires_newline(val):
                            len_val += 2
                        if len_val > col_widths[i]:
                            col_widths[i] = len_val
                # Technically the max of the sum of the column widths is 2048

                # Write the values as rows
                for i in range(n_vals):
                    for col in key_list:
                        out_file.write(self._format_mmcif_col(self.dic[key + "." + col][i], col_widths[col] + 1))
                    out_file.write("\n")
            else:
                raise ValueError("Invalid type in mmCIF dictionary: " + str(type(sample_val)))
            out_file.write("#\n")


[docs]class AtomicStructHandler:
    """ Class that contain utilities to handle pdb/cif files"""
    PDB = 0
    CIF = 1

    def __init__(self, fileName=None, permissive=1):
        # The PERMISSIVE flag indicates that a number of common problems
        # associated with PDB files will be ignored
        self.permissive = permissive
        self.pdbParser = None
        self.cifParser = None
        self.ioPDB = None
        self.ioCIF = None
        self.structure = None
        self._readDone = False
        if fileName is not None:
            self.read(fileName)

[docs]    def readFromPDBDatabase(self, pdbID, dir=None, type='mmCif'):
        """
        Retrieve structure from PDB
        :param pdbID:
        :param dir: save structure in this directory
        :param type:  mmCif or pdb
        :return: filename with pdb file
        """
        if dir is None:
            dir = os.getcwd()
        pdbl = PDBList(pdb=dir)
        for counter in range(5):
            try:
                fileName = pdbl.retrieve_pdb_file(pdbID, pdir=dir, file_format=type)
                break  # file has been retrived do not retry download
            except Exception as error:
                print("Error: ", error)
                print("Retry connection", counter)
                if counter == 4:
                    raise
        return os.path.abspath(fileName)

[docs]    def checkLabelInFile(self, fileName, label):
        with open(fileName, "r") as f:
            s = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
            if s.find(label) != -1:
                return True
            else:
                return False

[docs]    def getStructure(self):
        """return structure information, model, chain,
           residues, atoms..."""
        return self.structure

[docs]    def read(self, fileName):
        """ Read and parse file."""
        # biopython assigns an ID to any read structure
        structure_id = os.path.basename(fileName)
        structure_id = structure_id[:4] if len(structure_id) > 4 else "1xxx"

        if fileName.endswith(".pdb") or fileName.endswith(".ent"):
            if self.pdbParser is None:
                self.pdbParser = PDBParser(PERMISSIVE=self.permissive)
            parser = self.pdbParser
            self.type = self.PDB
        else:
            if self.cifParser is None:
                self.cifParser = MMCIFParser()
            parser = self.cifParser
            self.type = self.CIF

        self.structure = parser.get_structure(structure_id, fileName)
        self._readDone = True

[docs]    def checkRead(self):
        if self._readDone:
            return True
        else:
            print("you must read the pdb file first")
            exit(0)

[docs]    def getModelsChains(self):
        """
        given an atomic structure returns two dictionaries:
            (1) for all models and respective chains (chainID and length of residues)
            (2) for each chain list of residues
        """
        self.checkRead()
        listOfChains = OrderedDict()
        listOfResidues = OrderedDict()

        for model in self.structure:
            chainDicLength = OrderedDict()
            chainDicFirstResidue = OrderedDict()
            for chain in model:
                if len(chain.get_unpacked_list()[0].resname.strip()) == 1:  # RNA
                    seq = list()
                    seq_number = list()
                    for residue in chain:
                        if residue.get_resname() in ['A', 'C', 'G', 'U']:
                            seq.append(residue.get_resname())
                        else:
                            seq.append("X")
                        seq_number.append((residue.get_id()[1], residue.get_resname()))
                elif len(chain.get_unpacked_list()[0].resname.strip()) == 2:  # DNA
                    seq = list()
                    seq_number = list()
                    for residue in chain:
                        if residue.get_resname()[1] in ['A', 'C', 'G', 'T']:
                            seq.append(residue.get_resname()[1])
                        else:
                            seq.append("X")
                        seq_number.append((residue.get_id()[1], residue.get_resname()))
                elif len(chain.get_unpacked_list()[0].resname.strip()) == 3:  # Protein
                    seq = list()
                    seq_number = list()
                    counter = 0
                    for residue in chain:
                        if is_aa(residue.get_resname(), standard=True):  # aminoacids
                            seq.append(three_to_one(residue.get_resname()))
                            counter += 1
                        else:
                            seq.append("X")
                        seq_number.append((residue.get_id()[1], residue.get_resname()))
                    if counter == 0:  # HETAM
                        for residue in chain:
                            seq.append(residue.get_resname())
                while seq[-1] == "X":
                    del seq[-1]
                while seq[0] == "X":
                    del seq[0]
                chainDicLength[chain.id] = len(seq)
                chainDicFirstResidue[chain.id] = seq_number
            listOfChains[model.id] = chainDicLength
            listOfResidues[model.id] = chainDicFirstResidue

        return listOfChains, listOfResidues

[docs]    def getSequenceFromChain(self, modelID, chainID, returnAlphabet=False):
        self.checkRead()
        seq = list()
        for model in self.structure:
            if model.id == modelID:
                for chain in model:
                    if str(chain.id) == chainID:
                        if len(chain.get_unpacked_list()[0].resname) == 1:
                            print("Your sequence is a nucleotide sequence ("
                                  "RNA)\n")
                            alphabet = Alphabet.UNAMBIGOUS_RNA_ALPHABET
                            for residue in chain:
                                # Check if the residue belongs to the
                                # standard RNA and add those residues to the
                                # seq
                                if residue.get_resname() in ['A', 'C',
                                                             'G', 'U']:
                                    seq.append(residue.get_resname())
                                else:
                                    seq.append("X")
                        elif len(chain.get_unpacked_list()[0].resname) == 2:
                            print("Your sequence is a nucleotide sequence ("
                                  "DNA)\n")
                            alphabet = Alphabet.UNAMBIGOUS_DNA_ALPHABET
                            for residue in chain:
                                # Check if the residue belongs to the
                                # standard DNA and add those residues to the
                                # seq
                                if residue.get_resname()[1] in ['A', 'C',
                                                                'G', 'T']:
                                    seq.append(residue.get_resname()[1])
                                else:
                                    seq.append("X")
                        elif len(chain.get_unpacked_list()[0].resname) == 3:
                            counter = 0
                            for residue in chain:
                                if is_aa(residue.get_resname(), standard=True):
                                    alphabet = Alphabet.EXTENDED_PROTEIN_ALPHABET
                                    # The test checks if the amino acid
                                    # is one of the 20 standard amino acids
                                    # Some proteins have "UNK" or "XXX", or other symbols
                                    # for missing or unknown residues
                                    seq.append(three_to_one(residue.get_resname()))
                                    counter += 1
                                else:
                                    seq.append("X")
                            if counter != 0:  # aminoacids
                                print("Your sequence is an aminoacid sequence")
                            else:  # HETAM
                                print("Your sequence is a HETAM sequence")
                                for residue in chain:
                                    seq.append(residue.get_resname())
                        while seq[-1] == "X":
                            del seq[-1]
                        while seq[0] == "X":
                            del seq[0]
                        # return Seq(str(''.join(seq)), alphabet=alphabet)
                        if returnAlphabet:
                            return Seq(str(''.join(seq))), alphabet
                        else:
                            return Seq(str(''.join(seq)))

[docs]    def getFullID(self, model_id='0', chain_id=None):
        """
        assign a label to a sequence obtained from a PDB file
        :parameter
        model_id
        chain_id
        :return: string with label
        """
        self.checkRead()  # cehck we have read the structure
        label = "%s" % self.structure.get_id()  # PDB_ID
        # check how many model are in sequence if > 1 add to id
        models = self.getModelsChains()
        if len(models) > 1:
            label = label + "__%s" % str(model_id)
        # if chain_id is None do not add it to te name
        if chain_id is not None:
            label = label + "_%s" % str(chain_id)
        return label

[docs]    def getStructureBFactorValues(self):
        """Using an atomic structure as input, this method returns two list:
        * List 1: B-factor field values for each atom of the structure.
        * List 2 is a list of lists. For each residue of the structure a list with
        the model, chain, order number of residue, name of residue and average of
        B-factor field values (atoms) is returned. An example
        of each list item is:
        [0, 'A', 103, 'PHE', 96.01]"""
        self.checkRead()
        listOfBFactors = []
        listOfResiduesBFactors = []
        for model in self.structure:
            for chain in model:
                for residue in chain:
                    residue_list = []
                    residue_list.append(model.id)
                    residue_list.append(chain.id)
                    residue_list.append(residue.id[1])
                    residue_list.append(residue.get_resname())
                    sum_list = 0.00
                    residue_atom_b_factor_list = []
                    for atom in residue:
                        listOfBFactors.append(atom.get_bfactor())
                        residue_atom_b_factor_list.append(atom.get_bfactor())
                    for item in residue_atom_b_factor_list:
                        sum_list += item
                    avg = sum_list / len(residue_atom_b_factor_list)
                    residue_list.append(round(avg, 2))
                    listOfResiduesBFactors.append(residue_list)
        return listOfBFactors, listOfResiduesBFactors

[docs]    def readLowLevel(self, fileName):
        """ Return a dictionary with all mmcif fields. you should parse them
            Example: get the list of the y coordinates of all atoms
              dict = readLowLevel("kk.pdb")
              y_list = dict['_atom_site.Cartn_y']
        """

        if fileName.endswith(".pdb"):
            print("Low level access to PDB is not implemented")
        else:
            dict = MMCIF2Dict(fileName)
        return dict

    def _write(self, fileName):
        """ Do not use this function use toPDB or toCIF, they take care of some
        compatibiity issues"""
        if fileName.endswith(".pdb") or fileName.endswith(".ent"):
            if self.ioPDB is None:
                self.ioPDB = PDBIO()
            io = self.ioPDB
        else:
            if self.ioCIF is None:
                self.ioCIF = scipionMMCIFIO()
                # self.ioCIF = MMCIFIO()
            io = self.ioCIF
        io.set_structure(self.structure)
        io.save(fileName)

    def _writeLowLevel(self, fileName, dict):
        """ write a dictionary as cif file
        """

        if fileName.endswith(".pdb"):
            print("Low level access to PDB is not implemented")
        else:
            if self.ioCIF is None:
                self.ioCIF = MMCIFIO()
            io = self.ioCIF
        io.set_dict(dict)
        io.save(fileName)

    def _intToChain(self, i, base=62):
        """
        int_to_chain(int,int) -> str
        Converts a positive integer to a chain ID. Chain IDs include uppercase
        characters, numbers, and optionally lowercase letters.
        i = a positive integer to convert
        base = the alphabet size to include. Typically 36 or 62.
        """
        if i < 0:
            raise ValueError("positive integers only")
        if base < 0 or 62 < base:
            raise ValueError("Invalid base")

        quot = int(i) // base
        rem = i % base
        if rem < 26:
            letter = chr(ord("A") + rem)
        elif rem < 36:
            letter = str(rem - 26)
        else:
            letter = chr(ord("a") + rem - 36)
        if quot == 0:
            return letter
        else:
            return self._intToChain(quot - 1, base) + letter

[docs]    def renameChains(self, structure):
        """Renames chains to be one-letter chains

        Existing one-letter chains will be kept.
        Multi-letter chains will be truncated
        or renamed to the next available letter of the alphabet.

        If more than 62 chains are present in the structure,
        raises an OutOfChainsError

        Returns a map between new and old chain IDs, as well as modifying
        the input structure

        """
        next_chain = 0  #
        # single-letters stay the same
        chainmap = {c.id: c.id
                    for c in structure.get_chains() if len(c.id) == 1}
        for o in structure.get_chains():
            if len(o.id) != 1:
                if o.id[0] not in chainmap:
                    chainmap[o.id[0]] = o.id
                    o.id = o.id[0]
                else:
                    c = self._intToChain(next_chain)
                    while c in chainmap:
                        next_chain += 1
                        c = self._intToChain(next_chain)
                        if next_chain >= 62:
                            raise OutOfChainsError()
                    chainmap[c] = o.id
                    o.id = c
        return chainmap

[docs]    def write(self, fileName):
        if fileName.endswith(".pdb") or fileName.endswith(".ent"):
            self.writeAsPdb(fileName)
        else:
            self.writeAsCif(fileName)

[docs]    def writeAsPdb(self, pdbFile):
        """ Save structure as PDB. Be aware that this is not a lossless conversion
        Returns False is conversion is not possible. True otherwise
        """
        # check input is not PDB
        if self.type == self.PDB:
            pass
        else:
            # rename long chains
            try:
                chainmap = self.renameChains(self.structure)
            except OutOfChainsError:
                print("Too many chains to represent in PDB format")
                return False

            for new, old in chainmap.items():
                # for new, old in list(chainmap.items()):
                if new != old:
                    print("Renaming chain {0} to {1}".format(old, new))

        self._write(pdbFile)

        return True

[docs]    def writeAsCif(self, cifFile):
        """ Save structure as CIF.
            Be aware that this is not a lossless conversion
        """
        self._write(cifFile)

[docs]    def getBoundingBox(self, expand=3):
        """Get bounding box (angstroms) for atom struct.
         Only alpha carbons are taken into account.
         parameter: expand.- make box larger addind this factor
         """
        # Use the first model
        ref_model = self.getStructure()[0]
        # init bounding box volues
        xmin = 100000; ymin = xmin; zmin = xmin
        xmax = -100000; ymax = xmax; zmax = xmax
        # iterate for all aminoacids
        for chain in ref_model:
            for residue in chain:
                (x,y,z) = residue["CA"].get_coord()
                if x < xmin: xmin = x
                if x > xmax: xmax = x

                if y < ymin: ymin = y
                if y > ymax: ymax = y

                if z < zmin: zmin = z
                if z > zmax: zmax = z
            # DEBUG
            # with open("/tmp/kk.bild", "w") as file:
            #     file.write(".transparency 0.8\n"
            #                ".color red\n"
            #                ".box %f %f %f %f %f %f\n" %
            #                (xmin - expand , ymin - expand, zmin - expand,
            #                xmax + expand, ymax + expand, zmax + expand))
        return [[xmin - expand, ymin - expand, zmin - expand],
                [xmax + expand, ymax + expand, zmax + expand]]


[docs]    def getTransformMatrix(self, atomStructFn, startId=-1, endId=-1, outFn=None):
        """find matrix that Superimposes two atom structures.
        this matrix moves atomStructFn to self """
        if endId == -1:
            endId = 10000000
        # load second atom structure
        aSH = AtomicStructHandler()
        aSH.read(atomStructFn)

        # Use the first model in the atom struct for alignment
        ref_model = self.getStructure()[0]
        sample_model = aSH.getStructure()[0]

        # Make a list of the atoms (in the structures) you wish to align.
        # In this case we use CA atoms whose index is in the
        # specified range (starId, endId)
        ref_atoms = []
        sample_atoms = []

        # Now get a list with CA atoms (alpha-carbon)
        # I assume that both atom structs have the same number of chains
        # and in the same order.
        for ref_chain, sample_chain in zip(ref_model, sample_model):
            # create a set with the id of all residues
            # for the current chain
            ref_set_id = set(res.get_id()[1] for res in ref_chain)
            sample_set_id = set(res.get_id()[1] for res in sample_chain)
            # keep the intersection as an ordered list (sets are not ordered)
            ref_set_id.intersection_update(sample_set_id)
            ref_list_id = sorted(ref_set_id)
            # delete AA smaller or large than the predefined values
            ref_list_id[:] = [x for x in ref_list_id
                              if x >= startId and x <= endId]

            # Iiterate through all residues in each chain a store CA atoms
            for id in ref_list_id:
                if id in ref_chain and id in sample_chain:
                    ref_atoms.append(ref_chain[id]['CA'])
                    sample_atoms.append(sample_chain[id]['CA'])
        # Now we initiate the superimposer:
        super_imposer = Superimposer()
        super_imposer.set_atoms(ref_atoms, sample_atoms)
        # super_imposer.apply(sample_model.get_atoms())
        (rot, trans) = super_imposer.rotran
        # DEBUG, uncomment next two lines to see
        # transformation applied to atomStructFn
        if outFn:
            aSH.getStructure().transform(rot, trans)
            aSH.write(outFn)

        # convert 3x3 rotation matrix to homogeneous matrix
        rot = numpy.transpose(rot)  # scipion and biopython use
                                    # different conventions
        tmp = numpy.r_[rot, numpy.zeros((1, 3))]
        mat = numpy.c_[tmp, numpy.array(
            [[trans[0]], [trans[1]], [trans[2]], [1]])]
        return mat, super_imposer.rms

[docs]    def centerOfMass(self, geometric=False):
        """
        Returns gravity [default] or geometric center of mass of an Entity
        (anything with a get_atoms function in biopython.
        Geometric assumes all masses are equal (geometric=True)
        """
        entity = self.structure
        # Structure, Model, Chain, Residue
        if isinstance(entity, Entity.Entity):
            atom_list = entity.get_atoms()
        # List of Atoms
        elif hasattr(entity, '__iter__') and \
                [x for x in entity if x.level == 'A']:
            atom_list = entity
        else:  # Some other weirdo object
            raise ValueError("Center of Mass can only be calculated "
                             "from the following objects:\n"
                             "Structure, Model, Chain, Residue, "
                             "list of Atoms.")

        masses = []
        positions = [[], [], []]

        for atom in atom_list:
            masses.append(atom.mass)

            for i, coord in enumerate(atom.coord.tolist()):
                positions[i].append(coord)

        # If there is a single atom with undefined mass complain loudly.
        if 'ukn' in set(masses) and not geometric:
            raise ValueError("Some Atoms don't have an element assigned.\n"
                             "Try adding them manually or calculate the "
                             "geometrical center of mass instead.")

        if geometric:
            return [sum(coord_list) / len(masses) for coord_list in positions]
        else:
            w_pos = [[], [], []]
            for atom_index, atom_mass in enumerate(masses):
                w_pos[0].append(positions[0][atom_index] * atom_mass)
                w_pos[1].append(positions[1][atom_index] * atom_mass)
                w_pos[2].append(positions[2][atom_index] * atom_mass)

            return [sum(coord_list) / sum(masses) for coord_list in w_pos]

[docs]    def transform(self, transformation_matrix, sampling=1.):
        """ Geometrical transformation of a PDB structure

        :param entity: PDB biopython structure
        :param transformation_matrix -> 4x4 scipion matrix
        :param sampling: scipion transform matrix is applied to voxels so
              length must be multiplied by samplingRate

        internal variables:
             rotation matrix -> numpy.array(
                      [[ 0.5, -0.809017,  0.309017],
                       [ 0.809017,  0.30917,  -0.5     ],
                       [ 0.309017,  0.5,       0.809017]])
             translation: translation vector -> numpy.array([1., 0., 0.], 'd')
        :return: no return, new data overwrites entity
        """
        # bioPhython and Scipion conventions do not match
        rotation_matrix = numpy.transpose(transformation_matrix[:3, :3])
        # from geometry get euler angles and recreate matrix
        translation = translation_from_matrix(transformation_matrix)
        translation = [x * sampling for x in translation]
        self.structure.transform(rotation_matrix, translation)

    def _renameChainsIfNeed(self, struct2):
        """Rename chain, we assume that there is a single model per structure"""
        repeated = False

        def RepresentsInt(s):
            try:
                int(s)
                return True
            except ValueError:
                return False

        import uuid
        chainIDs1 = [chain.id for chain in self.structure.get_chains()]
        for chain in struct2.get_chains():
            counter = 2
            if chain.id in chainIDs1:
                repeated = True
                cId = chain.id
                l = len(cId)
                if l == 1:
                    while True:
                        try:
                            chain.id = "%s%03d" % (cId, counter)
                            break
                        except ValueError:
                            counter +=1
                            if counter > 1000:
                                raise ValueError('Error in _renameChainsIfNeed.')
                elif RepresentsInt(cId[1:]):  # try to fit a number and increase it by one
                    chain.id = "%s%03d" % (cId[0], int(cId[1:]) + 1)
                else:  # generate a 4 byte random string
                    chain.id = uuid.uuid4().hex[:4]
        if repeated:
            self._renameChainsIfNeed(struct2)

[docs]    def addStruct(self, secondPDBfileName, outPDBfileName=None, useModel=False):
        """ Join the second structure to the first one.
            If cheon numes are the same rename them.
            if outPDBfileName id provided then new
            struct is saved to a file"""
        # read new structure
        if outPDBfileName is not None:
            pdbID = (os.path.splitext(os.path.basename(outPDBfileName))[0])[:4]
        else:
            pdbID = (os.path.splitext(os.path.basename(secondPDBfileName))[0])[:4]

        if secondPDBfileName.endswith(".pdb") or secondPDBfileName.endswith(".ent"):
            parser = PDBParser(PERMISSIVE=self.permissive)
        else:
            parser = MMCIFParser()

        struct2 = parser.get_structure(pdbID, secondPDBfileName)

        if useModel:
            modelNumber = 0
            modelID = 0
            # model.id = model.serial_num = len(self.structure)?  # not sure this
            # is valid always
            for model in self.structure:
                pass
            modelNumber = model.serial_num
            modelID = model.id
            for model in struct2:
                modelNumber += 1
                modelID += 1
                model.detach_parent()
                model.serial_num = modelNumber
                model.id = modelID
                self.structure.add(model)
        else:
            self._renameChainsIfNeed(struct2)

            for model in struct2:
                for chain in model:
                    chain.detach_parent()
                    self.structure[0].add(chain)

        # create new output file
        if outPDBfileName is not None:
            self.write(outPDBfileName)

[docs]    def extractChain(self, chainID, start=0, end=-1,
                     modelID='0', filename="output.mmcif"):
        """Code for chopping  a structure.
        This module is used internally by the Bio.PDB.extract() function.
        """
        sel = ChainSelector(chain_id=chainID, start=start,
                            end=end, model_id=int(modelID))
        io = scipionMMCIFIO()
        io.set_structure(self.structure)
        io.save(filename, sel)

[docs]    def renameChain(self, chainID, newChainName, modelID='0',
                    filename="output.mmcif"):
        self.structure[modelID][chainID].id = newChainName
        self.write(filename)

[docs]    def renumberChain(self, chainID, offset=0, modelID='0',
                      filename="output.mmcif"):
        # get chain object
        chain = self.structure[modelID][chainID]
        # remove chain from model
        self.structure[modelID].detach_child(chainID)
        from Bio.PDB.Chain import Chain
        # create new chain
        newChain = Chain(chainID)
        for residue in chain:
            # remove residue, otherwise we cannot renumber it
            residue.detach_parent()
            rId = residue.id
            res_id = list(rId)
            res_id[1] = res_id[1] + offset
            if res_id[1] < 0:
                raise ValueError('Residue number cant be <= 0')
            residue.id = tuple(res_id)
            newChain.add(residue)
        self.structure[modelID].add(newChain)
        self.write(filename)


[docs]def cifToPdb(fnCif, fnPdb):
    h = AtomicStructHandler()
    h.read(fnCif)
    h.writeAsPdb(fnPdb)


[docs]def pdbToCif(fnPdb, fnCif):
    h = AtomicStructHandler()
    h.read(fnPdb)
    h.writeAsCif(fnCif)


[docs]def toPdb(inFileName, outPDBFile):
    if inFileName.endswith(".pdb") or inFileName.endswith(".ent"):
        return inFileName
    elif inFileName.endswith(".cif") or inFileName.endswith(".mmcif"):
        cifToPdb(inFileName, outPDBFile)
        return outPDBFile
    else:
        print("ERROR (toPdb), Unknown file type for file = %s" % inFileName)


[docs]def toCIF(inFileName, outCIFFile):
    if inFileName.endswith(".cif") or inFileName.endswith(".mmcif"):
        return inFileName
    elif inFileName.endswith(".pdb") or inFileName.endswith(".ent"):
        pdbToCif(inFileName, outCIFFile)
        return outCIFFile
    else:
        print("ERROR (toCIF), Unknown file type for file = %s" % inFileName)


[docs]def getEnviron():
    environ = pwutils.Environ(os.environ)
    environ.update({'RCSBROOT': os.path.join(Plugin.getMaxitHome())
                    }, position=pwutils.Environ.REPLACE)
    environ.update({'PATH': os.path.join(Plugin.getMaxitHome(), 'bin')
                    }, position=pwutils.Environ.BEGIN)
    return environ


def _frombase(inFileName, outFileName, log, oParam=1):
    # check if maxit exists,
    # if it does not then complain
    # convert pdb to cif using maxit
    global maxitAvailable
    try:
        maxitAvailable
    except:
        if not os.path.exists(Plugin.getMaxitBin()):
            maxitAvailable = False
            # show error message
        else:
            maxitAvailable = True

    if maxitAvailable:
        args = ' -input "' + inFileName + '" -output "' + outFileName + \
               '" -o %d' % oParam
        log.info('Launching: ' + Plugin.getMaxitBin() + args)
        # run in the background
        env = getEnviron()
        pwutils.runJob(None, Plugin.getMaxitBin(), args, env=env)
    else:
        # this is not the ideal conversion but it is better
        # than nothing
        aSH = AtomicStructHandler()
        aSH.read(inFileName)
        aSH.write(outFileName)
        # show error message
        print(pwutils.redStr("Please, install maxit with the command 'scipion installb maxit'"))
        print(pwutils.redStr("and restart scipion. Packages bison and flex are needed."))
        print(pwutils.redStr("If maxit is installed check %s in scipion.conf" % MAXIT_HOME))


[docs]def fromPDBToCIF(inFileName, outFileName, log):
    _frombase(inFileName, outFileName, log, 1)


[docs]def fromCIFToPDB(inFileName, outFileName, log):
    _frombase(inFileName, outFileName, log, 2)


[docs]def fromCIFTommCIF(inFileName, outFileName, log):
    _frombase(inFileName, outFileName, log, 8)

[docs]def testLog(log, messages= None, sdterrLog = None):
    # Method to capture exceptions like error messages from Phenix when map and model are far apart
    if sdterrLog is None or messages is None:
        return

    # read all content of a file
    for line in sdterrLog(logFile=1):
        # check if string present in a file
        for message in messages:
            if message[0] in line:
                log.info(pwutils.redStr("Error:  Protocol aborted "))
                log.info(pwutils.redStr(f"Error: {message[1]}"))
                raise Exception(pwutils.redStr("Error: %s" % message[1]))
        if "Sorry" in line:
            log.info(pwutils.redStr("WARNING, %s" % line))

[docs]def retry(runEnvirom, program, args, cwd, listAtomStruct=[], log=None, clean_dir=None, messages=[], sdterrLog=None):
    messages.append(("Sorry: Input map is all zero after boxing", "Sorry: Input map is all zero after boxing"))

    try:
        runEnvirom(program, args, cwd=cwd)
    except:
        testLog(log, messages,sdterrLog)

        # first remove every files or directories in the extra folder,
        # except maps
        partiallyCleaningFolder(program, cwd)
        # something went wrong, may be bad atomStruct format
        log.info('retry with maxit conversion')

        for i, atomStructName in enumerate(listAtomStruct):
            if atomStructName.endswith(".pdb.cif"):
                aSH = AtomicStructHandler()
                aSH.read(atomStructName)
                aSH.write(atomStructName)
                try:
                    runEnvirom(program, args, cwd=cwd)
                except:
                    testLog(log, messages,sdterrLog)
            else:
                try:
                    if atomStructName.endswith(".pdb"):
                        try:
                            newAtomStructName = atomStructName.replace(atomStructName.split(".")[-1],"cif")
                            _args = args.replace(atomStructName, newAtomStructName)
                            runEnvirom(program, _args, cwd=cwd)
                        except:
                            newAtomStructName = os.path.abspath(
                                os.path.join(cwd, "retrypdb%d.cif" % i))
                            fromPDBToCIF(atomStructName, newAtomStructName, log)
                            _args = args.replace(atomStructName, newAtomStructName)
                            try:
                                runEnvirom(program, _args, cwd=cwd)
                            except:
                                testLog(log, messages,sdterrLog)
                                fromCIFTommCIF(newAtomStructName, newAtomStructName, log)
                                runEnvirom(program, _args, cwd=cwd)
                    elif atomStructName.endswith(".cif"):
                        try:
                            newAtomStructName = atomStructName.replace(atomStruct.split(".")[-1],"pdb")
                            _args = args.replace(atomStructName, newAtomStructName)
                            runEnvirom(program, _args, cwd=cwd)
                        except:
                            newAtomStructName = os.path.abspath(
                                os.path.join(cwd, "retrycif%d.cif" % i))
                            fromCIFTommCIF(atomStructName, newAtomStructName, log)
                            _args = args.replace(atomStructName, newAtomStructName)
                            try:
                                runEnvirom(program, _args, cwd=cwd)
                            except:
                                testLog(log, messages,sdterrLog)
                                newAtomStructName = os.path.abspath(
                                    os.path.join(cwd, "retrycif%d.pdb" % i))
                                fromCIFToPDB(atomStructName, newAtomStructName, log)
                                _args = args.replace(atomStructName, newAtomStructName)
                                runEnvirom(program, _args, cwd=cwd)
                                testLog(log, messages,sdterrLog)
                except:
                    testLog(log, messages,sdterrLog)
                    # first remove files or directories in the extra folder,
                    # except maps
                    partiallyCleaningFolder(program, cwd)
                    # biopython conversion
                    aSH = AtomicStructHandler()
                    if atomStructName.endswith(".pdb") or atomStructName.endswith(".ent"):
                        newAtomStructName = atomStructName.replace(".pdb", ".cif"). \
                            replace(".ent", ".cif")
                    else:
                        newAtomStructName = atomStructName
                    try:
                        aSH.read(atomStructName)
                        aSH.write(newAtomStructName)
                        _args = args.replace(atomStructName, newAtomStructName)
                        runEnvirom(program, _args, cwd=cwd)
                        testLog(log, messages,sdterrLog)
                    except:
                        print("CIF file standardisation failed.")
                        # atomStructName = newAtomStructName



# TODO this should no be here, ROB
[docs]def partiallyCleaningFolder(program, cwd):
    l1 = os.listdir(cwd)
    l2 = ['molprobity.out', 'molprobity_probe.txt', 'molprobity_coot.py']
    l3 = ['validation_cryoem.pkl']
    l4 = ['placed_model.pdb', 'placed_model.cif']

    for item in l1:
        if (program.endswith('real_space_refine.py') and
            (item.endswith('geo') or
             item.endswith('real_space_refined.log') or
             item.endswith('real_space_refine.pdb') or
             item.endswith('real_space_refined.cif'))) or \
                (program.endswith('molprobity.py') and item in l2) or \
                (program.endswith('validation_cryoem.py') and item in l3) or \
                (program.endswith('emringer.py') and
                 (item.endswith("emringer.csv") or
                  item.endswith("emringer.pkl") or
                  item.endswith("emringer_plots") or
                  item.startswith('emringer_transfer'))) or \
                (program.endswith('dock_in_map.py') and item in l4):
            path1 = os.path.join(cwd, item)
            if os.path.exists(path1):
                if os.path.isfile(path1) or os.path.islink(path1):
                    os.remove(path1)
                elif os.path.isdir(path1):
                    shutil.rmtree(path1)

[docs]def addScipionAttribute(cifDic, attributeScoresDic, attrName, recipient='residues'):
    '''Add a scipion attribute in a section of the CIF dictionary
    "cifDic": must be a cif dictionary parsed by Biopython
    "attributeScoresDic" a dictionary of the form {spec: value}"'''
    if not NAME in cifDic:
        cifDic[NAME], cifDic[RECIP] = [], []
        cifDic[SPEC], cifDic[VALUE] = [], []
    for spec in attributeScoresDic:
        value = attributeScoresDic[spec]
        cifDic[NAME].append(attrName)
        cifDic[RECIP].append(recipient)
        cifDic[SPEC].append(spec)
        cifDic[VALUE].append(str(value))
    return cifDic