# Source code for pwem.protocols.protocol_import.sequence

# -*- coding: utf-8 -*-
# **************************************************************************
# *
# * Authors:     Marta Martinez (
# *              Roberto Marabini (
# *
# * Unidad de  Bioinformatica of Centro Nacional de Biotecnologia , CSIC
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 3 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# * 02111-1307  USA
# *
# *  All comments concerning this program package may be sent to the
# *  e-mail address ''
# *
# **************************************************************************

from os.path import exists
import os

import pyworkflow.protocol.params as params

from pwem.objects.data import Sequence, Alphabet
import pwem.convert as emconv
from pwem.convert import AtomicStructHandler

from .base import ProtImportFiles

class ProtImportSequence(ProtImportFiles):
    """ Protocol to import an aminoacid/nucleotide sequence file to the
    project"""
    _label = 'import sequence'
    # SEQUENCEFILENAME = '_sequence.fasta'

    # Choices of the 'inputProteinSequence' enum (aminoacid sequences)
    IMPORT_FROM_PLAIN_TEXT = 0
    IMPORT_FROM_STRUCTURE = 1
    IMPORT_FROM_FILES = 2
    IMPORT_FROM_UNIPROT = 3
    IMPORT_FROM_PROTEIN_GENEBANK = 4

    # Choices of the 'inputNucleotideSequence' enum (nucleotide sequences)
    IMPORT_FROM_NUCLEOTIDE_PLAIN_TEXT = 0
    IMPORT_FROM_NUCLEOTIDE_STRUCTURE = 1
    IMPORT_FROM_NUCLEOTIDE_FILES = 2
    IMPORT_FROM_NUCLEOTIDE_GENEBANK = 3

    # Choices of the 'inputStructureSequence' enum
    IMPORT_STRUCTURE_FROM_ID = 0
    IMPORT_STRUCTURE_FROM_FILES = 1

    url = ""

    def __init__(self, **args):
        ProtImportFiles.__init__(self, **args)

    def _defineParams(self, form):
        """ Declare the form parameters of the protocol.

        Most parameters are only shown for a given combination of
        sequence type ('inputSequence') and import source
        ('inputProteinSequence' / 'inputNucleotideSequence'); that
        visibility is expressed through the 'condition' strings.
        """
        form.addSection(label='Input')

        form.addParam('inputSequenceID', params.StringParam,
                      label="Sequence ID", allowsNull=True,
                      expertLevel=params.LEVEL_ADVANCED,
                      help="Write a sequence ID. Otherwise, if the "
                           "sequence derives from GenBank/UniProt/PDB "
                           "databases, the respective database ID will be "
                           "selected as starting sequence ID; examples: if "
                           "you select GenBank accession AJ520101, SCIPION "
                           "will assign AJ520101 as sequence ID; if "
                           "you select UniProt accession P12345, SCIPION will "
                           "assign P12345 as sequence ID; if you "
                           "select atomic structure 3lqd.cif, chain B, "
                           "SCIPION will assign 3lqd_B as sequence ID. In "
                           "the rest of cases, the Sequence name "
                           "will be selected as starting Sequence ID.")
        form.addParam('inputSequenceName', params.StringParam,
                      important=True,
                      label="Sequence name", allowsNull=False,
                      help="Write a sequence name.")
        form.addParam('inputSequenceDescription', params.StringParam,
                      label="Sequence description", allowsNull=True,
                      expertLevel=params.LEVEL_ADVANCED,
                      help="Write a description for your sequence. Otherwise, "
                           "if the "
                           "sequence derives from GenBank/UniProt/PDB "
                           "databases, the respective database description "
                           "will be "
                           "selected as starting sequence description. In "
                           "the rest of cases, no sequence description will "
                           "be added.")
        form.addParam('inputSequence', params.EnumParam,
                      pointerClass='Sequence',
                      choices=Alphabet.SEQ_TYPE,
                      display=params.EnumParam.DISPLAY_HLIST,
                      label="Import sequence of ",
                      default=Alphabet.AMINOACIDS,
                      help='Select the type of sequence to import.')

        # ---- aminoacid sequences -------------------------------------
        form.addParam('inputProteinSequence', params.EnumParam,
                      choices=['plain text', 'atomic structure', 'file',
                               'UniProt ID', 'NCBI/GenBank ID'],
                      display=params.EnumParam.DISPLAY_HLIST,
                      condition='inputSequence == %d' % Alphabet.AMINOACIDS,
                      label="From ",
                      default=self.IMPORT_FROM_PLAIN_TEXT,
                      # The enum offers five choices (the text used to
                      # say "four").
                      help='Select one of the five options: write the '
                           'aminoacid sequence or import it '
                           'from a previously loaded atomic structure, a local '
                           'file or an online server.')
        form.addParam('proteinIUPACalphabet', params.EnumParam,
                      # protein labels are the ones located before the
                      # first nucleotide label
                      choices=list(Alphabet.alphabetsLabels.values())[
                              :Alphabet.AMBIGOUS_DNA_ALPHABET],
                      display=params.EnumParam.DISPLAY_HLIST,
                      condition='inputSequence == %d and '
                                'inputProteinSequence == %d' %
                                (Alphabet.AMINOACIDS,
                                 self.IMPORT_FROM_PLAIN_TEXT),
                      label="IUPAC Protein alphabet: ",
                      default=Alphabet.EXTENDED_PROTEIN_ALPHABET,
                      help='Your raw sequence will be cleaned according '
                           'a certain alphabet, i.e., only the letters '
                           'contained in the alphabet will be maintained in '
                           'the sequence. Select thus the type of protein '
                           'alphabet in order to accomplish the '
                           'cleaning:\n\nProtein alphabet: IUPAC protein '
                           'alphabet of the 20 standard amino acids; uppercase'
                           ' and single letter: *ACDEFGHIKLMNPQRSTVWY*.\n\n'
                           'Extended Protein alphabet: Extended uppercase '
                           'IUPAC '
                           'protein single letter alphabet including X etc.\n'
                           'In addition to the standard 20 single letter '
                           'protein codes, this includes:\n*B = Asx*; '
                           'Aspartic acid (R) or Asparagine (N)\n*X = Xxx*"; '
                           'Unknown or other amino acid\n*Z = Glx*; Glutamic '
                           'acid (E) or Glutamine (Q)\n*J = Xle*; Leucine ('
                           'L) or Isoleucine (I), used in mass-spec (NMR)\n'
                           '*U = Sec*; Selenocysteine\n*O = Pyl*; '
                           'Pyrrolysine\nThis alphabet is not intended to be '
                           'used with X for Selenocysteine (an ad-hoc standard'
                           ' prior to the IUPAC adoption of U instead).\n')
        form.addParam('uniProtSequence', params.StringParam,
                      condition='inputSequence == %d and '
                                'inputProteinSequence == %d' %
                                (Alphabet.AMINOACIDS,
                                 self.IMPORT_FROM_UNIPROT),
                      label="UniProt name/ID ", allowsNull=True,
                      # NOTE(review): the help text ends with an empty
                      # string where a web address seems to be expected.
                      help='Write a UniProt ID (six or ten alphanumeric '
                           'characters; examples: A2BC19, P12345, '
                           'A0A022YWF9, DGAL_ECOLI).\n You can convert other '
                           'database identifiers to UniProt accession codes '
                           'by using the "ID Mapping" tab on '
                           '')

        # ---- nucleotide sequences ------------------------------------
        form.addParam('inputNucleotideSequence', params.EnumParam,
                      choices=['plain text', 'atomic structure', 'file',
                               'NCBI/GenBank ID'],
                      display=params.EnumParam.DISPLAY_HLIST,
                      condition='inputSequence == %d' % Alphabet.NUCLEOTIDES,
                      label="From ",
                      default=self.IMPORT_FROM_NUCLEOTIDE_PLAIN_TEXT,
                      # The enum offers four choices (the text used to
                      # say "five").
                      help='Select one of the four options: write the '
                           'nucleic acid sequence or import it '
                           'from a local file or an online server.')
        form.addParam('nucleotideIUPACalphabet', params.EnumParam,
                      # move to first element in label list that is nucleotide
                      choices=list(Alphabet.alphabetsLabels.values())[
                              Alphabet.AMBIGOUS_DNA_ALPHABET:],
                      display=params.EnumParam.DISPLAY_HLIST,
                      condition='inputSequence == %d and '
                                'inputNucleotideSequence == %d' %
                                (Alphabet.NUCLEOTIDES,
                                 self.IMPORT_FROM_NUCLEOTIDE_PLAIN_TEXT),
                      label="IUPAC Nucleic acid alphabet: ",
                      # subtract first element of type nucleotide
                      # NOTE(review): this expression is always 0, i.e. the
                      # first nucleotide label. The comment above suggests
                      # "- Alphabet.AMBIGOUS_DNA_ALPHABET" was intended;
                      # left untouched because it would change the default.
                      default=Alphabet.EXTENDED_DNA_ALPHABET -
                              Alphabet.EXTENDED_DNA_ALPHABET,
                      help='Your raw sequence will be cleaned according '
                           'a certain alphabet, i.e., only the letters '
                           'contained in the alphabet will be maintained in '
                           'the sequence. Select thus the type of nucleic '
                           'acid alphabet in order to accomplish the '
                           'cleaning:\n\n Ambiguous DNA alphabet: Uppercase '
                           'IUPAC ambiguous DNA: *GATCRYWSMKHBVDN*.\n\n'
                           'Unambiguous DNA alphabet: Uppercase IUPAC unambiguous DNA '
                           '(letters *GATC* only).\n\nExtended DNA: Extended '
                           'IUPAC DNA alphabet.\nIn addition to the standard letter '
                           'codes GATC, this includes:\n*B* = 5-bromouridine\n'
                           '*D* = 5,6-dihydrouridine\n*S* = thiouridine\n*W* '
                           '= wyosine\n\nAmbiguous RNA: Uppercase IUPAC '
                           'ambiguous RNA; *GAUCRYWSMKHBVDN*\n\nUnambigous '
                           'RNA alphabet: Generic single letter RNA '
                           'alphabet.\n\n')

        # ---- import from plain text ----------------------------------
        form.addParam('inputRawSequence', params.StringParam,
                      condition='(inputSequence == %d and '
                                'inputProteinSequence == %d) or '
                                '(inputSequence == %d and '
                                'inputNucleotideSequence == %d) ' %
                                (Alphabet.AMINOACIDS,
                                 self.IMPORT_FROM_PLAIN_TEXT,
                                 Alphabet.NUCLEOTIDES,
                                 self.IMPORT_FROM_NUCLEOTIDE_PLAIN_TEXT),
                      label="Write your sequence here:", important=True,
                      help="Write the aminoacid or nucleotide raw sequence.\n")

        # ---- import from atomic structure ----------------------------
        form.addParam('inputStructureSequence', params.EnumParam,
                      choices=['id', 'file'],
                      condition='inputProteinSequence == %d or '
                                'inputNucleotideSequence == %d' %
                                (self.IMPORT_FROM_STRUCTURE,
                                 self.IMPORT_FROM_NUCLEOTIDE_STRUCTURE),
                      label="Atomic structure from",
                      default=self.IMPORT_STRUCTURE_FROM_ID,
                      display=params.EnumParam.DISPLAY_HLIST,
                      help='Import structure data from online server or local '
                           'file',
                      pointerClass='AtomStruct',
                      allowsNull=True)
        form.addParam('pdbId', params.StringParam,
                      condition='(inputProteinSequence == %d or '
                                'inputNucleotideSequence == %d) and '
                                'inputStructureSequence == %d' %
                                (self.IMPORT_FROM_STRUCTURE,
                                 self.IMPORT_FROM_NUCLEOTIDE_STRUCTURE,
                                 self.IMPORT_STRUCTURE_FROM_ID),
                      label="Atomic structure ID ", allowsNull=True,
                      help='Type a structure ID (four alphanumeric '
                           'characters).')
        form.addParam('pdbFile', params.PathParam, label="File path",
                      condition='(inputProteinSequence == %d or '
                                'inputNucleotideSequence == %d) and '
                                'inputStructureSequence == %d' %
                                (self.IMPORT_FROM_STRUCTURE,
                                 self.IMPORT_FROM_NUCLEOTIDE_STRUCTURE,
                                 self.IMPORT_STRUCTURE_FROM_FILES),
                      allowsNull=True,
                      help='Specify a path to desired atomic structure.')
        form.addParam('inputStructureChain', params.StringParam,
                      condition='inputProteinSequence == %d or '
                                'inputNucleotideSequence == %d' %
                                (self.IMPORT_FROM_STRUCTURE,
                                 self.IMPORT_FROM_NUCLEOTIDE_STRUCTURE),
                      label="Chain ", allowsNull=True,
                      help="Select a particular chain of the atomic "
                           "structure.")

        # ---- import from sequence file -------------------------------
        form.addParam('fileSequence', params.PathParam,
                      label="File path",
                      condition='inputProteinSequence == %d or '
                                'inputNucleotideSequence == %d' %
                                (self.IMPORT_FROM_FILES,
                                 self.IMPORT_FROM_NUCLEOTIDE_FILES),
                      allowsNull=True,
                      help='Specify a path to desired aminoacid or '
                           'nucleic acid sequence '
                           'file.\nIf your file contains more than one '
                           'sequence, only the first one will be considered.')

        # ---- import from GenBank -------------------------------------
        form.addParam('geneBankSequence', params.StringParam,
                      condition='(inputSequence == %d and '
                                'inputNucleotideSequence == %d) or '
                                '(inputSequence == %d and '
                                'inputProteinSequence == %d)' %
                                (Alphabet.NUCLEOTIDES,
                                 self.IMPORT_FROM_NUCLEOTIDE_GENEBANK,
                                 Alphabet.AMINOACIDS,
                                 self.IMPORT_FROM_PROTEIN_GENEBANK),
                      label="GenBank accession ", allowsNull=True,
                      help='Write a GenBank accession.\n')

    def _insertAllSteps(self):
        """ Insert the step matching the selected sequence type and import
        source, followed by the step that registers the output. """
        # The sequence name is the fallback ID used by the import steps.
        self.name = self.inputSequenceName.get()

        if self.inputSequence == Alphabet.AMINOACIDS:
            if self.inputProteinSequence == self.IMPORT_FROM_PLAIN_TEXT:
                rawSequence = self.inputRawSequence.get()
                self._insertFunctionStep('getRawSequenceStep', rawSequence)
            elif self.inputProteinSequence == self.IMPORT_FROM_STRUCTURE:
                chainId = self.inputStructureChain.get()
                self._insertFunctionStep('getSequenceOfChainStep', chainId)
            elif self.inputProteinSequence == self.IMPORT_FROM_UNIPROT:
                sequenceDB = self._getUniProtID()
                self._insertFunctionStep('sequenceDatabaseDownloadStep',
                                         sequenceDB)
            elif self.inputProteinSequence == \
                    self.IMPORT_FROM_PROTEIN_GENEBANK:
                sequenceDB = self._getGeneBankID()
                self._insertFunctionStep('sequenceDatabaseDownloadStep',
                                         sequenceDB)
            elif self.inputProteinSequence == self.IMPORT_FROM_FILES:
                self.sequenceFile = self.fileSequence.get()
                sequenceFile = self.sequenceFile
                self._insertFunctionStep('fileDownloadStep', sequenceFile)
        else:
            if self.inputNucleotideSequence == \
                    self.IMPORT_FROM_NUCLEOTIDE_PLAIN_TEXT:
                rawSequence = self.inputRawSequence.get()
                self._insertFunctionStep('getRawSequenceStep', rawSequence)
            elif self.inputNucleotideSequence == \
                    self.IMPORT_FROM_NUCLEOTIDE_STRUCTURE:
                chainId = self.inputStructureChain.get()
                self._insertFunctionStep('getSequenceOfChainStep', chainId)
            elif self.inputNucleotideSequence == \
                    self.IMPORT_FROM_NUCLEOTIDE_GENEBANK:
                sequenceDB = self._getGeneBankID()
                self._insertFunctionStep('sequenceDatabaseDownloadStep',
                                         sequenceDB)
            elif self.inputNucleotideSequence == \
                    self.IMPORT_FROM_NUCLEOTIDE_FILES:
                self.sequenceFile = self.fileSequence.get()
                sequenceFile = self.sequenceFile
                self._insertFunctionStep('fileDownloadStep', sequenceFile)

        self._insertFunctionStep('createOutputStep')

[docs] def getRawSequenceStep(self, rawSequence): # user types sequence if self.inputSequenceID.get() is not None: = self.inputSequenceID.get() else: = isAmino = self.inputSequence == Alphabet.AMINOACIDS if isAmino: self.alphabet = self.proteinIUPACalphabet.get() else: self.alphabet = self.nucleotideIUPACalphabet.get() self.sequence = emconv.cleanSequenceScipion(isAmino, self.alphabet, rawSequence)
[docs] def getSequenceOfChainStep(self, chainId): # sequece is obtained from PDB file # form has a wizard that creates label with the format # [model: x, chain: x, xxx residues] import json chainIdDict = json.loads(self.inputStructureChain.get()) selectedModel = chainIdDict['model'] selectedChain = chainIdDict['chain'] self.structureHandler = AtomicStructHandler() if self.pdbId.get() is not None: # PDB from remote database pdbID = self.pdbId.get() tmpFilePath = os.path.join("/tmp", pdbID + ".cif").lower() if not exists(tmpFilePath): # wizard has not used and the file has not been downloaded yet self.structureHandler.readFromPDBDatabase(pdbID, dir="/tmp") else: # PDB from file _sequence, alphabet = self.structureHandler.getSequenceFromChain( selectedModel, selectedChain, returnAlphabet =True) self.sequence = str(_sequence) self.alphabet = alphabet #emconv.alphabetToIndex(self.inputSequence == # Alphabet.AMINOACIDS, # _sequence.alphabet) # Assignation of sequence ID: if the user has provided a specific # ID, this will be adopted by default; otherwise, a sequence ID # related with the starting structure will be selected. if self.inputSequenceID.get() is not None: = self.inputSequenceID.get() else: = self.structureHandler.getFullID( selectedModel, selectedChain) print("Selected chain: %s from model: %s from structure: %s" % (selectedChain, selectedModel, self.structureHandler.structure.get_id()))
[docs] def sequenceDatabaseDownloadStep(self, sequenceDB): """Download UniProt/GenBank sequence from its respective database """ # sequenceDB = str(sequenceDB) isAminoacid=(self.inputSequence == Alphabet.AMINOACIDS) if self.uniProtSequence.get() is not None: seqHandler = emconv.SequenceHandler(iUPACAlphabet=Alphabet.EXTENDED_PROTEIN_ALPHABET) dataBase = 'UniProt' elif self._getGeneBankID() is not None: if isAminoacid: seqHandler = emconv.SequenceHandler(iUPACAlphabet=Alphabet.EXTENDED_PROTEIN_ALPHABET) else: seqHandler = emconv.SequenceHandler(iUPACAlphabet=Alphabet.NUCLEOTIDES_ALPHABET) dataBase = 'GeneBank' seqDic, error = seqHandler.downloadSeqFromDatabase(seqID = sequenceDB, dataBase=dataBase) if seqDic is None: print("Error: ", error) self.setAborted() exit(0) if self.inputSequenceID.get() is not None: = self.inputSequenceID.get() elif sequenceDB != '': = sequenceDB else: = if seqDic['description'] != '': self.description = seqDic['description'] self.sequence = seqDic['sequence'] self.alphabet = seqDic['alphabet']
[docs] def fileDownloadStep(self, sequenceFile): # If sequencePath contains more than one sequence, only # the first one will be considered seqHandler = emconv.SequenceHandler() seqDic = seqHandler.readSequenceFromFile(sequenceFile, type="fasta", isAmino= self.inputSequence.get()==Alphabet.AMINOACIDS) if self.inputSequenceID.get() is not None: = self.inputSequenceID.get() elif seqDic['seqID'] != '': = seqDic['seqID'] else: = if seqDic['description'] != '': self.description = seqDic['description'] self.sequence = seqDic['sequence'] self.alphabet = seqDic['alphabet']
[docs] def createOutputStep(self): """ Register the output object. """ if self.inputSequenceDescription.get() is not None: self.description = self.inputSequenceDescription.get() elif hasattr(self, 'description'): pass else: self.description = '' seq = Sequence(, sequence=self.sequence, alphabet=self.alphabet, isAminoacids=(self.inputSequence == Alphabet.AMINOACIDS),, description=self.description) outputs = {'outputSequence': seq} self._defineOutputs(**outputs)
def _summary(self): summary = [] = self.inputSequenceName.get() uniProtId = self._getUniProtID() geneBankID = self._getGeneBankID() if self.inputSequence == Alphabet.AMINOACIDS: summary.append('Sequence of aminoacids:\n') if self.inputProteinSequence == self.IMPORT_FROM_PLAIN_TEXT: summary.append("Sequence *%s* imported from plain text\n" % elif self.inputProteinSequence == self.IMPORT_FROM_STRUCTURE: if self.inputStructureSequence == \ self.IMPORT_STRUCTURE_FROM_ID: summary.append("Sequence *%s* imported from atomic " "structure *%s.cif*\n" % (, self.pdbId.get())) elif self.inputStructureSequence == \ self.IMPORT_STRUCTURE_FROM_FILES: summary.append("Sequence *%s* imported from file *%s*\n" % (, self.pdbFile.get())) elif self.inputProteinSequence == self.IMPORT_FROM_UNIPROT: summary.append("Sequence *%s* imported from UniProt ID " "*%s*\n" % (, uniProtId)) elif self.inputProteinSequence == self.IMPORT_FROM_FILES: summary.append("Sequence *%s* imported from file name: " "*%s*\n" % (, self.fileSequence.get())) else: summary.append('Sequence of nucleotides:\n') if self.inputNucleotideSequence == \ self.IMPORT_FROM_NUCLEOTIDE_PLAIN_TEXT: summary.append("Sequence *%s* imported from plain text\n" % elif self.inputNucleotideSequence == \ self.IMPORT_FROM_NUCLEOTIDE_STRUCTURE: if self.inputStructureSequence == \ self.IMPORT_STRUCTURE_FROM_ID: summary.append("Sequence *%s* imported from atomic " "structure *%s.cif*\n" % (, self.pdbId.get())) elif self.inputStructureSequence == \ self.IMPORT_STRUCTURE_FROM_FILES: summary.append("Sequence *%s* imported from file *%s*\n" % (, self.pdbFile.get())) elif self.inputNucleotideSequence == self.IMPORT_FROM_NUCLEOTIDE_GENEBANK: summary.append("Sequence *%s* imported from GenBank ID " "*%s*\n" % (, geneBankID)) elif self.inputNucleotideSequence == \ self.IMPORT_FROM_NUCLEOTIDE_FILES: summary.append("Sequence *%s* imported from file name: " "*%s*\n" % (, self.fileSequence.get())) return summary def _validate(self): errors = [] return 
errors def _getSequenceName(self): pass def _getUniProtID(self): return self.uniProtSequence.get() def _getGeneBankID(self): return self.geneBankSequence def _getAlphabet(self): if self.inputSequence == Alphabet.AMINOACIDS: return self.proteinIUPACalphabet.get() else: return self.nucleotideIUPACalphabet.get() + Alphabet.AMBIGOUS_DNA_ALPHABET