Source code for pwem.convert.sequence

# -*- coding: utf-8 -*-
# **************************************************************************
# *
# * Authors:     Marta Martinez (
# *              Roberto Marabini (
# *
# * Unidad de  Bioinformatica of Centro Nacional de Biotecnologia , CSIC
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 3 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# * 02111-1307  USA
# *
# *  All comments concerning this program package may be sent to the
# *  e-mail address ''
# *
# **************************************************************************
from urllib.error import URLError, HTTPError
from urllib.request import urlopen
from os.path import exists

# sequence related stuff
SEQ_TYPE = ['aminoacids', 'nucleotides']
IUPAC_PROTEIN_ALPHABET = ['Extended Protein', 'Protein']
IUPAC_NUCLEOTIDE_ALPHABET = ['Ambiguous DNA', 'Unambiguous DNA',
                             'Extended DNA', 'Ambiguous RNA',
                             'Unambiguous RNA']

from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio import Entrez, SeqIO
import sys
from Bio.SeqRecord import SeqRecord
from Bio.Align.Applications import ClustalOmegaCommandline, MuscleCommandline
from Bio import pairwise2
from pyworkflow.utils import getExt

[docs]class SequenceHandler: def __init__(self, sequence=None, iUPACAlphabet=0, isAminoacid=True): self.isAminoacid = isAminoacid self.alphabet = indexToAlphabet(isAminoacid, iUPACAlphabet) if sequence is not None: # sequence = cleanSequence(self.alphabet, sequence) self._sequence = Seq(sequence, alphabet=self.alphabet) else: self._sequence = None # type(self._sequence): <class 'Bio.Seq.Seq'>
[docs] def getTypeFromFile(self, fileName): '''Returns the expected BioPython file type according to the filename''' ext = getExt(fileName) if ext == '.fasta': type = 'fasta' elif ext == '.genbank' or ext == '.gb': type = 'genbank' else: type = 'fasta' return type
[docs] def appendFile(self, fileName, seqID, sequence=None, name=None, seqDescription=None, type="fasta"): '''Appends a sequence to the sequences in a file''' if sequence is not None: self._sequence = sequence if type is None: type = self.getTypeFromFile(fileName) if exists(fileName): records = list(SeqIO.parse(fileName, type)) else: records = [] records.append(SeqRecord(self._sequence, id=seqID, name=name, description=seqDescription)) with open(fileName, "w") as output_handle: SeqIO.write(records, output_handle, type)
[docs] def saveFile(self, fileName, seqID, sequence=None, name=None, seqDescription=None, type="fasta"): '''Saves a single sequence into a specified file''' if sequence is not None: self._sequence = sequence self.saveFiles(fileName, [seqID], [self._sequence], [name], [seqDescription], type)
[docs] def saveFiles(self, fileName, seqIDs, sequences, names, seqDescriptions, type="fasta"): '''Saves multiple sequences into a specified file''' if type is None: type = self.getTypeFromFile(fileName) records = [] for seq, seqID, name, seqDescription in zip(sequences, seqIDs, names, seqDescriptions): records.append(SeqRecord(seq, id=seqID, name=name, description=seqDescription)) # type(record): < class 'Bio.SeqRecord.SeqRecord'> with open(fileName, "w") as output_handle: SeqIO.write(records, output_handle, type)
[docs] def readSequenceFromFile(self, fileName, type="fasta", isAmino=True): '''From a sequences file, returns a dictionary with ther FIRST sequence info. Dictionary: {'seqID': seqID1, 'sequence': sequence1, 'description': description1, 'alphabet': alphabet1}''' if type is None: type = self.getTypeFromFile(fileName) return self.readSequencesFromFile(fileName, isAmino=isAmino)[0]
[docs] def readSequencesFromFile(self, fileName, type='fasta', isAmino=True): '''From a sequences file, returns a list of dictionaries with each sequence info. Dictionary: [{'seqID': seqID1, 'sequence': sequence1, 'description': description1, 'alphabet': alphabet1}, ...]''' if type is None: type = self.getTypeFromFile(fileName) sequences = [] records = SeqIO.parse(fileName, type) for rec in records: alphabet = alphabetToIndex(isAmino, rec.seq.alphabet) sequences.append({'seqID':, 'sequence': str(rec.seq), 'name':, 'description': rec.description, 'alphabet': alphabet, 'isAminoacids': isAmino}) return sequences
[docs] def downloadSeqFromDatabase(self, seqID): # see # for format/databases print("Connecting to database...") seqID = str(seqID) sys.stdout.flush() counter = 1 retries = 5 record = None error = "" while counter <= retries: # retry up to 5 times if server busy try: if self.isAminoacid: dataBase = 'UnitProt' url = "" format = "uniprot-xml" handle = urlopen(url % seqID) print("URL", url % seqID) else: dataBase = 'GeneBank' = "" format = "fasta" handle = Entrez.efetch(db="nucleotide", id=seqID, rettype=format, retmode="text") record =, format) break except HTTPError as e: error = "%s is a wrong sequence ID" % seqID print(e.code) except URLError as e: error = "Cannot connect to %s" % dataBase print(e.args) except Exception as ex: template = "An exception of type {0} occurred. Arguments:\n{1!r}" message = template.format(type(ex).__name__, ex.args) error = message if counter == retries: break counter += 1 seqDic = None if record is not None: seqDic = {'seqID':, 'sequence': str(record.seq), 'description': record.description, 'alphabet': record.seq.alphabet} return seqDic, error
[docs] def alignSeq(self, referenceSeq): if self._sequence is not None: alignments = pairwise2.align.globalds(self._sequence.seq, referenceSeq.seq) return alignments else: print("read the sequence first") exit(0)
[docs]def sequenceLength(filename, format='fasta'): handler = SequenceHandler() return len(handler.readSequenceFromFile(filename, format)['sequence'])
[docs]def cleanSequenceScipion(isAminoacid, iUPACAlphabet, sequence): return cleanSequence(indexToAlphabet(isAminoacid, iUPACAlphabet), sequence)
[docs]def cleanSequence(alphabet, sequence): str_list = [] for item in sequence.upper(): if item in alphabet.letters: str_list.append(item) value = ''.join(str_list) return ''.join(str_list)
[docs]def indexToAlphabet(isAminoacid, iUPACAlphabet): if isAminoacid: if iUPACAlphabet == EXTENDED_PROTEIN_ALPHABET: alphabet = IUPAC.ExtendedIUPACProtein else: alphabet = IUPAC.IUPACProtein else: if iUPACAlphabet == EXTENDED_DNA_ALPHABET: alphabet = IUPAC.ExtendedIUPACDNA elif iUPACAlphabet == AMBIGOUS_DNA_ALPHABET: alphabet = IUPAC.IUPACAmbiguousDNA elif iUPACAlphabet == UNAMBIGOUS_DNA_ALPHABET: alphabet = IUPAC.IUPACUnambiguousDNA elif iUPACAlphabet == AMBIGOUS_RNA_ALPHABET: alphabet = IUPAC.IUPACAmbiguousRNA else: alphabet = IUPAC.IUPACUnambiguousRNA return alphabet
[docs]def alphabetToIndex(isAminoacid, alphabet): if isAminoacid: if isinstance(alphabet, IUPAC.ExtendedIUPACProtein): return EXTENDED_PROTEIN_ALPHABET else: return PROTEIN_ALPHABET else: if isinstance(alphabet, IUPAC.ExtendedIUPACDNA): return EXTENDED_DNA_ALPHABET elif isinstance(alphabet, IUPAC.IUPACAmbiguousDNA): return AMBIGOUS_DNA_ALPHABET elif isinstance(alphabet, IUPAC.IUPACUnambiguousDNA): return UNAMBIGOUS_DNA_ALPHABET elif isinstance(alphabet, IUPAC.IUPACAmbiguousRNA): return AMBIGOUS_RNA_ALPHABET else: return UNAMBIGOUS_RNA_ALPHABET
[docs]def saveFileSequencesToAlign(SeqDic, inFile, type="fasta"): # Write my sequences to a fasta file with open(inFile, "w") as output_handle: for index, seq in SeqDic.items(): record = SeqRecord(seq, id=str(index), name="", description="") SeqIO.write(record, output_handle, type)
[docs]def alignClustalSequences(inFile, outFile): # Alignment of sequences with Clustal Omega program clustalomega_cline = ClustalOmegaCommandline( infile=inFile, outfile=outFile, verbose=True, auto=True) return clustalomega_cline
[docs]def alignMuscleSequences(inFile, outFile): # Alignment of sequences with Muscle program muscle_cline = MuscleCommandline(input=inFile, out=outFile) return muscle_cline
[docs]def alignBioPairwise2Sequences(structureSequenceId, structureSequence, userSequenceId, userSequence, outFileName): """aligns two sequences and saves them to disk using fasta format""" # see alignment_function for globalms parameters alignments = pairwise2.align.globalms(structureSequence, userSequence, 3, -1, -3, -2) align1, align2, score, begin, end = alignments[0] with open(outFileName, "w") as handle: handle.write(">%s\n%s\n>%s\n%s\n" % (structureSequenceId, align1, userSequenceId, align2))