# -*- coding: utf-8 -*-
# **************************************************************************
# *
# * Authors: Marta Martinez (mmmtnez@cnb.csic.es)
# * Roberto Marabini (roberto@cnb.csic.es)
# *
# * Unidad de Bioinformatica of Centro Nacional de Biotecnologia , CSIC
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 3 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# * 02111-1307 USA
# *
# * All comments concerning this program package may be sent to the
# * e-mail address 'scipion@cnb.csic.es'
# *
# **************************************************************************
from urllib.error import URLError, HTTPError
from urllib.request import urlopen
# sequence related stuff
SEQ_TYPE = ['aminoacids', 'nucleotides']
SEQ_TYPE_AMINOACIDS = 0
SEQ_TYPE_NUCLEOTIDES = 1
IUPAC_PROTEIN_ALPHABET = ['Extended Protein', 'Protein']
EXTENDED_PROTEIN_ALPHABET = 0
PROTEIN_ALPHABET = 1
IUPAC_NUCLEOTIDE_ALPHABET = ['Ambiguous DNA', 'Unambiguous DNA',
'Extended DNA', 'Ambiguous RNA',
'Unambiguous RNA']
EXTENDED_DNA_ALPHABET = 0
AMBIGOUS_DNA_ALPHABET = 1
UNAMBIGOUS_DNA_ALPHABET = 2
AMBIGOUS_RNA_ALPHABET = 3
UNAMBIGOUS_RNA_ALPHABET = 4
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio import Entrez, SeqIO
import sys
from Bio.SeqRecord import SeqRecord
from Bio.Align.Applications import ClustalOmegaCommandline, MuscleCommandline
from Bio import pairwise2
[docs]class SequenceHandler:
def __init__(self, sequence=None,
iUPACAlphabet=0,
isAminoacid=True):
self.isAminoacid = isAminoacid
self.alphabet = indexToAlphabet(isAminoacid, iUPACAlphabet)
if sequence is not None:
# sequence = cleanSequence(self.alphabet, sequence)
self._sequence = Seq(sequence, alphabet=self.alphabet)
else:
self._sequence = None
# type(self._sequence): <class 'Bio.Seq.Seq'>
[docs] def saveFile(self, fileName, seqID, sequence=None, name=None,
seqDescription=None, type="fasta"):
if sequence is not None:
self._sequence = sequence
record = SeqRecord(self._sequence, id=seqID, name=name,
description=seqDescription)
# type(record): < class 'Bio.SeqRecord.SeqRecord'>
with open(fileName, "w") as output_handle:
SeqIO.write(record, output_handle, type)
[docs] def downloadSeqFromFile(self, fileName, type="fasta"):
record = next(SeqIO.parse(fileName, type))
return record
[docs] def downloadSeqFromDatabase(self, seqID):
# see http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html
# for format/databases
print("Connecting to database...")
seqID = str(seqID)
sys.stdout.flush()
counter = 1
retries = 5
record = None
error = ""
while counter <= retries: # retry up to 5 times if server busy
try:
if self.isAminoacid:
dataBase = 'UnitProt'
url = "http://www.uniprot.org/uniprot/%s.xml"
format = "uniprot-xml"
handle = urlopen(url % seqID)
print("URL", url % seqID)
else:
dataBase = 'GeneBank'
Entrez.email = "adam.richards@stat.duke.edu"
format = "fasta"
handle = Entrez.efetch(db="nucleotide", id=seqID,
rettype=format, retmode="text")
record = SeqIO.read(handle, format)
break
except HTTPError as e:
error = "%s is a wrong sequence ID" % seqID
print(e.code)
except URLError as e:
error = "Cannot connect to %s" % dataBase
print(e.args)
except Exception as ex:
template = "An exception of type {0} occurred. Arguments:\n{1!r}"
message = template.format(type(ex).__name__, ex.args)
error = message
if counter == retries:
break
counter += 1
return record, error
[docs] def alignSeq(self, referenceSeq):
if self._sequence is not None:
alignments = pairwise2.align.globalds(self._sequence.seq,
referenceSeq.seq)
return alignments
else:
print("read the sequence first")
exit(0)
[docs]def sequenceLength(filename, format='fasta'):
handler = SequenceHandler()
return len(handler.downloadSeqFromFile(filename, format))
[docs]def cleanSequenceScipion(isAminoacid, iUPACAlphabet, sequence):
return cleanSequence(indexToAlphabet(isAminoacid, iUPACAlphabet), sequence)
[docs]def cleanSequence(alphabet, sequence):
str_list = []
for item in sequence.upper():
if item in alphabet.letters:
str_list.append(item)
value = ''.join(str_list)
return ''.join(str_list)
[docs]def indexToAlphabet(isAminoacid, iUPACAlphabet):
if isAminoacid:
if iUPACAlphabet == EXTENDED_PROTEIN_ALPHABET:
alphabet = IUPAC.ExtendedIUPACProtein
else:
alphabet = IUPAC.IUPACProtein
else:
if iUPACAlphabet == EXTENDED_DNA_ALPHABET:
alphabet = IUPAC.ExtendedIUPACDNA
elif iUPACAlphabet == AMBIGOUS_DNA_ALPHABET:
alphabet = IUPAC.IUPACAmbiguousDNA
elif iUPACAlphabet == UNAMBIGOUS_DNA_ALPHABET:
alphabet = IUPAC.IUPACUnambiguousDNA
elif iUPACAlphabet == AMBIGOUS_RNA_ALPHABET:
alphabet = IUPAC.IUPACAmbiguousRNA
else:
alphabet = IUPAC.IUPACUnambiguousRNA
return alphabet
[docs]def alphabetToIndex(isAminoacid, alphabet):
if isAminoacid:
if isinstance(alphabet, IUPAC.ExtendedIUPACProtein):
return EXTENDED_PROTEIN_ALPHABET
else:
return PROTEIN_ALPHABET
else:
if isinstance(alphabet, IUPAC.ExtendedIUPACDNA):
return EXTENDED_DNA_ALPHABET
elif isinstance(alphabet, IUPAC.IUPACAmbiguousDNA):
return AMBIGOUS_DNA_ALPHABET
elif isinstance(alphabet, IUPAC.IUPACUnambiguousDNA):
return UNAMBIGOUS_DNA_ALPHABET
elif isinstance(alphabet, IUPAC.IUPACAmbiguousRNA):
return AMBIGOUS_RNA_ALPHABET
else:
return UNAMBIGOUS_RNA_ALPHABET
[docs]def saveFileSequencesToAlign(SeqDic, inFile, type="fasta"):
# Write my sequences to a fasta file
with open(inFile, "w") as output_handle:
for index, seq in SeqDic.items():
record = SeqRecord(seq, id=str(index),
name="", description="")
SeqIO.write(record, output_handle, type)
[docs]def alignClustalSequences(inFile, outFile):
# Alignment of sequences with Clustal Omega program
clustalomega_cline = ClustalOmegaCommandline(
infile=inFile,
outfile=outFile,
verbose=True, auto=True)
return clustalomega_cline
[docs]def alignMuscleSequences(inFile, outFile):
# Alignment of sequences with Muscle program
muscle_cline = MuscleCommandline(input=inFile, out=outFile)
return muscle_cline
[docs]def alignBioPairwise2Sequences(structureSequenceId, structureSequence,
userSequenceId, userSequence,
outFileName):
"""aligns two sequences and saves them to disk using fasta format"""
# see alignment_function for globalms parameters
alignments = pairwise2.align.globalms(structureSequence,
userSequence, 3, -1, -3, -2)
align1, align2, score, begin, end = alignments[0]
with open(outFileName, "w") as handle:
handle.write(">%s\n%s\n>%s\n%s\n" % (structureSequenceId,
align1,
userSequenceId,
align2))