Source code for continuousflex.protocols.protocol_subtomograms_classify

# **************************************************************************
# * Author:  Mohamad Harastani          (mohamad.harastani@upmc.fr)
# * IMPMC, UPMC Sorbonne University
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 2 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# * 02111-1307  USA
# *
# *  All comments concerning this program package may be sent to the
# *  e-mail address 'scipion@cnb.csic.es'
# **************************************************************************
from pyworkflow.object import String
from pyworkflow.protocol.params import (PointerParam, EnumParam, IntParam)
from pwem.protocols import ProtAnalysis3D
from pwem.convert import cifToPdb
from pyworkflow.utils.path import makePath, copyFile, removeBaseExt
from pyworkflow.protocol import params

from .protocol_subtomogram_averaging import FlexProtSubtomogramAveraging
from sklearn.cluster import AgglomerativeClustering, KMeans
import time
import os
from sh_alignment.tompy.transform import fft, ifft, fftshift, ifftshift
import pwem.emlib.metadata as md
from continuousflex.protocols.utilities.spider_files3 import save_volume, open_volume
import xmipp3

from pwem.objects import Volume
import numpy as np
import glob
from sklearn import decomposition
from joblib import dump, load
from pwem.utils import runProgram



class FlexProtSubtomoClassify(ProtAnalysis3D):
    """ Protocol applying post-alignment classification on subtomograms. """
    _label = 'classify subtomograms'

    # --------------------------- DEFINE param functions --------------------------------------------
    def _defineParams(self, form):
        form.addSection(label='Input')
        form.addParam('SubtomoSource', EnumParam, default=0,
                      label='Source of subtomograms',
                      choices=['After subtomogram synthesis', 'After subtomogram averaging'],
                      help='Choose the source of the subtomograms that you want to classify')
        form.addParam('ProtSynthesize', params.PointerParam,
                      pointerClass='FlexProtSynthesizeSubtomo',
                      condition='SubtomoSource == 0',
                      label="Subtomogram synthesis",
                      help='All PDBs should have the same size')
        form.addParam('StA', params.PointerParam,
                      condition='SubtomoSource == 1',
                      pointerClass='FlexProtSubtomogramAveraging',
                      label="StA protocol",
                      help='Choose a previous subtomogram averaging run')
        form.addParam('classifyTechnique', EnumParam, default=0,
                      label='Classification technique',
                      choices=['Hierarchical clustering', 'Dimensionality reduction then clustering'],
                      help='Choose a classification technique')
        form.addParam('ClusteringLinkage', EnumParam, default=0,
                      label='Linkage',
                      condition='classifyTechnique == 0',
                      choices=['ward'])
        form.addParam('dimredMethod', EnumParam, default=0,
                      condition='classifyTechnique == 1',
                      choices=['Scikit-Learn PCA'],
                      label='Dimensionality reduction method',
                      help='This method will be used to reduce the dimensions of the covariance matrix')
        form.addParam('applyMask', params.BooleanParam, default=False,
                      label='Use a mask?',
                      help='This mask will be applied on the aligned particles before computing the '
                           'cross correlation. A proper mask is a tight mask derived from the '
                           'subtomogram average that can be applied on it without cropping any part.')
        form.addParam('Mask', params.PointerParam, condition='applyMask',
                      pointerClass='Volume', allowsNull=True,
                      label="Select mask")
        form.addParam('reducedDim', IntParam, default=2,
                      condition='classifyTechnique == 1',
                      label='Reduced dimension')
        form.addParam('numOfClasses', IntParam, default=2,
                      label='Number of classes')
        # form.addParallelSection(threads=0, mpi=8)

    # --------------------------- INSERT steps functions --------------------------------------------
    def _insertAllSteps(self):
        mdSubtomo = self.getSubtomoMetaData()
        self._insertFunctionStep('subtomo_wedge_align', mdSubtomo)
        self._insertFunctionStep('find_covariance_matrix')
        if self.classifyTechnique.get() == 0:
            self._insertFunctionStep('performHierarchicalClustering')
        else:
            self._insertFunctionStep('performKmeansClustering')
        self._insertFunctionStep('findTotalAverage')
        self._insertFunctionStep('createOutputStep')

    # --------------------------- STEPS functions ----------------------------------------------------
    def subtomo_wedge_align(self, mdSubtomo):
        # Align each subtomogram together with a missing wedge mask for it
        tilt = self.getTiltRange()
        # Create a missing wedge mask:
        start_ang = tilt[0]
        end_ang = tilt[1]
        leg = self.getVolumeSize()
        size = (leg, leg, leg)
        MW_mask = np.ones(size)
        x, z = np.mgrid[0.:size[0], 0.:size[2]]
        x -= size[0] / 2
        ind = np.where(x)
        z -= size[2] / 2
        angles = np.zeros(z.shape)
        angles[ind] = np.arctan(z[ind] / x[ind]) * 180 / np.pi
        angles = np.reshape(angles, (size[0], 1, size[2]))
        angles = np.repeat(angles, size[1], axis=1)
        MW_mask[angles > -start_ang] = 0
        MW_mask[angles < -end_ang] = 0
        MW_mask[size[0] // 2, :, :] = 0
        MW_mask[size[0] // 2, :, size[2] // 2] = 1
        fnmask = self._getExtraPath('missing_wedge.spi')
        save_volume(np.float32(MW_mask), fnmask)
        args = ' -i ' + fnmask + ' --rotate_volume euler 0 90 0'
        runProgram('xmipp_transform_geometry', args)
        # Now align each subtomogram and its missing wedge mask
        mw_path = self._getExtraPath('mw_masks/')
        subtom_aligned_path = self._getExtraPath('aligned_subtomograms/')
        makePath(mw_path)
        makePath(subtom_aligned_path)
        subtomogramMD = md.MetaData(mdSubtomo)
        subtomogaligneMD = md.MetaData()
        mwalignedMD = md.MetaData()
        for i in subtomogramMD:
            fnsubtomo = subtomogramMD.getValue(md.MDL_IMAGE, i)
            bnsubtomo = os.path.basename(fnsubtomo)
            bnwedge = removeBaseExt(bnsubtomo) + '_wedge.spi'
            fnalignedsubtomo = self._getExtraPath('aligned_subtomograms/' + bnsubtomo)
            fnalignedmask = self._getExtraPath('mw_masks/' + bnwedge)
            rot = str(subtomogramMD.getValue(md.MDL_ANGLE_ROT, i))
            tilt = str(subtomogramMD.getValue(md.MDL_ANGLE_TILT, i))
            psi = str(subtomogramMD.getValue(md.MDL_ANGLE_PSI, i))
            shiftx = str(subtomogramMD.getValue(md.MDL_SHIFT_X, i))
            shifty = str(subtomogramMD.getValue(md.MDL_SHIFT_Y, i))
            shiftz = str(subtomogramMD.getValue(md.MDL_SHIFT_Z, i))
            # Align the subtomogram
            if self.getAngleY() == 90:
                args = '-i ' + fnsubtomo + ' -o ' + fnalignedsubtomo + ' --rotate_volume euler 0 90 0'
                runProgram('xmipp_transform_geometry', args)
                params = '-i ' + fnalignedsubtomo + ' -o ' + fnalignedsubtomo + ' '
            else:
                params = '-i ' + fnsubtomo + ' -o ' + fnalignedsubtomo + ' '
            params += ' --inverse '
            params += '--rotate_volume euler ' + rot + ' ' + tilt + ' ' + psi + ' '
            params += '--shift ' + shiftx + ' ' + shifty + ' ' + shiftz + ' '
            runProgram('xmipp_transform_geometry', params)
            # Align the mask (only angles, no shift should be applied)
            if self.getAngleY() == 90:
                args = '-i ' + fnmask + ' -o ' + fnalignedmask + ' --rotate_volume euler 0 90 0'
                runProgram('xmipp_transform_geometry', args)
                params = '-i ' + fnalignedmask + ' -o ' + fnalignedmask + ' '
            else:
                params = '-i ' + fnmask + ' -o ' + fnalignedmask + ' '
            params += ' --inverse '
            params += '--rotate_volume euler ' + rot + ' ' + tilt + ' ' + psi + ' '
            runProgram('xmipp_transform_geometry', params)
            if self.applyMask.get():
                maskfn = self.Mask.get().getFileName()
                params = '-i ' + fnalignedsubtomo + ' -o ' + fnalignedsubtomo + ' --mult ' + maskfn
                runProgram('xmipp_image_operate', params)
            subtomogaligneMD.setValue(md.MDL_IMAGE, fnalignedsubtomo, subtomogaligneMD.addObject())
            mwalignedMD.setValue(md.MDL_IMAGE, fnalignedmask, mwalignedMD.addObject())
        subtomogaligneMD.write(self._getExtraPath('aligned_subtomograms.xmd'))
        mwalignedMD.write(self._getExtraPath('aligned_masks.xmd'))

    def find_covariance_matrix(self):
        # If the covariance matrix already exists, do not re-evaluate it (it takes a long time)
        fn_covarmat = self._getExtraPath('covar_mat.pkl')
        if os.path.exists(fn_covarmat):
            return
        subtomogaligneMD = md.MetaData(self._getExtraPath('aligned_subtomograms.xmd'))
        mwalignedMD = md.MetaData(self._getExtraPath('aligned_masks.xmd'))
        N = subtomogaligneMD.size()
        X = np.zeros([N, N])
        print(N)
        A = time.time()
        for i in subtomogaligneMD:
            if i == 2:
                B = time.time() - A
                print('estimated time to finish is ', B * N / 2)
            name_i = subtomogaligneMD.getValue(md.MDL_IMAGE, i)
            wedge_i = mwalignedMD.getValue(md.MDL_IMAGE, i)
            Vi = open_volume(name_i)
            Wi = open_volume(wedge_i)
            print('line is', str(i), ' out of ', str(N))
            for j in range(i, N + 1):
                name_j = subtomogaligneMD.getValue(md.MDL_IMAGE, j)
                wedge_j = mwalignedMD.getValue(md.MDL_IMAGE, j)
                Vj = open_volume(name_j)
                Wj = open_volume(wedge_j)
                # Constrain both volumes to the common missing wedge region in Fourier space
                Omega = Wi * Wj
                FVi = fft(Vi)
                FVi = fftshift(FVi) * Omega
                FVi = ifftshift(FVi)
                Vi_p = ifft(FVi)
                FVj = fft(Vj)
                FVj = fftshift(FVj) * Omega
                FVj = ifftshift(FVj)
                Vj_p = ifft(FVj)
                Vi_p = np.array(Vi_p, dtype=np.float32)
                Vj_p = np.array(Vj_p, dtype=np.float32)
                # Metadata indices start at 1, the matrix is zero-indexed
                X[i - 1, j - 1] = self.cc(Vi_p, Vj_p)
                X[j - 1, i - 1] = self.cc(Vj_p, Vi_p)
        # Save the covariance matrix:
        dump(X, fn_covarmat)

    def performHierarchicalClustering(self):
        fn_covarmat = self._getExtraPath('covar_mat.pkl')
        data = load(fn_covarmat)
        # Use 1 - CCCij as the distance (to keep with the literature)
        data = np.ones_like(data) - data
        clustering = AgglomerativeClustering(n_clusters=self.numOfClasses.get(), linkage='ward')
        clustering_class = clustering.fit_predict(data)
        labels = clustering.labels_
        dump(labels, filename=self._getExtraPath('hierarchical_clustering_labels.pkl'))
        print(clustering_class)
        # Create a metadata of subtomograms for each class and an average per class
        N = self.getVolumeSize()
        Averages = np.zeros([N, N, N, self.numOfClasses.get()])
        subtomogaligneMD = md.MetaData(self._getExtraPath('aligned_subtomograms.xmd'))
        # Create a metadata for each class
        classesMD = [md.MetaData() for i in range(self.numOfClasses.get())]
        for i in subtomogaligneMD:
            name = subtomogaligneMD.getValue(md.MDL_IMAGE, i)
            # Metadata indices start at 1; subtract one to match the labels array
            label = labels[i - 1]
            vol = open_volume(name)
            Averages[:, :, :, label] += vol
            classesMD[label].setValue(md.MDL_IMAGE, name, classesMD[label].addObject())
        for i in range(self.numOfClasses.get()):
            name = self._getExtraPath('class_' + str(i).zfill(2) + '.xmd')
            classesMD[i].write(name)
        # Create a metadata for all class averages:
        md_averages = md.MetaData()
        makePath(self._getExtraPath('class_averages/'))
        for j in range(self.numOfClasses.get()):
            num = len(labels[labels == j])
            print('Number of subtomograms in class #', j, 'is ', num)
            name = self._getExtraPath('class_averages/') + 'cluster' + str(j).zfill(2) + '.spi'
            ave = np.array(Averages[:, :, :, j] / num, dtype=np.float32)
            save_volume(ave, name)
            md_averages.setValue(md.MDL_IMAGE, name, md_averages.addObject())
        md_averages.write(self._getExtraPath('averages.xmd'))

    def performKmeansClustering(self):
        X = load(self._getExtraPath('covar_mat.pkl'))
        pca = decomposition.PCA(n_components=self.reducedDim.get())
        pca.fit(X)
        data = pca.transform(X)
        pca_pickled = self._getExtraPath('pca_pickled.pkl')
        np.savetxt(self._getExtraPath('dimred_mat.txt'), data)
        dump(pca, pca_pickled)
        # Cluster the reduced data
        clustering = KMeans(n_clusters=self.numOfClasses.get()).fit(data)
        clustering_class = clustering.fit_predict(data)
        labels = clustering.labels_
        dump(clustering, filename=self._getExtraPath('kmeans_algo.pkl'))
        dump(labels, filename=self._getExtraPath('kmeans_clustering_labels.pkl'))
        print(clustering_class)
        # Create a metadata of subtomograms for each class and an average per class
        N = self.getVolumeSize()
        Averages = np.zeros([N, N, N, self.numOfClasses.get()])
        subtomogaligneMD = md.MetaData(self._getExtraPath('aligned_subtomograms.xmd'))
        # Create a metadata for each class
        classesMD = [md.MetaData() for i in range(self.numOfClasses.get())]
        for i in subtomogaligneMD:
            name = subtomogaligneMD.getValue(md.MDL_IMAGE, i)
            # Metadata indices start at 1; subtract one to match the labels array
            label = labels[i - 1]
            vol = open_volume(name)
            Averages[:, :, :, label] += vol
            classesMD[label].setValue(md.MDL_IMAGE, name, classesMD[label].addObject())
        for i in range(self.numOfClasses.get()):
            name = self._getExtraPath('class_' + str(i).zfill(2) + '.xmd')
            classesMD[i].write(name)
        # Create a metadata for all class averages:
        md_averages = md.MetaData()
        makePath(self._getExtraPath('class_averages/'))
        for j in range(self.numOfClasses.get()):
            num = len(labels[labels == j])
            print('Number of subtomograms in class #', j, 'is ', num)
            name = self._getExtraPath('class_averages/') + 'cluster' + str(j).zfill(2) + '.spi'
            ave = np.array(Averages[:, :, :, j] / num, dtype=np.float32)
            save_volume(ave, name)
            md_averages.setValue(md.MDL_IMAGE, name, md_averages.addObject())
        md_averages.write(self._getExtraPath('averages.xmd'))

    def findTotalAverage(self):
        N = self.getVolumeSize()
        Average = np.zeros([N, N, N])
        subtomogaligneMD = md.MetaData(self._getExtraPath('aligned_subtomograms.xmd'))
        count = 0
        for i in subtomogaligneMD:
            name = subtomogaligneMD.getValue(md.MDL_IMAGE, i)
            vol = open_volume(name)
            Average[:, :, :] += vol
            count += 1
        name = self._getExtraPath('global_average.spi')
        save_volume(np.array(Average[:, :, :] / count, dtype=np.float32), name)

    def createOutputStep(self):
        out_mdfn = self._getExtraPath('averages.xmd')
        partSet = self._createSetOfVolumes('Averages')
        xmipp3.convert.readSetOfVolumes(out_mdfn, partSet)
        partSet.setSamplingRate(self.getSamplingRate())
        outvolume = Volume()
        outvolume.setSamplingRate(self.getSamplingRate())
        outvolume.setFileName(self._getExtraPath('global_average.spi'))
        self._defineOutputs(ClassAvarages=partSet, GlobalAverage=outvolume)

    # --------------------------- INFO functions --------------------------------------------
    def _summary(self):
        summary = []
        return summary

    def _validate(self):
        errors = []
        return errors

    def _citations(self):
        return ['harastani2020hybrid', 'Jin2014']

    def _methods(self):
        pass

    # --------------------------- UTILS functions --------------------------------------------
    def _printWarnings(self, *lines):
        """ Print some warning lines to 'warnings.xmd'.
        The function should be called inside the working dir. """
        with open("warnings.xmd", 'w') as fWarn:
            for l in lines:
                print(l, file=fWarn)

    def getSubtomoMetaData(self):
        if self.SubtomoSource.get() == 0:
            return self.ProtSynthesize.get()._getExtraPath('GroundTruth.xmd')
        if self.SubtomoSource.get() == 1:
            return self.StA.get()._getExtraPath('final_md.xmd')

    def getTiltRange(self):
        if self.SubtomoSource.get() == 0:
            return [self.ProtSynthesize.get().tiltLow.get(),
                    self.ProtSynthesize.get().tiltHigh.get()]
        if self.SubtomoSource.get() == 1:
            return [self.StA.get().tiltLow.get(),
                    self.StA.get().tiltHigh.get()]

    def getAngleY(self):
        if self.SubtomoSource.get() == 0:
            return 0
        if self.SubtomoSource.get() == 1:
            temp = md.MetaData(self.StA.get()._getExtraPath('final_md.xmd'))
            return temp.getValue(md.MDL_ANGLE_Y, 1)

    def getSamplingRate(self):
        if self.SubtomoSource.get() == 0:
            return self.ProtSynthesize.get().samplingRate.get()
        if self.SubtomoSource.get() == 1:
            return self.StA.get().inputVolumes.get().getSamplingRate()

    def getVolumeSize(self):
        if self.SubtomoSource.get() == 0:
            return self.ProtSynthesize.get().volumeSize.get()
        if self.SubtomoSource.get() == 1:
            return self.StA.get().SubtomogramAverage.getDim()[0]

    def getOutputMatrixFile(self):
        return self._getExtraPath('output_matrix.txt')

    def getDeformationFile(self):
        return self._getExtraPath('pdbs_mat.txt')

    def normalize(self, v):
        """Normalize a volume.

        @param v: input volume.
        @return: normalized volume.
        """
        m = np.mean(v)
        v = v - m
        s = np.std(v)
        v = v / s
        return v

    def cc(self, v1, v2):
        """Compute the normalized cross correlation between two volumes.

        @param v1: volume 1.
        @param v2: volume 2.
        @return: NCC.
        """
        vv1 = self.normalize(v1)
        vv2 = self.normalize(v2)
        score = np.sum(vv1 * vv2) / vv1.size
        return score