# **************************************************************************
# * Authors: David Maluenda (dmaluenda@cnb.csic.es)
# * Daniel del Hoyo Gomez (daniel.delhoyo.gomez@alumnos.upm.es)
# * Jorge Garcia Condado (jgcondado@cnb.csic.es)
# * Oier Lauzirika Zarrabeitia (oierlauzi@bizkaia.eu)
# *
# * Unidad de Bioinformatica of Centro Nacional de Biotecnologia , CSIC
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 2 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# * 02111-1307 USA
# *
# * All comments concerning this program package may be sent to the
# * e-mail address 'scipion@cnb.csic.es'
# *
# **************************************************************************
from pwem import emlib
from pwem.protocols import EMProtocol
from pwem.objects import Volume
from pyworkflow.protocol.params import MultiPointerParam, EnumParam, IntParam
from pyworkflow.protocol.params import Range, GE, GT, LE, LT
from pyworkflow.object import Float
from pyworkflow.constants import BETA
from pyworkflow.protocol.constants import STEPS_PARALLEL
from pyworkflow.utils.path import makePath
from xmipp3.convert import setXmippAttribute
import math
import csv
import collections
import itertools
import os.path
import numpy as np
import multiprocessing as mp
from multiprocessing.pool import ThreadPool
class XmippProtConsensusClasses3D(EMProtocol):
    """ Compare several SetOfClasses3D.
    Return the consensus clustering based on an objective function
    that uses the similarity between cluster intersections and
    the entropy of the resulting clustering.
    """
_label = 'consensus clustering 3D'
_devStatus = BETA
def __init__(self, *args, **kwargs):
EMProtocol.__init__(self, *args, **kwargs)
self._createFileNames()
def _createFileNames(self):
iterFmt='i%(iter)06d'
myDict = {
'intersections': f'intersections.csv',
'clustering': f'clusterings/{iterFmt}.csv',
'objective_values': f'objective_values.csv',
'elbows': f'elbows.csv',
'reference_sizes': f'reference_sizes.csv',
'reference_relative_sizes': f'reference_relative_sizes.csv',
}
self._updateFilenamesDict(myDict)
def _defineParams(self, form):
# Input data
form.addSection(label='Input')
form.addParam('inputClassifications', MultiPointerParam, pointerClass='SetOfClasses3D', label="Input classes",
important=True,
help='Select several sets of classes where to evaluate the '
'intersections.')
# Consensus
form.addSection(label='Consensus clustering')
form.addParam('clusteringManualCount', IntParam, label='Manual cluster count',
validators=[GE(2)], default=3,
help='Number of clusters to be obtained manually')
form.addParam('clusteringAngle', IntParam, label='Objective function threshold angle',
validators=[Range(0, 90)], default=45,
help='Angle to determine the cluster count')
# Reference random classification
form.addSection(label='Reference random classification')
form.addParam('randomClassificationCount', IntParam, label='Number of random classifications',
validators=[GE(0)], default=0,
help='Number of random classifications used for computing the significance of the actual '
'clusters. 0 for skipping this step')
form.addParallelSection(mpi=0, threads=mp.cpu_count())
# --------------------------- INSERT steps functions -----------------------
def _insertAllSteps(self):
""" Inserting one step for each intersections analysis """
outputPrerequisites = []
self._setupThreadPool(int(self.numberOfThreads))
# Intersect and ensemble
self._insertFunctionStep('intersectStep', prerequisites=[])
self._insertFunctionStep('ensembleStep')
outputPrerequisites.append(self._insertFunctionStep('findElbowsStep'))
# Perform a random classification and use it as a reference
if self.randomClassificationCount.get() > 0:
outputPrerequisites.append(self._insertFunctionStep('checkSignificanceStep', prerequisites=[]))
# Lastly create the output
self._insertFunctionStep('createOutputStep', prerequisites=outputPrerequisites)
    def intersectStep(self):
classifications = self._getInputClassifications()
# Compute the intersection among all classes
intersections = self._calculateClassificationIntersections(classifications)
# Write the result to disk
self._writeIntersections(intersections)
    def ensembleStep(self, numClusters=1):
# Read input data
classifications = self._getInputClassifications()
intersections = self._readIntersections()
# Iteratively ensemble intersections until only 1 remains
allClusterings, allObValues = self._ensembleIntersections(classifications, intersections, numClusters)
        # Reverse them so that the number of clusters increases with the index
allClusterings.reverse()
allObValues.reverse()
# Save the results
self._writeAllClusterings(allClusterings)
self._writeObjectiveValues(allObValues)
    def findElbowsStep(self):
obValues = self._readObjectiveValues()
numClusters = list(range(1, 1+len(obValues)))
# Normalize clusters and obValues
normObValues = self._normalizeValues(obValues)
normNumClusters = self._normalizeValues(numClusters)
# Calculate the elbows
elbows = {
'origin': self._findElbowOrigin(normNumClusters, normObValues),
'angle': self._findElbowAngle(normNumClusters, normObValues, -np.radians(self.clusteringAngle.get())),
'pll': self._findElbowPll(obValues)
}
# Start counting at 1
for key in elbows.keys():
elbows[key] += 1
# Save the results
self._writeElbows(elbows)
    def checkSignificanceStep(self):
classifications = self._getInputClassifications()
numRand = self.randomClassificationCount.get()
threadPool = self._getThreadPool()
# Compute the random consensus
consensusSizes, consensusRelativeSizes = self._calculateReferenceClassification(
classifications, numRand, threadPool
)
# Sort the sizes so that percentile calculation is easy
consensusSizes.sort()
consensusRelativeSizes.sort()
# Write results to disk
self._writeReferenceClassificationSizes(consensusSizes)
self._writeReferenceClassificationRelativeSizes(consensusRelativeSizes)
    def createOutputStep(self):
# Read all the necessary data from disk
manualClusterCount = self.clusteringManualCount.get()
initialClusterCount = self._readClusteringCount()
elbows = self._readElbows()
consensusSizes = self._readReferenceClassificationSizes()
consensusRelativeSizes = self._readReferenceClassificationRelativeSizes()
        # Add the manual and initial cluster counts to the elbows dictionary
if manualClusterCount > 0:
elbows['manual'] = manualClusterCount
elbows['initial'] = initialClusterCount
# Read selected clusters
clusterings = dict(zip(elbows.keys(), map(self._readClustering, elbows.values())))
# Create the output classes and define them
particles = self._getInputParticles()
outputClasses = dict(zip(
map(lambda name : 'outputClasses_'+name,
clusterings.keys()
),
map(self._createOutputClasses3D,
itertools.repeat(particles),
clusterings.values(),
clusterings.keys(),
itertools.repeat(consensusSizes),
itertools.repeat(consensusRelativeSizes)
)
))
self._defineOutputs(**outputClasses)
        # Establish source-output relationships
sources = list(self.inputClassifications)
destinations = list(outputClasses.values())
for src in sources:
for dst in destinations:
self._defineSourceRelation(src, dst)
# --------------------------- INFO functions -------------------------------
def _validate(self):
errors = []
# Ensure that all classifications are made of the same images
images = self._getInputParticles(0)
for i in range(1, len(self.inputClassifications)):
if self._getInputParticles(i) != images:
errors.append(f'Classification {i} has been done with different images')
return errors
# --------------------------- UTILS functions ------------------------------
def _getThreadPool(self):
return getattr(self, '_threadPool', None)
def _setupThreadPool(self, maxThreads):
if maxThreads > 1:
self._threadPool = ThreadPool(processes=maxThreads)
def _getInputParticles(self, i=0):
return self.inputClassifications[i].get().getImages()
def _getInputClassifications(self):
if not hasattr(self, '_classifications'):
self._classifications = self._convertInputClassifications(self.inputClassifications)
return self._classifications
def _writeTable(self, path, table, fmt='%s'):
np.savetxt(path, table, delimiter=',', fmt=fmt)
def _readTable(self, path, dtype=str):
return np.genfromtxt(path, delimiter=',', dtype=dtype)
def _writeList(self, path, lst, fmt='%s'):
self._writeTable(path, lst, fmt)
def _readList(self, path, dtype=str):
return self._readTable(path, dtype=dtype)
def _writeClassification(self, path, classification):
with open(path, 'w') as file:
writer = csv.writer(file)
# Write the header
header = ['Representative index', 'Representative path', 'Source size', 'Particle ids']
writer.writerow(header)
# Transform the classification
def f(cls):
result = []
# Add the location of the representative
representative = cls.getRepresentative()
location = representative.getLocation() if representative is not None else (0, '')
result += list(location)
# Add the source size
result += [cls.getSourceSize()]
# Add the particles
result += list(cls.getParticleIds())
assert(len(result) == (len(cls) + 3))
return result
writer.writerows(map(f, classification))
def _readClassification(self, path):
with open(path, 'r') as file:
reader = csv.reader(file)
# Ignore the header
next(reader)
# Transform each row to a ParticleCluster in a classification
def f(row):
ite = iter(row)
# Parse the representative
index = int(next(ite))
path = next(ite)
representative = Volume(location=(index, path))
# Parse the source size
sourceSize = int(next(ite))
# Parse the id set
particleIds = set(map(lambda id : int(id), ite))
result = XmippProtConsensusClasses3D.ParticleCluster(particleIds, representative, sourceSize)
assert(len(row) == (len(result) + 3))
return result
return list(map(f, reader))
def _writeDictionary(self, path, d):
with open(path, 'w') as file:
writer = csv.writer(file)
writer.writerow(d.keys()) # Header
writer.writerow(d.values()) # Values
def _readDictionary(self, path, dtype=str):
with open(path, 'r') as file:
reader = csv.reader(file)
keys = list(next(reader))
values = list(map(lambda value : dtype(value), next(reader)))
return dict(zip(keys, values))
def _writeIntersections(self, intersections):
self._writeClassification(self._getExtraPath(self._getFileName('intersections')), intersections)
def _readIntersections(self):
return self._readClassification(self._getExtraPath(self._getFileName('intersections')))
def _writeClustering(self, iter, clustering):
self._writeClassification(self._getExtraPath(self._getFileName('clustering', iter=iter)), clustering)
def _readClustering(self, iter):
return self._readClassification(self._getExtraPath(self._getFileName('clustering', iter=iter)))
def _readClusteringCount(self):
count = 0
while os.path.exists(self._getExtraPath(self._getFileName('clustering', iter=count+1))):
count += 1
return count
def _writeAllClusterings(self, clusterings):
# Create the path
makePath(os.path.dirname(self._getExtraPath(self._getFileName('clustering', iter=0))))
# Write all
for iter, clustering in enumerate(clusterings, start=1):
assert(len(clustering) == iter)
self._writeClustering(iter, clustering)
def _readAllClusterings(self):
return list(map(self._readClustering, range(1, 1+self._readClusteringCount())))
def _writeObjectiveValues(self, allObValues):
self._writeList(self._getExtraPath(self._getFileName('objective_values')), allObValues, '%f')
def _readObjectiveValues(self):
return self._readList(self._getExtraPath(self._getFileName('objective_values')), dtype=float)
def _writeElbows(self, elbows):
self._writeDictionary(self._getExtraPath(self._getFileName('elbows')), elbows)
def _readElbows(self):
return self._readDictionary(self._getExtraPath(self._getFileName('elbows')), dtype=int)
def _writeReferenceClassificationSizes(self, sizes):
self._writeList(self._getExtraPath(self._getFileName('reference_sizes')), sizes, '%d')
def _readReferenceClassificationSizes(self):
path = self._getExtraPath(self._getFileName('reference_sizes'))
if os.path.exists(path):
return self._readList(path, dtype=int)
else:
return None
def _writeReferenceClassificationRelativeSizes(self, sizes):
self._writeList(self._getExtraPath(self._getFileName('reference_relative_sizes')), sizes, '%f')
def _readReferenceClassificationRelativeSizes(self):
path = self._getExtraPath(self._getFileName('reference_relative_sizes'))
if os.path.exists(path):
return self._readList(path, dtype=float)
else:
return None
def _convertInputClassifications(self, classifications):
""" Returns the list of lists of sets that stores the set of ids of each class"""
def class3dToParticleCluster(cls):
ids = cls.getIdSet()
rep = cls.getRepresentative().clone() if cls.hasRepresentative() else None
return XmippProtConsensusClasses3D.ParticleCluster(ids, rep)
f = lambda classification : list(map(class3dToParticleCluster, classification.get()))
result = list(map(f, classifications))
return result
def _calculateClassificationIntersections(self, classifications):
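        # Successively intersect every class of each classification with the accumulated
        # intersections, keeping only the non-empty results, e.g.
        # {1,2,3},{4,5,6} intersected with {1,2},{3,4,5,6}  ->  {1,2},{3},{4,5,6}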
# Start with the first classification
result = classifications[0]
for i in range(1, len(classifications)):
classification = classifications[i]
intersections = []
# Perform the intersection with previous classes
for cls0 in result:
for cls1 in classification:
intersection = cls0.intersection(cls1)
if intersection:
intersections.append(intersection)
# Use the new intersection for the next step
result = intersections
return result
def _calculateSubsetSimilarity(self, n, c):
""" Return the similarity of intersection subset (n) with class subset (c) """
inter = len(n.intersection(c))
return inter / len(n)
def _buildSimilarityVector(self, n, cs):
""" Build the similarity vector of n subset with each c subset in cs """
return list(map(self._calculateSubsetSimilarity, itertools.repeat(n), cs))
def _calculateEntropy(self, p):
""" Return the entropy of the elements in a vector """
result = -sum(map(lambda x : x*math.log2(x), p))
assert(result >= 0) # Entropy cannot be negative
return result
def _calculateDistribution(self, ns):
""" Return the vector of distribution (p) from the n subsets in ns
"""
p = list(map(len, ns))
return np.array(p) / sum(p)
def _calculateCosineSimilarity(self, v1, v2):
""" Return the cosine similarity of two vectors that is used as similarity """
return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
def _objectiveFunction(self, ns, indices, vs, eps=0.01):
"""Objective function to minimize based on the entropy and the similarity between the clusters to merge
"""
p = self._calculateDistribution(ns)
pi = self._calculateDistribution(self._ensembleIntersectionPair(ns, indices))
h = self._calculateEntropy(p)
hi = self._calculateEntropy(pi)
sv = self._calculateCosineSimilarity(vs[indices[0]], vs[indices[1]])
return (h - hi) / (sv + eps)
def _ensembleIntersectionPair(self, intersections, indices):
""" Return a list of subsets where the intersections[indices] have been merged """
# Divide the input intersections into the ones referred by indices and not
selection = [intersections[i] for i in indices] # intersection[indices]
result = [intersections[i] for i in range(len(intersections)) if i not in indices] # intersections - selection
# Merge all the selected clusters into the same one
merged = XmippProtConsensusClasses3D.ParticleCluster.union(*selection)
# Add the merged items to the result
result.append(merged)
return result
def _ensembleCheapestIntersectionPair(self, allClasses, intersections):
""" Merges the lest costly pair of intersections """
# Compute the similarity vectors
similarityVectors = list(map(self._buildSimilarityVector, intersections, itertools.repeat(allClasses)))
# For each possible pair of intersections compute the cost of merging them
obValues = {}
for pair in itertools.combinations(range(len(intersections)), r=2):
obValues[pair] = self._objectiveFunction(intersections, pair, similarityVectors)
values = list(obValues.values())
keys = list(obValues.keys())
# Select the minimum cost pair and merge them
obValue = min(values)
mergePair = keys[values.index(obValue)]
mergedIntersections = self._ensembleIntersectionPair(intersections, mergePair)
return mergedIntersections, obValue
def _ensembleIntersections(self, classifications, intersections, numClusters=1):
""" Ensemble clustering by COSS that merges interactions of clusters based
on entropy minimization """
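        # Greedy agglomeration: at every iteration the pair of intersections with the lowest
        # objective value is merged, so allIntersections[k] contains len(intersections) - k groups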
# Results
allIntersections = [intersections]
allObValues = [0.0]
# Obtain a list with all source classes
allClasses = list(itertools.chain(*classifications))
# Iteratively merge intersections
while len(allIntersections[-1]) > numClusters:
mergedIntersections, obValue = self._ensembleCheapestIntersectionPair(allClasses, allIntersections[-1])
allIntersections.append(mergedIntersections)
allObValues.append(obValue)
assert(len(allIntersections) == len(allObValues))
return allIntersections, allObValues
def _normalizeValues(self, x):
"""Normalize values to range 0 to 1"""
return (x-np.min(x))/(np.max(x)-np.min(x))
def _calculateGaussianMleEstimates(self, d, q):
""" MLE estimates of guassian distributions
with common variance parameter
"""
p = len(d)
d1 = d[:q]
d2 = d[q:]
mu1 = np.mean(d1)
mu2 = np.mean(d2)
var1 = np.var(d1)
var2 = np.var(d2)
sigma = ((q-1)*var1+(p-q-1)*var2)/(p-2)
theta1 = [mu1, sigma]
theta2 = [mu2, sigma]
return theta1, theta2
def _calculateGaussianDistribution(self, d, theta):
""" Gaussian pdf """
m = theta[0]
s = theta[1]
gauss = 1/(np.sqrt(2*np.pi*s))*np.exp(-1/(2*s)*(d-m)**2)
return gauss
def _calculateProfileLogLikelihood(self, d, q, theta1, theta2):
""" Profile log likelihood for given parameters """
log_f_q = np.log(self._calculateGaussianDistribution(d[:q], theta1))
log_f_p = np.log(self._calculateGaussianDistribution(d[q:], theta2))
l_q = np.sum(log_f_q) + np.sum(log_f_p)
return l_q
def _calculateFullProfileLogLikelihood(self, d):
""" Calculate profile log likelihood for each partition
of the data """
pll = []
for q in range(1, len(d)):
theta1, theta2 = self._calculateGaussianMleEstimates(d, q)
pll.append(self._calculateProfileLogLikelihood(d, q, theta1, theta2))
return pll
def _findElbowPll(self, obValues):
""" Find the elbow according to the full profile likelihood
"""
# Get profile log likelihood for log of objective values
# Remove last obValue as it is zero and log(0) is undefined
pll = self._calculateFullProfileLogLikelihood(np.log(obValues[:-1]))
return np.argmax(pll)
def _findElbowOrigin(self, x, y):
""" Find the point closest to origin from normalied data
"""
coors = np.column_stack((x, y))
distances = np.linalg.norm(coors, axis=1)
assert(len(x) == len(distances))
return np.argmin(distances)
def _findElbowAngle(self, x, y, angle=-math.pi/4):
""" Find the angle the slope of the function makes and
return the point at which the slope crosses the given angle
"""
dx = np.diff(x)
dy = np.diff(y)
angles = np.arctan2(dy, dx)
        # Find the first point whose slope angle is greater than or equal to the given one
crossing = np.argmax(angles >= angle)
return crossing
def _getClassificationLengths(self, classifications):
""" Returns a list of lists that stores the lengths of each classification """
return list(map(lambda classification : list(map(len, classification)), classifications))
def _calculateRandomClassification(self, C, N):
""" Randomly classifies N element indices into groups with
sizes defined by rows of C """
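        # e.g. C = [[3, 2]], N = 5  ->  one classification with a group of 3 and a group of
        # 2 random, non-overlapping indices drawn from range(5)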
Cp = []
for Ci in C:
            assert(sum(Ci) == N)  # The groups should cover all the elements (TODO: maybe LEQ?)
x = np.argsort(np.random.uniform(size=N)).tolist() # Shuffles a [0, N) iota
# Select the number random indices requested by each group
Cip = []
first = 0
for s in Ci:
Cip.append(XmippProtConsensusClasses3D.ParticleCluster(x[first:first + s]))
first += s
# Add the random classification to the result
Cp.append(Cip)
return Cp
def _calculateRandomClassificationConsensus(self, C, N):
""" Obtains the intersections of a consensus
of a random classification of sizes defined in C of
N elements. C is a list of lists, where de sum of each
row must equal N """
# Create random partitions of same size
randomClassification = self._calculateRandomClassification(C, N)
# Compute the repeated classifications
return self._calculateClassificationIntersections(randomClassification)
def _calculateReferenceClassification(self, classifications, numExec, threadPool=None):
""" Create random partitions of same size to compare the quality
of the classification """
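        # The sizes of these random consensus groups act as a null distribution against which
        # the sizes of the actual consensus groups are compared (see the ClassesLoader p-values)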
clusterLengths = self._getClassificationLengths(classifications)
numParticles = sum(clusterLengths[0])
# Repeatedly obtain a consensus of a random classification of same size
if threadPool:
consensus = threadPool.starmap(
self._calculateRandomClassificationConsensus,
itertools.repeat((clusterLengths, numParticles), numExec)
)
else:
consensus = list(map(
self._calculateRandomClassificationConsensus,
itertools.repeat(clusterLengths, numExec),
itertools.repeat(numParticles, numExec)
))
assert(len(consensus) == numExec)
# Concatenate all consensuses
consensus = list(itertools.chain(*consensus))
# Obtain the consensus sizes
consensusSizes = list(map(len, consensus))
consensusSizeRatios = list(map(XmippProtConsensusClasses3D.ParticleCluster.getRelativeSize, consensus))
return consensusSizes, consensusSizeRatios
def _createOutputClasses3D(self, particles, clustering, name, randomConsensusSizes=None, randomConsensusRelativeSizes=None):
outputClasses = self._createSetOfClasses3D(particles, suffix=name)
# Create a list with the filenames of the representatives
representatives = list(map(
XmippProtConsensusClasses3D.ParticleCluster.getRepresentative,
clustering
))
# Fill the output
loader = XmippProtConsensusClasses3D.ClassesLoader(
clustering,
representatives,
randomConsensusSizes,
randomConsensusRelativeSizes
)
loader.fillClasses(outputClasses)
return outputClasses
    class ParticleCluster:
        """ Keeps track of the information related to successive class intersections.
        It is instantiated with a single class and allows performing intersections
        and unions with it """
def __init__(self, particleIds, representative=None, sourceSize=None):
self._particleIds = set(particleIds) # Particle ids belonging to this intersection
self._representative = representative
self._sourceSize = sourceSize if sourceSize is not None else len(self._particleIds) # The size of the representative class
def __len__(self):
return len(self.getParticleIds())
def __eq__(self, other):
return self.getParticleIds() == other.getParticleIds()
        def intersection(self, *others):
return self._combine(set.intersection, min, XmippProtConsensusClasses3D.ParticleCluster.getSourceSize, *others)
        def union(self, *others):
return self._combine(set.union, max, len, *others)
        def getParticleIds(self):
return self._particleIds
        def getRepresentative(self):
return self._representative
        def getSourceSize(self):
return self._sourceSize
        def getRelativeSize(self):
return len(self) / self.getSourceSize()
def _combine(self, operation, selector, selectionCriteria, *others):
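            # 'operation' combines the particle id sets, while 'selector' and 'selectionCriteria'
            # pick the member that provides the representative and source size: intersection keeps
            # those of the member with the smallest source class, union those of the largest member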
allItems = (self, ) + others
# Perform the requested operation among the id sets
allParticleIds = map(XmippProtConsensusClasses3D.ParticleCluster.getParticleIds, allItems)
particleIds = operation(*allParticleIds)
# Select the class with the appropriate criteria
selection = selector(allItems, key=selectionCriteria)
representative = selection.getRepresentative()
sourceSize = selection.getSourceSize()
return XmippProtConsensusClasses3D.ParticleCluster(particleIds, representative, sourceSize)
    class ClassesLoader:
        """ Helper class to populate the output classes
        """
def __init__(self, clustering, representatives, randomConsensusSizes=None, randomConsensusRelativeSizes=None):
self.clustering = clustering
self.classification = self._createClassification(clustering)
self.representatives = representatives
self.randomConsensusSizes = randomConsensusSizes
self.randomConsensusRelativeSizes = randomConsensusRelativeSizes
        def fillClasses(self, clsSet):
clsSet.classifyItems(updateItemCallback=self._updateParticle,
updateClassCallback=self._updateClass,
itemDataIterator=iter(self.classification.items()),
doClone=False)
def _updateParticle(self, item, row):
particleId, classId = row
assert(item.getObjId() == particleId)
item.setClassId(classId)
def _updateClass(self, item):
classId = item.getObjId()
classIdx = classId-1
# Set the representative
item.setRepresentative(self.representatives[classIdx])
# Set the size p-value
if self.randomConsensusSizes is not None:
size = len(self.clustering[classIdx])
percentile = self._findPercentile(self.randomConsensusSizes, size)
pValue = 1 - percentile
setXmippAttribute(item, emlib.MDL_CLASS_INTERSECTION_SIZE_PVALUE, Float(pValue))
# Set the relative size p-value
if self.randomConsensusRelativeSizes is not None:
size = self.clustering[classIdx].getRelativeSize()
percentile = self._findPercentile(self.randomConsensusRelativeSizes, size)
pValue = 1 - percentile
setXmippAttribute(item, emlib.MDL_CLASS_INTERSECTION_RELATIVE_SIZE_PVALUE, Float(pValue))
def _createClassification(self, clustering):
# Fill the classification data (particle-class mapping)
result = {}
for cls, data in enumerate(clustering):
for particleId in data.getParticleIds():
result[particleId] = cls + 1
return collections.OrderedDict(sorted(result.items()))
def _findPercentile(self, data, value):
""" Given an array of values (data), finds the corresponding percentile of the value.
Percentile is returned in range [0, 1] """
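            # e.g. data = [1, 2, 3, 4], value = 2.5  ->  2 elements below  ->  0.5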
assert(np.array_equal(sorted(data), data)) # In order to iterate it in ascending order
# Count the number of elements that are smaller than the given value
i = 0
while i < len(data) and data[i] < value:
i += 1
# Convert it into a percentage
return float(i) / float(len(data))