Source code for xmipp3.protocols.protocol_consensus_classes3D

# **************************************************************************
# * Authors:     David Maluenda (
# *              Daniel del Hoyo Gomez (
# *              Jorge Garcia Condado (
# *              Oier Lauzirika Zarrabeitia (
# *
# * Unidad de  Bioinformatica of Centro Nacional de Biotecnologia , CSIC
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 2 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# * 02111-1307  USA
# *
# *  All comments concerning this program package may be sent to the
# *  e-mail address ''
# *
# **************************************************************************

from unittest import result
from pwem import emlib
from pwem.protocols import EMProtocol
from pwem.objects import Volume

from pyworkflow.protocol.params import MultiPointerParam, EnumParam, IntParam
from pyworkflow.protocol.params import Range, GE, GT, LE, LT
from pyworkflow.object import Float
from pyworkflow.constants import BETA
from pyworkflow.protocol.constants import STEPS_PARALLEL
from pyworkflow.utils.path import makePath

from xmipp3.convert import setXmippAttribute

import math
import csv
import collections
import itertools
import os.path
import numpy as np
import multiprocessing as mp
from multiprocessing.pool import ThreadPool

[docs]class XmippProtConsensusClasses3D(EMProtocol): """ Compare several SetOfClasses3D. Return the consensus clustering based on a objective function that uses the similarity between clusters intersections and the entropy of the clustering formed. """ _label = 'consensus clustering 3D' _devStatus = BETA def __init__(self, *args, **kwargs): EMProtocol.__init__(self, *args, **kwargs) self._createFileNames() def _createFileNames(self): iterFmt='i%(iter)06d' myDict = { 'intersections': f'intersections.csv', 'clustering': f'clusterings/{iterFmt}.csv', 'objective_values': f'objective_values.csv', 'elbows': f'elbows.csv', 'reference_sizes': f'reference_sizes.csv', 'reference_relative_sizes': f'reference_relative_sizes.csv', } self._updateFilenamesDict(myDict) def _defineParams(self, form): # Input data form.addSection(label='Input') form.addParam('inputClassifications', MultiPointerParam, pointerClass='SetOfClasses3D', label="Input classes", important=True, help='Select several sets of classes where to evaluate the ' 'intersections.') # Consensus form.addSection(label='Consensus clustering') form.addParam('clusteringManualCount', IntParam, label='Manual cluster count', validators=[GE(2)], default=3, help='Number of clusters to be obtained manually') form.addParam('clusteringAngle', IntParam, label='Objective function threshold angle', validators=[Range(0, 90)], default=45, help='Angle to determine the cluster count') # Reference random classification form.addSection(label='Reference random classification') form.addParam('randomClassificationCount', IntParam, label='Number of random classifications', validators=[GE(0)], default=0, help='Number of random classifications used for computing the significance of the actual ' 'clusters. 0 for skipping this step') form.addParallelSection(mpi=0, threads=mp.cpu_count()) # --------------------------- INSERT steps functions ----------------------- def _insertAllSteps(self): """ Inserting one step for each intersections analysis """ outputPrerequisites = [] self._setupThreadPool(int(self.numberOfThreads)) # Intersect and ensemble self._insertFunctionStep('intersectStep', prerequisites=[]) self._insertFunctionStep('ensembleStep') outputPrerequisites.append(self._insertFunctionStep('findElbowsStep')) # Perform a random classification and use it as a reference if self.randomClassificationCount.get() > 0: outputPrerequisites.append(self._insertFunctionStep('checkSignificanceStep', prerequisites=[])) # Lastly create the output self._insertFunctionStep('createOutputStep', prerequisites=outputPrerequisites)
[docs] def intersectStep(self): classifications = self._getInputClassifications() # Compute the intersection among all classes intersections = self._calculateClassificationIntersections(classifications) # Write the result to disk self._writeIntersections(intersections)
[docs] def ensembleStep(self, numClusters=1): # Read input data classifications = self._getInputClassifications() intersections = self._readIntersections() # Iteratively ensemble intersections until only 1 remains allClusterings, allObValues = self._ensembleIntersections(classifications, intersections, numClusters) # Reverse them so that their size increases with the index allClusterings.reverse() allObValues.reverse() # Save the results self._writeAllClusterings(allClusterings) self._writeObjectiveValues(allObValues)
[docs] def findElbowsStep(self): obValues = self._readObjectiveValues() numClusters = list(range(1, 1+len(obValues))) # Normalize clusters and obValues normObValues = self._normalizeValues(obValues) normNumClusters = self._normalizeValues(numClusters) # Calculate the elbows elbows = { 'origin': self._findElbowOrigin(normNumClusters, normObValues), 'angle': self._findElbowAngle(normNumClusters, normObValues, -np.radians(self.clusteringAngle.get())), 'pll': self._findElbowPll(obValues) } # Start counting at 1 for key in elbows.keys(): elbows[key] += 1 # Save the results self._writeElbows(elbows)
[docs] def checkSignificanceStep(self): classifications = self._getInputClassifications() numRand = self.randomClassificationCount.get() threadPool = self._getThreadPool() # Compute the random consensus consensusSizes, consensusRelativeSizes = self._calculateReferenceClassification( classifications, numRand, threadPool ) # Sort the sizes so that percentile calculation is easy consensusSizes.sort() consensusRelativeSizes.sort() # Write results to disk self._writeReferenceClassificationSizes(consensusSizes) self._writeReferenceClassificationRelativeSizes(consensusRelativeSizes)
[docs] def createOutputStep(self): # Read all the necessary data from disk manualClusterCount = self.clusteringManualCount.get() initialClusterCount = self._readClusteringCount() elbows = self._readElbows() consensusSizes = self._readReferenceClassificationSizes() consensusRelativeSizes = self._readReferenceClassificationRelativeSizes() # Append the manual and initial indices to the elbow if manualClusterCount > 0: elbows['manual'] = manualClusterCount elbows['initial'] = initialClusterCount # Read selected clusters clusterings = dict(zip(elbows.keys(), map(self._readClustering, elbows.values()))) # Create the output classes and define them particles = self._getInputParticles() outputClasses = dict(zip( map(lambda name : 'outputClasses_'+name, clusterings.keys() ), map(self._createOutputClasses3D, itertools.repeat(particles), clusterings.values(), clusterings.keys(), itertools.repeat(consensusSizes), itertools.repeat(consensusRelativeSizes) ) )) self._defineOutputs(**outputClasses) # Stablish source output relationships sources = list(self.inputClassifications) destinations = list(outputClasses.values()) for src in sources: for dst in destinations: self._defineSourceRelation(src, dst)
# --------------------------- INFO functions ------------------------------- def _validate(self): errors = [] # Ensure that all classifications are made of the same images images = self._getInputParticles(0) for i in range(1, len(self.inputClassifications)): if self._getInputParticles(i) != images: errors.append(f'Classification {i} has been done with different images') return errors # --------------------------- UTILS functions ------------------------------ def _getThreadPool(self): return getattr(self, '_threadPool', None) def _setupThreadPool(self, maxThreads): if maxThreads > 1: self._threadPool = ThreadPool(processes=maxThreads) def _getInputParticles(self, i=0): return self.inputClassifications[i].get().getImages() def _getInputClassifications(self): if not hasattr(self, '_classifications'): self._classifications = self._convertInputClassifications(self.inputClassifications) return self._classifications def _writeTable(self, path, table, fmt='%s'): np.savetxt(path, table, delimiter=',', fmt=fmt) def _readTable(self, path, dtype=str): return np.genfromtxt(path, delimiter=',', dtype=dtype) def _writeList(self, path, lst, fmt='%s'): self._writeTable(path, lst, fmt) def _readList(self, path, dtype=str): return self._readTable(path, dtype=dtype) def _writeClassification(self, path, classification): with open(path, 'w') as file: writer = csv.writer(file) # Write the header header = ['Representative index', 'Representative path', 'Source size', 'Particle ids'] writer.writerow(header) # Transform the classification def f(cls): result = [] # Add the location of the representative representative = cls.getRepresentative() location = representative.getLocation() if representative is not None else (0, '') result += list(location) # Add the source size result += [cls.getSourceSize()] # Add the particles result += list(cls.getParticleIds()) assert(len(result) == (len(cls) + 3)) return result writer.writerows(map(f, classification)) def _readClassification(self, path): with open(path, 'r') as file: reader = csv.reader(file) # Ignore the header next(reader) # Transform each row to a ParticleCluster in a classification def f(row): ite = iter(row) # Parse the representative index = int(next(ite)) path = next(ite) representative = Volume(location=(index, path)) # Parse the source size sourceSize = int(next(ite)) # Parse the id set particleIds = set(map(lambda id : int(id), ite)) result = XmippProtConsensusClasses3D.ParticleCluster(particleIds, representative, sourceSize) assert(len(row) == (len(result) + 3)) return result return list(map(f, reader)) def _writeDictionary(self, path, d): with open(path, 'w') as file: writer = csv.writer(file) writer.writerow(d.keys()) # Header writer.writerow(d.values()) # Values def _readDictionary(self, path, dtype=str): with open(path, 'r') as file: reader = csv.reader(file) keys = list(next(reader)) values = list(map(lambda value : dtype(value), next(reader))) return dict(zip(keys, values)) def _writeIntersections(self, intersections): self._writeClassification(self._getExtraPath(self._getFileName('intersections')), intersections) def _readIntersections(self): return self._readClassification(self._getExtraPath(self._getFileName('intersections'))) def _writeClustering(self, iter, clustering): self._writeClassification(self._getExtraPath(self._getFileName('clustering', iter=iter)), clustering) def _readClustering(self, iter): return self._readClassification(self._getExtraPath(self._getFileName('clustering', iter=iter))) def _readClusteringCount(self): count = 0 while os.path.exists(self._getExtraPath(self._getFileName('clustering', iter=count+1))): count += 1 return count def _writeAllClusterings(self, clusterings): # Create the path makePath(os.path.dirname(self._getExtraPath(self._getFileName('clustering', iter=0)))) # Write all for iter, clustering in enumerate(clusterings, start=1): assert(len(clustering) == iter) self._writeClustering(iter, clustering) def _readAllClusterings(self): return list(map(self._readClustering, range(1, 1+self._readClusteringCount()))) def _writeObjectiveValues(self, allObValues): self._writeList(self._getExtraPath(self._getFileName('objective_values')), allObValues, '%f') def _readObjectiveValues(self): return self._readList(self._getExtraPath(self._getFileName('objective_values')), dtype=float) def _writeElbows(self, elbows): self._writeDictionary(self._getExtraPath(self._getFileName('elbows')), elbows) def _readElbows(self): return self._readDictionary(self._getExtraPath(self._getFileName('elbows')), dtype=int) def _writeReferenceClassificationSizes(self, sizes): self._writeList(self._getExtraPath(self._getFileName('reference_sizes')), sizes, '%d') def _readReferenceClassificationSizes(self): path = self._getExtraPath(self._getFileName('reference_sizes')) if os.path.exists(path): return self._readList(path, dtype=int) else: return None def _writeReferenceClassificationRelativeSizes(self, sizes): self._writeList(self._getExtraPath(self._getFileName('reference_relative_sizes')), sizes, '%f') def _readReferenceClassificationRelativeSizes(self): path = self._getExtraPath(self._getFileName('reference_relative_sizes')) if os.path.exists(path): return self._readList(path, dtype=float) else: return None def _convertInputClassifications(self, classifications): """ Returns the list of lists of sets that stores the set of ids of each class""" def class3dToParticleCluster(cls): ids = cls.getIdSet() rep = cls.getRepresentative().clone() if cls.hasRepresentative() else None return XmippProtConsensusClasses3D.ParticleCluster(ids, rep) f = lambda classification : list(map(class3dToParticleCluster, classification.get())) result = list(map(f, classifications)) return result def _calculateClassificationIntersections(self, classifications): # Start with the first classification result = classifications[0] for i in range(1, len(classifications)): classification = classifications[i] intersections = [] # Perform the intersection with previous classes for cls0 in result: for cls1 in classification: intersection = cls0.intersection(cls1) if intersection: intersections.append(intersection) # Use the new intersection for the next step result = intersections return result def _calculateSubsetSimilarity(self, n, c): """ Return the similarity of intersection subset (n) with class subset (c) """ inter = len(n.intersection(c)) return inter / len(n) def _buildSimilarityVector(self, n, cs): """ Build the similarity vector of n subset with each c subset in cs """ return list(map(self._calculateSubsetSimilarity, itertools.repeat(n), cs)) def _calculateEntropy(self, p): """ Return the entropy of the elements in a vector """ result = -sum(map(lambda x : x*math.log2(x), p)) assert(result >= 0) # Entropy cannot be negative return result def _calculateDistribution(self, ns): """ Return the vector of distribution (p) from the n subsets in ns """ p = list(map(len, ns)) return np.array(p) / sum(p) def _calculateCosineSimilarity(self, v1, v2): """ Return the cosine similarity of two vectors that is used as similarity """ return, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)) def _objectiveFunction(self, ns, indices, vs, eps=0.01): """Objective function to minimize based on the entropy and the similarity between the clusters to merge """ p = self._calculateDistribution(ns) pi = self._calculateDistribution(self._ensembleIntersectionPair(ns, indices)) h = self._calculateEntropy(p) hi = self._calculateEntropy(pi) sv = self._calculateCosineSimilarity(vs[indices[0]], vs[indices[1]]) return (h - hi) / (sv + eps) def _ensembleIntersectionPair(self, intersections, indices): """ Return a list of subsets where the intersections[indices] have been merged """ # Divide the input intersections into the ones referred by indices and not selection = [intersections[i] for i in indices] # intersection[indices] result = [intersections[i] for i in range(len(intersections)) if i not in indices] # intersections - selection # Merge all the selected clusters into the same one merged = XmippProtConsensusClasses3D.ParticleCluster.union(*selection) # Add the merged items to the result result.append(merged) return result def _ensembleCheapestIntersectionPair(self, allClasses, intersections): """ Merges the lest costly pair of intersections """ # Compute the similarity vectors similarityVectors = list(map(self._buildSimilarityVector, intersections, itertools.repeat(allClasses))) # For each possible pair of intersections compute the cost of merging them obValues = {} for pair in itertools.combinations(range(len(intersections)), r=2): obValues[pair] = self._objectiveFunction(intersections, pair, similarityVectors) values = list(obValues.values()) keys = list(obValues.keys()) # Select the minimum cost pair and merge them obValue = min(values) mergePair = keys[values.index(obValue)] mergedIntersections = self._ensembleIntersectionPair(intersections, mergePair) return mergedIntersections, obValue def _ensembleIntersections(self, classifications, intersections, numClusters=1): """ Ensemble clustering by COSS that merges interactions of clusters based on entropy minimization """ # Results allIntersections = [intersections] allObValues = [0.0] # Obtain a list with all source classes allClasses = list(itertools.chain(*classifications)) # Iteratively merge intersections while len(allIntersections[-1]) > numClusters: mergedIntersections, obValue = self._ensembleCheapestIntersectionPair(allClasses, allIntersections[-1]) allIntersections.append(mergedIntersections) allObValues.append(obValue) assert(len(allIntersections) == len(allObValues)) return allIntersections, allObValues def _normalizeValues(self, x): """Normalize values to range 0 to 1""" return (x-np.min(x))/(np.max(x)-np.min(x)) def _calculateGaussianMleEstimates(self, d, q): """ MLE estimates of guassian distributions with common variance parameter """ p = len(d) d1 = d[:q] d2 = d[q:] mu1 = np.mean(d1) mu2 = np.mean(d2) var1 = np.var(d1) var2 = np.var(d2) sigma = ((q-1)*var1+(p-q-1)*var2)/(p-2) theta1 = [mu1, sigma] theta2 = [mu2, sigma] return theta1, theta2 def _calculateGaussianDistribution(self, d, theta): """ Gaussian pdf """ m = theta[0] s = theta[1] gauss = 1/(np.sqrt(2*np.pi*s))*np.exp(-1/(2*s)*(d-m)**2) return gauss def _calculateProfileLogLikelihood(self, d, q, theta1, theta2): """ Profile log likelihood for given parameters """ log_f_q = np.log(self._calculateGaussianDistribution(d[:q], theta1)) log_f_p = np.log(self._calculateGaussianDistribution(d[q:], theta2)) l_q = np.sum(log_f_q) + np.sum(log_f_p) return l_q def _calculateFullProfileLogLikelihood(self, d): """ Calculate profile log likelihood for each partition of the data """ pll = [] for q in range(1, len(d)): theta1, theta2 = self._calculateGaussianMleEstimates(d, q) pll.append(self._calculateProfileLogLikelihood(d, q, theta1, theta2)) return pll def _findElbowPll(self, obValues): """ Find the elbow according to the full profile likelihood """ # Get profile log likelihood for log of objective values # Remove last obValue as it is zero and log(0) is undefined pll = self._calculateFullProfileLogLikelihood(np.log(obValues[:-1])) return np.argmax(pll) def _findElbowOrigin(self, x, y): """ Find the point closest to origin from normalied data """ coors = np.column_stack((x, y)) distances = np.linalg.norm(coors, axis=1) assert(len(x) == len(distances)) return np.argmin(distances) def _findElbowAngle(self, x, y, angle=-math.pi/4): """ Find the angle the slope of the function makes and return the point at which the slope crosses the given angle """ dx = np.diff(x) dy = np.diff(y) angles = np.arctan2(dy, dx) # Find the first point with an angle greater than the given one crossing = np.argmax(angles >= angle) return crossing def _getClassificationLengths(self, classifications): """ Returns a list of lists that stores the lengths of each classification """ return list(map(lambda classification : list(map(len, classification)), classifications)) def _calculateRandomClassification(self, C, N): """ Randomly classifies N element indices into groups with sizes defined by rows of C """ Cp = [] for Ci in C: assert(sum(Ci) == N) # The groups should address all the elements TODO: Maybe LEQ? x = np.argsort(np.random.uniform(size=N)).tolist() # Shuffles a [0, N) iota # Select the number random indices requested by each group Cip = [] first = 0 for s in Ci: Cip.append(XmippProtConsensusClasses3D.ParticleCluster(x[first:first + s])) first += s # Add the random classification to the result Cp.append(Cip) return Cp def _calculateRandomClassificationConsensus(self, C, N): """ Obtains the intersections of a consensus of a random classification of sizes defined in C of N elements. C is a list of lists, where de sum of each row must equal N """ # Create random partitions of same size randomClassification = self._calculateRandomClassification(C, N) # Compute the repeated classifications return self._calculateClassificationIntersections(randomClassification) def _calculateReferenceClassification(self, classifications, numExec, threadPool=None): """ Create random partitions of same size to compare the quality of the classification """ clusterLengths = self._getClassificationLengths(classifications) numParticles = sum(clusterLengths[0]) # Repeatedly obtain a consensus of a random classification of same size if threadPool: consensus = threadPool.starmap( self._calculateRandomClassificationConsensus, itertools.repeat((clusterLengths, numParticles), numExec) ) else: consensus = list(map( self._calculateRandomClassificationConsensus, itertools.repeat(clusterLengths, numExec), itertools.repeat(numParticles, numExec) )) assert(len(consensus) == numExec) # Concatenate all consensuses consensus = list(itertools.chain(*consensus)) # Obtain the consensus sizes consensusSizes = list(map(len, consensus)) consensusSizeRatios = list(map(XmippProtConsensusClasses3D.ParticleCluster.getRelativeSize, consensus)) return consensusSizes, consensusSizeRatios def _createOutputClasses3D(self, particles, clustering, name, randomConsensusSizes=None, randomConsensusRelativeSizes=None): outputClasses = self._createSetOfClasses3D(particles, suffix=name) # Create a list with the filenames of the representatives representatives = list(map( XmippProtConsensusClasses3D.ParticleCluster.getRepresentative, clustering )) # Fill the output loader = XmippProtConsensusClasses3D.ClassesLoader( clustering, representatives, randomConsensusSizes, randomConsensusRelativeSizes ) loader.fillClasses(outputClasses) return outputClasses
[docs] class ParticleCluster: """ Keeps track of the information related to successive class intersections. It is instantiated with a single class and allows to perform intersections and unions with it""" def __init__(self, particleIds, representative=None, sourceSize=None): self._particleIds = set(particleIds) # Particle ids belonging to this intersection self._representative = representative self._sourceSize = sourceSize if sourceSize is not None else len(self._particleIds) # The size of the representative class def __len__(self): return len(self.getParticleIds()) def __eq__(self, other): return self.getParticleIds() == other.getParticleIds()
[docs] def intersection(self, *others): return self._combine(set.intersection, min, XmippProtConsensusClasses3D.ParticleCluster.getSourceSize, *others)
[docs] def union(self, *others): return self._combine(set.union, max, len, *others)
[docs] def getParticleIds(self): return self._particleIds
[docs] def getRepresentative(self): return self._representative
[docs] def getSourceSize(self): return self._sourceSize
[docs] def getRelativeSize(self): return len(self) / self.getSourceSize()
def _combine(self, operation, selector, selectionCriteria, *others): allItems = (self, ) + others # Perform the requested operation among the id sets allParticleIds = map(XmippProtConsensusClasses3D.ParticleCluster.getParticleIds, allItems) particleIds = operation(*allParticleIds) # Select the class with the appropriate criteria selection = selector(allItems, key=selectionCriteria) representative = selection.getRepresentative() sourceSize = selection.getSourceSize() return XmippProtConsensusClasses3D.ParticleCluster(particleIds, representative, sourceSize)
[docs] class ClassesLoader: """ Helper class to produce classes """ def __init__(self, clustering, representatives, randomConsensusSizes=None, randomConsensusRelativeSizes=None): self.clustering = clustering self.classification = self._createClassification(clustering) self.representatives = representatives self.randomConsensusSizes = randomConsensusSizes self.randomConsensusRelativeSizes = randomConsensusRelativeSizes
[docs] def fillClasses(self, clsSet): clsSet.classifyItems(updateItemCallback=self._updateParticle, updateClassCallback=self._updateClass, itemDataIterator=iter(self.classification.items()), doClone=False)
def _updateParticle(self, item, row): particleId, classId = row assert(item.getObjId() == particleId) item.setClassId(classId) def _updateClass(self, item): classId = item.getObjId() classIdx = classId-1 # Set the representative item.setRepresentative(self.representatives[classIdx]) # Set the size p-value if self.randomConsensusSizes is not None: size = len(self.clustering[classIdx]) percentile = self._findPercentile(self.randomConsensusSizes, size) pValue = 1 - percentile setXmippAttribute(item, emlib.MDL_CLASS_INTERSECTION_SIZE_PVALUE, Float(pValue)) # Set the relative size p-value if self.randomConsensusRelativeSizes is not None: size = self.clustering[classIdx].getRelativeSize() percentile = self._findPercentile(self.randomConsensusRelativeSizes, size) pValue = 1 - percentile setXmippAttribute(item, emlib.MDL_CLASS_INTERSECTION_RELATIVE_SIZE_PVALUE, Float(pValue)) def _createClassification(self, clustering): # Fill the classification data (particle-class mapping) result = {} for cls, data in enumerate(clustering): for particleId in data.getParticleIds(): result[particleId] = cls + 1 return collections.OrderedDict(sorted(result.items())) def _findPercentile(self, data, value): """ Given an array of values (data), finds the corresponding percentile of the value. Percentile is returned in range [0, 1] """ assert(np.array_equal(sorted(data), data)) # In order to iterate it in ascending order # Count the number of elements that are smaller than the given value i = 0 while i < len(data) and data[i] < value: i += 1 # Convert it into a percentage return float(i) / float(len(data))