# ******************************************************************************
# *
# * Authors: J.M. De la Rosa Trevin (jmdelarosa@cnb.csic.es)
# *
# * Unidad de Bioinformatica of Centro Nacional de Biotecnologia , CSIC
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 2 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# * 02111-1307 USA
# *
# * All comments concerning this program package may be sent to the
# * e-mail address 'scipion@cnb.csic.es'
# *
# ******************************************************************************
from os.path import join, dirname, exists
from glob import glob
import pyworkflow.protocol.params as param
import pyworkflow.protocol.constants as const
from pyworkflow.utils.path import cleanPath, makePath
import pwem.emlib.metadata as md
from pwem.protocols import ProtClassify2D
from pwem.objects import SetOfClasses2D
from pwem.constants import ALIGN_NONE, ALIGN_2D
from pyworkflow import BETA, UPDATED, NEW, PROD
from xmipp3.convert import (writeSetOfParticles, createItemMatrix,
writeSetOfClasses2D, xmippToLocation,
rowToAlignment)
# Image comparison methods, in the order of the 'comparisonMethod' choices
CMP_CORRELATION, CMP_CORRENTROPY = range(2)

# Clustering criteria, in the order of the 'clusteringMethod' choices
CL_CLASSICAL, CL_ROBUST = range(2)

# Output attribute name and the suffix identifying each subset of classes
OUTPUTCLASSES = 'outputClasses'
CLASSES = ''
CLASSES_CORE = '_core'
CLASSES_STABLE_CORE = '_stable_core'

# Suggested number of images per class
IMAGES_PER_CLASS = 200
[docs]class XmippProtCL2D(ProtClassify2D):
""" Classifies a set of 2D images using clustering algorithms. It
subdivides the original dataset into user-defined classes, aiding the
identification of particle heterogeneity or structural variations within
the data.
AI Generated:
What this protocol is for
CL2D is Xmipp’s classic 2D classification method for single-particle
datasets. Its goal is to take a set of particle images and organize them
into 2D classes so that particles with similar views cluster together. In
practice, this is one of the most useful “dataset-understanding” steps in
a workflow: it helps you see whether your sample is homogeneous, whether
you have different conformations or compositions, whether there is
preferred orientation, and how much junk/contamination is present.
Compared to very fast “quick look” approaches, CL2D is designed to be a
robust, iterative classifier that can build structure in the dataset
gradually. A particularly helpful concept in this protocol is the idea of
cores: subsets of particles that are most representative of each class (and
optionally “stable cores”, which are particles that remain consistently
grouped through the multilevel process). For biological users, cores are
often the easiest way to obtain a cleaner subset without doing aggressive
manual rejection.
Inputs: what you need to provide
You provide a single mandatory input: a SetOfParticles (your particle
stack). These particles do not need to be aligned beforehand for CL2D to
work as a classifier, but the quality of the result always depends on the
usual prerequisites: reasonable particle extraction, sensible box size,
and (often) some form of downsampling if the sampling rate is very small
and you are only aiming at a medium-resolution 2D classification.
Optionally, you may provide initial classes/averages if you prefer to start
from known references rather than random initialization. This is useful
when you want reproducibility, when you are “updating” a previously known
set of views, or when you already trust a set of 2D averages and want the
classification to converge around them.
The most important knob: how many classes to ask for
The parameter Number of classes controls the granularity of the
classification. Biologically, this is where you decide whether you want a
coarse view of the data (fewer classes, more particles per class, easier to
interpret) or a finer partition (more classes, smaller classes, better at
separating rare views or subtle heterogeneity but also easier to
over-fragment noise).
A practical way to think about it is the “typical number of images per
class”. If you have N particles and choose K classes, the average class
size is N/K. CL2D is often most interpretable when classes have enough
particles to average out noise (hundreds is comfortable for many datasets),
but this depends strongly on SNR and particle size.
How initialization works, and when to use each mode
CL2D can start in two different ways.
If you enable random initialization, the algorithm will build the
classification starting from a small number of initial classes (the
“first level”), and then progressively refine/split them until the final
number of classes is reached. This is the most common choice when you do
not have a reliable set of starting references. The key advanced parameter
here is Number of initial classes: conceptually, you begin with a small
number of broad groups and then increase complexity in later levels. If you
set this too low, the early grouping can be overly coarse; if you set it
too high, you lose the benefit of gradual organization and can become more
sensitive to noise.
If you disable random initialization, you provide Initial classes (a set of
2D classes or averages). In this mode, CL2D starts from those references
and tends to behave more like “classify relative to these starting views”.
This is often useful for controlled workflows (e.g., you already have a
trusted set of views from a previous run or a known sample) or when you
want more repeatable outcomes between runs.
Iterations and convergence: what “Number of iterations” really means
Within each level of the multilevel process, CL2D performs iterative
refinement. The parameter Number of iterations sets an upper bound for how
long it will refine at each level. Biologically, increasing iterations can
help when particles are noisy or when the dataset has subtle differences
that need more refinement to separate. On the other hand, very large
iteration counts are not always beneficial: if the dataset contains a lot
of junk or highly heterogeneous content, extra iterations may mostly refine
noise patterns rather than producing cleaner biology.
A common, practical approach is to keep a moderate value for routine runs
and only increase it if you see unstable or underdeveloped class averages.
How similarity is measured: correlation vs correntropy
CL2D lets you choose a comparison method that defines how “close” two
images are during clustering.
With correlation, similarity is driven by standard cross-correlation–like
behavior. This tends to work well for many cryo-EM particle sets and is
usually a safe default.
With correntropy, similarity becomes more robust to certain kinds of
outliers and non-Gaussian noise. Biologically, this option can sometimes
help when a dataset contains a mixture of good particles and difficult
contaminants, or when you suspect that outliers are pulling classes in
unhelpful directions. It is not a magic switch, but it is a reasonable
alternative if correlation-based grouping produces classes that look
“washed” or dominated by a few atypical images.
How clustering is done: classical vs robust
The clustering method controls the criterion used to build and update
classes.
The classical criterion is the standard behavior and is typically the
first choice.
The robust option is meant to be less sensitive to problematic particles.
For biological users, a good rule of thumb is: if the dataset is clean and
you mostly want to resolve views, classical is usually fine; if you have
strong junk, variable backgrounds, or particles with systematic artifacts,
robust clustering may give more stable, interpretable classes.
Core analysis: extracting the “most representative” particles per class
One of the most biologically useful features of this protocol is core
analysis. When core analysis is enabled, CL2D analyzes each class and
identifies a subset of particles that are close to the class center
according to internal criteria (including Z-score–like measures and
PCA-based distances). Intuitively, the core is the set of particles that
best represent the class signal and are least likely to be junk or
misassigned.
The parameters Junk Zscore and PCA Zscore control how strict this selection
is. Lower thresholds reject more particles (giving smaller, cleaner cores),
while higher thresholds accept more (giving larger cores that may include
more borderline particles). If your goal is to build a high-quality subset
for downstream refinement, stricter cores are often attractive. If your
goal is to avoid losing rare but real views, you may want to be less strict.
CL2D can also compute a stable core, which is a stricter idea: particles
that have remained essentially grouped throughout the multilevel
classification. The Tolerance parameter controls how many “exceptions” are
allowed across levels. With tolerance set to zero, stable core membership
becomes very stringent; with higher tolerance, you keep more particles
while still favoring stable assignments. For biological workflows, stable
cores are often a good way to define “high-confidence” subsets for more
delicate steps, like initial model generation or high-resolution refinement.
Optional hierarchy: understanding how classes split across levels
If you enable Compute class hierarchy, CL2D will track how classes relate
between levels (how groups split and evolve). This is mostly a diagnostic
feature, but it can be biologically informative: it helps you see whether a
class is stable, whether it splits into meaningful sub-views, or whether it
fragments in ways that look like noise.
Optional analysis of rejected particles
If you enable Analyze rejected particles, the protocol will generate
additional information in the run directory showing what was excluded from
cores (and stable cores). This can be useful when you want to understand
the nature of the rejected set—whether it is mostly obvious junk, rare
orientations, contaminants, or simply low-SNR particles.
Outputs: what you get at the end
The main output is outputClasses, a SetOfClasses2D with your final
classification.
If core analysis is enabled and applicable, you also get
outputClasses_core, which contains the “core” particles for each class.
If stable core analysis is enabled and applicable, you also get
outputClasses_stable_core, which contains the most stable, high-confidence
particles per class.
From a biological processing standpoint, these outputs give you three
practical processing paths: you can continue with the full classified
dataset, or you can continue with a cleaner “core” subset, or you can
choose the most conservative “stable core” subset when you want maximum
reliability at the cost of throwing away more particles.
Typical processing strategies (how users usually apply CL2D)
A very common strategy is to run CL2D with a reasonable number of classes
and then visually inspect the class averages to decide what to keep. In
parallel, core/stable core outputs can be used as an automated way to
define a cleaner subset, especially when you want to reduce bias in manual
selection.
Another common approach is iterative: run CL2D once to diagnose dataset
quality and remove obvious junk, then rerun CL2D on the cleaner subset with
more classes to resolve views and heterogeneity more finely.
Finally, if you already have a reliable set of 2D averages (from previous
runs or a known sample), starting from initial classes can make outcomes
more reproducible and can help the classification converge to the expected
view set more quickly."""
# Protocol label and development status, followed by the outputs that this
# protocol may define: the full classification plus the core and stable
# core subsets, all of them declared as SetOfClasses2D.
_label = 'cl2d'
_devStatus = PROD
_possibleOutputs = {OUTPUTCLASSES: SetOfClasses2D,
OUTPUTCLASSES+CLASSES_CORE: SetOfClasses2D,
OUTPUTCLASSES+CLASSES_STABLE_CORE: SetOfClasses2D}
def __init__(self, **args):
    """ Initialize through ProtClassify2D and make sure that at least two
    MPI processes are requested.
    """
    ProtClassify2D.__init__(self, **args)
    mpi = self.numberOfMpi
    # Any value below 2 is raised to 2
    if mpi.get() < 2:
        mpi.set(2)
def _defineFileNames(self):
""" Centralize how files are called within the protocol. """
self.levelPath = self._getExtraPath('level_%(level)02d/')
myDict = {
'input_particles': self._getTmpPath('input_particles.xmd'),
'input_references': self._getTmpPath('input_references.xmd'),
'final_classes': self._getPath('classes2D%(sub)s.sqlite'),
'output_particles': self._getExtraPath('images.xmd'),
'level_classes' : self.levelPath + 'level_classes%(sub)s.xmd',
'level_images' : self.levelPath + 'level_images%(sub)s.xmd',
'classes_scipion': (self.levelPath + 'classes_scipion_level_'
'%(level)02d%(sub)s.sqlite'),
'classes_hierarchy': self._getExtraPath("classes%(sub)s"
"_hierarchy.txt")
}
self._updateFilenamesDict(myDict)
#--------------------------- DEFINE param functions ------------------------
def _defineParams(self, form):
    """ Declare the protocol form: the 'Input' section with the
    classification options, the 'Core analysis' section and the parallel
    section.
    """
    addParam = form.addParam
    advanced = const.LEVEL_ADVANCED
    combo = param.EnumParam.DISPLAY_COMBO

    # ------------------------------ Input ------------------------------
    form.addSection(label='Input')
    addParam('inputParticles', param.PointerParam,
             label="Input images",
             important=True, pointerClass='SetOfParticles',
             help='Select the input images to be classified.')
    addParam('numberOfClasses', param.IntParam, default=64,
             label='Number of classes:',
             help='Number of classes (or references) to be generated.')
    addParam('randomInitialization', param.BooleanParam, default=True,
             expertLevel=advanced,
             label='Random initialization of classes:',
             help="Initialize randomly the first classes. If you "
                  "don't initialize randomly, you must supply a set "
                  "of initial classes")
    # Only one of the next two parameters is shown, depending on
    # 'randomInitialization'
    addParam('initialClasses', param.PointerParam,
             label="Initial classes",
             condition="not randomInitialization",
             pointerClass='SetOfClasses2D, SetOfAverages',
             help='Set of initial classes to start the classification')
    addParam('numberOfInitialClasses', param.IntParam, default=4,
             expertLevel=advanced,
             label='Number of initial classes:',
             condition="randomInitialization",
             help='Initial number of classes used in the first level.')
    addParam('numberOfIterations', param.IntParam, default=10,
             expertLevel=advanced,
             label='Number of iterations:',
             help='Maximum number of iterations within each level.')
    addParam('comparisonMethod', param.EnumParam,
             choices=['correlation', 'correntropy'],
             label="Comparison method", default=CMP_CORRELATION,
             expertLevel=advanced,
             display=combo,
             help='Use correlation or correntropy')
    addParam('clusteringMethod', param.EnumParam,
             choices=['classical', 'robust'],
             label="Clustering method", default=CL_CLASSICAL,
             expertLevel=advanced,
             display=combo,
             help='Use the classical clustering criterion or the '
                  'robust')
    addParam('extraParams', param.StringParam,
             expertLevel=advanced,
             label='Additional parameters',
             help='Additional parameters for classify_CL2D: \n'
                  ' --verbose, --corrSplit, ...')

    # -------------------------- Core analysis --------------------------
    form.addSection(label='Core analysis')
    addParam('doCore', param.BooleanParam, default=True,
             label='Perform core analysis',
             help='An image belongs to the core if it is close (see '
                  'Junk Zscore and PCA Zscore) to the class center')
    addParam('thZscore', param.FloatParam, default=3,
             label='Junk Zscore', expertLevel=advanced,
             condition='doCore',
             help='Which is the average Z-score to be considered as '
                  'junk. Typical values go from 1.5 to 3. For the '
                  'Gaussian distribution 99.5% of the data is '
                  'within a Z-score of 3. Lower Z-scores reject more '
                  'images. Higher Z-scores accept more images.')
    addParam('thPCAZscore', param.FloatParam, default=3,
             condition='doCore', expertLevel=advanced,
             label='PCA Zscore',
             help='Which is the PCA Z-score to be considered as junk. '
                  'Typical values go from 1.5 to 3. For the Gaussian '
                  'distribution 99.5% of the data is within a '
                  'Z-score of 3. Lower Z-scores reject more images. '
                  'Higher Z-scores accept more images.')
    addParam('doStableCore', param.BooleanParam, default=True,
             condition='doCore', label='Perform stable core analysis',
             help='Two images belong to the stable core if they have '
                  'been essentially together along the classification '
                  'process')
    addParam('tolerance', param.IntParam, default=1, label='Tolerance',
             expertLevel=advanced,
             condition='doCore and doStableCore',
             help='An image belongs to the stable core if it has been '
                  'with other images in the same class in all the '
                  'previous levels except possibly a few of them. '
                  'Tolerance defines how few is few. Tolerance=0 '
                  'means that an image must be in all previous levels '
                  'with the rest of images in the core.')
    addParam("computeHierarchy", param.BooleanParam, default=False,
             label="Compute class hierarchy",
             expertLevel=advanced)
    addParam("analyzeRejected", param.BooleanParam, default=False,
             label="Analyze rejected particles",
             expertLevel=advanced,
             help='To see the analysis you need to browse the '
                  'execution directory and go into the different '
                  'levels')

    form.addParallelSection(threads=0, mpi=4)
#--------------------------- INSERT steps functions ------------------------
def _insertAllSteps(self):
    """ Insert the input conversion step, the xmipp_classify_CL2D steps
    and, when requested and applicable, the core and stable core analysis
    steps.
    """
    self._defineFileNames()

    # Input conversion, parameterized by the ids of the input objects
    initialClasses = self.initialClasses.get()
    initialClassesId = initialClasses.getObjId() if initialClasses else None
    self._insertFunctionStep(self.convertInputStep,
                             self.inputParticles.get().getObjId(),
                             initialClassesId)

    # Values substituted into the command line templates
    self._params = dict(
        imgsFn=self._getFileName('input_particles'),
        extraDir=self._getExtraPath(),
        nref=self.numberOfClasses.get(),
        nref0=self.numberOfInitialClasses.get(),
        iter=self.numberOfIterations.get(),
        extraParams=self.extraParams.get(''),
        thZscore=self.thZscore.get(),
        thPCAZscore=self.thPCAZscore.get(),
        tolerance=self.tolerance.get(),
        initClassesFn=self._getFileName('input_references'),
    )
    self._insertClassifySteps("xmipp_classify_CL2D",
                              self._defArgsClassify(), subset=CLASSES)

    # TODO: Added this If. Check with COSS error if makes sense.
    # Also, if conditions below are enough to validate that classes core
    # and stable core are not empty
    if not self.randomInitialization:
        # The supplied classes define the number of initial classes
        self.numberOfInitialClasses.set(
            self.initialClasses.get().getSize())

    # Core analysis
    if not (self.numberOfClasses > self.numberOfInitialClasses
            and self.doCore):
        return
    coreProgram = "xmipp_classify_CL2D_core_analysis"
    self._insertClassifySteps(coreProgram, self._defArgsCoreAnalisys(),
                              subset=CLASSES_CORE)
    if self.analyzeRejected:
        self._insertFunctionStep(self.analyzeOutOfCores, CLASSES_CORE)

    # Stable core analysis: the number of levels should be > 2
    if not (self.numberOfClasses > (2 * self.numberOfInitialClasses.get())
            and self.doStableCore):
        return
    self._insertClassifySteps(coreProgram,
                              self._defArgsCoreAnalisys("stable"),
                              subset=CLASSES_STABLE_CORE)
    if self.analyzeRejected:
        self._insertFunctionStep(self.analyzeOutOfCores,
                                 CLASSES_STABLE_CORE)
def _insertClassifySteps(self, program, args, subset=CLASSES):
    """ Insert the four steps that process one subset of classes:
    the program run (with 'args' filled in from self._params), the
    evaluation of the classes, their sorting and the output creation.
    """
    self._insertRunJobStep(program, args % self._params)
    followUpSteps = (self.evaluateClassesStep,
                     self.sortClassesStep,
                     self.createOutputStep)
    for stepFunc in followUpSteps:
        self._insertFunctionStep(stepFunc, subset)
#--------------------------- STEPS functions -------------------------------
def sortClassesStep(self, subset=''):
    """ Run xmipp_image_sort on the classes of every level file and append
    the sorted metadata to that file as a 'classes_sorted' block.
    """
    for levelFn in self._getAllLevelMdFiles(subset):
        sortedRoot = join(dirname(levelFn), "classes%s_sorted" % subset)
        self.runJob("xmipp_image_sort",
                    "-i classes@%s --oroot %s" % (levelFn, sortedRoot))

        sortedMd = md.MetaData(sortedRoot + ".xmd")
        # The item id of every row is set to its class reference number
        for objId in sortedMd:
            ref = sortedMd.getValue(md.MDL_REF, objId)
            sortedMd.setValue(md.MDL_ITEM_ID, int(ref), objId)
        sortedMd.write("classes_sorted@" + levelFn, md.MD_APPEND)
def evaluateClassesStep(self, subset=''):
    """ Run xmipp_classify_evaluate_classes on every level file and, when
    'computeHierarchy' is set, compare each level with the previous one
    through xmipp_classify_compare_classes, accumulating the result in the
    hierarchy text file of the subset.
    """
    levelFiles = self._getAllLevelMdFiles(subset)
    hierarchyFn = self._getExtraPath("classes%s_hierarchy.txt" % subset)

    previousFn = None
    for levelFn in levelFiles:
        self.runJob("xmipp_classify_evaluate_classes",
                    "-i " + levelFn, numberOfMpi=1)

        # The first level has nothing to be compared with
        if self.computeHierarchy and previousFn is not None:
            compareArgs = "--i1 %s --i2 %s -o %s" % (previousFn, levelFn,
                                                     hierarchyFn)
            # Append once the file has been created by a previous comparison
            if exists(hierarchyFn):
                compareArgs += " --append"
            self.runJob("xmipp_classify_compare_classes",
                        compareArgs, numberOfMpi=1)
        previousFn = levelFn
def createOutputStep(self, subset=''):
    """ Register the SetOfClasses2D of the given subset as a protocol
    output. Nothing is done when the classes file of the last level does
    not exist for that subset.
    """
    lastLevel = self._lastLevel()
    if not exists(self._getFileName("level_classes",
                                    level=lastLevel, sub=subset)):
        return

    particles = self.inputParticles
    classes2DSet = self._createSetOfClasses2D(particles, subset)
    self._fillClassesFromLevel(classes2DSet, "last", subset)
    # The output name carries the subset suffix
    self._defineOutputs(**{OUTPUTCLASSES + subset: classes2DSet})
    self._defineSourceRelation(particles, classes2DSet)
def analyzeOutOfCores(self, subset):
    """ For every level file of the subset, compute which input images
    are not in any of its classes and append them to the level file as an
    'images_rejected' block. When more than 100 images are rejected, they
    are also classified with xmipp_classify_CL2D into a 'rejected' folder
    placed next to the level file.
    """
    from math import ceil

    for levelFn in self._getAllLevelMdFiles(subset):
        levelDir = dirname(levelFn)

        # Gather the images of every block whose name starts with 'class0'
        mdInSubset = md.MetaData()
        for block in md.getBlocksInMetaDataFile(levelFn):
            if block.startswith('class0'):
                mdInSubset.unionAll(md.MetaData(block + "@" + levelFn))
        if not mdInSubset.size() > 0:
            continue

        # Input images minus the images in the subset
        fnSubset = join(levelDir, "images%s.xmd" % subset)
        mdInSubset.write(fnSubset)
        fnOutOfSubset = join(levelDir, "imagesOut.xmd")
        args = "-i %s --set subtraction %s -o %s" % (
            self._getFileName('input_particles'), fnSubset, fnOutOfSubset)
        self.runJob("xmipp_metadata_utilities", args, numberOfMpi=1,
                    numberOfThreads=1)

        # Keep the enabled images only and drop the intermediate files
        mdRejected = md.MetaData(fnOutOfSubset)
        mdRejected.removeDisabled()
        fnRejected = "images_rejected@" + levelFn
        mdRejected.write(fnRejected, md.MD_APPEND)
        cleanPath(fnOutOfSubset)
        cleanPath(fnSubset)

        # With enough rejected images, make a small summary of them
        if mdRejected.size() > 100:
            fnRejectedDir = join(levelDir, "rejected%s" % subset)
            makePath(fnRejectedDir)
            nClasses = int(ceil(mdRejected.size() / 300))
            self.runJob("xmipp_classify_CL2D",
                        "-i %s --nref0 1 --nref %d --iter 5 --distance "
                        "correlation --classicalMultiref "
                        "--classifyAllImages --odir %s"
                        % (fnRejected, nClasses, fnRejectedDir))
#--------------------------- INFO functions --------------------------------
def _validate(self):
    """ Return the list of error messages: fewer than 2 MPI processes,
    more initial classes than final classes, or initial classes given as
    a SetOfClasses2D without representatives.
    """
    errors = []

    if self.numberOfMpi <= 1:
        errors.append('Mpi needs to be greater than 1.')

    if self.numberOfInitialClasses > self.numberOfClasses:
        errors.append('The number of final classes cannot be smaller'
                      ' than the number of initial classes')

    initialClasses = self.initialClasses.get()
    if (isinstance(initialClasses, SetOfClasses2D)
            and not initialClasses.hasRepresentatives()):
        errors.append("The input classes should have "
                      "representatives.")

    return errors
def _warnings(self):
validateMsgs = []
if self.inputParticles.get().getSamplingRate() < 3:
validateMsgs.append("The sampling rate is smaller than 3 A/pix, "
"consider downsampling the input images to "
"speed-up the process. Probably you don't want"
" such a precise 2D classification.")
return validateMsgs
def _citations(self):
citations=['Sorzano2010a']
if self.doCore:
citations.append('Sorzano2014')
return citations
def _summaryLevelFiles(self, summary, levelFiles, subset):
if levelFiles:
levels = [i for i in range(self._lastLevel()+1)]
summary.append('Computed classes%s, levels: %s' % (subset, levels))
def _summary(self):
self._defineFileNames()
summary = []
levelFiles = self._getAllLevelMdFiles()
if not hasattr(self, 'outputClasses'):
summary.append("Output classes not ready yet.")
elif levelFiles:
self._summaryLevelFiles(summary, levelFiles, CLASSES)
self._summaryLevelFiles(summary, self._getAllLevelMdFiles(CLASSES_CORE), CLASSES_CORE)
self._summaryLevelFiles(summary, self._getAllLevelMdFiles(CLASSES_STABLE_CORE), CLASSES_STABLE_CORE)
else:
summary.append("Input Particles: *%d*\nClassified into *%d* classes\n"
% (self.inputParticles.get().getSize(),
self.numberOfClasses.get()))
# summary.append('- Used a _clustering_ algorithm to subdivide the original dataset into the given number of classes')
return summary
def _methods(self):
strline = ''
if hasattr(self, 'outputClasses'):
strline += 'We classified %d particles from %s ' % (self.inputParticles.get().getSize(),
self.getObjectTag('inputParticles'))
strline += 'into %d classes %s using CL2D [Sorzano2010a]. ' % (self.numberOfClasses,
self.getObjectTag('outputClasses'))
strline += '%s method was used to compare images and %s clustering criterion. '%\
(self.getEnumText('comparisonMethod'), self.getEnumText('clusteringMethod'))
if self.numberOfClasses > self.numberOfInitialClasses and self.doCore:
strline+='We also calculated the class cores %s' % self.getObjectTag('outputClasses_core')
if self.numberOfClasses > (2 * self.numberOfInitialClasses.get()) and self.doStableCore: # Number of levels should be > 2
strline += ' and the class stable cores %s' % self.getObjectTag('outputClasses_stable_core')
strline+=' [Sorzano2014].'
return [strline]
#--------------------------- UTILS functions -------------------------------
def _defArgsClassify(self):
    """ Return the argument template for xmipp_classify_CL2D. The
    '%(name)' placeholders are left in the template as they are.
    """
    pieces = ['-i %(imgsFn)s --odir %(extraDir)s --oroot level --nref '
              '%(nref)d --iter %(iter)d %(extraParams)s']

    if self.comparisonMethod == CMP_CORRELATION:
        pieces.append(' --distance correlation')
    if self.clusteringMethod == CL_CLASSICAL:
        pieces.append(' --classicalMultiref')

    # Start from a number of random classes or from the given references
    pieces.append(' --nref0 %(nref0)d' if self.randomInitialization
                  else ' --ref0 %(initClassesFn)s')
    return ''.join(pieces)
def _defArgsCoreAnalisys(self,coreType="core"):
args = " --dir %(extraDir)s --root level "
if coreType =="core":
args += "--computeCore %(thZscore)f %(thPCAZscore)f"
else:
args += "--computeStableCore %(tolerance)d"
return args
def _getAllLevelMdFiles(self, subset=''):
""" Grab the metadata class files for each level. """
levelMdFiles = []
lastLevel = self._lastLevel()
for i in range(lastLevel):
classFn = self._getLevelMdClasses(lev=i, block="", subset=subset)
if exists(classFn):
levelMdFiles.append(classFn)
return levelMdFiles
def _createItemMatrix(self, item, row):
    """ Delegate to createItemMatrix requesting a 2D alignment. """
    createItemMatrix(item, row, align=ALIGN_2D)
def _updateParticle(self, item, row):
    """ Set the class id (MDL_REF value of the row) and the 2D alignment
    transform of the item.
    """
    classId = row.getValue(md.MDL_REF)
    item.setClassId(classId)
    transform = rowToAlignment(row, ALIGN_2D)
    item.setTransform(transform)
def _updateClass(self, item):
classId = item.getObjId()
if classId in self._classesInfo:
index, fn, _ = self._classesInfo[classId]
item.setAlignment2D()
rep = item.getRepresentative()
rep.setLocation(index, fn)
rep.setSamplingRate(self.inputParticles.get().getSamplingRate())
def _loadClassesInfo(self, filename):
    """ Read the classes metadata file and store in self._classesInfo,
    for every class, the location of its image and a copy of its row.
    Classes are numbered from 1 following the order of the rows.
    """
    classesInfo = {}
    self._classesInfo = classesInfo  # indexed by class id

    rows = md.iterRows(md.MetaData(filename))
    for classId, row in enumerate(rows, start=1):
        index, fn = xmippToLocation(row.getValue(md.MDL_IMAGE))
        # The row is cloned since the same reference is used for iteration
        classesInfo[classId] = (index, fn, row.clone())
def _fillClassesFromLevel(self, clsSet, level, subset):
    """ Fill the set of classes with the classes and images of the given
    level and subset.
    """
    self._loadClassesInfo(self._getLevelMdClasses(lev=level, subset=subset))

    # For the last level of the whole set, the output particles file is
    # preferred when it exists; otherwise the images of the level are used
    imagesMd = None
    if subset == '' and level == "last":
        outputParticles = self._getFileName('output_particles')
        if exists(outputParticles):
            imagesMd = outputParticles
    if imagesMd is None:
        imagesMd = self._getLevelMdImages(level, subset)

    # itemDataIterator is not necessary because the SetMdIterator class
    # contains all the information about the metadata
    mdIterator = md.SetMdIterator(imagesMd, sortByLabel=md.MDL_ITEM_ID,
                                  updateItemCallback=self._updateParticle,
                                  skipDisabled=True)
    clsSet.classifyItems(updateItemCallback=mdIterator.updateItem,
                         updateClassCallback=self._updateClass)
def _getLevelMdClasses(self, lev=0, block="classes", subset=""):
""" Return the classes metadata for this iteration.
block parameter can be 'info' or 'classes'."""
if lev == "last":
lev = self._lastLevel()
mdFile = self._getFileName('level_classes', level=lev, sub=subset)
if block:
mdFile = block + '@' + mdFile
return mdFile
def _getLevelMdImages(self, level, subset):
if level == "last":
level = self._lastLevel()
xmpMd = self._getFileName('level_images', level=level, sub=subset)
if not exists(xmpMd):
self._createLevelMdImages(level, subset)
return xmpMd
def _createLevelMdImages(self, level, sub):
    """ Write the images metadata file of a level and subset by joining
    the 'class0' blocks of its classes file. The level may be the string
    "last".
    """
    levelNumber = self._lastLevel() if level == "last" else level
    classesFn = self._getLevelMdClasses(lev=levelNumber, block="",
                                        subset=sub)
    imagesMd = md.joinBlocks(classesFn, "class0")
    imagesFn = self._getFileName('level_images', level=levelNumber, sub=sub)
    imagesMd.write(imagesFn)
def _lastLevel(self):
""" Find the last Level number """
clsFn = self._getFileName('level_classes', level=0, sub="")
levelTemplate = clsFn.replace('level_00','level_??')
lev = len(glob(levelTemplate)) - 1
return lev
def _getLevelClasses(self, lev, suffix, clean=False):
    """ Return the classes .sqlite file of a level. When the file does not
    exist (or 'clean' asked for its removal) it is created and filled from
    the metadata of that level.
    """
    sqliteFn = self._getFileName('classes_scipion', level=lev, sub=suffix)

    if clean:
        cleanPath(sqliteFn)
    if exists(sqliteFn):
        return sqliteFn

    levelClasses = SetOfClasses2D(filename=sqliteFn)
    levelClasses.setImages(self.inputParticles.get())
    self._fillClassesFromLevel(levelClasses, level=lev, subset=suffix)
    levelClasses.write()
    levelClasses.close()
    return sqliteFn