# ******************************************************************************
# *
# * Authors: J.M. De la Rosa Trevin (jmdelarosa@cnb.csic.es)
# * Carlos Oscar Sánchez Sorzano (coss@cnb.csic.es)
# * Daniel Marchán Torres (da.marchan@cnb.csic.es)
# * Unidad de Bioinformatica of Centro Nacional de Biotecnologia , CSIC
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 2 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# * 02111-1307 USA
# *
# * All comments concerning this program package may be sent to the
# * e-mail address 'scipion@cnb.csic.es'
# *
# ******************************************************************************
from os.path import join, exists
from glob import glob
import pyworkflow.protocol.params as param
from pyworkflow.utils.path import makePath
import pwem.emlib.metadata as md
from pwem.protocols import ProtClassify2D
from pwem.constants import ALIGN_2D
from xmipp3.convert import (writeSetOfClasses2D, xmippToLocation,
rowToAlignment)
CLASSES_CORE = '_core'
[docs]class XmippProtCoreAnalysis(ProtClassify2D):
""" Analyzes the core of a 2D classification. The core is calculated through
the Mahalanobis distance from each image to the center of the class.
AI Generated:
## Overview
The Core Analysis protocol evaluates the internal consistency of 2D classes
by identifying the most representative particles within each class. It is
based on statistical distances—specifically the Mahalanobis distance—between
each particle and the center of its assigned class. The goal is to
distinguish well-aligned, structurally consistent particles (the *core*)
from outliers or poorly aligned images (often referred to as *junk*).
In practical cryo-EM workflows, this protocol is particularly useful after
a 2D classification step. Even when classes appear visually clean, they
often contain particles that do not fully conform to the dominant structure.
Removing these particles improves the quality of downstream steps such as
3D reconstruction, refinement, or heterogeneity analysis.
For a biological user, this protocol provides a principled way to “clean”
classes without relying purely on visual inspection, which can be
subjective and time-consuming.
## Inputs and General Workflow
The protocol takes as input a **set of 2D classes**, typically produced by
a previous classification method. Each class contains particles that are
assumed to represent similar projections of the underlying structure.
The analysis proceeds by modeling the statistical distribution of particles
within each class. Using this model, the protocol computes how far each
particle deviates from the class center. Particles that deviate strongly
are considered less reliable and may be excluded from the class core.
The result is a refined set of classes that contains only the most
representative particles, while outliers are effectively discarded.
## Understanding the Concept of “Core”
The *core* of a class can be understood as the subset of particles that
best represent the underlying signal. These particles are mutually
consistent in both appearance and alignment.
In contrast, particles outside the core may arise from several sources:
* Misalignment during classification
* Structural heterogeneity (different conformations)
* Noise-dominated images
* Contaminants or artifacts
By focusing on the core, the protocol enhances the structural signal and
reduces variability that is not biologically meaningful.
## Z-score Thresholds: Controlling Particle Selection
The main parameters of this protocol are two Z-score thresholds, which
control how strictly particles are filtered.
### Junk Z-score
This parameter defines how far a particle can deviate from the class center
before being considered an outlier. Lower values make the selection
stricter, removing more particles. Higher values are more permissive and
retain more images.
From a practical perspective, values around 2–3 are commonly used. A value
near 3 corresponds roughly to keeping particles within the main body of a
Gaussian distribution. Reducing this threshold is useful when classes are
noisy or suspected to contain significant contamination.
### PCA Z-score
This parameter performs a similar filtering but in a reduced feature space
obtained through principal component analysis (PCA). It captures
variability in the main modes of variation within the class.
Biologically, this is particularly relevant when subtle structural
differences or alignment inconsistencies exist. The PCA-based filtering can
detect outliers that are not obvious in the original space.
As with the Junk Z-score, lower values enforce stricter filtering.
## Outputs and Their Interpretation
The protocol produces a new set of 2D classes containing only the particles
that belong to the *core* of each class.
Importantly, the class identities are preserved, but their composition
changes: each class now contains fewer particles, ideally those that are
more consistent and better aligned.
From a biological standpoint, these refined classes typically show:
* Sharper structural features
* Reduced noise
* Improved interpretability
However, users should be aware that aggressive filtering may remove rare
but biologically relevant states. This is particularly important in systems
with continuous heterogeneity or multiple conformations.
## Practical Recommendations
In routine workflows, this protocol is best used after an initial 2D
classification, especially when preparing data for high-resolution
reconstruction.
A good strategy is to start with moderate Z-score thresholds (around 3)
and visually inspect the resulting classes. If classes still appear noisy
or blurred, lowering the thresholds can improve consistency.
For datasets with suspected heterogeneity, caution is advised.
Over-filtering may eliminate particles corresponding to minor
conformational states, which could be biologically important.
It is also useful to compare results before and after core analysis.
Improvements in class sharpness and downstream reconstruction quality are
good indicators that the protocol has been beneficial.
## Final Perspective
Core Analysis is a statistically grounded alternative to manual class
cleaning. By identifying the most representative particles within each
class, it enhances data quality in a reproducible and objective manner.
For most cryo-EM users, this protocol serves as a bridge between initial
classification and high-quality structural interpretation, helping to
ensure that subsequent analyses are based on the most reliable subset of
the data.
"""
_label = 'core analysis'
def __init__(self, **args):
ProtClassify2D.__init__(self, **args)
if self.numberOfMpi.get() < 2:
self.numberOfMpi.set(2)
#--------------------------- DEFINE param functions ------------------------
def _defineParams(self, form):
form.addSection(label='Input')
form.addParam('inputClasses', param.PointerParam,
label="Input classes",
pointerClass='SetOfClasses2D',
help='Set of input classes to be analyzed')
form.addParam('thZscore', param.FloatParam, default=3,
label='Junk Zscore',
help='Which is the average Z-score to be considered as '
'junk. Typical values go from 1.5 to 3. For the '
'Gaussian distribution 99.5% of the data is '
'within a Z-score of 3. Lower Z-scores reject more '
'images. Higher Z-scores accept more images.')
form.addParam('thPCAZscore', param.FloatParam, default=3,
label='PCA Zscore',
help='Which is the PCA Z-score to be considered as junk. '
'Typical values go from 1.5 to 3. For the Gaussian '
'distribution 99.5% of the data is within a '
'Z-score of 3. Lower Z-scores reject more images. '
'Higher Z-scores accept more images.')
form.addParallelSection(threads=0, mpi=4)
#--------------------------- INSERT steps functions ------------------------
def _insertAllSteps(self):
self._defineFileNames()
self._insertFunctionStep('analyzeCore')
self._insertFunctionStep('createOutputStep')
[docs] def analyzeCore(self):
# Put in a function convertInputStep
fnLevel = self._getExtraPath('level_00')
makePath(fnLevel)
inputMdName = join(fnLevel, 'level_classes.xmd')
writeSetOfClasses2D(self.inputClasses.get(), inputMdName, writeParticles=True)
args = " --dir %s --root level --computeCore %f %f" % (self._getExtraPath(),
self.thZscore, self.thPCAZscore)
self.runJob('xmipp_classify_CL2D_core_analysis', args)
self.runJob("xmipp_classify_evaluate_classes", "-i %s"%\
self._getExtraPath(join("level_00", "level_classes_core.xmd")), numberOfMpi=1)
#--------------------------- STEPS functions -------------------------------
def _defineFileNames(self):
""" Centralize how files are called within the protocol. """
self.levelPath = self._getExtraPath('level_%(level)02d/')
myDict = {
'final_classes': self._getPath('classes2D%(sub)s.sqlite'),
'output_particles': self._getExtraPath('images.xmd'),
'level_classes': self.levelPath + 'level_classes%(sub)s.xmd',
'level_images': self.levelPath + 'level_images%(sub)s.xmd',
'classes_scipion': (self.levelPath + 'classes_scipion_level_'
'%(level)02d%(sub)s.sqlite'),
}
self._updateFilenamesDict(myDict)
[docs] def createOutputStep(self):
""" Store the SetOfClasses2D object
resulting from the protocol execution.
"""
inputParticles = self.inputClasses.get().getImagesPointer()
level = self._getLastLevel()
subset = CLASSES_CORE
subsetFn = self._getFileName("level_classes", level=level, sub=subset)
if exists(subsetFn):
classes2DSet = self._createSetOfClasses2D(inputParticles, subset)
self._fillClassesFromLevel(classes2DSet, 'last', subset)
result = {'outputClasses' + subset: classes2DSet}
self._defineOutputs(**result)
self._defineSourceRelation(inputParticles, classes2DSet)
#--------------------------- INFO functions --------------------------------
def _validate(self):
validateMsgs = []
if self.numberOfMpi <= 1:
validateMsgs.append('Mpi needs to be greater than 1.')
return validateMsgs
def _citations(self):
citations=['Sorzano2014']
return citations
def _methods(self):
strline ='We calculated the class cores %s. [Sorzano2014]' % self.getObjectTag('outputClasses_core')
return [strline]
# --------------------------- UTILS functions -------------------------------
def _updateParticle(self, item, row):
item.setClassId(row.getValue(md.MDL_REF))
item.setTransform(rowToAlignment(row, ALIGN_2D))
def _updateClass(self, item):
classId = item.getObjId()
if classId in self._classesInfo:
index, fn, _ = self._classesInfo[classId]
item.setAlignment2D()
rep = item.getRepresentative()
rep.setLocation(index, fn)
rep.setSamplingRate(self.inputClasses.get().getImages().getSamplingRate())
def _loadClassesInfo(self, filename):
""" Read some information about the produced 2D classes
from the metadata file.
"""
self._classesInfo = {} # store classes info, indexed by class id
mdClasses = md.MetaData(filename)
for classNumber, row in enumerate(md.iterRows(mdClasses)):
index, fn = xmippToLocation(row.getValue(md.MDL_IMAGE))
# Store info indexed by id, we need to store the row.clone() since
# the same reference is used for iteration
self._classesInfo[index] = (index, fn, row.clone())
def _fillClassesFromLevel(self, clsSet, level, subset):
""" Create the SetOfClasses2D from a given iteration. """
classRf = ''
self._loadClassesInfo(self._getLevelMdClasses(lev=level, subset=classRf))
if subset == '' and level == "last":
xmpMd = self._getFileName('output_particles')
if not exists(xmpMd):
xmpMd = self._getLevelMdImages(level, subset)
else:
xmpMd = self._getLevelMdImages(level, subset)
iterator = md.SetMdIterator(xmpMd, sortByLabel=md.MDL_ITEM_ID,
updateItemCallback=self._updateParticle,
skipDisabled=True)
# itemDataIterator is not neccesary because, the class SetMdIterator
# contain all the information about the metadata
clsSet.classifyItems(updateItemCallback=iterator.updateItem,
updateClassCallback=self._updateClass)
def _getLevelMdClasses(self, lev=0, block="classes", subset=""):
""" Return the classes metadata for this iteration.
block parameter can be 'info' or 'classes'."""
if lev == "last":
lev = self._getLastLevel()
mdFile = self._getFileName('level_classes', level=lev, sub=subset)
if block:
mdFile = block + '@' + mdFile
return mdFile
def _getLevelMdImages(self, level, subset):
if level == "last":
level = self._getLastLevel()
xmpMd = self._getFileName('level_images', level=level, sub=subset)
if not exists(xmpMd):
self._createLevelMdImages(level, subset)
return xmpMd
def _createLevelMdImages(self, level, sub):
if level == "last":
level = self._getLastLevel()
mdClassesFn = self._getLevelMdClasses(lev=level, block="", subset=sub)
mdImgs = md.joinBlocks(mdClassesFn, "class0")
mdImgs.write(self._getFileName('level_images', level=level, sub=sub))
def _getLastLevel(self):
""" Find the last Level number """
clsFn = self._getFileName('level_classes', level=0, sub="")
levelTemplate = clsFn.replace('level_00', 'level_??')
lev = len(glob(levelTemplate)) - 1
return lev