# **************************************************************************
# *
# * Authors: Daniel Marchan (da.marchan@cnb.csic.es)
# *
# * Unidad de Bioinformatica of Centro Nacional de Biotecnologia , CSIC
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 2 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# * 02111-1307 USA
# *
# * All comments concerning this program package may be sent to the
# * e-mail address 'scipion@cnb.csic.es'
# *
# **************************************************************************
import os
from datetime import datetime
import numpy as np
import random
from collections import defaultdict
from pyworkflow import VERSION_3_0
from pwem.objects import SetOfCTF, SetOfMicrographs
from pyworkflow.object import Pointer
import pyworkflow.protocol.params as params
import pyworkflow.utils as pwutils
from pwem.protocols import ProtCTFMicrographs
from pyworkflow.protocol.constants import (STATUS_NEW)
from pyworkflow import UPDATED, NEW
OUTPUT_CTF = "outputCTF"
OUTPUT_MICS = "outputMicrographs"
[docs]class XmippProtMicDefocusSampler(ProtCTFMicrographs):
"""
Protocol to make a balanced subsample of meaningful CTFs in basis of the
defocus values. Both CTFs and micrographs will be output. CTFs with
different defocus values look differently, while low defocus images
will have bigger fringes, higher defocus values will have more compact rings.
Micrographs with a greater defocus will have more contrast.
AI Generated
## Overview
The Micrographs Defocus Sampler protocol selects a subset of micrographs and
their associated CTF estimations so that the selected images are balanced across
the defocus range of the dataset.
In cryo-EM, micrographs acquired at different defocus values can look quite
different. Low-defocus micrographs usually show wider CTF fringes and lower
image contrast, while high-defocus micrographs show more compact Thon rings and
stronger contrast. Because of this, a random subset of micrographs may not
represent the full optical diversity of the dataset.
This protocol addresses that problem by sampling micrographs according to their
defocus values. It divides the defocus range into bins and selects images from
each bin, producing a more balanced subset than a purely random selection.
The protocol outputs both the selected CTF estimations and the corresponding
micrographs.
## Inputs and General Workflow
The main input is a **SetOfCTF**. Each CTF item is linked to the micrograph
from which it was estimated.
The protocol reads the defocus value of each CTF estimation, specifically the
defocusU value, and uses these values to organize the micrographs across the
observed defocus range.
The defocus range is divided into a fixed number of bins. The protocol then
samples micrographs from each bin so that the final output contains a more
uniform representation of low, medium, and high defocus values.
The final outputs are:
- a selected set of CTF estimations;
- the corresponding selected set of micrographs.
These outputs can be used for inspection, quality control, representative
visualization, or downstream tests on a manageable but defocus-balanced subset.
## Input CTF
The **Input CTF** parameter should point to the CTF estimations that will be
sampled.
Each CTF must be associated with a micrograph. The protocol uses this
relationship to produce both output CTFs and output micrographs.
The quality of the result depends on the quality and completeness of the input
CTF set. If the CTF estimation contains incorrect defocus values, the balanced
sampling will reflect those incorrect values.
This protocol does not estimate CTF parameters. It assumes that CTF estimation
has already been performed.
## Sample Size
The **Sample size** parameter defines the number of images that the protocol
will try to select.
For example, if the sample size is 25, the protocol will output up to 25
micrographs and their corresponding CTF estimations.
The sample size should be chosen according to the intended use. A small sample
is useful for quick visual inspection or illustrative examples. A larger sample
may be preferable for more robust quality-control checks or benchmark tests.
The selected subset is not intended to replace the full dataset for final
processing. Its purpose is to provide a representative defocus-balanced sample.
## Minimum Number of Images to Make Sampling
The **Minimum number of images to make sampling** parameter controls when the
protocol starts the balanced sampling.
This is especially relevant in streaming workflows, where CTF estimations may
arrive gradually as micrographs are processed. The protocol waits until at
least this number of CTF estimations is available before performing the
sampling, unless the input stream is closed.
This avoids selecting a sample too early, before the dataset contains enough
micrographs to represent the defocus distribution.
For non-streaming datasets, this parameter acts as a practical threshold for
when sampling should be performed.
## Defocus-Based Balanced Sampling
The protocol uses defocusU values to organize the input CTFs.
Conceptually, the procedure is:
1. Read the defocus value for each CTF.
2. Determine the minimum and maximum defocus values.
3. Divide this interval into several bins.
4. Select images from each bin.
5. If the selected set is still smaller than the requested sample size, add
additional images from the remaining pool.
This strategy tries to prevent the selected subset from being dominated by the
most common defocus values. Instead, it increases the chance that the final
sample contains examples from across the full defocus range.
Because the selection involves random sampling within bins, running the
protocol again may produce a different subset.
## Why Balance by Defocus?
Defocus affects both image contrast and the appearance of the CTF. Therefore,
a balanced defocus sample is useful when the user wants to inspect whether the
dataset behaves consistently across optical conditions.
For example, such a subset may help answer questions such as:
- Do low-defocus and high-defocus micrographs both look acceptable?
- Are Thon rings visible across the whole defocus range?
- Are some defocus ranges associated with poorer images?
- Does a downstream protocol behave similarly for different defocus values?
This can be especially useful for quality-control reports, visual summaries,
training examples, or testing processing workflows on a representative subset.
## Output Micrographs
The **outputMicrographs** object contains the micrographs associated with the
selected CTF estimations.
These micrographs are not modified. They are simply copied into a new Scipion
set so that the user can inspect or process the selected subset separately from
the full dataset.
This output is useful for visual inspection, manual checking, or running quick
tests on representative micrographs.
## Output CTF
The **outputCTF** object contains the selected CTF estimations.
The output CTF set remains linked to the selected output micrographs. This
means that downstream protocols can use the selected micrographs together with
their corresponding CTF information.
This output is useful when the user wants to inspect the CTFs themselves, plot
their defocus distribution, or run downstream tests that require both
micrographs and CTF metadata.
## Summary Statistics
The protocol reports basic statistics of the defocus values in the input group
used for sampling. These include the defocus range, minimum, maximum, mean, and
standard deviation.
These values help the user understand the defocus distribution from which the
sample was drawn.
A wide defocus range indicates that the dataset contains substantial optical
variation. A narrow range means that the micrographs were acquired with more
similar defocus values.
These statistics are descriptive. They are not a quality criterion by
themselves, but they provide useful context for interpreting the selected
sample.
## Streaming Behavior
The protocol is designed to work with streaming input.
In a streaming workflow, new CTF estimations may appear progressively. The
protocol checks whether new input CTFs are available and waits until either
enough CTFs have accumulated or the input stream has closed.
Once a balanced sample has been selected and the outputs have been created, the
protocol finishes.
This behavior is useful during online or near-real-time processing, where the
user may want a representative subset for early inspection without waiting for
all downstream processing to finish.
## Practical Recommendations
Use this protocol after CTF estimation, not before. The protocol needs defocus
values and micrograph associations from an existing CTF set.
Choose a sample size large enough to cover the defocus range meaningfully. Very
small samples may miss some parts of the distribution even if the sampling is
balanced.
Use a larger minimum number of images when working with streaming data, so that
the protocol does not sample too early from an incomplete and unrepresentative
defocus distribution.
Remember that the selection is balanced by defocus, not by all possible quality
criteria. A selected micrograph may still be poor because of drift, ice
contamination, astigmatism, poor CTF fit, or other problems.
Inspect the output micrographs and CTFs together. The purpose of this protocol
is to make such inspection more representative across the defocus range.
If reproducibility of the exact selected subset is important, note that the
sampling includes random choices within defocus bins.
## Final Perspective
The Micrographs Defocus Sampler is a practical quality-control and dataset
selection tool. It does not modify images or estimate new CTF parameters.
Instead, it selects a representative subset of micrographs and CTFs across the
observed defocus range.
For biological users and facility workflows, this can be useful for quickly
checking whether different defocus conditions are well represented and whether
data quality is consistent across the acquisition strategy.
The protocol is especially helpful when the full dataset is large and the user
needs a small, interpretable, defocus-balanced subset for inspection, reporting,
or preliminary testing.
"""
_label = 'micrographs defocus sampler'
_devStatus = NEW
_lastUpdateVersion = VERSION_3_0
_possibleOutputs = {OUTPUT_MICS: SetOfMicrographs,
OUTPUT_CTF: SetOfCTF}
def __init__(self, **args):
ProtCTFMicrographs.__init__(self, **args)
def _defineParams(self, form):
form.addSection(label='Input')
form.addParam('inputCTF', params.PointerParam, pointerClass='SetOfCTF',
label="Input CTF", important=True,
help='Select the estimated CTF to evaluate.')
form.addSection(label='Sampling')
form.addParam('numImages', params.IntParam,
default=25, label='Sample size',
help='Number of images after the defocus balanced sampling.')
form.addParam('minImages', params.IntParam,
default=100, label='Minimum number of images to make sampling',
help='Minimum number of images to make the defocus balanced sampling.')
# --------------------------- INSERT steps functions -------------------------
def _insertAllSteps(self):
self.initializeParams()
self._insertFunctionStep(self.createOutputStep,
prerequisites=[], wait=True, needsGPU=False)
[docs] def createOutputStep(self):
self._closeOutputSet()
[docs] def initializeParams(self):
self.finished = False
# Important to have both:
self.insertedIds = [] # Contains images that have been inserted in a Step (checkNewInput).
self.sampled_images = [] # Ids to be sample
# Contains images that have been processed in a Step (checkNewOutput).
self.ctfFn = self.inputCTF.get().getFileName()
def _getFirstJoinStepName(self):
''' This function will be used for streaming, to check which is
the first function that need to wait for all ctfs
to have completed, this can be overriden in subclasses
(e.g., in Xmipp 'sortPSDStep')
'''
return 'createOutputStep'
def _getFirstJoinStep(self):
for s in self._steps:
if s.funcName == self._getFirstJoinStepName():
return s
return None
def _insertNewCtfsSteps(self, newIds):
deps = []
stepId = self._insertFunctionStep(self.extractBalancedDefocus, newIds, needsGPU=False, prerequisites=[])
deps.append(stepId)
self.insertedIds.extend(newIds)
return deps
def _stepsCheck(self):
self._checkNewInput()
self._checkNewOutput()
def _checkNewInput(self):
# Check if there are new ctf to process from the input set
self.lastCheck = getattr(self, 'lastCheck', datetime.now())
mTime = datetime.fromtimestamp(os.path.getmtime(self.ctfFn))
self.debug('Last check: %s, modification: %s'
% (pwutils.prettyTime(self.lastCheck),
pwutils.prettyTime(mTime)))
# If the input movies.sqlite have not changed since our last check,
# it does not make sense to check for new input data
if self.lastCheck > mTime and self.insertedIds: # If this is empty it is dut to a static "continue" action or it is the first round
return None
ctfsSet = self._loadInputCtfSet(self.ctfFn)
ctfSetIds = ctfsSet.getIdSet()
newIds = [idCTF for idCTF in ctfSetIds if idCTF not in self.insertedIds]
self.lastCheck = datetime.now()
isStreamClosed = ctfsSet.isStreamClosed()
ctfsSet.close()
outputStep = self._getFirstJoinStep()
if self.isContinued() and not self.insertedIds: # For "Continue" action and the first round
doneIds, _ = self._getAllDoneIds()
if doneIds:
self.finished = True
self.info('The sampling images are already created.')
return
if (newIds and len(newIds) >= self.minImages.get()) or isStreamClosed:
fDeps = self._insertNewCtfsSteps(newIds)
if outputStep is not None:
outputStep.addPrerequisites(*fDeps)
self.updateSteps()
def _loadInputCtfSet(self, ctfFn):
self.debug("Loading input db: %s" % ctfFn)
ctfSet = SetOfCTF(filename=ctfFn)
ctfSet.loadAllProperties()
return ctfSet
def _checkNewOutput(self):
""" Check for already selected CTF and update the output set. """
# Check for results: we have finished when there is results in sample_images list
if not self.finished:
if self.sampled_images:
ctfSet, micSet = self.createOutputs(self.sampled_images)
self.updateRelations(ctfSet, micSet)
self.finished = True
self._store() # Update the summary dictionary
else: # Unlock createOutputStep if finished all jobs
outputStep = self._getFirstJoinStep()
if outputStep and outputStep.isWaiting():
outputStep.setStatus(STATUS_NEW)
[docs] def createOutputs(self, newDone):
cSet = self._loadOutputSet(SetOfCTF, 'ctfs.sqlite')
mSet = self._loadOutputSet(SetOfMicrographs,
'micrographs.sqlite')
self.fillOutput(cSet, mSet, newDone)
return cSet, mSet
def _loadOutputSet(self, SetClass, baseName):
"""
Create the output set.
"""
setFile = self._getPath(baseName)
outputSet = SetClass(filename=setFile)
micSet = self.inputCTF.get().getMicrographs()
if isinstance(outputSet, SetOfMicrographs):
outputSet.copyInfo(micSet)
elif isinstance(outputSet, SetOfCTF):
outputSet.setMicrographs(micSet)
return outputSet
[docs] def fillOutput(self, ctfSet, micSet, newDone):
inputCtfSet = self._loadInputCtfSet(self.ctfFn)
for ctfId in newDone:
ctf = inputCtfSet[ctfId].clone()
mic = ctf.getMicrograph().clone()
ctfSet.append(ctf)
micSet.append(mic)
inputCtfSet.close()
[docs] def updateRelations(self, cSet, mSet):
micsAttrName = OUTPUT_MICS
self._updateOutputSet(micsAttrName, mSet)
# Set micrograph as pointer to protocol to prevent pointer end up as another attribute (String, Booelan,...)
# that happens somewhere while scheduling.
cSet.setMicrographs(Pointer(self, extended=micsAttrName))
self._updateOutputSet(OUTPUT_CTF, cSet)
self._defineTransformRelation(self.inputCTF.get().getMicrographs(), mSet)
self._defineTransformRelation(self.inputCTF, cSet)
self._defineCtfRelation(mSet, cSet)
def _getAllDoneIds(self):
doneIds = []
sizeOutput = 0
if hasattr(self, OUTPUT_CTF):
sizeOutput = self.outputCTF.getSize()
doneIds.extend(list(self.outputCTF.getIdSet()))
return doneIds, sizeOutput
def _summary(self):
summary = []
if not hasattr(self, OUTPUT_MICS):
summary.append("Output set not ready yet.")
else:
populationSize = self.minImages.get()
outputSize = self.outputMicrographs.getSize()
summary.append("From %d micrographs extract a balanced defocus sample of: %d micrographs"
% (populationSize, outputSize))
summary.append(self.summaryVar.get())
return summary
[docs]def balanced_sampling(image_dict, N, bins=10):
"""
Perform balanced sampling of N images based on defocus values.
Parameters:
- image_dict (dict): Dictionary where keys are image IDs and values are defocus values.
- N (int): Total number of images to sample.
- bins (int): Number of bins to divide defocus values into (default is 10).
Returns:
- sampled_images (list): List of sampled image IDs.
"""
# Step 1: Get all defocus values and determine the bin edges
defocus_values = list(image_dict.values())
bin_edges = np.linspace(min(defocus_values), max(defocus_values), bins + 1)
# Step 2: Organize image IDs by bins
binned_images = defaultdict(list)
for image_id, defocus in image_dict.items():
# Find the bin index for the current defocus value
bin_index = np.digitize(defocus, bin_edges) - 1
# Avoid indexing beyond the available bins
bin_index = min(bin_index, bins - 1)
binned_images[bin_index].append(image_id)
# Step 3: Calculate how many images to sample per bin
images_per_bin = max(1, N // bins)
sampled_images = []
for bin_index in range(bins):
images_in_bin = binned_images[bin_index]
if len(images_in_bin) > images_per_bin:
# Randomly sample from the bin if there are more images than needed
sampled_images.extend(random.sample(images_in_bin, images_per_bin))
else:
# If fewer images than needed, take all images in this bin
sampled_images.extend(images_in_bin)
# If we have fewer than N images, randomly sample additional images to reach N
if len(sampled_images) < N:
remaining_images = list(set(image_dict.keys()) - set(sampled_images))
sampled_images.extend(random.sample(remaining_images, N - len(sampled_images)))
# Limit to N images in case there are extra
return sampled_images[:N]
[docs]def compute_statistics(values):
"""
Compute basic statistics for a list of numerical values.
Parameters:
- values (list or array-like): A list of numerical values (e.g., defocus values).
Returns:
- dict: A dictionary containing the statistics: min, max, mean, median, std, variance, and range.
"""
# Convert to a NumPy array for efficient computation
values = np.array(values)
# Compute statistics
stats = {
"min": np.min(values),
"max": np.max(values),
"mean": np.mean(values),
"median": np.median(values),
"std": np.std(values, ddof=1), # Sample standard deviation
"variance": np.var(values, ddof=1), # Sample variance
"range": np.max(values) - np.min(values),
}
return stats