# **************************************************************************
# *
# * Authors: Carlos Oscar Sorzano (coss@cnb.csic.es)
# * Tomas Majtner (tmajtner@cnb.csic.es) -- streaming version
# *
# * Unidad de Bioinformatica of Centro Nacional de Biotecnologia , CSIC
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 2 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# * 02111-1307 USA
# *
# * All comments concerning this program package may be sent to the
# * e-mail address 'coss@cnb.csic.es'
# *
# **************************************************************************
import os
import enum
from math import sqrt
import numpy as np
from pyworkflow.object import Set, String, Pointer
import pyworkflow.protocol.params as params
from pwem.protocols import ProtParticlePicking
from pyworkflow.protocol.constants import *
from pwem.objects import SetOfCoordinates, Coordinate
from pyworkflow.utils import getFiles, removeBaseExt, moveFile
from pyworkflow import UPDATED, PROD
PICK_MODE_LARGER = 0
PICK_MODE_EQUAL = 1
[docs]class ProtPickingConsensusOutput(enum.Enum):
""" Possible outputs for particle picking protocols
AI Generated
## Overview
The Picking Consensus protocol combines several particle-picking results and
keeps coordinates that are supported by a user-defined level of agreement.
Different particle pickers, or the same picker with different parameters, may
select slightly different coordinates. Some picks may be reliable because
several methods select the same particle. Other picks may be method-specific
and may correspond to false positives, contaminants, weak particles, or
borderline cases.
This protocol compares multiple coordinate sets micrograph by micrograph. A
coordinate is considered to represent the same particle as another coordinate
if the two positions are within a user-defined radius. The protocol then counts
how many input pickers support each candidate particle and writes a consensus
coordinate set.
The protocol can behave strictly, keeping only particles selected by all
pickers, or flexibly, keeping particles selected by one or more pickers. This
makes it useful both for cleaning particle sets and for merging complementary
picking strategies.
## Inputs and General Workflow
The input is a list of coordinate sets.
Each input coordinate set should correspond to particle picks on the same
micrographs, although the protocol can also handle streaming situations where
different coordinate sets become available progressively.
For each micrograph, the protocol collects the coordinates from all input
coordinate sets. If the input coordinate sets have different sampling rates,
their coordinates are rescaled to the sampling rate of the first coordinate
set.
The protocol then groups nearby coordinates within the selected consensus
radius. Each group receives a number of votes corresponding to how many input
coordinate sets contributed a coordinate to that group.
Finally, the protocol keeps only the coordinate groups that satisfy the
selected consensus rule and writes them as a new Scipion coordinate set.
## Input Coordinates
The **Input coordinates** parameter contains the coordinate sets to compare.
These coordinate sets may come from different particle-picking protocols,
different picking algorithms, different parameter settings, or manual and
automatic picking runs.
The first input coordinate set is used as the main reference for micrograph
information, box size, and sampling-rate scaling. The output coordinate set is
linked to the micrographs of this first input.
For best results, all input coordinate sets should refer to the same
micrographs or to matching micrograph sets. If the input pickers were run on
different subsets of micrographs, the protocol processes only the micrographs
that are available according to its streaming and readiness logic.
## Consensus Radius
The **Radius** parameter defines the distance, in pixels, within which two
coordinates are considered to correspond to the same particle.
For example, if the radius is 10 pixels, two picks closer than 10 pixels are
merged into a single candidate particle. The merged coordinate is updated as a
weighted average of the contributing positions.
A small radius is strict. It requires different pickers to place coordinates
very close to each other. This may reduce false matches but can also fail to
merge picks that clearly correspond to the same particle but are slightly
shifted.
A large radius is more permissive. It can merge picks with larger positional
differences, but if it is too large it may incorrectly merge nearby distinct
particles.
The radius should be chosen according to particle size, picking precision, and
typical particle separation. It should normally be smaller than the particle
diameter.
## Consensus Number
The **Consensus** parameter defines how many input coordinate sets must support
a particle for it to be included in the output.
If the value is set to **-1**, the protocol requires support from all input
coordinate sets. This corresponds to an AND operation. It is the strictest
mode.
If the value is set to **1**, a coordinate is accepted if it is selected by at
least one input coordinate set. This corresponds to an OR operation. It is the
most permissive mode.
Intermediate values allow more flexible behavior. For example, if four picking
methods are provided and the consensus value is 3, a particle is accepted if
three or more pickers agree on it, depending on the selected consensus mode.
This parameter controls the balance between precision and recall. Strict
consensus reduces false positives but may discard true particles missed by some
pickers. Flexible consensus recovers more particles but may retain more false
positives.
## Consensus Mode
The **Consensus mode** parameter controls how the number of votes is compared
with the consensus number.
If the mode is **>=**, a particle is accepted if it has at least the selected
number of votes. This is the usual and more flexible behavior.
If the mode is **=**, a particle is accepted only if it has exactly the selected
number of votes. This is more specialized and can be useful when the user wants
to isolate particles picked by a specific number of methods.
For most workflows, the **>=** mode is recommended.
## Coordinate Merging
When coordinates from different pickers fall within the consensus radius, the
protocol merges them into a single candidate coordinate.
The merged coordinate is computed progressively as an average of the positions
that vote for the same particle. This means that the output coordinate may not
be exactly equal to any individual input coordinate, but instead represents the
central consensus position.
This is useful when different pickers identify the same particle with slightly
different centers.
## AND and OR Interpretations
The protocol can be understood in terms of familiar logical operations.
With **Consensus = -1**, the protocol behaves like an AND operation: a particle
is accepted only if all input pickers selected it.
With **Consensus = 1** and mode **>=**, the protocol behaves like an OR
operation: a particle is accepted if at least one picker selected it.
Values between these extremes implement majority-like rules. For example, with
three input pickers and Consensus = 2, the protocol keeps particles selected by
at least two pickers.
These modes allow users to tune how conservative the final coordinate set
should be.
## Jaccard Index File
During consensus calculation, the protocol writes a Jaccard-like agreement
value for each processed micrograph.
This value summarizes the degree of overlap between the input coordinate sets
for that micrograph. It can help identify micrographs where pickers agree well
and micrographs where the picking results are inconsistent.
Low agreement may indicate poor micrograph quality, difficult particle
appearance, contaminants, low contrast, or strong differences between picking
methods.
This file is mainly diagnostic and should be interpreted as a guide to picking
agreement rather than as an absolute quality measure.
## Output Consensus Coordinates
The main output is **consensusCoordinates**, a new SetOfCoordinates.
This output contains the coordinates that satisfy the selected consensus rule.
It is linked to the micrographs of the first input coordinate set and uses the
box size from that same set.
The consensus coordinate set can be passed to particle extraction protocols in
the same way as any other coordinate set.
Depending on the selected consensus parameters, the output may be smaller,
similar in size, or larger than the individual input coordinate sets.
## Streaming Behavior
The protocol supports streaming coordinate inputs.
In streaming mode, the protocol waits until coordinates for a micrograph are
available from the relevant input coordinate sets before computing consensus
for that micrograph. When all input streams are closed, it processes any
remaining available micrographs and closes the output stream.
This makes the protocol suitable for automated pipelines where particle
picking results are produced progressively by different pickers.
## Practical Recommendations
Use this protocol when you have several picking results and want to combine
them in a controlled way.
For a conservative cleaned set, use Consensus = -1, requiring all pickers to
agree. This can reduce false positives but may lose true particles.
For a permissive merged set, use Consensus = 1 with mode >=. This keeps any
particle selected by at least one picker, but later cleaning by particle
screening or 2D classification becomes more important.
For a balanced strategy, use a majority rule, such as requiring 2 out of 3
pickers or 3 out of 4 pickers.
Choose the consensus radius carefully. It should reflect expected picking
uncertainty but should not be so large that nearby particles are merged.
Inspect the output coordinates visually on representative micrographs. This is
especially important when combining pickers with very different behavior.
Use the Jaccard agreement information to identify micrographs where picking
methods disagree strongly.
## Final Perspective
Picking Consensus is a coordinate-combination and quality-control protocol. It
does not pick particles directly. Instead, it integrates the results of several
picking methods or parameter settings and produces a coordinate set based on
their agreement.
For biological users, this is useful because no single picker is perfect.
Consensus can help reduce method-specific false positives, recover robust
particles, and create more reliable coordinate sets for extraction.
The protocol is most effective when the input pickers are complementary and
when the consensus radius and voting threshold are chosen according to the
particle size and the desired strictness of the workflow.
"""
consensusCoordinates = SetOfCoordinates
[docs]class XmippProtConsensusPicking(ProtParticlePicking):
"""
Protocol to estimate the agreement between different particle picking
algorithms. The protocol takes several Sets of Coordinates calculated
by different programs and/or different parameter settings. Let's say:
we consider N independent pickings. Then, a coordinate is considered
to be a correct particle if M pickers have selected the same particle
(within a radius in pixels specified in the form).
If you want to be very strict, then set M=N; that is, a coordinate
represents a particle if it has been selected by all particles (this
is the default behaviour). Then you may relax this condition by setting
M=N-1, N-2, ...
If you want to be very flexible, set M=1, in this way it suffices that
1 picker has selected the coordinate to be considered as a particle. Note
that in this way, the cleaning of the dataset has to be performed by other
means (screen particles, 2D and 3D classification, ...).
"""
_label = 'picking consensus'
_devStatus = PROD
_possibleOutputs = ProtPickingConsensusOutput
outputName = ProtPickingConsensusOutput.consensusCoordinates.name
FN_PREFIX = 'consensusCoords_'
def __init__(self, **args):
ProtParticlePicking.__init__(self, **args)
self.stepsExecutionMode = STEPS_SERIAL
def _defineParams(self, form):
form.addSection(label='Input')
form.addParam('inputCoordinates', params.MultiPointerParam,
pointerClass='SetOfCoordinates',
label="Input coordinates", important=True,
help='Select the set of coordinates to compare')
form.addParam('consensusRadius', params.IntParam, default=10,
label="Radius", allowsPointers=True, allowsNull=True,
help="All coordinates within this radius (in pixels) "
"are presumed to correspond to the same particle")
form.addParam('consensus', params.IntParam, default=-1,
label="Consensus",
help="How many times need a particle to be selected to "
"be considered as a consensus particle.\n"
"*Set to -1* to indicate that it needs to be selected "
"by all algorithms: *AND* operation.\n"
"*Set to 1* to indicate that it suffices that only "
"1 algorithm selects the particle: *OR* operation.")
form.addParam('mode', params.EnumParam, label='Consensus mode',
choices=['>=', '='], default=PICK_MODE_LARGER,
expertLevel=LEVEL_ADVANCED,
help='If the number of votes to progress to the output '
'must be either (=) strictly speaking equals to '
'the consensus number or (>=) at least equals.')
# FIXME: It's not using more than one since
# self.stepsExecutionMode = STEPS_SERIAL
# form.addParallelSection(threads=4, mpi=0)
#--------------------------- INSERT steps functions ---------------------------
def _insertAllSteps(self):
self.checkedMics = set() # those mics ready to be processed (micId)
self.processedMics = set() # those mics already processed (micId)
self.sampligRates = []
coorSteps = self.insertNewCoorsSteps([])
self._insertFunctionStep('createOutputStep',
prerequisites=coorSteps, wait=True)
[docs] def createOutputStep(self):
pass
def _getFirstJoinStepName(self):
# This function will be used for streaming, to check which is
# the first function that need to wait for all mics
# to have completed, this can be overriden in subclasses
# (e.g., in Xmipp 'sortPSDStep')
return 'createOutputStep'
def _getFirstJoinStep(self):
for s in self._steps:
if s.funcName == self._getFirstJoinStepName():
return s
return None
def _stepsCheck(self):
self._checkNewInput()
self._checkNewOutput()
def _checkNewInput(self):
# If continue from an stopped run, don't repeat what is done
if not self.checkedMics:
for fn in getFiles(self._getExtraPath()):
fn = removeBaseExt(fn)
if fn.startswith(self.FN_PREFIX):
self.checkedMics.update([self.getMicId(fn)])
self.processedMics.update([self.getMicId(fn)])
streamClosed = []
readyMics = None
allMics = set()
for coordSet in self.inputCoordinates:
currentPickMics, isSetClosed = getReadyMics(coordSet.get())
streamClosed.append(isSetClosed)
if not readyMics: # first time
readyMics = currentPickMics
else: # available mics are those ready for all pickers
readyMics.intersection_update(currentPickMics)
allMics = allMics.union(currentPickMics)
mainPickMics = getReadyMics(self.inputCoordinates[0].get())[0]
secondaryPickMics = getReadyMics(self.inputCoordinates[1].get())[0]
if mainPickMics != secondaryPickMics:
allMics = mainPickMics.intersection(secondaryPickMics)
self.streamClosed = all(streamClosed)
if self.streamClosed:
# for non streaming do all and in the last iteration of streaming do the rest
newMicIds = allMics.difference(self.checkedMics)
else: # for streaming processing, only go for the ready mics in all pickers
if readyMics is not None:
newMicIds = readyMics.difference(self.checkedMics)
if newMicIds:
self.checkedMics.update(newMicIds)
inMics = self.getMainInput().getMicrographs()
newMics = [inMics[micId].clone() for micId in newMicIds]
fDeps = self.insertNewCoorsSteps(newMics)
outputStep = self._getFirstJoinStep()
if outputStep is not None:
outputStep.addPrerequisites(*fDeps)
self.updateSteps()
def _checkNewOutput(self):
if getattr(self, 'finished', False):
return
self.finished = self.streamClosed and self.checkedMics == self.processedMics
streamMode = Set.STREAM_CLOSED if self.finished else Set.STREAM_OPEN
newFiles = getFiles(self._getTmpPath())
if newFiles or self.finished: # when finished to close the output set
outSet = self._loadOutputSet(SetOfCoordinates, 'coordinates.sqlite')
for fnTmp in newFiles:
coords = np.loadtxt(fnTmp)
moveFile(fnTmp, self._getExtraPath())
if coords.size == 2: # special case with only one coordinate
coords = [coords]
for coord in coords:
newCoord = Coordinate()
micrographs = self.getMainInput().getMicrographs()
newCoord.setMicrograph(micrographs[self.getMicId(fnTmp)])
newCoord.setPosition(coord[0], coord[1])
outSet.append(newCoord)
firstTime = not self.hasAttribute(self.outputName)
self._updateOutputSet(self.outputName, outSet, streamMode)
if firstTime:
self.defineRelations(outSet)
outSet.close()
if self.finished: # Unlock createOutputStep if finished all jobs
outputStep = self._getFirstJoinStep()
if outputStep and outputStep.isWaiting():
outputStep.setStatus(STATUS_NEW)
[docs] def defineRelations(self, outputSet):
for inCorrds in self.inputCoordinates:
self._defineTransformRelation(inCorrds, outputSet)
def _loadOutputSet(self, SetClass, baseName):
setFile = self._getPath(baseName)
if os.path.exists(setFile):
outputSet = SetClass(filename=setFile)
outputSet.loadAllProperties()
outputSet.enableAppend()
else:
outputSet = SetClass(filename=setFile)
outputSet.setStreamState(outputSet.STREAM_OPEN)
outputSet.setBoxSize(self.getMainInput().getBoxSize())
inMicsPointer = self.getMainInput().getMicrographs(asPointer=True)
outputSet.setMicrographs(inMicsPointer)
return outputSet
[docs] def calculateConsensusStep(self, micId, micName):
print("Consensus calculation for micrograph %d: '%s'"
% (micId, micName))
# Take the sampling rates just once
if not self.sampligRates:
for coordinates in self.inputCoordinates:
micrograph = coordinates.get().getMicrographs()
self.sampligRates.append(micrograph.getSamplingRate())
# Get all coordinates for this micrograph
coords = []
for idx, coordinates in enumerate(self.inputCoordinates):
coordArray = np.asarray([x.getPosition() for x in
coordinates.get().iterCoordinates(micId)],
dtype=float)
coordArray *= float(self.sampligRates[idx]) / float(self.sampligRates[0])
coords.append(np.asarray(coordArray, dtype=int))
consensusWorker(coords, self.consensus.get(), self.consensusRadius.get(),
self._getTmpPath('%s%s.txt' % (self.FN_PREFIX, micId)),
self._getExtraPath('jaccard.txt'), self.mode.get())
self.processedMics.update([micId])
def _validate(self):
errors = []
# Only for Scipion 2.0, next versions should have the default
# PointerList validation and this can be removed
if len(self.inputCoordinates) == 0:
errors.append('inputCoordinates cannot be EMPTY.')
# Consider empty pointers:
else:
for pointer in self.inputCoordinates:
obj = pointer.get()
if obj is None:
errors.append('%s is empty.' % obj)
return errors
def _summary(self):
message = []
for i, coordinates in enumerate(self.inputCoordinates):
protocol = self.getMapper().getParent(coordinates.get())
message.append("Method %d %s" % (i + 1, protocol.getClassLabel()))
message.append("Radius = %d" % self.consensusRadius)
message.append("Consensus = %d" % self.consensus)
return message
def _methods(self):
return []
[docs] def getMainInput(self):
return self.inputCoordinates[0].get()
[docs] @classmethod
def getMicId(self, fn):
return int(removeBaseExt(fn).lstrip(self.FN_PREFIX))
[docs]def consensusWorker(coords, consensus, consensusRadius, posFn, jaccFn=None,
mode=PICK_MODE_LARGER):
""" Worker for calculate the consensus of N picking algorithms of
M_n coordinates each one.
coords: Array of N numpy arrays of M_n coordinates each one.
consensus: Minimum number of votes to get a consensus coordinate
consensusRadius: Tolerance to see two coordinates as the same (in pixels)
posFn: Where to write the consensus coordinates
jaccFN: Where to write the Jaccard index per micrograph
"""
if len(coords) == 1: # self consensus (remove duplicates)
N0 = 0
firstInput = 0
else: # in regular consensus all first coords are directly added to allCoords
N0 = coords[0].shape[0]
firstInput = 1
# initializing arrays
Ninputs = len(coords)
Ncoords = sum([x.shape[0] for x in coords])
allCoords = np.zeros([Ncoords, 2])
votes = np.zeros(Ncoords)
inAllMicrographs = consensus <= 0 or consensus >= Ninputs
# if nothing in the first and it should be in all, nothing to do
if (not all([coords[idx].shape[0] for idx in range(Ninputs)])
and inAllMicrographs):
print("Returning from worker: doing AND consensus and, at least, one "
"picker is empty for this micrograph (%s)." % posFn)
return
# Add all the first coordinates to 'allCoords' and 'votes' lists
if N0 > 0:
allCoords[0:N0, :] = coords[0]
votes[0:N0] = 1
# Add the rest of coordinates to 'allCoords' and 'votes' lists
Ncurrent = N0
for n in range(firstInput, Ninputs):
for coord in coords[n]:
if Ncurrent > 0:
dist = np.sum((coord - allCoords[0:Ncurrent]) ** 2, axis=1)
imin = np.argmin(dist)
if sqrt(dist[imin]) < consensusRadius:
newCoord = (votes[imin] * allCoords[imin,] + coord) / (
votes[imin] + 1)
allCoords[imin,] = newCoord
votes[imin] += 1
else:
allCoords[Ncurrent, :] = coord
votes[Ncurrent] = 1
Ncurrent += 1
else:
allCoords[Ncurrent, :] = coord
votes[Ncurrent] = 1
Ncurrent += 1
# Select those in the consensus
if consensus <= 0 or consensus > Ninputs:
consensus = Ninputs
elif not isinstance(consensus, int):
consensus = consensus.get()
if mode==PICK_MODE_LARGER:
consensusCoords = allCoords[votes >= consensus, :]
else:
consensusCoords = allCoords[votes == consensus, :]
try:
if jaccFn:
jaccardIdx = float(len(consensusCoords)) / (
float(len(allCoords)) / Ninputs)
# COSS: Possible problem with concurrent writes
with open(jaccFn, "a") as fhJaccard:
fhJaccard.write("%s \t %f\n" % (posFn, jaccardIdx))
except Exception as exc:
print("Some error occurred during Jaccard index calculation or "
"writing it's file. Maybe a concurrence issue:\n%s" % exc)
# Write the consensus file only if there
# are some coordinates (size > 0)
if consensusCoords.size:
np.savetxt(posFn, consensusCoords)
[docs]def getReadyMics(coordSet):
coorSet = SetOfCoordinates(filename=coordSet.getFileName())
coorSet._xmippMd = String()
coorSet.loadAllProperties()
setClosed = coorSet.isStreamClosed()
coorSet.close()
currentPickMics = {micAgg["_micId"] for micAgg in
coordSet.aggregate(["MAX"], "_micId", ["_micId"])}
return currentPickMics, setClosed