Source code for xmipp3.protocols.protocol_eliminate_empty_images

# *****************************************************************************
# *
# * Authors:     Tomas Majtner         tmajtner@cnb.csic.es (2017)
# *              David Maluenda        dmaluenda@cnb.csic.es
# *
# * Unidad de  Bioinformatica of Centro Nacional de Biotecnologia , CSIC
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 2 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# * 02111-1307  USA
# *
# *  All comments concerning this program package may be sent to the
# *  e-mail address 'scipion@cnb.csic.es'
# *
# *****************************************************************************

import os, sys
from datetime import datetime

import pwem.emlib.metadata as md
import pyworkflow.protocol.constants as cons
import pyworkflow.protocol.params as param
from pyworkflow import VERSION_2_0
from pyworkflow.object import Set
from pyworkflow.utils import cleanPath
from pyworkflow.utils.properties import Message

from pwem import ALIGN_NONE
from pwem.protocols import ProtClassify2D
from pwem.objects import SetOfParticles, SetOfAverages, SetOfClasses2D, Class2D, SetOfClasses, SetOfImages

from xmipp3.convert import (writeSetOfParticles, readSetOfParticles,
                            setXmippAttributes)


[docs]class XmippProtEliminateEmptyBase(ProtClassify2D):
    """ Base to eliminate images using statistical methods
    (variance of variances of sub-parts of input image) eliminates those samples,
    where there is no object/particle (only noise is presented there).
    Threshold parameter can be used for fine-tuning the algorithm for type of data.
    """
    _lastUpdateVersion = VERSION_2_0

    def __init__(self, **args):
        ProtClassify2D.__init__(self, **args)
        self.stepsExecutionMode = cons.STEPS_PARALLEL

[docs]    def addAdvancedParams(self, form):
        form.addParam('addFeatures', param.BooleanParam, default=False,
                      label='Add features', expertLevel=param.LEVEL_ADVANCED,
                      help='Add features used for the ranking to each '
                           'one of the input particles.')
        form.addParam('useDenoising', param.BooleanParam, default=True,
                      label='Turning on denoising',
                      expertLevel=param.LEVEL_ADVANCED,
                      help='Option for turning on denoising method '
                           'while computing emptiness feature.')
        form.addParam('denoising', param.IntParam, default=5,
                      expertLevel=param.LEVEL_ADVANCED,
                      condition='useDenoising',
                      label='Denoising factor:',
                      help='Factor to be used during Gaussian blurring. '
                           'Higher value applies stronger denoising, '
                           'could be more precise but also slower.')

    # --------------------------- INSERT steps functions ----------------------
    def _insertAllSteps(self):
        self.lenPartsSet = 0  # inputSize after checkNewInput
        self.outputSize = 0  # outputSize after checkNewOutput
        self.check = None  # last creationTime of the processed images
        self.stepCount = 0  # to label the input.xmd
        self.fnInputMd = self._getExtraPath("input%d.xmd")  # to feed the bin
        self.fnOutMdTmp = self._getExtraPath("outTemp.xmd")  # partial out
        self.fnElimMdTmp = self._getExtraPath("elimTemp.xmd")  # partial out2
        self.fnOutputMd = self._getExtraPath("output.xmd")  # final out
        self.fnElimMd = self._getExtraPath("eliminated.xmd")  # final out2

        checkStep = self._insertNewPartsSteps()
        self._insertFunctionStep('createOutputStep',
                                 prerequisites=checkStep, wait=True)

    def _insertNewPartsSteps(self):
        deps = []
        self.stepCount += 1
        stepId = self._insertFunctionStep('eliminationStep',
                                          self.stepCount,
                                          prerequisites=[])
        deps.append(stepId)
        return deps

[docs]    def eliminationStep(self, stepId):
        """ Common code for particles and classes/averages """
        fnInputMd = self.fnInputMd % stepId
        partsSet = self.prepareImages()

        if self.check == None:  # if no previous, get all
            writeSetOfParticles(partsSet, fnInputMd,
                                alignType=ALIGN_NONE, orderBy='creation')
        else:  # if previous, take the last ones
            writeSetOfParticles(partsSet, fnInputMd,
                                alignType=ALIGN_NONE, orderBy='creation',
                                where='creation>"' + str(self.check) + '"')

        # special use of partSet before closing it
        self.specialBehavoir(partsSet)

        args = "-i %s -o %s -e %s -t %f" % (fnInputMd, self.fnOutputMd,
                                            self.fnElimMd, self.threshold.get())
        if self.addFeatures:
            args += " --addFeatures"
        if self.useDenoising:
            args += " --useDenoising -d %f" % self.denoising.get()
        self.runJob("xmipp_image_eliminate_empty_particles", args)

[docs]    def specialBehavoir(self, inSet):
        """ To be implemented by child. Must set self.check and inSet.close()
        """
        pass

    def _getFirstJoinStep(self):
        for s in self._steps:
            if s.funcName == self._getFirstJoinStepName():
                return s
        return None

    def _getFirstJoinStepName(self):
        # This function will be used for streaming, to check which is
        # the first function that need to wait for all particles
        # to have completed, this can be overriden in subclasses
        # (e.g., in Xmipp 'sortPSDStep')
        return 'createOutputStep'

[docs]    def createOutputStep(self):
        pass

    def _stepsCheck(self):
        # Input particles set can be loaded or None when checked for new inputs
        # If None, we load it
        self._checkNewInput()
        self._checkNewOutput()

    def _checkNewInput(self):
        # Check if there are new particles to process from the input set
        partsFile = self.getInput().getFileName()
        self.lastmTime = getattr(self, 'lastmTime', None)
        mTime = datetime.fromtimestamp(os.path.getmtime(partsFile))
        # If the input movies.sqlite have not changed since our last check,
        # it does not make sense to check for new input data
        if self.lastmTime == mTime:
            return

        self.lastmTime = mTime

        outputStep = self._getFirstJoinStep()

        self.prepareImages()

        fDeps = self._insertNewPartsSteps()
        if outputStep is not None:
            outputStep.addPrerequisites(*fDeps)
        self.updateSteps()

[docs]    def prepareImages(self):
        """ Must set:
         - self.inputImages:  Images to process in a SetOfImages.
         - self.streamClosed: Streaming state of the input.
         - self.lenPartsSet:  Size of the input set.
        """
        pass

    def _checkNewOutput(self):
        if getattr(self, 'finished', False):
            return

        self.finished = self.inputImages.isStreamClosed() and self.outputSize == self.lenPartsSet

        self.createOutputs()

        if self.finished:  # Unlock createOutputStep if finished all jobs
            cleanPath(self._getPath('particlesAUX.sqlite'))
            cleanPath(self._getPath('averagesAUX.sqlite'))
            outputStep = self._getFirstJoinStep()
            if outputStep and outputStep.isWaiting():
                outputStep.setStatus(cons.STATUS_NEW)

[docs]    def createOutputs(self):
        """ To be implemented by child. (create, fill and close the outputSet)
        """
        pass

    def _loadOutputSet(self, SetClass, baseName):
        setFile = self._getPath(baseName)
        if os.path.exists(setFile):
            outputSet = SetClass(filename=setFile)
            outputSet.loadAllProperties()
            outputSet.enableAppend()
        else:
            outputSet = SetClass(filename=setFile)
            outputSet.setStreamState(outputSet.STREAM_OPEN)

        inputs = self.inputImages
        outputSet.copyInfo(inputs)

        return outputSet

    def _updateOutputSet(self, outputName, outputSet, state=Set.STREAM_OPEN):
        outputSet.setStreamState(state)
        if self.hasAttribute(outputName):
            outputSet.write()  # Write to commit changes
            outputAttr = getattr(self, outputName)
            # Copy the properties to the object contained in the protocol
            outputAttr.copy(outputSet, copyId=False)
            # Persist changes
            self._store(outputAttr)
        else:
            self._defineOutputs(**{outputName: outputSet})
            self._defineTransformRelation(self.getInput(), outputSet)
            self._store(outputSet)

        # Close set databaset to avoid locking it
        outputSet.close()

    # --------------------------- UTILS functions -----------------------------
    def _updateParticle(self, item, row):
        setXmippAttributes(item, row, md.MDL_SCORE_BY_EMPTINESS)
        if row.getValue(md.MDL_ENABLED) <= 0:
            item._appendItem = False
        else:
            item._appendItem = True

[docs]    def getInput(self):
        """ Get the input as it is in the form """
        pass


[docs]class XmippProtEliminateEmptyParticles(XmippProtEliminateEmptyBase):
    """ Takes a set of particles and using statistical methods
    (variance of variances of sub-parts of input image) eliminates those samples,
    where there is no object/particle (only noise is presented there).
    Threshold parameter can be used for fine-tuning the algorithm for type of data.
    """

    _label = 'eliminate empty particles'

    def __init__(self, **args):
        XmippProtEliminateEmptyBase.__init__(self, **args)

    # --------------------------- DEFINE param functions ----------------------
    def _defineParams(self, form):
        form.addSection(label=Message.LABEL_INPUT)
        # - - - F O R   P A R T I C L E S - - -
        form.addParam('inputParticles', param.PointerParam, important=True,
                      label="Input particles", pointerClass='SetOfParticles',
                      help='Select the input particles to be classified.')
        form.addParam('threshold', param.FloatParam, default=1.1,
                      label='Threshold used in elimination:',
                      help='Higher threshold => more particles will be '
                           'eliminated. Set to -1 for no elimination, even so '
                           'the "xmipp_scoreEmptiness" value will be attached to '
                           'every paricle for a posterior inspection.')

        self.addAdvancedParams(form)

    # --------------------------- INSERT steps functions ----------------------
[docs]    def specialBehavoir(self, partsSet):
        """ Just setting the self.check """
        for p in partsSet.iterItems(orderBy='creation', direction='DESC'):
            self.check = p.getObjCreation()
            break
        partsSet.close()

[docs]    def createOutputs(self):
        streamMode = (Set.STREAM_CLOSED if getattr(self, 'finished', False)
                      else Set.STREAM_OPEN)

        def updateOutputs(mdFn, suffix):
            """ Common use for accepted and discarded output. """
            newData = os.path.exists(mdFn)  # new data if partial out exists
            lastToClose = (getattr(self, 'finished', False) and    # last if fisished
                           hasattr(self, '%sParticles' % suffix))  #  and exists
            if newData or lastToClose:
                outSet = self._loadOutputSet(SetOfParticles,
                                             '%sParticles.sqlite' % suffix)
                if newData:
                    partsSet = self._createSetOfParticles("AUX")
                    readSetOfParticles(mdFn, partsSet)
                    outSet.copyItems(partsSet,
                                     updateItemCallback=self._updateParticle,
                                     itemDataIterator=md.iterRows(mdFn,
                                                    sortByLabel=md.MDL_ITEM_ID))
                    self.outputSize = self.outputSize + len(partsSet)
                self._updateOutputSet('%sParticles'%suffix, outSet, streamMode)
                cleanPath(mdFn)

        updateOutputs(self.fnOutMdTmp, 'output')
        updateOutputs(self.fnElimMdTmp, 'eliminated')

[docs]    def getInput(self):
        return self.inputParticles.get()

[docs]    def prepareImages(self):
        self.inputImages = self.getInput()
        partsSet = self.inputImages
        partsSet.loadAllProperties()
        self.streamClosed = partsSet.isStreamClosed()
        self.lenPartsSet = len(partsSet)

        return partsSet


DISCARDED = 0
ACCEPTED = 1
ATTACHED = 2

[docs]class XmippProtEliminateEmptyClasses(XmippProtEliminateEmptyBase):
    """ Takes a set of classes (or averages) and using statistical methods
    (variances of sub-parts of input image) eliminates those samples,
    where there is no object/particle (only noise is presented there).
    Threshold parameter can be used for fine-tuning the algorithm for
    type of data. Also discards classes with less population than a given
    percentage.
    """

    _label = 'eliminate empty classes'

    def __init__(self, **args):
        XmippProtEliminateEmptyBase.__init__(self, **args)
        self.enableCls = {}

    # --------------------------- DEFINE param functions ----------------------
    def _defineParams(self, form):
        form.addSection(label=Message.LABEL_INPUT)
        # - - - F O R   C L A S S E S - - -
        form.addParam('inputClasses', param.PointerParam, important=True,
                      pointerClass='SetOfClasses, SetOfAverages',
                      label="Input classes",
                      help='Select the input averages to be classified.')
        form.addParam('threshold', param.FloatParam, default=8.0,
                      label='Threshold used in elimination',
                      help='Higher threshold => more particles will be '
                           'eliminated. Set to -1 for no elimination, even so '
                           'the "xmipp_scoreEmptiness" value will be attached to '
                           'every paricle for a posterior inspection.')
        form.addParam('usePopulation', param.BooleanParam, default=True,
                      label='Use class population',
                      help="Consider class population to reject a class.")
        form.addParam('minPopulation', param.FloatParam, default=20,
                      label='Min. population (%)',
                      condition="usePopulation",
                      expertLevel=param.LEVEL_ADVANCED,
                      help="Minimum population to accept a class.\n"
                           "Classes with less population than the mean population "
                           "times this value will be rejected.")

        self.addAdvancedParams(form)

    # --------------------------- INSERT steps functions ----------------------
    def _validate(self):
        errors = []
        if (not isinstance(self.getInput(), SetOfClasses)
                and self.usePopulation.get()):
            errors.append("Using population to reject classes is not possible "
                          "with Averages as input.\nPlease, introduce a "
                          "setOfClasses to analyse or disable the _use class "
                          "population_ option.")
        return errors

[docs]    def specialBehavoir(self, partSet):
        idsToCheck = []
        for p in partSet.iterItems(orderBy='creation', direction='ASC'):
            self.check = p.getObjCreation()
            idsToCheck.append(p.getObjId())
        partSet.close()

        self.rejectByPopulation(idsToCheck)

[docs]    def createOutputs(self):
        streamMode = Set.STREAM_CLOSED if getattr(self, 'finished', False) \
            else (Set.STREAM_CLOSED if self.streamClosed else Set.STREAM_OPEN)

        def updateOutputs(mdFn, suffix):
            lastToClose = getattr(self, 'finished', False) and \
                          hasattr(self, '%sClasses' % suffix)
            newData = os.path.exists(mdFn)
            enableOut = {}
            if newData or lastToClose:
                outSet = self._loadOutputSet(SetOfAverages,
                                             '%sAverages.sqlite' % suffix)
                if newData:
                    # if new data, we read it
                    partsSet = self._createSetOfParticles("AUX")
                    readSetOfParticles(mdFn, partsSet)
                    # updating the enableCls dictionary
                    # print(" - %s Averages:" % ("ACCEPTED" if suffix == 'output' else "DISCARTDED"))
                    for part in partsSet:
                        partId = part.getObjId()
                        if partId not in self.enableCls:
                            # this happends when a classifier give an empty class
                            continue
                        # - accept if we are in accepted and the current is accepted
                        # - discard if we are in the discarted scope and any
                        currentStatus = self.enableCls[partId]
                        decision = suffix == 'output' and currentStatus == ACCEPTED
                        enableOut[partId] = ACCEPTED if decision else DISCARDED
                        # print("%d: %s -> %s" % (partId, currentStatus, decision))
                    # updating the Averages set
                    outSet.copyItems(partsSet,
                                     updateItemCallback=self._updateParticle,
                                     itemDataIterator=md.iterRows(mdFn,
                                                    sortByLabel=md.MDL_ITEM_ID))
                    self.outputSize = self.outputSize + len(partsSet)

                self._updateOutputSet('%sAverages' % suffix, outSet, streamMode)
                cleanPath(mdFn)

            return enableOut

        accOut = updateOutputs(self.fnOutMdTmp, 'output')
        discOut = updateOutputs(self.fnElimMdTmp, 'eliminated')

        self.createOutputClasses('output', streamMode, accOut)
        self.createOutputClasses('eliminated', streamMode, discOut)

    # ------------- UTILS Fuctions ------------------------------------
[docs]    def prepareImages(self):
        inSet = self.getInput()

        if isinstance(inSet, SetOfImages):
            firstRep = inSet.getFirstItem()
            getImage = lambda item: item.clone()
            self.classesDict = None
        else:  # SetOfClasses
            firstRep = inSet.getFirstItem().getFirstItem()
            getImage = lambda item: item.getRepresentative().clone()
            self.classesDict = {cls.getObjId(): cls.getSize() for cls in inSet}

        self.inputImages = self._createSetOfAverages("AUX")
        self.inputImages.enableAppend()
        self.inputImages.copyAttributes(firstRep, '_samplingRate')
        self.inputImages.copyAttributes(inSet, '_streamState')
        self.streamClosed = self.inputImages.isStreamClosed()

        for item in inSet:
            self.inputImages.append(getImage(item))
        self.lenPartsSet = len(self.inputImages)

        self.inputImages.write()
        self._store(self.inputImages)

        return self.inputImages

[docs]    def rejectByPopulation(self, ids):
        if self.usePopulation.get() and self.classesDict is not None:
            sizeDict = {clsId: size for clsId, size
                        in self.classesDict.items()
                        if clsId in ids}

            meanPop = sum(sizeDict.values())/len(sizeDict)

            for clsId, size in sizeDict.items():
                # minPopulation is normalized to 100% not to 1
                decision = int(size*100 > meanPop * self.minPopulation.get())
                self.enableCls[clsId] = ACCEPTED if decision else DISCARDED
        else:
            self.enableCls = {clsId: ACCEPTED for clsId in ids}

[docs]    def createOutputClasses(self, suffix, streamingState, enableDict):
        if not self.classesDict or not enableDict:
            # If there are no classes, nothing to do
            return

        baseName = '%sClasses.sqlite' % suffix
        setFile = self._getPath(baseName)
        if os.path.exists(setFile):
            outputSet = SetOfClasses2D(filename=setFile)
            outputSet.loadAllProperties()
            outputSet.enableAppend()
        else:
            outputSet = SetOfClasses2D(filename=setFile)
            outputSet.setStreamState(streamingState)

        outputSet.copyInfo(self.getInput())  # if fails, delete

        decision = ACCEPTED if suffix == 'output' else DISCARDED
        desiredIds = [ids for ids, enable in enableDict.items()
                      if enable == decision]
        enableFunc = lambda cls: cls.getObjId() in desiredIds
        outputSet.appendFromClasses(self.getInput(), enableFunc)

        outputSet.setStreamState(streamingState)
        outputName = '%sClasses' % suffix
        if self.hasAttribute(outputName):
            outputSet.write()  # Write to commit changes
            outputAttr = getattr(self, outputName)
            # Copy the properties to the object contained in the protocol
            outputAttr.copy(outputSet, copyId=False)
            # Persist changes
            self._store(outputAttr)
        else:
            self._defineOutputs(**{outputName: outputSet})
            self._defineSourceRelation(self.inputClasses, outputSet)
            self._store(outputSet)

        # Close set databaset to avoid locking it
        outputSet.close()

[docs]    def getInput(self):
        return self.inputClasses.get()