# Source code for pwem.protocols.protocol_sets

# **************************************************************************
# *
# * Authors:     J.M. De la Rosa Trevin (jmdelarosa@cnb.csic.es)
# *
# * Unidad de  Bioinformatica of Centro Nacional de Biotecnologia , CSIC
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 3 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# * 02111-1307  USA
# *
# *  All comments concerning this program package may be sent to the
# *  e-mail address 'scipion@cnb.csic.es'
# *
# **************************************************************************
"""
This module contains protocols related to Set operations such as:
- subsets
- unions
- split
... etc
"""

import random
import sys

import pyworkflow.protocol as pwprot
from pyworkflow.object import Object,Float, Integer, String

from pwem.protocols import EMProtocol
from pwem.objects import Volume, EMSet, SetOfClasses, SetOfData
from pyworkflow.utils import ProgressBar, getListFromRangeString
from pwem.constants import ID_COLUMN, ID_ATTRIBUTE


class ProtSets(EMProtocol):
    """ Base class for all protocols related to subsets. """

    def _append(self, outputSet, item, sourceItem=None, itemUpdateCallback=None, subElemUpdateCallback=None):
        """ Append ``item`` to ``outputSet``, carrying over its sub-elements
        when ``item`` is itself an EMSet.

        When ``item`` is a fresh copy of ``sourceItem`` (the join-sets case)
        the sub-elements are read from ``sourceItem``, since the copy lost
        the information related with the mapper.

        :param outputSet: set that receives the item
        :param item: object to append
        :param sourceItem: object to iterate for sub-elements; defaults to ``item``
        :param itemUpdateCallback: callback receiving the item to apply any operation (optional)
        :param subElemUpdateCallback: callback receiving the sub-element to apply any operation (optional)
        """
        origin = item if sourceItem is None else sourceItem

        # Clone the children first: per the original note, the iterated
        # items share the same _objId, so each one needs its own copy.
        clones = []
        if isinstance(item, EMSet):
            for child in origin.iterItems():
                childCopy = child.clone(copyEnable=True)

                # Let the caller tweak the sub-element, if requested
                if subElemUpdateCallback:
                    subElemUpdateCallback(childCopy)

                clones.append(childCopy)

        # Let the caller tweak the main item, if requested
        if itemUpdateCallback:
            itemUpdateCallback(item)

        outputSet.append(item)

        if not clones:
            return

        for childCopy in clones:
            item.append(childCopy)

        # Appending sub-elements may change "summary" properties of the item
        # (e.g. TiltSeries anglesCount), so the item has to be persisted again.
        outputSet.update(item)


class ProtUnionSet(ProtSets):
    """ Protocol to join two or more sets of images.
    This protocol allows to select two or more set of images
    and will produce another set joining all elements of the
    selected sets. It will validate that all sets are of the
    same type of elements (Micrographs, Particles or Volumes)
    """
    _label = 'join sets'
    TYPE_CTF = 'CTFs'
    TYPE_VOLUME = 'Volumes'
    TYPE_VOLUME_INDEX = 3

    _unionTypes = ['Particles',
                   'Micrographs',
                   TYPE_CTF,
                   TYPE_VOLUME,
                   'Averages',
                   'All']

    def __init__(self, **kwargs):
        ProtSets.__init__(self, **kwargs)

        # We need to trace the changes of 'inputType' to
        # dynamically modify the property of pointerClass
        # of the 'inputSets' parameter
        def onChangeInputType():
            inputText = self.getEnumText('inputType')

            if inputText == 'All':
                pointerClass = 'EMSet'
            else:
                pointerClass = 'SetOf' + inputText

            # For relatively small set we usually want to include
            # the single element type, this will allow, for example
            # to union SetOfVolumes and Volumes in the final set
            if inputText == self.TYPE_VOLUME:
                pointerClass += ',%s' % inputText[:-1]  # remove last 's'
            elif inputText == self.TYPE_CTF:
                # remove last 's'
                pointerClass = '%s,CTFModel' % pointerClass[:-1]

            self.inputSetsParam.setPointerClass(pointerClass)

        self.inputType.trace(onChangeInputType)

    @staticmethod
    def _isVolume(obj):
        """ Return True when the class name of ``obj`` is exactly 'Volume'.

        String *equality* is used on purpose: comparing class names by
        identity (``is``) only works by interpreter accident.
        """
        return str(obj.getClassName()) == Volume.__name__

    # -------------------------- DEFINE param functions ------------------------
    def _defineParams(self, form):
        """ Declare the form: input type, input sets and id handling options. """
        form.addSection(label='Input')

        form.addParam('inputType', pwprot.params.EnumParam,
                      choices=self._unionTypes, default=5,  # All
                      label='Input type:',
                      help='Select the type of objects that you want to union.\n'
                           'Special case All will allow you to select any type.')
        # Keep a reference to the param: its pointerClass is rewritten
        # whenever 'inputType' changes (see __init__).
        self.inputSetsParam = form.addParam('inputSets', pwprot.params.MultiPointerParam,
                                            label="Input set", important=True,
                                            pointerClass='EMSet', minNumObjects=2, maxNumObjects=0,
                                            help='Select two or more sets (of micrographs, particles,'
                                                 ' volumes, etc.) to be united. If you select 3 sets '
                                                 'with 100, 200, 200 elements, the final set will '
                                                 'contain a total of 500 elements.')
        form.addParam('ignoreDuplicates', pwprot.params.BooleanParam,
                      default=False,
                      label='Remove duplicates?',
                      help='By default, duplicated items found (same ID) '
                           'within the input sets, will cause renumbering of all the '
                           'items ids in the output set. '
                           'This is the case for example when doing several '
                           'imports (which will cause ids overlapping) '
                           'but we really want to insert as new items in the '
                           'output. \n'
                           'On the other hand, items originated in a previous common '
                           'protocol (above in the workflow) might have identical items '
                           'and you would like to remove them. '
                           'Therefore, set this option to *Yes* to remove duplicates and keep only '
                           'one copy of the item (the first occurrence).')
        form.addParam('renumber', pwprot.params.BooleanParam, default=False,
                      expertLevel=pwprot.LEVEL_ADVANCED,
                      label="Force new ids",
                      help='Perform an automatic renumbering of ids to ensure all objects have unique ids. '
                           'This will mean new objects will not be associated to the old ones.')

        # TODO: See what kind of restrictions we add,
        # like "All sets should have the same sampling rate."

    # -------------------------- INSERT steps functions -----------------------
    def _insertAllSteps(self):
        self._insertFunctionStep(self.createOutputStep)

    # --------------------------- STEPS functions ------------------------------
    def createOutputStep(self):
        """ Build the output set with the items of every input.

        Only the attributes common to all the inputs are kept. Ids are
        preserved unless 'renumber' is set, or duplicated ids are found
        while 'ignoreDuplicates' is off; in the latter case the items of
        the first input keep their ids and the rest get new ones.
        """
        firstInput = self.inputSets[0].get()  # 1st input (we use it many times)
        firstIsVolume = self._isVolume(firstInput)

        # Read ClassName and create the corresponding EMSet (SetOfParticles...)
        # Best effort: any failure falls back to a copy of the first input.
        try:
            if firstIsVolume:
                outputSetFunction = self._createSetOfVolumes
            else:
                outputSetFunction = getattr(self, "_create%s" % firstInput.getClassName())
            outputSet = outputSetFunction()
        except Exception:
            outputSet = firstInput.createCopy(self._getPath())

        # Copy info from input sets (sampling rate, etc).
        if firstIsVolume:
            outputSet.setSamplingRate(firstInput.getSamplingRate())
        else:
            outputSet.copyInfo(firstInput)  # all sets must have the same info as the first!

        removeDuplicates = self.ignoreDuplicates.get()
        forceNewIds = self.renumber.get()

        # Renumber from the beginning if either the renumber option is selected
        # or we find duplicated ids in the sets
        cleanIds = not removeDuplicates and self.duplicatedIds()

        # Warn in the log in case attributes will be lost
        allSetAttributes, commonAttrs = self.commonAttributes()
        warnings = self._getHeterogeneityWarning(allSetAttributes, commonAttrs)
        if warnings:
            self.info(warnings)

        # Always ignore non-common attributes
        ignoreExtraAttributes = True

        # Get the 1st level attributes to be used for the copyAttributes
        copyAttrs = [attr for attr in commonAttrs if "." not in attr]

        self.info("Common attributes to all sets are: %s" % copyAttrs)

        # Membership is tested once per attribute of every item: use a set.
        commonAttrSet = set(commonAttrs)

        seenIds = set()
        for setNum, pointer in enumerate(self.inputSets, start=1):
            inputObj = pointer.get()
            resetId = (cleanIds and setNum > 1) or forceNewIds

            if self._isVolume(inputObj):
                # Single Volume input: the object itself is appended.
                objId = inputObj.getObjId()
                if removeDuplicates:
                    if objId in seenIds:
                        continue
                    seenIds.add(objId)
                if resetId:
                    inputObj.cleanObjId()
                outputSet.append(inputObj)
                continue

            for obj in inputObj:
                objId = obj.getObjId()
                if removeDuplicates:
                    if objId in seenIds:
                        continue
                    seenIds.add(objId)

                # This is always TRUE, if stable we could remove the "if" and the "else".
                if ignoreExtraAttributes:
                    newObj = inputObj.ITEM_TYPE()
                    newObj.copyAttributes(obj, *copyAttrs)

                    self.cleanExtraAttributes(newObj, commonAttrSet)
                    if not cleanIds or setNum == 1:
                        newObj.setObjId(objId)
                else:
                    newObj = obj

                if resetId:
                    newObj.cleanObjId()

                self._append(outputSet, newObj, sourceItem=obj)

        self._defineOutputs(outputSet=outputSet)
        for pointer in self.inputSets:
            self._defineSourceRelation(pointer, outputSet)

    # Overwrite SetOfCoordinates creation
    def _createSetOfCoordinates(self, suffix=''):
        coordSet = self.inputSets[0].get()
        micSet = coordSet.getMicrographs()
        return ProtSets._createSetOfCoordinates(self, micSet, suffix)

    def cleanExtraAttributes(self, obj, verifyAttrs, prefix=""):
        """ Flag as not-to-store every attribute of ``obj`` whose dotted
        name is missing from ``verifyAttrs``; recurse into the kept ones.

        :param obj: object whose attributes are inspected
        :param verifyAttrs: container with the dotted names to keep
        :param prefix: dotted path of ``obj`` inside the top-level item
        """
        for attr, value in obj.getAttributesToStore():

            prefixedAttribute = prefix + attr

            if prefixedAttribute not in verifyAttrs:
                value._objDoStore = False
                self.info("%s will be lost." % attr)

            else:
                self.cleanExtraAttributes(value, verifyAttrs,
                                          prefixedAttribute + ".")

    def duplicatedIds(self):
        """ Check if there are duplicated ids to renumber from
        the beginning. """
        usedIds = set()  # to keep track of the object ids we have already seen
        for pointer in self.inputSets:
            inputObj = pointer.get()
            if self._isVolume(inputObj):
                candidateIds = [inputObj.getObjId()]
            else:
                candidateIds = inputObj.getIdSet()

            for objId in candidateIds:
                if objId in usedIds:
                    return True
                usedIds.add(objId)
        return False

    def getAllSetsAttributes(self):
        """ Return a list with one set of attribute names per input, taken
        from the first item of each input set (or from the Volume itself). """
        allSetsAttributes = list()
        for pointer in self.inputSets:
            inputObj = pointer.get()
            item = inputObj if self._isVolume(inputObj) else inputObj.getFirstItem()
            allSetsAttributes.append(set(item.getObjDict().keys()))

        return allSetsAttributes

    def commonAttributes(self):
        """ Compute the set of common attributes to all items within
        each input set.

        :return: tuple (list with the attribute set of every input,
                 list with the attributes present in all of them)
        """
        commonAttrs = None
        allSetsAttributes = self.getAllSetsAttributes()

        for attrSet in allSetsAttributes:
            if commonAttrs is None:  # first time
                commonAttrs = attrSet
            else:
                commonAttrs = commonAttrs & attrSet

        # No inputs at all: report no common attributes instead of failing.
        if commonAttrs is None:
            commonAttrs = set()

        return allSetsAttributes, list(commonAttrs)

    # -------------------------- INFO functions -------------------------------
    def _validate(self):
        """ Return the list of errors that prevent joining the inputs. """
        # Are all inputSets from the same class?
        classes = {x.get().getClassName() for x in self.inputSets}
        if len(classes) > 1:
            return ["All objects should have the same type.",
                    "Types of objects found: %s" % ", ".join(classes)]
        if isinstance(self.inputSets[0].get(), SetOfClasses):
            return ["Is not possible to join different sets of classes.\n"
                    "If you want to join different representative, extract them "
                    "with the viewer and them run this protocol with the "
                    "resulting averages."]

        # Validate attributes like sampling rate or dimensions
        return self._checkSetsCompatibility()

    def _checkSetsCompatibility(self):
        """ Check if all input sets have a minimum compatible attributes """
        # Attributes to check -> defined by the Set subclass type that are requested to be joined
        firstInput = self.inputSets[0].get()

        # Verify inputs are sets
        if not isinstance(firstInput, EMSet):
            return []

        # Unique marker for "no reference value yet": None cannot be used
        # since some attributes could return None as a valid value, and a
        # plain string could collide with a real value.
        unset = object()

        errors = []
        for key, attr in firstInput.getCompatibilityDict().items():
            refValue = unset

            for setPointer in self.inputSets:
                inputSet = setPointer.get()

                # Stop checking this attribute if a set does not have it
                if not hasattr(inputSet, attr):
                    break

                # Get the attribute and "call it" --> final ().
                setValue = getattr(inputSet, attr)()

                if refValue is unset:
                    refValue = setValue
                elif refValue != setValue:
                    errors.append("There are different %s among the input"
                                  " sets: %s and %s" % (key, refValue, setValue))
                    break

        return errors

    def _warnings(self):
        """ Warn about loosing info. """

        # Get all attributes "map"
        allSetsAttributes, commonAttributes = self.commonAttributes()

        return self._getHeterogeneityWarning(allSetsAttributes, commonAttributes)

    def _getHeterogeneityWarning(self, allSetsAttributes, commonAttributes):
        """ Return the warning lines listing, per input, the attributes that
        are not common to all the inputs (empty list if there are none). """
        warnings = []
        commonAttributes = set(commonAttributes)

        for index, setAttributes in enumerate(allSetsAttributes):
            lostAttributes = set(setAttributes) - commonAttributes

            if lostAttributes:
                warnings.append("Set #%d will loose following "
                                "attributes:" % index)
                warnings.extend(lostAttributes)

        if warnings:
            warnings.append("Your input sets have different attributes. "
                            "We will keep only the common ones. This may "
                            "cause the lost of important data like CTF, "
                            "alignment information,...")

        return warnings

    def _summary(self):
        if not hasattr(self, 'outputSet'):
            return ["Protocol has not finished yet."]
        else:
            return ["We have merged the following sets:",
                    ", ".join(x.get().getNameId() for x in self.inputSets)]

    def _methods(self):
        return self._summary()


class ProtSplitSet(ProtSets):
    """ Protocol to split a set in two or more subsets.
    """
    _label = 'split sets'

    # -------------------------- DEFINE param functions -----------------------
    def _defineParams(self, form):
        """ Declare the form: input set, number of subsets and randomization. """
        form.addSection(label='Input')

        form.addParam('inputSet', pwprot.params.PointerParam,
                      pointerClass='EMSet',
                      label="Input set", important=True,
                      help='Select the set of elements (images, etc) that you '
                           'want to split.')

        form.addParam('numberOfSets', pwprot.params.IntParam, default=2,
                      label="Number of subsets",
                      help='Select how many subsets do you want to create.')

        form.addParam('randomize', pwprot.params.BooleanParam, default=False,
                      label="Randomize elements",
                      help='Put the elements at random in the different '
                           'subsets.')

    # Overwrite SetOfCoordinates creation
    def _createSetOfCoordinates(self, suffix=''):
        coordSet = self.inputSet.get()
        micSet = coordSet.getMicrographs()
        return ProtSets._createSetOfCoordinates(self, micSet, suffix)

    # -------------------------- INSERT steps functions -----------------------
    def _insertAllSteps(self):
        self._insertFunctionStep(self.createOutputStep)

    # -------------------------- STEPS functions ------------------------------
    def createOutputStep(self):
        """ Distribute the input items into ``numberOfSets`` consecutive
        chunks whose sizes differ by at most one, and register each chunk
        as output<Type>NN (NN = 01, 02, ...). """
        inputSet = self.inputSet.get()
        inputClassName = str(inputSet.getClassName())
        n = self.numberOfSets.get()
        suffixes = [str(i) for i in range(1, n + 1)]

        # Create as many subsets as requested by the user.
        # Best effort: any failure falls back to copies of the input set.
        try:
            outputSetFunction = getattr(self, "_create%s" % inputClassName)
            subsets = [outputSetFunction(suffix=suffix) for suffix in suffixes]
        except Exception:
            subsets = [inputSet.createCopy(self._getPath(), suffix=suffix)
                       for suffix in suffixes]

        # Number of elements in each subset: the first 'remainder' subsets
        # take one extra element.
        quotient, remainder = divmod(len(inputSet), n)
        sizes = [quotient + (1 if idx < remainder else 0) for idx in range(n)]

        pos, filled = 0, 0  # current subset and how many items it already has
        orderBy = 'RANDOM()' if self.randomize.get() else 'id'

        for elem in inputSet.iterItems(orderBy=orderBy, direction='ASC'):
            if filled >= sizes[pos]:
                pos += 1
                filled = 0
            self._append(subsets[pos], elem)
            filled += 1

        key = 'output' + inputClassName.replace('SetOf', '') + '%02d'
        for number, subset in enumerate(subsets, start=1):
            subset.copyInfo(inputSet)
            self._defineOutputs(**{key % number: subset})
            self._defineTransformRelation(inputSet, subset)

    # -------------------------- INFO functions -------------------------------
    def _validate(self):
        """ Return the list of errors that prevent the split.

        A number of subsets below 1 is rejected here: the output step would
        otherwise divide by zero or index an empty list of subsets.
        """
        errors = []
        n = self.numberOfSets.get()
        if n is None or n < 1:
            errors.append("The number of subsets must be at least 1.")
        elif self.inputSet.get().getSize() < n:
            errors.append("The number of subsets requested is greater than")
            errors.append("the number of elements in the input set.")
        return errors

    def _summary(self):
        if not any(x.startswith('output') for x in dir(self)):
            return ["Protocol has not finished yet."]
        else:
            return ["We have split the set %s in %d sets." %
                    (self.inputSet.getName(), self.numberOfSets.get())]


class ProtSubSet(ProtSets):
    """
    Create a set with the elements of an original set that are also
    referenced in another set.

    Usually there is a bigger set with all the elements, and a smaller
    one obtained from classification, cleaning, etc. The desired result
    is a set with the elements from the original set that are also present
    somehow in the smaller set (in the smaller set they may be downsampled
    or processed in some other way).

    Both sets should be of the same kind (micrographs, particles, volumes)
    or related (micrographs and CTFs for example).
    """
    _label = 'subset'
    SET_INTERSECTION = 0
    SET_DIFFERENCE = 1

    # -------------------------- DEFINE param functions -----------------------
    def _defineParams(self, form):
        """ Declare the form: full set plus one of three selection modes
        (random sample, explicit ids, or set operation with another set). """
        form.addSection(label='Input')

        add = form.addParam  # short notation
        add('inputFullSet', pwprot.params.PointerParam, pointerClass='EMSet',
            label="Full set of items", important=True,
            help='Even if the operation can be applied to two arbitrary sets,\n'
                 'the most common use-case is to retrieve a subset of\n'
                 'elements from an original full set.\n'
                 '*Note*: the elements of the resulting set will be the same\n'
                 'ones as this input set.')
        add('chooseAtRandom', pwprot.params.BooleanParam, default=False,
            label="Make random subset",
            help='Choose elements randomly form the full set.')
        add('nElements', pwprot.params.IntParam, default=2,
            condition='chooseAtRandom',
            label="Number of elements",
            help='How many elements will be taken from the full set.')
        add('selectIds', pwprot.params.BooleanParam, default=False,
            condition='not chooseAtRandom',
            label="Make a subset from specific IDs",
            help="Choose specific elements form the full set.")
        add('range', pwprot.params.NumericRangeParam,
            label="IDs range or list",
            condition='selectIds and not chooseAtRandom',
            allowsNull=True,
            help='Select the IDs that will be the subset.\n'
                 'You have several ways to specify the IDs.\n'
                 'Example: \n'
                 '"1,3,5-8,17-20" -> [1,3, 5, 6, 7, 8, 17, 18, 19, 20]\n')
        add('inputSubSet', pwprot.params.PointerParam,
            pointerClass='EMSet', condition='not (chooseAtRandom or selectIds)',
            label="Other set",
            allowsNull=True,
            help='The elements present in this set will be used to pick \n'
                 'elements from the input full set.     \n'
                 'This means that the output set will contain elements with \n'
                 'exact the same information of input full set.\n\n'
                 'Set operation: if _intersection_ is used,\n'
                 'elements that are both in input and other set\n'
                 'will be included. If _difference_, elements that\n'
                 'are in input but not in other will picked.')
        add('setOperation', pwprot.params.EnumParam,
            condition='not (chooseAtRandom or selectIds)',
            default=self.SET_INTERSECTION,
            choices=['intersection', 'difference'],
            display=pwprot.params.EnumParam.DISPLAY_HLIST,
            label='Set operation',
            help='Set operation: if _intersection_ is used,\n'
                 'elements that are both in input and other set\n'
                 'will be included. If _difference_, elements that\n'
                 'are in input but not in other will picked.')

    # -------------------------- INSERT steps functions -----------------------
    def _insertAllSteps(self):
        # Pass the bound method, as the sibling protocols in this module do.
        self._insertFunctionStep(self.createOutputStep)

    # -------------------------- STEPS functions ------------------------------
    def createOutputStep(self):
        """ Select the ids to keep and copy the matching items of the full
        set into the output.

        The ids come from a random sample, from the 'range' parameter, or
        from the intersection/difference with the ids of the other set.
        No output is defined when the resulting set is empty.
        """
        inputFullSet = self.inputFullSet.get()

        inputClassName = inputFullSet.getClassName()

        # Best effort: any failure falls back to a copy of the full set.
        try:
            outputSetFunction = getattr(self, "_create%s" % inputClassName)
            outputSet = outputSetFunction()
        except Exception:
            outputSet = inputFullSet.createCopy(self._getPath())

        outputSet.copyInfo(inputFullSet)

        chooseAtRandom = self.chooseAtRandom.get()
        selectIds = self.selectIds.get()

        if chooseAtRandom:
            self.info("Creating subset from random positions from input set.")
            ids = set(random.sample(list(inputFullSet.getIdSet()), self.nElements.get()))
        elif selectIds:
            self.info("Creating subset by range: %s" % self.range)
            ids = set(getListFromRangeString(self.range.get()))
        else:
            # Get the ids from both sets
            fullSetIds = inputFullSet.getIdSet()
            smallSetIds = self.inputSubSet.get().getIdSet()

            # 'intersection' keeps the ids found in the other set,
            # 'difference' keeps the ones that are not found there.
            if self.setOperation.get() == self.SET_INTERSECTION:
                ids = fullSetIds.intersection(smallSetIds)
            else:
                ids = fullSetIds.difference(smallSetIds)

        progress = None
        nElements = len(ids)

        if nElements > 100000:  # show progressBar for large sets
            progress = ProgressBar(total=nElements, fmt=ProgressBar.NOBAR)
            progress.start()
            sys.stdout.flush()
            step = max(25000, nElements // 25000)

        i = 0

        for elem in inputFullSet.iterItems():
            if elem.getObjId() in ids:
                i += 1
                if progress and i % step == 0:
                    progress.update(i)
                self._append(outputSet, elem)

        if progress:
            progress.finish(printNewLine=True)

        if outputSet.getSize():
            key = 'output' + inputClassName.replace('SetOf', '')
            self._defineOutputs(**{key: outputSet})
            self._defineTransformRelation(inputFullSet, outputSet)
            if not (chooseAtRandom or selectIds):
                self._defineSourceRelation(self.inputSubSet, outputSet)
        else:
            self.summaryVar.set('Output was not generated. Resulting set '
                                'was EMPTY!!!')

    # Overwrite SetOfCoordinates creation
    def _createSetOfCoordinates(self, suffix=''):
        coordSet = self.inputFullSet.get()
        micSet = coordSet.getMicrographs()
        return ProtSets._createSetOfCoordinates(self, micSet, suffix)

    # -------------------------- INFO functions -------------------------------
    def _validate(self):
        """Make sure the input data make sense.

        :return: list of error messages (empty when the inputs are valid)
        """

        # Do not allow failing sets:
        notImplentedClasses = ['SetOfClasses2D', 'SetOfClasses3D',
                               'CoordinatesTiltPair']

        errors = []
        chooseAtRandom = self.chooseAtRandom.get()
        selectIds = self.selectIds.get()

        if not chooseAtRandom and not selectIds and not self.inputSubSet.get():
            errors.append("Subsetting without ids or random selection needs the 'Other set' parameter.")

        fullSet = self.inputFullSet.get()
        if not fullSet:
            # Since is mandatory it will not validate
            # Stop validating since following validations need this set
            return errors

        c1 = fullSet.getClassName()
        if c1 in notImplentedClasses:
            errors.append("%s subset is not implemented." % c1)

        # First dispatch the easy case, where we choose elements at random.
        if chooseAtRandom:
            if self.nElements.get() > fullSet.getSize():
                # list.append takes a single argument: report one message.
                errors.append("Number of elements to choose cannot be bigger than "
                              "the number of elements in the set.")

        # Now the harder case: two sets. Check for compatible classes.

        # self.inputFullSet and self.inputSubSet .get().getClassName()
        # can be SetOf...:
        #   Alignment
        #   Angles
        #   Averages
        #   Classes
        #   ClassesVol
        #   Coordinates
        #   CTF
        #   Micrographs
        #   MovieParticles
        #   Movies
        #   Particles
        #   Volumes

        otherSet = self.inputSubSet.get()
        if not otherSet:
            # Stop validating since following validations need this set
            return errors

        c2 = otherSet.getClassName()
        if c2 in notImplentedClasses:
            errors.append("%s subset is not implemented." % c2)

        if c1 == c2:
            return errors

        # Avoid combinations that make no sense.
        for classA, classesIncompatible in [
            ('SetOfParticles',
             {'SetOfMicrographs', 'SetOfMovies', 'SetOfVolumes'}),
            ('SetOfCoordinates',
             {'SetOfMicrographs', 'SetOfMovies', 'SetOfVolumes'}),
            ('SetOfVolumes',
             {'SetOfMicrographs', 'SetOfMovies', 'SetOfParticles', 'SetOfCoordinates'})]:
            if ((c1 == classA and c2 in classesIncompatible) or
                    (c2 == classA and c1 in classesIncompatible)):
                # list.append takes a single argument: report one message.
                errors.append("The full set and the subset are of incompatible classes "
                              "%s and %s." % (c1, c2))
        return errors

    def _summary(self):
        if self.summaryVar.hasValue():
            return [self.summaryVar.get()]

        key = 'output' + self.inputFullSet.get().getClassName().replace('SetOf', '')

        if not hasattr(self, key):
            return ["Protocol has not finished yet."]
        else:
            if self.setOperation.get() == self.SET_INTERSECTION:
                return ["The elements of %s that also are referenced in %s" %
                        (self.inputFullSet.getName(), self.inputSubSet.getName()),
                        "are now in %s" % getattr(self, key).getName()]
            else:
                return ["%s has elements only present in %s." %
                        (getattr(self, key).getName(),
                         self.inputFullSet.getName())
                        ]


[docs]class ProtSubSetByMic(ProtSets):
    """
    Create a subset of those particles that come from a particular set of micrographs
    """
    _label = 'particles subset by micrograph'

    # --------------------------- DEFINE param functions ----------------------
    def _defineParams(self, form):
        """ Declare the form: the particles to filter and the micrographs
        used as filter. """
        form.addSection(label='Input')

        pointerParam = pwprot.params.PointerParam

        form.addParam('inputParticles', pointerParam,
                      pointerClass='SetOfParticles',
                      label="Input particles",
                      help='Set of particles from which the subset will be taken')

        form.addParam('inputMicrographs', pointerParam,
                      pointerClass='SetOfMicrographs',
                      label="Input micrographs",
                      help='Only the particles in this set of micrographs will be output')

    # --------------------------- INSERT steps functions ----------------------
    def _insertAllSteps(self):
        self._insertFunctionStep('createOutputStep',
                                 self.inputParticles.getObjId(),
                                 self.inputMicrographs.getObjId())

    # --------------------------- STEPS functions -----------------------------
[docs]    def createOutputStep(self, partsId, micsId):
        inputParticles = self.inputParticles.get()
        inputMicrographs = self.inputMicrographs.get()

        outputSet = self._createSetOfParticles()
        outputSet.copyInfo(inputParticles)

        micIds = []

        for mic in inputMicrographs:
            micIds.append(mic.getObjId())

        for particle in inputParticles:
            if particle.getMicId() in micIds:
                outputSet.append(particle)

        self._defineOutputs(outputParticles=outputSet)
        self._defineTransformRelation(inputParticles, outputSet)

    # --------------------------- INFO functions ------------------------------
    def _validate(self):
        """Make sure the input data make sense, i.e. hasMicId.
        Thus they come from some Mic"""
        if not self.inputParticles.get().getFirstItem().hasMicId():
            return ['The _Input Particles_ must come from some Micrographs '
                    'of the workflow, i.e. particles must have micId.']

    def _summary(self):
        if not hasattr(self, 'outputParticles'):
            summary = ["Protocol has not finished yet."]
        else:
            summary = ['A subset of *%d* particles is made from a total of *%d*'
                       ' particles.' % (self.outputParticles.getSize(),
                                        self.inputParticles.get().getSize())]
        return summary


class ProtSubSetByCoord(ProtSets):
    """
    Create a subset of those particles that have a particular set of coordinates
    """
    _label = 'particles subset by coordinates'

    # --------------------------- DEFINE param functions ----------------------
    def _defineParams(self, form):
        """ Declare the inputs: the particles to filter, the coordinates
        used as the filter and the matching tolerance in pixels. """
        form.addSection(label='Input')

        add = form.addParam  # short notation
        add('inputParticles', pwprot.params.PointerParam,
            pointerClass='SetOfParticles', label="Input particles",
            help='Set of particles from which the subset will be taken')
        add('inputCoordinates', pwprot.params.PointerParam,
            pointerClass='SetOfCoordinates', label="Input coordinates",
            help='Only the particles with this set of coordinates will be output')
        add('coordTolerance', pwprot.params.FloatParam,
            label='Coordinate tolerance (px)', default=0,
            help='Two coordinates are supposed to be the same if their X and Y distance'
                 ' is smaller or equal this value')

    # --------------------------- INSERT steps functions ----------------------
    def _insertAllSteps(self):
        """ Insert the single step that builds the output, passing the
        object ids of both input pointers and the tolerance value. """
        self._insertFunctionStep('createOutputStep',
                                 self.inputParticles.getObjId(),
                                 self.inputCoordinates.getObjId(),
                                 self.coordTolerance.get())

    # --------------------------- STEPS functions -----------------------------
    def createOutputStep(self, partsId, micsId, tolerance):
        """ Copy to the output every input particle whose coordinate matches
        one of the input coordinates of the same micrograph.

        Two positions match when both |dx| and |dy| are <= tolerance.

        :param partsId: object id of the particles pointer (step argument,
            not read in the body).
        :param micsId: object id of the coordinates pointer, despite its
            name (step argument, not read in the body).
        :param tolerance: maximum allowed difference on each axis.
        """
        inputParticles = self.inputParticles.get()
        inputCoordinates = self.inputCoordinates.get()

        outputSet = self._createSetOfParticles()
        outputSet.copyInfo(inputParticles)

        # Group the reference positions by micrograph id so each particle is
        # only compared against coordinates of its own micrograph.
        micCoordinates = {}
        for coord in inputCoordinates.iterCoordinates():
            micId = coord.getMicId()
            x, y = coord.getPosition()
            micCoordinates.setdefault(micId, []).append((x, y))

        for particle in inputParticles:
            candidates = micCoordinates.get(particle.getMicId())
            if not candidates:
                # No reference coordinate on this particle's micrograph.
                continue

            x0, y0 = particle.getCoordinate().getPosition()
            if any(abs(x - x0) <= tolerance and abs(y - y0) <= tolerance
                   for x, y in candidates):
                outputSet.append(particle)

        self._defineOutputs(outputParticles=outputSet)
        self._defineTransformRelation(inputParticles, outputSet)

    # --------------------------- INFO functions ------------------------------
    def _validate(self):
        """ Make sure the input data make sense, i.e. the particles carry a
        coordinate.

        :return: list of error messages, empty when the input is valid.
        """
        errors = []
        firstParticle = self.inputParticles.get().getFirstItem()

        # NOTE(review): getFirstItem() is expected to return None for an
        # empty set -- confirm; without this guard that case would end in an
        # AttributeError instead of a readable validation message.
        if firstParticle is None:
            errors.append('The _Input Particles_ set is empty.')
        elif not firstParticle.hasCoordinate():
            errors.append('The _Input Particles_ must have coordinates')
        return errors

    def _summary(self):
        """ Report how many particles were kept out of the input total, or
        that the protocol has not produced its output yet. """
        if not hasattr(self, 'outputParticles'):
            summary = ["Protocol has not finished yet."]
        else:
            summary = ['A subset of *%d* particles is made from a total of *%d*'
                       ' particles.' % (self.outputParticles.getSize(),
                                        self.inputParticles.get().getSize())]
        return summary

class ProtCrossSubSet(ProtSets):
    """
    Create a subset of the main set based on a matching field in another set. e.g.: Use _micName field (in both fields)
    to select micrographs (main set) present in a set of coordinates (secondary set)
    """
    _label = 'Crossed subset'

    # --------------------------- DEFINE param functions ----------------------
    def _defineParams(self, form):
        """ Declare the main and secondary sets together with the name of
        the field to compare in each of them. """
        form.addSection(label='Input')

        form.addParam(
            'mainSet', pwprot.params.PointerParam,
            pointerClass='EMSet',
            label="Main set",
            help='Set to be reduced')

        form.addParam(
            'mainSetField', pwprot.params.StringParam,
            label='Main field',
            default="id",
            help='Field in the main set that contains the values in common with the secondary set. Use any of the metadata viewers to find the field name.')

        form.addParam(
            'secSet', pwprot.params.PointerParam,
            pointerClass='EMSet',
            label="Secondary set",
            help='Set holding the matching field. e.g: Set of Coordinates hold the micName that can be used to filter a set of micrographs (main set)')

        form.addParam(
            'secSetField', pwprot.params.StringParam,
            label='Secondary field',
            default="id",
            help='Field in the secondary set that contains the values in common with the main set. Use any of the metadata viewers to find the field name.')

    # --------------------------- INSERT steps functions ----------------------
    def _insertAllSteps(self):
        """ Insert the single output step. """
        # The step arguments mainly serve to skip the step when they are
        # unchanged in a resumed execution.
        stepArgs = (self.mainSet.getObjId(),
                    self.secSet.getObjId(),
                    self.mainSetField.get(),
                    self.secSetField.get())
        self._insertFunctionStep(self.createOutputStep, *stepArgs)

    # --------------------------- STEPS functions -----------------------------
    def createOutputStep(self, mainId, secId, mainSetField, secSetField):
        """ Keep the items of the main set whose field value appears among
        the unique values of the secondary field in the secondary set.

        :param mainId: object id of the main set pointer (not read here).
        :param secId: object id of the secondary set pointer (not read here).
        :param mainSetField: main field as typed by the user; superseded in
            the body by the normalized name from getMainSetField.
        :param secSetField: secondary field name, handed as is to
            getUniqueValues of the secondary set.
        """
        mainSet = self.mainSet.get()
        secSet = self.secSet.get()

        # New, empty set of the same kind as the main one, with its info.
        outputSet = mainSet.create(self.getPath())
        outputSet.copyInfo(mainSet)

        # Values accepted for the main field.
        allowedValues = set(secSet.getUniqueValues(secSetField))

        attrName = self.getMainSetField(pythonName=True)
        isIdField = self._isIdField(attrName)
        idNote = "(id field)" if isIdField else ""
        self.info("Attribute in main set items is %s %s" % (attrName, idNote))

        progress = ProgressBar(mainSet.getSize(), fmt=ProgressBar.FULL)
        progress.start()

        for item in mainSet:
            attr = getattr(item, attrName)
            # The id attribute is used directly; any other attribute is
            # unwrapped through its get().
            candidate = attr if isIdField else attr.get()
            if candidate in allowedValues:
                self._append(outputSet, item)
            progress.increase()

        progress.finish()

        self._defineOutputs(subset=outputSet)
        self._defineTransformRelation(mainSet, outputSet)

    def getMainSetField(self, pythonName=False):
        """ Return the main field name; with pythonName=True the name goes
        through _normalizeSpecialFields first. """
        fieldName = self.mainSetField.get()
        return self._normalizeSpecialFields(fieldName) if pythonName else fieldName

    def getSecSetField(self, pythonName=False):
        """ Return the secondary field name; with pythonName=True the name
        goes through _normalizeSpecialFields first. """
        fieldName = self.secSetField.get()
        return self._normalizeSpecialFields(fieldName) if pythonName else fieldName

    def _normalizeSpecialFields(self, field):
        """ Map ID_COLUMN to ID_ATTRIBUTE; any other name is returned as is. """
        return ID_ATTRIBUTE if field == ID_COLUMN else field

    def _isIdField(self, field):
        """ Tell whether field is one of the two id names. """
        return field in (ID_COLUMN, ID_ATTRIBUTE)

    # --------------------------- INFO functions ------------------------------
    def _validate(self):
        """ Check that the first item of each set has the requested field.

        :return: list of error messages, empty when both fields exist.
        """
        problems = []

        mainItem = self.mainSet.get().getFirstItem()
        if not hasattr(mainItem, self.getMainSetField(pythonName=True)):
            problems.append('The main set does not have the field %s' % self.mainSetField.get())

        secItem = self.secSet.get().getFirstItem()
        if not hasattr(secItem, self.getSecSetField(pythonName=True)):
            problems.append('The secondary set does not have the field %s' % self.secSetField.get())

        return problems

    def _summary(self):
        """ Describe the matching criterion and, once the output exists,
        how many items matched it. """
        fieldPair = (self.mainSetField.get(), self.secSetField.get())
        lines = ["Items in the main set where %s=%s of items in the secondary set were selected." % fieldPair]

        if hasattr(self, "subset"):
            lines.append('*%d* items matched the criteria' % self.subset.getSize())

        return lines


[docs]class ProtSetAggregate(EMProtocol):
    """ Aggregates any set data based on its fields"""
    _label = "data summary"
    def _defineParams(self, form):
        """ Declare the input set, the aggregate operations, the fields they
        apply to and the optional grouping fields. """
        form.addSection(label='Input')

        add = form.addParam  # short notation
        add('inputSet', pwprot.params.PointerParam,
            pointerClass='EMSet', label="Any set",
            help='Set with the data to be aggregated')

        add('operations', pwprot.params.StringParam,
            label='Summary operations', default="COUNT",
            help='Summary operations to apply to all fields in Fields parameter. e.g: MIN MAX AVG. Possible values are MIN, MAX, COUNT, '
                 'AVG, SUM, TOTAL, GROUP_CONCAT. For more technical information see: https://www.sqlite.org/lang_aggfunc.html')

        add('fields', pwprot.params.StringParam,
            label='Fields', default="id",
            help='Fields to apply operations on. Fields can be found in the metadata viewers.'
                 ' The header of the columns are valid names. e.g: _samplingRate id. Fields listed here should '
                 'support the operations specified: DO NOT add literal fields.',
            )

        # No default here: an empty value means no grouping at all.
        add('groupby', pwprot.params.StringParam,
            label='Group by',
            help='Fields to make the group. An empty value will summarize the whole dataset.',
            )

    def _insertAllSteps(self):
        """ Insert aggregateSet as the only step, with the current values of
        the operations, fields and groupby params as step arguments. """
        stepArgs = (self.operations.get(),
                    self.fields.get(),
                    self.groupby.get())
        self._insertFunctionStep(self.aggregateSet, *stepArgs)

[docs]    def aggregateSet(self, *args):

        mainSet = self.inputSet.get()

        # Instantiate and copy main properties
        outputSet = SetOfData.create(self.getPath())

        # Run the aggregation method
        operations = self.operations.getListFromValues(caster=str)
        self.info("Operations: %s" % operations)

        fields = self.fields.getListFromValues(caster=str)
        self.info("Fields: %s" % fields)


        if self.groupby.get():
            groupBy = self.groupby.getListFromValues(caster=str)
            self.info("Grouping by: %s" % groupBy)
        else:
            groupBy = None
            self.info("No grouping fields.")


        result = mainSet.aggregate(operations,
                                   fields, groupBy)

        pb = ProgressBar(len(result), fmt=ProgressBar.FULL)
        pb.start()

        # Dictionary to hold the scipion data type based on the key
        scipionTypes ={}

        def getScipionType(fieldName:str):

            if fieldName not in scipionTypes:

                if fieldName.startswith("COUNT"):
                    scipionType=Integer
                elif fieldName.startswith(("MIN","MAX","AVG", "SUM","TOTAL")):
                    scipionType=Float
                else:
                    scipionType=String

                self.info("Scipion type for %s is %s" %(key, scipionType.getClassName()))
                scipionTypes[key] = scipionType
            return scipionTypes[key]

        # Fill the set
        for line in result:
            newItem = Object()
            for key in line.keys():
                scipionType = getScipionType(key)
                value =line[key]
                setattr(newItem, key, scipionType(value))

            outputSet.append(newItem)
            pb.increase()

        pb.finish()

        self._defineOutputs(aggregate=outputSet)
        self._defineTransformRelation(mainSet, outputSet)