Source code for xmipp3.protocols.protocol_validate_overfitting

# **************************************************************************
# *
# * Authors:     Mohsen Kazemi  (mkazemi@cnb.csic.es)
# *              C.O.S. Sorzano (coss@cnb.csic.es)
# *
# * Unidad de  Bioinformatica of Centro Nacional de Biotecnologia , CSIC
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 2 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# * 02111-1307  USA
# *
# *  All comments concerning this program package may be sent to the
# *  e-mail address 'scipion@cnb.csic.es'
# *
# **************************************************************************

from math import sqrt
import glob
import os

from pyworkflow import VERSION_1_1
from pyworkflow.utils import getFloatListFromValues
from pyworkflow.utils.path import cleanPattern
from pwem.protocols import ProtReconstruct3D
from pyworkflow.protocol.params import (PointerParam, FloatParam,
                                        NumericListParam, IntParam,
                                        StringParam, BooleanParam,
                                        LEVEL_ADVANCED, GPU_LIST, USE_GPU)
from pwem import emlib
from xmipp3.convert import writeSetOfParticles
from xmipp3.base import isXmippCudaPresent


[docs]class XmippProtValidateOverfitting(ProtReconstruct3D):
    """    
    Check how the resolution changes with the number of projections used for 
    3D reconstruction. 

    NOTE:
    Using the output plot, with the reconstruction of aligned gaussian noise,
    you can assess the validity of the reconstruction from your micrograph
    images. Practically, if the resolution of reconstruction based on your
    images is not considerably different from aligned gaussian noise one
    (for less number of particles),your images may not produce a valid
    reconstruction.

    This method has been proposed by:
    B. Heymann "Validation of 3D EM Reconstructions", 2015.
    (see References)
    """
    _label = 'validate overfitting'
    _lastUpdateVersion = VERSION_1_1

    # --------------------------- DEFINE param functions --------------------------------------------

    def _defineParams(self, form):
        form.addHidden(USE_GPU, BooleanParam, default=True,
                       label="Use GPU for execution",
                       help="This protocol has both CPU and GPU implementation.\
                       Select the one you want to use.")

        form.addHidden(GPU_LIST, StringParam, default='0',
                       expertLevel=LEVEL_ADVANCED,
                       label="Choose GPU IDs",
                       help="Add a list of GPU devices that can be used")

        form.addSection(label='Input')

        form.addParam('inputParticles', PointerParam,
                      pointerClass='SetOfParticles',
                      pointerCondition='hasAlignmentProj',
                      label="Input particles",
                      help='Select the input images from the project.')
        form.addParam('doResize', BooleanParam, default=False,
                      label='Resize input particles and volume?',
                      help="If obtaining the best possible reconstruction "
                           "is not your goal, you can resize your input "
                           "particales and volume to reduce running time "
                           "of the protocol")
        form.addParam('newSize', FloatParam, default=64, condition="doResize",
                      label='New size (px)',
                      help="Resizing input particles and volume"
                           "using fourier method")
        form.addParam('doNoise', BooleanParam, default=False,
                      label='Calculate the noise bound for resolution?',
                      help="Select if you want to obtain the noise bound for "
                           "resolution. This calculation will increase the "
                           "computational time of this protocol.")
        form.addParam('input3DReference', PointerParam,
                      pointerClass='Volume,SetOfVolumes',
                      label='Initial 3D reference volume',
                      condition='doNoise == True',
                      help="Input 3D reference reconstruction to "
                           "align gaussian noise.")
        form.addParam('symmetryGroup', StringParam, default='c1',
                      label="Symmetry group",
                      help="See [[Xmipp Symmetry][http://www2.mrc-lmb.cam.ac.uk/Xmipp/index.php/Conventions_%26_File_formats#Symmetry]] page"
                           "for a description of the symmetry format"
                           "accepted by Xmipp")
        form.addParam('numberOfParticles', NumericListParam,
                      default="10 20 50 100 200 500 1000 1500 2000 3000 5000",
                      expertLevel=LEVEL_ADVANCED,
                      label="Number of particles",
                      help="Number of particles in each subset and consequently "
                           "number of subsets (for instance, in default values,"
                           "a number of 6 subsets with given values are chosen)\n"
                           "Note:\n"
                           "The number of particles in each subset should not "
                           "be larger than 1/2 of the input set of particles. "
                           "The protocol consider this issue automatically. It "
                           "means that if the input set of particles are lower "
                           "than 10,000, you could leave default values unchanged.")
        form.addParam('numberOfIterations', IntParam, default=10,
                      expertLevel=LEVEL_ADVANCED,
                      label="Number of times the randomization is performed")
        form.addParam('maxRes', FloatParam, default=0.5,
                      expertLevel=LEVEL_ADVANCED,
                      label="Maximum resolution (dig.freq)",
                      help='Nyquist is 0.5')
        form.addParam('angSampRate', FloatParam, default=5,
                      expertLevel=LEVEL_ADVANCED,
                      label="Angular sampling rate")

        form.addParallelSection(threads=1, mpi=4)

    # --------------------------- INSERT steps functions --------------------------------------------

    def _createFilenameTemplates(self):
        """ Centralize how files are called for iterations and references. """
        myDict = {
            'input_xmd': self._getExtraPath('input_particles.xmd')
        }
        self._updateFilenamesDict(myDict)

    def _insertAllSteps(self):
        self._createFilenameTemplates()

        # convertInputStep
        particlesMd = self._getFileName('input_xmd')
        imgSet = self.inputParticles.get()
        writeSetOfParticles(imgSet, particlesMd)

        # for debugging purpose
        debugging = False

        if self.doNoise.get():
            inputNameRefVol = self.input3DReference.get().getFileName()
            fnNewVol = self._getExtraPath('newVolume.vol')
        fnNewImgStk = self._getExtraPath('newImages.stk')
        fnNewImgMd = self._getExtraPath('newImages.xmd')
        # do resizing
        if self.doResize.get():
            if self.doNoise.get():
                args = "-i %s ""-o %s --fourier %f " % (inputNameRefVol,
                                                        fnNewVol,
                                                        self.newSize)
                self.runJob("xmipp_image_resize", args)

            args = "-i %s -o %s --fourier %f" % (particlesMd,
                                                 fnNewImgStk,
                                                 self.newSize)
            args += " --save_metadata_stack %s" % fnNewImgMd
            args += " --keep_input_columns"

            self.runJob("xmipp_image_resize", args)

            oldSize = self.inputParticles.get().getDim()[0]
            scaleFactor = float(self.newSize.get()) / float(oldSize)

            args = "-i %s" % fnNewImgMd
            args += " --operate modify_values 'shiftX=shiftX*%f'" % scaleFactor
            self.runJob('xmipp_metadata_utilities', args, numberOfMpi=1)

            args = "-i %s" % fnNewImgMd
            args += " --operate modify_values 'shiftY=shiftY*%f'" % scaleFactor
            self.runJob('xmipp_metadata_utilities', args, numberOfMpi=1)

        # projections from reference volume
        if self.doNoise.get():
            if self.doResize.get():
                args = "-i %s -o %s" % (fnNewVol,
                                        self._getExtraPath(
                                            'Ref_Projections.stk'))
                args += " --experimental_images %s" % fnNewImgMd
            else:
                args = "-i %s -o %s" % (
                self.input3DReference.get().getFileName(),
                self._getExtraPath('Ref_Projections.stk'))
                args += " --experimental_images %s" % particlesMd
            args += " --sampling_rate %f --sym %s" % (self.angSampRate,
                                                      self.symmetryGroup.get())
            args += " --min_tilt_angle 0 --max_tilt_angle 90"
            args += " --compute_neighbors --angular_distance -1"
            self.runJob("xmipp_angular_project_library",
                        args,
                        numberOfMpi=self.numberOfMpi.get() * self.numberOfThreads.get())

        numberOfParticles = getFloatListFromValues(self.numberOfParticles.get())
        fractionCounter = 0
        maxNumberOfParticles = 0.5 * self.inputParticles.get().getSize()
        for number in numberOfParticles:
            if number <= maxNumberOfParticles: #AJ before maxNumberOfParticles
                for iteration in range(0, self.numberOfIterations.get()):
                    self._insertFunctionStep('reconstructionStep', number,
                                             fractionCounter, iteration,
                                             debugging, fnNewImgMd,
                                             particlesMd)
                fractionCounter += 1
        self._insertFunctionStep('gatherResultsStep', debugging)
        # --------------------------- STEPS functions --------------------------------------------

[docs]    def reconstructionStep(self, numberOfImages, fractionCounter,
                           iteration, debugging,
                           fnNewImgMd, particlesMd):
        fnRoot = self._getExtraPath("fraction%02d" % fractionCounter)
        Ts = self.inputParticles.get().getSamplingRate()

        # for noise
        fnRootN = self._getExtraPath("Nfraction%02d" % fractionCounter)

        for i in range(0, 2):
            fnImgs = fnRoot + "_images_%02d_%02d.xmd" % (i, iteration)

            if self.doResize.get():
                self.runJob("xmipp_metadata_utilities",
                            "-i %s -o %s --operate random_subset %d" % (
                                fnNewImgMd, fnImgs, numberOfImages),
                            numberOfMpi=1)
            else:
                self.runJob("xmipp_metadata_utilities",
                            "-i %s -o %s --operate random_subset %d" % (
                                particlesMd, fnImgs, numberOfImages),
                            numberOfMpi=1)

            params = '  -i %s' % fnImgs
            params += '  -o %s' % fnRoot + "_%02d_%02d.vol" % (i, iteration)
            params += ' --sym %s' % self.symmetryGroup.get()
            params += ' --max_resolution %0.3f' % self.maxRes
            params += ' --padding 2'
            params += ' --fast'
            params += ' --sampling %f' % Ts

            if self.useGpu.get():
                #AJ to make it work with and without queue system
                if self.numberOfMpi.get()>1:
                    N_GPUs = len((self.gpuList.get()).split(','))
                    params += ' -gpusPerNode %d' % N_GPUs
                    params += ' -threadsPerGPU %d' % max(self.numberOfThreads.get(),4)
                count=0
                GpuListCuda=''
                if self.useQueueForSteps() or self.useQueue():
                    GpuList = os.environ["CUDA_VISIBLE_DEVICES"]
                    GpuList = GpuList.split(",")
                    for elem in GpuList:
                        GpuListCuda = GpuListCuda+str(count)+' '
                        count+=1
                else:
                    GpuListAux = ''
                    for elem in self.getGpuList():
                        GpuListCuda = GpuListCuda+str(count)+' '
                        GpuListAux = GpuListAux+str(elem)+','
                        count+=1
                    os.environ["CUDA_VISIBLE_DEVICES"] = GpuListAux
                if self.numberOfMpi.get()==1:
                    params += " --device %s" %(GpuListCuda)
                params += ' --thr %d' % self.numberOfThreads.get()
                if self.numberOfMpi.get()>1:
                    self.runJob('xmipp_cuda_reconstruct_fourier', params, numberOfMpi=len((self.gpuList.get()).split(','))+1)
                else:
                    self.runJob('xmipp_cuda_reconstruct_fourier', params)
            else:
                self.runJob('xmipp_reconstruct_fourier_accel', params)

            # for noise
            if self.doNoise.get():
                noiseStk = fnRoot + "_noises_%02d.stk" % i
                self.runJob("xmipp_image_convert",
                            "-i %s -o %s" % (fnImgs, noiseStk), numberOfMpi=1)
                self.runJob("xmipp_image_operate",
                            "-i %s --mult 0" % noiseStk)
                self.runJob("xmipp_transform_add_noise",
                            "-i %s --type gaussian 3" % noiseStk, numberOfMpi=1)
                fnImgsNL = fnRoot + "_noisesL_%02d.xmd" % i
                noiseStk2 = fnRoot + "_noises2_%02d.stk" % i
                self.runJob("xmipp_image_convert",
                            "-i %s -o %s --save_metadata_stack %s" % (
                                noiseStk, noiseStk2, fnImgsNL), numberOfMpi=1)
                fnImgsNoiseOld = fnRoot + "_noisesOld_%02d.xmd" % i
                fnImgsN = fnRoot + "_noises_%02d_%02d.xmd" % (i, iteration)
                self.runJob("xmipp_metadata_utilities",
                            '-i %s -o %s --operate drop_column "image"' % (
                                fnImgs, fnImgsNoiseOld), numberOfMpi=1)
                self.runJob("xmipp_metadata_utilities",
                            "-i %s  --set merge %s -o %s" % (
                                fnImgsNL, fnImgsNoiseOld, fnImgsN),
                            numberOfMpi=1)

                # alignment gaussian noise
                fnImgsAlign = self._getExtraPath(
                    "Nfraction_alignment%02d" % fractionCounter)
                fnImgsAlignN = fnImgsAlign + "_%02d_%02d.xmd" % (i, iteration)

                args = "-i %s -o %s -r %s --Ri 0 --Ro -1 --mem 2 --append " % (
                    fnImgsN, fnImgsAlignN,
                    self._getExtraPath("Ref_Projections.stk"))

                self.runJob('xmipp_angular_projection_matching',
                            args,
                            numberOfMpi=self.numberOfMpi.get())
                # numberOfMpi = self.numberOfMpi.get() * self.numberOfThreads.get())

                params = '  -i %s' % fnImgsAlignN
                params += '  -o %s' % fnRootN + "_%02d_%02d.vol" % (
                i, iteration)
                params += ' --sym %s' % self.symmetryGroup.get()
                params += ' --max_resolution %0.3f' % self.maxRes
                params += ' --padding 2'
                params += ' --sampling %f' % Ts

                if self.useGpu.get():
                    #AJ to make it work with and without queue system
                    if self.numberOfMpi.get()>1:
                        N_GPUs = len((self.gpuList.get()).split(','))
                        params += ' -gpusPerNode %d' % N_GPUs
                        params += ' -threadsPerGPU %d' % max(self.numberOfThreads.get(),4)
                    count=0
                    GpuListCuda=''
                    if self.useQueueForSteps() or self.useQueue():
                        GpuList = os.environ["CUDA_VISIBLE_DEVICES"]
                        GpuList = GpuList.split(",")
                        for elem in GpuList:
                            GpuListCuda = GpuListCuda+str(count)+' '
                            count+=1
                    else:
                        GpuListAux = ''
                        for elem in self.getGpuList():
                            GpuListCuda = GpuListCuda+str(count)+' '
                            GpuListAux = GpuListAux+str(elem)+','
                            count+=1
                        os.environ["CUDA_VISIBLE_DEVICES"] = GpuListAux
                    if self.numberOfMpi.get()==1:
                        params += " --device %s" %(GpuListCuda)
                    params += ' --thr %d' % self.numberOfThreads.get()
                    if self.numberOfMpi.get()>1:
                        self.runJob('xmipp_cuda_reconstruct_fourier', params, numberOfMpi=len((self.gpuList.get()).split(','))+1)
                    else:
                        self.runJob('xmipp_cuda_reconstruct_fourier', params)
                else:
                    self.runJob('xmipp_reconstruct_fourier_accel', params)


        self.runJob('xmipp_resolution_fsc',
                    "--ref %s -i %s -o %s --sampling_rate %f" % \
                    (fnRoot + "_00_%02d.vol" % iteration,
                     fnRoot + "_01_%02d.vol" % iteration,
                     fnRoot + "_fsc_%02d.xmd" % iteration, Ts),
                    numberOfMpi=1)

        mdFSC = emlib.MetaData(fnRoot + "_fsc_%02d.xmd" % iteration)
        for id in mdFSC:
            fscValue = mdFSC.getValue(emlib.MDL_RESOLUTION_FRC, id)
            maxFreq = mdFSC.getValue(emlib.MDL_RESOLUTION_FREQREAL, id)
            if fscValue < 0.5:
                break
        fh = open(fnRoot + "_freq.txt", "a")
        fh.write("%f\n" % maxFreq)
        fh.close()

        # for noise
        if self.doNoise.get():
            self.runJob('xmipp_resolution_fsc',
                        "--ref %s -i %s -o %s --sampling_rate %f" % (
                            fnRootN + "_00_%02d.vol" % iteration,
                            fnRootN + "_01_%02d.vol" % iteration,
                            fnRootN + "_fsc_%02d.xmd" % iteration, Ts),
                        numberOfMpi=1)

            cleanPattern(fnRoot + "_noises_0?_0?.xmd")
            cleanPattern(fnRoot + "_noisesOld_0?.xmd")
            cleanPattern(fnRoot + "_noisesL_0?.xmd")
            cleanPattern(fnRoot + "_noises2_0?.stk")

            mdFSCN = emlib.MetaData(fnRootN + "_fsc_%02d.xmd" % iteration)
            for id in mdFSCN:
                fscValueN = mdFSCN.getValue(emlib.MDL_RESOLUTION_FRC, id)
                maxFreqN = mdFSCN.getValue(emlib.MDL_RESOLUTION_FREQREAL, id)
                if fscValueN < 0.5:
                    break
            fhN = open(fnRootN + "_freq.txt", "a")
            fhN.write("%f\n" % maxFreqN)
            fhN.close()

        if not debugging:
            cleanPattern(fnRoot + "_0?_0?.vol")
            cleanPattern(fnRoot + "_images_0?_0?.xmd")
            cleanPattern(fnRoot + "_fsc_0?.xmd")
            cleanPattern(fnRoot + "_noises_0?.stk")
            if self.doNoise.get():
                cleanPattern(fnImgsAlign + "_0?_0?.xmd")
                cleanPattern(fnRootN + "_0?_0?.vol")
                cleanPattern(fnRootN + "_fsc_0?.xmd")

[docs]    def gatherResultsStep(self, debugging):
        self._writeFreqsMetaData("fraction*_freq.txt",
                                 self._defineResultsTxt())
        if self.doNoise.get():
            self._writeFreqsMetaData("Nfraction*_freq.txt",
                                     self._defineResultsNoiseTxt())

        if not debugging:
            #cleanPattern(self._getExtraPath("fraction*_freq.txt")) AJ volver
            cleanPattern(self._getExtraPath('newImages.stk'))
            cleanPattern(self._getExtraPath('newImages.xmd'))
            if self.doNoise.get():
                cleanPattern(self._getExtraPath("Nfraction*_freq.txt"))
                cleanPattern(self._getExtraPath('Ref_Projections*'))
                cleanPattern(self._getExtraPath('newVolume.vol'))

                # --------------------------- INFO functions --------------------------------------------

    def _summary(self):
        """ Should be overriden in subclasses to
        return summary message for NORMAL EXECUTION.
        """
        numberOfParticles = getFloatListFromValues(self.numberOfParticles.get())
        maxNumberOfParticles = 0.5 * self.inputParticles.get().getSize()
        intNumberOfParticles = [int(float(x)) for x in numberOfParticles]
        particlesNumber = ''
        for number in intNumberOfParticles:
            if number <= maxNumberOfParticles:
                particlesNumber += str(number) + '  '
        msg = []
        msg.append("Number of particles: " + particlesNumber)
        msg.append("Number of times that reconstruction is performed per each "
                   "particles subset: %d" % self.numberOfIterations)
        return msg

    def _methods(self):
        messages = []
        messages.append('B. Heymann "Validation of 3D EM Reconstructions"')
        return messages

    def _citations(self):
        return ['B.Heymann2015']

    def _validate(self):
        errors = []
        if (self.doResize.get() and
                self.newSize.get() > self.inputParticles.get().getDim()[0]):
            errors.append("Fourier resize method cannot be used "
                          "to increase the dimensions")
        if (self.doResize.get() and
                self.newSize.get() == self.inputParticles.get().getDim()[0]):
            errors.append("The new chosen size is equal to the "
                          "recent particles size")
        if self.useGpu and not isXmippCudaPresent():
            errors.append("You have asked to use GPU, but I cannot find the Xmipp GPU programs")
        return errors

        # --------------------------- UTILS functions --------------------------------------------

    def _writeFreqsMetaData(self, pattern, outputFn):
        """ Glob and read frequencies either from reconstruction
        or noise and write the proper metadata file.
        """
        fnFreqs = sorted(glob.glob(self._getExtraPath(pattern)))
        subset = 0

        numberOfParticles = getFloatListFromValues(self.numberOfParticles.get())
        validationMd = emlib.MetaData()
        fnOut = open(outputFn, 'w')

        for fnFreq in fnFreqs:
            data = []
            dataInv = []
            fnFreqOpen = open(fnFreq, "r")
            for line in fnFreqOpen:
                fields = line.split()
                rowdata = list(map(float, fields))
                val = 1.0/(float(rowdata[0])*float(rowdata[0]))
                dataInv.append(val)
                data.extend(rowdata)
            meanRes = (sum(data) / len(data))
            data[:] = [(x - meanRes) ** 2 for x in data]
            varRes = (sum(data) / (len(data) - 1))
            stdRes = sqrt(varRes)

            meanResInv = (sum(dataInv) / len(dataInv))
            dataInv[:] = [(x - meanResInv) ** 2 for x in dataInv]
            varResInv = (sum(dataInv) / (len(dataInv) - 1))
            stdResInv = sqrt(varResInv)

            fnOut.write(str(numberOfParticles[subset]) + ' ' + str(meanRes)
                        + ' ' + str(stdRes) + ' ' + str(meanResInv) + ' '
                        + str(stdResInv) + '\n')
        #
        #     objId = validationMd.addObject()
        #     validationMd.setValue(emlib.MDL_COUNT,
        #                           long(numberOfParticles[subset]),
        #                           objId)
        #     validationMd.setValue(emlib.MDL_AVG, meanRes, objId)
        #     validationMd.setValue(emlib.MDL_STDDEV, stdRes, objId)
            subset += 1

        # validationMd.write(outputFn)
        fnOut.close()

    # def _defineResultsName(self):
    #     return self._getExtraPath('results.xmd')
    #
    # def _defineResultsNoiseName(self):
    #     return self._getExtraPath('resultsNoise.xmd')

    def _defineResultsTxt(self):
        return self._getExtraPath('outputValues.txt')

    def _defineResultsNoiseTxt(self):
        return self._getExtraPath('outputNoiseValues.txt')