Source code for xmipp3.protocols.protocol_validate_overfitting

# **************************************************************************
# *
# * Authors:     Mohsen Kazemi  (mkazemi@cnb.csic.es)
# *              C.O.S. Sorzano (coss@cnb.csic.es)
# *
# * Unidad de  Bioinformatica of Centro Nacional de Biotecnologia , CSIC
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 2 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# * 02111-1307  USA
# *
# *  All comments concerning this program package may be sent to the
# *  e-mail address 'scipion@cnb.csic.es'
# *
# **************************************************************************

from math import sqrt
import glob
import os

from pyworkflow import VERSION_1_1
from pyworkflow.utils import getFloatListFromValues
from pyworkflow.utils.path import cleanPattern
from pwem.protocols import ProtReconstruct3D
from pyworkflow.protocol.params import (PointerParam, FloatParam,
                                        NumericListParam, IntParam,
                                        StringParam, BooleanParam,
                                        LEVEL_ADVANCED, GPU_LIST, USE_GPU)
from pwem import emlib
from xmipp3.convert import writeSetOfParticles
from xmipp3.base import isXmippCudaPresent


[docs]class XmippProtValidateOverfitting(ProtReconstruct3D): """ Check how the resolution changes with the number of projections used for 3D reconstruction. NOTE: Using the output plot, with the reconstruction of aligned gaussian noise, you can assess the validity of the reconstruction from your micrograph images. Practically, if the resolution of reconstruction based on your images is not considerably different from aligned gaussian noise one (for less number of particles),your images may not produce a valid reconstruction. This method has been proposed by: B. Heymann "Validation of 3D EM Reconstructions", 2015. (see References) """ _label = 'validate overfitting' _lastUpdateVersion = VERSION_1_1 # --------------------------- DEFINE param functions -------------------------------------------- def _defineParams(self, form): form.addHidden(USE_GPU, BooleanParam, default=True, label="Use GPU for execution", help="This protocol has both CPU and GPU implementation.\ Select the one you want to use.") form.addHidden(GPU_LIST, StringParam, default='0', expertLevel=LEVEL_ADVANCED, label="Choose GPU IDs", help="Add a list of GPU devices that can be used") form.addSection(label='Input') form.addParam('inputParticles', PointerParam, pointerClass='SetOfParticles', pointerCondition='hasAlignmentProj', label="Input particles", help='Select the input images from the project.') form.addParam('doResize', BooleanParam, default=False, label='Resize input particles and volume?', help="If obtaining the best possible reconstruction " "is not your goal, you can resize your input " "particales and volume to reduce running time " "of the protocol") form.addParam('newSize', FloatParam, default=64, condition="doResize", label='New size (px)', help="Resizing input particles and volume" "using fourier method") form.addParam('doNoise', BooleanParam, default=False, label='Calculate the noise bound for resolution?', help="Select if you want to obtain the noise bound for " "resolution. This calculation will increase the " "computational time of this protocol.") form.addParam('input3DReference', PointerParam, pointerClass='Volume,SetOfVolumes', label='Initial 3D reference volume', condition='doNoise == True', help="Input 3D reference reconstruction to " "align gaussian noise.") form.addParam('symmetryGroup', StringParam, default='c1', label="Symmetry group", help="See [[Xmipp Symmetry][http://www2.mrc-lmb.cam.ac.uk/Xmipp/index.php/Conventions_%26_File_formats#Symmetry]] page" "for a description of the symmetry format" "accepted by Xmipp") form.addParam('numberOfParticles', NumericListParam, default="10 20 50 100 200 500 1000 1500 2000 3000 5000", expertLevel=LEVEL_ADVANCED, label="Number of particles", help="Number of particles in each subset and consequently " "number of subsets (for instance, in default values," "a number of 6 subsets with given values are chosen)\n" "Note:\n" "The number of particles in each subset should not " "be larger than 1/2 of the input set of particles. " "The protocol consider this issue automatically. It " "means that if the input set of particles are lower " "than 10,000, you could leave default values unchanged.") form.addParam('numberOfIterations', IntParam, default=10, expertLevel=LEVEL_ADVANCED, label="Number of times the randomization is performed") form.addParam('maxRes', FloatParam, default=0.5, expertLevel=LEVEL_ADVANCED, label="Maximum resolution (dig.freq)", help='Nyquist is 0.5') form.addParam('angSampRate', FloatParam, default=5, expertLevel=LEVEL_ADVANCED, label="Angular sampling rate") form.addParallelSection(threads=1, mpi=4) # --------------------------- INSERT steps functions -------------------------------------------- def _createFilenameTemplates(self): """ Centralize how files are called for iterations and references. """ myDict = { 'input_xmd': self._getExtraPath('input_particles.xmd') } self._updateFilenamesDict(myDict) def _insertAllSteps(self): self._createFilenameTemplates() # convertInputStep particlesMd = self._getFileName('input_xmd') imgSet = self.inputParticles.get() writeSetOfParticles(imgSet, particlesMd) # for debugging purpose debugging = False if self.doNoise.get(): inputNameRefVol = self.input3DReference.get().getFileName() fnNewVol = self._getExtraPath('newVolume.vol') fnNewImgStk = self._getExtraPath('newImages.stk') fnNewImgMd = self._getExtraPath('newImages.xmd') # do resizing if self.doResize.get(): if self.doNoise.get(): args = "-i %s ""-o %s --fourier %f " % (inputNameRefVol, fnNewVol, self.newSize) self.runJob("xmipp_image_resize", args) args = "-i %s -o %s --fourier %f" % (particlesMd, fnNewImgStk, self.newSize) args += " --save_metadata_stack %s" % fnNewImgMd args += " --keep_input_columns" self.runJob("xmipp_image_resize", args) oldSize = self.inputParticles.get().getDim()[0] scaleFactor = float(self.newSize.get()) / float(oldSize) args = "-i %s" % fnNewImgMd args += " --operate modify_values 'shiftX=shiftX*%f'" % scaleFactor self.runJob('xmipp_metadata_utilities', args, numberOfMpi=1) args = "-i %s" % fnNewImgMd args += " --operate modify_values 'shiftY=shiftY*%f'" % scaleFactor self.runJob('xmipp_metadata_utilities', args, numberOfMpi=1) # projections from reference volume if self.doNoise.get(): if self.doResize.get(): args = "-i %s -o %s" % (fnNewVol, self._getExtraPath( 'Ref_Projections.stk')) args += " --experimental_images %s" % fnNewImgMd else: args = "-i %s -o %s" % ( self.input3DReference.get().getFileName(), self._getExtraPath('Ref_Projections.stk')) args += " --experimental_images %s" % particlesMd args += " --sampling_rate %f --sym %s" % (self.angSampRate, self.symmetryGroup.get()) args += " --min_tilt_angle 0 --max_tilt_angle 90" args += " --compute_neighbors --angular_distance -1" self.runJob("xmipp_angular_project_library", args, numberOfMpi=self.numberOfMpi.get() * self.numberOfThreads.get()) numberOfParticles = getFloatListFromValues(self.numberOfParticles.get()) fractionCounter = 0 maxNumberOfParticles = 0.5 * self.inputParticles.get().getSize() for number in numberOfParticles: if number <= maxNumberOfParticles: #AJ before maxNumberOfParticles for iteration in range(0, self.numberOfIterations.get()): self._insertFunctionStep('reconstructionStep', number, fractionCounter, iteration, debugging, fnNewImgMd, particlesMd) fractionCounter += 1 self._insertFunctionStep('gatherResultsStep', debugging) # --------------------------- STEPS functions --------------------------------------------
[docs] def reconstructionStep(self, numberOfImages, fractionCounter, iteration, debugging, fnNewImgMd, particlesMd): fnRoot = self._getExtraPath("fraction%02d" % fractionCounter) Ts = self.inputParticles.get().getSamplingRate() # for noise fnRootN = self._getExtraPath("Nfraction%02d" % fractionCounter) for i in range(0, 2): fnImgs = fnRoot + "_images_%02d_%02d.xmd" % (i, iteration) if self.doResize.get(): self.runJob("xmipp_metadata_utilities", "-i %s -o %s --operate random_subset %d" % ( fnNewImgMd, fnImgs, numberOfImages), numberOfMpi=1) else: self.runJob("xmipp_metadata_utilities", "-i %s -o %s --operate random_subset %d" % ( particlesMd, fnImgs, numberOfImages), numberOfMpi=1) params = ' -i %s' % fnImgs params += ' -o %s' % fnRoot + "_%02d_%02d.vol" % (i, iteration) params += ' --sym %s' % self.symmetryGroup.get() params += ' --max_resolution %0.3f' % self.maxRes params += ' --padding 2' params += ' --fast' params += ' --sampling %f' % Ts if self.useGpu.get(): #AJ to make it work with and without queue system if self.numberOfMpi.get()>1: N_GPUs = len((self.gpuList.get()).split(',')) params += ' -gpusPerNode %d' % N_GPUs params += ' -threadsPerGPU %d' % max(self.numberOfThreads.get(),4) count=0 GpuListCuda='' if self.useQueueForSteps() or self.useQueue(): GpuList = os.environ["CUDA_VISIBLE_DEVICES"] GpuList = GpuList.split(",") for elem in GpuList: GpuListCuda = GpuListCuda+str(count)+' ' count+=1 else: GpuListAux = '' for elem in self.getGpuList(): GpuListCuda = GpuListCuda+str(count)+' ' GpuListAux = GpuListAux+str(elem)+',' count+=1 os.environ["CUDA_VISIBLE_DEVICES"] = GpuListAux if self.numberOfMpi.get()==1: params += " --device %s" %(GpuListCuda) params += ' --thr %d' % self.numberOfThreads.get() if self.numberOfMpi.get()>1: self.runJob('xmipp_cuda_reconstruct_fourier', params, numberOfMpi=len((self.gpuList.get()).split(','))+1) else: self.runJob('xmipp_cuda_reconstruct_fourier', params) else: self.runJob('xmipp_reconstruct_fourier_accel', params) # for noise if self.doNoise.get(): noiseStk = fnRoot + "_noises_%02d.stk" % i self.runJob("xmipp_image_convert", "-i %s -o %s" % (fnImgs, noiseStk), numberOfMpi=1) self.runJob("xmipp_image_operate", "-i %s --mult 0" % noiseStk) self.runJob("xmipp_transform_add_noise", "-i %s --type gaussian 3" % noiseStk, numberOfMpi=1) fnImgsNL = fnRoot + "_noisesL_%02d.xmd" % i noiseStk2 = fnRoot + "_noises2_%02d.stk" % i self.runJob("xmipp_image_convert", "-i %s -o %s --save_metadata_stack %s" % ( noiseStk, noiseStk2, fnImgsNL), numberOfMpi=1) fnImgsNoiseOld = fnRoot + "_noisesOld_%02d.xmd" % i fnImgsN = fnRoot + "_noises_%02d_%02d.xmd" % (i, iteration) self.runJob("xmipp_metadata_utilities", '-i %s -o %s --operate drop_column "image"' % ( fnImgs, fnImgsNoiseOld), numberOfMpi=1) self.runJob("xmipp_metadata_utilities", "-i %s --set merge %s -o %s" % ( fnImgsNL, fnImgsNoiseOld, fnImgsN), numberOfMpi=1) # alignment gaussian noise fnImgsAlign = self._getExtraPath( "Nfraction_alignment%02d" % fractionCounter) fnImgsAlignN = fnImgsAlign + "_%02d_%02d.xmd" % (i, iteration) args = "-i %s -o %s -r %s --Ri 0 --Ro -1 --mem 2 --append " % ( fnImgsN, fnImgsAlignN, self._getExtraPath("Ref_Projections.stk")) self.runJob('xmipp_angular_projection_matching', args, numberOfMpi=self.numberOfMpi.get()) # numberOfMpi = self.numberOfMpi.get() * self.numberOfThreads.get()) params = ' -i %s' % fnImgsAlignN params += ' -o %s' % fnRootN + "_%02d_%02d.vol" % ( i, iteration) params += ' --sym %s' % self.symmetryGroup.get() params += ' --max_resolution %0.3f' % self.maxRes params += ' --padding 2' params += ' --thr 1' # params += ' --thr %d' % self.numberOfThreads.get() params += ' --sampling %f' % Ts if self.useGpu.get(): #AJ to make it work with and without queue system if self.numberOfMpi.get()>1: N_GPUs = len((self.gpuList.get()).split(',')) params += ' -gpusPerNode %d' % N_GPUs params += ' -threadsPerGPU %d' % max(self.numberOfThreads.get(),4) count=0 GpuListCuda='' if self.useQueueForSteps() or self.useQueue(): GpuList = os.environ["CUDA_VISIBLE_DEVICES"] GpuList = GpuList.split(",") for elem in GpuList: GpuListCuda = GpuListCuda+str(count)+' ' count+=1 else: GpuListAux = '' for elem in self.getGpuList(): GpuListCuda = GpuListCuda+str(count)+' ' GpuListAux = GpuListAux+str(elem)+',' count+=1 os.environ["CUDA_VISIBLE_DEVICES"] = GpuListAux if self.numberOfMpi.get()==1: params += " --device %s" %(GpuListCuda) params += ' --thr %d' % self.numberOfThreads.get() if self.numberOfMpi.get()>1: self.runJob('xmipp_cuda_reconstruct_fourier', params, numberOfMpi=len((self.gpuList.get()).split(','))+1) else: self.runJob('xmipp_cuda_reconstruct_fourier', params) else: self.runJob('xmipp_reconstruct_fourier_accel', params) self.runJob('xmipp_resolution_fsc', "--ref %s -i %s -o %s --sampling_rate %f" % \ (fnRoot + "_00_%02d.vol" % iteration, fnRoot + "_01_%02d.vol" % iteration, fnRoot + "_fsc_%02d.xmd" % iteration, Ts), numberOfMpi=1) mdFSC = emlib.MetaData(fnRoot + "_fsc_%02d.xmd" % iteration) for id in mdFSC: fscValue = mdFSC.getValue(emlib.MDL_RESOLUTION_FRC, id) maxFreq = mdFSC.getValue(emlib.MDL_RESOLUTION_FREQREAL, id) if fscValue < 0.5: break fh = open(fnRoot + "_freq.txt", "a") fh.write("%f\n" % maxFreq) fh.close() # for noise if self.doNoise.get(): self.runJob('xmipp_resolution_fsc', "--ref %s -i %s -o %s --sampling_rate %f" % ( fnRootN + "_00_%02d.vol" % iteration, fnRootN + "_01_%02d.vol" % iteration, fnRootN + "_fsc_%02d.xmd" % iteration, Ts), numberOfMpi=1) cleanPattern(fnRoot + "_noises_0?_0?.xmd") cleanPattern(fnRoot + "_noisesOld_0?.xmd") cleanPattern(fnRoot + "_noisesL_0?.xmd") cleanPattern(fnRoot + "_noises2_0?.stk") mdFSCN = emlib.MetaData(fnRootN + "_fsc_%02d.xmd" % iteration) for id in mdFSCN: fscValueN = mdFSCN.getValue(emlib.MDL_RESOLUTION_FRC, id) maxFreqN = mdFSCN.getValue(emlib.MDL_RESOLUTION_FREQREAL, id) if fscValueN < 0.5: break fhN = open(fnRootN + "_freq.txt", "a") fhN.write("%f\n" % maxFreqN) fhN.close() if not debugging: cleanPattern(fnRoot + "_0?_0?.vol") cleanPattern(fnRoot + "_images_0?_0?.xmd") cleanPattern(fnRoot + "_fsc_0?.xmd") cleanPattern(fnRoot + "_noises_0?.stk") if self.doNoise.get(): cleanPattern(fnImgsAlign + "_0?_0?.xmd") cleanPattern(fnRootN + "_0?_0?.vol") cleanPattern(fnRootN + "_fsc_0?.xmd")
[docs] def gatherResultsStep(self, debugging): self._writeFreqsMetaData("fraction*_freq.txt", self._defineResultsTxt()) if self.doNoise.get(): self._writeFreqsMetaData("Nfraction*_freq.txt", self._defineResultsNoiseTxt()) if not debugging: #cleanPattern(self._getExtraPath("fraction*_freq.txt")) AJ volver cleanPattern(self._getExtraPath('newImages.stk')) cleanPattern(self._getExtraPath('newImages.xmd')) if self.doNoise.get(): cleanPattern(self._getExtraPath("Nfraction*_freq.txt")) cleanPattern(self._getExtraPath('Ref_Projections*')) cleanPattern(self._getExtraPath('newVolume.vol'))
# --------------------------- INFO functions -------------------------------------------- def _summary(self): """ Should be overriden in subclasses to return summary message for NORMAL EXECUTION. """ numberOfParticles = getFloatListFromValues(self.numberOfParticles.get()) maxNumberOfParticles = 0.5 * self.inputParticles.get().getSize() intNumberOfParticles = [int(float(x)) for x in numberOfParticles] particlesNumber = '' for number in intNumberOfParticles: if number <= maxNumberOfParticles: particlesNumber += str(number) + ' ' msg = [] msg.append("Number of particles: " + particlesNumber) msg.append("Number of times that reconstruction is performed per each " "particles subset: %d" % self.numberOfIterations) return msg def _methods(self): messages = [] messages.append('B. Heymann "Validation of 3D EM Reconstructions"') return messages def _citations(self): return ['B.Heymann2015'] def _validate(self): errors = [] if (self.doResize.get() and self.newSize.get() > self.inputParticles.get().getDim()[0]): errors.append("Fourier resize method cannot be used " "to increase the dimensions") if (self.doResize.get() and self.newSize.get() == self.inputParticles.get().getDim()[0]): errors.append("The new chosen size is equal to the " "recent particles size") if self.useGpu and not isXmippCudaPresent(): errors.append("You have asked to use GPU, but I cannot find the Xmipp GPU programs") return errors # --------------------------- UTILS functions -------------------------------------------- def _writeFreqsMetaData(self, pattern, outputFn): """ Glob and read frequencies either from reconstruction or noise and write the proper metadata file. """ fnFreqs = sorted(glob.glob(self._getExtraPath(pattern))) subset = 0 numberOfParticles = getFloatListFromValues(self.numberOfParticles.get()) validationMd = emlib.MetaData() fnOut = open(outputFn, 'w') for fnFreq in fnFreqs: data = [] dataInv = [] fnFreqOpen = open(fnFreq, "r") for line in fnFreqOpen: fields = line.split() rowdata = list(map(float, fields)) val = 1.0/(float(rowdata[0])*float(rowdata[0])) dataInv.append(val) data.extend(rowdata) meanRes = (sum(data) / len(data)) data[:] = [(x - meanRes) ** 2 for x in data] varRes = (sum(data) / (len(data) - 1)) stdRes = sqrt(varRes) meanResInv = (sum(dataInv) / len(dataInv)) dataInv[:] = [(x - meanResInv) ** 2 for x in dataInv] varResInv = (sum(dataInv) / (len(dataInv) - 1)) stdResInv = sqrt(varResInv) fnOut.write(str(numberOfParticles[subset]) + ' ' + str(meanRes) + ' ' + str(stdRes) + ' ' + str(meanResInv) + ' ' + str(stdResInv) + '\n') # # objId = validationMd.addObject() # validationMd.setValue(emlib.MDL_COUNT, # long(numberOfParticles[subset]), # objId) # validationMd.setValue(emlib.MDL_AVG, meanRes, objId) # validationMd.setValue(emlib.MDL_STDDEV, stdRes, objId) subset += 1 # validationMd.write(outputFn) fnOut.close() # def _defineResultsName(self): # return self._getExtraPath('results.xmd') # # def _defineResultsNoiseName(self): # return self._getExtraPath('resultsNoise.xmd') def _defineResultsTxt(self): return self._getExtraPath('outputValues.txt') def _defineResultsNoiseTxt(self): return self._getExtraPath('outputNoiseValues.txt')