# **************************************************************************
# *
# * Authors: Ruben Sanchez Garcia (rsanchez@cnb.csic.es)
# * David Maluenda (dmaluenda@cnb.csic.es)
# * Daniel Del Hoyo (daniel.delhoyo.gomez@alumnos.upm.es)
# * Daniel Marchán (da.marchan@cnb.csic.es)
# *
# * Unidad de Bioinformatica of Centro Nacional de Biotecnologia , CSIC
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 2 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# * 02111-1307 USA
# *
# * All comments concerning this program package may be sent to the
# * e-mail address 'coss@cnb.csic.es'
# *
# **************************************************************************
"""
Deep Consensus picking protocol
"""
import os, sys, time
from glob import glob
import six
import json, shutil, pickle
from pyworkflow import VERSION_2_0
from pyworkflow.utils.path import (makePath, cleanPattern, cleanPath, copyTree,
createLink)
from pwem.constants import RELATION_CTF, ALIGN_NONE
from pwem.emlib.image import ImageHandler
from pwem.objects import SetOfParticles, Micrograph, Particle, Coordinate
from pwem.protocols import ProtParticlePicking, ProtUserSubSet
import pyworkflow.utils as pwutils
from pyworkflow.protocol import params, STATUS_NEW
import pwem.emlib.metadata as md
from pwem import emlib
from xmipp3.base import XmippProtocol
from xmipp3.protocols.protocol_pick_noise import pickNoise_prepareInput, IN_COORDS_POS_DIR_BASENAME
from xmipp3.convert import (readSetOfParticles, setXmippAttributes,
micrographToCTFParam, writeSetOfParticles,
writeSetOfCoordinates, readSetOfCoordsFromPosFnames, readSetOfCoordinates)
from pyworkflow import BETA, UPDATED, NEW, PROD
# Lower bound on the number of consensus coordinates.
# NOTE(review): the code that consumes this value is outside this block --
# presumably the minimum needed before training can start; confirm.
MIN_NUM_CONSENSUS_COORDS = 256

# Identifiers of the consensus modes; the string values spell out how many
# pickers must agree on a coordinate.
AND = 'by_all'
OR = 'by_at_least_one'
UNION_INTERSECTIONS = 'by_at_least_two'

# Prefer the particle size declared by the network definition; fall back to
# 128 when the xmippPyModules package cannot be imported.
try:
    from xmippPyModules.deepConsensusWorkers.deepConsensus_networkDef import DEEP_PARTICLE_SIZE
except ImportError:
    DEEP_PARTICLE_SIZE = 128
class XmippProtScreenDeepConsensus(ProtParticlePicking, XmippProtocol):
""" Protocol to compute a smart consensus between different particle picking
algorithms. The protocol takes several Sets of Coordinates calculated
by different programs and/or different parameter settings. Let's say:
we consider N independent pickings. Then, a neural network is trained
using different subsets of picked and not picked coordinates. Finally,
a coordinate is considered to be a correct particle according to the
neural network predictions.
In streaming, the network is trained and used to predict in batches.
The network is trained until the configured number of particles is reached;
meanwhile, a preliminary output is generated. Once that number is reached,
the final output is produced in batches.
AI Generated
## Overview
The Deep Consensus Picking protocol combines several particle-picking results
using a deep-learning classifier.
Different particle pickers, or the same picker with different parameters, may
produce different coordinate sets. Some particles are selected by all pickers,
some are selected by only one picker, and some false positives appear only in
specific picking results. Classical consensus picking keeps coordinates based
on voting rules, but Deep Consensus goes one step further: it uses the
agreement and disagreement between coordinate sets to train a neural network
that scores candidate particles.
The protocol first creates candidate coordinates from the union of the input
picking results. It also creates highly reliable positive examples from
coordinates supported by multiple pickers, and negative examples from
background/noise regions. These examples are used to train a convolutional
neural network. The trained network then assigns a score between 0 and 1 to
candidate particles, and the protocol keeps those above the selected threshold.
The main outputs are filtered particle coordinates and the corresponding
extracted particles.
## Inputs and General Workflow
The main input is a list of coordinate sets.
The protocol uses the micrographs associated with those coordinate sets. It
preprocesses the micrographs, extracts candidate particles at a fixed internal
size of 128 × 128 pixels, trains or loads a neural-network model, scores the
candidate particles, and creates final outputs.
The workflow can be summarized as follows:
1. Collect input coordinate sets.
2. Build consensus coordinate groups.
3. Preprocess the associated micrographs.
4. Extract particles from candidate coordinates.
5. Generate positive and negative training examples.
6. Train, continue, or load a deep-learning model.
7. Score candidate particles.
8. Keep particles whose score passes the selected threshold.
9. Output both coordinates and particles.
The protocol also supports streaming. In streaming mode, training and
prediction are performed in batches as new micrographs and coordinates arrive.
## Input Coordinates
The **Input coordinates** parameter contains the coordinate sets to be combined
and screened.
These coordinate sets may come from different picking protocols, different
algorithms, different parameter settings, or different manual/automatic
strategies.
The protocol assumes that the coordinate sets refer to the same micrographs or
to overlapping micrograph sets. It uses the first coordinate set as the main
source for box-size and micrograph information.
For training a new model, more than one coordinate set is normally required.
If only one coordinate set is provided and training is requested, the protocol
reports a validation error, because it cannot derive a meaningful internal
consensus between pickers.
## Model Type
The **Select model type** parameter controls how the neural network is
initialized.
There are three options:
**New** starts from a randomly initialized model and trains it from the current
input data.
**Pretrained** starts from a pretrained Deep Consensus model.
**PreviousRun** reuses a model trained in a previous Deep Consensus run within
the same Scipion project.
The best choice depends on the amount and quality of available training data.
A new model is appropriate when there are enough reliable positive and negative
examples. A pretrained or previous model is useful when the user wants to reuse
prior training or score coordinates directly.
## Previous Run
When **PreviousRun** is selected, the **Select previous run** parameter defines
which earlier Deep Consensus run provides the model.
This allows the user to continue from a previous model or apply a model already
trained in the same project.
This option is useful when a dataset is processed in several stages, when
training was already performed in a previous run, or when the same picking
behavior should be applied consistently to new micrographs.
## Skip Training
The **Skip training and score directly with pretrained model?** option is
available when a pretrained or previous model is used.
If enabled, the protocol does not train the network again. It directly scores
the candidate particles using the selected model.
This is useful when the model is already considered appropriate for the data.
If disabled, the protocol continues training using the current training data.
Users should skip training only when the input data are compatible with the
model being reused. A model trained on a different specimen, contrast
convention, box size, or preprocessing strategy may not score particles
reliably.
## Relative Radius
The **Relative Radius** parameter defines how close coordinates from different
input sets must be to be considered the same particle.
The value is expressed as a fraction of the particle size. For example, a value
of 0.1 means that coordinates within 10% of the particle box size are treated
as corresponding to the same candidate particle.
This radius is used when creating consensus coordinate groups. If the radius is
too small, coordinates that correspond to the same particle may fail to merge.
If it is too large, nearby distinct particles may be merged incorrectly.
The default value is intended to capture small picker-to-picker differences
without merging clearly separate particles.
## Tolerance Threshold
The **Tolerance threshold** parameter defines the neural-network score required
for a candidate particle to be accepted.
The network assigns each candidate particle a score between 0 and 1. A score
near 1 indicates that the network considers the candidate more likely to be a
good particle. A score near 0 indicates that it is more likely to be a bad
particle or false positive.
Particles with scores above the threshold are included in the final outputs.
If the threshold is set to **-1**, all scored particles are allowed to pass.
This is useful when the user wants to inspect the scores manually or create a
subset later using the analysis tools.
## Micrograph Preprocessing
Before extracting particles for training and prediction, the protocol
preprocesses the micrographs internally.
The preprocessing is designed to make the extracted particle boxes compatible
with the Deep Consensus neural network. It includes:
- downsampling micrographs so that extracted particles become 128 × 128 pixels;
- normalizing micrographs to approximately zero mean and unit standard
deviation;
- inverting contrast when needed so that particles are white;
- optionally applying CTF phase flipping;
- extracting particle boxes.
This internal preprocessing is important because the neural network expects a
standardized particle representation.
## Contrast Inversion
The **Did you invert the micrographs contrast?** option tells the protocol
whether the input micrographs have already been contrast-inverted.
Deep Consensus expects particles to be white on a darker background.
If the micrographs have not already been inverted, the protocol can invert the
contrast during preprocessing. If they have already been inverted, the user
should indicate this so that the protocol does not invert them again.
Using the wrong contrast convention can seriously affect the neural-network
scores.
## Ignore CTF
The **Ignore CTF** option controls whether CTF information is used during
particle preprocessing.
If CTF is ignored, particles are extracted without phase flipping.
If CTF is not ignored, the user must provide a CTF estimation relation. The
protocol uses the CTF information to perform phase flipping during
preprocessing.
Phase flipping can make particle images more consistent, but it should only be
used when reliable CTF estimates are available and when this preprocessing is
appropriate for the intended workflow.
## CTF Estimation
The **CTF estimation** parameter is required when **Ignore CTF** is disabled.
It provides the CTF information associated with the input micrographs. The
protocol converts this information into Xmipp CTF-parameter files and uses it
during micrograph preprocessing and particle extraction.
If CTF correction is requested but no CTF relation is provided, the protocol
reports a validation error.
## Training Examples
Deep Consensus builds internal training examples from the input coordinate
sets.
Positive examples are obtained from strict or high-confidence consensus
coordinates, such as coordinates supported by several pickers. Negative
examples are generated by selecting noise coordinates away from the candidate
particle coordinates.
The protocol also creates a broader OR set containing coordinates selected by
at least one picker. These OR candidates are the particles that are later
scored by the neural network.
This strategy allows the protocol to learn from the agreement and disagreement
between picking methods.
## Additional Training Data
The **Additional training data** parameter allows the user to supplement the
internal training examples.
There are three options:
**None** uses only the internal positive and negative examples derived from the
input coordinate sets.
**Precompiled** adds a precompiled negative training set distributed with the
Deep Consensus model resources.
**Custom** allows the user to provide additional positive and negative training
data.
Additional data can improve training when the internal examples are limited,
but they must be compatible with the current preprocessing and specimen.
## Custom Additional Training Data
When **Custom** additional training data are selected, the user can provide
either particles or coordinates.
If particles are provided, they must already be preprocessed in the format
expected by the network: 128 × 128 pixels, white particles, and optionally CTF
corrected in the same way as the protocol.
If coordinates are provided, they should come from the same micrographs as the
input coordinates. The protocol will preprocess and extract them internally.
The user can provide positive and negative custom examples and assign weights
to control how much they contribute during training.
## Positive and Negative Weights
The **Weight of positive additional train data** and **Weight of negative
additional train data** parameters control the relative contribution of custom
training examples.
A weight of 1 means that additional examples are weighted similarly to internal
examples.
If the weight is set to **-1**, the protocol estimates a weight so that the
additional data contribute approximately as much as the internal particles.
These weights are useful when the custom training set is much larger or much
smaller than the internally generated training set.
## Number of Epochs
The **Number of epochs** parameter defines how many training epochs are used
for the neural network.
More epochs allow the model to learn longer from the training data but increase
runtime and may increase overfitting if the dataset is small or biased.
The default value is intended as a practical starting point. Training can also
stop automatically when convergence is detected, depending on the auto-stopping
option.
## Learning Rate
The **Learning rate** controls how strongly the neural network weights are
updated during training.
A larger learning rate may train faster but can become unstable. A smaller
learning rate is more conservative but may train more slowly.
Most users should keep the default value unless they have experience tuning
deep-learning training.
## Auto Stopping
The **Auto stop training when convergence is detected?** option enables
automatic stopping based on validation behavior.
When enabled, the protocol can reduce the learning rate if improvement stops
and eventually stop training if the learning rate becomes too small. It can
also stop if the validation accuracy reaches the selected threshold.
This option is generally useful, but it may stop too early in very small
training sets. The protocol help notes that it is not recommended for very
small datasets with fewer than about 100 true particles.
## Training Accuracy Threshold
The **Training mean val_acc threshold** parameter defines a validation-accuracy
level at which training can stop.
If the mean validation accuracy surpasses this threshold, the protocol considers
the training sufficiently good and stops further training.
The default value is high, reflecting the fact that the network should separate
positive and negative examples clearly before being used for final scoring.
## Regularization Strength
The **Regularization strength** parameter controls L2 regularization of the
neural-network weights.
Regularization helps reduce overfitting. If the training accuracy improves but
validation accuracy decreases, increasing regularization may help.
Typical values span several orders of magnitude. This is an advanced parameter
and should normally be left at its default unless overfitting is observed.
## Number of Models for Ensemble
The **Number of models for ensemble** parameter controls how many neural
network models are trained and combined.
Training several models can make the prediction more robust, because the final
score benefits from an ensemble rather than a single network. However, runtime
increases approximately linearly with the number of models.
Typical values are between 1 and 5. The default provides a compromise between
robustness and computation time.
## Expected Number of Particles for Training
The **Expected number of particles to use for training** parameter controls how
many positive particles are used before training is considered complete.
If the value is **-1**, the protocol uses all particles found for training.
This parameter also affects the effective network size used by the protocol.
The code distinguishes small, medium, and large training regimes according to
the number of training examples.
Larger training sets generally improve robustness, but they require more time
and memory.
## Testing After Training
The **Perform testing after training?** option allows the user to provide
independent positive and negative test particle sets.
If enabled, the protocol scores these test sets after training. This can help
assess whether the trained model generalizes beyond the internal training
examples.
The test particles must be preprocessed in the same expected format:
128 × 128 pixels and compatible contrast and CTF treatment.
## Streaming Behavior
The protocol is designed for streaming workflows.
As micrographs and coordinate sets arrive, the protocol preprocesses
micrographs, computes consensus coordinates, extracts particles, trains the
network in batches, and predicts candidate particles in batches.
The relevant streaming parameters are:
- **Extraction batch size**;
- **Training batch size**;
- **Perform preliminar predictions with on training CNN**.
During streaming, preliminary outputs can be produced while the network is
still being trained. After training is complete, the final network is used to
produce final scored outputs.
## Preliminary Predictions
The **Perform preliminar predictions with on training CNN** option enables
temporary predictions before the final model is fully trained.
These preliminary predictions are stored in separate preliminary output sets.
They are useful in streaming workflows where the user wants early feedback
before all training data have arrived.
Preliminary outputs should be interpreted cautiously because the network is
still being trained. Final outputs should be preferred for downstream
processing.
## Output Coordinates
The main coordinate output is **outputCoordinates**.
This set contains candidate coordinates whose Deep Consensus score passes the
selected threshold. The coordinates are scaled back to the original micrograph
coordinate system and annotated with the deep-learning score.
The score is stored as an Xmipp attribute corresponding to
`zScoreDeepLearning1`.
These coordinates can be used for particle extraction or subset selection in
later workflows.
## Output Particles
The protocol also creates **outputParticles**.
These are the extracted particle images corresponding to the accepted
coordinates. The particles carry the Deep Consensus score and are scaled to the
appropriate sampling rate after internal preprocessing.
This output can be inspected directly or used as a starting point for
downstream classification and cleaning.
## Preliminary Outputs
When preliminary prediction is enabled, the protocol may also produce:
- **preliminarOutputCoordinates**;
- **preliminarOutputParticles**.
These outputs are generated while training is still ongoing. They can provide
early information in streaming workflows but should not be considered as final
screening results.
## Validation and Requirements
The protocol performs several validation checks.
The input coordinate box size must be at least 128 pixels, because the internal
Deep Consensus particle size is 128 × 128 pixels.
If CTF phase flipping is requested, CTF information must be provided.
Additional training or testing particle sets must also have 128-pixel box size.
If only one coordinate set is provided and training is requested, the protocol
reports an error. In that case, the user should use a pretrained or previous
model for direct scoring, or provide additional coordinate sets.
The protocol also checks that the required deep-learning toolkit and model
resources are available.
## Practical Recommendations
Use several complementary coordinate sets as input. Deep Consensus is most
useful when different pickers provide partially overlapping but not identical
results.
Make sure the input coordinate box size is at least 128 pixels.
Use a new model when enough data are available for training. Use a pretrained
or previous model when the current data are similar to previous training data
or when only one coordinate set is available.
Keep the default threshold at first. Lower it to retain more candidate
particles; raise it to be stricter. Use -1 if you want to keep all candidates
and inspect the scores later.
Check the contrast convention carefully. The network expects white particles.
Use CTF phase flipping only when reliable CTF estimates are available and the
workflow expects phase-flipped particles.
Inspect both accepted and rejected particles before committing to downstream
classification or reconstruction.
In streaming workflows, treat preliminary outputs as provisional and use final
outputs once training is complete.
## Final Perspective
Deep Consensus Picking is a neural-network-based particle-screening protocol
built on top of multiple picking results.
For biological users, its main value is that it converts agreement between
pickers into a learned particle-quality score. It can keep particles that look
convincing to the network even if not all pickers agree, and it can reject
false positives that appear in the broad union of picks.
The protocol is especially useful when several picking strategies are
available, when manual inspection of all candidates is impractical, or when a
streaming workflow needs progressively improved particle selection.
As with any learned screening method, the result should be checked visually and
validated downstream by 2D classification, particle cleaning, and final
reconstruction behavior.
"""
# ---- Protocol metadata --------------------------------------------------
_label = 'deep consensus picking'
_lastUpdateVersion = VERSION_2_0
_conda_env = 'xmipp_DLTK_v0.3'
_stepsCheckSecs = 5  # seconds between two consecutive checks of the steps
_devStatus = PROD

USING_INPUT_COORDS = USING_INPUT_MICS = False

# ---- Directory / file name templates ------------------------------------
CONSENSUS_COOR_PATH_TEMPLATE = "consensus_coords_%s"
CONSENSUS_PARTS_PATH_TEMPLATE = "consensus_parts_%s"
PRE_PROC_MICs_PATH = "preProcMics"
PARTICLES_TEMPLATE = "particles{}.xmd"
NET_TEMPLATE = "nnetData{}"

# ---- Additional training data: choice labels and their indices ----------
ADD_DATA_TRAIN_TYPES = ["None", "Precompiled", "Custom"]
ADD_DATA_TRAIN_NONE, ADD_DATA_TRAIN_PRECOMP, ADD_DATA_TRAIN_CUST = range(3)

# Kind of custom additional data: choice labels and their indices.
ADD_DATA_TRAIN_CUSTOM_OPT = ["Particles", "Coordinates"]
ADD_DATA_TRAIN_CUSTOM_OPT_PARTS, ADD_DATA_TRAIN_CUSTOM_OPT_COORS = range(2)

# ---- Model initialisation: choice labels of the 'modelInitialization'
# form parameter and their indices -----------------------------------------
ADD_MODEL_TRAIN_TYPES = ["New", "Pretrained", "PreviousRun"]
ADD_MODEL_TRAIN_NEW, ADD_MODEL_TRAIN_PRETRAIN, ADD_MODEL_TRAIN_PREVRUN = range(3)

# ---- Streaming parameters ------------------------------------------------
# NOTE(review): the dicts and the list below are class-level mutable objects,
# so they are shared by every instance unless rebound per instance -- confirm
# that this sharing is intended.
PREPROCESSING = False
TO_EXTRACT_MICFNS = {
    'OR': [],
    'NOISE': [],
    'AND': [],
    'ADDITIONAL_COORDS_TRUE': [],
    'ADDITIONAL_COORDS_FALSE': [],
}
EXTRACTING = {'OR': False, 'NOISE': False, 'AND': False}

# Upper bounds of the batch sizes.
TRAIN_BATCH_MAX = 20
PREPROCESS_BATCH_MAX = 200
PREDICT_BATCH_MAX = 20

TO_TRAIN_MICFNS = []
TRAINED_PARAMS_PATH = 'trainedParams.pickle'

# State flags, all initially off.
TRAINING = PREDICTING = LAST_ROUND = ENDED = False
counter = 0
def __init__(self, **args):
    """Create the protocol, delegating construction to ProtParticlePicking.

    :param args: keyword arguments, forwarded unchanged to
        ``ProtParticlePicking.__init__``.
    """
    # Switching the steps to parallel execution is left commented out:
    # self.stepsExecutionMode = params.STEPS_PARALLEL

    # The base initialiser is called explicitly on ProtParticlePicking;
    # no other base-class initialiser is invoked from here.
    ProtParticlePicking.__init__(self, **args)
def _defineParams(self, form):
    """Declare all the parameters shown in the protocol form.

    The form is organised in the hidden GPU settings plus the sections
    'Input', 'Preprocess', 'Training', 'Additional training data' and
    'Streaming'.

    :param form: form object on which the parameters and sections are added.
    """
    # Display conditions shared by many parameters. They are built once so
    # that every parameter gets exactly the same condition text.
    trainCond = ("modelInitialization==%s or not skipTraining"
                 % self.ADD_MODEL_TRAIN_NEW)
    customCond = ("(modelInitialization==%s or not skipTraining ) "
                  "and addTrainingData==%s"
                  % (self.ADD_MODEL_TRAIN_NEW, self.ADD_DATA_TRAIN_CUST))
    customPartsCond = customCond + (" and trainingDataType==%s"
                                    % self.ADD_DATA_TRAIN_CUSTOM_OPT_PARTS)
    customCoordsCond = customCond + (" and trainingDataType==%s"
                                     % self.ADD_DATA_TRAIN_CUSTOM_OPT_COORS)

    # GPU settings
    form.addHidden(params.USE_GPU, params.BooleanParam, default=True,
                   expertLevel=params.LEVEL_ADVANCED,
                   label="Use GPU (vs CPU)",
                   help="Set to true if you want to use GPU implementation ")
    form.addHidden(params.GPU_LIST, params.StringParam, default='0',
                   expertLevel=params.LEVEL_ADVANCED,
                   label="Choose GPU ID",
                   help="GPU may have several cores. Set it to zero"
                        " if you do not know what we are talking about."
                        " First core index is 0, second 1 and so on.")
    form.addParallelSection(threads=2, mpi=1)

    # ------------------------------ Input ------------------------------
    form.addSection(label='Input')
    form.addParam('modelInitialization', params.EnumParam,
                  choices=self.ADD_MODEL_TRAIN_TYPES,
                  default=self.ADD_MODEL_TRAIN_NEW,
                  label='Select model type',
                  help='If you set to *%s*, a new model randomly initialized will be '
                       'employed. If you set to *%s* a pretrained model will be used. '
                       'If you set to *%s*, a model trained in a previous run, within '
                       'this project, will be employed'
                       % tuple(self.ADD_MODEL_TRAIN_TYPES))
    #CONTINUE FROM PREVIOUS TRAIN
    form.addParam('continueRun', params.PointerParam,
                  pointerClass=self.getClassName(),
                  condition='modelInitialization== %s' % self.ADD_MODEL_TRAIN_PREVRUN,
                  allowsNull=True,
                  label='Select previous run',
                  help='Select a previous run to continue from.')
    form.addParam('skipTraining', params.BooleanParam,
                  default=False,
                  condition='modelInitialization!= %s ' % self.ADD_MODEL_TRAIN_NEW,
                  label='Skip training and score directly with pretrained model?',
                  help='If you set to *No*, you should provide training set. If set to *Yes* '
                       'the coordinates will be directly scored using the pretrained/previous model')
    form.addParam('inputCoordinates', params.MultiPointerParam,
                  pointerClass='SetOfCoordinates', allowsNull=False,
                  label="Input coordinates",
                  help='Select the set of coordinates to compare')
    form.addParam('consensusRadius', params.FloatParam, default=0.1,
                  label="Relative Radius", expertLevel=params.LEVEL_ADVANCED,
                  validators=[params.Positive],
                  help="All coordinates within this radius "
                       "(as fraction of particle size) "
                       "are presumed to correspond to the same particle")
    form.addParam('threshold', params.FloatParam, default=0.5,
                  label='Tolerance threshold',
                  expertLevel=params.LEVEL_ADVANCED,
                  help='The method attach a score between 0 and 1, where 0 '
                       'if for _bad_ particles and 1 for _good_ ones. '
                       'Introduce -1 to let pass all for posterior inspection.')

    # ---------------------------- Preprocess ----------------------------
    form.addSection(label='Preprocess')
    form.addParam('notePreprocess', params.LabelParam,
                  label='How to extract particles from micrograph',
                  help='Our method, internally, uses particles that are '
                       'extracted from preprocess micrographs. '
                       'Preprocess steps are:\n'
                       '1) mic donwsampling to the required size such that '
                       'the particle box size become 128 px. \n E.g. xmipp_transform_downsample -i'
                       ' in/100_movie_aligned.mrc -o out1/100_movie_aligned.mrc --step newSamplingRate --method fourier\n'
                       '2) mic normalization to 0 mean and 1 std and mic contrast inversion to have white particles.\n '
                       ' E.g. '
                       ' xmipp_transform_normalize -i out1/101_movie_aligned.mrc -o out2/101_movie_aligned.mrc --method '
                       'OldXmipp [ --invert ]\n'
                       '3) particles extraction.\n E.g. xmipp_micrograph_scissor -i out2/101_movie_aligned.mrc '
                       '--pos particles@Runs/101_movie_aligned.pos -o out3/105_movie_aligned_particles '
                       ' --Xdim 128 --downsampling newSamplingRate --fillBorders ( Correct your coordinates with '
                       'newSamplingRate if needed)\n'
                       '4) OPTIONAL: phase flipping using CTF.\n xmipp_ctf_phase_flip -i '
                       'particles/105_movie_aligned_noDust.xmp -o particles/105_movie_aligned_flipped.xmp '
                       '--ctf ctfPath/105_movie_aligned.ctfParam --sampling newSamplingRate')
    # The preprocessing step only adds the contrast inversion when this
    # parameter is *No*, so *Yes* must mean "already inverted" (the help used
    # to state the opposite).
    form.addParam('skipInvert', params.BooleanParam, default=False,
                  label='Did you invert the micrographs contrast (particles are bright now)?',
                  help='If you invert the contrast, your particles will be white over a black background in the micrograph. '
                       'We use white particles. Select *Yes* if you already have inverted the constrast in the micrograph'
                       ' so that we can extract white particles directly')
    form.addParam('ignoreCTF', params.BooleanParam, default=True,
                  label='Ignore CTF',
                  help='Deep Consensus extracts particles. Do you want to ignore CTF for '
                       'particle extraction')
    form.addParam('ctfRelations', params.RelationParam, allowsNull=True,
                  relationName=RELATION_CTF, condition="not ignoreCTF",
                  attributeName='_getInputMicrographs',
                  label='CTF estimation',
                  help='Choose some CTF estimation related to input '
                       'micrographs. \nCTF estimation is needed if you '
                       'want to do phase flipping or you want to '
                       'associate CTF information to the particles.')

    # ----------------------------- Training -----------------------------
    form.addSection(label='Training')
    form.addParam('nEpochs', params.FloatParam,
                  label="Number of epochs", default=5.0,
                  condition=trainCond,
                  help='Number of epochs for neural network training.')
    form.addParam('learningRate', params.FloatParam,
                  label="Learning rate", default=1e-4,
                  condition=trainCond,
                  help='Learning rate for neural network training')
    form.addParam('auto_stopping', params.BooleanParam,
                  label='Auto stop training when convergency is detected?',
                  default=True, condition=trainCond,
                  help='If you set to *Yes*, the program will automatically '
                       'stop training if there is no improvement for '
                       'consecutive 2 epochs, learning rate will be '
                       'decreased by a factor 10. '
                       'If learningRate_t < 0.01*learningrate_0 training will stop. '
                       'Warning: Sometimes convergency seems to be reached, '
                       'but after time, improvement can still happen. '
                       'Not recommended for very small data sets (<100 true particles)')
    form.addParam('maxValAcc', params.FloatParam,
                  label="Training mean val_acc threshold", default=0.95,
                  condition=trainCond,
                  help='Stop training if at any training batch the selected threshold is achieved')
    form.addParam('l2RegStrength', params.FloatParam,
                  label="Regularization strength",
                  default=1e-5, expertLevel=params.LEVEL_ADVANCED,
                  condition=trainCond,
                  help='L2 regularization for neural network weights. '
                       'Make it bigger if suffering overfitting (validation acc decreases but training acc increases)\n'
                       'Typical values range from 1e-1 to 1e-6')
    form.addParam('nModels', params.IntParam,
                  label="Number of models for ensemble",
                  default=3, expertLevel=params.LEVEL_ADVANCED,
                  condition="modelInitialization==%s or (not skipTraining and modelInitialization==%s)" % (
                      self.ADD_MODEL_TRAIN_NEW, self.ADD_MODEL_TRAIN_PRETRAIN),
                  help='Number of models to fit in order to build an ensamble. '
                       'Tipical values are 1 to 5. The more the better '
                       'until a point where no gain is obtained. '
                       'Each model increases running time linearly')
    form.addParam('toTrainDataSize', params.IntParam,
                  label="Expected number of particles to use for training", default=20000,
                  help='Number of particles for training the CNN. Once surpassed, there will not be more training\n'
                       'Set to -1 to use all the particles found\n'
                       'It will determine the size of the CNN. '
                       'Usually, the bigger the better, but more training data is needed\n'
                       'Three CNN sizes: n < 1500 | 1500 <= n < 20000 | n >= 20000')
    form.addParam('doTesting', params.BooleanParam, default=False,
                  label='Perform testing after training?', expertLevel=params.LEVEL_ADVANCED,
                  condition=trainCond,
                  help='If you set to *Yes*, you should select a testing '
                       'positive set and a testing negative set')
    form.addParam('testTrueSetOfParticles', params.PointerParam,
                  label="Set of positive test particles", expertLevel=params.LEVEL_ADVANCED,
                  pointerClass='SetOfParticles', condition='doTesting',
                  help='Select the set of ground true positive particles.')
    form.addParam('testFalseSetOfParticles', params.PointerParam,
                  label="Set of negative test particles", expertLevel=params.LEVEL_ADVANCED,
                  pointerClass='SetOfParticles', condition='doTesting',
                  help='Select the set of ground false positive particles.')

    # --------------------- Additional training data ---------------------
    form.addSection(label='Additional training data')
    form.addParam('addTrainingData', params.EnumParam,
                  condition=trainCond,
                  choices=self.ADD_DATA_TRAIN_TYPES,
                  default=self.ADD_DATA_TRAIN_PRECOMP,
                  label='Additional training data',
                  help='If you set to *%s*, only the AND and RANDOM will be used for training.\n'
                       'If you set to *%s*, a precompiled additional training set will be added '
                       'to the AND and RANDOM sets for training.\n'
                       'If you set to *%s*, you can provide your own data that will be added to '
                       'the AND and RANDOM sets for training.\n'
                       % tuple(self.ADD_DATA_TRAIN_TYPES))
    form.addParam('trainingDataType', params.EnumParam,
                  condition=customCond,
                  choices=self.ADD_DATA_TRAIN_CUSTOM_OPT,
                  default=self.ADD_DATA_TRAIN_CUSTOM_OPT_COORS,
                  label='Additional training data',
                  help='You can provide either particles or coordinates as additional training set. '
                       'If you provide coordinantes, they have to be picked from the same micrographs that the '
                       'inputs\n If you provide particles, they have to be processed in the same way that the protocol '
                       'does (128x128 pixels and withe particles). Thus, what the protocol does is to perform the '
                       'following steps:\n'
                       '1) mic donwsampling to the required size such that '
                       'the particle box size become 128 px. \n E.g. xmipp_transform_downsample -i'
                       ' in/100_movie_aligned.mrc -o out1/100_movie_aligned.mrc --step newSamplingRate --method fourier\n'
                       '2) mic normalization to 0 mean and 1 std and mic contrast inversion to have WHITE particles.\n '
                       'E.g. '
                       ' xmipp_transform_normalize -i out1/101_movie_aligned.mrc -o out2/101_movie_aligned.mrc --method '
                       'OldXmipp [ --invert ]\n'
                       '3) particles extraction.\n E.g. xmipp_micrograph_scissor -i out2/101_movie_aligned.mrc '
                       '--pos particles@Runs/101_movie_aligned.pos -o out3/105_movie_aligned_particles '
                       ' --Xdim 128 --downsampling newSamplingRate --fillBorders ( Correct your coordinates with '
                       'newSamplingRate if needed)\n'
                       '4) OPTIONAL: phase flipping using CTF.\n xmipp_ctf_phase_flip -i '
                       'particles/105_movie_aligned_noDust.xmp -o particles/105_movie_aligned_flipped.xmp '
                       '--ctf ctfPath/105_movie_aligned.ctfParam --sampling newSamplingRate\n'
                       'Then, particles are extracted with no further alteration.\n'
                       'Please ensure that the additional particles have been '
                       'preprocessed as indicated before.\n')
    form.addParam('trainTrueSetOfParticles', params.PointerParam,
                  label="Positive train particles 128px (optional)",
                  pointerClass='SetOfParticles', allowsNull=True,
                  condition=customPartsCond,
                  help='Select a set of true positive particles. '
                       'Take care of the preprocessing (128x128 pixels, contrast inverted (white particles), possibly '
                       'CTF corrected')
    form.addParam('trainTrueSetOfCoords', params.PointerParam,
                  label="Positive coordinates(optional)",
                  pointerClass='SetOfCoordinates', allowsNull=True,
                  condition=customCoordsCond,
                  help="Select a set of true coordinates collected from the "
                       "same microgaphs that the input")
    form.addParam('trainPosWeight', params.IntParam, default='1',
                  label="Weight of positive additional train data",
                  condition=customCond,
                  allowsNull=True,
                  help='Select the weigth for the additional train set of '
                       'positive particles. The weight value indicates '
                       'the number of times each image may be included at '
                       'most per epoch. Deep consensus internal particles '
                       'are weighted with 1. '
                       'If weight is -1, weight will be calculated such that '
                       'the contribution of additional data is equal to '
                       'the contribution of internal particles')
    form.addParam('trainFalseSetOfParticles', params.PointerParam,
                  label="Negative train particles 128px (optional)",
                  pointerClass='SetOfParticles', allowsNull=True,
                  condition=customPartsCond,
                  help='Select a set of false positive particles. '
                       'Take care of the preprocessing: 128x128 pixels, '
                       'contrast inverted (white particles), '
                       'possibly CTF corrected')
    form.addParam('trainFalseSetOfCoords', params.PointerParam,
                  label="Negative coordinates(optional)",
                  pointerClass='SetOfCoordinates', allowsNull=True,
                  condition=customCoordsCond,
                  help="Select a set of incorrect coordinates collected from "
                       "the same microgaphs that the input")
    form.addParam('trainNegWeight', params.IntParam, default='1',
                  label="Weight of negative additional train data",
                  condition=customCond,
                  allowsNull=True,
                  help='Select the weigth for the additional train set of '
                       'negative particles. The weight value indicates '
                       'the number of times each image may be included at '
                       'most per epoch. Deep consensus internal particles '
                       'are weighted with 1. If weight is -1, weight '
                       'will be calculated such that the contribution of '
                       'additional data is equal to the contribution of '
                       'internal particles')

    # ----------------------------- Streaming -----------------------------
    form.addSection(label='Streaming')
    form.addParam('doPreliminarPredictions', params.BooleanParam, default=False,
                  label="Perform preliminar predictions with on training CNN",
                  help='The protocol will make preliminar preedictions with the network before it is fully trained\n'
                       'These preliminar results will be stored in a different output set')
    form.addParam('extractingBatch', params.IntParam, default='5',
                  label="Extraction batch size",
                  help='Size of the extraction batches (in number of micrographs)')
    form.addParam('trainingBatch', params.IntParam, default='5',
                  label="Training batch size",
                  help='Size of the training batches (in number of micrographs). '
                       'The CNN needs a minimum number of particles to train for each batch, if there are not'
                       ' enough particles, the batch size must be increased')
def _validate(self):
    """Check the form values and return the list of error messages.

    An empty list means that no problem was found. The list is finally
    passed through ``validateDLtoolkit``, whose result is returned.
    """
    problems = []

    if self._getBoxSize() < DEEP_PARTICLE_SIZE:
        problems.append("Error, too small particles (needed 128 px), "
                        "have you provided already downsampled micrographs? "
                        "If so, use original ones")

    ctfRequested = not self.ignoreCTF.get()
    if ctfRequested and self.ctfRelations.get() is None:
        problems.append("Error, CTFs must be provided to compute phase flip. "
                        "Please, provide a set of CTFs.")

    # Every optional particle set must already have the network box size.
    sizeChecks = (
        (self.trainTrueSetOfParticles,
         "Error, trainTrueSetOfParticles needed to be 128 px"),
        (self.trainFalseSetOfParticles,
         "Error, trainFalseSetOfParticles needed to be 128 px"),
        (self.testTrueSetOfParticles,
         "Error, testTrueSetOfParticles needed to be 128 px"),
        (self.testFalseSetOfParticles,
         "Error, testFalseSetOfParticles needed to be 128 px"),
    )
    for pointer, message in sizeChecks:
        particles = pointer.get()
        if particles and particles.getXDim() != DEEP_PARTICLE_SIZE:
            problems.append(message)

    singleInput = len(self.inputCoordinates) == 1
    if singleInput and not self.justPredict():
        problems.append("Error, just one coordinate set provided but trained desired. Select pretrained "+
                        "model or previous run model and *No* continue training from previous trained model "+
                        " to score coordiantes directly or add another set of particles and continue training")

    needsPrecompiled = self.addTrainingData.get() == self.ADD_DATA_TRAIN_PRECOMP
    problems = self.validateDLtoolkit(problems, model="deepConsensus",
                                      assertModel=needsPrecompiled)
    return problems
#--------------------------- INSERT steps functions ---------------------------
def _doContinue(self):
    """Tell whether the model comes from a previous run of this protocol."""
    chosenInit = self.modelInitialization.get()
    return chosenInit == self.ADD_MODEL_TRAIN_PREVRUN
def justPredict(self):
    """Tell whether training is skipped, i.e. 'skipTraining' equals True."""
    skipFlag = self.skipTraining.get()
    # Equality (not truthiness) is kept on purpose: an unset value gives False.
    return skipFlag == True  # noqa: E712
def _useNewModel(self):
    """Tell whether a new model was selected in 'modelInitialization'."""
    chosenInit = self.modelInitialization.get()
    return chosenInit == self.ADD_MODEL_TRAIN_NEW
def _usePretrainedModel(self):
    """Tell whether the pretrained model was selected in 'modelInitialization'."""
    chosenInit = self.modelInitialization.get()
    return chosenInit == self.ADD_MODEL_TRAIN_PRETRAIN
def _insertAllSteps(self):
    """Insert the three fixed steps of the protocol.

    'initializeStep' is inserted first; 'lastRoundStep' and 'endProtocolStep'
    are inserted with ``wait=True``, the former depending on the
    initialization and the latter on 'lastRoundStep'.
    """
    # Start with empty placeholders.
    self.inputMicrographs = None
    self.boxSize = None
    self.coordinatesDict = {}

    initStepId = self._insertFunctionStep("initializeStep")
    self.initDeps = [initStepId]
    self.lastStep = self._insertFunctionStep(
        'lastRoundStep', wait=True, prerequisites=self.initDeps)
    self.endStep = self._insertFunctionStep(
        'endProtocolStep', wait=True, prerequisites=[self.lastStep])
def getGpusList(self, separator):
    """Return the GPU ids of the steps executor joined by ``separator``.

    :param separator: string placed between consecutive GPU ids. It may have
        any length, including zero.
    :return: the joined ids, or an empty string when there is no GPU.
    """
    # str.join never leaves a trailing separator, so nothing has to be cut
    # afterwards (cutting one character was wrong for separators whose
    # length is not exactly one).
    return separator.join(str(gpuId)
                          for gpuId in self._stepsExecutor.getGpuList())
def setGPU(self, oneGPU=False):
    """Expose the protocol GPUs through ``CUDA_VISIBLE_DEVICES``.

    :param oneGPU: when True only the first GPU id is made visible,
        otherwise all of them.
    :return: the value written to ``CUDA_VISIBLE_DEVICES``.
    """
    gpus = self.getGpusList(",")
    if oneGPU:
        # Keep the whole first id; indexing the string would only keep its
        # first character and truncate ids such as "10".
        gpus = gpus.split(",")[0]
    os.environ["CUDA_VISIBLE_DEVICES"] = gpus
    self.info(f'Visible GPUS: {gpus}')
    return gpus
def _stepsCheck(self):
    '''Checks if new steps can be executed

    Depending on the current state, a call may insert steps for micrograph
    preprocessing, for the 'OR' consensus and extraction, for training and
    for prediction. The ids of the steps inserted in this call are gathered
    in ``self.newSteps`` and added as prerequisites of the waiting
    'lastRoundStep' and 'endProtocolStep' steps, which are released at the
    end when appropriate. Nothing is done once ``self.ENDED`` is set.

    NOTE(review): the nesting of this method was reconstructed from a listing
    without indentation - confirm against the original file.
    '''
    self.newSteps = []
    if self.ENDED:
        return
    # Preprocessing: only when no preprocessing step is already pending and
    # only on every other check (even values of self.counter).
    if len(self.readyToPreprocessMics(shared=False)) > 0 and not self.PREPROCESSING and (self.counter%2 == 0):
        self.PREPROCESSING = True
        self.lastDeps = [self._insertFunctionStep("preprocessMicsStep", prerequisites=self.initDeps)]
    #Particle extraction OR (for predictions): needs a full extraction batch
    if len(self.readyToExtractMicFns('OR')) >= self.extractingBatch.get() and not self.EXTRACTING['OR']:
        self.EXTRACTING['OR'] = True
        self.newSteps += self.insertCaculateConsensusSteps('OR', prerequisites=self.initDeps)
        self.newSteps += self.insertExtractPartSteps('OR', prerequisites=self.newSteps)
    #Particle extraction for training and training
    trainedParams = self.loadTrainedParams()
    # -1 means "no limit", represented by a very large number
    toTrainSize = self.toTrainDataSize.get() if self.toTrainDataSize.get() != -1 else 1e10
    if self.trainingOn() and trainedParams['posParticlesTrained'] < toTrainSize and trainedParams['trainingPass'] != '':
        self.doTraining()
    elif trainedParams['posParticlesTrained'] >= toTrainSize and trainedParams['trainingPass'] != '':
        # Enough particles were trained: the training pass is set to '' so
        # that neither branch is entered again.
        lastTrainingPass = trainedParams['trainingPass']
        self.retrievePreviousPassModel('', lastTrainingPass)
        trainedParams['trainingPass'] = ''
        self.saveTrainedParams(trainedParams)
    #Prediction
    if self.networkReadyToPredict() and self.cnnFree() and self.predictionsOn() and len(self.readyToPredictMicFns()) > 0:
        self.PREDICTING = True
        depPredict = self._insertFunctionStep('predictCNN', prerequisites= self.newSteps)
        # Both follow-up steps depend only on the prediction step.
        self.newSteps += [self._insertFunctionStep('endPredictingStep', prerequisites=[depPredict])]
        self.newSteps += [self._insertFunctionStep('createOutputStep', prerequisites=[depPredict])]
    #Last round with batch size == 1 to include all input
    if self.allFree() and not self.LAST_ROUND and self.checkIfParentsFinished():
        # Step ids are 1-based, hence the -1 when indexing self._steps.
        protLast = self._steps[self.lastStep - 1]
        protLast.addPrerequisites(*self.newSteps)
        protLast.setStatus(STATUS_NEW)
        self.updateSteps()
    # Fetched unconditionally: the final check below needs it when
    # LAST_ROUND is already set.
    protEnd = self._steps[self.endStep-1]
    protEnd.addPrerequisites(*self.newSteps)
    #Ending the protocol when everything is done
    if self.LAST_ROUND and self.allFree():
        protEnd.setStatus(STATUS_NEW)
        self.updateSteps()
    self.counter += 1
def endPredictingStep(self):
    """Clear the ``PREDICTING`` flag once a prediction has finished."""
    self.PREDICTING = False
def endTrainingStep(self):
    """Close a training batch and decide whether training is over.

    The current trained parameters are saved and the ``TRAINING`` flag is
    cleared. Training is considered finished when the mean accuracy exceeds
    the 'maxValAcc' threshold, or when the number of trained positive
    particles reached 'toTrainDataSize' while a training pass is still
    active. In that case ``retrievePreviousPassModel`` is called and the
    stored training pass is set to ''.
    """
    self.saveTrainedParams(self.curTrainedParams)
    self.TRAINING = False

    mean_acc = self.loadMeanAccuracy()
    threshold = self.maxValAcc.get()
    trainedParams = self.loadTrainedParams()
    # -1 means "no limit", represented by a very large number
    toTrainSize = self.toTrainDataSize.get() if self.toTrainDataSize.get() != -1 else 1e10

    # The mean accuracy may be unavailable (None): it must never be compared
    # with the threshold in that case.
    accuracyReached = mean_acc is not None and mean_acc > threshold
    if accuracyReached or \
            (trainedParams['posParticlesTrained'] >= toTrainSize and trainedParams['trainingPass'] != ''):
        lastTrainingPass = trainedParams['trainingPass']
        self.retrievePreviousPassModel('', lastTrainingPass)
        trainedParams['trainingPass'] = ''
        self.saveTrainedParams(trainedParams)
        if accuracyReached:
            print('Mean accuracy %f surpass training accuracy threshold %f -> end training'
                  % (mean_acc, threshold))
def lastRoundStep(self):
    '''Starts the last round of training and predictions with the remaining
    micrographs when all the inputs have arrived'''
    # Both batch sizes are set to one and the last round is flagged.
    for batchParam in (self.extractingBatch, self.trainingBatch):
        batchParam.set(1)
    self.LAST_ROUND = True
def endProtocolStep(self):
    '''Finish the protocol with a final prediction using the final CNN

    When no training pass is stored any more, the output is simply updated
    with ``closeStream=True``. Otherwise ``retrievePreviousPassModel`` is
    called for the stored pass, the stored pass is reset to '' and a last
    'predictCNN' step followed by a 'createOutputStep' step is inserted.
    '''
    pendingPass = self.loadTrainedParams()['trainingPass']

    if pendingPass == '':
        self.updateOutput(closeStream=True)
        self.ENDED = True
        return

    self.retrievePreviousPassModel('', pendingPass)
    self.uploadTrainedParam('trainingPass', '')
    self.ENDED = True
    self.depLastPredict = self._insertFunctionStep(
        'predictCNN', prerequisites=[self.endStep])
    self._insertFunctionStep(
        'createOutputStep', True, prerequisites=[self.depLastPredict])
def initializeStep(self):
    """
    Create paths where data will be saved

    Besides creating the folders, the step writes the optional test and
    additional training particle sets as metadata files, sets up the
    'trainingPass' trained parameter according to the selected model
    initialization, dumps the preprocessing parameters to
    'preprocess_params.json' and initializes the attributes used to build
    the outputs.

    NOTE(review): the nesting of this method was reconstructed from a listing
    without indentation - confirm against the original file.
    """
    if self.doTesting.get() and self.testTrueSetOfParticles.get() and self.testFalseSetOfParticles.get():
        writeSetOfParticles(self.testTrueSetOfParticles.get(),
                            self._getExtraPath("testTrueParticlesSet.xmd"))
        writeSetOfParticles(self.testFalseSetOfParticles.get(),
                            self._getExtraPath("testFalseParticlesSet.xmd"))

    addDataMode = self.addTrainingData.get()
    if addDataMode == self.ADD_DATA_TRAIN_CUST:
        # The enum value is read with .get(), as for every other parameter,
        # instead of comparing the parameter object itself with an int.
        if self.trainingDataType.get() == self.ADD_DATA_TRAIN_CUSTOM_OPT_PARTS:
            if self.trainTrueSetOfParticles.get():
                writeSetOfParticles(self.trainTrueSetOfParticles.get(),
                                    self._getExtraPath("trainTrueParticlesSet.xmd"))
            if self.trainFalseSetOfParticles.get():
                writeSetOfParticles(self.trainFalseSetOfParticles.get(),
                                    self._getExtraPath("trainFalseParticlesSet.xmd"))
    elif addDataMode == self.ADD_DATA_TRAIN_PRECOMP:
        writeSetOfParticles(self.retrieveTrainSets(),
                            self._getTmpPath("addNegTrainParticles.xmd"))

    # Folder for the preprocessed micrographs plus one coordinates folder
    # per consensus mode
    makePath(self._getExtraPath(self.PRE_PROC_MICs_PATH))
    for mode in ["AND", "OR", "NOISE"]:
        consensusCoordsPath = self.CONSENSUS_COOR_PATH_TEMPLATE % mode
        makePath(self._getExtraPath(consensusCoordsPath))

    if self._doContinue():
        # '' is the training pass used when training is skipped, 1 otherwise
        trPass = '' if self.skipTraining.get() else 1
        self.retrievePreviousRunModel(self.continueRun.get(), trPass)
        self.uploadTrainedParam('trainingPass', trPass)

    if self._usePretrainedModel() and self.skipTraining.get():
        trPass = ''
        self.uploadTrainedParam('trainingPass', trPass)

    # A new model always has to be trained
    if self._useNewModel() and self.skipTraining.get():
        self.skipTraining.set(False)

    preprocessParamsFname = self._getExtraPath("preprocess_params.json")
    preprocParams = self.getPreProcParamsFromForm()
    with open(preprocessParamsFname, "w") as f:
        json.dump(preprocParams, f)

    #Initializing outputs
    self.inSamplingRate = self._getInputMicrographs().getSamplingRate()
    self.USING_INPUT_MICS = False
    self.preCorrectedParSet, self.preCoordSet = [], []
def preprocessMicsStep(self):
    '''Step which preprocesses the input micrographs

    A text file listing the micrographs to preprocess (plus their CTF
    parameter files unless the CTF is ignored) is written and
    'xmipp_preprocess_mics' is run on it. At most PREPROCESS_BATCH_MAX
    micrographs are handled per call. The PREPROCESSING flag is cleared
    at the end.

    NOTE(review): the nesting of this method was reconstructed from a listing
    without indentation - confirm against the original file.
    '''
    micIds = self.getMicsIds(filterOutNoCoords=False)
    if len(micIds) > 0:
        inputMics = self._getInputMicrographs()
        micsFnameSet = {inputMics[micId].getMicName(): inputMics[micId].getFileName()
                        for micId in micIds
                        if inputMics[micId] is not None}  # to skip failed mics
        self.USING_INPUT_MICS = False
        # Folder of the first micrograph, used as prefix for all of them
        firstMicFn = list(micsFnameSet.values())[0]
        dirName = os.path.dirname(firstMicFn)

        toPreprocessMicFns = self.readyToPreprocessMics(shared=False)
        print('New mics to be preprocessed: %d' % len(toPreprocessMicFns))
        if len(toPreprocessMicFns) > self.PREPROCESS_BATCH_MAX:
            toPreprocessMicFns = toPreprocessMicFns[:self.PREPROCESS_BATCH_MAX]

        # Header line followed by one line per micrograph
        if self.ignoreCTF.get():
            contentLines = ["#mics\n"]
            for micFileName in toPreprocessMicFns:
                contentLines.append("%s\n" % os.path.join(dirName, micFileName))
        else:
            contentLines = ["#mics ctfs\n"]
            setOfMicCtf = self.ctfRelations.get()
            assert setOfMicCtf is not None, "Error, CTFs must be provided to compute phase flip"
            for ctf in setOfMicCtf:
                ctf_mic = ctf.getMicrograph()
                ctfFnName = ctf_mic.getFileName()
                ctfBaseName = os.path.basename(ctfFnName)
                if ctfBaseName in toPreprocessMicFns:
                    ctf_mic.setCTF(ctf)
                    fnCTF = self._getTmpPath("%s.ctfParam" % os.path.basename(ctfFnName))
                    micrographToCTFParam(ctf_mic, fnCTF)
                    contentLines.append("%s %s\n" % (ctfFnName, fnCTF))
        preproMicsContent = "".join(contentLines)

        inputsFname = self._getTmpPath("preprocMic_inputs.txt")
        ouputDir = self._getExtraPath(self.PRE_PROC_MICs_PATH)
        nThrs = self.numberOfThreads.get()
        with open(inputsFname, "w") as inputsFile:
            inputsFile.write(preproMicsContent)

        downFactor = self._getDownFactor()
        args = "-i %s -s %s -d %s -o %s -t %d" % (inputsFname, self.inSamplingRate,
                                                  downFactor, ouputDir, nThrs)
        if not self.skipInvert.get():
            args += " --invert_contrast"
        if not self.ignoreCTF.get():
            args += " --phase_flip"
        self.runJob('xmipp_preprocess_mics', args, numberOfMpi=1)

    self.PREPROCESSING = False
def insertCaculateConsensusSteps(self, mode, prerequisites):
    '''Insert the steps necessary for calculating the consensus coordinates of type "mode"

    Two chained steps are inserted: 'calculateCoorConsensusStep', which depends
    on ``prerequisites``, and 'loadCoords', which depends on the former.

    :return: list holding the id of the 'loadCoords' step.
    '''
    consensusDir = self._getExtraPath(self.CONSENSUS_COOR_PATH_TEMPLATE % mode)
    if not os.path.exists(consensusDir):
        makePath(consensusDir)

    consensusStepId = self._insertFunctionStep(
        'calculateCoorConsensusStep', consensusDir, mode,
        prerequisites=prerequisites)
    loadStepId = self._insertFunctionStep(
        'loadCoords', consensusDir, mode, prerequisites=[consensusStepId])
    return [loadStepId]
def calculateCoorConsensusStep(self, outCoordsDataPath, mode):
    '''Calculates the consensus coordinates from micrographs whose particles haven't been extracted yet in "mode"

    Nothing is done for mode 'AND' once the stored training pass is '', nor
    when ``checkIfNewMics`` reports no new micrographs for ``mode``.

    NOTE(review): the nesting of this method was reconstructed from a listing
    without indentation - confirm against the original file.
    '''
    trainedParams = self.loadTrainedParams()
    if trainedParams['trainingPass'] == '' and mode == 'AND':
        return
    if not self.checkIfNewMics(mode):
        return

    #Only calculate consensus for coordinates that has not been extracted yet
    extractedSetOfCoordsFns = [pwutils.path.replaceBaseExt(micFn, 'pos')
                               for micFn in self.getExtractedMicFns(mode)]
    self.waitFreeInputCoords()
    inputCoordsFnames = self.getInpCoordsFns(mode, extractedSetOfCoordsFns)
    self.USING_INPUT_COORDS = False

    # Header followed by one line per micrograph listing its input pos files
    inputFileHeader = "#pos_i\n"
    rows = [" ".join(inputCoordsFnames[baseName]) + "\n"
            for baseName in inputCoordsFnames]
    inputFileStr = inputFileHeader + "".join(rows)
    assert len(inputFileStr) > len(inputFileHeader), "Error, no consensus can be computed as there " \
                                                     "is a mismatch in coordinate sets filenames"

    #Concensus decision
    if mode == "AND":
        consensus = UNION_INTERSECTIONS
    else:
        consensus = OR
    # THE MODE CAN BE A NEW PARAMETER TO BE ASKED TO THE USER
    configFname = self._getTmpPath("consensus_%s_inputs.txt" % (mode))
    with open(configFname, "w") as configFile:
        configFile.write(inputFileStr)

    args = "-i %s -s %d -c %s -d %f -o %s -t %d" % (
        configFname, self._getBoxSize(), consensus, self.consensusRadius.get(),
        outCoordsDataPath, self.numberOfThreads.get())
    self.runJob('xmipp_coordinates_consensus', args, numberOfMpi=1)
    self.TO_EXTRACT_MICFNS[mode] = self.readyToExtractMicFns(mode)
    return
def pickNoise(self):
    '''Find noise coordinates from micrographs in order to use them as negatives in the training process

    Nothing is done when the stored training pass is '' or when there is no
    micrograph left to pick noise from.

    NOTE(review): the nesting of this method was reconstructed from a listing
    without indentation - confirm against the original file.
    '''
    trainedParams = self.loadTrainedParams()
    if trainedParams['trainingPass'] == '':
        return

    orPosDir = self._getExtraPath(self.CONSENSUS_COOR_PATH_TEMPLATE % "OR")
    outputPosDir = self._getExtraPath(self.CONSENSUS_COOR_PATH_TEMPLATE % "NOISE")
    if "OR" not in self.coordinatesDict:  # fill self.coordinatesDict['OR']
        self.loadCoords(orPosDir, 'OR')

    # Getting the extracted mics where noise have not been picked yet
    micsDir = self._getExtraPath(self.PRE_PROC_MICs_PATH)
    extractedOrFns = set(self.getExtractedMicFns('OR'))
    noiseDoneOrPending = (set(self.getExtractedMicFns('NOISE')) |
                          set(self.TO_EXTRACT_MICFNS['NOISE']))
    toPickNoiseFns = list(extractedOrFns - noiseDoneOrPending)
    if len(toPickNoiseFns) <= 0:
        return

    micNoiseSet = self.loadMicSetFromFns(micsDir, toPickNoiseFns)
    toNoiseSetOfCoords = self._createSetOfCoordinates(micNoiseSet)
    toNoiseSetOfCoords.setBoxSize(self._getBoxSize())
    readSetOfCoordinates(orPosDir, micSet=micNoiseSet, coordSet=toNoiseSetOfCoords)

    # Write the tonoise mic files into a tmp directory
    coordsDir = self._getTmpPath(IN_COORDS_POS_DIR_BASENAME)
    toPickMicsDir = micsDir + '_toPickNoise'
    toPickCoorsDir = coordsDir + '_toPickNoise'
    if not os.path.exists(coordsDir):
        makePath(coordsDir)
    # Both working folders are recreated from scratch
    workDirs = (toPickMicsDir, toPickCoorsDir)
    for workDir in workDirs:
        if os.path.exists(workDir):
            cleanPath(workDir)
    for workDir in workDirs:
        makePath(workDir)

    writeSetOfCoordinates(coordsDir, toNoiseSetOfCoords)
    for micFn in toPickNoiseFns:
        posFn = pwutils.path.replaceBaseExt(micFn, 'pos')
        shutil.copyfile(micsDir + '/' + micFn, toPickMicsDir + '/' + micFn)
        shutil.copyfile(coordsDir + '/' + posFn, toPickCoorsDir + '/' + posFn)

    argsDict = pickNoise_prepareInput(toNoiseSetOfCoords, self._getTmpPath())
    argsDict['toPickMicsDir'] = toPickMicsDir
    argsDict['toPickCoorsDir'] = toPickCoorsDir
    argsDict["outputPosDir"] = outputPosDir
    argsDict["nThrs"] = self.numberOfThreads.get()
    argsDict["nToPick"] = -1
    argsTemplate = (" -i %(toPickMicsDir)s -c %(toPickCoorsDir)s -o %(outputPosDir)s -s %(boxSize)s " +
                    "-n %(nToPick)s -t %(nThrs)s")
    args = argsTemplate % argsDict
    self.runJob('xmipp_pick_noise', args, numberOfMpi=1)

    self.loadCoords(outputPosDir, 'NOISE', micSet=micNoiseSet)
    self.TO_EXTRACT_MICFNS['NOISE'] = toPickNoiseFns
    print('Adding to extract {} {} micrographs'.format(len(toPickNoiseFns), 'NOISE'))
    return
def loadCoords(self, posCoorsPath, mode, micSet=None):
    """Load the coordinates of ``mode`` into ``self.coordinatesDict[mode]``.

    Nothing is done for mode 'AND' once the stored training pass is ''.

    :param posCoorsPath: folder with the coordinate (pos) files to read.
    :param mode: key of ``self.coordinatesDict`` to fill.
    :param micSet: optional set of micrographs. When given and not empty,
        only the coordinates of those micrographs are read and appended to
        the ones already loaded for ``mode``. Otherwise all the coordinates
        found in ``posCoorsPath`` are read into a fresh sqlite file.
    :raises AssertionError: when the whole consensus set does not hold more
        than MIN_NUM_CONSENSUS_COORDS coordinates.

    NOTE(review): the nesting of this method was reconstructed from a listing
    without indentation - confirm against the original file.
    """
    #Upload coords sqlite
    trainedParams = self.loadTrainedParams()
    if trainedParams['trainingPass'] == '' and mode == 'AND':
        return

    if micSet is not None and len(micSet):
        #Load coordinates from an specific set of mics
        batchSetOfCoordinates = self._createSetOfCoordinates(micSet)
        batchSetOfCoordinates.setBoxSize(self._getBoxSize())
        readSetOfCoordinates(posCoorsPath, micSet=micSet, coordSet=batchSetOfCoordinates)
        if mode in self.coordinatesDict:
            for newCoord in batchSetOfCoordinates:
                # Copied without id so that it is appended as a new item
                apCoord = Coordinate()
                apCoord.copy(newCoord, copyId=False)
                self.coordinatesDict[mode].append(apCoord)
        else:
            self.coordinatesDict[mode] = batchSetOfCoordinates
        return

    # sqliteName already includes the extra path: it must not be wrapped in
    # _getExtraPath again, otherwise the stale file is never found.
    sqliteName = self._getExtraPath(self.CONSENSUS_COOR_PATH_TEMPLATE % mode) + ".sqlite"
    if os.path.isfile(sqliteName):
        cleanPath(sqliteName)
    self.waitFreeInputCoords()
    totalSetOfCoordinates = readSetOfCoordsFromPosFnames(posCoorsPath,
                                                         setOfInputCoords=self.inputCoordinates[0].get(),
                                                         sqliteOutName=sqliteName, write=True)
    print("Coordinates %s size: %d" % (mode, totalSetOfCoordinates.getSize()))
    assert totalSetOfCoordinates.getSize() > MIN_NUM_CONSENSUS_COORDS, \
        ("Error, the consensus (%s) of your input coordinates was too small (%s). " +
         "It must be > %s. Try a different input..."
         ) % (mode, str(totalSetOfCoordinates.getSize()), str(MIN_NUM_CONSENSUS_COORDS))
    self.coordinatesDict[mode] = totalSetOfCoordinates
    self.USING_INPUT_COORDS = False
def extractParticles(self, mode):
    """Extract the particles of the micrographs queued for ``mode`` at their coordinates.

    The micrograph/coordinate pairs are listed in a text file that is passed
    to the ``xmipp_extract_particles`` program. Nothing is extracted for the
    training modes (AND, NOISE, ADDITIONAL_COORDS_*) once the training has
    finished, nor when no micrograph is queued for ``mode``.
    """
    trainingDone = self.loadTrainedParams()['trainingPass'] == ''
    if trainingDone and (mode.startswith("ADDITIONAL_COORDS") or mode == 'AND' or mode == 'NOISE'):
        print('Training already finished')
        return

    coordsDir = self._getExtraPath(self.CONSENSUS_COOR_PATH_TEMPLATE % mode)
    micsDir = self._getExtraPath(self.PRE_PROC_MICs_PATH)
    queuedMicFns = self.TO_EXTRACT_MICFNS[mode]
    print('To extract {} in mode {}: {}'.format(len(queuedMicFns), mode, queuedMicFns))
    if len(queuedMicFns) <= 0:
        return

    # Map each micrograph base name (no extension) to its preprocessed file
    micPathByBase = {}
    for queuedFn in queuedMicFns:
        micPathByBase[pwutils.removeExt(queuedFn)] = os.path.join(micsDir, queuedFn)

    if mode.startswith("ADDITIONAL_COORDS"):
        # The additional coordinates provided by the user are written here
        if not os.path.exists(coordsDir):
            os.mkdir(coordsDir)
        if mode.endswith("TRUE"):
            coordSet = self.trainTrueSetOfCoords.get()
        elif mode.endswith("FALSE"):
            coordSet = self.trainFalseSetOfCoords.get()
        writeSetOfCoordinates(coordsDir, coordSet)

    # One line per coordinates file whose micrograph is queued
    contentLines = ["#mics coords\n"]
    for posName in os.listdir(coordsDir):
        posBase = pwutils.removeExt(posName)
        posPath = os.path.join(coordsDir, posName)
        if posBase in micPathByBase:
            contentLines.append("%s particles@%s\n" % (micPathByBase[posBase], posPath))

    inputsFname = self._getTmpPath("extractParticles_inputs_%s.txt" % mode)
    outputDir = self._getConsensusParticlesDir(mode)
    if not os.path.exists(outputDir):
        makePath(outputDir)
    nThrs = self.numberOfThreads.get()
    with open(inputsFname, "w") as inputsFile:
        inputsFile.write("".join(contentLines))

    downFactor = self._getDownFactor()
    args = "-i %s -s %s -d %s -o %s -t %d" % (inputsFname, DEEP_PARTICLE_SIZE, downFactor, outputDir, nThrs)
    self.runJob('xmipp_extract_particles', args, numberOfMpi=1)
def joinSetOfParticlesStep(self, mode, micFns='', trainingPass='', clean=False):
    """Store the particles extracted from a set of micrographs in a particles_<mode>*.xmd file.

    :param mode: consensus mode of the particles.
    :param micFns: micrograph filenames to join. When '' the micrographs
        queued in ``self.TO_EXTRACT_MICFNS[mode]`` are used and the queue is
        emptied.
    :param trainingPass: suffix added to the name of the metadata file.
    :param clean: when True the metadata file is overwritten instead of
        appended to.
    """
    trainingDone = self.loadTrainedParams()['trainingPass'] == ''
    if trainingDone and (mode.startswith("ADDITIONAL_COORDS") or mode == 'AND' or mode == 'NOISE'):
        print('Training already finished')
    else:
        fnImages = self._getExtraPath("particles_{}{}.xmd".format(mode, trainingPass))
        posDir = self._getExtraPath(self.CONSENSUS_COOR_PATH_TEMPLATE % mode)
        if micFns != '':
            print('Creating set of particles in {}'.format(fnImages))
        else:
            # Consume the queue of micrographs waiting for this mode
            micFns = self.TO_EXTRACT_MICFNS[mode]
            self.TO_EXTRACT_MICFNS[mode] = []
            print('Updating set of particles written in {}'.format(fnImages))

        particlesDir = self._getConsensusParticlesDir(mode)
        joinedMd = md.MetaData()
        for micFn in micFns:
            posFn = os.path.join(posDir, pwutils.replaceBaseExt(micFn, "pos"))
            xmdFn = os.path.join(particlesDir, pwutils.replaceBaseExt(posFn, "xmd"))
            if not os.path.exists(xmdFn):
                self.warning("The coord file %s wasn't used for extraction! "
                             % os.path.basename(posFn))
                continue
            # Merge the extraction metadata into the coordinates metadata
            extractedMd = md.MetaData(xmdFn)
            coordsMd = md.MetaData('particles@%s' % posFn)
            coordsMd.merge(extractedMd)
            joinedMd.unionAll(coordsMd)

        if clean or not os.path.exists(fnImages):
            joinedMd.write(fnImages)
        else:
            joinedMd.append(fnImages)
    # The extraction of this mode is no longer in progress
    self.EXTRACTING[mode] = False
def doTraining(self):
    """Insert the steps that prepare the training data and, when possible, train the CNN.

    Extraction steps are inserted for the negative (NOISE), positive (AND)
    and optional additional coordinates as soon as at least
    ``extractingBatch`` micrographs are ready for each of them. When the CNN
    is free and at least ``trainingBatch`` micrographs are ready to train, the
    training steps are inserted too. Every inserted step is added to
    ``self.newSteps``.
    """
    # TODO: put something like in pickNoise, but make sure it is only used for AND sets
    extractingBatch = self.extractingBatch.get()

    if len(self.readyToExtractMicFns('NOISE')) >= extractingBatch and not self.EXTRACTING['NOISE']:
        self.EXTRACTING['NOISE'] = True
        depNoise = self._insertFunctionStep('pickNoise', prerequisites=self.initDeps)
        self.newSteps += self.insertExtractPartSteps('NOISE', prerequisites=[depNoise])

    if len(self.readyToExtractMicFns('AND')) >= extractingBatch and not self.EXTRACTING['AND']:
        self.EXTRACTING['AND'] = True
        depsAnd = self.insertCaculateConsensusSteps('AND', prerequisites=self.initDeps)
        self.newSteps += self.insertExtractPartSteps('AND', prerequisites=depsAnd)

    trainedParams = self.loadTrainedParams()

    # The value of the parameter is compared through get(), as trainCNN does
    if self.addTrainingData.get() == self.ADD_DATA_TRAIN_CUST and \
            self.trainingDataType.get() == self.ADD_DATA_TRAIN_CUSTOM_OPT_COORS:
        additionalSets = (('ADDITIONAL_COORDS_TRUE', self.trainTrueSetOfCoords),
                          ('ADDITIONAL_COORDS_FALSE', self.trainFalseSetOfCoords))
        for extraMode, coordsPointer in additionalSets:
            if coordsPointer.get() is not None and \
                    len(self.readyToExtractMicFns(extraMode)) >= extractingBatch:
                self.TO_EXTRACT_MICFNS[extraMode] = self.readyToExtractMicFns(extraMode)
                self.newSteps += self.insertExtractPartSteps(extraMode, prerequisites=self.initDeps)

    if self.cnnFree():
        self.TO_TRAIN_MICFNS = self.readyToTrainMicFns()
        if len(self.TO_TRAIN_MICFNS) >= self.trainingBatch.get():
            self.TRAINING = True
            self.curTrainedParams = trainedParams
            self.depsTrain = [self._insertFunctionStep('trainCNN', self.TO_TRAIN_MICFNS,
                                                       prerequisites=self.initDeps)]
            self.depsTrain = [self._insertFunctionStep('endTrainingStep', prerequisites=self.depsTrain)]
            self.newSteps += self.depsTrain
def trainCNN(self, toTrainMicFns):
    """Train the CNN with the particles of the micrographs that are ready to train.

    The positive (AND) and negative (NOISE) particles of ``toTrainMicFns``
    are joined in metadata files, the optional additional training data is
    added with its weights and ``xmipp_deep_consensus`` is run in train mode.
    The trained parameters kept in ``self.curTrainedParams`` are updated.

    :param toTrainMicFns: filenames of the micrographs used in this pass.
    """
    trainedParams = self.curTrainedParams
    trPass = trainedParams['trainingPass']
    if trPass != '':
        trPass += 1
    trainedParams['trainingPass'] = trPass

    # Write the inputs of this pass as xmd files
    for consensusMode in ('AND', 'NOISE'):
        self.joinSetOfParticlesStep(consensusMode, toTrainMicFns, trPass, clean=True)

    # Create the directory of this training pass
    netDataPath = self._getExtraPath(self.NET_TEMPLATE.format(trPass))
    if not os.path.exists(netDataPath):
        makePath(netDataPath)
    nEpochs = self.nEpochs.get()

    # Input files mapped to their weights
    posTrainDict = {self._getExtraPath("particles_AND{}.xmd".format(trPass)): 1}
    negTrainDict = {self._getExtraPath("particles_NOISE{}.xmd".format(trPass)): 1}
    if self.addTrainingData.get() == self.ADD_DATA_TRAIN_PRECOMP and trainedParams['firstTraining']:
        negTrainDict[self._getTmpPath("addNegTrainParticles.xmd")] = 1

    # Either a GPU or a number of threads is passed to the program
    if self.usesGpu():
        gpuToUse, numberOfThreads = self.setGPU(oneGPU=True), None
    else:
        numberOfThreads = self.numberOfThreads.get()
        gpuToUse = None

    if self.addTrainingData.get() == self.ADD_DATA_TRAIN_CUST:
        if self.trainingDataType.get() == self.ADD_DATA_TRAIN_CUSTOM_OPT_PARTS and \
                trainedParams['firstTraining']:
            # Particles provided by the user are only used in the first training
            if self.trainTrueSetOfParticles.get():
                posTrainDict[self._getExtraPath("trainTrueParticlesSet.xmd")] = self.trainPosWeight.get()
            if self.trainFalseSetOfParticles.get():
                negTrainDict[self._getExtraPath("trainFalseParticlesSet.xmd")] = self.trainNegWeight.get()
        elif self.trainingDataType.get() == self.ADD_DATA_TRAIN_CUSTOM_OPT_COORS:
            if self.trainTrueSetOfCoords.get():
                self.joinSetOfParticlesStep('ADDITIONAL_COORDS_TRUE', toTrainMicFns, trPass, clean=True)
                extraPosFn = self._getExtraPath("particles_ADDITIONAL_COORDS_TRUE{}.xmd".format(trPass))
                posTrainDict[extraPosFn] = self.trainPosWeight.get()
            if self.trainFalseSetOfCoords.get():
                self.joinSetOfParticlesStep('ADDITIONAL_COORDS_FALSE', toTrainMicFns, trPass, clean=True)
                extraNegFn = self._getExtraPath("particles_ADDITIONAL_COORDS_FALSE{}.xmd".format(trPass))
                negTrainDict[extraNegFn] = self.trainNegWeight.get()

    nTrueParticles = self.toTrainDataSize.get()
    if nTrueParticles == -1:
        nTrueParticles = 1e10

    if self._usePretrainedModel():
        # Choose the pretrained model according to the training data size
        if nTrueParticles < 1500:
            effectiveSize = 1000
        elif nTrueParticles < 20000:
            effectiveSize = 5000
        else:
            effectiveSize = 50000
        self.__retrievePreTrainedModel(netDataPath, effectiveSize)
        if self.skipTraining.get():
            nEpochs = 0
    elif trPass != '' and trPass > 1:
        # Start from the model of the previous training pass
        self.retrievePreviousPassModel(trPass)

    trainedParams['posParticlesTrained'] += self._getEffectiveNumPartsTrain(posTrainDict)

    fnamesPos, weightsPos = self.__dataDict_toStrs(posTrainDict)
    fnamesNeg, weightsNeg = self.__dataDict_toStrs(negTrainDict)
    argPieces = [
        " -n %s --mode train -p %s -f %s --trueW %s --falseW %s --effective_data_size %s" % (
            netDataPath, fnamesPos, fnamesNeg, weightsPos, weightsNeg, nTrueParticles),
        " -e %s -l %s -r %s -m %s " % (nEpochs, self.learningRate.get(), self.l2RegStrength.get(),
                                       self.nModels.get()),
    ]
    if not self.auto_stopping.get():
        argPieces.append(" -s")
    if gpuToUse:
        argPieces.append(" -g %s" % (gpuToUse))
    if numberOfThreads:
        argPieces.append(" -t %s" % (numberOfThreads))
    args = "".join(argPieces)

    trainedParams['trainedMicFns'] += self.TO_TRAIN_MICFNS
    trainedParams['firstTraining'] = False
    self.curTrainedParams = trainedParams

    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
    self.runJob('xmipp_deep_consensus', args, numberOfMpi=1, env=self.getCondaEnv())
def predictCNN(self):
    """Score the OR consensus particles with the trained CNN.

    While a training pass is stored (preliminary prediction) the file
    particles_OR.xmd is scored; once the training has finished
    (``trainingPass == ''``) a batch of the not yet predicted micrographs is
    joined and scored. ``xmipp_deep_consensus`` is run in score mode and the
    trained parameters are updated with what has been predicted.
    """
    trPass = self.loadTrainedParams()['trainingPass']
    trainingFinished = trPass == ''

    if trainingFinished:
        toPredictMicFns = self.readyToPredictMicFns()
        print("Mics ready to predict {}".format(len(toPredictMicFns)))
        if len(toPredictMicFns) > self.PREDICT_BATCH_MAX:
            toPredictMicFns = toPredictMicFns[:self.PREDICT_BATCH_MAX]
        predExten = '_partial'
        self.joinSetOfParticlesStep(mode='OR', micFns=toPredictMicFns, trainingPass=predExten, clean=True)
        mdORPath = self._getExtraPath("particles_OR{}.xmd".format(predExten))
    else:
        predExten = trPass
        mdORPath = self._getExtraPath("particles_OR.xmd")

    # Make sure there is a network available to predict with
    netDataPath = self._getExtraPath(self.NET_TEMPLATE.format(trPass))
    if not os.path.isdir(netDataPath) and self._doContinue():
        prevRunPath = self.continueRun.get()._getExtraPath(self.NET_TEMPLATE.format(trPass))
        copyTree(prevRunPath, netDataPath)
    elif self.skipTraining.get() and self._usePretrainedModel():
        self.__retrievePreTrainedModel(netDataPath)

    # Either a GPU or a number of threads is passed to the program
    if self.usesGpu():
        gpuToUse, numberOfThreads = self.setGPU(oneGPU=True), None
    else:
        numberOfThreads = self.numberOfThreads.get()
        gpuToUse = None

    print('Predicting on {} true particles'.format(md.MetaData(mdORPath).size()))
    predictDict = {mdORPath: 1}

    # The extra testing sets are only evaluated once
    if self.doTesting.get() and self.testTrueSetOfParticles.get() and self.testFalseSetOfParticles.get() \
            and not self.loadTrainedParams()['doneExtraTesting']:
        self.uploadTrainedParam('doneExtraTesting', True)
        posTestDict = {self._getExtraPath("testTrueParticlesSet.xmd"): 1}
        negTestDict = {self._getExtraPath("testFalseParticlesSet.xmd"): 1}
    else:
        posTestDict = negTestDict = None

    outParticlesPath = self._getPath(self.PARTICLES_TEMPLATE.format(predExten))
    fnamesPred, _ = self.__dataDict_toStrs(predictDict)
    argPieces = [" -n %s --mode score -i %s -o %s " % (netDataPath, fnamesPred, outParticlesPath)]
    if posTestDict and negTestDict:
        fnamesPosTest, _ = self.__dataDict_toStrs(posTestDict)
        fnamesNegTest, _ = self.__dataDict_toStrs(negTestDict)
        argPieces.append(" --testingTrue %s --testingFalse %s " % (fnamesPosTest, fnamesNegTest))
    if gpuToUse:
        argPieces.append(" -g %s" % (gpuToUse))
    if numberOfThreads:
        argPieces.append(" -t %s" % (numberOfThreads))
    args = "".join(argPieces)

    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
    self.runJob('xmipp_deep_consensus', args, numberOfMpi=1,
                env=self.getCondaEnv())

    # Reload the parameters, they may have been updated meanwhile
    trainedParams = self.loadTrainedParams()
    if trainingFinished:
        trainedParams['predictedMicFns'] += toPredictMicFns
    else:
        trainedParams['predictionPasses'].append(trPass)
    self.saveTrainedParams(trainedParams)
def createOutputStep(self, closeStream=False):
    """Create the preliminary output while training, or the final one once it has finished.

    :param closeStream: passed to ``createFinalOutput``.
    """
    trPass = self.loadTrainedParams()['trainingPass']
    if "OR" not in self.coordinatesDict:
        # The OR coordinates are needed to create the output sets
        orPosDir = self._getExtraPath(self.CONSENSUS_COOR_PATH_TEMPLATE % 'OR')
        self.loadCoords(orPosDir, 'OR')

    if trPass != '':
        self.createPreliminarOutput(trPass)
    else:
        self.createFinalOutput(closeStream)
def createFinalOutput(self, closeStream=False):
    """Add the last scored particles and their coordinates to the final outputs.

    Particles are read from the '_partial' scoring result, rescaled by the
    down factor and appended to the outputs when the threshold is negative or
    their deep learning z-score is above it. The temporary files are removed
    afterwards.

    :param closeStream: passed to ``updateOutput``.
    """
    predExten = '_partial'
    scoredPath = self._getPath(self.PARTICLES_TEMPLATE.format(predExten))
    partSet = self._createSetOfParticles("outputParts_tmp{}".format(predExten))
    readSetOfParticles(scoredPath, partSet)
    partSet.setSamplingRate(self._getDownFactor() * self.inSamplingRate)

    self.outputParticles = self.getParticlesOutput(partSet)
    self.outputCoordinates = self.getCoordinatesOutput()

    downFactor = self._getDownFactor()
    threshold = self.threshold.get()
    zScoreAttr = '_xmipp_%s' % emlib.label2Str(md.MDL_ZSCORE_DEEPLEARNING1)
    for scoredPart in partSet:
        coord = scoredPart.getCoordinate().clone()
        coord.scale(downFactor)
        setattr(coord, zScoreAttr, getattr(scoredPart, zScoreAttr))

        newPart = Particle()
        newPart.copy(scoredPart, copyId=False)
        newPart.scaleCoordinate(downFactor)

        # A negative threshold keeps every particle
        if threshold < 0 or getattr(newPart, zScoreAttr) > threshold:
            self.outputCoordinates.append(coord)
            self.outputParticles.append(newPart)

    cleanPattern(scoredPath)
    cleanPattern(self._getPath("*outputParts_tmp{}.sqlite".format(predExten)))
    writeSetOfParticles(self.outputParticles, self._getPath("particles.xmd"))
    self.updateOutput(closeStream)
def createPreliminarOutput(self, trPass):
    """Create the preliminary outputs with the particles scored after a training pass.

    Particles are read from the scoring result of ``trPass``, rescaled by the
    down factor and appended to new preliminary outputs when the threshold is
    negative or their deep learning z-score is above it. The preliminary
    outputs are closed afterwards.

    :param trPass: training pass whose scoring result is read.
    """
    scoredPath = self._getPath(self.PARTICLES_TEMPLATE.format(trPass))
    partSet = self._createSetOfParticles("outputParts_tmp{}".format(trPass))
    readSetOfParticles(scoredPath, partSet)
    partSet.setSamplingRate(self._getDownFactor() * self.inSamplingRate)

    self.preliminarOutputParticles = self.getPreParticlesOutput(partSet)
    self.preliminarOutputCoordinates = self.getPreCoordinatesOutput()

    downFactor = self._getDownFactor()
    threshold = self.threshold.get()
    zScoreAttr = '_xmipp_%s' % emlib.label2Str(md.MDL_ZSCORE_DEEPLEARNING1)
    for scoredPart in partSet:
        coord = scoredPart.getCoordinate().clone()
        coord.scale(downFactor)
        setattr(coord, zScoreAttr, getattr(scoredPart, zScoreAttr))

        outPart = scoredPart.clone()
        outPart.scaleCoordinate(downFactor)

        # A negative threshold keeps every particle
        if threshold < 0 or getattr(outPart, zScoreAttr) > threshold:
            self.preliminarOutputCoordinates.append(coord)
            self.preliminarOutputParticles.append(outPart)

    cleanPattern(scoredPath)
    cleanPattern(self._getPath("*outputParts_tmp{}.sqlite".format(trPass)))
    writeSetOfParticles(self.preliminarOutputParticles, scoredPath)
    self.updatePreOutput(closeStream=True)
def getPreCoordinatesOutput(self):
    """Create, define as output and return a new preliminary set of coordinates.

    The set takes its micrographs and info from the OR coordinates and is
    related to every input set of coordinates.
    """
    print('Creating new preliminarOutputCoordinates set')
    orCoords = self.coordinatesDict['OR']
    preCoords = self._createSetOfCoordinates(orCoords.getMicrographs(asPointer=True))
    self.preliminarOutputCoordinates = preCoords
    preCoords.copyInfo(self.coordinatesDict['OR'])
    preCoords.setBoxSize(self._getBoxSize())
    preCoords.setStreamState(SetOfParticles.STREAM_OPEN)
    self._defineOutputs(preliminarOutputCoordinates=preCoords)

    self.waitFreeInputCoords()
    for inputPointer in self.inputCoordinates:
        self._defineSourceRelation(inputPointer.get(), self.preliminarOutputCoordinates)
    # Release the input coordinates taken by waitFreeInputCoords()
    self.USING_INPUT_COORDS = False
    return self.preliminarOutputCoordinates
def getCoordinatesOutput(self):
    """Return the output set of coordinates, creating it the first time.

    When the set already exists only its micrographs are refreshed from the
    OR coordinates.
    """
    if hasattr(self, "outputCoordinates"):
        # Micrographs of the set are replaced because there might be new ones in streaming
        currentMics = self.coordinatesDict['OR'].getMicrographs(asPointer=False)
        self.outputCoordinates.setMicrographs(currentMics)
        return self.outputCoordinates

    print('Creating new outputCoordinates set')
    micsPointer = self.coordinatesDict['OR'].getMicrographs(asPointer=True)
    self.outputCoordinates = self._createSetOfCoordinates(micsPointer)
    self.outputCoordinates.copyInfo(self.coordinatesDict['OR'])
    self.outputCoordinates.setBoxSize(self._getBoxSize())
    self.outputCoordinates.setStreamState(SetOfParticles.STREAM_OPEN)
    self._defineOutputs(outputCoordinates=self.outputCoordinates)

    self.waitFreeInputCoords()
    for inputPointer in self.inputCoordinates:
        self._defineSourceRelation(inputPointer.get(), self.outputCoordinates)
    # Release the input coordinates taken by waitFreeInputCoords()
    self.USING_INPUT_COORDS = False
    return self.outputCoordinates
def getPreParticlesOutput(self, partSet):
    """Create, define as output and return a new preliminary set of particles.

    :param partSet: set of particles whose info is copied to the new set.
    """
    print('Creating new preliminarOutputParticles set')
    preParts = self._createSetOfParticles()
    self.preliminarOutputParticles = preParts
    preParts.copyInfo(partSet)
    preParts.setStreamState(SetOfParticles.STREAM_OPEN)
    self._defineOutputs(preliminarOutputParticles=preParts)
    return self.preliminarOutputParticles
def getParticlesOutput(self, partSet):
    """Return the output set of particles, creating it the first time.

    :param partSet: set of particles whose info is copied to a new output.
    """
    if hasattr(self, "outputParticles"):
        return self.outputParticles

    print('Creating new outputParticles set')
    self.outputParticles = self._createSetOfParticles()
    self.outputParticles.copyInfo(partSet)
    self.outputParticles.setStreamState(SetOfParticles.STREAM_OPEN)
    self._defineOutputs(outputParticles=self.outputParticles)
    return self.outputParticles
def updatePreOutput(self, closeStream=False):
    """Write and store the preliminary output sets.

    :param closeStream: when True the stream of both sets is closed first.
    """
    preOutputs = (self.preliminarOutputCoordinates, self.preliminarOutputParticles)
    if closeStream:
        for preOutput in preOutputs:
            preOutput.setStreamState(SetOfParticles.STREAM_CLOSED)
    for preOutput in preOutputs:
        preOutput.write()
    self._store(*preOutputs)
def updateOutput(self, closeStream=False):
    """Write and store the final output sets.

    :param closeStream: when True the stream of both sets is closed first.
    """
    outputs = (self.outputCoordinates, self.outputParticles)
    if closeStream:
        for output in outputs:
            output.setStreamState(SetOfParticles.STREAM_CLOSED)
    for output in outputs:
        output.write()
    self._store(*outputs)
def _summary(self):
message = []
for i, coordinates in enumerate(self.inputCoordinates):
protocol = self.getMapper().getParent(coordinates.get())
message.append("Data source %d %s" % (i + 1, protocol.getClassLabel()))
message.append("Relative Radius = %f" % self.consensusRadius)
message.append("\nThe output contains the OR junction of all the input "
"coordinates with a 'zScoreDeepLearning1' value attached.\n"
"Please, click on 'Analyze Results' to make a subset.")
return message
def _methods(self):
return []
#--------------------------- UTILS functions --------------------------------------------
def _updateParticle(self, item, row):
    """Copy the deep learning z-score of ``row`` to ``item`` and flag whether to append it.

    The item is flagged to be appended only when the row is enabled.
    """
    setXmippAttributes(item, row, md.MDL_ZSCORE_DEEPLEARNING1)
    isDisabled = row.getValue(md.MDL_ENABLED) <= 0
    item._appendItem = not isDisabled
def retrieveTrainSets(self):
    """Retrieve, link and return a set of particles with the precomputed negative training set.

    The stack is chosen according to the extraction conditions (phase flip
    done or not; contrast always inverted) and linked in the tmp directory.
    """
    # We always work with inverted contrast particles; micrographs are always
    # internally inverted if not done before
    phaseFlipPrefix = 'no' if self.ignoreCTF.get() else ''
    modelType = "negativeTrain_%sPhaseFlip_Invert.mrcs" % phaseFlipPrefix
    modelPath = self.getModel("deepConsensus", modelType)
    modelFn = self._getTmpPath(modelType)
    pwutils.createLink(modelPath, modelFn)

    tmpSqliteSuff = "AddTrain"
    partSet = self._createSetOfParticles(tmpSqliteSuff)
    # A single item is reused to append every image of the stack
    stackItem = SetOfParticles.ITEM_TYPE()
    nImages = ImageHandler().getDimensions(modelFn)[3]
    if nImages > 1:
        for stackIndex in range(1, nImages + 1):
            stackItem.cleanObjId()
            stackItem.setMicId(9999)
            stackItem.setFileName(modelFn)
            stackItem.setIndex(stackIndex)
            partSet.append(stackItem)
    partSet.setAlignment(ALIGN_NONE)
    cleanPath(self._getPath("particles%s.sqlite" % tmpSqliteSuff))
    return partSet
def getMicrographFnsWithCoordinates(self, shared=True):
    """Return the filenames of the micrographs with coordinates in every input set.

    :param shared: passed to ``getAllCoordsInputMicrographs`` to choose the
        candidate micrographs. A candidate is only returned when each input
        set of coordinates has at least one coordinate for it.
    """
    candidateMics = self.getAllCoordsInputMicrographs(shared)
    self.waitFreeInputCoords()
    pickedPaths = []
    for micKey in candidateMics:
        mic = candidateMics[micKey]
        nSetsWithCoords = 0
        for coordsPointer in self.inputCoordinates:
            # Finding one coordinate is enough for this set
            for _ in coordsPointer.get().iterCoordinates(mic):
                nSetsWithCoords += 1
                break
        if nSetsWithCoords == len(self.inputCoordinates):
            pickedPaths.append(mic.getFileName())
    # Release the input coordinates taken by waitFreeInputCoords()
    self.USING_INPUT_COORDS = False
    return self.prunePaths(pickedPaths)
def _getInputMicrographs(self):
'''Return a list with the micrographs corresponding the input coordinates'''
self.waitFreeInputMics()
if not hasattr(self, "inputMicrographs") or not self.inputMicrographs:
self.waitFreeInputCoords()
if len(self.inputCoordinates) == 0:
print("WARNING. PROVIDE MICROGRAPHS FIRST")
else:
inputMicrographs = self.inputCoordinates[0].get().getMicrographs()
if inputMicrographs is None:
raise ValueError("there are problems with your coordiantes, they do not have associated micrographs ")
self.inputMicrographs = inputMicrographs
self.USING_INPUT_COORDS = False
return self.inputMicrographs
def _getBoxSize(self):
'''Returns the box size of the input coordinates'''
if not hasattr(self, "boxSize") or not self.boxSize:
self.waitFreeInputCoords()
firstCoords = self.inputCoordinates[0].get()
self.USING_INPUT_COORDS = False
self.boxSize = firstCoords.getBoxSize()
self.downFactor = self.boxSize / float(DEEP_PARTICLE_SIZE)
return self.boxSize
def _getDownFactor(self):
if not hasattr(self, "downFactor") or not self.downFactor:
self.boxSize = self._getBoxSize()
self.downFactor = self.boxSize / float(DEEP_PARTICLE_SIZE)
assert self.downFactor >= 1, \
"Error, the particle box size must be greater or equal than 128."
return self.downFactor
def _getConsensusParticlesDir(self, mode):
pathFun = self._getTmpPath if mode != "OR" else self._getExtraPath
return pathFun(self.CONSENSUS_PARTS_PATH_TEMPLATE% mode)
def loadMicSetFromFns(self, inputDir, micFns):
    """Return a new set of micrographs built from filenames.

    :param inputDir: directory where the micrographs are located.
    :param micFns: filenames of the micrographs, relative to ``inputDir``.
    """
    micSet = self._createSetOfMicrographs()
    for micName in micFns:
        micSet.append(Micrograph(os.path.join(inputDir, micName)))
    micSet.copyInfo(self._getInputMicrographs())
    # Release the input micrographs taken inside _getInputMicrographs()
    self.USING_INPUT_MICS = False
    return micSet
def __dataDict_toStrs(self, dataDict):
fnamesStr=[]
weightsStr=[]
for fname in dataDict:
fnamesStr.append(fname)
weightsStr.append(str(dataDict[fname]) )
return ":".join(fnamesStr), ":".join(weightsStr)
def _getEffectiveNumPartsTrain(self, dictTrueData):
'''Returns the number of particles being used for training'''
nParts=0
for mdPath in dictTrueData:
mdObject = md.MetaData(mdPath)
nParts+= mdObject.size()
return nParts
def loadMeanAccuracy(self):
    """Return the mean validation accuracy stored for the current training pass.

    The value is the second token of the first line of the file
    netsMeanValAcc.txt located in the network directory of the pass.

    :return: the accuracy as float, or None when the file does not exist
        (either the network directory or the file itself is missing).
    """
    trPass = self.curTrainedParams['trainingPass']
    netDataPath = self._getExtraPath(self.NET_TEMPLATE.format(trPass))
    netMeanAccFname = os.path.join(netDataPath, "netsMeanValAcc.txt")
    # Check the file that is going to be opened, not only its directory
    if not os.path.isfile(netMeanAccFname):
        return None
    with open(netMeanAccFname) as accFile:
        firstLine = accFile.readline()
    return float(firstLine.split()[1])
#STREAMING and state checks
def trainingOn(self):
    """Return whether the training must go on.

    It is False when the user asked to skip the training; otherwise it is the
    'keepTraining' value of the trained parameters.
    """
    storedParams = self.loadTrainedParams()
    if self.skipTraining.get():
        return False
    return storedParams['keepTraining']
def predictionsOn(self):
    """Return whether a prediction must be performed.

    It is so when a preliminary prediction is ready or when the training has
    finished (``trainingPass == ''``).
    """
    storedParams = self.loadTrainedParams()
    preliminarReady = self.readyPreliminarPrediction()
    if preliminarReady:
        return preliminarReady
    return storedParams['trainingPass'] == ''
def readyPreliminarPrediction(self):
    """Return whether a preliminary prediction must be performed.

    It is so when the network is ready to predict, the current training pass
    has not been used for a prediction yet and the user asked for
    preliminary predictions.
    """
    if not self.networkReadyToPredict():
        return False
    storedParams = self.loadTrainedParams()
    if storedParams['trainingPass'] in storedParams['predictionPasses']:
        return False
    return bool(self.doPreliminarPredictions.get())
def networkReadyToPredict(self):
    """Return True if the CNN has been trained or it does not need to be trained.

    The latter requires that the training is skipped and that there are
    micrographs ready to be predicted.
    """
    storedParams = self.loadTrainedParams()
    if self.skipTraining.get() and len(self.readyToPredictMicFns()) > 0:
        return True
    return len(storedParams['trainedMicFns']) > 0
def cnnFree(self):
    """Return True if the CNN is neither predicting nor training."""
    return not (self.PREDICTING or self.TRAINING)
def allFree(self):
    """Return True if no extraction, training, prediction or preprocessing is going on.

    It works as a kind of traffic light for the operations that alter the
    state of the protocol.
    """
    # Every mode is looked up before combining the flags
    extractingFlags = [self.EXTRACTING[mode] for mode in ('OR', 'NOISE', 'AND')]
    gExtracting = True if any(extractingFlags) else False
    return not (self.PREDICTING or self.TRAINING or gExtracting or self.PREPROCESSING)
def checkIfParentsFinished(self):
    """Return True if the stream of every input set of coordinates is closed.

    The properties of each set are reloaded before checking it and the check
    stops at the first open set.
    """
    def _isStreamClosed(coordsPointer):
        coordSet = coordsPointer.get()
        coordSet.loadAllProperties()
        return not coordSet.isStreamOpen()

    self.waitFreeInputCoords()
    finished = all(_isStreamClosed(pointer) for pointer in self.inputCoordinates)
    # Release the input coordinates taken by waitFreeInputCoords()
    self.USING_INPUT_COORDS = False
    return finished
def checkIfNewMics(self, mode=''):
    """Return True if there are micrographs ready for extracting particles.

    :param mode: consensus mode to check. When '' the modes OR, NOISE and AND
        are checked and one of them is enough.
    """
    modesToCheck = ['OR', 'NOISE', 'AND'] if mode == '' else [mode]
    return any(len(self.readyToExtractMicFns(checkedMode)) > 0 for checkedMode in modesToCheck)
#Get data attributes
def getMicsIds(self, filterOutNoCoords=False):
    """Return the ids of the input micrographs.

    :param filterOutNoCoords: when False the id set of the input micrographs
        is returned as given by their ``getIdSet()``. When True only the ids
        of the micrographs referenced by some input coordinate are returned,
        as a sorted list.
    """
    if not filterOutNoCoords:
        idSet = self._getInputMicrographs().getIdSet()
        # Release the input micrographs taken inside _getInputMicrographs()
        self.USING_INPUT_MICS = False
        return idSet

    self.waitFreeInputCoords()
    # Only the ids are needed: the micrograph names are not collected
    micIds = {coord.getMicId()
              for coordinatesP in self.inputCoordinates
              for coord in coordinatesP.get()}
    # Release the input coordinates taken by waitFreeInputCoords()
    self.USING_INPUT_COORDS = False
    return sorted(micIds)
def prunePaths(self, paths):
    """Return the last '/'-separated component of each path, in the same order."""
    return [path.rsplit('/', 1)[-1] for path in paths]
def getPreprocessedMicFns(self):
    """Return the filenames found in the directory of preprocessed micrographs.

    An empty list is returned while the directory does not exist.
    """
    prepDir = self._getExtraPath(self.PRE_PROC_MICs_PATH)
    return os.listdir(prepDir) if os.path.exists(prepDir) else []
def getTrainedMicFns(self):
    """Return the filenames of the micrographs whose particles have been used for training."""
    return self.loadTrainedParams()['trainedMicFns']
def getPredictedMicFns(self):
    """Return the filenames of the micrographs whose particles have been used for prediction."""
    return self.loadTrainedParams()['predictedMicFns']
#Get ready sets
def readyToPreprocessMics(self, shared):
    """Return the input micrograph filenames that have not been preprocessed yet.

    :param shared: passed to ``getInputMicsFns`` to choose the micrographs.
    """
    inputFns = set(self.getInputMicsFns(shared))
    preprocessedFns = set(self.getPreprocessedMicFns())
    return list(inputFns - preprocessedFns)
def readyToTrainMicFns(self):
    """Return the micrograph filenames ready to be used for training.

    These are the micrographs extracted in the OR, NOISE and AND modes that
    have not been used for training yet, limited to ``TRAIN_BATCH_MAX``.
    """
    orFns = set(self.getExtractedMicFns('OR'))
    noiseFns = set(self.getExtractedMicFns('NOISE'))
    andFns = set(self.getExtractedMicFns('AND'))
    extractedInAllModes = orFns & noiseFns & andFns
    readyToTrain = list(extractedInAllModes - set(self.getTrainedMicFns()))
    return readyToTrain[:self.TRAIN_BATCH_MAX]
def readyToPredictMicFns(self):
    """Return the micrograph filenames extracted in OR mode that have not been predicted yet."""
    notPredicted = set(self.getExtractedMicFns('OR')) - set(self.getPredictedMicFns())
    return list(notPredicted)
def getInpCoordsFns(self, mode, extractedSetOfCoordsFns):
    """Write the input coordinates as .pos files and return them grouped by micrograph.

    Each input set is written to its own tmp directory, scaled by the ratio
    between the sampling rate of its micrographs and that of the first set.

    :param mode: consensus mode, used in the name of the tmp directories.
    :param extractedSetOfCoordsFns: .pos filenames that must be left out.
    :return: dictionary mapping each .pos base name to a list with one entry
        per input set: the path of the .pos file or the string "None".
    """
    samplingRates = [coordinatesP.get().getMicrographs().getSamplingRate()
                     for coordinatesP in self.inputCoordinates]
    nCoordsSets = len(samplingRates)

    inputCoordsFnames = {}
    for setIdx, coordinatesP in enumerate(self.inputCoordinates):
        tmpPosDir = self._getTmpPath("input_coords_%d_%s" % (setIdx, mode))
        if not os.path.exists(tmpPosDir):
            makePath(tmpPosDir)
        scale = float(samplingRates[setIdx]) / float(samplingRates[0])
        writeSetOfCoordinates(tmpPosDir, coordinatesP.get(), scale=scale)

        for posFname in os.listdir(tmpPosDir):
            baseName, extension = os.path.splitext(os.path.basename(posFname))
            if extension != ".pos" or posFname in extractedSetOfCoordsFns:
                continue
            if baseName not in inputCoordsFnames:
                # "None" marks the sets without coordinates for this micrograph
                inputCoordsFnames[baseName] = ["None"] * nCoordsSets
            inputCoordsFnames[baseName][setIdx] = os.path.join(tmpPosDir, posFname)
    return inputCoordsFnames
#Training params utils
def loadTrainedParams(self):
    """Load the dictionary of trained parameters stored in pickle format.

    An initial dictionary is returned (not saved) if the file does not exist
    yet.
    """
    paramsFile = self._getExtraPath(self.TRAINED_PARAMS_PATH)
    if not os.path.exists(paramsFile):
        return {
            'trainedMicFns': [],
            'predictedMicFns': [],
            'posParticlesTrained': 0,
            'trainingPass': 0,
            'predictionPasses': [],
            'doneExtraTesting': False,
            'firstTraining': True,
            'keepTraining': True,
        }
    with open(paramsFile, 'rb') as handle:
        storedParams = pickle.load(handle)
    return storedParams
def uploadTrainedParam(self, keyParam, newValue):
    """Set one of the trained parameters to a new value and save them.

    :param keyParam: key of the parameter in the trained parameters.
    :param newValue: value to store.
    """
    storedParams = self.loadTrainedParams()
    storedParams.update({keyParam: newValue})
    self.saveTrainedParams(storedParams)
def saveTrainedParams(self, params):
    """Save the dictionary of trained parameters in pickle format.

    :param params: dictionary to store.
    """
    with open(self._getExtraPath(self.TRAINED_PARAMS_PATH), 'wb') as paramsHandle:
        pickle.dump(params, paramsHandle)
#CNN models utils
def retrievePreviousPassModel(self, trPass, lastTrPass=''):
    """Copy the network directory of a previous training pass to the one of ``trPass``.

    :param trPass: training pass that receives the model. Unless it is '' the
        model is taken from the pass ``trPass - 1``.
    :param lastTrPass: pass the model is taken from when ``trPass`` is ''.
    """
    sourcePass = lastTrPass if trPass == '' else trPass - 1
    curNetDataPath = self._getExtraPath(self.NET_TEMPLATE.format(trPass))
    prevNetDataPath = self._getExtraPath(self.NET_TEMPLATE.format(sourcePass))
    if prevNetDataPath == curNetDataPath:
        # Source and target are the same directory: nothing to copy
        return
    copyTree(prevNetDataPath, curNetDataPath)
def retrievePreviousRunModel(self, prevProt, trPass=''):
    """Copy the "nnetData" directory of another protocol to the network directory of ``trPass``.

    :param prevProt: protocol whose model is retrieved.
    :param trPass: training pass that receives the model.
    """
    targetPath = self._getExtraPath(self.NET_TEMPLATE.format(trPass))
    sourcePath = prevProt._getExtraPath("nnetData")
    if sourcePath == targetPath:
        # Source and target are the same directory: nothing to copy
        return
    copyTree(sourcePath, targetPath)
def __retrievePreTrainedModel(self, netDataPath, effectiveSize=-1):
'''Retrieves a previously trained CNN model'''
if effectiveSize==-1:
effectiveSize=int(5e4)
modelTypeDir= "keras_models/%sPhaseFlip_Invert/nnetData_%d/tfchkpoints_0" % (
"no" if self.ignoreCTF.get() else "", effectiveSize)
modelTypeDir= self.getModel("deepConsensus", modelTypeDir)
for i in range(self.nModels.get()):
targetPath= os.path.join(netDataPath, "tfchkpoints_%d"%(i))
print(targetPath, modelTypeDir)
copyTree(modelTypeDir, targetPath)
class XmippProtDeepConsSubSet(ProtUserSubSet):
    """ Create subsets from the GUI for the Deep Consensus protocol.
    This protocol will be executed mainly calling the script 'pw_create_image_subsets.py'
    from the ShowJ gui. The enabled/disabled changes will be stored in a temporary sqlite
    file that will be read to create the new subset.
    """

    def __init__(self, **args):
        ProtUserSubSet.__init__(self, **args)

    def _createSimpleSubset(self, inputObj):
        """Create, define as output and return a set with the coordinates of the enabled items.

        :param inputObj: set the subset is made from; the modified copy of it
            is read from the database given by ``_dbName`` and ``_dbPrefix``.
        """
        setClass = inputObj.getClass()
        modifiedSet = setClass(filename=self._dbName,
                               prefix=self._dbPrefix)
        className = inputObj.getClassName()
        createFunc = getattr(self, '_create' + className)
        output = createFunc(inputObj.getMicrographs())

        for item in modifiedSet:
            if not item.isEnabled():
                continue
            coord = item.getCoordinate().clone()
            coord.scale(1)
            output.append(coord)

        output.copyInfo(inputObj)
        output.setBoxSize(inputObj.getBoxSize())

        # Register outputs
        self._defineOutput(className, output)
        if inputObj.hasObjId():
            self._defineTransformRelation(inputObj, output)
        return output