Source code for pyworkflow.utils.dataset

# **************************************************************************
# *
# * Authors:     J.M. De la Rosa Trevin (
# *
# * Unidad de  Bioinformatica of Centro Nacional de Biotecnologia , CSIC
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either version 3 of the License, or
# * (at your option) any later version.
# *
# * This program is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with this program; if not, write to the Free Software
# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# * 02111-1307  USA
# *
# *  All comments concerning this program package may be sent to the
# *  e-mail address ''
# *
# **************************************************************************
# JMRT (2018-12-11) This module is hardly used at all. It may be
# removed in the future. For the moment it is just kept here inside
# pw.utils and is not imported by default.

import os
from collections import OrderedDict, namedtuple

from pyworkflow.mapper import SqliteFlatDb, SqliteDb

class DataSet(object):
    """ Collection of named tables.

    Every table is identified by a unique table name. This base class only
    keeps the list of names and remembers the last one requested; reading the
    actual content of a table is left to subclasses through _loadTable.
    """

    def __init__(self, tables, tableName=None, volumeName=None,
                 numberSlices=0):
        # Keep a private copy so later changes to the caller's iterable
        # do not affect this DataSet.
        self._tables = list(tables)
        self._tableName = tableName
        # FIXME: You have to see if the volumeName is used anywhere
        self._volumeName = volumeName
        self._numberSlices = numberSlices
        # Public attribute, not set here (starts as None).
        self.projectPath = None

    def currentTable(self):
        """ Return the name of the table selected by the last successful
        call to getTable (or the one given at construction time).
        """
        return self._tableName

    def setVolumeName(self, volumeName):
        """ Store the volume name. """
        self._volumeName = volumeName

    def getVolumeName(self):
        """ Return the stored volume name. """
        return self._volumeName

    def setNumberSlices(self, numberSlices):
        """ Store the number of slices. """
        self._numberSlices = numberSlices

    def getNumberSlices(self):
        """ Return the stored number of slices. """
        return self._numberSlices

    def getNumberSlicesForTemplate(self):
        """ Return range(numberSlices), i.e. one index per slice. """
        return range(self._numberSlices)

    def listTables(self):
        """ Return the list with the table names of this DataSet. """
        return self._tables

    def getTable(self, tableName=None):
        """ Load and return a table, marking it as the current one.

        If tableName is None the first listed table is used.
        An Exception is raised when the name is not one of the known tables.
        """
        name = self.listTables()[0] if tableName is None else tableName

        if name in self._tables:
            loaded = self._loadTable(name)
            # Only remember the name once the table has been loaded.
            self._tableName = name
            return loaded

        raise Exception("DataSet: table '%s' not found.\n Current tables: %s"
                        % (name, self._tables))

    def getTypeOfColumn(self, label):
        """ Hook to be implemented by subclasses.
        The base implementation does nothing and returns None.
        """
        pass

    def _loadTable(self, tableName):
        """ Hook to be implemented by subclasses.
        The base implementation does nothing and returns None.
        """
        pass
class Table(object):
    """ Table to hold rows of data.

    A table contains an ordered set of columns (an int 'id' column is always
    added first) and a dictionary of rows keyed by row id. Each row is a
    namedtuple whose fields are the column names.
    """

    # Name of the column used by the "render" helpers below. It is declared
    # at class level so those helpers can be called safely (they return None
    # values) before setLabelToRender has been used.
    _labelToRender = None

    def __init__(self, *columns):
        """ Create the table with the given Column objects.
        An 'id' column of type int is always registered before them.
        """
        self._columns = OrderedDict()
        self._rowDict = OrderedDict()

        self._addColumn(Column('id', int))
        for col in columns:
            self._addColumn(col)

        colNames = [col.getName() for col in self.iterColumns()]
        # This imply that columns can only be added in __init__
        self.Row = namedtuple('Row', colNames)

    def setLabelToRender(self, labelToRender):
        """ Set the name of the column returned by getDataToRender. """
        self._labelToRender = labelToRender

    def _addColumn(self, col):
        """ Register a column under its name (same name replaces it). """
        self._columns[col.getName()] = col

    def getColumnValues(self, columnName):
        """ Return the values of a column, one per row.
        If the column does not exist, a list of None (one per row)
        is returned instead.
        """
        if self.hasColumn(columnName):
            return [getattr(row, columnName) for row in self.iterRows()]
        return [None] * self.getSize()

    def iterColumns(self):
        """ Iterate over the Column objects, in insertion order. """
        return self._columns.values()

    def hasColumn(self, columnName):
        """ Return true if column exists """
        return columnName in self._columns

    def getColumn(self, columnName):
        """ Return the Column with the given name.
        An Exception listing the current columns is raised if not found.
        """
        if columnName not in self._columns:
            raise Exception('Table: column "%s" not found.\nCurrent columns: %s' % (
                columnName, '\n'.join(self._columns.keys())))
        return self._columns[columnName]

    def hasEnabledColumn(self):
        """ Return true if enabled column exists """
        return self.hasColumn('enabled')

    def getColumns(self):
        """ Return all columns. """
        return self._columns.values()

    def getNumberOfColumns(self):
        """ Return the number of columns (including 'id'). """
        return len(self._columns)

    def getSize(self):
        """ Return the number of rows. """
        return len(self._rowDict)

    def getRows(self):
        """ Return all rows. """
        return list(self.iterRows())

    def getRow(self, rowId):
        """ Return the row stored under rowId (KeyError if missing). """
        return self._rowDict[rowId]

    def _setRow(self, rowId, row):
        """ Store (or replace) the row under rowId. """
        self._rowDict[rowId] = row

    def getDataToRenderAndExtra(self):
        """ Return a zip of (id, enabled, data to render, transformation
        matrix) with one tuple per row.
        """
        return zip(self.getIdColumn(),
                   self.getColumnValues("enabled"),
                   self.getDataToRender(),
                   self.getTransformationMatrix())

    def getDataToRender(self):
        """ Return the values of the column selected with setLabelToRender.
        A list of None is returned if no label was set or it is not a column.
        """
        return self.getColumnValues(self._labelToRender)

    def getIdColumn(self):
        """ Return the values of the 'id' column. """
        return self.getColumnValues("id")

    def getTransformationMatrix(self):
        """ Return the values of the '<label>_transformationMatrix' column,
        where <label> is the one set with setLabelToRender.
        A list of None is returned if no label was set or there is no
        such column.
        """
        if self._labelToRender is None:
            # No label selected yet: avoid concatenating None with a str.
            return [None] * self.getSize()
        return self.getColumnValues(self._labelToRender + "_transformationMatrix")

    def _convertValues(self, values):
        """ Convert the input values to the actual
        expected type of each column.
        Raises an Exception (from getColumn) for unknown column names.
        """
        cValues = {}
        for k, v in values.items():
            col = self.getColumn(k)
            cValues[k] = col.convert(v)

        return cValues

    def addRow(self, rowId, **values):
        """ With this implementation the rowId should be provided.
        We need to work around to also allow automatic generation of id's

        Columns missing from values take their default; an Exception is
        raised if a missing column has no default.
        """
        values['id'] = rowId

        for col in self.iterColumns():
            if col.getName() not in values:
                if col.hasDefault():
                    values[col.getName()] = col.getDefault()
                else:
                    raise Exception('Table: value for column "%s" not provided.' % col.getName())

        row = self.Row(**self._convertValues(values))
        self._setRow(rowId, row)

    def updateRow(self, rowId, **values):
        """ Update a row given its rowId and some values to update. """
        row = self.getRow(rowId)
        self._setRow(rowId, row._replace(**self._convertValues(values)))

    def iterRows(self):
        """ Iterate over the rows. """
        return self._rowDict.values()

    def getValueFromIndex(self, index, label):
        """ Return the value of the property 'label'
        in the element that has this 'index'.
        """
        value = list(self._rowDict.values())[index]._asdict()[label]
        return value

    def getIndexFromValue(self, value, label):
        """ Search the element that has property 'label'
        equals to value and returns its index (-1 if there is none).
        """
        for index, row in enumerate(self.iterRows()):
            if value == row._asdict()[label]:
                return index
        return -1

    def __str__(self):
        return '\n'.join(str(row) for row in self.iterRows())
# JMRT (2018-12-11) These constants are duplicated in showj; since this module
# is not widely used, I don't find such a dependency convenient.
COL_RENDER_NONE = 0
COL_RENDER_ID = 1
COL_RENDER_TEXT = 2
COL_RENDER_IMAGE = 3
COL_RENDER_CHECKBOX = 4
COL_RENDER_VOLUME = 5


class Column(object):
    """ Description of a table column: name, value type, default value,
    display label and render type (one of the COL_RENDER_* constants).
    """

    def __init__(self, colName, colType=None, default=None,
                 label=None, renderType=COL_RENDER_NONE):
        """
        :param colName: name of the column.
        :param colType: callable used by convert to coerce values
            (e.g. int, str). None means values are kept as they are.
        :param default: default value; None means "no default".
        :param label: display label, colName is used when it is empty.
        :param renderType: one of the COL_RENDER_* constants.
        """
        self._name = colName
        self._type = colType
        self._default = default
        self._label = label or colName
        self._renderType = renderType

    def getName(self):
        """ Return the column name. """
        return self._name

    def getLabel(self):
        """ Return the display label. """
        return self._label

    def getType(self):
        """ Return the column type (may be None). """
        return self._type

    def convert(self, value):
        """ Try to convert the value to the column type.
        If the column has no type the value is returned unchanged.
        """
        if self._type is None:
            # colType is optional: calling None would raise a TypeError.
            return value
        return self._type(value)

    def hasDefault(self):
        """ Return True if a default value (other than None) was given. """
        return self._default is not None

    def getDefault(self):
        """ Return the default value (None if there is no default). """
        return self._default

    def getRenderType(self):
        """ Return the render type. """
        return self._renderType

    def setRenderType(self, renderType):
        """ Change the render type. """
        self._renderType = renderType
class SqliteDataSet(DataSet):
    """ Provide a DataSet implementation based on sqlite file.
    The tables of the dataset will be the object tables in database.
    Each block is a table on the dataset.
    """

    def __init__(self, filename):
        """ Scan the sqlite file and register one table per
        PREFIX_Classes / PREFIX_Objects pair found in it.
        An Exception is raised if a Classes table has no Objects partner.
        """
        self._dbName = filename
        # Tables should be at pairs:
        # PREFIX_Classes
        # PREFIX_Objects
        # where PREFIX can be empty
        self.tablePrefixes = OrderedDict()

        db = SqliteDb()
        db._createConnection(filename, 1000)
        try:
            tables = db.getTables()
            for t in tables:
                if not t.endswith('Classes'):
                    continue
                # Strip only the trailing 'Classes'; str.replace would also
                # remove any other occurrence inside the prefix.
                prefix = t[:-len('Classes')]
                to = prefix + 'Objects'
                if to not in tables:
                    raise Exception('SqliteDataSet: table "%s" found, but not "%s"'
                                    % (t, to))
                # NOTE(review): flatDb is never closed here - confirm whether
                # SqliteFlatDb needs an explicit close.
                flatDb = SqliteFlatDb(filename, tablePrefix=prefix)
                tableName = prefix + self._getPlural(flatDb.getSelfClassName())
                self.tablePrefixes[tableName] = prefix
        finally:
            # Release the connection even if the scan above fails.
            db.close()

        DataSet.__init__(self, self.tablePrefixes.keys())

    def _getPlural(self, className):
        """ Get the plural of word for tables labels. """
        if className.startswith('Class'):
            return className.replace('Class', 'Classes')
        return className + 's'

    def _loadTable(self, tableName):
        """ Load information from tables PREFIX_Classes, PREFIX_Objects.

        The columns are read from PREFIX_Classes and the rows from
        PREFIX_Objects, where PREFIX is the one registered for tableName.
        Returns the populated Table.
        """
        prefix = self.tablePrefixes[tableName]

        db = SqliteDb()
        db._createConnection(self._dbName, 1000)
        try:
            columns, imgCols = self._readColumns(db, prefix)
            table = Table(*columns)
            self._fillTable(db, prefix, table, imgCols)
        finally:
            # The connection is only needed while loading.
            db.close()

        return table

    def _readColumns(self, db, prefix):
        """ Build the list of Column objects from the PREFIX_Classes table.

        Returns a (columns, imgCols) tuple, where imgCols maps the name of
        each image filename column to the name of its paired index column.
        """
        BASIC_COLUMNS = [Column('id', int, renderType=COL_RENDER_ID),
                         Column('enabled', bool, renderType=COL_RENDER_CHECKBOX),
                         Column('label', str),
                         Column('comment', str),
                         Column('creation', str)]
        # Load columns from PREFIX_Classes table
        columns = list(BASIC_COLUMNS)
        db.executeCommand("SELECT * FROM %sClasses;" % prefix)
        # This will store the images columns to join
        # the _index and the _filename
        imgCols = {}
        ctfSuffixes = ('_psdFile', '_enhanced_psd',
                       '_ctfmodel_quadrant', '_ctfmodel_halfplane')

        for row in db._iterResults():
            renderType = COL_RENDER_NONE
            colName = row['column_name']
            colLabel = row['label_property']

            # The 'self' row is skipped: no column is created for it.
            if colLabel == 'self':
                continue

            # Keep track of _index and _filename pairs to mark as
            # renderable images
            if colLabel.endswith('_index'):
                imgCols[colLabel.replace('_index', '')] = colName
            elif colLabel.endswith('_filename'):
                # TODO: Maybe not all the labels endswith "_filename"
                # have to be rendered.
                # for example in the RotSpectra with '_representative._filename'
                labelPrefix = colLabel.replace('_filename', '')
                if labelPrefix in imgCols:
                    renderType = COL_RENDER_IMAGE
                    imgCols[colName] = imgCols[labelPrefix]
            # CTF FIX
            elif colLabel.endswith(ctfSuffixes):
                renderType = COL_RENDER_IMAGE

            if row['class_name'] == 'Boolean':
                renderType = COL_RENDER_CHECKBOX

            columns.append(Column(colName, str, label=colLabel,
                                  renderType=renderType))

        return columns, imgCols

    def _fillTable(self, db, prefix, table, imgCols):
        """ Add to table one row per record of the PREFIX_Objects table.

        None values are stored as empty strings and the values of image
        columns (keys of imgCols) are rewritten as 'index@filename'.
        """
        checkedImgCols = {}  # Check if the image columns are volumes
        # FIXME: Move this to scipion-em? Maybe remove the whole module
        # that is not used?
        from pwem.emlib.image import ImageHandler
        ih = ImageHandler()

        # Populate the table in the DataSet
        db.executeCommand("SELECT * FROM %sObjects;" % prefix)
        for row in db._iterResults():
            rowDict = dict(row)
            # Only values of existing keys are replaced while iterating,
            # so the dictionary size never changes.
            for k, v in rowDict.items():
                if v is None:
                    rowDict[k] = ''
                # Set the index@filename for images columns values
                if k in imgCols:
                    colName = imgCols[k]
                    index = rowDict[colName]

                    filename = os.path.join(self.projectPath, rowDict[k])
                    filepath = filename.replace(":mrc", "")
                    if not checkedImgCols.get(colName, False):
                        if os.path.exists(filepath):
                            _x, _y, z, _n = ih.getDimensions((index, filename))
                            if z > 1:
                                table.getColumn(k).setRenderType(COL_RENDER_VOLUME)
                        # NOTE(review): marked as checked after the first row
                        # even if the file was missing - confirm this nesting
                        # against the original, indented source.
                        checkedImgCols[colName] = True
                    if index:
                        rowDict[k] = '%06d@%s' % (index, filename)
            table.addRow(row['id'], **rowDict)
class SingleFileDataSet(DataSet):
    """ DataSet implementation for single files such as Images or Volumes.
    It holds one single table, registered under the empty name, that
    contains one row with the file name.
    """

    def __init__(self, filename):
        self._filename = filename
        # The only table of this DataSet is registered with an empty name.
        self._tableName = ""
        DataSet.__init__(self, [self._tableName])
        # Built once here and handed out by every _loadTable call.
        self._table = self._createSingleTable()

    def _createSingleTable(self):
        """ Build the table with a 'filename' column and a single row
        (id 1) holding the file name given at construction time.
        """
        fileColumn = Column('filename', str, renderType=COL_RENDER_VOLUME)
        singleTable = Table(fileColumn)
        # FIXME: for single images we need to read the dimensions
        singleTable.addRow(1, filename=self._filename)

        return singleTable

    def _loadTable(self, tableName):
        """ Return the single table, whatever tableName is. """
        return self._table