Initial commit
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/README b/README
new file mode 100644
index 0000000..82a44d3
--- /dev/null
+++ b/README
@@ -0,0 +1,12 @@
+This small package provides a framework and an implementation of a relatively
+deep learning model for virtual screening, built on TensorFlow. The model is
+described in this paper:
+
+http://arxiv.org/abs/1502.02072
+
+The framework itself is more general and can be used for a variety of
+TensorFlow models.
+
+Note that this code is NOT fully functional as is. The input reading and label
+operations are specific to Google's implementation and need to be reimplemented
+for your environment.
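+
+For reference, training and evaluation would be invoked roughly as follows
+(paths and flag values below are hypothetical):
+
+  icml_train.py --config=/path/to/config.pbtxt --logdir=/tmp/icml --fold=0
+  icml_eval.py --config=/tmp/icml/config.pbtxt \
+    --checkpoint=/tmp/icml/model.ckpt-1000 \
+    --input_pattern='features-%d-of-5' --logdir=/tmp/icml/eval --fold=0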
diff --git a/biology/icml/icml_eval.py b/biology/icml/icml_eval.py
new file mode 100644
index 0000000..0c76402
--- /dev/null
+++ b/biology/icml/icml_eval.py
@@ -0,0 +1,78 @@
+#!/usr/bin/python
+#
+# Copyright 2015 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Evaluate a model from the ICML-2015 paper.
+
+This script requires a trained model with its associated config and checkpoint.
+If you don't have a trained model, run icml_train.py first.
+
+"""
+
+
+from nowhere.research.biology.collaborations.pande.py import utils
+
+from tensorflow.python.platform import app
+from tensorflow.python.platform import flags
+from tensorflow.python.platform import gfile
+
+from biology import model_config
+from biology.icml import icml_models
+
+flags.DEFINE_string('config', None, 'Serialized ModelConfig proto.')
+flags.DEFINE_string('checkpoint', None,
+ 'Model checkpoint file. File can contain either an '
+ 'absolute checkpoint (e.g. model.ckpt-{step}) or a '
+ 'serialized CheckpointState proto.')
+flags.DEFINE_string('input_pattern', None, 'Input file pattern; should '
+                    'include %d for fold index substitution.')
+flags.DEFINE_string('master', 'local', 'BNS name of the TensorFlow master.')
+flags.DEFINE_string('logdir', None, 'Directory for output files.')
+flags.DEFINE_integer('num_folds', 5, 'Number of cross-validation folds.')
+flags.DEFINE_integer('fold', None, 'Fold index for this model.')
+flags.DEFINE_enum('model_type', 'single', ['single', 'deep', 'deepaux', 'py',
+ 'pydrop1', 'pydrop2'],
+ 'Which model from the ICML paper should be trained/evaluated')
+FLAGS = flags.FLAGS
+
+
+def main(unused_argv=None):
+ config = model_config.ModelConfig()
+ config.ReadFromFile(FLAGS.config, overwrite='allowed')
+ gfile.MakeDirs(FLAGS.logdir)
+ model = icml_models.CONSTRUCTORS[FLAGS.model_type](config,
+ train=False,
+ logdir=FLAGS.logdir,
+ master=FLAGS.master)
+
+ if FLAGS.num_folds is not None and FLAGS.fold is not None:
+ folds = utils.kfold_pattern(FLAGS.input_pattern, FLAGS.num_folds,
+ FLAGS.fold)
+ _, test_pattern = folds.next()
+ test_pattern = ','.join(test_pattern)
+ else:
+ test_pattern = FLAGS.input_pattern
+
+ with model.graph.as_default():
+ model.Eval(model.ReadInput(test_pattern), FLAGS.checkpoint)
+
+
+if __name__ == '__main__':
+ flags.MarkFlagAsRequired('config')
+ flags.MarkFlagAsRequired('checkpoint')
+ flags.MarkFlagAsRequired('input_pattern')
+ flags.MarkFlagAsRequired('logdir')
+ app.run()
diff --git a/biology/icml/icml_models.py b/biology/icml/icml_models.py
new file mode 100644
index 0000000..e79a70f
--- /dev/null
+++ b/biology/icml/icml_models.py
@@ -0,0 +1,309 @@
+#!/usr/bin/python
+#
+# Copyright 2015 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TensorFlow implementation of the models from the ICML-2015 paper.
+
+
+hyperparam_dict = {
+ "single": Hyperparams(num_layers=1,
+ num_hidden=1200,
+ node_depth=1,
+ nonlinearity=ACTIVATION_RECTIFIED_LINEAR,
+ weight_init=GaussianWeightInit(0.01),
+ bias_init=ConstantBiasInit(0.5),
+ dropout=1.),
+ "deep": Hyperparams(num_layers=4,
+ num_hidden=1000,
+ node_depth=1,
+ nonlinearity=ACTIVATION_RECTIFIED_LINEAR,
+ weight_init=GaussianWeightInit(0.01),
+ bias_init=ConstantBiasInit(0.5),
+ dropout=1.),
+ "deepaux": Hyperparams(num_layers=4,
+ num_hidden=1000,
+ auxiliary_softmax_layers=[0, 1, 2],
+ auxiliary_softmax_weight=0.3,
+ node_depth=1,
+ nonlinearity=ACTIVATION_RECTIFIED_LINEAR,
+ weight_init=GaussianWeightInit(0.01),
+ bias_init=ConstantBiasInit(0.5),
+ dropout=1.),
+ "py": Hyperparams(num_layers=2,
+ num_hidden=[2000, 100],
+ node_depth=1,
+ nonlinearity=ACTIVATION_RECTIFIED_LINEAR,
+ weight_init=[GaussianWeightInit(0.01),
+ GaussianWeightInit(0.04)],
+ bias_init=[ConstantBiasInit(0.5),
+ ConstantBiasInit(3.0)],
+ dropout=1.),
+ "pydrop1": Hyperparams(num_layers=2,
+ num_hidden=[2000, 100],
+ node_depth=1,
+ nonlinearity=ACTIVATION_RECTIFIED_LINEAR,
+ weight_init=[GaussianWeightInit(0.01),
+ GaussianWeightInit(0.04)],
+ bias_init=[ConstantBiasInit(0.5),
+ ConstantBiasInit(3.0)],
+ dropout=[0.75, 1.]),
+ "pydrop2": Hyperparams(num_layers=2,
+ num_hidden=[2000, 100],
+ node_depth=1,
+ nonlinearity=ACTIVATION_RECTIFIED_LINEAR,
+ weight_init=[GaussianWeightInit(0.01),
+ GaussianWeightInit(0.04)],
+ bias_init=[ConstantBiasInit(0.5),
+ ConstantBiasInit(3.0)],
+ dropout=[0.75, 0.75])}
+"""
+
+import numpy as np
+
+import tensorflow as tf
+
+from tensorflow.python.platform import logging
+
+from biology import model
+from biology import model_ops
+from nowhere.mustreimplement import input_ops
+from nowhere.mustreimplement import label_ops
+from nowhere.learning.dist_belief import input_example_pb2
+from nowhere.learning.dist_belief import types_pb2 as legacy_types_pb2
+
+
+class UnreplicatedIcmlModel(model.Classifier):
+ """Implements an icml model as configured in a model_config.proto."""
+
+ def Build(self):
+ """Constructs the graph architecture as specified in its config.
+
+ This method creates the following Placeholders:
+ mol_features: Molecule descriptor (e.g. fingerprint) tensor with shape
+ batch_size x num_features.
+ """
+ with tf.name_scope(self.placeholder_scope):
+ self.mol_features = tf.placeholder(
+ tf.float32,
+ shape=[self.config.batch_size, self.config.num_features],
+ name='mol_features')
+
+ layer_sizes = self.config.layer_sizes
+ weight_init_stddevs = self.config.weight_init_stddevs
+ bias_init_consts = self.config.bias_init_consts
+ dropouts = self.config.dropouts
+ lengths_set = {
+ len(layer_sizes),
+ len(weight_init_stddevs),
+ len(bias_init_consts),
+ len(dropouts),
+ }
+ assert len(lengths_set) == 1, 'All layer params must have same length.'
+ num_layers = lengths_set.pop()
+ assert num_layers > 0, 'Must have some layers defined.'
+
+ prev_layer = self.mol_features
+ prev_layer_size = self.config.num_features
+ for i in xrange(num_layers):
+ layer = tf.nn.relu(model_ops.FullyConnectedLayer(
+ tensor=prev_layer,
+ size=layer_sizes[i],
+ weight_init=tf.truncated_normal(
+ shape=[prev_layer_size, layer_sizes[i]],
+ stddev=weight_init_stddevs[i]),
+ bias_init=tf.constant(value=bias_init_consts[i],
+ shape=[layer_sizes[i]])))
+ layer = model_ops.Dropout(layer, dropouts[i])
+ prev_layer = layer
+ prev_layer_size = layer_sizes[i]
+
+ self.output = model_ops.MultitaskLogits(
+ layer, self.config.num_classification_tasks)
+
+ def LabelsAndWeights(self):
+ """Parse Label protos and create tensors for labels and weights.
+
+ This method creates the following Placeholders in the graph:
+ labels: Tensor with shape batch_size x num_tasks containing serialized
+ Label protos.
+ """
+ config = self.config
+ with tf.name_scope(self.placeholder_scope):
+ labels = tf.placeholder(
+ tf.string,
+ shape=[config.batch_size, config.num_classification_tasks],
+ name='labels')
+ self.labels = label_ops.MultitaskLabelClasses(labels, config.num_classes)
+ self.weights = label_ops.MultitaskLabelWeights(labels)
+
+ def ReadInput(self, input_pattern, input_data_types=None):
+ """Read input data and return a generator for minibatches.
+
+ Args:
+ input_pattern: Input file pattern.
+ input_data_types: List of legacy_types_pb2 constants matching the
+ number of and data types present in the sstables. If not specified,
+ defaults to full ICML 259-task types, but can be specified
+ for unittests or other datasets with consistent types.
+
+ Returns:
+ A generator that yields a dict for feeding a single batch to Placeholders
+ in the graph.
+
+ Raises:
+ AssertionError: If no default session is available.
+ """
+ if model_ops.IsTraining():
+ randomize = True
+ num_iterations = None
+ else:
+ randomize = False
+ num_iterations = 1
+
+ num_tasks = self.config.num_classification_tasks
+ tasks_in_input = self.config.tasks_in_input
+ if input_data_types is None:
+ input_data_types = ([legacy_types_pb2.DF_FLOAT] +
+ [legacy_types_pb2.DF_LABEL_PROTO] * tasks_in_input)
+ features, labels = input_ops.InputExampleInputReader(
+ input_pattern=input_pattern,
+ batch_size=self.config.batch_size,
+ num_tasks=num_tasks,
+ input_data_types=input_data_types,
+ num_features=self.config.num_features,
+ randomize=randomize,
+ shuffling=randomize,
+ num_iterations=num_iterations)
+
+ return self._ReadInputGenerator(features, labels[:, :num_tasks])
+
+  def _GetFeedDict(self, named_values):
+    """Constructs a feed_dict keyed by fully-qualified placeholder names.
+
+    Args:
+      named_values: Dict mapping placeholder base names to values.
+
+    Returns:
+      Dict mapping '<placeholder_root>/<name>:0' tensor names to values.
+    """
+ feed_dict = {}
+ for name, value in named_values.iteritems():
+ feed_dict['{}/{}:0'.format(self.placeholder_root, name)] = value
+
+ return feed_dict
+
+ def EvalBatch(self, input_batch):
+ """Runs inference on the provided batch of input.
+
+ Args:
+ input_batch: iterator of input with len self.config.batch_size.
+
+ Returns:
+ Tuple of three numpy arrays with shape num_examples x num_tasks (x ...):
+ output: Model predictions.
+ labels: True labels. numpy array values are scalars,
+ not 1-hot classes vector.
+ weights: Example weights.
+ """
+ output, labels, weights = super(UnreplicatedIcmlModel, self).EvalBatch(
+ input_batch)
+
+ # Converts labels from 1-hot to float.
+ labels = labels[:, :, 1] # Whole batch, all tasks, 1-hot positive index.
+ return output, labels, weights
+
+ def BatchInputGenerator(self, serialized_batch):
+ """Returns a generator that iterates over the provided batch of input.
+
+ TODO(user): This is similar to input_ops.InputExampleInputReader(),
+ but doesn't need to be executed as part of the TensorFlow graph.
+ Consider refactoring so these can share code somehow.
+
+ Args:
+ serialized_batch: List of tuples: (_, value) where value is
+ a serialized InputExample proto. Must have self.config.batch_size
+ length or smaller. If smaller, we'll pad up to batch_size
+ and mark the padding as invalid so it's ignored in eval metrics.
+ Yields:
+ Dict of model inputs for use as a feed_dict.
+
+ Raises:
+ ValueError: If the batch is larger than the batch_size.
+ """
+ if len(serialized_batch) > self.config.batch_size:
+ raise ValueError(
+ 'serialized_batch length {} must be <= batch_size {}'.format(
+ len(serialized_batch), self.config.batch_size))
+ for _ in xrange(self.config.batch_size - len(serialized_batch)):
+ serialized_batch.append((None, ''))
+
+ features = []
+ labels = []
+ for _, serialized_proto in serialized_batch:
+ if serialized_proto:
+ input_example = input_example_pb2.InputExample()
+ input_example.ParseFromString(serialized_proto)
+ features.append([f for f in input_example.endpoint[0].float_value])
+ label_protos = [endpoint.label
+ for endpoint in input_example.endpoint[1:]]
+ assert len(label_protos) == self.config.num_classification_tasks
+ labels.append([l.SerializeToString() for l in label_protos])
+ else:
+ # This was a padded value to reach the batch size.
+ features.append([0.0 for _ in xrange(self.config.num_features)])
+ labels.append(
+ ['' for _ in xrange(self.config.num_classification_tasks)])
+
+ valid = np.asarray([(np.sum(f) > 0) for f in features])
+
+ assert len(features) == self.config.batch_size
+ assert len(labels) == self.config.batch_size
+ assert len(valid) == self.config.batch_size
+ yield self._GetFeedDict({
+ 'mol_features': features,
+ 'labels': labels,
+ 'valid': valid
+ })
+
+ def _ReadInputGenerator(self, features_tensor, labels_tensor):
+ """Generator that constructs feed_dict for minibatches.
+
+ Args:
+ features_tensor: Tensor of batch_size x molecule features.
+ labels_tensor: Tensor of batch_size x label protos.
+
+ Yields:
+ A dict for feeding a single batch to Placeholders in the graph.
+
+ Raises:
+ AssertionError: If no default session is available.
+ """
+ sess = tf.get_default_session()
+ if sess is None:
+ raise AssertionError('No default session')
+ while True:
+ try:
+ logging.vlog(1, 'Starting session execution to get input data')
+ features, labels = sess.run([features_tensor, labels_tensor])
+ logging.vlog(1, 'Done with session execution to get input data')
+        # TODO(user): check whether axis=1 below needs to change to axis=0
+        # because of cl/105081140.
+ valid = np.sum(features, axis=1) > 0
+ yield self._GetFeedDict({
+ 'mol_features': features,
+ 'labels': labels,
+ 'valid': valid
+ })
+
+ except tf.OpError as e:
+ # InputExampleInput op raises OpError when it has hit num_iterations
+ # or its input file is exhausted. However it may also be raised
+ # if the input sstable isn't what we expect.
+ if 'Invalid InputExample' in e.message:
+ raise e
+ else:
+ break
+
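+
+# NOTE: icml_train.py and icml_eval.py reference icml_models.IcmlModel and
+# icml_models.CONSTRUCTORS, which are not defined above. A minimal sketch of
+# what they might look like (assumed; every variant maps to the same class
+# here since the per-variant hyperparameters live in the model config):
+IcmlModel = UnreplicatedIcmlModel
+CONSTRUCTORS = {
+    'single': UnreplicatedIcmlModel,
+    'deep': UnreplicatedIcmlModel,
+    'deepaux': UnreplicatedIcmlModel,
+    'py': UnreplicatedIcmlModel,
+    'pydrop1': UnreplicatedIcmlModel,
+    'pydrop2': UnreplicatedIcmlModel,
+}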
diff --git a/biology/icml/icml_train.py b/biology/icml/icml_train.py
new file mode 100644
index 0000000..ed1ed0c
--- /dev/null
+++ b/biology/icml/icml_train.py
@@ -0,0 +1,133 @@
+#!/usr/bin/python
+#
+# Copyright 2015 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Train a model from the ICML-2015 paper.
+"""
+
+import os
+
+
+from tensorflow.python.platform import app
+from tensorflow.python.platform import flags
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import logging
+
+from biology import model_config
+from biology.icml import icml_models
+
+flags.DEFINE_string('config', None, 'Serialized ModelConfig proto.')
+flags.DEFINE_string('master', '', 'BNS name of the TensorFlow master.')
+flags.DEFINE_string('logdir', None, 'Directory for output files.')
+flags.DEFINE_integer('replica_id', 0, 'Task ID of this replica.')
+flags.DEFINE_integer('ps_tasks', 0, 'Number of parameter server tasks.')
+flags.DEFINE_integer('num_folds', 5, 'Number of cross-validation folds.')
+flags.DEFINE_integer('fold', None, 'Fold index for this model.')
+
+FLAGS = flags.FLAGS
+
+
+def kfold_pattern(input_pattern, num_folds, fold=None):
+ """Generator for train/test filename splits.
+
+  Patterns are not expanded into filenames; only the %d is replaced with the
+  fold index.
+
+ Args:
+ input_pattern: Input filename pattern. Should contain %d for fold index.
+ num_folds: Number of folds.
+ fold: If not None, the generator only yields the train/test split for the
+ given fold.
+
+ Yields:
+ train_filenames: A list of file patterns in training set.
+ test_filenames: A list of file patterns in test set.
+ """
+ # get filenames associated with each fold
+ fold_filepatterns = [input_pattern % i for i in range(num_folds)]
+
+ # create train/test splits
+ for i in range(num_folds):
+ if fold is not None and i != fold:
+ continue
+ train = fold_filepatterns[:i] + fold_filepatterns[i+1:]
+ test = [fold_filepatterns[i]]
+    if any(f in test for f in train):
+      logging.fatal('Train and test sets overlap.')
+ if set(train + test) != set(fold_filepatterns):
+ logging.fatal('Not all input files are accounted for.')
+ yield train, test
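+
+# Example (hypothetical pattern): with input_pattern='features-%d-of-5',
+# num_folds=5 and fold=0, this yields train=['features-1-of-5', ...,
+# 'features-4-of-5'] and test=['features-0-of-5'].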
+
+
+def Run(input_data_types=None):
+ """Trains the model with specified parameters.
+
+ Args:
+ input_data_types: List of legacy_types_pb2 constants or None.
+ """
+ config = model_config.ModelConfig({
+ 'input_pattern': '', # Should have %d for fold index substitution.
+ 'num_classification_tasks': 259,
+ 'tasks_in_input': 259, # Dimensionality of sstables
+ 'max_steps': 50000000,
+ 'summaries': False,
+ 'batch_size': 128,
+ 'learning_rate': 0.0003,
+ 'num_classes': 2,
+ 'optimizer': 'sgd',
+ 'penalty': 0.0,
+ 'num_features': 1024,
+ 'layer_sizes': [1200],
+ 'weight_init_stddevs': [0.01],
+ 'bias_init_consts': [0.5],
+ 'dropouts': [0.0],
+ })
+ config.ReadFromFile(FLAGS.config,
+ overwrite='required')
+
+ if FLAGS.replica_id == 0:
+ gfile.MakeDirs(FLAGS.logdir)
+ config.WriteToFile(os.path.join(FLAGS.logdir, 'config.pbtxt'))
+
+ model = icml_models.IcmlModel(config,
+ train=True,
+ logdir=FLAGS.logdir,
+ master=FLAGS.master)
+
+ if FLAGS.num_folds is not None and FLAGS.fold is not None:
+ folds = kfold_pattern(config.input_pattern, FLAGS.num_folds,
+ FLAGS.fold)
+ train_pattern, _ = folds.next()
+ train_pattern = ','.join(train_pattern)
+ else:
+ train_pattern = config.input_pattern
+
+  with model.graph.as_default():
+    # Model.Train() takes no replica_id/ps_tasks arguments; those flags are
+    # presumably consumed by an internal replicated trainer.
+    model.Train(model.ReadInput(train_pattern,
+                                input_data_types=input_data_types),
+                max_steps=config.max_steps,
+                summaries=config.summaries)
+
+
+def main(unused_argv=None):
+ Run()
+
+
+if __name__ == '__main__':
+ flags.MarkFlagAsRequired('config')
+ flags.MarkFlagAsRequired('logdir')
+ flags.MarkFlagAsRequired('fold')
+ app.run()
diff --git a/biology/metrics.py b/biology/metrics.py
new file mode 100644
index 0000000..fcc655a
--- /dev/null
+++ b/biology/metrics.py
@@ -0,0 +1,99 @@
+#!/usr/bin/python
+#
+# Copyright 2015 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Evaluation metrics."""
+
+import collections
+
+
+import numpy as np
+from sklearn import metrics
+
+
+def kappa_score(y_true, y_pred):
+ """Calculate Cohen's kappa for classification tasks.
+
+ See https://en.wikipedia.org/wiki/Cohen%27s_kappa
+
+ Note that this implementation of Cohen's kappa expects binary labels.
+
+ Args:
+ y_true: Numpy array containing true values.
+ y_pred: Numpy array containing predicted values.
+
+ Returns:
+    kappa: Float kappa value for the task.
+
+ Raises:
+ AssertionError: If y_true and y_pred are not the same size, or if class
+ labels are not in [0, 1].
+ """
+ assert len(y_true) == len(y_pred), 'Number of examples does not match.'
+ yt = np.asarray(y_true, dtype=int)
+ yp = np.asarray(y_pred, dtype=int)
+ assert np.array_equal(np.unique(yt), [0, 1]), (
+ 'Class labels must be binary: %s' % np.unique(yt))
+ observed_agreement = np.true_divide(np.count_nonzero(np.equal(yt, yp)),
+ len(yt))
+ expected_agreement = np.true_divide(
+ np.count_nonzero(yt == 1) * np.count_nonzero(yp == 1) +
+ np.count_nonzero(yt == 0) * np.count_nonzero(yp == 0),
+ len(yt) ** 2)
+ kappa = np.true_divide(observed_agreement - expected_agreement,
+ 1.0 - expected_agreement)
+ return kappa
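+
+# Worked example: with y_true=[1, 0, 1, 0] and y_pred=[1, 0, 0, 0], observed
+# agreement is 3/4 = 0.75, expected agreement is (2*1 + 2*3) / 4**2 = 0.5,
+# and kappa = (0.75 - 0.5) / (1 - 0.5) = 0.5 (this matches metrics_test.py).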
+
+
+def compute_metric(y_true, y_pred, metric_str, threshold=0.5):
+ """Compute a metric value.
+
+ Args:
+ y_true: A list of arrays containing true values for each task.
+ y_pred: A list of arrays containing predicted values for each task.
+    metric_str: String description of the metric to compute. Must be a key
+      in METRICS.
+ threshold: Float threshold to apply to probabilities for positive/negative
+ class assignment.
+
+ Returns:
+ Float metric value.
+
+ Raises:
+ NotImplementedError: If metric_str is not in METRICS.
+ """
+ if metric_str not in METRICS:
+ raise NotImplementedError('Unsupported metric %s' % metric_str)
+ metric_tuple = METRICS[metric_str]
+ if metric_tuple.threshold:
+ y_pred = np.greater(y_pred, threshold)
+ return metric_tuple.func(y_true, y_pred)
+
+
+class Metric(collections.namedtuple('MetricTuple', ['func', 'threshold'])):
+ """A named tuple used to organize model evaluation metrics.
+
+  Attributes:
+ func: Function to call. Should take true and predicted values (in that
+ order) and compute the metric.
+ threshold: Boolean indicating whether float values should be converted to
+ binary labels prior to computing the metric, e.g. accuracy.
+ """
+
+METRICS = {
+ 'accuracy': Metric(metrics.accuracy_score, True),
+ 'auc': Metric(metrics.roc_auc_score, False),
+ 'kappa': Metric(kappa_score, True),
+ 'r2': Metric(metrics.r2_score, False),
+}
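+
+# Example usage (hypothetical values): for thresholded metrics such as
+# 'accuracy' and 'kappa', predictions are binarized at `threshold` first:
+#   compute_metric(np.array([0, 1, 1, 0]), np.array([0.1, 0.8, 0.4, 0.3]),
+#                  'accuracy')  # predictions become [0, 1, 0, 0] -> 0.75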
diff --git a/biology/metrics_test.py b/biology/metrics_test.py
new file mode 100644
index 0000000..ea23cdb
--- /dev/null
+++ b/biology/metrics_test.py
@@ -0,0 +1,39 @@
+#!/usr/bin/python
+#
+# Copyright 2015 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for metrics."""
+
+
+import numpy as np
+
+from tensorflow.python.platform import googletest
+
+from biology import metrics
+
+
+class MetricsTest(googletest.TestCase):
+
+ def test_kappa_score(self):
+ y_true = [1, 0, 1, 0]
+ y_pred = [0.8, 0.2, 0.3, 0.4] # [1, 0, 0, 0] with 0.5 threshold
+ kappa = metrics.kappa_score(y_true, np.greater(y_pred, 0.5))
+ observed_agreement = 3.0 / 4.0
+ expected_agreement = ((2 * 1) + (2 * 3)) / 4.0 ** 2
+ expected_kappa = np.true_divide(observed_agreement - expected_agreement,
+ 1.0 - expected_agreement)
+    self.assertAlmostEqual(kappa, expected_kappa)
+
+
+if __name__ == '__main__':
+ googletest.main()
diff --git a/biology/model.py b/biology/model.py
new file mode 100644
index 0000000..b8cf4a0
--- /dev/null
+++ b/biology/model.py
@@ -0,0 +1,989 @@
+#!/usr/bin/python
+#
+# Copyright 2015 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Helper operations and classes for general model building.
+
+These methods are generally dependent on ModelConfig.
+"""
+
+import collections
+import cPickle as pickle
+import os
+import time
+import warnings
+
+
+import numpy as np
+import pandas as pd
+from sklearn import metrics as sklearn_metrics
+import tensorflow as tf
+
+from tensorflow.python.platform import logging
+
+from tensorflow.python.platform import gfile
+
+from biology import metrics as biology_metrics
+from biology import model_ops
+from biology import utils as biology_utils
+
+
+class Model(object):
+ """Generic base class for defining, training, and evaluating models.
+
+ Subclasses must implement the following methods:
+ AddOutputOps
+ Build
+ Eval
+ ReadInput / _ReadInputGenerator
+ BatchInputGenerator (if you want to use mr_eval/EvalBatch)
+ TrainingCost
+
+ Subclasses must set the following attributes:
+ loss: Op to calculate training cost used for gradient calculation.
+ output: Op(s) for model output for each task.
+ labels: Op(s) for true labels for each task.
+ weights: Op(s) for example weights for each task.
+ global_step: Scalar variable tracking total training/eval steps.
+ updates: Op(s) for running updates of e.g. moving averages for batch
+ normalization. Should be set to tf.no_op() if no updates are required.
+
+ This base class provides the following attributes:
+ config: ModelConfig containing model configuration parameters.
+ graph: TensorFlow graph object.
+ logdir: Path to the file output directory to store checkpoints etc.
+ master: TensorFlow session master specification string.
+ num_tasks: Integer number of tasks this model trains/evals on.
+ placeholder_root: String placeholder prefix, used to create
+ placeholder_scope.
+ placeholder_scope: name scope where tf.placeholders are defined.
+ summary_writer: SummaryWriter for writing summaries.
+ valid: Placeholder for a boolean tensor with shape batch_size to use as a
+ mask when calculating gradient costs.
+
+ Args:
+ config: ModelConfig.
+ train: If True, model is in training mode.
+ logdir: Directory for output files.
+ graph: Default graph.
+ master: BNS name of TensorFlow master.
+ summary_writer: SummaryWriter to use for writing summaries. If None, a new
+ SummaryWriter will be created.
+ """
+
+ def __init__(self,
+ config,
+ train,
+ logdir,
+ graph=None,
+ master='local',
+ summary_writer=None):
+ self.config = config
+ self.graph = graph if graph is not None else tf.Graph()
+ self.logdir = logdir
+ self.master = master
+
+ # Path to save checkpoint files, which matches the
+ # replicated supervisor's default path.
+ self._save_path = os.path.join(logdir, 'model.ckpt')
+
+    # Shared TensorFlow session used for running training/eval batches.
+    # Lazily created by _SharedSession().
+ self._shared_session = None
+
+ # Guard variable to make sure we don't Restore() this model
+ # from a disk checkpoint more than once.
+ self._restored_model = False
+
+ # Cache of TensorFlow scopes, to prevent '_1' appended scope names
+ # when subclass-overridden methods use the same scopes.
+ self._name_scopes = {}
+
+ with self.graph.as_default():
+ model_ops.SetTraining(train)
+ self.placeholder_root = 'placeholders'
+ with tf.name_scope(self.placeholder_root) as scope:
+ self.placeholder_scope = scope
+ self.valid = tf.placeholder(tf.bool,
+ shape=[config.batch_size],
+ name='valid')
+
+ num_classification_tasks = config.GetOptionalParam(
+ 'num_classification_tasks', 0)
+ num_regression_tasks = config.GetOptionalParam('num_regression_tasks', 0)
+ if num_classification_tasks and num_regression_tasks:
+ raise AssertionError(
+ 'Dual classification/regression models are not supported.')
+ self.num_tasks = num_classification_tasks + num_regression_tasks
+ if self.num_tasks == 0:
+ raise AssertionError('Must specify one of '
+ 'num_classification_tasks or num_regression_tasks.')
+
+ if summary_writer is None:
+ summary_writer = tf.train.SummaryWriter(logdir)
+ self.summary_writer = summary_writer
+
+ def Build(self):
+ """Define the core model.
+
+ NOTE(user): Operations defined here should be in their own name scope to
+ avoid any ambiguity when restoring checkpoints.
+
+ Raises:
+ NotImplementedError: if not overridden by concrete subclass.
+ """
+ raise NotImplementedError('Must be overridden by concrete subclass')
+
+ def LabelPlaceholders(self):
+ """Add Placeholders for labels for each task.
+
+ This method creates the following Placeholders for each task:
+ labels_%d: Float label tensor. For classification tasks, this tensor will
+ have shape batch_size x num_classes. For regression tasks, this tensor
+ will have shape batch_size.
+
+ Raises:
+ NotImplementedError: if not overridden by concrete subclass.
+ """
+ raise NotImplementedError('Must be overridden by concrete subclass')
+
+ def WeightPlaceholders(self):
+ """Add Placeholders for example weights for each task.
+
+ This method creates the following Placeholders for each task:
+ weights_%d: Label tensor with shape batch_size.
+
+ Placeholders are wrapped in identity ops to avoid the error caused by
+ feeding and fetching the same tensor.
+ """
+ weights = []
+ for task in xrange(self.num_tasks):
+ with tf.name_scope(self.placeholder_scope):
+ weights.append(tf.identity(
+ tf.placeholder(tf.float32, shape=[self.config.batch_size],
+ name='weights_%d' % task)))
+ self.weights = weights
+
+ def LabelsAndWeights(self):
+ """Add Placeholders for labels and weights.
+
+ This method results in the creation of the following Placeholders for each
+ task:
+ labels_%d: Float label tensor. For classification tasks, this tensor will
+ have shape batch_size x num_classes. For regression tasks, this tensor
+ will have shape batch_size.
+ weights_%d: Label tensor with shape batch_size.
+
+ This method calls self.LabelPlaceholders and self.WeightPlaceholders; the
+ former method must be implemented by a concrete subclass.
+ """
+ self.LabelPlaceholders()
+ self.WeightPlaceholders()
+
+ def ReadInput(self, input_pattern):
+ """Read input data and return a generator for minibatches.
+
+ Args:
+ input_pattern: Input file pattern.
+
+ Returns:
+ A generator that yields a dict for feeding a single batch to Placeholders
+ in the graph.
+
+ Raises:
+ NotImplementedError: if not overridden by concrete subclass.
+ """
+ raise NotImplementedError('Must be overridden by concrete subclass')
+
+ def _ReadInputGenerator(self, names, tensors):
+ """Generator that constructs feed_dict for minibatches.
+
+ ReadInput cannot be a generator because any reading ops will not be added to
+ the graph until .next() is called (which is too late). Instead, ReadInput
+ should perform any necessary graph construction and then return this
+ generator.
+
+ Args:
+ names: A list of tensor names.
+ tensors: A list of tensors to evaluate.
+
+ Yields:
+ A dict for feeding a single batch to Placeholders in the graph.
+
+ Raises:
+ NotImplementedError: if not overridden by concrete subclass.
+ """
+ raise NotImplementedError('Must be overridden by concrete subclass')
+
+ def _SharedNameScope(self, name):
+ """Returns a singleton TensorFlow scope with the given name.
+
+ Used to prevent '_1'-appended scopes when sharing scopes with child classes.
+
+ Args:
+ name: String. Name scope for group of operations.
+ Returns:
+ tf.name_scope with the provided name.
+ """
+ if name not in self._name_scopes:
+ with tf.name_scope(name) as scope:
+ self._name_scopes[name] = scope
+
+ return tf.name_scope(self._name_scopes[name])
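+
+  # For example, two calls to self._SharedNameScope('costs') both re-enter
+  # the captured 'costs/' scope rather than creating a new 'costs_1/' scope,
+  # because tf.name_scope reuses a scope string that ends with '/'.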
+
+ def Cost(self, output, labels, weights):
+ """Calculate single-task training cost for a batch of examples.
+
+ Args:
+ output: Tensor with model outputs.
+ labels: Tensor with true labels.
+ weights: Tensor with shape batch_size containing example weights.
+
+ Returns:
+ A tensor with shape batch_size containing the weighted cost for each
+      example. For use in subclasses that want to calculate additional costs.
+
+    Raises:
+      NotImplementedError: if not overridden by concrete subclass.
+    """
+ # TODO(user): for mixed classification/regression models, pass in a task
+ # index to control the cost calculation
+ raise NotImplementedError('Must be overridden by concrete subclass')
+
+  def TrainingCost(self):
+    """Calculate the training cost and set self.loss.
+
+    Returns:
+      A list of tensors with shape batch_size containing the weighted cost
+      for each example, one per task. For use in subclasses that want to
+      calculate additional costs.
+    """
+ self.RequireAttributes(['output', 'labels', 'weights'])
+ epsilon = 1e-3 # small float to avoid dividing by zero
+ config = self.config
+ weighted_costs = [] # weighted costs for each example
+ gradient_costs = [] # costs used for gradient calculation
+ old_costs = [] # old-style cost
+
+ with self._SharedNameScope('costs'):
+ for task in xrange(self.num_tasks):
+ task_str = str(task).zfill(len(str(self.num_tasks)))
+ with self._SharedNameScope('cost_{}'.format(task_str)):
+ with tf.name_scope('weighted'):
+ weighted_cost = self.Cost(self.output[task], self.labels[task],
+ self.weights[task])
+ weighted_costs.append(weighted_cost)
+
+ with tf.name_scope('gradient'):
+ # Note that we divide by the batch size and not the number of
+ # non-zero weight examples in the batch. Also, instead of using
+ # tf.reduce_mean (which can put ops on the CPU) we explicitly
+ # calculate with div/sum so it stays on the GPU.
+ gradient_cost = tf.div(tf.reduce_sum(weighted_cost),
+ config.batch_size)
+ tf.scalar_summary('cost' + task_str,
+ model_ops.MovingAverage(gradient_cost,
+ self.global_step))
+ gradient_costs.append(gradient_cost)
+
+ with tf.name_scope('old_cost'):
+ old_cost = tf.div(
+ tf.reduce_sum(weighted_cost),
+ tf.reduce_sum(self.weights[task]) + epsilon)
+ tf.scalar_summary('old-cost' + task_str,
+ model_ops.MovingAverage(old_cost,
+ self.global_step))
+ old_costs.append(old_cost)
+
+ # aggregated costs
+ with self._SharedNameScope('aggregated'):
+ with tf.name_scope('gradient'):
+ loss = tf.add_n(gradient_costs)
+ with tf.name_scope('old_cost'):
+ old_loss = tf.add_n(old_costs)
+
+ # weight decay
+ if config.penalty != 0.0:
+ penalty = WeightDecay(config)
+ loss += penalty
+ old_loss += penalty
+
+ # loss used for gradient calculation
+ self.loss = loss
+
+ # (smoothed) summaries
+ tf.scalar_summary('Total Cost',
+ model_ops.MovingAverage(loss, self.global_step))
+ tf.scalar_summary('Total Old-Style Cost',
+ model_ops.MovingAverage(old_loss, self.global_step))
+
+ return weighted_costs
+
+ def Setup(self):
+ """Add ops common to training/eval to the graph."""
+ with tf.name_scope('core_model'):
+ self.Build()
+ self.LabelsAndWeights()
+ self.global_step = tf.Variable(0, name='global_step', trainable=False)
+
+ def MergeUpdates(self):
+ """Group updates into a single op."""
+ updates = tf.get_default_graph().get_collection('updates')
+ if updates:
+ self.updates = tf.group(*updates, name='updates')
+ else:
+ self.updates = tf.no_op(name='updates')
+
+ def TrainingOp(self):
+ """Get training op for applying gradients to variables.
+
+ Subclasses that need to do anything fancy with gradients should override
+ this method.
+
+ Returns:
+ A training op.
+ """
+ opt = Optimizer(self.config)
+ return opt.minimize(self.loss, global_step=self.global_step, name='train')
+
+ def SummaryOp(self):
+ """Get summary op for computing all summaries during training.
+
+ Returns:
+ A summary op.
+ """
+ return tf.merge_all_summaries()
+
+ def Train(self,
+ input_generator,
+ max_steps=None,
+ summaries=False,
+ save_model_secs=60,
+ save_summary_secs=30,
+ max_checkpoints_to_keep=5):
+ """Train the model.
+
+ Args:
+ input_generator: Generator that returns a feed_dict for feeding
+ Placeholders in the model graph. Usually this will be ReadInput with a
+ provided input pattern.
+ max_steps: Maximum number of training steps. If not provided, will
+ train indefinitely.
+ summaries: If True, add summaries for model parameters.
+ save_model_secs: Integer. Saves a checkpoint at this interval in seconds.
+ save_summary_secs: Integer. Saves a summary event file at this interval in
+ seconds.
+ max_checkpoints_to_keep: Integer. Maximum number of checkpoints to keep;
+ older checkpoints will be deleted.
+
+ Raises:
+ AssertionError: If model is not in training mode.
+ """
+ assert model_ops.IsTraining()
+ self.Setup()
+ self.TrainingCost()
+ self.MergeUpdates()
+ self.RequireAttributes(['loss', 'global_step', 'updates'])
+ if summaries:
+ self.AddSummaries()
+ train_op = self.TrainingOp()
+ summary_op = self.SummaryOp()
+ no_op = tf.no_op()
+ tf.train.write_graph(
+ tf.get_default_graph().as_graph_def(), self.logdir, 'train.pbtxt')
+ self.summary_writer.add_graph(tf.get_default_graph().as_graph_def())
+ last_checkpoint_time = time.time()
+ last_summary_time = time.time()
+ with self._SharedSession() as sess:
+ sess.run(tf.initialize_all_variables())
+ saver = tf.train.Saver(max_to_keep=max_checkpoints_to_keep)
+ # Save an initial checkpoint.
+ saver.save(sess, self._save_path, global_step=self.global_step)
+ for feed_dict in input_generator:
+ # Run training op and compute summaries.
+ secs_since_summary = time.time() - last_summary_time
+ if secs_since_summary > save_summary_secs:
+ this_summary_op = summary_op
+ else:
+ this_summary_op = no_op
+ step, loss, _, summary = sess.run(
+ [train_op.values()[0], self.loss, self.updates, this_summary_op],
+ feed_dict=feed_dict)
+ if summary is not None:
+ self.summary_writer.add_summary(summary, global_step=step)
+ last_summary_time = time.time()
+ # Save model checkpoints.
+ secs_since_checkpoint = time.time() - last_checkpoint_time
+ if secs_since_checkpoint > save_model_secs:
+ logging.info('step %d: %g', step, loss)
+ saver.save(sess, self._save_path, global_step=self.global_step)
+ last_checkpoint_time = time.time()
+ # Quit when we reach max_steps.
+ if max_steps is not None and step >= max_steps:
+ break
+ # Always save a final checkpoint when complete.
+ saver.save(sess, self._save_path, global_step=self.global_step)
+
+ def AddOutputOps(self):
+ """Add ops for inference.
+
+ Default implementation is pass, derived classes can override as needed.
+ """
+ pass
+
+ def _SharedSession(self):
+ if not self._shared_session:
+ # allow_soft_placement=True allows ops without a GPU implementation
+ # to run on the CPU instead.
+ config = tf.ConfigProto(allow_soft_placement=True)
+ self._shared_session = tf.Session(self.master, config=config)
+ return self._shared_session
+
+ def CloseSharedSession(self):
+ if self._shared_session:
+ self._shared_session.close()
+
+ def Restore(self, checkpoint):
+ """Restores the model from the provided training checkpoint.
+
+ Args:
+ checkpoint: string. Path to checkpoint file.
+ """
+ if self._restored_model:
+ return
+
+ with self.graph.as_default():
+ self.Setup()
+ self.AddOutputOps() # add softmax heads
+      saver = tf.train.Saver(tf.all_variables())
+ saver.restore(self._SharedSession(),
+ biology_utils.ParseCheckpoint(checkpoint))
+ self.global_step_number = int(self._SharedSession().run(self.global_step))
+
+ self._restored_model = True
+
+ def Eval(self, input_generator, checkpoint, metrics=None):
+ """Evaluate the model.
+
+ Args:
+ input_generator: Generator that returns a feed_dict for feeding
+ Placeholders in the model graph.
+ checkpoint: Checkpoint filename.
+ metrics: List of metrics to compute. Defaults to self.default_metrics,
+ which is set in subclasses.
+
+ Returns:
+ step: Global step for this eval.
+ results: A dict mapping metric names to numpy arrays containing metric
+ values for each task.
+ """
+ self.Restore(checkpoint)
+ output, labels, weights = self.ModelOutput(input_generator)
+ y_true, y_pred = self.ParseModelOutput(output, labels, weights)
+
+ # keep counts for each class as a sanity check
+ counts = self.ExampleCounts(y_true)
+
+ # compute metrics
+ if metrics is None:
+ metrics = self.default_metrics
+ metric_values = {}
+ for metric in metrics:
+ metric_values[metric] = self.ComputeMetric(y_true, y_pred, metric)
+ self.ReportEval(metric_values, counts=counts,
+ global_step=self.global_step_number)
+ return self.global_step_number, metric_values
+
+ def ComputeMetric(self, y_true, y_pred, metric_str, threshold=0.5):
+ """Compute a performance metric for each task.
+
+ Args:
+ y_true: A list of arrays containing true values for each task.
+ y_pred: A list of arrays containing predicted values for each task.
+ metric_str: String description of the metric to compute. Must be in
+ biology_metrics.METRICS.
+ threshold: Float threshold to apply to probabilities for positive/negative
+ class assignment.
+
+ Returns:
+      A list of metric values, one per task (np.nan for tasks where the
+      metric could not be computed).
+ """
+ computed_metrics = []
+ for task in xrange(self.num_tasks):
+ yt = y_true[task]
+ yp = y_pred[task]
+ try:
+ metric_value = biology_metrics.compute_metric(yt, yp, metric_str,
+ threshold=threshold)
+ except (AssertionError, ValueError) as e:
+ warnings.warn('Error calculating metric %s for task %d: %s'
+ % (metric_str, task, e))
+ metric_value = np.nan
+ computed_metrics.append(metric_value)
+ return computed_metrics
+
+ def EvalBatch(self, input_batch):
+ """Runs inference on the provided batch of input.
+
+ Args:
+ input_batch: iterator of input with len self.config.batch_size.
+
+ Returns:
+ Tuple of three numpy arrays with shape num_examples x num_tasks (x ...):
+ output: Model predictions.
+ labels: True labels.
+ weights: Example weights.
+ """
+ with self.graph.as_default():
+ return self.ModelOutput(self.BatchInputGenerator(input_batch))
+
+ def ModelOutput(self, input_generator):
+ """Return model output for the provided input.
+
+ Restore(checkpoint) must have previously been called on this object.
+
+ Args:
+ input_generator: Generator that returns a feed_dict for feeding
+ Placeholders in the model graph.
+
+ Returns:
+ Tuple of three numpy arrays with shape num_examples x num_tasks (x ...):
+ output: Model outputs.
+ labels: True labels.
+ weights: Example weights.
+ Note that the output and labels arrays may be more than 2D, e.g. for
+ classifier models that return class probabilities.
+
+ Raises:
+ AssertionError: If model is not in evaluation mode.
+ ValueError: If output and labels are not both 3D or both 2D.
+ """
+ assert not model_ops.IsTraining()
+ assert self._restored_model
+ self.RequireAttributes(['output', 'labels', 'weights'])
+
+ # run eval data through the model
+ num_tasks = self.num_tasks
+ output, labels, weights = [], [], []
+ start = time.time()
+ with self._SharedSession().as_default():
+ batches_per_summary = 1000
+ seconds_per_summary = 0
+ batch_count = -1.0
+ for feed_dict in input_generator:
+ batch_start = time.time()
+ batch_count += 1
+ data = self._SharedSession().run(
+ self.output + self.labels + self.weights,
+ feed_dict=feed_dict)
+ batch_output = np.asarray(data[:num_tasks], dtype=float)
+ batch_labels = np.asarray(data[num_tasks:num_tasks * 2], dtype=float)
+ batch_weights = np.asarray(data[num_tasks * 2:num_tasks * 3],
+ dtype=float)
+ # reshape to batch_size x num_tasks x ...
+ if batch_output.ndim == 3 and batch_labels.ndim == 3:
+ batch_output = batch_output.transpose((1, 0, 2))
+ batch_labels = batch_labels.transpose((1, 0, 2))
+ elif batch_output.ndim == 2 and batch_labels.ndim == 2:
+ batch_output = batch_output.transpose((1, 0))
+ batch_labels = batch_labels.transpose((1, 0))
+ else:
+ raise ValueError(
+ 'Unrecognized rank combination for output and labels: %s %s' %
+ (batch_output.shape, batch_labels.shape))
+ batch_weights = batch_weights.transpose((1, 0))
+ valid = feed_dict[self.valid.name]
+ # only take valid outputs
+ if np.count_nonzero(~valid):
+ batch_output = batch_output[valid]
+ batch_labels = batch_labels[valid]
+ batch_weights = batch_weights[valid]
+ output.append(batch_output)
+ labels.append(batch_labels)
+ weights.append(batch_weights)
+
+ # Writes summary for tracking eval progress.
+ seconds_per_summary += (time.time() - batch_start)
+ self.RequireAttributes(['summary_writer'])
+ if batch_count % batches_per_summary == 0:
+ mean_seconds_per_batch = seconds_per_summary / batches_per_summary
+ seconds_per_summary = 0
+ summaries = [
+ tf.scalar_summary('secs/batch', mean_seconds_per_batch),
+ tf.scalar_summary('batches_evaluated', batch_count)
+ ]
+ self.summary_writer.add_summary(tf.merge_summary(summaries).eval(),
+ global_step=self.global_step_number)
+ self.summary_writer.flush()
+
+ logging.info('Eval took %g seconds', time.time() - start)
+
+ output = np.concatenate(output)
+ labels = np.concatenate(labels)
+ weights = np.concatenate(weights)
+
+ return output, labels, weights
+
+ def ReportEval(self, metrics, global_step, counts=None, name=None):
+ """Write Eval summaries.
+
+ Args:
+ metrics: Dict mapping metric names to numpy arrays containing metric
+ values for each task.
+ global_step: Integer. Global step number inference was run on.
+ counts: Dict mapping class names to counts.
+ name: String name for this group of metrics. Useful for organizing
+ metrics calculated using different subsets of the data.
+ """
+ # create a DataFrame to hold results
+ data = dict()
+ if counts is not None:
+ data.update({'count_%s' % group: values
+ for group, values in counts.iteritems()})
+ data.update(metrics)
+ df = pd.DataFrame(data)
+ print 'Eval at step: %d' % global_step
+ print df
+ # add global step to df
+ df['step'] = global_step
+
+ # save an update to disk
+ filename = os.path.join(self.logdir, 'eval-%d.pkl' % global_step)
+    with open(filename, 'wb') as f:
+ pickle.dump(df, f, pickle.HIGHEST_PROTOCOL)
+
+ # write a summary for each metric
+ self.RequireAttributes(['summary_writer'])
+ with tf.Session(self.master):
+ summaries = []
+ prefix = '' if name is None else '%s - ' % name
+ for metric_name, results in metrics.iteritems():
+ for task in xrange(self.num_tasks):
+ task_str = str(task).zfill(len(str(self.num_tasks)))
+ summaries.append(
+ tf.scalar_summary('%s%s_%s' % (prefix, metric_name, task_str),
+ results[task]))
+ summaries.append(tf.scalar_summary(
+ '%sMean %s' % (prefix, metric_name), np.mean(results)))
+ summaries.append(tf.scalar_summary(
+ '%sMedian %s' % (prefix, metric_name), np.median(results)))
+ self.summary_writer.add_summary(tf.merge_summary(summaries).eval(),
+ global_step=global_step)
+ self.summary_writer.flush()
+
+ def RequireAttributes(self, attrs):
+ """Require class attributes to be defined.
+
+ Args:
+ attrs: A list of attribute names that must be defined.
+
+ Raises:
+ AssertionError: if a required attribute is not defined.
+ """
+ for attr in attrs:
+ if getattr(self, attr, None) is None:
+ raise AssertionError(
+ 'self.%s must be defined by a concrete subclass' % attr)
+
+ def AddSummaries(self):
+ """Add summaries for model parameters."""
+ for var in tf.trainable_variables():
+ if 'BatchNormalize' in var.name:
+ continue
+ tf.histogram_summary(var.name, var)
+
+
+class Classifier(Model):
+ """Classification model.
+
+ Subclasses must set the following attributes:
+ output: Logits op(s) used for computing classification loss and predicted
+ class probabilities for each task.
+
+ Class attributes:
+ default_metrics: List of metrics to compute by default.
+ """
+
+ default_metrics = ['auc']
+
+ def Cost(self, logits, labels, weights):
+ """Calculate single-task training cost for a batch of examples.
+
+ Args:
+ logits: Tensor with shape batch_size x num_classes containing logits.
+ labels: Tensor with shape batch_size x num_classes containing true labels
+ in a one-hot encoding.
+ weights: Tensor with shape batch_size containing example weights.
+
+ Returns:
+ A tensor with shape batch_size containing the weighted cost for each
+ example.
+ """
+ return tf.mul(tf.nn.softmax_cross_entropy_with_logits(logits, labels),
+ weights)
+
+ def TrainingCost(self):
+ """Calculate additional classifier-specific costs.
+
+ Returns:
+ A list of tensors with shape batch_size containing costs for each task.
+ """
+ weighted_costs = super(Classifier, self).TrainingCost() # calculate loss
+ epsilon = 1e-3 # small float to avoid dividing by zero
+ config = self.config
+ num_tasks = config.num_classification_tasks
+ cond_costs = collections.defaultdict(list)
+
+ with self._SharedNameScope('costs'):
+ for task in xrange(num_tasks):
+ task_str = str(task).zfill(len(str(num_tasks)))
+ with self._SharedNameScope('cost_{}'.format(task_str)):
+ with tf.name_scope('conditional'):
+ # pos/neg costs: mean over pos/neg examples
+ for name, label in [('neg', 0), ('pos', 1)]:
+ cond_weights = self.labels[task][:config.batch_size, label]
+ cond_cost = tf.div(
+ tf.reduce_sum(tf.mul(weighted_costs[task], cond_weights)),
+ tf.reduce_sum(cond_weights) + epsilon)
+ tf.scalar_summary('%s_%s' % (name, task_str),
+ model_ops.MovingAverage(cond_cost,
+ self.global_step))
+ cond_costs[name].append(cond_cost)
+
+ # aggregated costs
+ with self._SharedNameScope('aggregated'):
+ with tf.name_scope('pos_cost'):
+ pos_cost = tf.add_n(cond_costs['pos'])
+ with tf.name_scope('neg_cost'):
+ neg_cost = tf.add_n(cond_costs['neg'])
+
+ # (smoothed) summaries
+ tf.scalar_summary('Total Neg Cost',
+ model_ops.MovingAverage(neg_cost, self.global_step))
+ tf.scalar_summary('Total Pos Cost',
+ model_ops.MovingAverage(pos_cost, self.global_step))
+
+ # keep track of the number of positive examples seen by each task
+ with tf.name_scope('counts'):
+ for task in xrange(num_tasks):
+ num_pos = tf.Variable(0.0, name='num_pos_%d' % task, trainable=False)
+ # the assignment must occur on the same device as the variable
+ with tf.device(num_pos.device):
+ tf.get_default_graph().add_to_collection(
+ 'updates', num_pos.assign_add(
+ tf.reduce_sum(self.labels[task][:config.batch_size, 1])))
+ tf.scalar_summary(num_pos.name, num_pos)
+
+ return weighted_costs
+
+ def AddOutputOps(self):
+ """Replace logits with softmax outputs."""
+ softmax = []
+ with tf.name_scope('inference'):
+ for i, logits in enumerate(self.output):
+ softmax.append(tf.nn.softmax(logits, name='softmax_%d' % i))
+ self.output = softmax
+
+ def ExampleCounts(self, y_true):
+ """Get counts of examples in each class.
+
+ Args:
+ y_true: List of numpy arrays containing true values, one for each task.
+
+ Returns:
+ A dict mapping class names to counts.
+ """
+ classes = np.unique(np.concatenate(y_true))
+ counts = {klass: np.zeros(self.num_tasks, dtype=int)
+ for klass in classes}
+ for task in xrange(self.num_tasks):
+ for klass in classes:
+ counts[klass][task] = np.count_nonzero(y_true[task] == klass)
+ return counts
+
+ def LabelPlaceholders(self):
+ """Add Placeholders for labels for each task.
+
+ This method creates the following Placeholders for each task:
+ labels_%d: Label tensor with shape batch_size x num_classes.
+
+ Placeholders are wrapped in identity ops to avoid the error caused by
+ feeding and fetching the same tensor.
+ """
+ config = self.config
+ batch_size = config.batch_size
+ num_classes = config.num_classes
+ labels = []
+ for task in xrange(self.num_tasks):
+ with tf.name_scope(self.placeholder_scope):
+ labels.append(tf.identity(
+ tf.placeholder(tf.float32, shape=[batch_size, num_classes],
+ name='labels_%d' % task)))
+ self.labels = labels
+
+ def ParseModelOutput(self, output, labels, weights):
+ """Parse model output to get true and predicted values for each task.
+
+ Args:
+ output: Numpy array containing model output with shape
+ batch_size x num_tasks x num_classes.
+ labels: Numpy array containing one-hot example labels with shape
+ batch_size x num_tasks x num_classes.
+ weights: Numpy array containing example weights with shape
+ batch_size x num_tasks.
+
+ Returns:
+ y_true: List of numpy arrays containing true labels, one for each task.
+ y_pred: List of numpy arrays containing predicted labels, one for each
+ task.
+ """
+ y_true, y_pred = [], []
+ for task in xrange(self.config.num_classification_tasks):
+ # mask examples with zero weight
+ mask = weights[:, task] > 0
+ # get true class labels
+ y_true.append(labels[mask, task, 1])
+ # get positive class probabilities for predictions
+ y_pred.append(output[mask, task, 1])
+ return y_true, y_pred
+
+
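+# Numpy sketch (illustrative, not called anywhere) of the masking performed
+# by Classifier.ParseModelOutput: zero-weight examples are dropped before
+# metrics are computed. Shapes: batch_size=2, num_tasks=1, num_classes=2.
+def _ExampleParseModelOutputMasking():
+  """Keeps only the positive-weight example, as in model_test.py."""
+  output = np.asarray([[[0.1, 0.9]], [[0.2, 0.8]]])
+  labels = np.asarray([[[0, 1]], [[1, 0]]])
+  weights = np.asarray([[0], [1]])
+  mask = weights[:, 0] > 0  # second example only
+  return labels[mask, 0, 1], output[mask, 0, 1]  # ([0.], [0.8])
+
+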
+class Regressor(Model):
+ """Regression model.
+
+ Subclasses must set the following attributes:
+ output: Op(s) used for computing regression loss and predicted regression
+ outputs for each task.
+
+ Class attributes:
+ default_metrics: List of metrics to compute by default.
+ """
+
+ default_metrics = ['r2']
+
+ def Cost(self, output, labels, weights):
+ """Calculate single-task training cost for a batch of examples.
+
+ Args:
+ output: Tensor with shape batch_size containing predicted values.
+ labels: Tensor with shape batch_size containing true values.
+ weights: Tensor with shape batch_size containing example weights.
+
+ Returns:
+ A tensor with shape batch_size containing the weighted cost for each
+ example.
+ """
+    # tf.nn.l2_loss reduces to a scalar, so compute the per-example squared
+    # error directly to match the documented batch_size shape.
+    return tf.mul(tf.square(output - labels) / 2., weights)
+
+ def ExampleCounts(self, y_true):
+ """Get counts of examples in each class.
+
+ Args:
+ y_true: List of numpy arrays containing true values, one for each task.
+
+ Returns:
+ A dict mapping class names to counts.
+ """
+ return {'all': np.asarray([len(y_true[task])
+ for task in xrange(self.num_tasks)])}
+
+ def LabelPlaceholders(self):
+ """Add Placeholders for labels for each task.
+
+ This method creates the following Placeholders for each task:
+ labels_%d: Label tensor with shape batch_size.
+
+ Placeholders are wrapped in identity ops to avoid the error caused by
+ feeding and fetching the same tensor.
+ """
+ batch_size = self.config.batch_size
+ labels = []
+ for task in xrange(self.num_tasks):
+ with tf.name_scope(self.placeholder_scope):
+ labels.append(tf.identity(
+ tf.placeholder(tf.float32, shape=[batch_size],
+ name='labels_%d' % task)))
+ self.labels = labels
+
+ def ParseModelOutput(self, output, labels, weights):
+ """Parse model output to get true and predicted values for each task.
+
+ Args:
+      output: Numpy array containing model output with shape
+        batch_size x num_tasks.
+      labels: Numpy array containing true values with shape
+        batch_size x num_tasks.
+      weights: Numpy array containing example weights with shape
+        batch_size x num_tasks.
+
+    Returns:
+      y_true: List of numpy arrays containing true values, one for each task.
+      y_pred: List of numpy arrays containing predicted values, one for each
+        task.
+ """
+ # build arrays of true and predicted values for R-squared calculation
+ y_true, y_pred = [], []
+ for task in xrange(self.config.num_regression_tasks):
+ mask = weights[:, task] > 0 # ignore examples with zero weight
+ y_true.append(labels[mask, task])
+ y_pred.append(output[mask, task])
+ return y_true, y_pred
+
+
+def Optimizer(config):
+ """Create model optimizer.
+
+ Args:
+ config: ModelConfig.
+
+ Returns:
+ A training Optimizer.
+
+ Raises:
+ NotImplementedError: If an unsupported optimizer is requested.
+ """
+ # TODO(user): gradient clipping (see Minimize)
+ if config.optimizer == 'adagrad':
+ train_op = tf.train.AdagradOptimizer(config.learning_rate)
+ elif config.optimizer == 'adam':
+ train_op = tf.train.AdamOptimizer(config.learning_rate)
+ elif config.optimizer == 'momentum':
+ train_op = tf.train.MomentumOptimizer(config.learning_rate, config.memory)
+ elif config.optimizer == 'rmsprop':
+ train_op = tf.train.RMSPropOptimizer(config.learning_rate, config.memory)
+ elif config.optimizer == 'sgd':
+ train_op = tf.train.GradientDescentOptimizer(config.learning_rate)
+ else:
+ raise NotImplementedError('Unsupported optimizer %s' % config.optimizer)
+ return train_op
+
+
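+# A minimal usage sketch for Optimizer(); not called anywhere. Assumes a
+# ModelConfig with 'optimizer' and 'learning_rate' set, plus hypothetical
+# `loss` and `global_step` tensors built elsewhere.
+def _ExampleOptimizerUsage(config, loss, global_step):
+  """Builds a train op from the config-selected optimizer."""
+  optimizer = Optimizer(config)
+  # minimize() computes and applies gradients in a single op.
+  return optimizer.minimize(loss, global_step=global_step)
+
+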
+def WeightDecay(config):
+ """Add weight decay.
+
+ Args:
+ config: ModelConfig.
+
+ Returns:
+ A scalar tensor containing the weight decay cost.
+
+ Raises:
+ NotImplementedError: If an unsupported penalty type is requested.
+ """
+ variables = []
+ # exclude bias variables
+ for v in tf.trainable_variables():
+ if v.get_shape().ndims == 2:
+ variables.append(v)
+
+ with tf.name_scope('weight_decay'):
+ if config.penalty_type == 'l1':
+      cost = tf.add_n([tf.reduce_sum(tf.abs(v)) for v in variables])
+ elif config.penalty_type == 'l2':
+ cost = tf.add_n([tf.nn.l2_loss(v) for v in variables])
+ else:
+ raise NotImplementedError('Unsupported penalty_type %s' %
+ config.penalty_type)
+ cost *= config.penalty
+ tf.scalar_summary('Weight Decay Cost', cost)
+ return cost
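+
+
+# A minimal sketch (not called anywhere) of folding WeightDecay() into a
+# total training cost; `batch_cost` is a hypothetical scalar tensor.
+def _ExampleWeightDecayUsage(config, batch_cost):
+  """Adds the weight decay penalty when a penalty is configured."""
+  if config.GetOptionalParam('penalty', 0):
+    batch_cost += WeightDecay(config)
+  return batch_cost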
diff --git a/biology/model_config.proto b/biology/model_config.proto
new file mode 100644
index 0000000..23e30fa
--- /dev/null
+++ b/biology/model_config.proto
@@ -0,0 +1,44 @@
+// Copyright 2015 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+syntax = "proto2";
+
+package biology;
+
+// Neural network model configuration, used mostly for
+// de/serializing model parameters for a given execution from/to disk.
+message ModelConfig {
+ message Parameter {
+ optional string name = 1;
+ optional string description = 10;
+
+ // See oneof user guide:
+ // http://sites/protocol-buffers/user-docs/miscellaneous-howtos/oneof
+ oneof value {
+ float float_value = 2;
+ int32 int_value = 3;
+ string string_value = 4;
+ bool bool_value = 5;
+ }
+
+ repeated float float_list = 6;
+ repeated int32 int_list = 7;
+ repeated string string_list = 8;
+ repeated bool bool_list = 9;
+ };
+
+ repeated Parameter parameter = 1;
+ optional string description = 2;
+}
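+
+// Example text-format instance (see model_config_test.py for a fuller one):
+//
+// parameter {
+//   name: "pi"
+//   float_value: 3.14159
+// }
+// parameter {
+//   name: "sizes"
+//   int_list: 2000
+//   int_list: 100
+// }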
diff --git a/biology/model_config.py b/biology/model_config.py
new file mode 100644
index 0000000..dbe88ee
--- /dev/null
+++ b/biology/model_config.py
@@ -0,0 +1,217 @@
+#!/usr/bin/python
+#
+# Copyright 2015 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Wrapper of key-value pairs, which can be de/serialized from/to disk.
+"""
+import re
+
+from google.protobuf import text_format
+from tensorflow.python.platform import gfile
+
+from biology import model_config_pb2
+
+
+class ModelConfig(object):
+ """Wrapper of key-value pairs which can be de/serialized from/to disk.
+
+ A given key-value pair cannot be removed once added.
+  This wrapper is mostly meant for reading a config from disk or a python
+  dict once; values are subsequently read through the object's attributes.
+
+ De/Serialization is done through a protocol buffer with a text format,
+ so files on disk are human readable and editable. See the unittest
+ for an example of the protocol buffer text format.
+ """
+
+ _supported_types = [bool, int, float, str, unicode, list]
+ _supported_overwrite_modes = ['forbidden', 'required', 'allowed']
+
+ def __init__(self, defaults=None):
+ """Creates a config object.
+
+ Args:
+ defaults: An optional dictionary with string keys and
+        possibly heterogeneously typed values;
+ see class attribute _supported_types for supported types.
+ The newly constructed object will gain attributes matching
+ the dict's keys and values.
+ """
+ self._config_dict = {}
+ if defaults:
+ for key, value in defaults.iteritems():
+ self.AddParam(key, value, overwrite='forbidden')
+
+ def _ValidateParam(self, key, value, overwrite):
+ """Checks param has a valid type, name, and enforces duplicate key handling.
+
+ Args:
+ key: str or unicode. Must be an allowable python attribute name,
+ (specifically, must match r'^[a-zA-Z][a-zA-Z_0-9]+$')
+ value: bool, int, float, str, unicode or homogeneous list thereof.
+ The value to be stored.
+ overwrite: String, how to handle duplicate keys.
+ 'forbidden': raise ValueError if key is already present.
+ 'required': raise ValueError if key is *not* already present.
+ 'allowed': key will be added or updated silently.
+
+ Raises:
+ ValueError: if parameters are not valid types,
+ or if the key is not an allowable python attribute name,
+ or if duplicate key validation failed.
+ """
+ if overwrite not in self._supported_overwrite_modes:
+ raise ValueError(
+ 'overwrite mode "{}" not allowed, must be one of {}'.format(
+ overwrite, ','.join(self._supported_overwrite_modes)))
+
+ if type(key) not in [str, unicode]:
+      raise ValueError('Key must be a string, but is: {}'.format(type(key)))
+
+ if re.match(r'^[a-zA-Z][a-zA-Z_0-9]+$', key) is None:
+ raise ValueError('Key is a bad attribute name: {}'.format(key))
+
+ if key in self._config_dict:
+ if overwrite == 'forbidden':
+ raise ValueError('Not allowed to specify same key twice: {}'.format(
+ key))
+ if (not isinstance(value, type(self._config_dict[key])) and
+ {str, unicode} != {type(value), type(self._config_dict[key])}):
+ raise ValueError(
+ 'Not allowed to change value type ({} -> {}) for a key: {}'.format(
+ type(self._config_dict[key]), type(value), key))
+ else:
+ if overwrite == 'required':
+ raise ValueError('Must specify default for {}'.format(key))
+
+ if type(value) not in self._supported_types:
+ raise ValueError(
+ 'Only {} values allowed: {}'.format(
+ ','.join([str(t) for t in self._supported_types]),
+ type(value)))
+
+ if type(value) is list:
+ if not value:
+ raise ValueError('Only non-empty lists supported: {}'.format(key))
+ type_set = {type(v) for v in value}
+ if len(type_set) > 1:
+        raise ValueError('Only homogeneous lists supported, found: {}={}'.format(
+ key, ','.join(str(t) for t in type_set)))
+
+ def AddParam(self, key, value, overwrite):
+ """Adds one key-value pair to the dict being stored.
+
+ Args:
+ key: str or unicode. Must be an allowable python attribute name,
+ (specifically, must match r'^[a-zA-Z][a-zA-Z_0-9]+$')
+ value: bool, int, float, str, unicode or homogeneous list thereof.
+ The value to be stored.
+ overwrite: String, how to handle duplicate keys.
+ See _ValidateParam for allowed values and descriptions.
+
+ Raises:
+ ValueError: see _ValidateParam for raising conditions.
+ """
+ self._ValidateParam(key, value, overwrite)
+ self._config_dict[key] = value
+ setattr(self, key, value)
+
+ def GetOptionalParam(self, key, default_value):
+ """Returns the param value or the default_value if not present.
+
+    Typically you should read the object attribute for the key directly,
+    but this method is convenient when the key may be absent.
+
+ Args:
+ key: String of the parameter name.
+ default_value: Value to return if key is not present in this config.
+ May be int, float or string.
+
+ Returns:
+ Value of the parameter named by key or default_value if key isn't present.
+ """
+ return getattr(self, key, default_value)
+
+ def WriteToFile(self, filename):
+ """Writes this ModelConfig object to disk.
+
+ Args:
+ filename: Path to write config to on disk.
+
+ Raises:
+ IOError: in case of error while writing.
+ ValueError: in case of unsupported key or value type.
+ """
+ config_proto = model_config_pb2.ModelConfig()
+ for key, value in sorted(self._config_dict.iteritems()):
+ proto_param = config_proto.parameter.add()
+ proto_param.name = key
+ if type(value) is int:
+ proto_param.int_value = value
+ elif type(value) is float:
+ proto_param.float_value = value
+ elif type(value) in [str, unicode]:
+ proto_param.string_value = value
+ elif type(value) is bool:
+ proto_param.bool_value = value
+ elif type(value) is list:
+ list_type = type(value[0])
+ if list_type is int:
+ proto_param.int_list.extend(value)
+ elif list_type is float:
+ proto_param.float_list.extend(value)
+ elif list_type in [str, unicode]:
+ proto_param.string_list.extend(value)
+ elif list_type is bool:
+ proto_param.bool_list.extend(value)
+ else:
+ raise ValueError('Unsupported list type: {}'.format(list_type))
+ else:
+ raise ValueError('Unsupported value type: {}'.format(type(value)))
+
+ with open(filename, mode='w') as config_file:
+ config_file.write(text_format.MessageToString(config_proto))
+
+ def ReadFromFile(self, filename, overwrite='required'):
+ """Reads into this ModelConfig object from disk.
+
+ Args:
+ filename: Path to serialized config file.
+ overwrite: String, how to handle duplicate keys.
+ See _ValidateParam for allowed values and descriptions.
+
+ Raises:
+ IOError: in case of error while reading.
+ ValueError: if no value is set in a parameter.
+ """
+ config_proto = model_config_pb2.ModelConfig()
+ with open(filename) as config_file:
+ text_format.Merge(config_file.read(), config_proto)
+
+ for p in config_proto.parameter:
+ value_name = p.WhichOneof('value')
+ if value_name:
+ value = getattr(p, value_name)
+ elif p.int_list:
+ value = list(p.int_list)
+ elif p.float_list:
+ value = list(p.float_list)
+ elif p.string_list:
+ value = list(p.string_list)
+ elif p.bool_list:
+ value = list(p.bool_list)
+ else:
+ raise ValueError('No value set for key: {}'.format(p.name))
+ self.AddParam(p.name, value, overwrite)
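+
+
+def _ExampleRoundTrip(filename):
+  """Illustrative sketch (not part of the API): write and re-read a config.
+
+  Assumes `filename` is writable; mirrors the pattern in model_config_test.py.
+  """
+  config = ModelConfig({'learning_rate': 0.001, 'optimizer': 'adam'})
+  config.WriteToFile(filename)
+  restored = ModelConfig({'learning_rate': 0.1, 'optimizer': 'sgd'})
+  restored.ReadFromFile(filename, overwrite='required')
+  return restored.learning_rate  # the value read back from disk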
diff --git a/biology/model_config_test.py b/biology/model_config_test.py
new file mode 100644
index 0000000..25f4dad
--- /dev/null
+++ b/biology/model_config_test.py
@@ -0,0 +1,194 @@
+#!/usr/bin/python
+#
+# Copyright 2015 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+import tempfile
+
+from tensorflow.python.platform import flags
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import googletest
+
+from biology import model_config
+
+EXAMPLE_DICT = {
+ 'hello': 'world',
+ 'pi': 3.14159,
+ 'Forty_Two': 42,
+ 'great': True,
+ 'spells': ['alohamora', 'expelliarmus'],
+ 'scores': [9.8, 10.0],
+ 'sizes': [2000, 100],
+ 'waver': [True, False, True],
+ }
+
+EXAMPLE_DEFAULTS = {
+ 'hello': 'there',
+ 'pi': 3.14,
+ 'Forty_Two': 24,
+ 'great': False,
+ 'spells': ['abracadabra', 'cruciatus'],
+ 'scores': [1.8, 1.0],
+ 'sizes': [1200, 10],
+ 'waver': [False, True, False],
+}
+
+EXAMPLE_FILE_CONTENTS = """parameter {
+ name: "Forty_Two"
+ int_value: 42
+}
+parameter {
+ name: "great"
+ bool_value: true
+}
+parameter {
+ name: "hello"
+ string_value: "world"
+}
+parameter {
+ name: "pi"
+ float_value: 3.14159
+}
+parameter {
+ name: "scores"
+ float_list: 9.8
+ float_list: 10.0
+}
+parameter {
+ name: "sizes"
+ int_list: 2000
+ int_list: 100
+}
+parameter {
+ name: "spells"
+ string_list: "alohamora"
+ string_list: "expelliarmus"
+}
+parameter {
+ name: "waver"
+ bool_list: true
+ bool_list: false
+ bool_list: true
+}
+"""
+
+
+class ModelConfigTest(googletest.TestCase):
+
+ def setUp(self):
+ super(ModelConfigTest, self).setUp()
+ self.root = tempfile.mkdtemp(dir=flags.FLAGS.test_tmpdir)
+
+ def _assertMatchesExample(self, config):
+ self.assertEqual(config.hello, 'world')
+ self.assertEqual(config.pi, 3.14159)
+ self.assertEqual(config.Forty_Two, 42)
+ self.assertTrue(config.great)
+ self.assertEqual(config.scores, [9.8, 10.0])
+ self.assertEqual(config.sizes, [2000, 100])
+ self.assertEqual(config.spells, ['alohamora', 'expelliarmus'])
+ self.assertEqual(config.waver, [True, False, True])
+
+ def testCreatesAttributes(self):
+ config = model_config.ModelConfig(EXAMPLE_DICT)
+ self._assertMatchesExample(config)
+
+ def testGetOptionalParam(self):
+ config = model_config.ModelConfig(EXAMPLE_DICT)
+ self.assertEqual('world', config.GetOptionalParam('hello', 'everybody'))
+ self.assertEqual('default', config.GetOptionalParam('otherkey', 'default'))
+
+ def testOnlyValidAttributeNamesAllowed(self):
+ config = model_config.ModelConfig()
+ with self.assertRaises(ValueError):
+ config.AddParam('spaces not allowed',
+ 'blah',
+ overwrite='forbidden')
+
+ with self.assertRaises(ValueError):
+ config.AddParam('42_must_start_with_letter',
+ 'blah',
+ overwrite='forbidden')
+
+ with self.assertRaises(ValueError):
+ config.AddParam('hyphens-not-allowed',
+ 'blah',
+ overwrite='forbidden')
+
+ with self.assertRaises(ValueError):
+ config.AddParam('',
+ 'empty string no good',
+ overwrite='forbidden')
+
+ def testDuplicateKeysNotAllowed(self):
+ config = model_config.ModelConfig(EXAMPLE_DICT)
+ with self.assertRaises(ValueError):
+ config.AddParam('hello',
+ 'everybody',
+ overwrite='forbidden')
+
+ def testRequireDefault(self):
+ config = model_config.ModelConfig(EXAMPLE_DICT)
+ config.AddParam('hello',
+ 'everybody',
+ overwrite='required')
+ with self.assertRaises(ValueError):
+ config.AddParam('not',
+ 'present',
+ overwrite='required')
+
+ def testSilentOverwrite(self):
+ config = model_config.ModelConfig(EXAMPLE_DICT)
+ config.AddParam('not', 'present', overwrite='allowed')
+ config.AddParam('not', 'anymore', overwrite='allowed')
+
+ def testHeterogeneousList(self):
+ config = model_config.ModelConfig()
+ with self.assertRaises(ValueError):
+ config.AddParam('different',
+ ['types for', 'different', 0xF, 0x0, 'lks'],
+ overwrite='forbidden')
+
+ def testWritesFile(self):
+ config = model_config.ModelConfig(EXAMPLE_DICT)
+ filename = os.path.join(self.root, 'config.pbtxt')
+ config.WriteToFile(filename)
+
+ with open(filename) as pbtxt_file:
+ self.assertEqual(EXAMPLE_FILE_CONTENTS, pbtxt_file.read())
+
+ def testReadsFile_NoDuplicates(self):
+ filename = os.path.join(self.root, 'config.pbtxt')
+ with open(filename, 'w') as pbtxt_file:
+ pbtxt_file.write(EXAMPLE_FILE_CONTENTS)
+
+ config = model_config.ModelConfig()
+ config.ReadFromFile(filename, overwrite='forbidden')
+ self._assertMatchesExample(config)
+
+ def testReadsFile_RequireDefaults(self):
+ filename = os.path.join(self.root, 'config.pbtxt')
+ with open(filename, 'w') as pbtxt_file:
+ pbtxt_file.write(EXAMPLE_FILE_CONTENTS)
+
+ self.assertEqual(set(EXAMPLE_DEFAULTS.keys()), set(EXAMPLE_DICT.keys()))
+ config = model_config.ModelConfig(EXAMPLE_DEFAULTS)
+ config.ReadFromFile(filename, overwrite='required')
+ self._assertMatchesExample(config)
+
+
+if __name__ == '__main__':
+ googletest.main()
diff --git a/biology/model_ops.py b/biology/model_ops.py
new file mode 100644
index 0000000..36ff42a
--- /dev/null
+++ b/biology/model_ops.py
@@ -0,0 +1,360 @@
+#!/usr/bin/python
+#
+# Copyright 2015 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Ops for graph construction."""
+
+
+import tensorflow as tf
+
+from google.protobuf import text_format
+
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import logging
+
+from biology import utils as model_utils
+
+
+def AddBias(tensor, init=None, name=None):
+ """Add a bias term to a tensor.
+
+ Args:
+ tensor: Variable tensor.
+ init: Bias initializer. Defaults to zero.
+ name: Name for this op. Defaults to tensor.op.name.
+
+ Returns:
+ A biased tensor with the same shape as the input tensor.
+ """
+ if init is None:
+ init = tf.zeros([tensor.get_shape()[-1].value])
+ with tf.op_scope([tensor], name, tensor.op.name):
+ b = tf.Variable(init, name='b')
+ return tf.nn.bias_add(tensor, b)
+
+
+def BatchNormalize(tensor, convolution, mask=None, epsilon=0.001,
+ scale_after_normalization=True, decay=0.999,
+ global_step=None, name=None):
+ """Batch normalization.
+
+ Normalize, scale, and shift the input tensor to reduce covariate shift.
+
+ NOTE(user): For inference, the mean and variance must be set to fixed
+ values derived from the entire training set. This is accomplished by using
+ moving_mean and moving_variance during evaluation. Be sure that models run
+ the ops in updates during training or the moving averages will not be very
+ useful!
+
+ Args:
+ tensor: Input tensor (must be 4D).
+ convolution: If True, perform normalization across rows and columns as
+ well as over batch.
+ mask: Mask to apply to tensor.
+ epsilon: Small float to avoid dividing by zero.
+ scale_after_normalization: If True, multiply by gamma. If False, gamma is
+ not used. When the next layer is linear (also e.g. ReLU), this can be
+ disabled since the scaling can be done by the next layer.
+ decay: Float value for moving average decay.
+ global_step: Tensor containing global step for accelerating moving averages
+ at the beginning of training.
+ name: Name for this op. Defaults to 'batch_norm'.
+
+ Returns:
+ A new tensor corresponding to the batch normalized input.
+
+ Raises:
+ ValueError: If the input tensor is not 4D.
+ """
+ if len(tensor.get_shape()) != 4:
+ raise ValueError('Input tensor must be 4D, not %dD'
+ % len(tensor.get_shape()))
+ if convolution:
+ axes = [0, 1, 2]
+ shape = tensor.get_shape()[3:]
+ else:
+ axes = [0]
+ shape = tensor.get_shape()[1:]
+ with tf.op_scope([tensor], None, 'BatchNormalize'):
+ if mask is not None:
+ mean, variance = model_utils.Moment(
+ 2, tensor, reduction_indices=axes, mask=mask)
+ else:
+ mean, variance = tf.nn.moments(tensor, axes)
+
+ # Keep track of moving averages for mean and variance. During eval, use the
+ # moving averages from training.
+ mean_moving_average = MovingAverage(mean, global_step, decay)
+ variance_moving_average = MovingAverage(variance, global_step, decay)
+ if not IsTraining():
+ mean = mean_moving_average
+ variance = variance_moving_average
+
+ beta = tf.Variable(tf.zeros(shape), name='beta')
+ gamma = tf.Variable(tf.constant(1.0, shape=shape), name='gamma')
+ if convolution:
+ batch_norm = tf.nn.batch_norm_with_global_normalization(
+ tensor, mean, variance, beta, gamma, epsilon,
+ scale_after_normalization)
+ else:
+ batch_norm = (tensor - mean) * tf.rsqrt(variance + epsilon)
+ if scale_after_normalization:
+ batch_norm *= gamma
+ batch_norm += beta
+ if mask is not None:
+ batch_norm = model_utils.Mask(batch_norm, mask)
+ return batch_norm
+
+
+def MovingAverage(tensor, global_step, decay=0.999):
+ """Create a variable that contains the moving average of a tensor.
+
+ Adds a tf.identity and special namescope to ensure the tensor
+ is colocated with its Variable on the parameter server.
+ See http://g/tensorflow-users/PAAXYLlybNs/xA0z-x1qEwAJ
+ and replicated_model.py#NameScopeDevicePicker for context.
+
+ Args:
+ tensor: Tensor to calculate moving average of.
+ global_step: Variable containing the number of global steps.
+ decay: Float for exponential decay of moving average.
+
+ Returns:
+ A tf.Variable containing the moving average of the input tensor.
+ """
+ exponential_moving_average = tf.train.ExponentialMovingAverage(
+ decay=decay, num_updates=global_step)
+
+ update_op = exponential_moving_average.apply([tensor])
+ tf.get_default_graph().add_to_collection('updates', update_op)
+ return exponential_moving_average.average(tensor)
+
+
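+# Minimal usage sketch (not called anywhere): the tensor returned by
+# MovingAverage() only reflects new values after the ops collected under
+# 'updates' have been run. `sess`, `tensor`, and `global_step` are
+# hypothetical.
+def _ExampleMovingAverageUsage(sess, tensor, global_step):
+  """Runs one decay update, then reads the smoothed value."""
+  average = MovingAverage(tensor, global_step)
+  sess.run(tf.initialize_all_variables())
+  updates = tf.group(*tf.get_default_graph().get_collection('updates'))
+  sess.run(updates)  # apply the exponential decay update
+  return sess.run(average)
+
+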
+def Dropout(tensor, dropout_prob, training_only=True):
+ """Random dropout.
+
+ This implementation supports "always-on" dropout (training_only=False), which
+ can be used to calculate model uncertainty. See Gal and Ghahramani,
+ http://arxiv.org/abs/1506.02142.
+
+ NOTE(user): To simplify the implementation, I have chosen not to reverse
+ the scaling that occurs in tf.nn.dropout when using dropout during
+ inference. This shouldn't be an issue since the activations will be scaled
+ by the same constant in both training and inference. This means that there
+ are no training-time differences between networks that use dropout during
+ inference and those that do not.
+
+ Args:
+ tensor: Input tensor.
+ dropout_prob: Float giving dropout probability for weights (NOT keep
+ probability).
+ training_only: Boolean. If True (standard dropout), apply dropout only
+ during training. If False, apply dropout during inference as well.
+
+ Returns:
+ A tensor with the same shape as the input tensor.
+ """
+ if not dropout_prob:
+ return tensor # do nothing
+ keep_prob = 1.0 - dropout_prob
+ if IsTraining() or not training_only:
+ tensor = tf.nn.dropout(tensor, keep_prob)
+ return tensor
+
+
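+# Sketch of the "always-on" dropout described above (Gal and Ghahramani):
+# with training_only=False the dropout mask is resampled on every sess.run,
+# so repeated runs give a distribution of outputs whose spread estimates
+# model uncertainty. Assumes SetTraining() was already called on the graph;
+# `sess` and `logits` are hypothetical.
+def _ExampleMonteCarloDropout(sess, logits, num_samples=20):
+  """Samples num_samples stochastic forward passes."""
+  noisy = Dropout(logits, dropout_prob=0.5, training_only=False)
+  return [sess.run(noisy) for _ in range(num_samples)]
+
+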
+def FullyConnectedLayer(tensor, size, weight_init=None, bias_init=None,
+ name=None):
+ """Fully connected layer.
+
+ Args:
+ tensor: Input tensor.
+ size: Number of nodes in this layer.
+ weight_init: Weight initializer.
+ bias_init: Bias initializer.
+ name: Name for this op. Defaults to 'fully_connected'.
+
+ Returns:
+ A new tensor representing the output of the fully connected layer.
+
+ Raises:
+ ValueError: If input tensor is not 2D.
+ """
+ if len(tensor.get_shape()) != 2:
+ raise ValueError('Dense layer input must be 2D, not %dD'
+ % len(tensor.get_shape()))
+ if weight_init is None:
+ num_features = tensor.get_shape()[-1].value
+ weight_init = tf.truncated_normal([num_features, size], stddev=0.01)
+ if bias_init is None:
+ bias_init = tf.zeros([size])
+
+ with tf.op_scope([tensor], name, 'fully_connected'):
+ w = tf.Variable(weight_init, name='w')
+ b = tf.Variable(bias_init, name='b')
+ return tf.nn.xw_plus_b(tensor, w, b)
+
+
+def IsTraining():
+ """Determine whether the default graph is in training mode.
+
+ Returns:
+ A boolean value indicating whether the default graph is in training mode.
+
+ Raises:
+ ValueError: If the 'train' collection in the default graph does not contain
+ exactly one element.
+ """
+ train = tf.get_collection('train')
+ if not train:
+ raise ValueError('Training mode is not set. Please call SetTraining.')
+ elif len(train) > 1:
+ raise ValueError('Training mode has more than one setting.')
+ return train[0]
+
+
+def SetTraining(train):
+ """Set the training mode of the default graph.
+
+ This operation may only be called once for a given graph.
+
+ Args:
+ train: If True, graph is in training mode.
+
+ Raises:
+ AssertionError: If the default graph already has this value set.
+ """
+ if tf.get_collection('train'):
+ raise AssertionError('Training mode already set: %s' %
+ tf.get_collection('train'))
+ tf.add_to_collection('train', train)
+
+
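+# Usage sketch for the training-mode collection: SetTraining() is called
+# once per graph, after which ops like Dropout and BatchNormalize branch on
+# IsTraining(). Illustrative only.
+def _ExampleTrainingModeUsage(train):
+  """Builds a fresh graph whose training mode is set exactly once."""
+  with tf.Graph().as_default():
+    SetTraining(train)
+    return IsTraining()  # == train
+
+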
+def MultitaskLogits(features, num_tasks, num_classes=2, weight_init=None,
+ bias_init=None, dropout=None, name=None):
+ """Create a logit tensor for each classification task.
+
+ Args:
+ features: A 2D tensor with dimensions batch_size x num_features.
+ num_tasks: Number of classification tasks.
+ num_classes: Number of classes for each task.
+ weight_init: Weight initializer.
+ bias_init: Bias initializer.
+ dropout: Float giving dropout probability for weights (NOT keep
+ probability).
+ name: Name for this op. Defaults to 'multitask_logits'.
+
+ Returns:
+ A list of logit tensors; one for each classification task.
+ """
+ logits = []
+ with tf.name_scope('multitask_logits'):
+ for task_idx in range(num_tasks):
+ with tf.op_scope([features], name,
+ ('task' + str(task_idx).zfill(len(str(num_tasks))))):
+ logits.append(
+ Logits(features, num_classes, weight_init=weight_init,
+ bias_init=bias_init, dropout=dropout))
+ return logits
+
+
+def Logits(features, num_classes=2, weight_init=None, bias_init=None,
+ dropout=None, name=None):
+ """Create a logits tensor for a single classification task.
+
+  You almost certainly don't want dropout on the logits -- it's like randomly
+  setting the (unscaled) probability of a target class to 0.5.
+
+ Args:
+ features: A 2D tensor with dimensions batch_size x num_features.
+ num_classes: Number of classes for each task.
+ weight_init: Weight initializer.
+ bias_init: Bias initializer.
+ dropout: Float giving dropout probability for weights (NOT keep
+ probability).
+ name: Name for this op.
+
+ Returns:
+ A logits tensor with shape batch_size x num_classes.
+ """
+ with tf.op_scope([features], name, 'logits') as name:
+ return Dropout(
+ FullyConnectedLayer(features, num_classes, weight_init=weight_init,
+ bias_init=bias_init, name=name),
+ dropout)
+
+
+def SoftmaxN(tensor, name=None):
+ """Apply softmax across last dimension of a tensor.
+
+ Args:
+ tensor: Input tensor.
+ name: Name for this op. If None, defaults to 'SoftmaxN'.
+
+ Returns:
+ A tensor with softmax-normalized values on the last dimension.
+ """
+ with tf.op_scope([tensor], name, 'SoftmaxN'):
+ exp_tensor = tf.exp(tensor)
+ reduction_indices = [tensor.get_shape().ndims - 1]
+ return tf.div(exp_tensor,
+ tf.reduce_sum(exp_tensor,
+ reduction_indices=reduction_indices,
+ keep_dims=True))
+
+
+def Transform(tensor, transform, convolution=True, mask=None):
+ """Apply a transform to a tensor.
+
+ Args:
+ tensor: Input tensor.
+ transform: String description of transform. Supported values are 'bias'
+ and 'batch_norm'.
+ convolution: If True, assume tensor is the output of a convolution.
+ mask: Mask to apply to tensor.
+
+ Returns:
+ A tensor with the same shape as the input tensor.
+
+ Raises:
+ ValueError: If the input tensor is not 3D or 4D.
+ """
+ if len(tensor.get_shape()) not in [2, 3, 4]:
+ raise ValueError('Input tensor must be 2D, 3D or 4D, not %dD.'
+ % len(tensor.get_shape()))
+ with tensor.graph.as_default():
+ if transform == 'batch_norm':
+ # batch normalization requires 4D input
+ if len(tensor.get_shape()) != 4:
+ # 3D case: add one extra dimension
+ if len(tensor.get_shape()) == 3:
+ squeeze = [2]
+ tensor = tf.expand_dims(tensor, 2)
+ if mask is not None:
+ mask = tf.expand_dims(mask, -1)
+ # 2D case: add two extra dimensions
+ else:
+ squeeze = [1, 2]
+ tensor = tf.expand_dims(tf.expand_dims(tensor, -2), -2)
+ if mask is not None:
+ mask = tf.expand_dims(tf.expand_dims(mask, -1), -1)
+ tensor = BatchNormalize(tensor, convolution=convolution, mask=mask)
+ tensor = tf.squeeze(tensor, squeeze)
+ else:
+ tensor = BatchNormalize(tensor, convolution=convolution, mask=mask)
+ elif transform == 'bias':
+ tensor = AddBias(tensor, init=tf.constant(
+ 1.0, shape=[tensor.get_shape()[-1].value]))
+ if mask is not None:
+ tensor = model_utils.Mask(tensor, mask)
+ return tensor
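+
+
+# Minimal sketch (not called anywhere): batch-normalizing a 2D
+# batch_size x num_features tensor. Transform() pads to 4D internally, so
+# callers can stay in 2D. Assumes SetTraining() was already called.
+def _ExampleTransformUsage(features_2d):
+  """Applies the 'batch_norm' transform to a hypothetical 2D tensor."""
+  return Transform(features_2d, 'batch_norm', convolution=False)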
diff --git a/biology/model_ops_test.py b/biology/model_ops_test.py
new file mode 100644
index 0000000..c53a3e0
--- /dev/null
+++ b/biology/model_ops_test.py
@@ -0,0 +1,243 @@
+#!/usr/bin/python
+#
+# Copyright 2015 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+
+
+import numpy as np
+import tensorflow as tf
+
+from google.protobuf import text_format
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import flags
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import googletest
+from tensorflow.python.training import checkpoint_state_pb2 as cspb
+
+from biology import model_ops
+
+FLAGS = flags.FLAGS
+FLAGS.test_random_seed = 20151102
+
+
+class ModelOpsTest(test_util.TensorFlowTestCase):
+
+ def setUp(self):
+ super(ModelOpsTest, self).setUp()
+ self.root = '/tmp'
+
+ def testAddBias(self):
+ with self.test_session() as sess:
+ w_t = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], shape=[2, 3])
+ w_biased_t = model_ops.AddBias(w_t, init=tf.constant(5.0, shape=[3]))
+ sess.run(tf.initialize_all_variables())
+ w, w_biased, bias = sess.run([w_t, w_biased_t] + tf.trainable_variables())
+ self.assertAllEqual(w, [[1.0, 2.0, 3.0],
+ [4.0, 5.0, 6.0]])
+ self.assertAllEqual(w_biased, [[6.0, 7.0, 8.0],
+ [9.0, 10.0, 11.0]])
+ self.assertAllEqual(bias, [5.0, 5.0, 5.0])
+
+ def testFullyConnectedLayer(self):
+ with self.test_session() as sess:
+ features = np.random.random((128, 100))
+ features_t = tf.constant(features, dtype=tf.float32)
+ dense_t = model_ops.FullyConnectedLayer(features_t, 50)
+ sess.run(tf.initialize_all_variables())
+ features, dense, w, b = sess.run(
+ [features_t, dense_t] + tf.trainable_variables())
+ expected = np.dot(features, w) + b
+ self.assertAllClose(dense, expected)
+
+ def testMultitaskLogits(self):
+ with self.test_session() as sess:
+ num_tasks = 3
+ np.random.seed(FLAGS.test_random_seed)
+ features = np.random.random((5, 100))
+ logits_t = model_ops.MultitaskLogits(
+ tf.constant(features,
+ dtype=tf.float32),
+ num_tasks)
+ sess.run(tf.initialize_all_variables())
+ output = sess.run(tf.trainable_variables() + logits_t)
+ w = output[0:-3:2]
+ b = output[1:-3:2]
+ logits = output[-3:]
+ for i in range(num_tasks):
+ expected = np.dot(features, w[i]) + b[i]
+ self.assertAllClose(logits[i], expected, rtol=1e-5, atol=1e-5)
+
+ def GetModel(self, train=True):
+ model_ops.SetTraining(train)
+
+ # dummy variable for testing Restore
+ tf.Variable(tf.constant(10.0, shape=[1]), name='v0')
+
+ def _CheckBatchNormalization(self, features, convolution, mean, variance,
+ mask=None):
+ model_ops.SetTraining(True)
+ epsilon = 0.001
+ with self.test_session() as sess:
+ features_t = tf.constant(features, dtype=tf.float32)
+ batch_norm_t = model_ops.BatchNormalize(
+ features_t, convolution=convolution, epsilon=epsilon, mask=mask)
+ sess.run(tf.initialize_all_variables())
+ batch_norm, beta, gamma = sess.run(
+ [batch_norm_t] + tf.trainable_variables())
+ expected = gamma * (features - mean) / np.sqrt(variance + epsilon) + beta
+ self.assertAllClose(batch_norm, np.ma.filled(expected, 0),
+ rtol=1e-5, atol=1e-5)
+
+ def CheckBatchNormalization(self, features, convolution):
+ if convolution:
+ axis = (0, 1, 2)
+ else:
+ axis = 0
+ mean = features.mean(axis=axis)
+ variance = features.var(axis=axis)
+ self._CheckBatchNormalization(features, convolution, mean, variance)
+
+ def CheckBatchNormalizationWithMask(self, features, convolution, mask):
+ # convert features to a masked array
+ # masked array must be created with a mask of the same shape as features
+ expanded_mask = np.logical_not(
+ np.ones_like(features) * np.expand_dims(mask, -1))
+ features = np.ma.array(features, mask=expanded_mask)
+ if convolution:
+ axis = (0, 1, 2)
+ # masked arrays don't support mean/variance with tuple for axis
+ count = np.logical_not(features.mask).sum(axis=axis)
+ mean = features.sum(axis=axis) / count
+ variance = np.square(features - mean).sum(axis=axis) / count
+ else:
+ axis = 0
+ mean = features.mean(axis=axis)
+ variance = features.var(axis=axis)
+ mask_t = tf.constant(mask, dtype=tf.float32)
+ self._CheckBatchNormalization(features, convolution, mean, variance,
+ mask=mask_t)
+
+ def testBatchNormalization(self):
+ # no convolution: norm over batch (first axis)
+ self.CheckBatchNormalization(
+ features=np.random.random((2, 3, 2, 3)), convolution=False)
+
+ def testBatchNormalizationWithConv(self):
+ # convolution: norm over first three axes
+ self.CheckBatchNormalization(
+ features=np.random.random((2, 3, 2, 3)), convolution=True)
+
+ def testBatchNormalizationInference(self):
+ # create a simple batch-normalized model
+ model_ops.SetTraining(True)
+ epsilon = 0.001
+ decay = 0.95
+ checkpoint = os.path.join(self.root, 'my-checkpoint')
+ with self.test_session() as sess:
+ features = np.random.random((2, 3, 2, 3))
+ features_t = tf.constant(features, dtype=tf.float32)
+ # create variables for beta, gamma, and moving mean and variance
+ model_ops.BatchNormalize(
+ features_t, convolution=False, epsilon=epsilon, decay=decay)
+ sess.run(tf.initialize_all_variables())
+ updates = tf.group(*tf.get_default_graph().get_collection('updates'))
+ sess.run(updates) # update moving mean and variance
+ expected_mean, expected_variance, _, _ = tf.all_variables()
+ expected_mean = expected_mean.eval()
+ expected_variance = expected_variance.eval()
+
+ # save a checkpoint
+ saver = tf.train.Saver()
+ saver.save(sess, checkpoint)
+
+ super(ModelOpsTest, self).setUp() # reset the default graph
+
+ # check that the moving mean and variance are used for evaluation
+ # get a new set of features to verify that the correct mean and var are used
+ model_ops.SetTraining(False)
+ with self.test_session() as sess:
+ new_features = np.random.random((2, 3, 2, 3))
+ new_features_t = tf.constant(new_features, dtype=tf.float32)
+ batch_norm_t = model_ops.BatchNormalize(
+ new_features_t, convolution=False, epsilon=epsilon, decay=decay)
+ saver = tf.train.Saver()
+ saver.restore(sess, checkpoint)
+ batch_norm, mean, variance, beta, gamma = sess.run(
+ [batch_norm_t] + tf.all_variables())
+ self.assertAllClose(mean, expected_mean)
+ self.assertAllClose(variance, expected_variance)
+ expected = (gamma * (new_features - mean) /
+ np.sqrt(variance + epsilon) + beta)
+ self.assertAllClose(batch_norm, expected)
+
+ def testBatchNormalizationWithMask(self):
+ features = np.random.random((2, 3, 2, 3))
+ mask = np.asarray(
+ [[[1, 0],
+ [1, 1],
+ [1, 0]],
+ [[0, 1],
+ [0, 0],
+ [0, 1]]],
+ dtype=float)
+ self.CheckBatchNormalizationWithMask(
+ features=features, convolution=False, mask=mask)
+
+ def testBatchNormalizationWithMaskAndConv(self):
+ features = np.random.random((2, 3, 2, 3))
+ mask = np.asarray(
+ [[[1, 0],
+ [1, 1],
+ [1, 0]],
+ [[0, 1],
+ [0, 0],
+ [0, 1]]],
+ dtype=float)
+ self.CheckBatchNormalizationWithMask(
+ features=features, convolution=True, mask=mask)
+
+ def testSoftmaxN(self):
+ features = np.asarray([[[1, 1],
+ [0.1, 0.3]],
+ [[0, 1],
+ [2, 2]]],
+ dtype=float)
+ expected = np.asarray([[[0.5, 0.5],
+ [0.45, 0.55]],
+ [[0.27, 0.73],
+ [0.5, 0.5]]],
+ dtype=float)
+ with self.test_session() as sess:
+ computed = sess.run(
+ model_ops.SoftmaxN(tf.constant(features,
+ dtype=tf.float32)))
+ self.assertAllClose(np.around(computed, 2), expected)
+
+ def testSoftmaxNWithNumpy(self):
+ features = np.random.random((2, 3, 4))
+ expected = np.exp(features) / np.exp(features).sum(axis=-1, keepdims=True)
+ with self.test_session() as sess:
+ computed = sess.run(
+ model_ops.SoftmaxN(tf.constant(features,
+ dtype=tf.float32)))
+ self.assertAllClose(computed, expected)
+
+
+if __name__ == '__main__':
+ googletest.main()
diff --git a/biology/model_test.py b/biology/model_test.py
new file mode 100644
index 0000000..586e41a
--- /dev/null
+++ b/biology/model_test.py
@@ -0,0 +1,52 @@
+#!/usr/bin/python
+#
+# Copyright 2015 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for model."""
+
+
+import numpy as np
+
+from tensorflow.python.platform import googletest
+
+from biology import model
+from biology import model_config
+
+
+class ClassifierTest(googletest.TestCase):
+
+ def setUp(self):
+ self.config = model_config.ModelConfig({
+ 'batch_size': 2,
+ 'num_classification_tasks': 1,
+ })
+ self.model = model.Classifier(self.config, train=True,
+ logdir='/tmp/classifier_test')
+
+ def testParseModelOutput(self):
+ # standard 2-class output; some weights are zero
+ output = np.asarray([[[0.1, 0.9]],
+ [[0.2, 0.8]]], dtype=float)
+ labels = np.asarray([[[0, 1]],
+ [[1, 0]]], dtype=float)
+ weights = np.asarray([[0],
+ [1]], dtype=float)
+ expected_y_true = [[0]]
+ expected_y_pred = [[0.8]]
+ y_true, y_pred = self.model.ParseModelOutput(output, labels, weights)
+ self.assertListEqual(y_true, expected_y_true)
+ self.assertListEqual(y_pred, expected_y_pred)
+
+if __name__ == '__main__':
+ googletest.main()
diff --git a/biology/utils.py b/biology/utils.py
new file mode 100644
index 0000000..1805522
--- /dev/null
+++ b/biology/utils.py
@@ -0,0 +1,216 @@
+#!/usr/bin/python
+#
+# Copyright 2015 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utils for graph convolution models."""
+
+
+import numpy as np
+import tensorflow as tf
+
+from google.protobuf import text_format
+
+from tensorflow.python.platform import gfile
+from tensorflow.python.training import checkpoint_state_pb2
+
+
+def ParseCheckpoint(checkpoint):
+ """Parse a checkpoint file.
+
+ Args:
+ checkpoint: Path to checkpoint. The checkpoint is either a serialized
+ CheckpointState proto or an actual checkpoint file.
+
+ Returns:
+ The path to an actual checkpoint file.
+ """
+ with open(checkpoint) as f:
+ try:
+ cp = checkpoint_state_pb2.CheckpointState()
+ text_format.Merge(f.read(), cp)
+ return cp.model_checkpoint_path
+ except text_format.ParseError:
+ return checkpoint
+
+
+def Mask(t, mask):
+ """Apply a mask to a tensor.
+
+ If not None, mask should be a t.shape[:-1] tensor of 0,1 values.
+
+ Args:
+ t: Input tensor.
+ mask: Boolean mask with shape == t.shape[:-1]. If None, nothing happens.
+
+ Returns:
+ A tensor with the same shape as the input tensor.
+
+ Raises:
+ ValueError: If shapes do not match.
+ """
+ if mask is None:
+ return t
+ if not t.get_shape()[:-1].is_compatible_with(mask.get_shape()):
+ raise ValueError('Shapes do not match: %s vs. %s' % (t.get_shape(),
+ mask.get_shape()))
+ return tf.mul(t, tf.expand_dims(mask, -1))
+
+
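+# Small sketch of Mask() semantics: the 0/1 mask is broadcast across the
+# last dimension, zeroing whole feature vectors. Values are illustrative.
+def _ExampleMaskUsage():
+  """Masks out the second of two feature vectors."""
+  t = tf.constant([[1.0, 2.0], [3.0, 4.0]])  # shape [2, 2]
+  mask = tf.constant([1.0, 0.0])  # shape [2] == t.shape[:-1]
+  return Mask(t, mask)  # evaluates to [[1.0, 2.0], [0.0, 0.0]]
+
+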
+def Mean(tensor, reduction_indices=None, mask=None):
+ """Compute mean using Sum and Mul for better GPU performance.
+
+ See tf.nn.moments for additional notes on this approach.
+
+ Args:
+ tensor: Input tensor.
+ reduction_indices: Axes to reduce across. If None, reduce to a scalar.
+ mask: Mask to apply to tensor.
+
+ Returns:
+ A tensor with the same type as the input tensor.
+ """
+ return Moment(1, tensor, standardize=False,
+ reduction_indices=reduction_indices, mask=mask)[0]
+
+
+def Variance(tensor, reduction_indices=None, mask=None):
+ """Compute variance.
+
+ Args:
+ tensor: Input tensor.
+ reduction_indices: Axes to reduce across. If None, reduce to a scalar.
+ mask: Mask to apply to tensor.
+
+ Returns:
+ A tensor with the same type as the input tensor.
+ """
+ return Moment(2, tensor, standardize=False,
+ reduction_indices=reduction_indices, mask=mask)[1]
+
+
+def Skewness(tensor, reduction_indices=None):
+ """Compute skewness, the third standardized moment.
+
+ Args:
+ tensor: Input tensor.
+ reduction_indices: Axes to reduce across. If None, reduce to a scalar.
+
+ Returns:
+ A tensor with the same type as the input tensor.
+ """
+ return Moment(3, tensor, standardize=True,
+ reduction_indices=reduction_indices)[1]
+
+
+def Kurtosis(tensor, reduction_indices=None):
+ """Compute kurtosis, the fourth standardized moment minus three.
+
+ Args:
+ tensor: Input tensor.
+ reduction_indices: Axes to reduce across. If None, reduce to a scalar.
+
+ Returns:
+ A tensor with the same type as the input tensor.
+ """
+ return Moment(4, tensor, standardize=True,
+ reduction_indices=reduction_indices)[1] - 3
+
+
+def Moment(k, tensor, standardize=False, reduction_indices=None, mask=None):
+ """Compute the k-th central moment of a tensor, possibly standardized.
+
+ Args:
+ k: Which moment to compute. 1 = mean, 2 = variance, etc.
+ tensor: Input tensor.
+ standardize: If True, returns the standardized moment, i.e. the central
+ moment divided by the n-th power of the standard deviation.
+ reduction_indices: Axes to reduce across. If None, reduce to a scalar.
+ mask: Mask to apply to tensor.
+
+ Returns:
+ The mean and the requested moment.
+ """
+ if reduction_indices is not None:
+ reduction_indices = np.atleast_1d(reduction_indices).tolist()
+
+ # get the divisor
+ if mask is not None:
+ tensor = Mask(tensor, mask)
+ ones = tf.constant(1, dtype=tf.float32, shape=tensor.get_shape())
+ divisor = tf.reduce_sum(Mask(ones, mask),
+ reduction_indices=reduction_indices,
+ keep_dims=True)
+ elif reduction_indices is None:
+ divisor = tf.constant(np.prod(tensor.get_shape().as_list()), tensor.dtype)
+ else:
+ divisor = 1.0
+ for i in range(len(tensor.get_shape())):
+ if i in reduction_indices:
+ divisor *= tensor.get_shape()[i].value
+ divisor = tf.constant(divisor, tensor.dtype)
+
+ # compute the requested central moment
+ # note that mean is a raw moment, not a central moment
+ mean = tf.div(
+ tf.reduce_sum(tensor,
+ reduction_indices=reduction_indices,
+ keep_dims=True),
+ divisor)
+ delta = tensor - mean
+ if mask is not None:
+ delta = Mask(delta, mask)
+ moment = tf.div(
+      tf.reduce_sum(tf.pow(delta, k),
+ reduction_indices=reduction_indices,
+ keep_dims=True),
+ divisor)
+ moment = tf.squeeze(moment, reduction_indices)
+ if standardize:
+ moment = tf.mul(
+ moment,
+        tf.pow(
+ tf.rsqrt(Moment(2,
+ tensor,
+ reduction_indices=reduction_indices)[1]),
+ k))
+
+ return tf.squeeze(mean, reduction_indices), moment
+
+
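+# Numpy cross-check sketch for Moment(), mirroring utils_test.py: the
+# second central moment of a vector is its variance. `sess` is hypothetical.
+def _ExampleMomentCheck(sess):
+  """Compares Moment(2, x) against the numpy variance of the same data."""
+  x = np.array([1.0, 2.0, 3.0, 4.0])
+  _, variance = Moment(2, tf.constant(x, dtype=tf.float32))
+  return sess.run(variance), x.var()  # both equal 1.25
+
+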
+def StringToOp(string):
+ """Get a TensorFlow op from a string.
+
+ Args:
+ string: String description of an op, such as 'sum' or 'mean'.
+
+ Returns:
+ A TensorFlow op.
+
+ Raises:
+ NotImplementedError: If string does not match a supported operation.
+ """
+ # TODO(user): median is not implemented yet in TensorFlow
+ op_map = {
+ 'max': tf.reduce_max,
+ 'mean': Mean,
+ 'min': tf.reduce_min,
+ 'sum': tf.reduce_sum,
+ 'variance': Variance,
+ 'skewness': Skewness,
+ 'kurtosis': Kurtosis,
+ }
+ try:
+ return op_map[string]
+ except KeyError:
+ raise NotImplementedError('Unrecognized op: %s' % string)
diff --git a/biology/utils_test.py b/biology/utils_test.py
new file mode 100644
index 0000000..b3657f0
--- /dev/null
+++ b/biology/utils_test.py
@@ -0,0 +1,218 @@
+#!/usr/bin/python
+#
+# Copyright 2015 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tempfile
+
+
+import numpy as np
+import scipy.stats
+import tensorflow as tf
+
+from google.protobuf import text_format
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import flags
+from tensorflow.python.platform import googletest
+from tensorflow.python.training import checkpoint_state_pb2
+
+from biology import utils
+
+FLAGS = flags.FLAGS
+FLAGS.test_random_seed = 20151102
+
+
+class UtilsTest(test_util.TensorFlowTestCase):
+
+ def setUp(self):
+ super(UtilsTest, self).setUp()
+ np.random.seed(FLAGS.test_random_seed)
+
+ def testParseCheckpoint(self):
+ # parse CheckpointState proto
+ with tempfile.NamedTemporaryFile() as f:
+ cp = checkpoint_state_pb2.CheckpointState()
+ cp.model_checkpoint_path = 'my-checkpoint'
+ f.write(text_format.MessageToString(cp))
+ f.file.flush()
+ self.assertEqual(utils.ParseCheckpoint(f.name), 'my-checkpoint')
+ # parse path to actual checkpoint
+ with tempfile.NamedTemporaryFile() as f:
+ f.write('This is not a CheckpointState proto.')
+ f.file.flush()
+ self.assertEqual(utils.ParseCheckpoint(f.name), f.name)
+
+ def PrepareFeatures(self, features):
+ features = np.asarray(features, dtype=float)
+ features_t = tf.constant(features, dtype=tf.float32)
+ return features, features_t
+
+ def PrepareMask(self, features, mask):
+ mask = np.asarray(mask, dtype=float)
+ mask_t = tf.constant(mask, dtype=tf.float32)
+ # the provided mask has to be the same shape as features
+ expanded_mask = np.logical_not(
+ np.ones_like(features) * np.expand_dims(mask, -1))
+ masked_features = np.ma.masked_array(features, mask=expanded_mask)
+ return masked_features, mask_t
+
+ def Check(self, func, features, expected, axis=None, mask=None):
+ with self.test_session() as sess:
+ features, features_t = self.PrepareFeatures(features)
+ if mask is not None:
+ features, mask = self.PrepareMask(features, mask)
+ self.assertAllClose(
+ sess.run(func(features_t, reduction_indices=axis, mask=mask)),
+ expected)
+
+ def testMean(self):
+ self.Check(utils.Mean,
+ features=[0, 1],
+ expected=0.5)
+ self.Check(utils.Mean,
+ features=[[0, 1],
+ [2, 3]],
+ expected=[0.5, 2.5],
+ axis=1)
+ self.Check(utils.Mean,
+ features=[[[0, 1],
+ [2, 3]],
+ [[4, 5],
+ [6, 7]]],
+ expected=[2.5, 4.5],
+ axis=[0, 2])
+
+ def testMeanWithMask(self):
+ self.Check(utils.Mean,
+ features=[[9999],
+ [1],
+ [2]],
+ expected=1.5,
+ mask=[0, 1, 1])
+ self.Check(utils.Mean,
+ features=[[0, 1],
+ [9999, 9999]],
+ expected=[0, 1],
+ axis=0,
+ mask=[1, 0])
+ self.Check(utils.Mean,
+ features=[[[0, 1],
+ [9999, 9999]],
+ [[9999, 9999],
+ [6, 7]]],
+ expected=[0.5, 6.5],
+ axis=[0, 2],
+ mask=[[1, 0],
+ [0, 1]])
+
+ def testVariance(self):
+ self.Check(utils.Variance,
+ features=[0, 1],
+ expected=0.25)
+ self.Check(utils.Variance,
+ features=[[0, 2],
+ [2, 3]],
+ expected=[1, 0.25],
+ axis=1)
+ self.Check(utils.Variance,
+ features=[[[0, 1],
+ [2, 3]],
+ [[4, 5],
+ [6, 7]]],
+ expected=[4.25, 4.25],
+ axis=[0, 2])
+
+ def testVarianceWithMask(self):
+ self.Check(utils.Variance,
+ features=[[0],
+ [1],
+ [2]],
+ expected=0.25,
+ mask=[0, 1, 1])
+ self.Check(utils.Variance,
+ features=[[0, 2],
+ [9999, 9999],
+ [4, 4]],
+ expected=[4, 1],
+ axis=0,
+ mask=[1, 0, 1])
+ self.Check(utils.Variance,
+ features=[[[0, 1],
+ [9999, 9999]],
+ [[9999, 9999],
+ [6, 8]]],
+ expected=[0.25, 1],
+ axis=[0, 2],
+ mask=[[1, 0],
+ [0, 1]])
+
+ def testMoment(self):
+ with self.test_session() as sess:
+ features = np.random.random((3, 4, 5))
+ features_t = tf.constant(features, dtype=tf.float32)
+
+ # test k = 1..4
+ for k in [1, 2, 3, 4]:
+ # central moments
+ self.assertAllClose(
+ sess.run(utils.Moment(k, features_t)[1]),
+ scipy.stats.moment(features, k, axis=None),
+ rtol=1e-5, atol=1e-5)
+
+ # standardized moments
+ self.assertAllClose(
+ sess.run(utils.Moment(k, features_t, standardize=True)[1]),
+ np.divide(scipy.stats.moment(features, k, axis=None),
+ np.power(features.std(), k)),
+ rtol=1e-5, atol=1e-5)
+
+ # central across one axis
+ self.assertAllClose(
+ sess.run(utils.Moment(k, features_t, reduction_indices=1)[1]),
+ scipy.stats.moment(features, k, axis=1),
+ rtol=1e-5, atol=1e-5)
+
+ # standardized across one axis
+ self.assertAllClose(
+ sess.run(utils.Moment(k, features_t, standardize=True,
+ reduction_indices=1)[1]),
+ np.divide(scipy.stats.moment(features, k, axis=1),
+ np.power(features.std(axis=1), k)),
+ rtol=1e-5, atol=1e-5)
+
+ def testSkewness(self):
+ with self.test_session() as sess:
+ features = np.random.random((3, 4, 5))
+ features_t = tf.constant(features, dtype=tf.float32)
+ self.assertAllClose(sess.run(utils.Skewness(features_t)),
+ scipy.stats.skew(features, axis=None),
+ rtol=1e-5, atol=1e-5)
+ self.assertAllClose(sess.run(utils.Skewness(features_t, 1)),
+ scipy.stats.skew(features, axis=1),
+ rtol=1e-5, atol=1e-5)
+
+ def testKurtosis(self):
+ with self.test_session() as sess:
+ features = np.random.random((3, 4, 5))
+ features_t = tf.constant(features, dtype=tf.float32)
+ self.assertAllClose(sess.run(utils.Kurtosis(features_t)),
+ scipy.stats.kurtosis(features, axis=None),
+ rtol=1e-5, atol=1e-5)
+ self.assertAllClose(sess.run(utils.Kurtosis(features_t, 1)),
+ scipy.stats.kurtosis(features, axis=1),
+ rtol=1e-5, atol=1e-5)
+
+if __name__ == '__main__':
+ googletest.main()