Initial commit
diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE
@@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
diff --git a/README b/README new file mode 100644 index 0000000..82a44d3 --- /dev/null +++ b/README
@@ -0,0 +1,12 @@ +This small package provides a framework and an implementation of a deep +learning model for doing virtual screening, based on TensorFlow. The model is +described in this paper: + +http://arxiv.org/abs/1502.02072 + +The framework is somewhat more general and can be used for a variety of +TensorFlow models. + +Note that this code is NOT fully functional as is. The input reading and label +operations are specific to Google's implementation and need to be reimplemented +for your environment.
diff --git a/biology/icml/icml_eval.py b/biology/icml/icml_eval.py new file mode 100644 index 0000000..0c76402 --- /dev/null +++ b/biology/icml/icml_eval.py
@@ -0,0 +1,78 @@ +#!/usr/bin/python +# +# Copyright 2015 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Evaluate a model from the ICML-2015 paper. + +This script requires a trained model with its associated config and checkpoint. +If you don't have a trained model, run icml_train.py first. + +""" +# pylint: disable=line-too-long +# pylint: enable=line-too-long + + +from nowhere.research.biology.collaborations.pande.py import utils + +from tensorflow.python.platform import app +from tensorflow.python.platform import flags +from tensorflow.python.platform import gfile + +from biology import model_config +from biology.icml import icml_models + +flags.DEFINE_string('config', None, 'Serialized ModelConfig proto.') +flags.DEFINE_string('checkpoint', None, + 'Model checkpoint file. File can contain either an ' + 'absolute checkpoint (e.g. model.ckpt-{step}) or a ' + 'serialized CheckpointState proto.') +flags.DEFINE_string('input_pattern', None, 'Input file pattern; ' + 'It should include %d for fold index substitution.') +flags.DEFINE_string('master', 'local', 'BNS name of the TensorFlow master.') +flags.DEFINE_string('logdir', None, 'Directory for output files.') +flags.DEFINE_integer('num_folds', 5, 'Number of cross-validation folds.') +flags.DEFINE_integer('fold', None, 'Fold index for this model.') +flags.DEFINE_enum('model_type', 'single', ['single', 'deep', 'deepaux', 'py', + 'pydrop1', 'pydrop2'], + 'Which model from the ICML paper should be trained/evaluated') +FLAGS = flags.FLAGS + + +def main(unused_argv=None): + config = model_config.ModelConfig() + config.ReadFromFile(FLAGS.config, overwrite='allowed') + gfile.MakeDirs(FLAGS.logdir) + model = icml_models.CONSTRUCTORS[FLAGS.model_type](config, + train=False, + logdir=FLAGS.logdir, + master=FLAGS.master) + + if FLAGS.num_folds is not None and FLAGS.fold is not None: + folds = utils.kfold_pattern(FLAGS.input_pattern, FLAGS.num_folds, + FLAGS.fold) + _, test_pattern = folds.next() + test_pattern = ','.join(test_pattern) + else: + test_pattern = FLAGS.input_pattern + + with model.graph.as_default(): + model.Eval(model.ReadInput(test_pattern), FLAGS.checkpoint) + + +if __name__ == '__main__': + flags.MarkFlagAsRequired('config') + flags.MarkFlagAsRequired('checkpoint') + flags.MarkFlagAsRequired('input_pattern') + flags.MarkFlagAsRequired('logdir') + app.run()
diff --git a/biology/icml/icml_models.py b/biology/icml/icml_models.py new file mode 100644 index 0000000..e79a70f --- /dev/null +++ b/biology/icml/icml_models.py
@@ -0,0 +1,309 @@ +#!/usr/bin/python +# +# Copyright 2015 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TensorFlow implementation of the models from the ICML-2015 paper. + + +hyperparam_dict = { + "single": Hyperparams(num_layers=1, + num_hidden=1200, + node_depth=1, + nonlinearity=ACTIVATION_RECTIFIED_LINEAR, + weight_init=GaussianWeightInit(0.01), + bias_init=ConstantBiasInit(0.5), + dropout=1.), + "deep": Hyperparams(num_layers=4, + num_hidden=1000, + node_depth=1, + nonlinearity=ACTIVATION_RECTIFIED_LINEAR, + weight_init=GaussianWeightInit(0.01), + bias_init=ConstantBiasInit(0.5), + dropout=1.), + "deepaux": Hyperparams(num_layers=4, + num_hidden=1000, + auxiliary_softmax_layers=[0, 1, 2], + auxiliary_softmax_weight=0.3, + node_depth=1, + nonlinearity=ACTIVATION_RECTIFIED_LINEAR, + weight_init=GaussianWeightInit(0.01), + bias_init=ConstantBiasInit(0.5), + dropout=1.), + "py": Hyperparams(num_layers=2, + num_hidden=[2000, 100], + node_depth=1, + nonlinearity=ACTIVATION_RECTIFIED_LINEAR, + weight_init=[GaussianWeightInit(0.01), + GaussianWeightInit(0.04)], + bias_init=[ConstantBiasInit(0.5), + ConstantBiasInit(3.0)], + dropout=1.), + "pydrop1": Hyperparams(num_layers=2, + num_hidden=[2000, 100], + node_depth=1, + nonlinearity=ACTIVATION_RECTIFIED_LINEAR, + weight_init=[GaussianWeightInit(0.01), + GaussianWeightInit(0.04)], + bias_init=[ConstantBiasInit(0.5), + ConstantBiasInit(3.0)], + dropout=[0.75, 1.]), + "pydrop2": Hyperparams(num_layers=2, + num_hidden=[2000, 100], + node_depth=1, + nonlinearity=ACTIVATION_RECTIFIED_LINEAR, + weight_init=[GaussianWeightInit(0.01), + GaussianWeightInit(0.04)], + bias_init=[ConstantBiasInit(0.5), + ConstantBiasInit(3.0)], + dropout=[0.75, 0.75])} +""" + +import numpy as np + +import tensorflow.google as tf + +from tensorflow.python.platform import logging + +from biology import model +from biology import model_ops +from nowhere.mustreimplement import input_ops +from nowhere.mustreimplement import label_ops +from nowhere.learning.dist_belief import input_example_pb2 +from nowhere.learning.dist_belief import types_pb2 as legacy_types_pb2 + + +class UnreplicatedIcmlModel(model.Classifier): + """Implements an icml model as configured in a model_config.proto.""" + + def Build(self): + """Constructs the graph architecture as specified in its config. + + This method creates the following Placeholders: + mol_features: Molecule descriptor (e.g. fingerprint) tensor with shape + batch_size x num_features. 
+ """ + with tf.name_scope(self.placeholder_scope): + self.mol_features = tf.placeholder( + tf.float32, + shape=[self.config.batch_size, self.config.num_features], + name='mol_features') + + layer_sizes = self.config.layer_sizes + weight_init_stddevs = self.config.weight_init_stddevs + bias_init_consts = self.config.bias_init_consts + dropouts = self.config.dropouts + lengths_set = { + len(layer_sizes), + len(weight_init_stddevs), + len(bias_init_consts), + len(dropouts), + } + assert len(lengths_set) == 1, 'All layer params must have same length.' + num_layers = lengths_set.pop() + assert num_layers > 0, 'Must have some layers defined.' + + prev_layer = self.mol_features + prev_layer_size = self.config.num_features + for i in xrange(num_layers): + layer = tf.nn.relu(model_ops.FullyConnectedLayer( + tensor=prev_layer, + size=layer_sizes[i], + weight_init=tf.truncated_normal( + shape=[prev_layer_size, layer_sizes[i]], + stddev=weight_init_stddevs[i]), + bias_init=tf.constant(value=bias_init_consts[i], + shape=[layer_sizes[i]]))) + layer = model_ops.Dropout(layer, dropouts[i]) + prev_layer = layer + prev_layer_size = layer_sizes[i] + + self.output = model_ops.MultitaskLogits( + layer, self.config.num_classification_tasks) + + def LabelsAndWeights(self): + """Parse Label protos and create tensors for labels and weights. + + This method creates the following Placeholders in the graph: + labels: Tensor with shape batch_size x num_tasks containing serialized + Label protos. + """ + config = self.config + with tf.name_scope(self.placeholder_scope): + labels = tf.placeholder( + tf.string, + shape=[config.batch_size, config.num_classification_tasks], + name='labels') + self.labels = label_ops.MultitaskLabelClasses(labels, config.num_classes) + self.weights = label_ops.MultitaskLabelWeights(labels) + + def ReadInput(self, input_pattern, input_data_types=None): + """Read input data and return a generator for minibatches. + + Args: + input_pattern: Input file pattern. + input_data_types: List of legacy_types_pb2 constants matching the + number of and data types present in the sstables. If not specified, + defaults to full ICML 259-task types, but can be specified + for unittests or other datasets with consistent types. + + Returns: + A generator that yields a dict for feeding a single batch to Placeholders + in the graph. + + Raises: + AssertionError: If no default session is available. + """ + if model_ops.IsTraining(): + randomize = True + num_iterations = None + else: + randomize = False + num_iterations = 1 + + num_tasks = self.config.num_classification_tasks + tasks_in_input = self.config.tasks_in_input + if input_data_types is None: + input_data_types = ([legacy_types_pb2.DF_FLOAT] + + [legacy_types_pb2.DF_LABEL_PROTO] * tasks_in_input) + features, labels = input_ops.InputExampleInputReader( + input_pattern=input_pattern, + batch_size=self.config.batch_size, + num_tasks=num_tasks, + input_data_types=input_data_types, + num_features=self.config.num_features, + randomize=randomize, + shuffling=randomize, + num_iterations=num_iterations) + + return self._ReadInputGenerator(features, labels[:, :num_tasks]) + + def _GetFeedDict(self, named_values): + feed_dict = {} + for name, value in named_values.iteritems(): + feed_dict['{}/{}:0'.format(self.placeholder_root, name)] = value + + return feed_dict + + def EvalBatch(self, input_batch): + """Runs inference on the provided batch of input. + + Args: + input_batch: iterator of input with len self.config.batch_size. 
+ + Returns: + Tuple of three numpy arrays with shape num_examples x num_tasks (x ...): + output: Model predictions. + labels: True labels. numpy array values are scalars, + not 1-hot classes vector. + weights: Example weights. + """ + output, labels, weights = super(UnreplicatedIcmlModel, self).EvalBatch( + input_batch) + + # Converts labels from 1-hot to float. + labels = labels[:, :, 1] # Whole batch, all tasks, 1-hot positive index. + return output, labels, weights + + def BatchInputGenerator(self, serialized_batch): + """Returns a generator that iterates over the provided batch of input. + + TODO(user): This is similar to input_ops.InputExampleInputReader(), + but doesn't need to be executed as part of the TensorFlow graph. + Consider refactoring so these can share code somehow. + + Args: + serialized_batch: List of tuples: (_, value) where value is + a serialized InputExample proto. Must have self.config.batch_size + length or smaller. If smaller, we'll pad up to batch_size + and mark the padding as invalid so it's ignored in eval metrics. + Yields: + Dict of model inputs for use as a feed_dict. + + Raises: + ValueError: If the batch is larger than the batch_size. + """ + if len(serialized_batch) > self.config.batch_size: + raise ValueError( + 'serialized_batch length {} must be <= batch_size {}'.format( + len(serialized_batch), self.config.batch_size)) + for _ in xrange(self.config.batch_size - len(serialized_batch)): + serialized_batch.append((None, '')) + + features = [] + labels = [] + for _, serialized_proto in serialized_batch: + if serialized_proto: + input_example = input_example_pb2.InputExample() + input_example.ParseFromString(serialized_proto) + features.append([f for f in input_example.endpoint[0].float_value]) + label_protos = [endpoint.label + for endpoint in input_example.endpoint[1:]] + assert len(label_protos) == self.config.num_classification_tasks + labels.append([l.SerializeToString() for l in label_protos]) + else: + # This was a padded value to reach the batch size. + features.append([0.0 for _ in xrange(self.config.num_features)]) + labels.append( + ['' for _ in xrange(self.config.num_classification_tasks)]) + + valid = np.asarray([(np.sum(f) > 0) for f in features]) + + assert len(features) == self.config.batch_size + assert len(labels) == self.config.batch_size + assert len(valid) == self.config.batch_size + yield self._GetFeedDict({ + 'mol_features': features, + 'labels': labels, + 'valid': valid + }) + + def _ReadInputGenerator(self, features_tensor, labels_tensor): + """Generator that constructs feed_dict for minibatches. + + Args: + features_tensor: Tensor of batch_size x molecule features. + labels_tensor: Tensor of batch_size x label protos. + + Yields: + A dict for feeding a single batch to Placeholders in the graph. + + Raises: + AssertionError: If no default session is available. + """ + sess = tf.get_default_session() + if sess is None: + raise AssertionError('No default session') + while True: + try: + logging.vlog(1, 'Starting session execution to get input data') + features, labels = sess.run([features_tensor, labels_tensor]) + logging.vlog(1, 'Done with session execution to get input data') + # TODO(user): check if the below axis=1 needs to change to axis=0, + # because cl/105081140. 
+ valid = np.sum(features, axis=1) > 0 + yield self._GetFeedDict({ + 'mol_features': features, + 'labels': labels, + 'valid': valid + }) + + except tf.OpError as e: + # InputExampleInput op raises OpError when it has hit num_iterations + # or its input file is exhausted. However it may also be raised + # if the input sstable isn't what we expect. + if 'Invalid InputExample' in e.message: + raise e + else: + break +
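The input_ops and label_ops modules imported above come from internal paths that are not part of this commit, so ReadInput cannot run as-is (see the README). As a rough sketch of the feed-dict convention the model does expect — placeholder names, dtypes and shapes are taken from Build, LabelsAndWeights and _GetFeedDict above; the data itself is made up:

import numpy as np


def example_feed_dict(batch_size, num_features, num_tasks,
                      placeholder_root='placeholders'):
  """Hypothetical feed_dict for UnreplicatedIcmlModel (illustration only)."""
  # Placeholders live under the 'placeholders' name scope, so feed keys take
  # the form '<placeholder_root>/<name>:0' (see _GetFeedDict).
  features = np.random.rand(batch_size, num_features).astype(np.float32)
  # Labels are serialized Label protos, one string per task; empty strings
  # here stand in for rows that only pad out the batch.
  labels = [[''] * num_tasks for _ in xrange(batch_size)]
  # Rows whose features sum to zero are treated as padding and masked out.
  valid = features.sum(axis=1) > 0
  return {
      '%s/mol_features:0' % placeholder_root: features,
      '%s/labels:0' % placeholder_root: labels,
      '%s/valid:0' % placeholder_root: valid,
  }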
diff --git a/biology/icml/icml_train.py b/biology/icml/icml_train.py new file mode 100644 index 0000000..ed1ed0c --- /dev/null +++ b/biology/icml/icml_train.py
@@ -0,0 +1,133 @@ +#!/usr/bin/python +# +# Copyright 2015 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Train a model from the ICML-2015 paper. +""" +# pylint: disable=line-too-long +# pylint: enable=line-too-long + +import os + + +from tensorflow.python.platform import app +from tensorflow.python.platform import flags +from tensorflow.python.platform import gfile +from tensorflow.python.platform import logging + +from biology import model_config +from biology.icml import icml_models + +flags.DEFINE_string('config', None, 'Serialized ModelConfig proto.') +flags.DEFINE_string('master', '', 'BNS name of the TensorFlow master.') +flags.DEFINE_string('logdir', None, 'Directory for output files.') +flags.DEFINE_integer('replica_id', 0, 'Task ID of this replica.') +flags.DEFINE_integer('ps_tasks', 0, 'Number of parameter server tasks.') +flags.DEFINE_integer('num_folds', 5, 'Number of cross-validation folds.') +flags.DEFINE_integer('fold', None, 'Fold index for this model.') + +FLAGS = flags.FLAGS + +def kfold_pattern(input_pattern, num_folds, fold=None): + """Generator for train/test filename splits. + + The pattern is not expanded except for the %d being replaced by the fold + index. + + Args: + input_pattern: Input filename pattern. Should contain %d for fold index. + num_folds: Number of folds. + fold: If not None, the generator only yields the train/test split for the + given fold. + + Yields: + train_filenames: A list of file patterns in training set. + test_filenames: A list of file patterns in test set. + """ + # get filenames associated with each fold + fold_filepatterns = [input_pattern % i for i in range(num_folds)] + + # create train/test splits + for i in range(num_folds): + if fold is not None and i != fold: + continue + train = fold_filepatterns[:i] + fold_filepatterns[i+1:] + test = [fold_filepatterns[i]] + if any([f in test for f in train]): + logging.fatal('Train/test split is not complete.') + if set(train + test) != set(fold_filepatterns): + logging.fatal('Not all input files are accounted for.') + yield train, test + + +def Run(input_data_types=None): + """Trains the model with specified parameters. + + Args: + input_data_types: List of legacy_types_pb2 constants or None. + """ + config = model_config.ModelConfig({ + 'input_pattern': '', # Should have %d for fold index substitution.
+ 'num_classification_tasks': 259, + 'tasks_in_input': 259, # Dimensionality of sstables + 'max_steps': 50000000, + 'summaries': False, + 'batch_size': 128, + 'learning_rate': 0.0003, + 'num_classes': 2, + 'optimizer': 'sgd', + 'penalty': 0.0, + 'num_features': 1024, + 'layer_sizes': [1200], + 'weight_init_stddevs': [0.01], + 'bias_init_consts': [0.5], + 'dropouts': [0.0], + }) + config.ReadFromFile(FLAGS.config, + overwrite='required') + + if FLAGS.replica_id == 0: + gfile.MakeDirs(FLAGS.logdir) + config.WriteToFile(os.path.join(FLAGS.logdir, 'config.pbtxt')) + + model = icml_models.IcmlModel(config, + train=True, + logdir=FLAGS.logdir, + master=FLAGS.master) + + if FLAGS.num_folds is not None and FLAGS.fold is not None: + folds = kfold_pattern(config.input_pattern, FLAGS.num_folds, + FLAGS.fold) + train_pattern, _ = folds.next() + train_pattern = ','.join(train_pattern) + else: + train_pattern = config.input_pattern + + with model.graph.as_default(): + model.Train(model.ReadInput(train_pattern, + input_data_types=input_data_types), + max_steps=config.max_steps, + summaries=config.summaries, + replica_id=FLAGS.replica_id, + ps_tasks=FLAGS.ps_tasks) + + +def main(unused_argv=None): + Run() + + +if __name__ == '__main__': + flags.MarkFlagAsRequired('config') + flags.MarkFlagAsRequired('logdir') + flags.MarkFlagAsRequired('fold') + app.run()
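A small worked example of kfold_pattern above (the file patterns are hypothetical; only the %d is substituted, the patterns themselves are never expanded):

# With input_pattern='shard-%d', num_folds=3 and fold=0, the generator yields
# a single (train, test) split:
#   train == ['shard-1', 'shard-2']
#   test  == ['shard-0']
train, test = kfold_pattern('shard-%d', num_folds=3, fold=0).next()
train_pattern = ','.join(train)  # 'shard-1,shard-2', passed to model.ReadInput()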
diff --git a/biology/metrics.py b/biology/metrics.py new file mode 100644 index 0000000..fcc655a --- /dev/null +++ b/biology/metrics.py
@@ -0,0 +1,99 @@ +#!/usr/bin/python +# +# Copyright 2015 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Evaluation metrics.""" + +import collections + + +import numpy as np +from sklearn import metrics + + +def kappa_score(y_true, y_pred): + """Calculate Cohen's kappa for classification tasks. + + See https://en.wikipedia.org/wiki/Cohen%27s_kappa + + Note that this implementation of Cohen's kappa expects binary labels. + + Args: + y_true: Numpy array containing true values. + y_pred: Numpy array containing predicted values. + + Returns: + kappa: Numpy array containing kappa for each classification task. + + Raises: + AssertionError: If y_true and y_pred are not the same size, or if class + labels are not in [0, 1]. + """ + assert len(y_true) == len(y_pred), 'Number of examples does not match.' + yt = np.asarray(y_true, dtype=int) + yp = np.asarray(y_pred, dtype=int) + assert np.array_equal(np.unique(yt), [0, 1]), ( + 'Class labels must be binary: %s' % np.unique(yt)) + observed_agreement = np.true_divide(np.count_nonzero(np.equal(yt, yp)), + len(yt)) + expected_agreement = np.true_divide( + np.count_nonzero(yt == 1) * np.count_nonzero(yp == 1) + + np.count_nonzero(yt == 0) * np.count_nonzero(yp == 0), + len(yt) ** 2) + kappa = np.true_divide(observed_agreement - expected_agreement, + 1.0 - expected_agreement) + return kappa + + +def compute_metric(y_true, y_pred, metric_str, threshold=0.5): + """Compute a metric value. + + Args: + y_true: A list of arrays containing true values for each task. + y_pred: A list of arrays containing predicted values for each task. + metric_str: String description of the metric to compute. Must be in + biology_metrics.METRICS. + threshold: Float threshold to apply to probabilities for positive/negative + class assignment. + + Returns: + Float metric value. + + Raises: + NotImplementedError: If metric_str is not in METRICS. + """ + if metric_str not in METRICS: + raise NotImplementedError('Unsupported metric %s' % metric_str) + metric_tuple = METRICS[metric_str] + if metric_tuple.threshold: + y_pred = np.greater(y_pred, threshold) + return metric_tuple.func(y_true, y_pred) + + +class Metric(collections.namedtuple('MetricTuple', ['func', 'threshold'])): + """A named tuple used to organize model evaluation metrics. + + Args: + func: Function to call. Should take true and predicted values (in that + order) and compute the metric. + threshold: Boolean indicating whether float values should be converted to + binary labels prior to computing the metric, e.g. accuracy. + """ + +METRICS = { + 'accuracy': Metric(metrics.accuracy_score, True), + 'auc': Metric(metrics.roc_auc_score, False), + 'kappa': Metric(kappa_score, True), + 'r2': Metric(metrics.r2_score, False), +}
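A short usage sketch for the metrics above (the numbers are illustrative):

import numpy as np

from biology import metrics

y_true = np.array([1, 0, 1, 0])
y_prob = np.array([0.9, 0.2, 0.6, 0.4])  # predicted positive-class probabilities

auc = metrics.compute_metric(y_true, y_prob, 'auc')       # uses probabilities directly
acc = metrics.compute_metric(y_true, y_prob, 'accuracy')  # thresholded at 0.5 first
kappa = metrics.kappa_score(y_true, np.greater(y_prob, 0.5))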
diff --git a/biology/metrics_test.py b/biology/metrics_test.py new file mode 100644 index 0000000..ea23cdb --- /dev/null +++ b/biology/metrics_test.py
@@ -0,0 +1,39 @@ +#!/usr/bin/python +# +# Copyright 2015 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for metrics.""" + + +import numpy as np + +from tensorflow.python.platform import googletest + +from biology import metrics + + +class MetricsTest(googletest.TestCase): + + def test_kappa_score(self): + y_true = [1, 0, 1, 0] + y_pred = [0.8, 0.2, 0.3, 0.4] # [1, 0, 0, 0] with 0.5 threshold + kappa = metrics.kappa_score(y_true, np.greater(y_pred, 0.5)) + observed_agreement = 3.0 / 4.0 + expected_agreement = ((2 * 1) + (2 * 3)) / 4.0 ** 2 + expected_kappa = np.true_divide(observed_agreement - expected_agreement, + 1.0 - expected_agreement) + self.assertAlmostEquals(kappa, expected_kappa) + +if __name__ == '__main__': + googletest.main()
diff --git a/biology/model.py b/biology/model.py new file mode 100644 index 0000000..b8cf4a0 --- /dev/null +++ b/biology/model.py
@@ -0,0 +1,989 @@ +#!/usr/bin/python +# +# Copyright 2015 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Helper operations and classes for general model building. + +These methods are generally dependent on ModelConfig. +""" + +import collections +import cPickle as pickle +import os +import time +import warnings + + +import numpy as np +import pandas as pd +from sklearn import metrics as sklearn_metrics +import tensorflow as tf + +from tensorflow.python.platform import logging + +from tensorflow.python.platform import gfile + +from biology import metrics as biology_metrics +from biology import model_ops +from biology import utils as biology_utils + + +class Model(object): + """Generic base class for defining, training, and evaluating models. + + Subclasses must implement the following methods: + AddOutputOps + Build + Eval + ReadInput / _ReadInputGenerator + BatchInputGenerator (if you want to use mr_eval/EvalBatch) + TrainingCost + + Subclasses must set the following attributes: + loss: Op to calculate training cost used for gradient calculation. + output: Op(s) for model output for each task. + labels: Op(s) for true labels for each task. + weights: Op(s) for example weights for each task. + global_step: Scalar variable tracking total training/eval steps. + updates: Op(s) for running updates of e.g. moving averages for batch + normalization. Should be set to tf.no_op() if no updates are required. + + This base class provides the following attributes: + config: ModelConfig containing model configuration parameters. + graph: TensorFlow graph object. + logdir: Path to the file output directory to store checkpoints etc. + master: TensorFlow session master specification string. + num_tasks: Integer number of tasks this model trains/evals on. + placeholder_root: String placeholder prefix, used to create + placeholder_scope. + placeholder_scope: name scope where tf.placeholders are defined. + summary_writer: SummaryWriter for writing summaries. + valid: Placeholder for a boolean tensor with shape batch_size to use as a + mask when calculating gradient costs. + + Args: + config: ModelConfig. + train: If True, model is in training mode. + logdir: Directory for output files. + graph: Default graph. + master: BNS name of TensorFlow master. + summary_writer: SummaryWriter to use for writing summaries. If None, a new + SummaryWriter will be created. + """ + + def __init__(self, + config, + train, + logdir, + graph=None, + master='local', + summary_writer=None): + self.config = config + self.graph = graph if graph is not None else tf.Graph() + self.logdir = logdir + self.master = master + + # Path to save checkpoint files, which matches the + # replicated supervisor's default path. + self._save_path = os.path.join(logdir, 'model.ckpt') + + # batches. Lazily created by _SharedSession(). + self._shared_session = None + + # Guard variable to make sure we don't Restore() this model + # from a disk checkpoint more than once. 
+ self._restored_model = False + + # Cache of TensorFlow scopes, to prevent '_1' appended scope names + # when subclass-overridden methods use the same scopes. + self._name_scopes = {} + + with self.graph.as_default(): + model_ops.SetTraining(train) + self.placeholder_root = 'placeholders' + with tf.name_scope(self.placeholder_root) as scope: + self.placeholder_scope = scope + self.valid = tf.placeholder(tf.bool, + shape=[config.batch_size], + name='valid') + + num_classification_tasks = config.GetOptionalParam( + 'num_classification_tasks', 0) + num_regression_tasks = config.GetOptionalParam('num_regression_tasks', 0) + if num_classification_tasks and num_regression_tasks: + raise AssertionError( + 'Dual classification/regression models are not supported.') + self.num_tasks = num_classification_tasks + num_regression_tasks + if self.num_tasks == 0: + raise AssertionError('Must specify one of ' + 'num_classification_tasks or num_regression_tasks.') + + if summary_writer is None: + summary_writer = tf.train.SummaryWriter(logdir) + self.summary_writer = summary_writer + + def Build(self): + """Define the core model. + + NOTE(user): Operations defined here should be in their own name scope to + avoid any ambiguity when restoring checkpoints. + + Raises: + NotImplementedError: if not overridden by concrete subclass. + """ + raise NotImplementedError('Must be overridden by concrete subclass') + + def LabelPlaceholders(self): + """Add Placeholders for labels for each task. + + This method creates the following Placeholders for each task: + labels_%d: Float label tensor. For classification tasks, this tensor will + have shape batch_size x num_classes. For regression tasks, this tensor + will have shape batch_size. + + Raises: + NotImplementedError: if not overridden by concrete subclass. + """ + raise NotImplementedError('Must be overridden by concrete subclass') + + def WeightPlaceholders(self): + """Add Placeholders for example weights for each task. + + This method creates the following Placeholders for each task: + weights_%d: Label tensor with shape batch_size. + + Placeholders are wrapped in identity ops to avoid the error caused by + feeding and fetching the same tensor. + """ + weights = [] + for task in xrange(self.num_tasks): + with tf.name_scope(self.placeholder_scope): + weights.append(tf.identity( + tf.placeholder(tf.float32, shape=[self.config.batch_size], + name='weights_%d' % task))) + self.weights = weights + + def LabelsAndWeights(self): + """Add Placeholders for labels and weights. + + This method results in the creation of the following Placeholders for each + task: + labels_%d: Float label tensor. For classification tasks, this tensor will + have shape batch_size x num_classes. For regression tasks, this tensor + will have shape batch_size. + weights_%d: Label tensor with shape batch_size. + + This method calls self.LabelPlaceholders and self.WeightPlaceholders; the + former method must be implemented by a concrete subclass. + """ + self.LabelPlaceholders() + self.WeightPlaceholders() + + def ReadInput(self, input_pattern): + """Read input data and return a generator for minibatches. + + Args: + input_pattern: Input file pattern. + + Returns: + A generator that yields a dict for feeding a single batch to Placeholders + in the graph. + + Raises: + NotImplementedError: if not overridden by concrete subclass. 
+ """ + raise NotImplementedError('Must be overridden by concrete subclass') + + def _ReadInputGenerator(self, names, tensors): + """Generator that constructs feed_dict for minibatches. + + ReadInput cannot be a generator because any reading ops will not be added to + the graph until .next() is called (which is too late). Instead, ReadInput + should perform any necessary graph construction and then return this + generator. + + Args: + names: A list of tensor names. + tensors: A list of tensors to evaluate. + + Yields: + A dict for feeding a single batch to Placeholders in the graph. + + Raises: + NotImplementedError: if not overridden by concrete subclass. + """ + raise NotImplementedError('Must be overridden by concrete subclass') + + def _SharedNameScope(self, name): + """Returns a singleton TensorFlow scope with the given name. + + Used to prevent '_1'-appended scopes when sharing scopes with child classes. + + Args: + name: String. Name scope for group of operations. + Returns: + tf.name_scope with the provided name. + """ + if name not in self._name_scopes: + with tf.name_scope(name) as scope: + self._name_scopes[name] = scope + + return tf.name_scope(self._name_scopes[name]) + + def Cost(self, output, labels, weights): + """Calculate single-task training cost for a batch of examples. + + Args: + output: Tensor with model outputs. + labels: Tensor with true labels. + weights: Tensor with shape batch_size containing example weights. + + Returns: + A tensor with shape batch_size containing the weighted cost for each + example. For use in subclasses that want to calculate additional costs. + """ + # TODO(user): for mixed classification/regression models, pass in a task + # index to control the cost calculation + raise NotImplementedError('Must be overridden by concrete subclass') + + def TrainingCost(self): + self.RequireAttributes(['output', 'labels', 'weights']) + epsilon = 1e-3 # small float to avoid dividing by zero + config = self.config + weighted_costs = [] # weighted costs for each example + gradient_costs = [] # costs used for gradient calculation + old_costs = [] # old-style cost + + with self._SharedNameScope('costs'): + for task in xrange(self.num_tasks): + task_str = str(task).zfill(len(str(self.num_tasks))) + with self._SharedNameScope('cost_{}'.format(task_str)): + with tf.name_scope('weighted'): + weighted_cost = self.Cost(self.output[task], self.labels[task], + self.weights[task]) + weighted_costs.append(weighted_cost) + + with tf.name_scope('gradient'): + # Note that we divide by the batch size and not the number of + # non-zero weight examples in the batch. Also, instead of using + # tf.reduce_mean (which can put ops on the CPU) we explicitly + # calculate with div/sum so it stays on the GPU. 
+ gradient_cost = tf.div(tf.reduce_sum(weighted_cost), + config.batch_size) + tf.scalar_summary('cost' + task_str, + model_ops.MovingAverage(gradient_cost, + self.global_step)) + gradient_costs.append(gradient_cost) + + with tf.name_scope('old_cost'): + old_cost = tf.div( + tf.reduce_sum(weighted_cost), + tf.reduce_sum(self.weights[task]) + epsilon) + tf.scalar_summary('old-cost' + task_str, + model_ops.MovingAverage(old_cost, + self.global_step)) + old_costs.append(old_cost) + + # aggregated costs + with self._SharedNameScope('aggregated'): + with tf.name_scope('gradient'): + loss = tf.add_n(gradient_costs) + with tf.name_scope('old_cost'): + old_loss = tf.add_n(old_costs) + + # weight decay + if config.penalty != 0.0: + penalty = WeightDecay(config) + loss += penalty + old_loss += penalty + + # loss used for gradient calculation + self.loss = loss + + # (smoothed) summaries + tf.scalar_summary('Total Cost', + model_ops.MovingAverage(loss, self.global_step)) + tf.scalar_summary('Total Old-Style Cost', + model_ops.MovingAverage(old_loss, self.global_step)) + + return weighted_costs + + def Setup(self): + """Add ops common to training/eval to the graph.""" + with tf.name_scope('core_model'): + self.Build() + self.LabelsAndWeights() + self.global_step = tf.Variable(0, name='global_step', trainable=False) + + def MergeUpdates(self): + """Group updates into a single op.""" + updates = tf.get_default_graph().get_collection('updates') + if updates: + self.updates = tf.group(*updates, name='updates') + else: + self.updates = tf.no_op(name='updates') + + def TrainingOp(self): + """Get training op for applying gradients to variables. + + Subclasses that need to do anything fancy with gradients should override + this method. + + Returns: + A training op. + """ + opt = Optimizer(self.config) + return opt.minimize(self.loss, global_step=self.global_step, name='train') + + def SummaryOp(self): + """Get summary op for computing all summaries during training. + + Returns: + A summary op. + """ + return tf.merge_all_summaries() + + def Train(self, + input_generator, + max_steps=None, + summaries=False, + save_model_secs=60, + save_summary_secs=30, + max_checkpoints_to_keep=5): + """Train the model. + + Args: + input_generator: Generator that returns a feed_dict for feeding + Placeholders in the model graph. Usually this will be ReadInput with a + provided input pattern. + max_steps: Maximum number of training steps. If not provided, will + train indefinitely. + summaries: If True, add summaries for model parameters. + save_model_secs: Integer. Saves a checkpoint at this interval in seconds. + save_summary_secs: Integer. Saves a summary event file at this interval in + seconds. + max_checkpoints_to_keep: Integer. Maximum number of checkpoints to keep; + older checkpoints will be deleted. + + Raises: + AssertionError: If model is not in training mode. 
+ """ + assert model_ops.IsTraining() + self.Setup() + self.TrainingCost() + self.MergeUpdates() + self.RequireAttributes(['loss', 'global_step', 'updates']) + if summaries: + self.AddSummaries() + train_op = self.TrainingOp() + summary_op = self.SummaryOp() + no_op = tf.no_op() + tf.train.write_graph( + tf.get_default_graph().as_graph_def(), self.logdir, 'train.pbtxt') + self.summary_writer.add_graph(tf.get_default_graph().as_graph_def()) + last_checkpoint_time = time.time() + last_summary_time = time.time() + with self._SharedSession() as sess: + sess.run(tf.initialize_all_variables()) + saver = tf.train.Saver(max_to_keep=max_checkpoints_to_keep) + # Save an initial checkpoint. + saver.save(sess, self._save_path, global_step=self.global_step) + for feed_dict in input_generator: + # Run training op and compute summaries. + secs_since_summary = time.time() - last_summary_time + if secs_since_summary > save_summary_secs: + this_summary_op = summary_op + else: + this_summary_op = no_op + step, loss, _, summary = sess.run( + [train_op.values()[0], self.loss, self.updates, this_summary_op], + feed_dict=feed_dict) + if summary is not None: + self.summary_writer.add_summary(summary, global_step=step) + last_summary_time = time.time() + # Save model checkpoints. + secs_since_checkpoint = time.time() - last_checkpoint_time + if secs_since_checkpoint > save_model_secs: + logging.info('step %d: %g', step, loss) + saver.save(sess, self._save_path, global_step=self.global_step) + last_checkpoint_time = time.time() + # Quit when we reach max_steps. + if max_steps is not None and step >= max_steps: + break + # Always save a final checkpoint when complete. + saver.save(sess, self._save_path, global_step=self.global_step) + + def AddOutputOps(self): + """Add ops for inference. + + Default implementation is pass, derived classes can override as needed. + """ + pass + + def _SharedSession(self): + if not self._shared_session: + # allow_soft_placement=True allows ops without a GPU implementation + # to run on the CPU instead. + config = tf.ConfigProto(allow_soft_placement=True) + self._shared_session = tf.Session(self.master, config=config) + return self._shared_session + + def CloseSharedSession(self): + if self._shared_session: + self._shared_session.close() + + def Restore(self, checkpoint): + """Restores the model from the provided training checkpoint. + + Args: + checkpoint: string. Path to checkpoint file. + """ + if self._restored_model: + return + + with self.graph.as_default(): + self.Setup() + self.AddOutputOps() # add softmax heads + saver = tf.train.Saver(tf.variables.all_variables()) + saver.restore(self._SharedSession(), + biology_utils.ParseCheckpoint(checkpoint)) + self.global_step_number = int(self._SharedSession().run(self.global_step)) + + self._restored_model = True + + def Eval(self, input_generator, checkpoint, metrics=None): + """Evaluate the model. + + Args: + input_generator: Generator that returns a feed_dict for feeding + Placeholders in the model graph. + checkpoint: Checkpoint filename. + metrics: List of metrics to compute. Defaults to self.default_metrics, + which is set in subclasses. + + Returns: + step: Global step for this eval. + results: A dict mapping metric names to numpy arrays containing metric + values for each task. 
+ """ + self.Restore(checkpoint) + output, labels, weights = self.ModelOutput(input_generator) + y_true, y_pred = self.ParseModelOutput(output, labels, weights) + + # keep counts for each class as a sanity check + counts = self.ExampleCounts(y_true) + + # compute metrics + if metrics is None: + metrics = self.default_metrics + metric_values = {} + for metric in metrics: + metric_values[metric] = self.ComputeMetric(y_true, y_pred, metric) + self.ReportEval(metric_values, counts=counts, + global_step=self.global_step_number) + return self.global_step_number, metric_values + + def ComputeMetric(self, y_true, y_pred, metric_str, threshold=0.5): + """Compute a performance metric for each task. + + Args: + y_true: A list of arrays containing true values for each task. + y_pred: A list of arrays containing predicted values for each task. + metric_str: String description of the metric to compute. Must be in + biology_metrics.METRICS. + threshold: Float threshold to apply to probabilities for positive/negative + class assignment. + + Returns: + A numpy array containing metric values for each task. + """ + computed_metrics = [] + for task in xrange(self.num_tasks): + yt = y_true[task] + yp = y_pred[task] + try: + metric_value = biology_metrics.compute_metric(yt, yp, metric_str, + threshold=threshold) + except (AssertionError, ValueError) as e: + warnings.warn('Error calculating metric %s for task %d: %s' + % (metric_str, task, e)) + metric_value = np.nan + computed_metrics.append(metric_value) + return computed_metrics + + def EvalBatch(self, input_batch): + """Runs inference on the provided batch of input. + + Args: + input_batch: iterator of input with len self.config.batch_size. + + Returns: + Tuple of three numpy arrays with shape num_examples x num_tasks (x ...): + output: Model predictions. + labels: True labels. + weights: Example weights. + """ + with self.graph.as_default(): + return self.ModelOutput(self.BatchInputGenerator(input_batch)) + + def ModelOutput(self, input_generator): + """Return model output for the provided input. + + Restore(checkpoint) must have previously been called on this object. + + Args: + input_generator: Generator that returns a feed_dict for feeding + Placeholders in the model graph. + + Returns: + Tuple of three numpy arrays with shape num_examples x num_tasks (x ...): + output: Model outputs. + labels: True labels. + weights: Example weights. + Note that the output and labels arrays may be more than 2D, e.g. for + classifier models that return class probabilities. + + Raises: + AssertionError: If model is not in evaluation mode. + ValueError: If output and labels are not both 3D or both 2D. + """ + assert not model_ops.IsTraining() + assert self._restored_model + self.RequireAttributes(['output', 'labels', 'weights']) + + # run eval data through the model + num_tasks = self.num_tasks + output, labels, weights = [], [], [] + start = time.time() + with self._SharedSession().as_default(): + batches_per_summary = 1000 + seconds_per_summary = 0 + batch_count = -1.0 + for feed_dict in input_generator: + batch_start = time.time() + batch_count += 1 + data = self._SharedSession().run( + self.output + self.labels + self.weights, + feed_dict=feed_dict) + batch_output = np.asarray(data[:num_tasks], dtype=float) + batch_labels = np.asarray(data[num_tasks:num_tasks * 2], dtype=float) + batch_weights = np.asarray(data[num_tasks * 2:num_tasks * 3], + dtype=float) + # reshape to batch_size x num_tasks x ... 
+ if batch_output.ndim == 3 and batch_labels.ndim == 3: + batch_output = batch_output.transpose((1, 0, 2)) + batch_labels = batch_labels.transpose((1, 0, 2)) + elif batch_output.ndim == 2 and batch_labels.ndim == 2: + batch_output = batch_output.transpose((1, 0)) + batch_labels = batch_labels.transpose((1, 0)) + else: + raise ValueError( + 'Unrecognized rank combination for output and labels: %s %s' % + (batch_output.shape, batch_labels.shape)) + batch_weights = batch_weights.transpose((1, 0)) + valid = feed_dict[self.valid.name] + # only take valid outputs + if np.count_nonzero(~valid): + batch_output = batch_output[valid] + batch_labels = batch_labels[valid] + batch_weights = batch_weights[valid] + output.append(batch_output) + labels.append(batch_labels) + weights.append(batch_weights) + + # Writes summary for tracking eval progress. + seconds_per_summary += (time.time() - batch_start) + self.RequireAttributes(['summary_writer']) + if batch_count % batches_per_summary == 0: + mean_seconds_per_batch = seconds_per_summary / batches_per_summary + seconds_per_summary = 0 + summaries = [ + tf.scalar_summary('secs/batch', mean_seconds_per_batch), + tf.scalar_summary('batches_evaluated', batch_count) + ] + self.summary_writer.add_summary(tf.merge_summary(summaries).eval(), + global_step=self.global_step_number) + self.summary_writer.flush() + + logging.info('Eval took %g seconds', time.time() - start) + + output = np.concatenate(output) + labels = np.concatenate(labels) + weights = np.concatenate(weights) + + return output, labels, weights + + def ReportEval(self, metrics, global_step, counts=None, name=None): + """Write Eval summaries. + + Args: + metrics: Dict mapping metric names to numpy arrays containing metric + values for each task. + global_step: Integer. Global step number inference was run on. + counts: Dict mapping class names to counts. + name: String name for this group of metrics. Useful for organizing + metrics calculated using different subsets of the data. + """ + # create a DataFrame to hold results + data = dict() + if counts is not None: + data.update({'count_%s' % group: values + for group, values in counts.iteritems()}) + data.update(metrics) + df = pd.DataFrame(data) + print 'Eval at step: %d' % global_step + print df + # add global step to df + df['step'] = global_step + + # save an update to disk + filename = os.path.join(self.logdir, 'eval-%d.pkl' % global_step) + with open(filename, 'w') as f: + pickle.dump(df, f, pickle.HIGHEST_PROTOCOL) + + # write a summary for each metric + self.RequireAttributes(['summary_writer']) + with tf.Session(self.master): + summaries = [] + prefix = '' if name is None else '%s - ' % name + for metric_name, results in metrics.iteritems(): + for task in xrange(self.num_tasks): + task_str = str(task).zfill(len(str(self.num_tasks))) + summaries.append( + tf.scalar_summary('%s%s_%s' % (prefix, metric_name, task_str), + results[task])) + summaries.append(tf.scalar_summary( + '%sMean %s' % (prefix, metric_name), np.mean(results))) + summaries.append(tf.scalar_summary( + '%sMedian %s' % (prefix, metric_name), np.median(results))) + self.summary_writer.add_summary(tf.merge_summary(summaries).eval(), + global_step=global_step) + self.summary_writer.flush() + + def RequireAttributes(self, attrs): + """Require class attributes to be defined. + + Args: + attrs: A list of attribute names that must be defined. + + Raises: + AssertionError: if a required attribute is not defined. 
+ """ + for attr in attrs: + if getattr(self, attr, None) is None: + raise AssertionError( + 'self.%s must be defined by a concrete subclass' % attr) + + def AddSummaries(self): + """Add summaries for model parameters.""" + for var in tf.trainable_variables(): + if 'BatchNormalize' in var.name: + continue + tf.histogram_summary(var.name, var) + + +class Classifier(Model): + """Classification model. + + Subclasses must set the following attributes: + output: Logits op(s) used for computing classification loss and predicted + class probabilities for each task. + + Class attributes: + default_metrics: List of metrics to compute by default. + """ + + default_metrics = ['auc'] + + def Cost(self, logits, labels, weights): + """Calculate single-task training cost for a batch of examples. + + Args: + logits: Tensor with shape batch_size x num_classes containing logits. + labels: Tensor with shape batch_size x num_classes containing true labels + in a one-hot encoding. + weights: Tensor with shape batch_size containing example weights. + + Returns: + A tensor with shape batch_size containing the weighted cost for each + example. + """ + return tf.mul(tf.nn.softmax_cross_entropy_with_logits(logits, labels), + weights) + + def TrainingCost(self): + """Calculate additional classifier-specific costs. + + Returns: + A list of tensors with shape batch_size containing costs for each task. + """ + weighted_costs = super(Classifier, self).TrainingCost() # calculate loss + epsilon = 1e-3 # small float to avoid dividing by zero + config = self.config + num_tasks = config.num_classification_tasks + cond_costs = collections.defaultdict(list) + + with self._SharedNameScope('costs'): + for task in xrange(num_tasks): + task_str = str(task).zfill(len(str(num_tasks))) + with self._SharedNameScope('cost_{}'.format(task_str)): + with tf.name_scope('conditional'): + # pos/neg costs: mean over pos/neg examples + for name, label in [('neg', 0), ('pos', 1)]: + cond_weights = self.labels[task][:config.batch_size, label] + cond_cost = tf.div( + tf.reduce_sum(tf.mul(weighted_costs[task], cond_weights)), + tf.reduce_sum(cond_weights) + epsilon) + tf.scalar_summary('%s_%s' % (name, task_str), + model_ops.MovingAverage(cond_cost, + self.global_step)) + cond_costs[name].append(cond_cost) + + # aggregated costs + with self._SharedNameScope('aggregated'): + with tf.name_scope('pos_cost'): + pos_cost = tf.add_n(cond_costs['pos']) + with tf.name_scope('neg_cost'): + neg_cost = tf.add_n(cond_costs['neg']) + + # (smoothed) summaries + tf.scalar_summary('Total Neg Cost', + model_ops.MovingAverage(neg_cost, self.global_step)) + tf.scalar_summary('Total Pos Cost', + model_ops.MovingAverage(pos_cost, self.global_step)) + + # keep track of the number of positive examples seen by each task + with tf.name_scope('counts'): + for task in xrange(num_tasks): + num_pos = tf.Variable(0.0, name='num_pos_%d' % task, trainable=False) + # the assignment must occur on the same device as the variable + with tf.device(num_pos.device): + tf.get_default_graph().add_to_collection( + 'updates', num_pos.assign_add( + tf.reduce_sum(self.labels[task][:config.batch_size, 1]))) + tf.scalar_summary(num_pos.name, num_pos) + + return weighted_costs + + def AddOutputOps(self): + """Replace logits with softmax outputs.""" + softmax = [] + with tf.name_scope('inference'): + for i, logits in enumerate(self.output): + softmax.append(tf.nn.softmax(logits, name='softmax_%d' % i)) + self.output = softmax + + def ExampleCounts(self, y_true): + """Get counts of examples in 
each class. + + Args: + y_true: List of numpy arrays containing true values, one for each task. + + Returns: + A dict mapping class names to counts. + """ + classes = np.unique(np.concatenate(y_true)) + counts = {klass: np.zeros(self.num_tasks, dtype=int) + for klass in classes} + for task in xrange(self.num_tasks): + for klass in classes: + counts[klass][task] = np.count_nonzero(y_true[task] == klass) + return counts + + def LabelPlaceholders(self): + """Add Placeholders for labels for each task. + + This method creates the following Placeholders for each task: + labels_%d: Label tensor with shape batch_size x num_classes. + + Placeholders are wrapped in identity ops to avoid the error caused by + feeding and fetching the same tensor. + """ + config = self.config + batch_size = config.batch_size + num_classes = config.num_classes + labels = [] + for task in xrange(self.num_tasks): + with tf.name_scope(self.placeholder_scope): + labels.append(tf.identity( + tf.placeholder(tf.float32, shape=[batch_size, num_classes], + name='labels_%d' % task))) + self.labels = labels + + def ParseModelOutput(self, output, labels, weights): + """Parse model output to get true and predicted values for each task. + + Args: + output: Numpy array containing model output with shape + batch_size x num_tasks x num_classes. + labels: Numpy array containing one-hot example labels with shape + batch_size x num_tasks x num_classes. + weights: Numpy array containing example weights with shape + batch_size x num_tasks. + + Returns: + y_true: List of numpy arrays containing true labels, one for each task. + y_pred: List of numpy arrays containing predicted labels, one for each + task. + """ + y_true, y_pred = [], [] + for task in xrange(self.config.num_classification_tasks): + # mask examples with zero weight + mask = weights[:, task] > 0 + # get true class labels + y_true.append(labels[mask, task, 1]) + # get positive class probabilities for predictions + y_pred.append(output[mask, task, 1]) + return y_true, y_pred + + +class Regressor(Model): + """Regression model. + + Subclasses must set the following attributes: + output: Op(s) used for computing regression loss and predicted regression + outputs for each task. + + Class attributes: + default_metrics: List of metrics to compute by default. + """ + + default_metrics = ['r2'] + + def Cost(self, output, labels, weights): + """Calculate single-task training cost for a batch of examples. + + Args: + output: Tensor with shape batch_size containing predicted values. + labels: Tensor with shape batch_size containing true values. + weights: Tensor with shape batch_size containing example weights. + + Returns: + A tensor with shape batch_size containing the weighted cost for each + example. + """ + return tf.mul(tf.nn.l2_loss(output - labels), weights) + + def ExampleCounts(self, y_true): + """Get counts of examples in each class. + + Args: + y_true: List of numpy arrays containing true values, one for each task. + + Returns: + A dict mapping class names to counts. + """ + return {'all': np.asarray([len(y_true[task]) + for task in xrange(self.num_tasks)])} + + def LabelPlaceholders(self): + """Add Placeholders for labels for each task. + + This method creates the following Placeholders for each task: + labels_%d: Label tensor with shape batch_size. + + Placeholders are wrapped in identity ops to avoid the error caused by + feeding and fetching the same tensor. 
+ """ + batch_size = self.config.batch_size + labels = [] + for task in xrange(self.num_tasks): + with tf.name_scope(self.placeholder_scope): + labels.append(tf.identity( + tf.placeholder(tf.float32, shape=[batch_size], + name='labels_%d' % task))) + self.labels = labels + + def ParseModelOutput(self, output, labels, weights): + """Parse model output to get true and predicted values for each task. + + Args: + output: Numpy array containing model output with shape + batch_size x num_tasks x num_classes. + labels: Numpy array containing one-hot example labels with shape + batch_size x num_tasks x num_classes. + weights: Numpy array containing example weights with shape + batch_size x num_tasks. + + Returns: + y_true: List of numpy arrays containing true labels, one for each task. + y_pred: List of numpy arrays containing predicted labels, one for each + task. + """ + # build arrays of true and predicted values for R-squared calculation + y_true, y_pred = [], [] + for task in xrange(self.config.num_regression_tasks): + mask = weights[:, task] > 0 # ignore examples with zero weight + y_true.append(labels[mask, task]) + y_pred.append(output[mask, task]) + return y_true, y_pred + + +def Optimizer(config): + """Create model optimizer. + + Args: + config: ModelConfig. + + Returns: + A training Optimizer. + + Raises: + NotImplementedError: If an unsupported optimizer is requested. + """ + # TODO(user): gradient clipping (see Minimize) + if config.optimizer == 'adagrad': + train_op = tf.train.AdagradOptimizer(config.learning_rate) + elif config.optimizer == 'adam': + train_op = tf.train.AdamOptimizer(config.learning_rate) + elif config.optimizer == 'momentum': + train_op = tf.train.MomentumOptimizer(config.learning_rate, config.memory) + elif config.optimizer == 'rmsprop': + train_op = tf.train.RMSPropOptimizer(config.learning_rate, config.memory) + elif config.optimizer == 'sgd': + train_op = tf.train.GradientDescentOptimizer(config.learning_rate) + else: + raise NotImplementedError('Unsupported optimizer %s' % config.optimizer) + return train_op + + +def WeightDecay(config): + """Add weight decay. + + Args: + config: ModelConfig. + + Returns: + A scalar tensor containing the weight decay cost. + + Raises: + NotImplementedError: If an unsupported penalty type is requested. + """ + variables = [] + # exclude bias variables + for v in tf.trainable_variables(): + if v.get_shape().ndims == 2: + variables.append(v) + + with tf.name_scope('weight_decay'): + if config.penalty_type == 'l1': + cost = tf.add_n([tf.reduce_sum(tf.Abs(v)) for v in variables]) + elif config.penalty_type == 'l2': + cost = tf.add_n([tf.nn.l2_loss(v) for v in variables]) + else: + raise NotImplementedError('Unsupported penalty_type %s' % + config.penalty_type) + cost *= config.penalty + tf.scalar_summary('Weight Decay Cost', cost) + return cost
diff --git a/biology/model_config.proto b/biology/model_config.proto new file mode 100644 index 0000000..23e30fa --- /dev/null +++ b/biology/model_config.proto
@@ -0,0 +1,44 @@ +// Copyright 2015 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// +syntax = "proto2"; + +package biology; + +// Neural network model configuration, used mostly for +// de/serializing model parameters for a given execution from/to disk. +message ModelConfig { + message Parameter { + optional string name = 1; + optional string description = 10; + + // See oneof user guide: + // http://sites/protocol-buffers/user-docs/miscellaneous-howtos/oneof + oneof value { + float float_value = 2; + int32 int_value = 3; + string string_value = 4; + bool bool_value = 5; + } + + repeated float float_list = 6; + repeated int32 int_list = 7; + repeated string string_list = 8; + repeated bool bool_list = 9; + }; + + repeated Parameter parameter = 1; + optional string description = 2; +}
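
As a small illustration of the schema above (assuming protoc has generated biology/model_config_pb2.py from this file), a scalar parameter lands in the value oneof while a list uses the repeated fields, and the message round-trips through the text format:

from google.protobuf import text_format

from biology import model_config_pb2

config = model_config_pb2.ModelConfig()
config.description = 'example config'

scalar = config.parameter.add()
scalar.name = 'learning_rate'
scalar.float_value = 0.001           # stored in the "value" oneof

listed = config.parameter.add()
listed.name = 'layer_sizes'
listed.int_list.extend([1000, 100])  # stored in the repeated field

print text_format.MessageToString(config)
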
diff --git a/biology/model_config.py b/biology/model_config.py new file mode 100644 index 0000000..dbe88ee --- /dev/null +++ b/biology/model_config.py
@@ -0,0 +1,217 @@ +#!/usr/bin/python +# +# Copyright 2015 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Wrapper of key-value pairs, which can be de/serialized from/to disk. +""" +import re + +from google.protobuf import text_format +from tensorflow.python.platform import gfile + +from biology import model_config_pb2 + + +class ModelConfig(object): + """Wrapper of key-value pairs which can be de/serialized from/to disk. + + A given key-value pair cannot be removed once added. + This wrapper is mostly meant to read + a config from disk or a python dict once, and subsequently the + values are read through the object's attributes. + + De/Serialization is done through a protocol buffer with a text format, + so files on disk are human readable and editable. See the unittest + for an example of the protocol buffer text format. + """ + + _supported_types = [bool, int, float, str, unicode, list] + _supported_overwrite_modes = ['forbidden', 'required', 'allowed'] + + def __init__(self, defaults=None): + """Creates a config object. + + Args: + defaults: An optional dictionary with string keys and + possibly heterogenously typed values; + see class attribute _supported_types for supported types. + The newly constructed object will gain attributes matching + the dict's keys and values. + """ + self._config_dict = {} + if defaults: + for key, value in defaults.iteritems(): + self.AddParam(key, value, overwrite='forbidden') + + def _ValidateParam(self, key, value, overwrite): + """Checks param has a valid type, name, and enforces duplicate key handling. + + Args: + key: str or unicode. Must be an allowable python attribute name, + (specifically, must match r'^[a-zA-Z][a-zA-Z_0-9]+$') + value: bool, int, float, str, unicode or homogeneous list thereof. + The value to be stored. + overwrite: String, how to handle duplicate keys. + 'forbidden': raise ValueError if key is already present. + 'required': raise ValueError if key is *not* already present. + 'allowed': key will be added or updated silently. + + Raises: + ValueError: if parameters are not valid types, + or if the key is not an allowable python attribute name, + or if duplicate key validation failed. 
+ """ + if overwrite not in self._supported_overwrite_modes: + raise ValueError( + 'overwrite mode "{}" not allowed, must be one of {}'.format( + overwrite, ','.join(self._supported_overwrite_modes))) + + if type(key) not in [str, unicode]: + raise ValueError('Key must but a string, but is: {}'.format(type(key))) + + if re.match(r'^[a-zA-Z][a-zA-Z_0-9]+$', key) is None: + raise ValueError('Key is a bad attribute name: {}'.format(key)) + + if key in self._config_dict: + if overwrite == 'forbidden': + raise ValueError('Not allowed to specify same key twice: {}'.format( + key)) + if (not isinstance(value, type(self._config_dict[key])) and + {str, unicode} != {type(value), type(self._config_dict[key])}): + raise ValueError( + 'Not allowed to change value type ({} -> {}) for a key: {}'.format( + type(self._config_dict[key]), type(value), key)) + else: + if overwrite == 'required': + raise ValueError('Must specify default for {}'.format(key)) + + if type(value) not in self._supported_types: + raise ValueError( + 'Only {} values allowed: {}'.format( + ','.join([str(t) for t in self._supported_types]), + type(value))) + + if type(value) is list: + if not value: + raise ValueError('Only non-empty lists supported: {}'.format(key)) + type_set = {type(v) for v in value} + if len(type_set) > 1: + raise ValueError('Only homogenous lists supported, found: {}={}'.format( + key, ','.join(str(t) for t in type_set))) + + def AddParam(self, key, value, overwrite): + """Adds one key-value pair to the dict being stored. + + Args: + key: str or unicode. Must be an allowable python attribute name, + (specifically, must match r'^[a-zA-Z][a-zA-Z_0-9]+$') + value: bool, int, float, str, unicode or homogeneous list thereof. + The value to be stored. + overwrite: String, how to handle duplicate keys. + See _ValidateParam for allowed values and descriptions. + + Raises: + ValueError: see _ValidateParam for raising conditions. + """ + self._ValidateParam(key, value, overwrite) + self._config_dict[key] = value + setattr(self, key, value) + + def GetOptionalParam(self, key, default_value): + """Returns the param value or the default_value if not present. + + Typically you should directly read the object attribute for the + key, but if the key is optionally present this method can be convenient. + + Args: + key: String of the parameter name. + default_value: Value to return if key is not present in this config. + May be int, float or string. + + Returns: + Value of the parameter named by key or default_value if key isn't present. + """ + return getattr(self, key, default_value) + + def WriteToFile(self, filename): + """Writes this ModelConfig object to disk. + + Args: + filename: Path to write config to on disk. + + Raises: + IOError: in case of error while writing. + ValueError: in case of unsupported key or value type. 
+ """ + config_proto = model_config_pb2.ModelConfig() + for key, value in sorted(self._config_dict.iteritems()): + proto_param = config_proto.parameter.add() + proto_param.name = key + if type(value) is int: + proto_param.int_value = value + elif type(value) is float: + proto_param.float_value = value + elif type(value) in [str, unicode]: + proto_param.string_value = value + elif type(value) is bool: + proto_param.bool_value = value + elif type(value) is list: + list_type = type(value[0]) + if list_type is int: + proto_param.int_list.extend(value) + elif list_type is float: + proto_param.float_list.extend(value) + elif list_type in [str, unicode]: + proto_param.string_list.extend(value) + elif list_type is bool: + proto_param.bool_list.extend(value) + else: + raise ValueError('Unsupported list type: {}'.format(list_type)) + else: + raise ValueError('Unsupported value type: {}'.format(type(value))) + + with open(filename, mode='w') as config_file: + config_file.write(text_format.MessageToString(config_proto)) + + def ReadFromFile(self, filename, overwrite='required'): + """Reads into this ModelConfig object from disk. + + Args: + filename: Path to serialized config file. + overwrite: String, how to handle duplicate keys. + See _ValidateParam for allowed values and descriptions. + + Raises: + IOError: in case of error while reading. + ValueError: if no value is set in a parameter. + """ + config_proto = model_config_pb2.ModelConfig() + with open(filename) as config_file: + text_format.Merge(config_file.read(), config_proto) + + for p in config_proto.parameter: + value_name = p.WhichOneof('value') + if value_name: + value = getattr(p, value_name) + elif p.int_list: + value = list(p.int_list) + elif p.float_list: + value = list(p.float_list) + elif p.string_list: + value = list(p.string_list) + elif p.bool_list: + value = list(p.bool_list) + else: + raise ValueError('No value set for key: {}'.format(p.name)) + self.AddParam(p.name, value, overwrite)
diff --git a/biology/model_config_test.py b/biology/model_config_test.py new file mode 100644 index 0000000..25f4dad --- /dev/null +++ b/biology/model_config_test.py
@@ -0,0 +1,194 @@ +#!/usr/bin/python +# +# Copyright 2015 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +import tempfile + +from tensorflow.python.platform import flags +from tensorflow.python.platform import gfile +from tensorflow.python.platform import googletest + +from biology import model_config + +EXAMPLE_DICT = { + 'hello': 'world', + 'pi': 3.14159, + 'Forty_Two': 42, + 'great': True, + 'spells': ['alohamora', 'expelliarmus'], + 'scores': [9.8, 10.0], + 'sizes': [2000, 100], + 'waver': [True, False, True], + } + +EXAMPLE_DEFAULTS = { + 'hello': 'there', + 'pi': 3.14, + 'Forty_Two': 24, + 'great': False, + 'spells': ['abracadabra', 'cruciatus'], + 'scores': [1.8, 1.0], + 'sizes': [1200, 10], + 'waver': [False, True, False], +} + +EXAMPLE_FILE_CONTENTS = """parameter { + name: "Forty_Two" + int_value: 42 +} +parameter { + name: "great" + bool_value: true +} +parameter { + name: "hello" + string_value: "world" +} +parameter { + name: "pi" + float_value: 3.14159 +} +parameter { + name: "scores" + float_list: 9.8 + float_list: 10.0 +} +parameter { + name: "sizes" + int_list: 2000 + int_list: 100 +} +parameter { + name: "spells" + string_list: "alohamora" + string_list: "expelliarmus" +} +parameter { + name: "waver" + bool_list: true + bool_list: false + bool_list: true +} +""" + + +class ModelConfigTest(googletest.TestCase): + + def setUp(self): + super(ModelConfigTest, self).setUp() + self.root = tempfile.mkdtemp(dir=flags.FLAGS.test_tmpdir) + + def _assertMatchesExample(self, config): + self.assertEqual(config.hello, 'world') + self.assertEqual(config.pi, 3.14159) + self.assertEqual(config.Forty_Two, 42) + self.assertTrue(config.great) + self.assertEqual(config.scores, [9.8, 10.0]) + self.assertEqual(config.sizes, [2000, 100]) + self.assertEqual(config.spells, ['alohamora', 'expelliarmus']) + self.assertEqual(config.waver, [True, False, True]) + + def testCreatesAttributes(self): + config = model_config.ModelConfig(EXAMPLE_DICT) + self._assertMatchesExample(config) + + def testGetOptionalParam(self): + config = model_config.ModelConfig(EXAMPLE_DICT) + self.assertEqual('world', config.GetOptionalParam('hello', 'everybody')) + self.assertEqual('default', config.GetOptionalParam('otherkey', 'default')) + + def testOnlyValidAttributeNamesAllowed(self): + config = model_config.ModelConfig() + with self.assertRaises(ValueError): + config.AddParam('spaces not allowed', + 'blah', + overwrite='forbidden') + + with self.assertRaises(ValueError): + config.AddParam('42_must_start_with_letter', + 'blah', + overwrite='forbidden') + + with self.assertRaises(ValueError): + config.AddParam('hyphens-not-allowed', + 'blah', + overwrite='forbidden') + + with self.assertRaises(ValueError): + config.AddParam('', + 'empty string no good', + overwrite='forbidden') + + def testDuplicateKeysNotAllowed(self): + config = model_config.ModelConfig(EXAMPLE_DICT) + with self.assertRaises(ValueError): + config.AddParam('hello', + 'everybody', + overwrite='forbidden') + + def 
testRequireDefault(self): + config = model_config.ModelConfig(EXAMPLE_DICT) + config.AddParam('hello', + 'everybody', + overwrite='required') + with self.assertRaises(ValueError): + config.AddParam('not', + 'present', + overwrite='required') + + def testSilentOverwrite(self): + config = model_config.ModelConfig(EXAMPLE_DICT) + config.AddParam('not', 'present', overwrite='allowed') + config.AddParam('not', 'anymore', overwrite='allowed') + + def testHeterogeneousList(self): + config = model_config.ModelConfig() + with self.assertRaises(ValueError): + config.AddParam('different', + ['types for', 'different', 0xF, 0x0, 'lks'], + overwrite='forbidden') + + def testWritesFile(self): + config = model_config.ModelConfig(EXAMPLE_DICT) + filename = os.path.join(self.root, 'config.pbtxt') + config.WriteToFile(filename) + + with open(filename) as pbtxt_file: + self.assertEqual(EXAMPLE_FILE_CONTENTS, pbtxt_file.read()) + + def testReadsFile_NoDuplicates(self): + filename = os.path.join(self.root, 'config.pbtxt') + with open(filename, 'w') as pbtxt_file: + pbtxt_file.write(EXAMPLE_FILE_CONTENTS) + + config = model_config.ModelConfig() + config.ReadFromFile(filename, overwrite='forbidden') + self._assertMatchesExample(config) + + def testReadsFile_RequireDefaults(self): + filename = os.path.join(self.root, 'config.pbtxt') + with open(filename, 'w') as pbtxt_file: + pbtxt_file.write(EXAMPLE_FILE_CONTENTS) + + self.assertEqual(set(EXAMPLE_DEFAULTS.keys()), set(EXAMPLE_DICT.keys())) + config = model_config.ModelConfig(EXAMPLE_DEFAULTS) + config.ReadFromFile(filename, overwrite='required') + self._assertMatchesExample(config) + + +if __name__ == '__main__': + googletest.main()
diff --git a/biology/model_ops.py b/biology/model_ops.py new file mode 100644 index 0000000..36ff42a --- /dev/null +++ b/biology/model_ops.py
@@ -0,0 +1,360 @@ +#!/usr/bin/python +# +# Copyright 2015 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Ops for graph construction.""" + + +import tensorflow as tf + +from google.protobuf import text_format + +from tensorflow.python.platform import gfile +from tensorflow.python.platform import logging + +from biology import utils as model_utils + + +def AddBias(tensor, init=None, name=None): + """Add a bias term to a tensor. + + Args: + tensor: Variable tensor. + init: Bias initializer. Defaults to zero. + name: Name for this op. Defaults to tensor.op.name. + + Returns: + A biased tensor with the same shape as the input tensor. + """ + if init is None: + init = tf.zeros([tensor.get_shape()[-1].value]) + with tf.op_scope([tensor], name, tensor.op.name): + b = tf.Variable(init, name='b') + return tf.nn.bias_add(tensor, b) + + +def BatchNormalize(tensor, convolution, mask=None, epsilon=0.001, + scale_after_normalization=True, decay=0.999, + global_step=None, name=None): + """Batch normalization. + + Normalize, scale, and shift the input tensor to reduce covariate shift. + + NOTE(user): For inference, the mean and variance must be set to fixed + values derived from the entire training set. This is accomplished by using + moving_mean and moving_variance during evaluation. Be sure that models run + the ops in updates during training or the moving averages will not be very + useful! + + Args: + tensor: Input tensor (must be 4D). + convolution: If True, perform normalization across rows and columns as + well as over batch. + mask: Mask to apply to tensor. + epsilon: Small float to avoid dividing by zero. + scale_after_normalization: If True, multiply by gamma. If False, gamma is + not used. When the next layer is linear (also e.g. ReLU), this can be + disabled since the scaling can be done by the next layer. + decay: Float value for moving average decay. + global_step: Tensor containing global step for accelerating moving averages + at the beginning of training. + name: Name for this op. Defaults to 'batch_norm'. + + Returns: + A new tensor corresponding to the batch normalized input. + + Raises: + ValueError: If the input tensor is not 4D. + """ + if len(tensor.get_shape()) != 4: + raise ValueError('Input tensor must be 4D, not %dD' + % len(tensor.get_shape())) + if convolution: + axes = [0, 1, 2] + shape = tensor.get_shape()[3:] + else: + axes = [0] + shape = tensor.get_shape()[1:] + with tf.op_scope([tensor], None, 'BatchNormalize'): + if mask is not None: + mean, variance = model_utils.Moment( + 2, tensor, reduction_indices=axes, mask=mask) + else: + mean, variance = tf.nn.moments(tensor, axes) + + # Keep track of moving averages for mean and variance. During eval, use the + # moving averages from training. 
+ mean_moving_average = MovingAverage(mean, global_step, decay) + variance_moving_average = MovingAverage(variance, global_step, decay) + if not IsTraining(): + mean = mean_moving_average + variance = variance_moving_average + + beta = tf.Variable(tf.zeros(shape), name='beta') + gamma = tf.Variable(tf.constant(1.0, shape=shape), name='gamma') + if convolution: + batch_norm = tf.nn.batch_norm_with_global_normalization( + tensor, mean, variance, beta, gamma, epsilon, + scale_after_normalization) + else: + batch_norm = (tensor - mean) * tf.rsqrt(variance + epsilon) + if scale_after_normalization: + batch_norm *= gamma + batch_norm += beta + if mask is not None: + batch_norm = model_utils.Mask(batch_norm, mask) + return batch_norm + + +def MovingAverage(tensor, global_step, decay=0.999): + """Create a variable that contains the moving average of a tensor. + + Adds a tf.identity and special namescope to ensure the tensor + is colocated with its Variable on the parameter server. + See http://g/tensorflow-users/PAAXYLlybNs/xA0z-x1qEwAJ + and replicated_model.py#NameScopeDevicePicker for context. + + Args: + tensor: Tensor to calculate moving average of. + global_step: Variable containing the number of global steps. + decay: Float for exponential decay of moving average. + + Returns: + A tf.Variable containing the moving average of the input tensor. + """ + exponential_moving_average = tf.train.ExponentialMovingAverage( + decay=decay, num_updates=global_step) + + update_op = exponential_moving_average.apply([tensor]) + tf.get_default_graph().add_to_collection('updates', update_op) + return exponential_moving_average.average(tensor) + + +def Dropout(tensor, dropout_prob, training_only=True): + """Random dropout. + + This implementation supports "always-on" dropout (training_only=False), which + can be used to calculate model uncertainty. See Gal and Ghahramani, + http://arxiv.org/abs/1506.02142. + + NOTE(user): To simplify the implementation, I have chosen not to reverse + the scaling that occurs in tf.nn.dropout when using dropout during + inference. This shouldn't be an issue since the activations will be scaled + by the same constant in both training and inference. This means that there + are no training-time differences between networks that use dropout during + inference and those that do not. + + Args: + tensor: Input tensor. + dropout_prob: Float giving dropout probability for weights (NOT keep + probability). + training_only: Boolean. If True (standard dropout), apply dropout only + during training. If False, apply dropout during inference as well. + + Returns: + A tensor with the same shape as the input tensor. + """ + if not dropout_prob: + return tensor # do nothing + keep_prob = 1.0 - dropout_prob + if IsTraining() or not training_only: + tensor = tf.nn.dropout(tensor, keep_prob) + return tensor + + +def FullyConnectedLayer(tensor, size, weight_init=None, bias_init=None, + name=None): + """Fully connected layer. + + Args: + tensor: Input tensor. + size: Number of nodes in this layer. + weight_init: Weight initializer. + bias_init: Bias initializer. + name: Name for this op. Defaults to 'fully_connected'. + + Returns: + A new tensor representing the output of the fully connected layer. + + Raises: + ValueError: If input tensor is not 2D. 
+ """ + if len(tensor.get_shape()) != 2: + raise ValueError('Dense layer input must be 2D, not %dD' + % len(tensor.get_shape())) + if weight_init is None: + num_features = tensor.get_shape()[-1].value + weight_init = tf.truncated_normal([num_features, size], stddev=0.01) + if bias_init is None: + bias_init = tf.zeros([size]) + + with tf.op_scope([tensor], name, 'fully_connected'): + w = tf.Variable(weight_init, name='w') + b = tf.Variable(bias_init, name='b') + return tf.nn.xw_plus_b(tensor, w, b) + + +def IsTraining(): + """Determine whether the default graph is in training mode. + + Returns: + A boolean value indicating whether the default graph is in training mode. + + Raises: + ValueError: If the 'train' collection in the default graph does not contain + exactly one element. + """ + train = tf.get_collection('train') + if not train: + raise ValueError('Training mode is not set. Please call SetTraining.') + elif len(train) > 1: + raise ValueError('Training mode has more than one setting.') + return train[0] + + +def SetTraining(train): + """Set the training mode of the default graph. + + This operation may only be called once for a given graph. + + Args: + train: If True, graph is in training mode. + + Raises: + AssertionError: If the default graph already has this value set. + """ + if tf.get_collection('train'): + raise AssertionError('Training mode already set: %s' % + tf.get_collection('train')) + tf.add_to_collection('train', train) + + +def MultitaskLogits(features, num_tasks, num_classes=2, weight_init=None, + bias_init=None, dropout=None, name=None): + """Create a logit tensor for each classification task. + + Args: + features: A 2D tensor with dimensions batch_size x num_features. + num_tasks: Number of classification tasks. + num_classes: Number of classes for each task. + weight_init: Weight initializer. + bias_init: Bias initializer. + dropout: Float giving dropout probability for weights (NOT keep + probability). + name: Name for this op. Defaults to 'multitask_logits'. + + Returns: + A list of logit tensors; one for each classification task. + """ + logits = [] + with tf.name_scope('multitask_logits'): + for task_idx in range(num_tasks): + with tf.op_scope([features], name, + ('task' + str(task_idx).zfill(len(str(num_tasks))))): + logits.append( + Logits(features, num_classes, weight_init=weight_init, + bias_init=bias_init, dropout=dropout)) + return logits + + +def Logits(features, num_classes=2, weight_init=None, bias_init=None, + dropout=None, name=None): + """Create a logits tensor for a single classification task. + + You almost certainly don't want dropout on there -- it's like randomly setting + the (unscaled) probability of a target class to 0.5. + + Args: + features: A 2D tensor with dimensions batch_size x num_features. + num_classes: Number of classes for each task. + weight_init: Weight initializer. + bias_init: Bias initializer. + dropout: Float giving dropout probability for weights (NOT keep + probability). + name: Name for this op. + + Returns: + A logits tensor with shape batch_size x num_classes. + """ + with tf.op_scope([features], name, 'logits') as name: + return Dropout( + FullyConnectedLayer(features, num_classes, weight_init=weight_init, + bias_init=bias_init, name=name), + dropout) + + +def SoftmaxN(tensor, name=None): + """Apply softmax across last dimension of a tensor. + + Args: + tensor: Input tensor. + name: Name for this op. If None, defaults to 'SoftmaxN'. + + Returns: + A tensor with softmax-normalized values on the last dimension. 
+ """ + with tf.op_scope([tensor], name, 'SoftmaxN'): + exp_tensor = tf.exp(tensor) + reduction_indices = [tensor.get_shape().ndims - 1] + return tf.div(exp_tensor, + tf.reduce_sum(exp_tensor, + reduction_indices=reduction_indices, + keep_dims=True)) + + +def Transform(tensor, transform, convolution=True, mask=None): + """Apply a transform to a tensor. + + Args: + tensor: Input tensor. + transform: String description of transform. Supported values are 'bias' + and 'batch_norm'. + convolution: If True, assume tensor is the output of a convolution. + mask: Mask to apply to tensor. + + Returns: + A tensor with the same shape as the input tensor. + + Raises: + ValueError: If the input tensor is not 3D or 4D. + """ + if len(tensor.get_shape()) not in [2, 3, 4]: + raise ValueError('Input tensor must be 2D, 3D or 4D, not %dD.' + % len(tensor.get_shape())) + with tensor.graph.as_default(): + if transform == 'batch_norm': + # batch normalization requires 4D input + if len(tensor.get_shape()) != 4: + # 3D case: add one extra dimension + if len(tensor.get_shape()) == 3: + squeeze = [2] + tensor = tf.expand_dims(tensor, 2) + if mask is not None: + mask = tf.expand_dims(mask, -1) + # 2D case: add two extra dimensions + else: + squeeze = [1, 2] + tensor = tf.expand_dims(tf.expand_dims(tensor, -2), -2) + if mask is not None: + mask = tf.expand_dims(tf.expand_dims(mask, -1), -1) + tensor = BatchNormalize(tensor, convolution=convolution, mask=mask) + tensor = tf.squeeze(tensor, squeeze) + else: + tensor = BatchNormalize(tensor, convolution=convolution, mask=mask) + elif transform == 'bias': + tensor = AddBias(tensor, init=tf.constant( + 1.0, shape=[tensor.get_shape()[-1].value])) + if mask is not None: + tensor = model_utils.Mask(tensor, mask) + return tensor
diff --git a/biology/model_ops_test.py b/biology/model_ops_test.py new file mode 100644 index 0000000..c53a3e0 --- /dev/null +++ b/biology/model_ops_test.py
@@ -0,0 +1,243 @@ +#!/usr/bin/python +# +# Copyright 2015 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os + + +import numpy as np +import tensorflow as tf + +from google.protobuf import text_format + +from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util +from tensorflow.python.platform import flags +from tensorflow.python.platform import gfile +from tensorflow.python.platform import googletest +from tensorflow.python.training import checkpoint_state_pb2 as cspb + +from biology import model_ops + +FLAGS = flags.FLAGS +FLAGS.test_random_seed = 20151102 + + +class ModelOpsTest(test_util.TensorFlowTestCase): + + def setUp(self): + super(ModelOpsTest, self).setUp() + self.root = '/tmp' + + def testAddBias(self): + with self.test_session() as sess: + w_t = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], shape=[2, 3]) + w_biased_t = model_ops.AddBias(w_t, init=tf.constant(5.0, shape=[3])) + sess.run(tf.initialize_all_variables()) + w, w_biased, bias = sess.run([w_t, w_biased_t] + tf.trainable_variables()) + self.assertAllEqual(w, [[1.0, 2.0, 3.0], + [4.0, 5.0, 6.0]]) + self.assertAllEqual(w_biased, [[6.0, 7.0, 8.0], + [9.0, 10.0, 11.0]]) + self.assertAllEqual(bias, [5.0, 5.0, 5.0]) + + def testFullyConnectedLayer(self): + with self.test_session() as sess: + features = np.random.random((128, 100)) + features_t = tf.constant(features, dtype=tf.float32) + dense_t = model_ops.FullyConnectedLayer(features_t, 50) + sess.run(tf.initialize_all_variables()) + features, dense, w, b = sess.run( + [features_t, dense_t] + tf.trainable_variables()) + expected = np.dot(features, w) + b + self.assertAllClose(dense, expected) + + def testMultitaskLogits(self): + with self.test_session() as sess: + num_tasks = 3 + np.random.seed(FLAGS.test_random_seed) + features = np.random.random((5, 100)) + logits_t = model_ops.MultitaskLogits( + tf.constant(features, + dtype=tf.float32), + num_tasks) + sess.run(tf.initialize_all_variables()) + output = sess.run(tf.trainable_variables() + logits_t) + w = output[0:-3:2] + b = output[1:-3:2] + logits = output[-3:] + for i in range(num_tasks): + expected = np.dot(features, w[i]) + b[i] + self.assertAllClose(logits[i], expected, rtol=1e-5, atol=1e-5) + + def GetModel(self, train=True): + model_ops.SetTraining(train) + + # dummy variable for testing Restore + tf.Variable(tf.constant(10.0, shape=[1]), name='v0') + + def _CheckBatchNormalization(self, features, convolution, mean, variance, + mask=None): + model_ops.SetTraining(True) + epsilon = 0.001 + with self.test_session() as sess: + features_t = tf.constant(features, dtype=tf.float32) + batch_norm_t = model_ops.BatchNormalize( + features_t, convolution=convolution, epsilon=epsilon, mask=mask) + sess.run(tf.initialize_all_variables()) + batch_norm, beta, gamma = sess.run( + [batch_norm_t] + tf.trainable_variables()) + expected = gamma * (features - mean) / np.sqrt(variance + epsilon) + beta + self.assertAllClose(batch_norm, np.ma.filled(expected, 0), + rtol=1e-5, 
atol=1e-5) + + def CheckBatchNormalization(self, features, convolution): + if convolution: + axis = (0, 1, 2) + else: + axis = 0 + mean = features.mean(axis=axis) + variance = features.var(axis=axis) + self._CheckBatchNormalization(features, convolution, mean, variance) + + def CheckBatchNormalizationWithMask(self, features, convolution, mask): + # convert features to a masked array + # masked array must be created with a mask of the same shape as features + expanded_mask = np.logical_not( + np.ones_like(features) * np.expand_dims(mask, -1)) + features = np.ma.array(features, mask=expanded_mask) + if convolution: + axis = (0, 1, 2) + # masked arrays don't support mean/variance with tuple for axis + count = np.logical_not(features.mask).sum(axis=axis) + mean = features.sum(axis=axis) / count + variance = np.square(features - mean).sum(axis=axis) / count + else: + axis = 0 + mean = features.mean(axis=axis) + variance = features.var(axis=axis) + mask_t = tf.constant(mask, dtype=tf.float32) + self._CheckBatchNormalization(features, convolution, mean, variance, + mask=mask_t) + + def testBatchNormalization(self): + # no convolution: norm over batch (first axis) + self.CheckBatchNormalization( + features=np.random.random((2, 3, 2, 3)), convolution=False) + + def testBatchNormalizationWithConv(self): + # convolution: norm over first three axes + self.CheckBatchNormalization( + features=np.random.random((2, 3, 2, 3)), convolution=True) + + def testBatchNormalizationInference(self): + # create a simple batch-normalized model + model_ops.SetTraining(True) + epsilon = 0.001 + decay = 0.95 + checkpoint = os.path.join(self.root, 'my-checkpoint') + with self.test_session() as sess: + features = np.random.random((2, 3, 2, 3)) + features_t = tf.constant(features, dtype=tf.float32) + # create variables for beta, gamma, and moving mean and variance + model_ops.BatchNormalize( + features_t, convolution=False, epsilon=epsilon, decay=decay) + sess.run(tf.initialize_all_variables()) + updates = tf.group(*tf.get_default_graph().get_collection('updates')) + sess.run(updates) # update moving mean and variance + expected_mean, expected_variance, _, _ = tf.all_variables() + expected_mean = expected_mean.eval() + expected_variance = expected_variance.eval() + + # save a checkpoint + saver = tf.train.Saver() + saver.save(sess, checkpoint) + + super(ModelOpsTest, self).setUp() # reset the default graph + + # check that the moving mean and variance are used for evaluation + # get a new set of features to verify that the correct mean and var are used + model_ops.SetTraining(False) + with self.test_session() as sess: + new_features = np.random.random((2, 3, 2, 3)) + new_features_t = tf.constant(new_features, dtype=tf.float32) + batch_norm_t = model_ops.BatchNormalize( + new_features_t, convolution=False, epsilon=epsilon, decay=decay) + saver = tf.train.Saver() + saver.restore(sess, checkpoint) + batch_norm, mean, variance, beta, gamma = sess.run( + [batch_norm_t] + tf.all_variables()) + self.assertAllClose(mean, expected_mean) + self.assertAllClose(variance, expected_variance) + expected = (gamma * (new_features - mean) / + np.sqrt(variance + epsilon) + beta) + self.assertAllClose(batch_norm, expected) + + def testBatchNormalizationWithMask(self): + features = np.random.random((2, 3, 2, 3)) + mask = np.asarray( + [[[1, 0], + [1, 1], + [1, 0]], + [[0, 1], + [0, 0], + [0, 1]]], + dtype=float) + self.CheckBatchNormalizationWithMask( + features=features, convolution=False, mask=mask) + + def 
testBatchNormalizationWithMaskAndConv(self): + features = np.random.random((2, 3, 2, 3)) + mask = np.asarray( + [[[1, 0], + [1, 1], + [1, 0]], + [[0, 1], + [0, 0], + [0, 1]]], + dtype=float) + self.CheckBatchNormalizationWithMask( + features=features, convolution=True, mask=mask) + + def testSoftmaxN(self): + features = np.asarray([[[1, 1], + [0.1, 0.3]], + [[0, 1], + [2, 2]]], + dtype=float) + expected = np.asarray([[[0.5, 0.5], + [0.45, 0.55]], + [[0.27, 0.73], + [0.5, 0.5]]], + dtype=float) + with self.test_session() as sess: + computed = sess.run( + model_ops.SoftmaxN(tf.constant(features, + dtype=tf.float32))) + self.assertAllClose(np.around(computed, 2), expected) + + def testSoftmaxNWithNumpy(self): + features = np.random.random((2, 3, 4)) + expected = np.exp(features) / np.exp(features).sum(axis=-1, keepdims=True) + with self.test_session() as sess: + computed = sess.run( + model_ops.SoftmaxN(tf.constant(features, + dtype=tf.float32))) + self.assertAllClose(computed, expected) + + +if __name__ == '__main__': + googletest.main()
diff --git a/biology/model_test.py b/biology/model_test.py new file mode 100644 index 0000000..586e41a --- /dev/null +++ b/biology/model_test.py
@@ -0,0 +1,52 @@ +#!/usr/bin/python +# +# Copyright 2015 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for model.""" + + +import numpy as np + +from tensorflow.python.platform import googletest + +from biology import model +from biology import model_config + + +class ClassifierTest(googletest.TestCase): + + def setUp(self): + self.config = model_config.ModelConfig({ + 'batch_size': 2, + 'num_classification_tasks': 1, + }) + self.model = model.Classifier(self.config, train=True, + logdir='/tmp/classifier_test') + + def testParseModelOutput(self): + # standard 2-class output; some weights are zero + output = np.asarray([[[0.1, 0.9]], + [[0.2, 0.8]]], dtype=float) + labels = np.asarray([[[0, 1]], + [[1, 0]]], dtype=float) + weights = np.asarray([[0], + [1]], dtype=float) + expected_y_true = [[0]] + expected_y_pred = [[0.8]] + y_true, y_pred = self.model.ParseModelOutput(output, labels, weights) + self.assertListEqual(y_true, expected_y_true) + self.assertListEqual(y_pred, expected_y_pred) + +if __name__ == '__main__': + googletest.main()
diff --git a/biology/utils.py b/biology/utils.py new file mode 100644 index 0000000..1805522 --- /dev/null +++ b/biology/utils.py
@@ -0,0 +1,216 @@ +#!/usr/bin/python +# +# Copyright 2015 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utils for graph convolution models.""" + + +import numpy as np +import tensorflow as tf + +from google.protobuf import text_format + +from tensorflow.python.platform import gfile +from tensorflow.python.training import checkpoint_state_pb2 + + +def ParseCheckpoint(checkpoint): + """Parse a checkpoint file. + + Args: + checkpoint: Path to checkpoint. The checkpoint is either a serialized + CheckpointState proto or an actual checkpoint file. + + Returns: + The path to an actual checkpoint file. + """ + with open(checkpoint) as f: + try: + cp = checkpoint_state_pb2.CheckpointState() + text_format.Merge(f.read(), cp) + return cp.model_checkpoint_path + except text_format.ParseError: + return checkpoint + + +def Mask(t, mask): + """Apply a mask to a tensor. + + If not None, mask should be a t.shape[:-1] tensor of 0,1 values. + + Args: + t: Input tensor. + mask: Boolean mask with shape == t.shape[:-1]. If None, nothing happens. + + Returns: + A tensor with the same shape as the input tensor. + + Raises: + ValueError: If shapes do not match. + """ + if mask is None: + return t + if not t.get_shape()[:-1].is_compatible_with(mask.get_shape()): + raise ValueError('Shapes do not match: %s vs. %s' % (t.get_shape(), + mask.get_shape())) + return tf.mul(t, tf.expand_dims(mask, -1)) + + +def Mean(tensor, reduction_indices=None, mask=None): + """Compute mean using Sum and Mul for better GPU performance. + + See tf.nn.moments for additional notes on this approach. + + Args: + tensor: Input tensor. + reduction_indices: Axes to reduce across. If None, reduce to a scalar. + mask: Mask to apply to tensor. + + Returns: + A tensor with the same type as the input tensor. + """ + return Moment(1, tensor, standardize=False, + reduction_indices=reduction_indices, mask=mask)[0] + + +def Variance(tensor, reduction_indices=None, mask=None): + """Compute variance. + + Args: + tensor: Input tensor. + reduction_indices: Axes to reduce across. If None, reduce to a scalar. + mask: Mask to apply to tensor. + + Returns: + A tensor with the same type as the input tensor. + """ + return Moment(2, tensor, standardize=False, + reduction_indices=reduction_indices, mask=mask)[1] + + +def Skewness(tensor, reduction_indices=None): + """Compute skewness, the third standardized moment. + + Args: + tensor: Input tensor. + reduction_indices: Axes to reduce across. If None, reduce to a scalar. + + Returns: + A tensor with the same type as the input tensor. + """ + return Moment(3, tensor, standardize=True, + reduction_indices=reduction_indices)[1] + + +def Kurtosis(tensor, reduction_indices=None): + """Compute kurtosis, the fourth standardized moment minus three. + + Args: + tensor: Input tensor. + reduction_indices: Axes to reduce across. If None, reduce to a scalar. + + Returns: + A tensor with the same type as the input tensor. 
+ """ + return Moment(4, tensor, standardize=True, + reduction_indices=reduction_indices)[1] - 3 + + +def Moment(k, tensor, standardize=False, reduction_indices=None, mask=None): + """Compute the k-th central moment of a tensor, possibly standardized. + + Args: + k: Which moment to compute. 1 = mean, 2 = variance, etc. + tensor: Input tensor. + standardize: If True, returns the standardized moment, i.e. the central + moment divided by the n-th power of the standard deviation. + reduction_indices: Axes to reduce across. If None, reduce to a scalar. + mask: Mask to apply to tensor. + + Returns: + The mean and the requested moment. + """ + if reduction_indices is not None: + reduction_indices = np.atleast_1d(reduction_indices).tolist() + + # get the divisor + if mask is not None: + tensor = Mask(tensor, mask) + ones = tf.constant(1, dtype=tf.float32, shape=tensor.get_shape()) + divisor = tf.reduce_sum(Mask(ones, mask), + reduction_indices=reduction_indices, + keep_dims=True) + elif reduction_indices is None: + divisor = tf.constant(np.prod(tensor.get_shape().as_list()), tensor.dtype) + else: + divisor = 1.0 + for i in range(len(tensor.get_shape())): + if i in reduction_indices: + divisor *= tensor.get_shape()[i].value + divisor = tf.constant(divisor, tensor.dtype) + + # compute the requested central moment + # note that mean is a raw moment, not a central moment + mean = tf.div( + tf.reduce_sum(tensor, + reduction_indices=reduction_indices, + keep_dims=True), + divisor) + delta = tensor - mean + if mask is not None: + delta = Mask(delta, mask) + moment = tf.div( + tf.reduce_sum(tf.math_ops.pow(delta, k), + reduction_indices=reduction_indices, + keep_dims=True), + divisor) + moment = tf.squeeze(moment, reduction_indices) + if standardize: + moment = tf.mul( + moment, + tf.math_ops.pow( + tf.rsqrt(Moment(2, + tensor, + reduction_indices=reduction_indices)[1]), + k)) + + return tf.squeeze(mean, reduction_indices), moment + + +def StringToOp(string): + """Get a TensorFlow op from a string. + + Args: + string: String description of an op, such as 'sum' or 'mean'. + + Returns: + A TensorFlow op. + + Raises: + NotImplementedError: If string does not match a supported operation. + """ + # TODO(user): median is not implemented yet in TensorFlow + op_map = { + 'max': tf.reduce_max, + 'mean': Mean, + 'min': tf.reduce_min, + 'sum': tf.reduce_sum, + 'variance': Variance, + 'skewness': Skewness, + 'kurtosis': Kurtosis, + } + try: + return op_map[string] + except KeyError: + raise NotImplementedError('Unrecognized op: %s' % string)
diff --git a/biology/utils_test.py b/biology/utils_test.py new file mode 100644 index 0000000..b3657f0 --- /dev/null +++ b/biology/utils_test.py
@@ -0,0 +1,218 @@ +#!/usr/bin/python +# +# Copyright 2015 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tempfile + + +import numpy as np +import scipy.stats +import tensorflow as tf + +from google.protobuf import text_format + +from tensorflow.python.framework import test_util +from tensorflow.python.platform import flags +from tensorflow.python.platform import googletest +from tensorflow.python.training import checkpoint_state_pb2 + +from biology import utils + +FLAGS = flags.FLAGS +FLAGS.test_random_seed = 20151102 + + +class UtilsTest(test_util.TensorFlowTestCase): + + def setUp(self): + super(UtilsTest, self).setUp() + np.random.seed(FLAGS.test_random_seed) + + def testParseCheckpoint(self): + # parse CheckpointState proto + with tempfile.NamedTemporaryFile() as f: + cp = checkpoint_state_pb2.CheckpointState() + cp.model_checkpoint_path = 'my-checkpoint' + f.write(text_format.MessageToString(cp)) + f.file.flush() + self.assertEqual(utils.ParseCheckpoint(f.name), 'my-checkpoint') + # parse path to actual checkpoint + with tempfile.NamedTemporaryFile() as f: + f.write('This is not a CheckpointState proto.') + f.file.flush() + self.assertEqual(utils.ParseCheckpoint(f.name), f.name) + + def PrepareFeatures(self, features): + features = np.asarray(features, dtype=float) + features_t = tf.constant(features, dtype=tf.float32) + return features, features_t + + def PrepareMask(self, features, mask): + mask = np.asarray(mask, dtype=float) + mask_t = tf.constant(mask, dtype=tf.float32) + # the provided mask has to be the same shape as features + expanded_mask = np.logical_not( + np.ones_like(features) * np.expand_dims(mask, -1)) + masked_features = np.ma.masked_array(features, mask=expanded_mask) + return masked_features, mask_t + + def Check(self, func, features, expected, axis=None, mask=None): + with self.test_session() as sess: + features, features_t = self.PrepareFeatures(features) + if mask is not None: + features, mask = self.PrepareMask(features, mask) + self.assertAllClose( + sess.run(func(features_t, reduction_indices=axis, mask=mask)), + expected) + + def testMean(self): + self.Check(utils.Mean, + features=[0, 1], + expected=0.5) + self.Check(utils.Mean, + features=[[0, 1], + [2, 3]], + expected=[0.5, 2.5], + axis=1) + self.Check(utils.Mean, + features=[[[0, 1], + [2, 3]], + [[4, 5], + [6, 7]]], + expected=[2.5, 4.5], + axis=[0, 2]) + + def testMeanWithMask(self): + self.Check(utils.Mean, + features=[[9999], + [1], + [2]], + expected=1.5, + mask=[0, 1, 1]) + self.Check(utils.Mean, + features=[[0, 1], + [9999, 9999]], + expected=[0, 1], + axis=0, + mask=[1, 0]) + self.Check(utils.Mean, + features=[[[0, 1], + [9999, 9999]], + [[9999, 9999], + [6, 7]]], + expected=[0.5, 6.5], + axis=[0, 2], + mask=[[1, 0], + [0, 1]]) + + def testVariance(self): + self.Check(utils.Variance, + features=[0, 1], + expected=0.25) + self.Check(utils.Variance, + features=[[0, 2], + [2, 3]], + expected=[1, 0.25], + axis=1) + self.Check(utils.Variance, + features=[[[0, 1], + [2, 3]], + [[4, 5], 
+ [6, 7]]], + expected=[4.25, 4.25], + axis=[0, 2]) + + def testVarianceWithMask(self): + self.Check(utils.Variance, + features=[[0], + [1], + [2]], + expected=0.25, + mask=[0, 1, 1]) + self.Check(utils.Variance, + features=[[0, 2], + [9999, 9999], + [4, 4]], + expected=[4, 1], + axis=0, + mask=[1, 0, 1]) + self.Check(utils.Variance, + features=[[[0, 1], + [9999, 9999]], + [[9999, 9999], + [6, 8]]], + expected=[0.25, 1], + axis=[0, 2], + mask=[[1, 0], + [0, 1]]) + + def testMoment(self): + with self.test_session() as sess: + features = np.random.random((3, 4, 5)) + features_t = tf.constant(features, dtype=tf.float32) + + # test k = 1..4 + for k in [1, 2, 3, 4]: + # central moments + self.assertAllClose( + sess.run(utils.Moment(k, features_t)[1]), + scipy.stats.moment(features, k, axis=None), + rtol=1e-5, atol=1e-5) + + # standardized moments + self.assertAllClose( + sess.run(utils.Moment(k, features_t, standardize=True)[1]), + np.divide(scipy.stats.moment(features, k, axis=None), + np.power(features.std(), k)), + rtol=1e-5, atol=1e-5) + + # central across one axis + self.assertAllClose( + sess.run(utils.Moment(k, features_t, reduction_indices=1)[1]), + scipy.stats.moment(features, k, axis=1), + rtol=1e-5, atol=1e-5) + + # standardized across one axis + self.assertAllClose( + sess.run(utils.Moment(k, features_t, standardize=True, + reduction_indices=1)[1]), + np.divide(scipy.stats.moment(features, k, axis=1), + np.power(features.std(axis=1), k)), + rtol=1e-5, atol=1e-5) + + def testSkewness(self): + with self.test_session() as sess: + features = np.random.random((3, 4, 5)) + features_t = tf.constant(features, dtype=tf.float32) + self.assertAllClose(sess.run(utils.Skewness(features_t)), + scipy.stats.skew(features, axis=None), + rtol=1e-5, atol=1e-5) + self.assertAllClose(sess.run(utils.Skewness(features_t, 1)), + scipy.stats.skew(features, axis=1), + rtol=1e-5, atol=1e-5) + + def testKurtosis(self): + with self.test_session() as sess: + features = np.random.random((3, 4, 5)) + features_t = tf.constant(features, dtype=tf.float32) + self.assertAllClose(sess.run(utils.Kurtosis(features_t)), + scipy.stats.kurtosis(features, axis=None), + rtol=1e-5, atol=1e-5) + self.assertAllClose(sess.run(utils.Kurtosis(features_t, 1)), + scipy.stats.kurtosis(features, axis=1), + rtol=1e-5, atol=1e-5) + +if __name__ == '__main__': + googletest.main()