| #!/usr/bin/python |
| # |
| # Copyright 2015 Google Inc. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| """TensorFlow implementation of the models from the ICML-2015 paper. |
| |
| |
| hyperparam_dict = { |
| "single": Hyperparams(num_layers=1, |
| num_hidden=1200, |
| node_depth=1, |
| nonlinearity=ACTIVATION_RECTIFIED_LINEAR, |
| weight_init=GaussianWeightInit(0.01), |
| bias_init=ConstantBiasInit(0.5), |
| dropout=1.), |
| "deep": Hyperparams(num_layers=4, |
| num_hidden=1000, |
| node_depth=1, |
| nonlinearity=ACTIVATION_RECTIFIED_LINEAR, |
| weight_init=GaussianWeightInit(0.01), |
| bias_init=ConstantBiasInit(0.5), |
| dropout=1.), |
| "deepaux": Hyperparams(num_layers=4, |
| num_hidden=1000, |
| auxiliary_softmax_layers=[0, 1, 2], |
| auxiliary_softmax_weight=0.3, |
| node_depth=1, |
| nonlinearity=ACTIVATION_RECTIFIED_LINEAR, |
| weight_init=GaussianWeightInit(0.01), |
| bias_init=ConstantBiasInit(0.5), |
| dropout=1.), |
| "py": Hyperparams(num_layers=2, |
| num_hidden=[2000, 100], |
| node_depth=1, |
| nonlinearity=ACTIVATION_RECTIFIED_LINEAR, |
| weight_init=[GaussianWeightInit(0.01), |
| GaussianWeightInit(0.04)], |
| bias_init=[ConstantBiasInit(0.5), |
| ConstantBiasInit(3.0)], |
| dropout=1.), |
| "pydrop1": Hyperparams(num_layers=2, |
| num_hidden=[2000, 100], |
| node_depth=1, |
| nonlinearity=ACTIVATION_RECTIFIED_LINEAR, |
| weight_init=[GaussianWeightInit(0.01), |
| GaussianWeightInit(0.04)], |
| bias_init=[ConstantBiasInit(0.5), |
| ConstantBiasInit(3.0)], |
| dropout=[0.75, 1.]), |
| "pydrop2": Hyperparams(num_layers=2, |
| num_hidden=[2000, 100], |
| node_depth=1, |
| nonlinearity=ACTIVATION_RECTIFIED_LINEAR, |
| weight_init=[GaussianWeightInit(0.01), |
| GaussianWeightInit(0.04)], |
| bias_init=[ConstantBiasInit(0.5), |
| ConstantBiasInit(3.0)], |
| dropout=[0.75, 0.75])} |
| """ |
| |
| import numpy as np |
| |
| import tensorflow.google as tf |
| |
| from tensorflow.python.platform import logging |
| |
| from biology import model |
| from biology import model_ops |
| from nowhere.mustreimplement import input_ops |
| from nowhere.mustreimplement import label_ops |
| from nowhere.learning.dist_belief import input_example_pb2 |
| from nowhere.learning.dist_belief import types_pb2 as legacy_types_pb2 |
| |
| |
| class UnreplicatedIcmlModel(model.Classifier): |
| """Implements an icml model as configured in a model_config.proto.""" |
| |
| def Build(self): |
| """Constructs the graph architecture as specified in its config. |
| |
| This method creates the following Placeholders: |
| mol_features: Molecule descriptor (e.g. fingerprint) tensor with shape |
| batch_size x num_features. |
| """ |
| with tf.name_scope(self.placeholder_scope): |
| self.mol_features = tf.placeholder( |
| tf.float32, |
| shape=[self.config.batch_size, self.config.num_features], |
| name='mol_features') |
| |
| layer_sizes = self.config.layer_sizes |
| weight_init_stddevs = self.config.weight_init_stddevs |
| bias_init_consts = self.config.bias_init_consts |
| dropouts = self.config.dropouts |
| lengths_set = { |
| len(layer_sizes), |
| len(weight_init_stddevs), |
| len(bias_init_consts), |
| len(dropouts), |
| } |
| assert len(lengths_set) == 1, 'All layer params must have same length.' |
| num_layers = lengths_set.pop() |
| assert num_layers > 0, 'Must have some layers defined.' |
| |
| prev_layer = self.mol_features |
| prev_layer_size = self.config.num_features |
| for i in xrange(num_layers): |
| layer = tf.nn.relu(model_ops.FullyConnectedLayer( |
| tensor=prev_layer, |
| size=layer_sizes[i], |
| weight_init=tf.truncated_normal( |
| shape=[prev_layer_size, layer_sizes[i]], |
| stddev=weight_init_stddevs[i]), |
| bias_init=tf.constant(value=bias_init_consts[i], |
| shape=[layer_sizes[i]]))) |
| layer = model_ops.Dropout(layer, dropouts[i]) |
| prev_layer = layer |
| prev_layer_size = layer_sizes[i] |
| |
| self.output = model_ops.MultitaskLogits( |
| layer, self.config.num_classification_tasks) |
| |
| def LabelsAndWeights(self): |
| """Parse Label protos and create tensors for labels and weights. |
| |
| This method creates the following Placeholders in the graph: |
| labels: Tensor with shape batch_size x num_tasks containing serialized |
| Label protos. |
| """ |
| config = self.config |
| with tf.name_scope(self.placeholder_scope): |
| labels = tf.placeholder( |
| tf.string, |
| shape=[config.batch_size, config.num_classification_tasks], |
| name='labels') |
| self.labels = label_ops.MultitaskLabelClasses(labels, config.num_classes) |
| self.weights = label_ops.MultitaskLabelWeights(labels) |
| |
| def ReadInput(self, input_pattern, input_data_types=None): |
| """Read input data and return a generator for minibatches. |
| |
| Args: |
| input_pattern: Input file pattern. |
| input_data_types: List of legacy_types_pb2 constants matching the |
| number of and data types present in the sstables. If not specified, |
| defaults to full ICML 259-task types, but can be specified |
| for unittests or other datasets with consistent types. |
| |
| Returns: |
| A generator that yields a dict for feeding a single batch to Placeholders |
| in the graph. |
| |
| Raises: |
| AssertionError: If no default session is available. |
| """ |
| if model_ops.IsTraining(): |
| randomize = True |
| num_iterations = None |
| else: |
| randomize = False |
| num_iterations = 1 |
| |
| num_tasks = self.config.num_classification_tasks |
| tasks_in_input = self.config.tasks_in_input |
| if input_data_types is None: |
| input_data_types = ([legacy_types_pb2.DF_FLOAT] + |
| [legacy_types_pb2.DF_LABEL_PROTO] * tasks_in_input) |
| features, labels = input_ops.InputExampleInputReader( |
| input_pattern=input_pattern, |
| batch_size=self.config.batch_size, |
| num_tasks=num_tasks, |
| input_data_types=input_data_types, |
| num_features=self.config.num_features, |
| randomize=randomize, |
| shuffling=randomize, |
| num_iterations=num_iterations) |
| |
| return self._ReadInputGenerator(features, labels[:, :num_tasks]) |
| |
| def _GetFeedDict(self, named_values): |
| feed_dict = {} |
| for name, value in named_values.iteritems(): |
| feed_dict['{}/{}:0'.format(self.placeholder_root, name)] = value |
| |
| return feed_dict |
| |
| def EvalBatch(self, input_batch): |
| """Runs inference on the provided batch of input. |
| |
| Args: |
| input_batch: iterator of input with len self.config.batch_size. |
| |
| Returns: |
| Tuple of three numpy arrays with shape num_examples x num_tasks (x ...): |
| output: Model predictions. |
| labels: True labels. numpy array values are scalars, |
| not 1-hot classes vector. |
| weights: Example weights. |
| """ |
| output, labels, weights = super(UnreplicatedIcmlModel, self).EvalBatch( |
| input_batch) |
| |
| # Converts labels from 1-hot to float. |
| labels = labels[:, :, 1] # Whole batch, all tasks, 1-hot positive index. |
| return output, labels, weights |
| |
| def BatchInputGenerator(self, serialized_batch): |
| """Returns a generator that iterates over the provided batch of input. |
| |
| TODO(user): This is similar to input_ops.InputExampleInputReader(), |
| but doesn't need to be executed as part of the TensorFlow graph. |
| Consider refactoring so these can share code somehow. |
| |
| Args: |
| serialized_batch: List of tuples: (_, value) where value is |
| a serialized InputExample proto. Must have self.config.batch_size |
| length or smaller. If smaller, we'll pad up to batch_size |
| and mark the padding as invalid so it's ignored in eval metrics. |
| Yields: |
| Dict of model inputs for use as a feed_dict. |
| |
| Raises: |
| ValueError: If the batch is larger than the batch_size. |
| """ |
| if len(serialized_batch) > self.config.batch_size: |
| raise ValueError( |
| 'serialized_batch length {} must be <= batch_size {}'.format( |
| len(serialized_batch), self.config.batch_size)) |
| for _ in xrange(self.config.batch_size - len(serialized_batch)): |
| serialized_batch.append((None, '')) |
| |
| features = [] |
| labels = [] |
| for _, serialized_proto in serialized_batch: |
| if serialized_proto: |
| input_example = input_example_pb2.InputExample() |
| input_example.ParseFromString(serialized_proto) |
| features.append([f for f in input_example.endpoint[0].float_value]) |
| label_protos = [endpoint.label |
| for endpoint in input_example.endpoint[1:]] |
| assert len(label_protos) == self.config.num_classification_tasks |
| labels.append([l.SerializeToString() for l in label_protos]) |
| else: |
| # This was a padded value to reach the batch size. |
| features.append([0.0 for _ in xrange(self.config.num_features)]) |
| labels.append( |
| ['' for _ in xrange(self.config.num_classification_tasks)]) |
| |
| valid = np.asarray([(np.sum(f) > 0) for f in features]) |
| |
| assert len(features) == self.config.batch_size |
| assert len(labels) == self.config.batch_size |
| assert len(valid) == self.config.batch_size |
| yield self._GetFeedDict({ |
| 'mol_features': features, |
| 'labels': labels, |
| 'valid': valid |
| }) |
| |
| def _ReadInputGenerator(self, features_tensor, labels_tensor): |
| """Generator that constructs feed_dict for minibatches. |
| |
| Args: |
| features_tensor: Tensor of batch_size x molecule features. |
| labels_tensor: Tensor of batch_size x label protos. |
| |
| Yields: |
| A dict for feeding a single batch to Placeholders in the graph. |
| |
| Raises: |
| AssertionError: If no default session is available. |
| """ |
| sess = tf.get_default_session() |
| if sess is None: |
| raise AssertionError('No default session') |
| while True: |
| try: |
| logging.vlog(1, 'Starting session execution to get input data') |
| features, labels = sess.run([features_tensor, labels_tensor]) |
| logging.vlog(1, 'Done with session execution to get input data') |
| # TODO(user): check if the below axis=1 needs to change to axis=0, |
| # because cl/105081140. |
| valid = np.sum(features, axis=1) > 0 |
| yield self._GetFeedDict({ |
| 'mol_features': features, |
| 'labels': labels, |
| 'valid': valid |
| }) |
| |
| except tf.OpError as e: |
| # InputExampleInput op raises OpError when it has hit num_iterations |
| # or its input file is exhausted. However it may also be raised |
| # if the input sstable isn't what we expect. |
| if 'Invalid InputExample' in e.message: |
| raise e |
| else: |
| break |
| |