#!/usr/bin/python
#
# Copyright 2015 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TensorFlow implementation of the models from the ICML-2015 paper.
hyperparam_dict = {
"single": Hyperparams(num_layers=1,
num_hidden=1200,
node_depth=1,
nonlinearity=ACTIVATION_RECTIFIED_LINEAR,
weight_init=GaussianWeightInit(0.01),
bias_init=ConstantBiasInit(0.5),
dropout=1.),
"deep": Hyperparams(num_layers=4,
num_hidden=1000,
node_depth=1,
nonlinearity=ACTIVATION_RECTIFIED_LINEAR,
weight_init=GaussianWeightInit(0.01),
bias_init=ConstantBiasInit(0.5),
dropout=1.),
"deepaux": Hyperparams(num_layers=4,
num_hidden=1000,
auxiliary_softmax_layers=[0, 1, 2],
auxiliary_softmax_weight=0.3,
node_depth=1,
nonlinearity=ACTIVATION_RECTIFIED_LINEAR,
weight_init=GaussianWeightInit(0.01),
bias_init=ConstantBiasInit(0.5),
dropout=1.),
"py": Hyperparams(num_layers=2,
num_hidden=[2000, 100],
node_depth=1,
nonlinearity=ACTIVATION_RECTIFIED_LINEAR,
weight_init=[GaussianWeightInit(0.01),
GaussianWeightInit(0.04)],
bias_init=[ConstantBiasInit(0.5),
ConstantBiasInit(3.0)],
dropout=1.),
"pydrop1": Hyperparams(num_layers=2,
num_hidden=[2000, 100],
node_depth=1,
nonlinearity=ACTIVATION_RECTIFIED_LINEAR,
weight_init=[GaussianWeightInit(0.01),
GaussianWeightInit(0.04)],
bias_init=[ConstantBiasInit(0.5),
ConstantBiasInit(3.0)],
dropout=[0.75, 1.]),
"pydrop2": Hyperparams(num_layers=2,
num_hidden=[2000, 100],
node_depth=1,
nonlinearity=ACTIVATION_RECTIFIED_LINEAR,
weight_init=[GaussianWeightInit(0.01),
GaussianWeightInit(0.04)],
bias_init=[ConstantBiasInit(0.5),
ConstantBiasInit(3.0)],
dropout=[0.75, 0.75])}
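
For example, "pydrop1" above is a pyramidal two-layer network: a 2000-unit
hidden layer with dropout keep probability 0.75 followed by a 100-unit layer
with no dropout.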
"""
import numpy as np
import tensorflow.google as tf
from tensorflow.python.platform import logging
from biology import model
from biology import model_ops
from nowhere.mustreimplement import input_ops
from nowhere.mustreimplement import label_ops
from nowhere.learning.dist_belief import input_example_pb2
from nowhere.learning.dist_belief import types_pb2 as legacy_types_pb2
class UnreplicatedIcmlModel(model.Classifier):
"""Implements an icml model as configured in a model_config.proto."""
def Build(self):
"""Constructs the graph architecture as specified in its config.
This method creates the following Placeholders:
mol_features: Molecule descriptor (e.g. fingerprint) tensor with shape
batch_size x num_features.
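
    Example (a minimal sketch; `fps` is a hypothetical numpy array of shape
    batch_size x num_features):

      model.Build()
      feed_dict = {model.mol_features: fps}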
"""
with tf.name_scope(self.placeholder_scope):
self.mol_features = tf.placeholder(
tf.float32,
shape=[self.config.batch_size, self.config.num_features],
name='mol_features')
layer_sizes = self.config.layer_sizes
weight_init_stddevs = self.config.weight_init_stddevs
bias_init_consts = self.config.bias_init_consts
dropouts = self.config.dropouts
lengths_set = {
len(layer_sizes),
len(weight_init_stddevs),
len(bias_init_consts),
len(dropouts),
}
assert len(lengths_set) == 1, 'All layer params must have same length.'
num_layers = lengths_set.pop()
assert num_layers > 0, 'Must have some layers defined.'
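    # Stack fully connected hidden layers; each layer gets its own size,
    # weight/bias initialization, and dropout setting from the config.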
prev_layer = self.mol_features
prev_layer_size = self.config.num_features
for i in xrange(num_layers):
layer = tf.nn.relu(model_ops.FullyConnectedLayer(
tensor=prev_layer,
size=layer_sizes[i],
weight_init=tf.truncated_normal(
shape=[prev_layer_size, layer_sizes[i]],
stddev=weight_init_stddevs[i]),
bias_init=tf.constant(value=bias_init_consts[i],
shape=[layer_sizes[i]])))
layer = model_ops.Dropout(layer, dropouts[i])
prev_layer = layer
prev_layer_size = layer_sizes[i]
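    # One logits head per classification task, computed from the top hidden
    # layer.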
self.output = model_ops.MultitaskLogits(
layer, self.config.num_classification_tasks)
def LabelsAndWeights(self):
"""Parse Label protos and create tensors for labels and weights.
This method creates the following Placeholders in the graph:
labels: Tensor with shape batch_size x num_tasks containing serialized
Label protos.
"""
config = self.config
with tf.name_scope(self.placeholder_scope):
labels = tf.placeholder(
tf.string,
shape=[config.batch_size, config.num_classification_tasks],
name='labels')
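    # Decode the serialized Label protos into per-task class targets and
    # per-example weights.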
self.labels = label_ops.MultitaskLabelClasses(labels, config.num_classes)
self.weights = label_ops.MultitaskLabelWeights(labels)
def ReadInput(self, input_pattern, input_data_types=None):
"""Read input data and return a generator for minibatches.
Args:
input_pattern: Input file pattern.
input_data_types: List of legacy_types_pb2 constants matching the
number of and data types present in the sstables. If not specified,
defaults to full ICML 259-task types, but can be specified
for unittests or other datasets with consistent types.
Returns:
A generator that yields a dict for feeding a single batch to Placeholders
in the graph.
Raises:
AssertionError: If no default session is available.
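
    Example (a minimal sketch; `model` is a built model instance and
    `train_op` is a hypothetical training op):

      sess = tf.Session()
      with sess.as_default():
        for feed_dict in model.ReadInput(input_pattern):
          sess.run(train_op, feed_dict=feed_dict)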
"""
if model_ops.IsTraining():
randomize = True
num_iterations = None
else:
randomize = False
num_iterations = 1
num_tasks = self.config.num_classification_tasks
tasks_in_input = self.config.tasks_in_input
if input_data_types is None:
input_data_types = ([legacy_types_pb2.DF_FLOAT] +
[legacy_types_pb2.DF_LABEL_PROTO] * tasks_in_input)
features, labels = input_ops.InputExampleInputReader(
input_pattern=input_pattern,
batch_size=self.config.batch_size,
num_tasks=num_tasks,
input_data_types=input_data_types,
num_features=self.config.num_features,
randomize=randomize,
shuffling=randomize,
num_iterations=num_iterations)
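    # The input may contain more label columns than the model reads; keep only
    # the first num_tasks.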
return self._ReadInputGenerator(features, labels[:, :num_tasks])
def _GetFeedDict(self, named_values):
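    """Builds a feed_dict keyed by fully qualified placeholder tensor names.

    For example, 'mol_features' is mapped to
    '<placeholder_root>/mol_features:0'.

    Args:
      named_values: Dict mapping placeholder names to feed values.

    Returns:
      A feed_dict suitable for passing to Session.run().
    """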
feed_dict = {}
for name, value in named_values.iteritems():
feed_dict['{}/{}:0'.format(self.placeholder_root, name)] = value
return feed_dict
def EvalBatch(self, input_batch):
"""Runs inference on the provided batch of input.
Args:
      input_batch: Iterable of inputs with length self.config.batch_size.
Returns:
Tuple of three numpy arrays with shape num_examples x num_tasks (x ...):
output: Model predictions.
        labels: True labels. The numpy array values are scalars, not one-hot
          class vectors.
weights: Example weights.
"""
output, labels, weights = super(UnreplicatedIcmlModel, self).EvalBatch(
input_batch)
    # Convert labels from one-hot vectors to scalar 0/1 floats by taking the
    # positive-class column for the whole batch across all tasks.
    labels = labels[:, :, 1]
return output, labels, weights
def BatchInputGenerator(self, serialized_batch):
"""Returns a generator that iterates over the provided batch of input.
    TODO(user): This is similar to input_ops.InputExampleInputReader(),
    but doesn't need to be executed as part of the TensorFlow graph.
    Consider refactoring so the two implementations can share code.
Args:
      serialized_batch: List of (_, value) tuples where value is a serialized
        InputExample proto. Must have length at most self.config.batch_size;
        shorter batches are padded up to batch_size, and the padding is marked
        invalid so it's ignored in eval metrics.
Yields:
Dict of model inputs for use as a feed_dict.
Raises:
ValueError: If the batch is larger than the batch_size.
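
    Example (a minimal sketch; `rows` is a hypothetical list of
    (key, serialized InputExample) tuples and `sess` is an active session):

      for feed_dict in model.BatchInputGenerator(rows):
        predictions = sess.run(model.output, feed_dict=feed_dict)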
"""
if len(serialized_batch) > self.config.batch_size:
raise ValueError(
'serialized_batch length {} must be <= batch_size {}'.format(
len(serialized_batch), self.config.batch_size))
for _ in xrange(self.config.batch_size - len(serialized_batch)):
serialized_batch.append((None, ''))
features = []
labels = []
for _, serialized_proto in serialized_batch:
if serialized_proto:
input_example = input_example_pb2.InputExample()
input_example.ParseFromString(serialized_proto)
        features.append(list(input_example.endpoint[0].float_value))
label_protos = [endpoint.label
for endpoint in input_example.endpoint[1:]]
assert len(label_protos) == self.config.num_classification_tasks
labels.append([l.SerializeToString() for l in label_protos])
else:
        # This entry is padding added to reach the batch size.
        features.append([0.0] * self.config.num_features)
        labels.append([''] * self.config.num_classification_tasks)
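    # An all-zero feature vector marks a padded row; flag it as invalid so it
    # is ignored in eval metrics.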
valid = np.asarray([(np.sum(f) > 0) for f in features])
assert len(features) == self.config.batch_size
assert len(labels) == self.config.batch_size
assert len(valid) == self.config.batch_size
yield self._GetFeedDict({
'mol_features': features,
'labels': labels,
'valid': valid
})
def _ReadInputGenerator(self, features_tensor, labels_tensor):
"""Generator that constructs feed_dict for minibatches.
Args:
features_tensor: Tensor of batch_size x molecule features.
labels_tensor: Tensor of batch_size x label protos.
Yields:
A dict for feeding a single batch to Placeholders in the graph.
Raises:
AssertionError: If no default session is available.
"""
sess = tf.get_default_session()
if sess is None:
raise AssertionError('No default session')
while True:
try:
logging.vlog(1, 'Starting session execution to get input data')
features, labels = sess.run([features_tensor, labels_tensor])
logging.vlog(1, 'Done with session execution to get input data')
        # TODO(user): Check whether axis=1 below needs to change to axis=0
        # because of cl/105081140.
valid = np.sum(features, axis=1) > 0
yield self._GetFeedDict({
'mol_features': features,
'labels': labels,
'valid': valid
})
except tf.OpError as e:
# InputExampleInput op raises OpError when it has hit num_iterations
# or its input file is exhausted. However it may also be raised
# if the input sstable isn't what we expect.
        if 'Invalid InputExample' in e.message:
          raise
else:
break