TensorFlow CNN loss quickly increases to NaN

I'm trying to train a regressor model that can predict 4 scalar float outputs. As it currently stands, the network very quickly diverges with loss increasing to NaN. I can't figure out what's going on.

Below is a self-contained sample tested with TensorFlow 1.1.0 on Windows 10 with an NVIDIA GPU.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy
import tensorflow as tf

# Geometry of one input image: rows, columns and colour channels.
IMAGE_HEIGHT, IMAGE_WIDTH, NUM_CHANNELS = 320, 160, 3

# NOTE(review): presumably the maximum raw pixel value; not referenced by the
# code visible in this file.
PIXEL_DEPTH = 255

# Seed handed to the random initialisers and dropout.
SEED = 66479  # Set to None for random seed.

# Samples per training step, and number of scalar values regressed per sample.
BATCH_SIZE = 5
NUM_OUTPUTS = 4  # the four outputs

def data_type():
    """Return the TensorFlow dtype used when creating the model's tensors."""
    return tf.float32

# Trainable parameters of the network.  They are created once, at import
# time, and only receive their initial values when the variable initializer
# op is run, e.g. {tf.global_variables_initializer().run()}.


def _weight_variable(shape):
    """Return a Variable drawn from a seeded truncated normal (stddev 0.1)."""
    initial = tf.truncated_normal(
        shape, stddev=0.1, seed=SEED, dtype=data_type())
    return tf.Variable(initial)


def _bias_variable(size):
    """Return a length-`size` bias Variable with every entry set to 0.1."""
    initial = tf.constant(0.1, shape=[size], dtype=data_type())
    return tf.Variable(initial)


# First convolution: 5x5 filter, depth 32.  Its biases start at zero, unlike
# the other layers, whose biases start at 0.1.
conv1_weights = _weight_variable([5, 5, NUM_CHANNELS, 32])
conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type()))

# Second convolution: 5x5 filter, 32 -> 64 feature maps.
conv2_weights = _weight_variable([5, 5, 32, 64])
conv2_biases = _bias_variable(64)

# Fully connected layer, depth 512.  The row count is the flattened size of
# the feature map fed to this layer.
fc1_weights = _weight_variable(
    [IMAGE_HEIGHT // 4 * IMAGE_WIDTH // 4 * 64, 512])
fc1_biases = _bias_variable(512)

# Output layer: one column per regressed value.
fc2_weights = _weight_variable([512, NUM_OUTPUTS])
fc2_biases = _bias_variable(NUM_OUTPUTS)


  # We will replicate the model structure for the training subgraph, as well
  # as the evaluation subgraphs, while sharing the trainable parameters.
def _conv_relu_pool(inputs, weights, biases):
    """Apply one convolution -> bias -> ReLU -> 2x2 max-pool stage."""
    # 2D convolution with 'SAME' padding and unit strides, so the output
    # feature map has the same spatial size as the input. {strides} follows
    # the data layout: [image index, y, x, depth].
    conv = tf.nn.conv2d(inputs,
                        weights,
                        strides=[1, 1, 1, 1],
                        padding='SAME')
    # Bias and rectified linear non-linearity.
    relu = tf.nn.relu(tf.nn.bias_add(conv, biases))
    # Max pooling with a window of 2 and a stride of 2; {ksize} also follows
    # the data layout.
    return tf.nn.max_pool(relu,
                          ksize=[1, 2, 2, 1],
                          strides=[1, 2, 2, 1],
                          padding='SAME')


def model(data, train=False):
    """Build the network graph on top of `data` and return its output tensor.

    The graph is two conv/ReLU/max-pool stages followed by a 512-unit fully
    connected ReLU layer and a linear output layer, all using the
    module-level weight and bias variables.

    Args:
        data: 4-D image tensor laid out as [image index, y, x, depth]. The
            spatial and depth dimensions must be statically known; the batch
            dimension may be left unspecified.
        train: When true, dropout (second argument 0.5, seeded with SEED) is
            applied to the hidden layer. Leave false for evaluation graphs.

    Returns:
        The result of the final matmul plus bias: one row per input image,
        one column per column of `fc2_weights`.
    """
    pool = _conv_relu_pool(data, conv1_weights, conv1_biases)
    pool = _conv_relu_pool(pool, conv2_weights, conv2_biases)

    # Reshape the feature map cuboid into a 2D matrix to feed it to the
    # fully connected layers. The batch dimension is given as -1 so that it
    # is inferred; using the static shape here would fail whenever the
    # input's batch size is unspecified (None).
    _, height, width, depth = pool.get_shape().as_list()
    flat = tf.reshape(pool, [-1, height * width * depth])

    # Fully connected layer. Note that the '+' operation automatically
    # broadcasts the biases.
    hidden = tf.nn.relu(tf.matmul(flat, fc1_weights) + fc1_biases)

    # Add a 50% dropout during training only. Dropout also scales
    # activations such that no rescaling is needed at evaluation time.
    if train:
        hidden = tf.nn.dropout(hidden, 0.5, seed=SEED)
    return tf.matmul(hidden, fc2_weights) + fc2_biases

def main():
    """Build the training graph and run gradient-descent steps indefinitely.

    Every step feeds the same synthetic batch (all inputs 0.2, all labels 0)
    and prints the resulting loss. The loop has no exit condition; interrupt
    the process to stop it.
    """
    train_data_batch = tf.placeholder(
        tf.float32,
        shape=(BATCH_SIZE, IMAGE_HEIGHT, IMAGE_WIDTH, NUM_CHANNELS))
    train_label_batch = tf.placeholder(
        tf.float32, shape=(BATCH_SIZE, NUM_OUTPUTS))

    with tf.name_scope('pred'):
        train_pred = model(train_data_batch, train=True)

    with tf.name_scope('loss'):
        # Sum (not mean) of squared errors over the whole batch.
        loss = tf.reduce_sum(tf.square(train_pred - train_label_batch))
        # The summary is recorded before regularization is added below.
        tf.summary.scalar('loss', loss)

    # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) +
                    tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases))
    # Add the regularization term to the loss.
    loss += 5e-4 * regularizers

    optimizer = tf.train.GradientDescentOptimizer(0.01)
    train_op = optimizer.minimize(loss)

    # The batch never changes, so build it once instead of on every step.
    # The label shape uses NUM_OUTPUTS so it always matches the placeholder.
    feed_dict = {
        train_data_batch: numpy.full(
            [BATCH_SIZE, IMAGE_HEIGHT, IMAGE_WIDTH, NUM_CHANNELS], 0.2),
        train_label_batch: numpy.zeros([BATCH_SIZE, NUM_OUTPUTS]),
    }

    with tf.Session() as sess:
        # The op for initializing the variables.
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)

        while True:
            loss_value, _ = sess.run([loss, train_op], feed_dict=feed_dict)
            print(loss_value)

# Run the training demo only when this file is executed as a script.
if __name__ == "__main__":
    main()

output:

9031.0
5.6838e+22
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan

It appears that my model was diverging. I solved this by changing to an AdamOptimizer:

optimizer = tf.train.AdamOptimizer(0.5)

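Adam is a momentum-style optimizer that adapts the step size of each parameter from running estimates of the gradient's mean and variance, so a single very large gradient no longer produces a very large update.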

链接地址: http://www.djcxy.com/p/32052.html

上一篇: 损失保持不变

下一篇: TensorFlow CNN损失迅速增加至NaN