Weights and Biases not updating in tensorflow

岁酱吖の 提交于 2019-12-08 04:34:45

A few things to consider

  • Minibatch not being evaluated correctly since you feed in features and lbls instead of epoch_input and epoch_label.
  • You do not precondition your data in any way, so it is completely out of range. For example, my code below normalizes each feature to zero mean and unit standard deviation. You might also consider using batch_normalization.
  • You are not evaluating error at any point. You need separate training and held-out test sets. My code below doesn't hold out data, but it does report accuracy (% of correct predictions) rather than just the loss (which is a weak proxy for error, so you shouldn't call it error).
  • You initialize biases to random normals. You probably want to just start those at zero.
  • You probably should use tf.layers or another high level api.

The code below achieves a training accuracy of 95%. You'd want to evaluate it on a held-out data set not used for training to measure the test error.

#!/usr/bin/env python
import sys
import pandas as pd
import numpy as np
import tensorflow as tf


data = pd.read_csv("data.csv")

# Split the table into inputs (everything except the label column) and the
# label column itself, keeping only the first 20 rows of each.
features = data.drop(['good buy'], axis=1)[0:20]
lbls = data.drop(['area', 'bathrooms', 'price', 'sq_price'], axis=1)[0:20]

# Standardize every feature column: subtract its mean and divide by its
# standard deviation so that all inputs end up on a comparable scale.
mu = np.mean(features, axis=0)
sigma = np.std(features, axis=0)
features = (features - mu) / sigma

n_examples = len(lbls)

# Model: fully connected 4 -> 10 -> 10 -> 4 -> 1 network with a sigmoid output.

# Hyper parameters
epochs = 100
learning_rate = 0.01
batch_size = 5

input_data = tf.placeholder('float', [None, 4])
labels = tf.placeholder('float', [None, 1])

# (layer name, number of inputs, number of outputs), listed in the order in
# which the variables are created.
layer_shapes = [('hl1', 4, 10), ('hl2', 10, 10), ('hl3', 10, 4), ('ol', 4, 1)]

# Weights start as standard normal samples, biases start at zero.
weights = {
      name: tf.Variable(tf.random_normal([n_in, n_out]))
      for name, n_in, n_out in layer_shapes
      }

biases = {
      name: tf.Variable(tf.zeros([n_out]))
      for name, _, n_out in layer_shapes
      }


def _dense(inputs, name, activation):
  """Return activation(inputs * weights[name] + biases[name]) for one layer."""
  return activation(tf.add(tf.matmul(inputs, weights[name]), biases[name]))


hl1 = _dense(input_data, 'hl1', tf.nn.relu)
hl2 = _dense(hl1, 'hl2', tf.nn.relu)
hl3 = _dense(hl2, 'hl3', tf.nn.relu)
ol = _dense(hl3, 'ol', tf.nn.sigmoid)

# Mean squared error between the labels and the sigmoid output.
loss = tf.reduce_mean((labels - ol)**2)
train = tf.train.AdamOptimizer(learning_rate).minimize(loss)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Number of whole minibatches per epoch.
iterations = n_examples // batch_size


def training_accuracy():
  """Return the fraction of rows in `features` whose rounded output equals `lbls`.

  The network output `ol` is evaluated on the full `features` table, rounded
  to the nearest integer and compared element-wise with the labels.
  """
  predictions = sess.run(ol, feed_dict={input_data: features, labels: lbls})
  hits = np.equal(np.round(predictions), lbls)
  n_correct = np.count_nonzero(hits)
  return float(n_correct) / float(lbls.shape[0])


print("Initial training accuracy %f" % training_accuracy())


for epoch_no in range(epochs):
  # Visit the data in consecutive, non-overlapping slices of batch_size rows,
  # starting again from the first row at the beginning of every epoch.
  for batch_start in range(0, iterations * batch_size, batch_size):
    batch_end = batch_start + batch_size
    batch_feed = {
        input_data: features[batch_start:batch_end],
        labels: lbls[batch_start:batch_end],
    }
    _, err = sess.run([train, loss], feed_dict=batch_feed)
  # err holds the loss of the last minibatch of this epoch.
  print("Error at epoch ", epoch_no, ": ", err)
  print("  Training accuracy %f" % training_accuracy())

Also, please do not post usage questions like this on github, they belong here on StackOverflow.

There are several things not ok with your code. First, you mean

    # Slice out the current minibatch and advance the read pointer.
    epoch_input = features[ptr:ptr+batch_size]
    epoch_label = lbls[ptr: ptr+batch_size]
    ptr = ptr + batch_size
    # Before (fed the full data set on every step):
    # _, err = sess.run([train, loss], feed_dict={input_data: features, labels: lbls})
    # After (feeds only the current minibatch):
    _, err = sess.run([train, loss], feed_dict={input_data: epoch_input, labels: epoch_label})

Now it uses minibatch.

Debugging the gradient:

You can always check some stuff by adding

# Rebind loss to a pass-through op that also logs the sum of the first-layer
# weights whenever loss is evaluated.
loss = tf.Print(loss, [tf.reduce_sum(weights['hl1'])])

This will print the elements of that list [tf.reduce_sum(weights['hl1'])]. To investigate your problem further, you can check the gradients instead of using minimize

# Sum of the gradient of loss with respect to the network output ol.
grads = tf.reduce_sum(tf.gradients(loss, ol)[0])
# Evaluate it on the full data set.
sess.run(grads, {input_data: features, labels: lbls})

And finally, the loss function is inappropriate/numerically unstable for classification. With your version, I get:

variables
   Variable:0
   Variable_1:0
   Variable_2:0
   Variable_3:0
   Variable_4:0
   Variable_5:0
   Variable_6:0
   Variable_7:0
I tensorflow/core/kernels/logging_ops.cc:79] [-6.2784553]
-----------------------------------------
name MatMul_grad
gradient [[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]
value [[-0.59977376 -0.30060738  0.55068201  0.15304407  1.39992142  0.07495346
  -0.87189424 -0.22595075 -0.30094525 -1.2688272 ]
 [-0.44018757  1.08651936 -0.26267499 -0.54463315  0.47019768  0.69873857
   0.56195319  0.20222363  0.38143152 -0.92212462]
 [-0.39977714 -1.07244122  0.41926911  1.4951371  -2.28751612  0.45676312
   0.88010246 -0.88077509 -1.25860023  0.56874037]
 [-0.98260719 -1.30747247 -1.4460088   1.0717535   0.08794415 -0.53184992
  -1.17537284 -0.51598179 -0.15323587  0.91142744]]
-----------------------------------------
name MatMul_1_grad
gradient [[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]
value [[-0.1170694   0.12174897  0.91696155  0.59427398  0.90844423  0.29010534
  -0.34039831 -0.62824941  0.37833953  0.27777222]
 [-0.34947088  1.09264851  0.27353975  1.31722498 -0.42032316 -2.74952078
  -0.66349608 -0.61844724 -0.82141227  1.21691799]
 [ 0.10453336 -1.68631995  0.45700032 -1.58120835 -1.23378754 -0.05648948
  -1.64761281 -0.57684237 -0.06499017 -0.49623618]
 [ 1.47821534 -0.5329541   0.09209292  1.78089786  1.71149898  0.30547267
   0.39544162  1.00369155  1.0097307  -0.92320329]
 [ 1.27038908 -2.17246103 -0.31276336  0.8945803   0.30964327  1.15329361
   0.9711507  -0.36301252 -0.05652813  0.63399518]
 [-0.30909851 -0.41660413 -0.50603527  0.11735299 -0.26837045  0.16547598
  -0.33875859 -0.46821991  0.25723135 -0.80380815]
 [-0.86255074 -1.11751068  0.01365725  0.66119182  0.48947951  1.6353699
  -0.794447    0.43182942 -0.97692633 -1.62605619]
 [ 1.38552308  0.83679706 -0.87287223  2.59401655 -0.61855     0.38301265
   1.09983373  0.49209142  1.03003716 -1.33537853]
 [ 0.74452382  1.57940936 -0.90974236 -1.2211293  -1.1076287   0.92846316
  -0.46856263 -0.3179535   0.75120807 -0.86442506]
 [ 0.31622764 -0.35965034 -0.02351121 -0.0650174   0.4714573   0.35687482
   1.43354905  0.39608309  0.42744714 -0.37226421]]
-----------------------------------------
name MatMul_2_grad
gradient [[ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]]
value [[-1.50904143  0.00228321  1.45787132  0.68312413]
 [-0.16627057  1.31303644  1.16326404  0.72901946]
 [ 0.8004092   0.37329885  0.89361066 -0.19850619]
 [ 1.58354807 -1.05612624  0.69891322 -0.32565734]
 [-1.57602286 -0.41256282  0.69086516 -0.54095054]
 [ 1.72376788 -0.53928965 -0.71574098 -0.94974124]
 [-0.62061429  1.51380932 -0.72585452 -0.07695383]
 [ 0.35537818  1.49691582  0.03931179  0.93435526]
 [ 0.20697887  1.39266443  0.73217523 -0.64737892]
 [ 1.00519872  0.90984046  1.68565321 -0.28157935]]
-----------------------------------------
name MatMul_3_grad
gradient [[ 0.]
 [ 0.]
 [ 0.]
 [ 0.]]
value [[ 0.94082022]
 [ 0.14753926]
 [-0.08765228]
 [ 1.32516992]]
-----------------------------------------
name Add_grad
gradient [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
value [ 1.71239722  1.12632215  0.75409448  0.01951236  0.32135537 -1.46281374
  0.40413955  0.54653352 -0.57894999  0.2746354 ]
-----------------------------------------
name Add_1_grad
gradient [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
value [ 0.74800217 -0.43517059 -0.77706921  1.46858656  1.09103405 -0.46681881
  0.6126743  -2.27877688  1.48809242 -1.19616997]
-----------------------------------------
name Add_2_grad
gradient [ 0.  0.  0.  0.]
value [-0.12137324 -0.23238407  0.17909229 -0.75496733]
-----------------------------------------
name Add_3_grad
gradient [ 0.]
value [-0.91176724]

As you see, almost all gradients are zero. Why?

  • by definition the magnitude of (labels - ol) is in [0, 1]
  • the squared value is therefore at most one, and usually much smaller
  • the derivative of the sigmoid s(x) is s'(x) = s(x)*(1-s(x)); the gradients are multiplied by this value, which is at most 0.25 and approaches zero when the sigmoid saturates.

But after using sparse_softmax_cross_entropy_with_logits which is numerically stable and operates in the log-domain I get

variables
   Variable:0
   Variable_1:0
   Variable_2:0
   Variable_3:0
   Variable_4:0
   Variable_5:0
   Variable_6:0
   Variable_7:0
-----------------------------------------
name MatMul_grad
gradient [[ -1.42780918e-05  -1.96137808e-05  -2.44040220e-05  -2.25691911e-05
    0.00000000e+00   2.95208647e-05   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [ -2.54181440e-08  -3.49168410e-08  -4.34445262e-08  -4.01781257e-08
    0.00000000e+00   5.25536308e-08   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [ -2.45539122e-03  -3.37296468e-03  -4.19673882e-03  -3.88120394e-03
    0.00000000e+00   5.07667707e-03   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [ -1.42123906e-06  -1.95235293e-06  -2.42917258e-06  -2.24653377e-06
    0.00000000e+00   2.93850212e-06   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00]]
value [[ 0.43133125 -0.40009859 -0.08456381  0.59587955  0.57171088 -0.9824872
   1.18876612  0.9704771   0.74798232  0.15660612]
 [-1.18380785  0.22617982 -1.15734088 -0.50478351  1.43819618  1.55950046
  -1.1510663  -0.88835335  0.58378232  0.56860197]
 [ 0.29826403  0.02192715  0.62225986  2.47716165 -0.9223454   1.70159853
  -1.03968358 -0.26019615 -0.33808291 -0.30873826]
 [ 0.59774327 -1.28855145 -0.43420359 -0.4413566  -0.19220066  0.96984953
  -0.04922202  0.32994318 -1.05539823 -0.80112725]]
-----------------------------------------
name MatMul_1_grad
gradient [[  0.00000000e+00   1.15650124e-03   0.00000000e+00   0.00000000e+00
    6.59449317e-04  -1.09400018e-03   0.00000000e+00  -4.02117817e-04
    5.44495881e-04  -8.90314346e-04]
 [  0.00000000e+00   7.24206184e-05   0.00000000e+00   0.00000000e+00
    4.12950030e-05  -6.85067716e-05   0.00000000e+00  -2.51807924e-05
    3.40965707e-05  -5.57518724e-05]
 [  0.00000000e+00   2.38713808e-03   0.00000000e+00   0.00000000e+00
    1.36117137e-03  -2.25812919e-03   0.00000000e+00  -8.30012548e-04
    1.12389564e-03  -1.83770037e-03]
 [  0.00000000e+00   9.52679198e-03   0.00000000e+00   0.00000000e+00
    5.43227792e-03  -9.01193265e-03   0.00000000e+00  -3.31248436e-03
    4.48533799e-03  -7.33405072e-03]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   6.51591457e-03   0.00000000e+00   0.00000000e+00
    3.71544389e-03  -6.16377220e-03   0.00000000e+00  -2.26559630e-03
    3.06777749e-03  -5.01617463e-03]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00]]
value [[ 0.38902158 -2.14370036 -1.02228141 -0.6492967   1.87193418 -0.06453216
   1.0013988  -1.26857054  0.59826601  0.45045251]
 [ 0.51465249 -1.09108925 -0.21368918 -0.49310678 -0.87893176 -0.07944249
  -0.15810326  1.65703297  1.01812947 -0.95572269]
 [-1.76351583 -1.46950841  1.43533802  2.15617752  1.30682683  0.77409673
  -1.50309181  0.81978178  0.6672287  -0.434971  ]
 [-0.7291944   2.16516733 -1.39850736 -1.06059277  0.40035763  1.23335707
  -0.03707252  1.88107574  0.09459961  2.11439633]
 [-1.39152992 -1.39924514 -0.35704514 -0.71152836 -2.68857026  0.78129828
  -1.0077033  -1.26149333  0.4403404  -0.10159389]
 [ 0.37354535  0.12654085  0.7632165  -0.76493222  0.68177891 -0.34254205
  -1.11582613  2.60665917  1.53196526 -0.867055  ]
 [ 0.62746197 -0.01072595  3.26629376  1.28371656 -0.88725293  3.55530715
   0.67065352 -0.61927503  1.20604384 -0.87207574]
 [-0.68954837  1.89912283  0.90083456  0.02054735 -0.23425011  0.39949065
  -0.08969283 -0.75943565  1.0924015   0.28920195]
 [-0.64865923 -1.29299021 -0.39945969  0.02289505  1.46024895  0.94282049
  -0.99704605 -1.36124468  0.76788425  0.86770487]
 [ 0.63794595  1.68530416 -0.15548207 -0.22658408 -0.45446202 -0.77308726
  -0.12694608  1.17369819  2.25879693  0.20346723]]
-----------------------------------------
name MatMul_2_grad
gradient [[ 0.          0.          0.          0.        ]
 [-0.02205572  0.          0.00960038  0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [-0.01932034  0.          0.00840973  0.        ]
 [-0.01617817  0.          0.00704201  0.        ]
 [ 0.          0.          0.          0.        ]
 [-0.05091252  0.          0.02216113  0.        ]
 [-0.0189826   0.          0.00826272  0.        ]
 [-0.01993647  0.          0.00867792  0.        ]]
value [[-0.18724969 -0.0544498  -0.69153035  0.47535184]
 [-0.75444973 -1.33321464 -0.13066645  1.56889391]
 [-0.6458627   1.17859495 -0.75926393  0.30138403]
 [ 1.0069555  -0.69344127  0.49295315  0.54917085]
 [-0.55954564 -1.13277721 -0.37167427 -0.64837182]
 [ 0.93753678  1.12197697  0.63789612  0.52438796]
 [ 0.77543265 -1.241382    1.78230286 -0.6928125 ]
 [ 0.95383584 -2.00331807  1.63409865 -0.36474878]
 [-0.73891008  2.066082   -0.94303596 -0.42322466]
 [ 0.38519588  0.03278512 -0.3487882  -1.50447905]]
-----------------------------------------
name MatMul_3_grad
gradient [[ 0.08460998]
 [ 0.        ]
 [ 0.16564058]
 [ 0.        ]]
value [[-0.35376808]
 [-0.07330427]
 [ 0.15398768]
 [-0.06484076]]
-----------------------------------------
name Add_grad
gradient [ -8.22783885e-09  -1.13025616e-08  -1.40629695e-08  -1.30056375e-08
   0.00000000e+00   1.70115797e-08   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00]
value [-1.00038147 -0.56519473  0.59372097 -1.1646167  -0.16213787 -0.69313556
  0.62788707  1.03768504  0.57876503 -0.5201084 ]
-----------------------------------------
name Add_1_grad
gradient [  0.00000000e+00   1.28705375e-08   0.00000000e+00   0.00000000e+00
   7.33891703e-09  -1.21749730e-08   0.00000000e+00  -4.47511184e-09
   6.05961770e-09  -9.90818183e-09]
value [ 0.02854451 -1.46039021 -0.03916361  0.40116394  0.16030532  0.88267213
 -0.46328214  0.18927227 -1.7536788  -0.46590349]
-----------------------------------------
name Add_2_grad
gradient [ -1.84504412e-08   0.00000000e+00   8.03108247e-09   0.00000000e+00]
value [ 0.94534302 -0.9080081  -1.86719894 -1.31547296]
-----------------------------------------
name Add_3_grad
gradient [ 0.29727879 -0.29727876]
value [ 0.07999782 -0.75647992]

This time the gradients, while very small, are non-zero. The code for reproducing that is

import numpy as np
import tensorflow as tf

# One row per example. Judging by the values, the columns appear to be
# area, bathrooms, price and price per unit area - TODO confirm.
features = np.array([
    [2104, 3, 399900, 190.066540],
    [1600, 3, 329900, 206.187500],
    [2400, 3, 369000, 153.750000],
    [1416, 2, 232000, 163.841808],
    [3000, 4, 539900, 179.966667],
    [1985, 4, 299900, 151.083123],
    [1534, 3, 314900, 205.280313],
    [1427, 3, 198999, 139.452698],
    [1380, 3, 212000, 153.623188],
    [1494, 3, 242500, 162.315930],
    [1940, 4, 239999, 123.710825],
    [2000, 3, 347000, 173.500000],
    [1890, 3, 329999, 174.602645],
    [4478, 5, 699900, 156.297454],
    [1268, 3, 259900, 204.968454],
    [2300, 4, 449900, 195.608696],
    [1320, 2, 299900, 227.196970],
    [1236, 3, 199900, 161.731392],
    [2609, 4, 499998, 191.643542],
    [3031, 4, 599000, 197.624546],
], dtype=np.float32)

# Binary class label (0 or 1) for each row of features, in the same order.
lbls = np.array(
    [1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1],
    dtype=np.int32,
)

n_examples = len(lbls)
epochs = 100
learning_rate = 0.1
batch_size = 1

input_data = tf.placeholder('float', [None, 4])
# One integer class id per example, as expected by
# sparse_softmax_cross_entropy_with_logits.
labels = tf.placeholder('int32', [None])

weights = {
            'hl1': tf.Variable(tf.random_normal([4, 10])),
            'hl2': tf.Variable(tf.random_normal([10, 10])),
            'hl3': tf.Variable(tf.random_normal([10, 4])),
            # One output column per class. With a [4, 1] matrix the single
            # pre-activation would be broadcast against the two biases, so
            # both logits would differ only by a constant and the predicted
            # class could not depend on the input.
            'ol': tf.Variable(tf.random_normal([4, 2]))
            }

biases = {
            'hl1': tf.Variable(tf.random_normal([10])),
            'hl2': tf.Variable(tf.random_normal([10])),
            'hl3': tf.Variable(tf.random_normal([4])),
            # Two classes -> two output biases (the sigmoid version used one).
            'ol': tf.Variable(tf.random_normal([2]))
            }

hl1 = tf.nn.relu(tf.add(tf.matmul(input_data, weights['hl1']), biases['hl1']))
hl2 = tf.nn.relu(tf.add(tf.matmul(hl1, weights['hl2']), biases['hl2']))
hl3 = tf.nn.relu(tf.add(tf.matmul(hl2, weights['hl3']), biases['hl3']))
# No sigmoid on the output layer: the loss below applies the softmax to the
# raw logits itself.
logits = tf.add(tf.matmul(hl3, weights['ol']), biases['ol'])

# Per-example cross entropy; this replaces the squared error on a sigmoid
# output, tf.reduce_mean((labels - ol)**2), used in the original version.
cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
loss = tf.reduce_mean(cost)
optimizer = tf.train.AdamOptimizer(learning_rate)

# Number of whole minibatches per epoch.
iterations = n_examples // batch_size

def debug_minimize(optimizer, loss, sess):
    """Print each trainable variable's gradient and value, then build the train op.

    A diagnostic replacement for ``optimizer.minimize(loss)``: the gradients
    are evaluated once on the full ``features``/``lbls`` data set and printed
    next to the current value of the variable they belong to.

    Args:
        optimizer: optimizer providing ``compute_gradients`` and
            ``apply_gradients``.
        loss: scalar loss tensor to differentiate.
        sess: session in which the variables have already been initialized.

    Returns:
        The op created by ``optimizer.apply_gradients``. It is only built
        here, never run.
    """
    from tensorflow.python.ops import variables
    from tensorflow.python.framework import ops
    # get all variables
    var_list = (variables.trainable_variables() + ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
    print('variables')
    for v in var_list:
        print('  ', v.name)
    # get all gradients; a variable the loss does not depend on gets a None
    # gradient, which sess.run cannot fetch, so such pairs are skipped
    grads_and_vars = [(grad, var)
                      for grad, var in optimizer.compute_gradients(loss)
                      if grad is not None]
    train_op = optimizer.apply_gradients(grads_and_vars)

    zipped_val = sess.run(grads_and_vars, {input_data: features, labels: lbls})

    for (grad_value, var_value), (grad, _) in zip(zipped_val, grads_and_vars):
        print('-----------------------------------------')
        # Strip the boilerplate parts of the gradient tensor's name.
        print('name', grad.name.replace('/tuple/control_dependency_1:0', '').replace('gradients/', ''))
        print('gradient', grad_value)
        print('value', var_value)
    return train_op

# Create the session and initialize all variables before any gradient is
# evaluated.
sess = tf.Session()
sess.run(tf.global_variables_initializer())
# Prints the gradient diagnostics; the returned train op is discarded, so no
# training step is performed.
debug_minimize(optimizer, loss, sess)

I'm not sure if this is the problem for you. But the sigmoid function's gradient can get very small if its input is too large in magnitude, which can make updates very slow.

To check if this is the case for you, try initializing all your weights to very small values. You can do this by setting the standard deviation of your random normal initializers.

# stddev=0.1 draws the initial weights from a narrower normal distribution
# than the default used when stddev is not given.
tf.Variable(tf.random_normal([4, 10],  stddev=0.1))
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!