问题
I've built this neural net to decide whether a house is a good buy or a bad buy. For some reason the code is not updating the weights and biases, and my loss stays the same. This is my code:
import pandas as pd
import tensorflow as tf
# NOTE: the indentation of the loop bodies near the end of this listing was lost in this copy.
# Load the CSV and split it into feature columns and the label column.
data = pd.read_csv("E:/workspace_py/datasets/good_bad_buy.csv")
features = data.drop(['index', 'good buy'], axis = 1)
lbls = data.drop(['index', 'area', 'bathrooms', 'price', 'sq_price'], axis = 1)
# Keep only the first 20 rows of each frame.
features = features[0:20]
lbls = lbls[0:20]
print(features)
print(lbls)
n_examples = len(lbls)
# Model
# Hyper parameters
epochs = 100
learning_rate = 0.1
batch_size = 1
# Placeholders: 4 input values and 1 target value per example.
input_data = tf.placeholder('float', [None, 4])
labels = tf.placeholder('float', [None, 1])
# Weight matrices for three hidden layers (4 -> 10 -> 10 -> 4) and the output layer (4 -> 1),
# all drawn from tf.random_normal.
weights = {
'hl1': tf.Variable(tf.random_normal([4, 10])),
'hl2': tf.Variable(tf.random_normal([10, 10])),
'hl3': tf.Variable(tf.random_normal([10, 4])),
'ol': tf.Variable(tf.random_normal([4, 1]))
}
# Bias vectors, also drawn from tf.random_normal.
biases = {
'hl1': tf.Variable(tf.random_normal([10])),
'hl2': tf.Variable(tf.random_normal([10])),
'hl3': tf.Variable(tf.random_normal([4])),
'ol': tf.Variable(tf.random_normal([1]))
}
# Forward pass: three ReLU layers followed by a sigmoid output unit.
hl1 = tf.nn.relu(tf.add(tf.matmul(input_data, weights['hl1']), biases['hl1']))
hl2 = tf.nn.relu(tf.add(tf.matmul(hl1, weights['hl2']), biases['hl2']))
hl3 = tf.nn.relu(tf.add(tf.matmul(hl2, weights['hl3']), biases['hl3']))
ol = tf.nn.sigmoid(tf.add(tf.matmul(hl3, weights['ol']), biases['ol']))
# Mean squared difference between the targets and the sigmoid output.
loss = tf.reduce_mean((labels - ol)**2)
train = tf.train.AdamOptimizer(learning_rate).minimize(loss)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
# Number of minibatch steps per epoch.
iterations = int(n_examples/batch_size)
for epoch_no in range(epochs):
ptr = 0
for iteration_no in range(iterations):
# Slice out the current minibatch and advance the pointer.
epoch_input = features[ptr:ptr+batch_size]
epoch_label = lbls[ptr: ptr+batch_size]
ptr = ptr + batch_size
# NOTE: the feed_dict below passes the full `features`/`lbls`;
# `epoch_input`/`epoch_label` computed above are never used.
_, err = sess.run([train, loss], feed_dict={input_data: features, labels: lbls})
print("Error at epoch ", epoch_no, ": ", err)
# Prediction for one raw (unscaled) feature row.
print(sess.run(ol, feed_dict={input_data: [[2104, 3, 399900, 190.0665]]}))
This is the dataset:
Features:
area bathrooms price sq_price
0 2104 3 399900 190.066540
1 1600 3 329900 206.187500
2 2400 3 369000 153.750000
3 1416 2 232000 163.841808
4 3000 4 539900 179.966667
5 1985 4 299900 151.083123
6 1534 3 314900 205.280313
7 1427 3 198999 139.452698
8 1380 3 212000 153.623188
9 1494 3 242500 162.315930
10 1940 4 239999 123.710825
11 2000 3 347000 173.500000
12 1890 3 329999 174.602645
13 4478 5 699900 156.297454
14 1268 3 259900 204.968454
15 2300 4 449900 195.608696
16 1320 2 299900 227.196970
17 1236 3 199900 161.731392
18 2609 4 499998 191.643542
19 3031 4 599000 197.624546
labels:
good buy
0 1.0
1 0.0
2 1.0
3 0.0
4 1.0
5 0.0
6 0.0
7 1.0
8 0.0
9 0.0
10 1.0
11 1.0
12 1.0
13 1.0
14 0.0
15 1.0
16 0.0
17 1.0
18 1.0
19 1.0
Any suggestions on how to fix this? I've tried tf.reduce_sum instead of tf.reduce_mean. I've also tried a larger batch_size.
回答1:
A few things to consider
- The minibatch is not being used: you feed in features and lbls instead of epoch_input and epoch_label.
- You do not precondition your data in any way, so the feature values are completely out of range. My code below standardizes each feature by subtracting its mean and dividing by its standard deviation. You might also consider using batch normalization.
- You are not evaluating error at any point. You need separate training and held-out testing sets. My code below doesn't hold out data, but it does report accuracy as a fraction of correct predictions rather than just the loss (which is only a weak proxy for error, so you shouldn't call it error).
- You initialize biases to random normals. You probably want to just start those at zero.
- You probably should use tf.layers or another high level api.
The code below achieves a training accuracy of 95%. You'd want to evaluate it on a held-out data set that was not used for training to measure the testing error.
#!/usr/bin/env python
import sys
import pandas as pd
import numpy as np
import tensorflow as tf
# Load the data; every column except the label is a feature.
data = pd.read_csv("data.csv")
features = data.drop(columns=['good buy'])
lbls = data.drop(columns=['area', 'bathrooms', 'price', 'sq_price'])
features = features.iloc[:20]
lbls = lbls.iloc[:20]

# Standardize each feature column: subtract its mean, divide by its
# population standard deviation (ddof=0, which is what np.std uses).
mu = features.mean(axis=0)
sigma = features.std(axis=0, ddof=0)
features = (features - mu) / sigma
n_examples = len(lbls)

# Model
# Hyper parameters
epochs = 100
learning_rate = 0.01
batch_size = 5

input_data = tf.placeholder(tf.float32, [None, 4])
labels = tf.placeholder(tf.float32, [None, 1])

# (layer name, (fan_in, fan_out)) in creation order.
layer_shapes = [
    ('hl1', (4, 10)),
    ('hl2', (10, 10)),
    ('hl3', (10, 4)),
    ('ol', (4, 1)),
]
# All weight variables are created first, then all bias variables.
# Weights start from tf.random_normal, biases start at zero.
weights = {
    name: tf.Variable(tf.random_normal(list(shape)))
    for name, shape in layer_shapes
}
biases = {
    name: tf.Variable(tf.zeros([shape[1]]))
    for name, shape in layer_shapes
}


def _affine(inputs, layer):
    """Return inputs @ weights[layer] + biases[layer]."""
    return tf.add(tf.matmul(inputs, weights[layer]), biases[layer])


# Three ReLU hidden layers followed by a sigmoid output unit.
hl1 = tf.nn.relu(_affine(input_data, 'hl1'))
hl2 = tf.nn.relu(_affine(hl1, 'hl2'))
hl3 = tf.nn.relu(_affine(hl2, 'hl3'))
ol = tf.nn.sigmoid(_affine(hl3, 'ol'))

# Mean squared difference between targets and predictions.
loss = tf.reduce_mean((labels - ol) ** 2)
train = tf.train.AdamOptimizer(learning_rate).minimize(loss)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Number of full minibatches per epoch.
iterations = n_examples // batch_size
def training_accuracy():
    """Return the fraction of training rows whose rounded output equals the label.

    Evaluates the output tensor ``ol`` on the module-level ``features`` and
    ``lbls`` using the module-level session ``sess``.
    """
    predictions = sess.run(ol, feed_dict={input_data: features, labels: lbls})
    # Rounding the sigmoid output turns it into a hard 0/1 prediction.
    n_correct = np.count_nonzero(np.equal(np.round(predictions), lbls))
    return float(n_correct) / float(lbls.shape[0])


print("Initial training accuracy %f" % training_accuracy())
# Train for a fixed number of epochs, stepping through the data in
# consecutive minibatches of batch_size rows.
for epoch_no in range(epochs):
    for start in range(0, iterations * batch_size, batch_size):
        stop = start + batch_size
        batch_features = features[start:stop]
        batch_labels = lbls[start:stop]
        _, err = sess.run(
            [train, loss],
            feed_dict={input_data: batch_features, labels: batch_labels},
        )
    # `err` is the loss of the last minibatch processed in this epoch.
    print("Error at epoch ", epoch_no, ": ", err)
    print(" Training accuracy %f" % training_accuracy())
Also, please do not post usage questions like this on github, they belong here on StackOverflow.
回答2:
Several things are wrong with your code. First, you presumably meant:
# Slice out the current minibatch and advance the pointer.
epoch_input = features[ptr:ptr+batch_size]
epoch_label = lbls[ptr: ptr+batch_size]
ptr = ptr + batch_size
# Before (fed the whole data set on every step):
# _, err = sess.run([train, loss], feed_dict={input_data: features, labels: lbls})
# After (feeds only the current minibatch):
_, err = sess.run([train, loss], feed_dict={input_data: epoch_input, labels: epoch_label})
Now it actually trains on minibatches.
Debugging the gradient:
You can always check some stuff by adding
# Wrap the loss in tf.Print so the summed first-layer weights are printed
# each time the loss tensor is evaluated.
loss = tf.Print(loss, [tf.reduce_sum(weights['hl1'])])
This will print the elements of the list [tf.reduce_sum(weights['hl1'])] whenever the loss is evaluated.
To investigate your problem further, you can inspect the gradients directly instead of calling minimize:
# Gradient of the loss with respect to the network output `ol`,
# summed over all elements into a single scalar.
grads = tf.reduce_sum(tf.gradients(loss, ol)[0])
# Evaluate it on the full data set.
sess.run(grads, {input_data: features, labels: lbls})
And finally, the loss function is inappropriate and numerically unstable for classification. With your version, I get:
variables
Variable:0
Variable_1:0
Variable_2:0
Variable_3:0
Variable_4:0
Variable_5:0
Variable_6:0
Variable_7:0
I tensorflow/core/kernels/logging_ops.cc:79] [-6.2784553]
-----------------------------------------
name MatMul_grad
gradient [[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
value [[-0.59977376 -0.30060738 0.55068201 0.15304407 1.39992142 0.07495346
-0.87189424 -0.22595075 -0.30094525 -1.2688272 ]
[-0.44018757 1.08651936 -0.26267499 -0.54463315 0.47019768 0.69873857
0.56195319 0.20222363 0.38143152 -0.92212462]
[-0.39977714 -1.07244122 0.41926911 1.4951371 -2.28751612 0.45676312
0.88010246 -0.88077509 -1.25860023 0.56874037]
[-0.98260719 -1.30747247 -1.4460088 1.0717535 0.08794415 -0.53184992
-1.17537284 -0.51598179 -0.15323587 0.91142744]]
-----------------------------------------
name MatMul_1_grad
gradient [[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
value [[-0.1170694 0.12174897 0.91696155 0.59427398 0.90844423 0.29010534
-0.34039831 -0.62824941 0.37833953 0.27777222]
[-0.34947088 1.09264851 0.27353975 1.31722498 -0.42032316 -2.74952078
-0.66349608 -0.61844724 -0.82141227 1.21691799]
[ 0.10453336 -1.68631995 0.45700032 -1.58120835 -1.23378754 -0.05648948
-1.64761281 -0.57684237 -0.06499017 -0.49623618]
[ 1.47821534 -0.5329541 0.09209292 1.78089786 1.71149898 0.30547267
0.39544162 1.00369155 1.0097307 -0.92320329]
[ 1.27038908 -2.17246103 -0.31276336 0.8945803 0.30964327 1.15329361
0.9711507 -0.36301252 -0.05652813 0.63399518]
[-0.30909851 -0.41660413 -0.50603527 0.11735299 -0.26837045 0.16547598
-0.33875859 -0.46821991 0.25723135 -0.80380815]
[-0.86255074 -1.11751068 0.01365725 0.66119182 0.48947951 1.6353699
-0.794447 0.43182942 -0.97692633 -1.62605619]
[ 1.38552308 0.83679706 -0.87287223 2.59401655 -0.61855 0.38301265
1.09983373 0.49209142 1.03003716 -1.33537853]
[ 0.74452382 1.57940936 -0.90974236 -1.2211293 -1.1076287 0.92846316
-0.46856263 -0.3179535 0.75120807 -0.86442506]
[ 0.31622764 -0.35965034 -0.02351121 -0.0650174 0.4714573 0.35687482
1.43354905 0.39608309 0.42744714 -0.37226421]]
-----------------------------------------
name MatMul_2_grad
gradient [[ 0. 0. 0. 0.]
[ 0. 0. 0. 0.]
[ 0. 0. 0. 0.]
[ 0. 0. 0. 0.]
[ 0. 0. 0. 0.]
[ 0. 0. 0. 0.]
[ 0. 0. 0. 0.]
[ 0. 0. 0. 0.]
[ 0. 0. 0. 0.]
[ 0. 0. 0. 0.]]
value [[-1.50904143 0.00228321 1.45787132 0.68312413]
[-0.16627057 1.31303644 1.16326404 0.72901946]
[ 0.8004092 0.37329885 0.89361066 -0.19850619]
[ 1.58354807 -1.05612624 0.69891322 -0.32565734]
[-1.57602286 -0.41256282 0.69086516 -0.54095054]
[ 1.72376788 -0.53928965 -0.71574098 -0.94974124]
[-0.62061429 1.51380932 -0.72585452 -0.07695383]
[ 0.35537818 1.49691582 0.03931179 0.93435526]
[ 0.20697887 1.39266443 0.73217523 -0.64737892]
[ 1.00519872 0.90984046 1.68565321 -0.28157935]]
-----------------------------------------
name MatMul_3_grad
gradient [[ 0.]
[ 0.]
[ 0.]
[ 0.]]
value [[ 0.94082022]
[ 0.14753926]
[-0.08765228]
[ 1.32516992]]
-----------------------------------------
name Add_grad
gradient [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
value [ 1.71239722 1.12632215 0.75409448 0.01951236 0.32135537 -1.46281374
0.40413955 0.54653352 -0.57894999 0.2746354 ]
-----------------------------------------
name Add_1_grad
gradient [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
value [ 0.74800217 -0.43517059 -0.77706921 1.46858656 1.09103405 -0.46681881
0.6126743 -2.27877688 1.48809242 -1.19616997]
-----------------------------------------
name Add_2_grad
gradient [ 0. 0. 0. 0.]
value [-0.12137324 -0.23238407 0.17909229 -0.75496733]
-----------------------------------------
name Add_3_grad
gradient [ 0.]
value [-0.91176724]
As you can see, almost all gradients are zero. Why?
- By definition, |labels - ol| lies in [0, 1], so its squared value is at most one and usually much smaller.
- The derivative of the sigmoid $s(x)$ is $s'(x) = s(x)\,(1 - s(x))$, which is at most 1/4. The gradients are multiplied by this value, which again is much smaller than one.
But after switching to sparse_softmax_cross_entropy_with_logits, which is numerically stable and operates in the log domain, I get:
variables
Variable:0
Variable_1:0
Variable_2:0
Variable_3:0
Variable_4:0
Variable_5:0
Variable_6:0
Variable_7:0
-----------------------------------------
name MatMul_grad
gradient [[ -1.42780918e-05 -1.96137808e-05 -2.44040220e-05 -2.25691911e-05
0.00000000e+00 2.95208647e-05 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[ -2.54181440e-08 -3.49168410e-08 -4.34445262e-08 -4.01781257e-08
0.00000000e+00 5.25536308e-08 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[ -2.45539122e-03 -3.37296468e-03 -4.19673882e-03 -3.88120394e-03
0.00000000e+00 5.07667707e-03 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[ -1.42123906e-06 -1.95235293e-06 -2.42917258e-06 -2.24653377e-06
0.00000000e+00 2.93850212e-06 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00]]
value [[ 0.43133125 -0.40009859 -0.08456381 0.59587955 0.57171088 -0.9824872
1.18876612 0.9704771 0.74798232 0.15660612]
[-1.18380785 0.22617982 -1.15734088 -0.50478351 1.43819618 1.55950046
-1.1510663 -0.88835335 0.58378232 0.56860197]
[ 0.29826403 0.02192715 0.62225986 2.47716165 -0.9223454 1.70159853
-1.03968358 -0.26019615 -0.33808291 -0.30873826]
[ 0.59774327 -1.28855145 -0.43420359 -0.4413566 -0.19220066 0.96984953
-0.04922202 0.32994318 -1.05539823 -0.80112725]]
-----------------------------------------
name MatMul_1_grad
gradient [[ 0.00000000e+00 1.15650124e-03 0.00000000e+00 0.00000000e+00
6.59449317e-04 -1.09400018e-03 0.00000000e+00 -4.02117817e-04
5.44495881e-04 -8.90314346e-04]
[ 0.00000000e+00 7.24206184e-05 0.00000000e+00 0.00000000e+00
4.12950030e-05 -6.85067716e-05 0.00000000e+00 -2.51807924e-05
3.40965707e-05 -5.57518724e-05]
[ 0.00000000e+00 2.38713808e-03 0.00000000e+00 0.00000000e+00
1.36117137e-03 -2.25812919e-03 0.00000000e+00 -8.30012548e-04
1.12389564e-03 -1.83770037e-03]
[ 0.00000000e+00 9.52679198e-03 0.00000000e+00 0.00000000e+00
5.43227792e-03 -9.01193265e-03 0.00000000e+00 -3.31248436e-03
4.48533799e-03 -7.33405072e-03]
[ 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[ 0.00000000e+00 6.51591457e-03 0.00000000e+00 0.00000000e+00
3.71544389e-03 -6.16377220e-03 0.00000000e+00 -2.26559630e-03
3.06777749e-03 -5.01617463e-03]
[ 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[ 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[ 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[ 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00]]
value [[ 0.38902158 -2.14370036 -1.02228141 -0.6492967 1.87193418 -0.06453216
1.0013988 -1.26857054 0.59826601 0.45045251]
[ 0.51465249 -1.09108925 -0.21368918 -0.49310678 -0.87893176 -0.07944249
-0.15810326 1.65703297 1.01812947 -0.95572269]
[-1.76351583 -1.46950841 1.43533802 2.15617752 1.30682683 0.77409673
-1.50309181 0.81978178 0.6672287 -0.434971 ]
[-0.7291944 2.16516733 -1.39850736 -1.06059277 0.40035763 1.23335707
-0.03707252 1.88107574 0.09459961 2.11439633]
[-1.39152992 -1.39924514 -0.35704514 -0.71152836 -2.68857026 0.78129828
-1.0077033 -1.26149333 0.4403404 -0.10159389]
[ 0.37354535 0.12654085 0.7632165 -0.76493222 0.68177891 -0.34254205
-1.11582613 2.60665917 1.53196526 -0.867055 ]
[ 0.62746197 -0.01072595 3.26629376 1.28371656 -0.88725293 3.55530715
0.67065352 -0.61927503 1.20604384 -0.87207574]
[-0.68954837 1.89912283 0.90083456 0.02054735 -0.23425011 0.39949065
-0.08969283 -0.75943565 1.0924015 0.28920195]
[-0.64865923 -1.29299021 -0.39945969 0.02289505 1.46024895 0.94282049
-0.99704605 -1.36124468 0.76788425 0.86770487]
[ 0.63794595 1.68530416 -0.15548207 -0.22658408 -0.45446202 -0.77308726
-0.12694608 1.17369819 2.25879693 0.20346723]]
-----------------------------------------
name MatMul_2_grad
gradient [[ 0. 0. 0. 0. ]
[-0.02205572 0. 0.00960038 0. ]
[ 0. 0. 0. 0. ]
[ 0. 0. 0. 0. ]
[-0.01932034 0. 0.00840973 0. ]
[-0.01617817 0. 0.00704201 0. ]
[ 0. 0. 0. 0. ]
[-0.05091252 0. 0.02216113 0. ]
[-0.0189826 0. 0.00826272 0. ]
[-0.01993647 0. 0.00867792 0. ]]
value [[-0.18724969 -0.0544498 -0.69153035 0.47535184]
[-0.75444973 -1.33321464 -0.13066645 1.56889391]
[-0.6458627 1.17859495 -0.75926393 0.30138403]
[ 1.0069555 -0.69344127 0.49295315 0.54917085]
[-0.55954564 -1.13277721 -0.37167427 -0.64837182]
[ 0.93753678 1.12197697 0.63789612 0.52438796]
[ 0.77543265 -1.241382 1.78230286 -0.6928125 ]
[ 0.95383584 -2.00331807 1.63409865 -0.36474878]
[-0.73891008 2.066082 -0.94303596 -0.42322466]
[ 0.38519588 0.03278512 -0.3487882 -1.50447905]]
-----------------------------------------
name MatMul_3_grad
gradient [[ 0.08460998]
[ 0. ]
[ 0.16564058]
[ 0. ]]
value [[-0.35376808]
[-0.07330427]
[ 0.15398768]
[-0.06484076]]
-----------------------------------------
name Add_grad
gradient [ -8.22783885e-09 -1.13025616e-08 -1.40629695e-08 -1.30056375e-08
0.00000000e+00 1.70115797e-08 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00]
value [-1.00038147 -0.56519473 0.59372097 -1.1646167 -0.16213787 -0.69313556
0.62788707 1.03768504 0.57876503 -0.5201084 ]
-----------------------------------------
name Add_1_grad
gradient [ 0.00000000e+00 1.28705375e-08 0.00000000e+00 0.00000000e+00
7.33891703e-09 -1.21749730e-08 0.00000000e+00 -4.47511184e-09
6.05961770e-09 -9.90818183e-09]
value [ 0.02854451 -1.46039021 -0.03916361 0.40116394 0.16030532 0.88267213
-0.46328214 0.18927227 -1.7536788 -0.46590349]
-----------------------------------------
name Add_2_grad
gradient [ -1.84504412e-08 0.00000000e+00 8.03108247e-09 0.00000000e+00]
value [ 0.94534302 -0.9080081 -1.86719894 -1.31547296]
-----------------------------------------
name Add_3_grad
gradient [ 0.29727879 -0.29727876]
value [ 0.07999782 -0.75647992]
This time the gradients are non-zero (although very small). The code to reproduce this is:
import numpy as np
import tensorflow as tf
# Training data, one row per house: [area, bathrooms, price, sq_price].
features = [
    [2104, 3, 399900, 190.066540],
    [1600, 3, 329900, 206.187500],
    [2400, 3, 369000, 153.750000],
    [1416, 2, 232000, 163.841808],
    [3000, 4, 539900, 179.966667],
    [1985, 4, 299900, 151.083123],
    [1534, 3, 314900, 205.280313],
    [1427, 3, 198999, 139.452698],
    [1380, 3, 212000, 153.623188],
    [1494, 3, 242500, 162.315930],
    [1940, 4, 239999, 123.710825],
    [2000, 3, 347000, 173.500000],
    [1890, 3, 329999, 174.602645],
    [4478, 5, 699900, 156.297454],
    [1268, 3, 259900, 204.968454],
    [2300, 4, 449900, 195.608696],
    [1320, 2, 299900, 227.196970],
    [1236, 3, 199900, 161.731392],
    [2609, 4, 499998, 191.643542],
    [3031, 4, 599000, 197.624546],
]
# Class index per house, taken from the "good buy" column (1.0 -> 1, 0.0 -> 0).
lbls = [1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1]
features = np.array(features, dtype=np.float32)
lbls = np.array(lbls, dtype=np.int32)
n_examples = len(lbls)

epochs = 100
learning_rate = 0.1
batch_size = 1

input_data = tf.placeholder('float', [None, 4])
# The sparse cross-entropy op takes integer class indices, one per example,
# instead of the float [None, 1] targets used with the squared-error loss.
labels = tf.placeholder('int32', [None])

weights = {
    'hl1': tf.Variable(tf.random_normal([4, 10])),
    'hl2': tf.Variable(tf.random_normal([10, 10])),
    'hl3': tf.Variable(tf.random_normal([10, 4])),
    # Two output units, one logit per class. With a [4, 1] matrix the single
    # matmul column would be broadcast against the two biases, giving both
    # logits the same input-dependent term; softmax is invariant to such a
    # shared shift, so the prediction would not depend on the input at all.
    'ol': tf.Variable(tf.random_normal([4, 2])),
}
biases = {
    'hl1': tf.Variable(tf.random_normal([10])),
    'hl2': tf.Variable(tf.random_normal([10])),
    'hl3': tf.Variable(tf.random_normal([4])),
    # Was a single bias ([1]) for the sigmoid output; now one per class.
    'ol': tf.Variable(tf.random_normal([2])),
}

hl1 = tf.nn.relu(tf.add(tf.matmul(input_data, weights['hl1']), biases['hl1']))
hl2 = tf.nn.relu(tf.add(tf.matmul(hl1, weights['hl2']), biases['hl2']))
hl3 = tf.nn.relu(tf.add(tf.matmul(hl2, weights['hl3']), biases['hl3']))
# Raw class scores: the sigmoid output of the original model is dropped
# because the cross-entropy op applies softmax to the logits itself.
logits = tf.add(tf.matmul(hl3, weights['ol']), biases['ol'])

# Replaces the original squared-error loss, tf.reduce_mean((labels - ol)**2).
cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
loss = tf.reduce_mean(cost)

optimizer = tf.train.AdamOptimizer(learning_rate)
# Number of full minibatches per epoch.
iterations = n_examples // batch_size
def debug_minimize(optimizer, loss, sess):
    """Print every trainable variable together with its gradient and value.

    Does the two halves of ``optimizer.minimize`` separately
    (``compute_gradients`` then ``apply_gradients``) so the gradients can be
    evaluated and printed. They are evaluated once on the full module-level
    ``features``/``lbls`` arrays, fed through the module-level ``input_data``
    and ``labels`` placeholders.

    Args:
        optimizer: optimizer providing compute_gradients/apply_gradients.
        loss: scalar loss tensor to differentiate.
        sess: session in which the model variables are already initialized.

    Returns:
        The op returned by ``optimizer.apply_gradients``; it is not run here.
    """
    from tensorflow.python.ops import variables
    from tensorflow.python.framework import ops

    # Collect all trainable variables so their names can be listed.
    var_list = (variables.trainable_variables()
                + ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
    print('variables')
    for v in var_list:
        print(' ', v.name)

    # Build the gradient ops and evaluate (gradient, value) for every variable.
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars)
    zipped_val = sess.run(grads_and_vars, {input_data: features, labels: lbls})

    for (grad_val, var_val), (grad_tensor, _) in zip(zipped_val, grads_and_vars):
        print('-----------------------------------------')
        # Strip the boilerplate prefix/suffix from the gradient tensor name.
        short_name = (grad_tensor.name
                      .replace('/tuple/control_dependency_1:0', '')
                      .replace('gradients/', ''))
        print('name', short_name)
        print('gradient', grad_val)
        print('value', var_val)
    return train_op


sess = tf.Session()
sess.run(tf.global_variables_initializer())
# NOTE(review): the returned train op is discarded. Running it would
# presumably also need the optimizer's own variables initialized, since they
# are created after the initializer above was built - confirm before reuse.
debug_minimize(optimizer, loss, sess)
回答3:
I'm not sure if this is the problem for you, but the sigmoid function's gradient can get very small if its input is too big, which can make updates very slow.
To check whether this is the case for you, try initializing all your weights to very small values. You can do this by setting a standard deviation for your random normal initializers:
tf.Variable(tf.random_normal([4, 10], stddev=0.1))
来源:https://stackoverflow.com/questions/46264133/weights-and-biases-not-updating-in-tensorflow