Question
My project partner and I are currently facing a problem in our latest university project. Our mission is to implement a neural network that plays the game Pong. We feed the ball position, the ball speed, and the positions of the paddles into our network, and it has three outputs: UP, DOWN, DO_NOTHING. After a player reaches 11 points, we train the network with all states, the decisions made, and the reward of those decisions (see reward_cal()). The problem we are facing is that the loss stays constant at a specific value that depends only on the learning rate. Because of this the network always makes the same decision, even though we reward it as terribly wrong.
Please help us find out what we did wrong; we are thankful for every piece of advice! Our code is below, so please feel free to ask if there are any questions. We are pretty new to this topic, so please don't be rude if something is completely stupid :D
This is our code:
import sys, pygame, time
import numpy as np
import random
from os.path import isfile
import keras
from keras.optimizers import SGD
from keras.layers import Dense
from keras.layers.core import Flatten
pygame.init()
pygame.mixer.init()
#surface of the game
width = 400
height = 600
black = 0, 0, 0 #RGB value
screen = pygame.display.set_mode((width, height), 0, 32)
#(Resolution(x,y), flags, colour depth)
font = pygame.font.SysFont('arial', 36, bold=True)
pygame.display.set_caption('PyPong') #title of window
#consts for the game
acceleration = 0.0025 # ball becomes faster during the game
mousematch = 1
delay_time = 0
paddleP = pygame.image.load("schlaeger.gif")
playerRect = paddleP.get_rect(center = (200, 550))
paddleC = pygame.image.load("schlaeger.gif")
comRect = paddleC.get_rect(center=(200,50))
ball = pygame.image.load("ball.gif")
ballRect = ball.get_rect(center=(200,300))
#Variables for the game
pointsPlayer = [0]
pointsCom = [0]
playermove = [0, 0]
speedbar = [0, 0]
speed = [6, 6]
hitX = 0
#neural const
learning_rate = 0.01
number_of_actions = 3
filehandler = open('logfile.log', 'a')
filename = sys.argv[1]
#neural variables
states, action_prob_grads, rewards, action_probs = [], [], [], []
reward_sum = 0
episode_number = 0
reward_sums = []
pygame.display.flip()
def pointcontrol():  # having a look at the points in the game and restart()
    if pointsPlayer[0] >= 11:
        print('Player Won ', pointsPlayer[0], '/', pointsCom[0])
        restart(1)
        return 1
    if pointsCom[0] >= 11:
        print('Computer Won ', pointsPlayer[0], '/', pointsCom[0])
        restart(1)
        return 1
    elif pointsCom[0] < 11 and pointsPlayer[0] < 11:
        restart(0)
        return 0

def restart(finished):  # resetting the positions and the ball speed and (if point limit was reached) the points
    ballRect.center = 200, 300
    comRect.center = 200, 50
    playerRect.center = 200, 550
    speed[0] = 6
    speed[1] = 6
    screen.blit(paddleC, comRect)
    screen.blit(paddleP, playerRect)
    pygame.display.flip()
    if finished:
        pointsPlayer[0] = 0
        pointsCom[0] = 0

def reward_cal(r, gamma=0.99):  # rewarding every move
    discounted_r = np.zeros_like(r)  # making zero array with size of reward array
    running_add = 0
    for t in range(r.size - 1, 0, -1):  # iterating beginning at the end
        if r[t] != 0:  # if reward -1 or 1 (point made or lost)
            running_add = 0
        running_add = running_add * gamma + r[t]  # making every move before the point the same reward but a little bit smaller
        discounted_r[t] = running_add  # putting the value in the new reward array
    # e.g. r = 000001000-1 -> discounted_r = 0.5 0.6 0.7 0.8 0.9 1 -0.7 -0.8 -0.9 -1
    # (values are not really correct, just to make it clear)
    return discounted_r
#neural net
model = keras.models.Sequential()
model.add(Dense(16, input_dim=8, kernel_initializer='glorot_normal', activation='relu'))
model.add(Dense(32, kernel_initializer='glorot_normal', activation='relu'))
model.add(Dense(number_of_actions, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()
if isfile(filename):
    model.load_weights(filename)
# one ball movement before the AI gets to make a decision
ballRect = ballRect.move(speed)
reward_temp = 0.0
if ballRect.left < 0 or ballRect.right > width:
    speed[0] = -speed[0]
if ballRect.top < 0:
    pointsPlayer[0] += 1
    reward_temp = 1.0
    done = pointcontrol()
if ballRect.bottom > height:
    pointsCom[0] += 1
    done = pointcontrol()
    reward_temp = -1.0
if ballRect.colliderect(playerRect):
    speed[1] = -speed[1]
if ballRect.colliderect(comRect):
    speed[1] = -speed[1]
if speed[0] < 0:
    speed[0] -= acceleration
if speed[0] > 0:
    speed[0] += acceleration
if speed[1] < 0:
    speed[1] -= acceleration
if speed[1] > 0:
    speed[1] += acceleration
while True:  # game
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            pygame.quit()
            sys.exit()

    state = np.array([ballRect.center[0], ballRect.center[1], speed[0], speed[1],
                      playerRect.center[0], playerRect.center[1],
                      comRect.center[0], comRect.center[1]])
    states.append(state)
    action_prob = model.predict_on_batch(state.reshape(1, 8))[0, :]
    action_probs.append(action_prob)
    action = np.random.choice(number_of_actions, p=action_prob)
    if action == 0: playermove = [0, 0]
    elif action == 1: playermove = [5, 0]
    elif action == 2: playermove = [-5, 0]
    playerRect = playerRect.move(playermove)
    y = np.array([-1, -1, -1])
    y[action] = 1
    action_prob_grads.append(y - action_prob)

    # enemy move
    comRect = comRect.move(speedbar)
    ballY = ballRect.left + 5
    comRectY = comRect.left + 30
    if comRect.top <= (height / 1.5):
        if comRectY - ballY > 0:
            speedbar[0] = -7
        elif comRectY - ballY < 0:
            speedbar[0] = 7
    if comRect.top > (height / 1.5):
        speedbar[0] = 0

    if mousematch == 1:
        done = 0
        reward_temp = 0.0
        ballRect = ballRect.move(speed)
        if ballRect.left < 0 or ballRect.right > width:
            speed[0] = -speed[0]
        if ballRect.top < 0:
            pointsPlayer[0] += 1
            done = pointcontrol()
            reward_temp = 1.0
        if ballRect.bottom > height:
            pointsCom[0] += 1
            done = pointcontrol()
            reward_temp = -1.0
        if ballRect.colliderect(playerRect):
            speed[1] = -speed[1]
        if ballRect.colliderect(comRect):
            speed[1] = -speed[1]
        if speed[0] < 0:
            speed[0] -= acceleration
        if speed[0] > 0:
            speed[0] += acceleration
        if speed[1] < 0:
            speed[1] -= acceleration
        if speed[1] > 0:
            speed[1] += acceleration
        rewards.append(reward_temp)

        if done:
            episode_number += 1
            reward_sums.append(np.sum(rewards))
            if len(reward_sums) > 40:
                reward_sums.pop(0)
            s = 'Episode %d Total Episode Reward: %f , Mean %f' % (
                episode_number, np.sum(rewards), np.mean(reward_sums))
            print(s)
            filehandler.write(s + '\n')
            filehandler.flush()
            # Propagate the rewards back to actions where no reward was given.
            # Rewards for earlier actions are attenuated
            rewards = np.vstack(rewards)
            action_prob_grads = np.vstack(action_prob_grads)
            rewards = reward_cal(rewards)
            X = np.vstack(states).reshape(-1, 8)
            Y = action_probs + learning_rate * rewards * y
            print('loss: ', model.train_on_batch(X, Y))
            model.save_weights(filename)
            states, action_prob_grads, rewards, action_probs = [], [], [], []
            reward_sum = 0

    screen.fill(black)
    screen.blit(paddleP, playerRect)
    screen.blit(ball, ballRect)
    screen.blit(paddleC, comRect)
    pygame.display.flip()
    pygame.time.delay(delay_time)
This is our output:
pygame 1.9.4
Hello from the pygame community. https://www.pygame.org/contribute.html
Using TensorFlow backend.
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
dense_1 (Dense)              (None, 16)                144
_________________________________________________________________
dense_2 (Dense)              (None, 32)                544
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 99
=================================================================
Total params: 787
Trainable params: 787
Non-trainable params: 0
_________________________________________________________________
2019-02-14 11:18:10.543401: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
2019-02-14 11:18:10.666634: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties: name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.6705 pciBusID: 0000:17:00.0 totalMemory: 10.92GiB freeMemory: 10.76GiB
2019-02-14 11:18:10.775144: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 1 with properties: name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.6705 pciBusID: 0000:65:00.0 totalMemory: 10.91GiB freeMemory: 10.73GiB
2019-02-14 11:18:10.776037: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0, 1
2019-02-14 11:18:11.176560: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-02-14 11:18:11.176590: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988] 0 1
2019-02-14 11:18:11.176596: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0: N Y
2019-02-14 11:18:11.176600: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 1: Y N
2019-02-14 11:18:11.176914: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10403 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:17:00.0, compute capability: 6.1)
2019-02-14 11:18:11.177216: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 10382 MB memory) -> physical GPU (device: 1, name: GeForce GTX 1080 Ti, pci bus id: 0000:65:00.0, compute capability: 6.1)
Computer Won  0 / 11
Episode 1 Total Episode Reward: -11.000000 , Mean -11.000000
loss:  0.254405
Computer Won  0 / 11
Episode 2 Total Episode Reward: -11.000000 , Mean -11.000000
loss:  0.254304
Computer Won  0 / 11
Episode 3 Total Episode Reward: -11.000000 , Mean -11.000000
loss:  0.254304
Computer Won  0 / 11
Episode 4 Total Episode Reward: -11.000000 , Mean -11.000000
loss:  0.254304
Computer Won  0 / 11
Episode 5 Total Episode Reward: -11.000000 , Mean -11.000000
loss:  0.254304
Computer Won  0 / 11
Episode 6 Total Episode Reward: -11.000000 , Mean -11.000000
loss:  0.254304
Answer 1:
That's evil 'relu' showing its power.
Relu has a "zero" region without gradients. When all your outputs get negative, Relu makes all of them equal to zero and kills backpropagation.
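To see that effect in isolation, here is a tiny standalone NumPy sketch (independent of the Pong code, just an illustration) of what happens once a layer's pre-activations have all drifted negative:

import numpy as np

def relu(x):
    return np.maximum(x, 0.0)

def relu_grad(x):
    # ReLU derivative: 1 where the input is positive, 0 everywhere else
    return (x > 0).astype(float)

# pre-activations that have all drifted negative
z = np.array([-0.3, -1.2, -0.05, -4.0])
print(relu(z))       # [0. 0. 0. 0.] -> the layer outputs nothing but zeros
print(relu_grad(z))  # [0. 0. 0. 0.] -> no gradient flows back, so the weights stop updating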
The easiest solution for using Relus safely is to add BatchNormalization layers before them:
from keras.layers import Dense, BatchNormalization, Activation

model = keras.models.Sequential()
model.add(Dense(16, input_dim=8, kernel_initializer='glorot_normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(32, kernel_initializer='glorot_normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(number_of_actions, activation='softmax'))
This will make roughly half of the layer's outputs zero and half trainable.
Other solutions consist of controlling your learning rate and optimizer very carefully, which may be quite a headache for beginners.
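For instance, one minimal tweak along those lines is to hand Adam an explicitly smaller learning rate when compiling; a rough sketch, assuming the same model as in the question (1e-4 is only an illustrative starting point, not a tuned value):

from keras.optimizers import Adam

# assumption: model is the Sequential network from the question;
# 1e-4 is just an illustrative starting point, not a tuned value
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=1e-4))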
Source: https://stackoverflow.com/questions/54688502/neural-network-does-not-learn-loss-stays-the-same