A DQN-Based Gomoku (Five-in-a-Row) Algorithm


The Gym environment for Gomoku:

import gym
import logging
import numpy
import random
from gym import spaces
from gym.utils import seeding  # needed by seed(); missing from the original imports

logger = logging.getLogger(__name__)


class FiveChessEnv(gym.Env):
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 2
    }

    def __init__(self):
        # board size
        self.SIZE = 8
        # the board starts as all zeros; -1 = black stone, 1 = white stone
        self.chessboard = [[0 for v in range(self.SIZE)] for v in range(self.SIZE)]
        self.viewer = None
        self.step_count = 0

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def is_valid_coord(self, x, y):
        return 0 <= x < self.SIZE and 0 <= y < self.SIZE

    def is_valid_set_coord(self, x, y):
        return self.is_valid_coord(x, y) and self.chessboard[x][y] == 0

    # return a flattened 0/1 mask: 1 for empty (playable) cells, 0 otherwise
    def get_valid_pos_weights(self):
        results = []
        for x in range(self.SIZE):
            for y in range(self.SIZE):
                if self.chessboard[x][y] == 0:
                    results.append(1)
                else:
                    results.append(0)
        return results

    # action holds the coordinates and the stone colour, e.g. [1, 3, 1] means: position (1, 3), white stone
    # returns: next state, reward, done flag, extra info {}
    def step(self, action):
        '''
        # illegal move
        if not self.is_valid_set_coord(action[0], action[1]):
            return self.chessboard, -50, False, {}
        '''

        # place the stone
        self.chessboard[action[0]][action[1]] = action[2]

        self.step_count += 1

        # win/draw judgement
        color = action[2]

        win_reward = 1000
        common_reward = -20
        draw_reward = 0

        # 1. horizontal
        count = 1
        win = False

        i = 1
        stop0 = False
        stop1 = False

        while i < self.SIZE:
            x = action[0] + i
            y = action[1]
            # one side
            if (not stop0) and self.is_valid_coord(x, y) and self.chessboard[x][y] == color:
                count = count + 1
            else:
                stop0 = True
            # other side
            x = action[0] - i
            if (not stop1) and self.is_valid_coord(x, y) and self.chessboard[x][y] == color:
                count = count + 1
            else:
                stop1 = True

            # five (or more) in a row: win
            if count >= 5:
                win = True
                break

            # blocked on both sides: stop searching
            if stop0 and stop1:
                break
            i += 1

        if win:
            print('win1')
            return self.chessboard, win_reward, True, {}

        # 2. vertical
        count = 1
        win = False

        i = 1
        stop0 = False
        stop1 = False

        while i < self.SIZE:
            x = action[0]
            y = action[1] + i
            # one side
            if (not stop0) and self.is_valid_coord(x, y) and self.chessboard[x][y] == color:
                count = count + 1
            else:
                stop0 = True
            # other side
            y = action[1] - i
            if (not stop1) and self.is_valid_coord(x, y) and self.chessboard[x][y] == color:
                count = count + 1
            else:
                stop1 = True

            # five (or more) in a row: win
            if count >= 5:
                win = True
                break

            # blocked on both sides: stop searching
            if stop0 and stop1:
                break
            i += 1

        if win:
            print('win2')
            return self.chessboard, win_reward, True, {}

        # 3. main diagonal
        count = 1
        win = False

        i = 1
        stop0 = False
        stop1 = False

        while i < self.SIZE:
            x = action[0] + i
            y = action[1] + i
            # one side
            if (not stop0) and self.is_valid_coord(x, y) and self.chessboard[x][y] == color:
                count = count + 1
            else:
                stop0 = True
            # other side
            x = action[0] - i
            y = action[1] - i
            if (not stop1) and self.is_valid_coord(x, y) and self.chessboard[x][y] == color:
                count = count + 1
            else:
                stop1 = True

            # five (or more) in a row: win
            if count >= 5:
                win = True
                break

            # blocked on both sides: stop searching
            if stop0 and stop1:
                break
            i += 1

        if win:
            print('win3')
            return self.chessboard, win_reward, True, {}

        # 4. anti-diagonal
        count = 1
        win = False

        i = 1
        stop0 = False
        stop1 = False

        while i < self.SIZE:
            x = action[0] - i
            y = action[1] + i
            # one side
            if (not stop0) and self.is_valid_coord(x, y) and self.chessboard[x][y] == color:
                count = count + 1
            else:
                stop0 = True
            # other side
            x = action[0] + i
            y = action[1] - i
            if (not stop1) and self.is_valid_coord(x, y) and self.chessboard[x][y] == color:
                count = count + 1
            else:
                stop1 = True

            # five (or more) in a row: win
            if count >= 5:
                win = True
                break

            # blocked on both sides: stop searching
            if stop0 and stop1:
                break
            i += 1

        if win:
            print('win4')
            return self.chessboard, win_reward, True, {}

        # board full: draw
        if self.step_count == self.SIZE * self.SIZE:
            print('draw')
            return self.chessboard, draw_reward, True, {}

        return self.chessboard, common_reward, False, {}

    def reset(self):
        self.chessboard = [[0 for v in range(self.SIZE)] for v in range(self.SIZE)]
        self.step_count = 0
        return self.chessboard

    def render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return

        screen_width = 800
        screen_height = 800
        space = 10
        width = (screen_width - space * 2) / (self.SIZE - 1)

        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(screen_width, screen_height)
            bg = rendering.FilledPolygon([(0, 0), (screen_width, 0), (screen_width, screen_height), (0, screen_height), (0, 0)])
            bg.set_color(0.2, 0.2, 0.2)
            self.viewer.add_geom(bg)

            # board grid lines
            for i in range(self.SIZE):
                line = rendering.Line((space, space + i * width), (screen_width - space, space + i * width))
                line.set_color(1, 1, 1)
                self.viewer.add_geom(line)
            for i in range(self.SIZE):
                line = rendering.Line((space + i * width, space), (space + i * width, screen_height - space))
                line.set_color(1, 1, 1)
                self.viewer.add_geom(line)

            # one circle geometry per cell; stones are shown by moving circles onto the board
            self.chess = []
            for x in range(self.SIZE):
                self.chess.append([])
                for y in range(self.SIZE):
                    c = rendering.make_circle(width / 2 - 3)
                    ct = rendering.Transform(translation=(0, 0))
                    c.add_attr(ct)
                    c.set_color(0, 0, 0)
                    self.chess[x].append([c, ct])
                    self.viewer.add_geom(c)

        for x in range(self.SIZE):
            for y in range(self.SIZE):
                if self.chessboard[x][y] != 0:
                    self.chess[x][y][1].set_translation(space + x * width, space + y * width)
                    if self.chessboard[x][y] == 1:
                        self.chess[x][y][0].set_color(1, 1, 1)
                    else:
                        self.chess[x][y][0].set_color(0, 0, 0)
                else:
                    self.chess[x][y][1].set_translation(-10, -10)

        return self.viewer.render(return_rgb_array=mode == 'rgb_array')

        # if self.state is None: return None
        # return super().render(mode)
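Before plugging the environment into DQN, it can be sanity-checked by driving it directly. The following is a minimal usage sketch, not part of the original post; it assumes the class above has been saved in a module named fivechess_env.py (a hypothetical name) and simply places five white stones in a line to show the reward scheme: -20 for an ordinary move, 1000 with done=True once five in a row is completed.

# Usage sketch (assumption: the class above lives in fivechess_env.py)
from fivechess_env import FiveChessEnv

env = FiveChessEnv()
board = env.reset()

# five white stones (colour 1) along one line; the env does not enforce turn order
for y in range(4):
    board, reward, done, _ = env.step([3, y, 1])
    print(reward, done)      # -20 False for ordinary, non-winning moves
board, reward, done, _ = env.step([3, 4, 1])
print(reward, done)          # 1000 True once five in a row is completed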


Once the environment has been installed and registered with Gym (a minimal registration sketch follows), the DQN algorithm is shown below.
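The original post does not show the registration step, so the following is only a sketch of how it might look. It assumes the environment module is importable as fivechess_env (a hypothetical name); adjust the entry_point to match your file layout.

# Registration sketch so that gym.make('FiveChess-v0') in the DQN script resolves
from gym.envs.registration import register

register(
    id='FiveChess-v0',                        # id used by gym.make() below
    entry_point='fivechess_env:FiveChessEnv',
)

Note that the training script reaches the raw board size through env.env.SIZE, which assumes gym.make returns a wrapped environment (as older Gym versions do).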

import tensorflow as tf
import numpy as np
import random
from collections import deque
import gym
import time
import math

GAMMA = 0.9              # discount factor for target Q
INITIAL_EPSILON = 0.1    # starting value of epsilon
FINAL_EPSILON = 0.01     # final value of epsilon
REPLAY_SIZE = 10000      # replay buffer size
BATCH_SIZE = 200         # minibatch size
TARGET_Q_STEP = 100      # number of training steps between target-network syncs


class DQN():
    # DQN Agent
    def __init__(self, env):
        # init experience replay
        self.replay_buffer = deque()
        # init some parameters
        self.time_step = 0
        self.epsilon = INITIAL_EPSILON
        self.SIZE = env.env.SIZE
        self.state_dim = self.SIZE * self.SIZE + 1   # flattened board plus the side to move
        self.action_dim = self.SIZE * self.SIZE
        self.hide_layer_inputs = 52
        # build the Q network
        self.create_Q_network()
        # build the training op
        self.create_training_method()

        self.target_q_step = TARGET_Q_STEP
        self.create_TargetQ_network()

        # init session
        self.session = tf.InteractiveSession()
        self.session.run(tf.global_variables_initializer())

    def create_Q_network(self):
        # network weights
        W1 = self.weight_variable([self.state_dim, self.hide_layer_inputs])
        b1 = self.bias_variable([self.hide_layer_inputs])
        W2 = self.weight_variable([self.hide_layer_inputs, self.action_dim])
        b2 = self.bias_variable([self.action_dim])
        # input layer
        self.state_input = tf.placeholder("float", [None, self.state_dim])
        # hidden layer
        h_layer = tf.nn.relu(tf.matmul(self.state_input, W1) + b1)
        # Q value layer
        self.Q_value = tf.matmul(h_layer, W2) + b2
        # keep the weights so they can be copied to the target network
        self.Q_Weights = [W1, b1, W2, b2]

    def create_TargetQ_network(self):
        # network weights
        W1 = self.weight_variable([self.state_dim, self.hide_layer_inputs])
        b1 = self.bias_variable([self.hide_layer_inputs])
        W2 = self.weight_variable([self.hide_layer_inputs, self.action_dim])
        b2 = self.bias_variable([self.action_dim])
        # reuses self.state_input as the input layer
        # hidden layer
        h_layer = tf.nn.relu(tf.matmul(self.state_input, W1) + b1)
        # Q value layer
        self.TargetQ_value = tf.matmul(h_layer, W2) + b2
        self.TargetQ_Weights = [W1, b1, W2, b2]

    def copyWeightsToTarget(self):
        for i in range(len(self.Q_Weights)):
            self.session.run(tf.assign(self.TargetQ_Weights[i], self.Q_Weights[i]))

    def create_training_method(self):
        self.action_input = tf.placeholder("float", [None, self.action_dim])  # one-hot representation
        self.y_input = tf.placeholder("float", [None])

        Q_action = tf.reduce_sum(tf.multiply(self.Q_value, self.action_input), reduction_indices=1)
        self.cost = tf.reduce_mean(tf.square(self.y_input - Q_action))
        self.optimizer = tf.train.AdamOptimizer(0.0001).minimize(self.cost)

    def perceive(self, state, action, reward, next_state, done):
        one_hot_action = np.zeros(self.action_dim)
        one_hot_action[action] = 1
        self.replay_buffer.append([state, one_hot_action, reward, next_state, done])
        if len(self.replay_buffer) > REPLAY_SIZE:
            self.replay_buffer.popleft()

        if len(self.replay_buffer) > BATCH_SIZE:
            self.train_Q_network()

    def modify_last_reward(self, new_reward):
        v = self.replay_buffer.pop()
        v[2] = new_reward
        self.replay_buffer.append(v)

    def train_Q_network(self):
        self.time_step += 1
        # Step 1: obtain random minibatch from replay memory
        minibatch = random.sample(self.replay_buffer, BATCH_SIZE)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]

        # Step 2: calculate y (targets come from the online network here;
        # the target-network variant is left commented out, as in the original)
        y_batch = []
        Q_value_batch = self.Q_value.eval(feed_dict={self.state_input: next_state_batch})
        # Q_value_batch = self.TargetQ_value.eval(feed_dict={self.state_input: next_state_batch})
        for i in range(0, BATCH_SIZE):
            done = minibatch[i][4]
            if done:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * np.max(Q_value_batch[i]))

        self.optimizer.run(feed_dict={
            self.y_input: y_batch,
            self.action_input: action_batch,
            self.state_input: state_batch
            })

        # sync the target network
        if self.time_step % self.target_q_step == 0:
            self.copyWeightsToTarget()

    def egreedy_action(self, state):
        Q_value = self.Q_value.eval(feed_dict={
            self.state_input: [state]
            })[0]

        # mask occupied cells by pushing their Q values below the minimum
        min_v = Q_value[np.argmin(Q_value)] - 1
        valid_action = []
        for i in range(len(Q_value)):
            if state[i] == 0:
                valid_action.append(i)
            else:
                Q_value[i] = min_v

        if random.random() <= self.epsilon:
            return valid_action[random.randint(0, len(valid_action) - 1)]
            # return random.randint(0, self.action_dim - 1)
        else:
            return np.argmax(Q_value)

        # self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / 10000

    def action(self, state):
        Q_value = self.Q_value.eval(feed_dict={
            self.state_input: [state]
            })[0]

        # mask occupied cells by pushing their Q values below the minimum
        min_v = Q_value[np.argmin(Q_value)] - 1
        valid_action = []
        for i in range(len(Q_value)):
            if state[i] == 0:
                valid_action.append(i)
            else:
                Q_value[i] = min_v

        return np.argmax(Q_value)

    def weight_variable(self, shape):
        initial = tf.truncated_normal(shape)
        return tf.Variable(initial)

    def bias_variable(self, shape):
        initial = tf.constant(0.01, shape=shape)
        return tf.Variable(initial)


# ---------------------------------------------------------
# Hyper Parameters
ENV_NAME = 'FiveChess-v0'
EPISODE = 10000  # Episode limitation
STEP = 300       # Step limitation in an episode
TEST = 1         # The number of experiment tests every 100 episodes


def main():
    # initialize OpenAI Gym env and dqn agent
    env = gym.make(ENV_NAME)
    agent = DQN(env)
    SIZE = env.env.SIZE

    agent.copyWeightsToTarget()

    for episode in range(EPISODE):
        # initialize task
        state = env.reset()

        camp = -1
        state = np.reshape(state, [-1])
        state = np.append(state, camp)

        print('episode ', episode)

        # Train
        for step in range(STEP):
            # play the next stone
            action_index = agent.egreedy_action(state)  # e-greedy action for train

            action = [math.floor(action_index / SIZE), action_index % SIZE, camp]
            # if env.env.is_valid_set_coord(action[0], action[1]):
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [-1])
            if step % 2 == 0:
                camp = 1
            else:
                camp = -1
            next_state = np.append(next_state, camp)
            # Define reward for agent
            reward_agent = reward
            # store the flat action index (perceive builds the one-hot vector from it)
            agent.perceive(state, action_index, reward, next_state, done)
            state = next_state
            if done:
                print('done step ', step)
                break

        # Test every 100 episodes
        if episode % 100 == 99:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                state = np.reshape(state, [-1])
                camp = -1
                state = np.append(state, camp)

                for j in range(STEP):
                    env.render()
                    action_index = agent.action(state)  # direct action for test

                    action = [math.floor(action_index / SIZE), action_index % SIZE, camp]
                    state, reward, done, _ = env.step(action)
                    state = np.reshape(state, [-1])
                    if j % 2 == 0:
                        camp = 1
                    else:
                        camp = -1
                    state = np.append(state, camp)

                    total_reward += reward
                    time.sleep(0.5)
                    if done:
                        env.render()
                        print('done')
                        time.sleep(3)
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)
            # if ave_reward >= 990:
            #     break


if __name__ == '__main__':
    main()

Training screenshot:



If the Gomoku rules are modified so that only one colour of stone is ever played, training converges nicely: the agent learns to complete five in a row within about 10 moves (a sketch of that variant follows).
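For reference, here is a sketch of what that single-colour variant might look like. It is an assumption about the modification, not code from the original post: the inner training loop of main() is replaced so that camp is never flipped, and env, agent, SIZE and STEP are reused from the surrounding script.

# Single-colour training loop sketch (drop-in replacement for the inner
# "for step in range(STEP)" loop in main(); camp stays -1 throughout)
camp = -1
state = np.append(np.reshape(env.reset(), [-1]), camp)
for step in range(STEP):
    action_index = agent.egreedy_action(state)
    action = [math.floor(action_index / SIZE), action_index % SIZE, camp]
    next_state, reward, done, _ = env.step(action)
    next_state = np.append(np.reshape(next_state, [-1]), camp)
    agent.perceive(state, action_index, reward, next_state, done)
    state = next_state
    if done:
        break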

With the two colours playing against each other in self-play, the convergence behaviour is still unknown.
