A Gym environment for Gomoku (five-in-a-row)
import gym
import logging
import numpy
import random
from gym import spaces
from gym.utils import seeding

logger = logging.getLogger(__name__)


class FiveChessEnv(gym.Env):
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 2
    }

    def __init__(self):
        # Board size
        self.SIZE = 8
        # The board starts as all zeros; -1 marks a black stone, 1 marks a white stone
        self.chessboard = [[0 for v in range(self.SIZE)] for v in range(self.SIZE)]
        self.viewer = None
        self.step_count = 0

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def is_valid_coord(self, x, y):
        return x >= 0 and x < self.SIZE and y >= 0 and y < self.SIZE

    def is_valid_set_coord(self, x, y):
        return self.is_valid_coord(x, y) and self.chessboard[x][y] == 0

    # Return a 0/1 weight for every board position (1 means the position is empty and playable)
    def get_valid_pos_weights(self):
        results = []
        for x in range(self.SIZE):
            for y in range(self.SIZE):
                if self.chessboard[x][y] == 0:
                    results.append(1)
                else:
                    results.append(0)
        return results

    # An action is a coordinate plus a stone color, e.g. [1, 3, 1] means: place a white stone at (1, 3)
    # Returns: next state, reward, done flag, extra info {}
    def step(self, action):
        '''
        # Illegal move
        if not self.is_valid_set_coord(action[0], action[1]):
            return self.chessboard, -50, False, {}
        '''
        # Place the stone
        self.chessboard[action[0]][action[1]] = action[2]
        self.step_count += 1

        # Win/draw judgement
        color = action[2]
        win_reward = 1000
        common_reward = -20
        draw_reward = 0

        # 1. Horizontal
        count = 1
        win = False
        i = 1
        stop0 = False
        stop1 = False
        while i < self.SIZE:
            x = action[0] + i
            y = action[1]
            # one direction
            if (not stop0) and self.is_valid_coord(x, y) and self.chessboard[x][y] == color:
                count = count + 1
            else:
                stop0 = True
            # opposite direction
            x = action[0] - i
            if (not stop1) and self.is_valid_coord(x, y) and self.chessboard[x][y] == color:
                count = count + 1
            else:
                stop1 = True
            # Five or more in a row: win
            if count >= 5:
                win = True
                break
            # Both directions blocked: stop searching
            if stop0 and stop1:
                break
            i += 1
        if win:
            print('win1')
            return self.chessboard, win_reward, True, {}

        # 2. Vertical
        count = 1
        win = False
        i = 1
        stop0 = False
        stop1 = False
        while i < self.SIZE:
            x = action[0]
            y = action[1] + i
            # one direction
            if (not stop0) and self.is_valid_coord(x, y) and self.chessboard[x][y] == color:
                count = count + 1
            else:
                stop0 = True
            # opposite direction
            y = action[1] - i
            if (not stop1) and self.is_valid_coord(x, y) and self.chessboard[x][y] == color:
                count = count + 1
            else:
                stop1 = True
            # Five or more in a row: win
            if count >= 5:
                win = True
                break
            # Both directions blocked: stop searching
            if stop0 and stop1:
                break
            i += 1
        if win:
            print('win2')
            return self.chessboard, win_reward, True, {}

        # 3. Main diagonal
        count = 1
        win = False
        i = 1
        stop0 = False
        stop1 = False
        while i < self.SIZE:
            x = action[0] + i
            y = action[1] + i
            # one direction
            if (not stop0) and self.is_valid_coord(x, y) and self.chessboard[x][y] == color:
                count = count + 1
            else:
                stop0 = True
            # opposite direction
            x = action[0] - i
            y = action[1] - i
            if (not stop1) and self.is_valid_coord(x, y) and self.chessboard[x][y] == color:
                count = count + 1
            else:
                stop1 = True
            # Five or more in a row: win
            if count >= 5:
                win = True
                break
            # Both directions blocked: stop searching
            if stop0 and stop1:
                break
            i += 1
        if win:
            print('win3')
            return self.chessboard, win_reward, True, {}

        # 4. Anti-diagonal
        count = 1
        win = False
        i = 1
        stop0 = False
        stop1 = False
        while i < self.SIZE:
            x = action[0] - i
            y = action[1] + i
            # one direction
            if (not stop0) and self.is_valid_coord(x, y) and self.chessboard[x][y] == color:
                count = count + 1
            else:
                stop0 = True
            # opposite direction
            x = action[0] + i
            y = action[1] - i
            if (not stop1) and self.is_valid_coord(x, y) and self.chessboard[x][y] == color:
                count = count + 1
            else:
                stop1 = True
            # Five or more in a row: win
            if count >= 5:
                win = True
                break
            # Both directions blocked: stop searching
            if stop0 and stop1:
                break
            i += 1
        if win:
            print('win4')
            return self.chessboard, win_reward, True, {}

        # Board full: draw
        if self.step_count == self.SIZE * self.SIZE:
            print('draw')
            return self.chessboard, draw_reward, True, {}

        return self.chessboard, common_reward, False, {}

    def reset(self):
        self.chessboard = [[0 for v in range(self.SIZE)] for v in range(self.SIZE)]
        self.step_count = 0
        return self.chessboard

    def render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return
        screen_width = 800
        screen_height = 800
        space = 10
        width = (screen_width - space * 2) / (self.SIZE - 1)
        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(screen_width, screen_height)
            bg = rendering.FilledPolygon([(0, 0), (screen_width, 0), (screen_width, screen_height), (0, screen_height), (0, 0)])
            bg.set_color(0.2, 0.2, 0.2)
            self.viewer.add_geom(bg)
            # Board grid
            for i in range(self.SIZE):
                line = rendering.Line((space, space + i * width), (screen_width - space, space + i * width))
                line.set_color(1, 1, 1)
                self.viewer.add_geom(line)
            for i in range(self.SIZE):
                line = rendering.Line((space + i * width, space), (space + i * width, screen_height - space))
                line.set_color(1, 1, 1)
                self.viewer.add_geom(line)
            # Stones: one hidden circle per board cell, moved into place when the cell is occupied
            self.chess = []
            for x in range(self.SIZE):
                self.chess.append([])
                for y in range(self.SIZE):
                    c = rendering.make_circle(width / 2 - 3)
                    ct = rendering.Transform(translation=(0, 0))
                    c.add_attr(ct)
                    c.set_color(0, 0, 0)
                    self.chess[x].append([c, ct])
                    self.viewer.add_geom(c)

        for x in range(self.SIZE):
            for y in range(self.SIZE):
                if self.chessboard[x][y] != 0:
                    self.chess[x][y][1].set_translation(space + x * width, space + y * width)
                    if self.chessboard[x][y] == 1:
                        self.chess[x][y][0].set_color(1, 1, 1)
                    else:
                        self.chess[x][y][0].set_color(0, 0, 0)
                else:
                    self.chess[x][y][1].set_translation(-10, -10)

        return self.viewer.render(return_rgb_array=mode == 'rgb_array')
        # if self.state is None: return None
        # return super().render(mode)
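Before gym.make('FiveChess-v0') (used by the training script below) will work, the environment has to be registered with Gym. The original post does not show that step; the following is a minimal sketch of one way to do it, assuming the class above is saved in a module named five_chess_env (the module name and entry point are illustrative assumptions, not from the original code):

# register_five_chess.py -- registration sketch; 'five_chess_env' is an assumed module name
from gym.envs.registration import register

register(
    id='FiveChess-v0',                           # id later passed to gym.make()
    entry_point='five_chess_env:FiveChessEnv',   # '<module>:<class>' pointing at the env above
    max_episode_steps=64,                        # at most SIZE*SIZE moves on an 8x8 board
)

Once this register() call has run (for example by importing the module at the top of the training script), gym.make('FiveChess-v0') returns the environment wrapped by Gym, which is consistent with the script below reaching the board size through env.env.SIZE.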
Once the environment is installed, the DQN algorithm is as follows.
import tensorflow as tf
import numpy as np
import random
from collections import deque
import gym
import time
import math

GAMMA = 0.9              # discount factor for target Q
INITIAL_EPSILON = 0.1    # starting value of epsilon
FINAL_EPSILON = 0.01     # final value of epsilon
REPLAY_SIZE = 10000      # experience replay buffer size
BATCH_SIZE = 200         # minibatch size
TARGET_Q_STEP = 100      # number of training steps between target-network syncs


class DQN():
    # DQN Agent
    def __init__(self, env):
        # init experience replay
        self.replay_buffer = deque()
        # init some parameters
        self.time_step = 0
        self.epsilon = INITIAL_EPSILON
        self.SIZE = env.env.SIZE
        self.state_dim = self.SIZE * self.SIZE + 1
        self.action_dim = self.SIZE * self.SIZE
        self.hide_layer_inputs = 52
        # Build the Q network
        self.create_Q_network()
        # Build the training op
        self.create_training_method()
        self.target_q_step = TARGET_Q_STEP
        self.create_TargetQ_network()
        # Initialize the session
        self.session = tf.InteractiveSession()
        self.session.run(tf.initialize_all_variables())

    def create_Q_network(self):
        # network weights
        W1 = self.weight_variable([self.state_dim, self.hide_layer_inputs])
        b1 = self.bias_variable([self.hide_layer_inputs])
        W2 = self.weight_variable([self.hide_layer_inputs, self.action_dim])
        b2 = self.bias_variable([self.action_dim])
        # input layer
        self.state_input = tf.placeholder("float", [None, self.state_dim])
        # hidden layer
        h_layer = tf.nn.relu(tf.matmul(self.state_input, W1) + b1)
        # Q value layer
        self.Q_value = tf.matmul(h_layer, W2) + b2
        # keep the weights so they can be copied to the target network
        self.Q_Weights = [W1, b1, W2, b2]

    def create_TargetQ_network(self):
        # network weights
        W1 = self.weight_variable([self.state_dim, self.hide_layer_inputs])
        b1 = self.bias_variable([self.hide_layer_inputs])
        W2 = self.weight_variable([self.hide_layer_inputs, self.action_dim])
        b2 = self.bias_variable([self.action_dim])
        # input layer (reuses the placeholder created by create_Q_network)
        # self.state_input = tf.placeholder("float", [None, self.state_dim])
        # hidden layer
        h_layer = tf.nn.relu(tf.matmul(self.state_input, W1) + b1)
        # target Q value layer
        self.TargetQ_value = tf.matmul(h_layer, W2) + b2
        self.TargetQ_Weights = [W1, b1, W2, b2]

    def copyWeightsToTarget(self):
        for i in range(len(self.Q_Weights)):
            self.session.run(tf.assign(self.TargetQ_Weights[i], self.Q_Weights[i]))

    def create_training_method(self):
        self.action_input = tf.placeholder("float", [None, self.action_dim])  # one-hot representation
        self.y_input = tf.placeholder("float", [None])
        Q_action = tf.reduce_sum(tf.multiply(self.Q_value, self.action_input), reduction_indices=1)  # tf.mul -> tf.multiply
        self.cost = tf.reduce_mean(tf.square(self.y_input - Q_action))
        self.optimizer = tf.train.AdamOptimizer(0.0001).minimize(self.cost)

    def perceive(self, state, action, reward, next_state, done):
        # action is the flat board index of the move (0 .. SIZE*SIZE-1)
        one_hot_action = np.zeros(self.action_dim)
        one_hot_action[action] = 1
        self.replay_buffer.append([state, one_hot_action, reward, next_state, done])
        if len(self.replay_buffer) > REPLAY_SIZE:
            self.replay_buffer.popleft()
        if len(self.replay_buffer) > BATCH_SIZE:
            self.train_Q_network()

    def modify_last_reward(self, new_reward):
        v = self.replay_buffer.pop()
        v[2] = new_reward
        self.replay_buffer.append(v)

    def train_Q_network(self):
        self.time_step += 1
        # Step 1: obtain random minibatch from replay memory
        minibatch = random.sample(self.replay_buffer, BATCH_SIZE)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]

        # Step 2: calculate y
        y_batch = []
        Q_value_batch = self.Q_value.eval(feed_dict={self.state_input: next_state_batch})
        # Q_value_batch = self.TargetQ_value.eval(feed_dict={self.state_input: next_state_batch})
        for i in range(0, BATCH_SIZE):
            done = minibatch[i][4]
            if done:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * np.max(Q_value_batch[i]))

        self.optimizer.run(feed_dict={
            self.y_input: y_batch,
            self.action_input: action_batch,
            self.state_input: state_batch
        })

        # Sync the target network
        if self.time_step % self.target_q_step == 0:
            self.copyWeightsToTarget()

    def egreedy_action(self, state):
        Q_value = self.Q_value.eval(feed_dict={self.state_input: [state]})[0]
        min_v = Q_value[np.argmin(Q_value)] - 1
        valid_action = []
        for i in range(len(Q_value)):
            if state[i] == 0:
                valid_action.append(i)
            else:
                Q_value[i] = min_v  # mask occupied positions below the current minimum
        if random.random() <= self.epsilon:
            return valid_action[random.randint(0, len(valid_action) - 1)]
            # return random.randint(0, self.action_dim - 1)
        else:
            return np.argmax(Q_value)
        # self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / 10000

    def action(self, state):
        Q_value = self.Q_value.eval(feed_dict={self.state_input: [state]})[0]
        min_v = Q_value[np.argmin(Q_value)] - 1
        valid_action = []
        for i in range(len(Q_value)):
            if state[i] == 0:
                valid_action.append(i)
            else:
                Q_value[i] = min_v
        return np.argmax(Q_value)

    def weight_variable(self, shape):
        initial = tf.truncated_normal(shape)
        return tf.Variable(initial)

    def bias_variable(self, shape):
        initial = tf.constant(0.01, shape=shape)
        return tf.Variable(initial)


# ---------------------------------------------------------
# Hyper Parameters
ENV_NAME = 'FiveChess-v0'
EPISODE = 10000  # Episode limitation
STEP = 300       # Step limitation in an episode
TEST = 1         # Number of test games every 100 episodes


def main():
    # initialize OpenAI Gym env and dqn agent
    env = gym.make(ENV_NAME)
    agent = DQN(env)
    SIZE = env.env.SIZE
    agent.copyWeightsToTarget()
    for episode in range(EPISODE):
        # initialize task
        state = env.reset()
        camp = -1
        state = np.reshape(state, [-1])
        state = np.append(state, camp)
        print('episode ', episode)
        # Train
        for step in range(STEP):
            # The agent plays the next stone
            action_index = agent.egreedy_action(state)  # e-greedy action for training
            action = [math.floor(action_index / SIZE), action_index % SIZE, camp]
            # if env.env.is_valid_set_coord(action[0], action[1]):
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [-1])
            if step % 2 == 0:
                camp = 1
            else:
                camp = -1
            next_state = np.append(next_state, camp)
            # Define reward for agent
            reward_agent = reward
            # Store the flat action index so the one-hot encoding in perceive() marks a single cell
            agent.perceive(state, action_index, reward, next_state, done)
            state = next_state
            if done:
                print('done step ', step)
                break
        # Test every 100 episodes
        if episode % 100 == 99:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                state = np.reshape(state, [-1])
                camp = -1
                state = np.append(state, camp)
                for j in range(STEP):
                    env.render()
                    action_index = agent.action(state)  # direct action for test
                    action = [math.floor(action_index / SIZE), action_index % SIZE, camp]
                    state, reward, done, _ = env.step(action)
                    state = np.reshape(state, [-1])
                    if j % 2 == 0:
                        camp = 1
                    else:
                        camp = -1
                    state = np.append(state, camp)
                    total_reward += reward
                    time.sleep(0.5)
                    if done:
                        env.render()
                        print('done')
                        time.sleep(3)
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)
            # if ave_reward >= 990:
            #     break


if __name__ == '__main__':
    main()
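For reference, this is how the script encodes states and actions: the state fed to the network is the flattened 8x8 board with the color to move appended (65 values), and a flat action index from the network maps back to board coordinates via floor(index / SIZE) and index % SIZE. A small self-contained sketch of that mapping (the helper names are illustrative, not part of the original code):

import math
import numpy as np

SIZE = 8  # board size used by the environment above

def encode_state(chessboard, camp):
    # Flatten the SIZE x SIZE board and append the color to move -> (SIZE*SIZE + 1)-dim vector
    return np.append(np.reshape(chessboard, [-1]), camp)

def decode_action(action_index, camp):
    # Map a flat network output index back to the [row, col, color] action expected by env.step()
    return [math.floor(action_index / SIZE), action_index % SIZE, camp]

board = [[0] * SIZE for _ in range(SIZE)]
print(encode_state(board, -1).shape)   # (65,) -> matches state_dim = SIZE*SIZE + 1
print(decode_action(10, -1))           # [1, 2, -1] -> row 1, column 2, black to move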
Training screenshot (image from the original post):
If the Gomoku rules are modified so that only one color of stone is ever played, the agent converges well: it learns to line up five stones within 10 moves.
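A rough sketch of that single-color variant, assuming the environment, the DQN agent, and the numpy/math imports from the script above are available (a simplified loop, not the author's exact experiment); the only change from main() is that camp stays fixed instead of flipping every move:

def train_single_color(env, agent, episodes=1000, steps=30):
    # Single-color variant: the same color (-1) is played on every move,
    # so the agent only has to learn to line up five of its own stones.
    SIZE = env.env.SIZE
    camp = -1  # never flipped, unlike the self-play loop in main()
    for episode in range(episodes):
        state = np.append(np.reshape(env.reset(), [-1]), camp)
        for step in range(steps):
            action_index = agent.egreedy_action(state)
            next_state, reward, done, _ = env.step(
                [math.floor(action_index / SIZE), action_index % SIZE, camp])
            next_state = np.append(np.reshape(next_state, [-1]), camp)
            agent.perceive(state, action_index, reward, next_state, done)
            state = next_state
            if done:
                break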
When both colors are played through self-play, whether the training converges is still unknown.
Source article: 基于DQN的五子棋算法 (A DQN-based Gomoku algorithm)