Question
So I plugged QLearningDiscreteDense into a dots and boxes game I made, with a custom MDP environment I created for it. The problem is that the agent chooses action 0 every time. The first time this works, but after that the edge is already taken, so the same action is an illegal move. I give illegal moves a reward of Integer.MIN_VALUE, but it doesn't affect anything.
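For reference, here's what that penalty looks like after the rewardFactor(0.01) scaling in the training config further down; the scaled illegal-move penalty is still around -2.1e7, which dwarfs the ±5 end-of-game rewards (that RL4J multiplies rewards by rewardFactor before training on them is my assumption here):
// Sketch, separate from the game code: compare the scaled illegal-move
// penalty against the scaled win/loss rewards.
public class RewardScaleCheck {
    public static void main(String[] args) {
        double rewardFactor = 0.01;                               // same value as in DOTS_QL
        double illegalPenalty = Integer.MIN_VALUE * rewardFactor; // about -2.15e7
        double winReward = 5 * rewardFactor;                      // 0.05
        System.out.println("scaled illegal penalty: " + illegalPenalty);
        System.out.println("scaled win reward: " + winReward);
    }
}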
Here's the MDP class:
public class testEnv implements MDP<testState, Integer, DiscreteSpace> {

    private final int maxStep;
    // action space size = number of possible edges on the board
    DiscreteSpace actionSpace = new DiscreteSpace(Graph.getEdgeList().size());
    ObservationSpace<testState> observationSpace = new ArrayObservationSpace<>(new int[] {1});
    private testState state = new testState(Graph.getMatrix(), 0, 0, 0);
    private NeuralNetFetchable<IDQN> fetchable;
    boolean illegal = false;

    public testEnv(int maxStep) {
        this.maxStep = maxStep;
    }
    @Override
    public ObservationSpace<testState> getObservationSpace() {
        return observationSpace;
    }

    @Override
    public DiscreteSpace getActionSpace() {
        return actionSpace;
    }

    @Override
    public testState reset() {
        try {
            // constructing a new GameBoard re-initializes the shared Graph state
            GameBoard r = new GameBoard(3, 3);
        } catch (IOException | InterruptedException e) {
            e.printStackTrace();
        }
        return new testState(Graph.getMatrix(), 0, 0, 0);
    }

    @Override
    public void close() { }
    @Override
    public StepReply<testState> step(Integer action) {
        int reward = 0;
        try {
            placeEdge(action);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        // change getPlayer1Score() to whichever player the network is playing as
        if (!illegal) {
            System.out.println("Not Illegal");
            if (gameThread.checkFinished()) {
                reward = Graph.getPlayer1Score() > Graph.getPlayer2Score() ? 5 : -5;
            }
            if (Graph.numOfMoves < 1) {
                if (!isDone()) {
                    // hand the turn to the random bot and let it play out its moves
                    Graph.player1Turn = !Graph.player1Turn;
                    Graph.setNumOfMoves(1);
                    while (Graph.numOfMoves > 0) {
                        if (!isDone()) {
                            Graph.getRandomBot().placeRandomEdge();
                        } else {
                            Graph.numOfMoves = 0;
                        }
                    }
                    if (!isDone()) {
                        // give the turn back to the agent
                        Graph.player1Turn = !Graph.player1Turn;
                        Graph.setNumOfMoves(1);
                    }
                }
            }
        } else {
            // illegal move: penalize heavily and clear the flag for the next step
            reward = Integer.MIN_VALUE;
            illegal = false;
        }
        testState t = new testState(Graph.getMatrix(), Graph.getPlayer1Score(),
                Graph.getPlayer2Score(), state.step + 1);
        state = t;
        return new StepReply<>(t, reward, isDone(), null);
    }
    @Override
    public boolean isDone() {
        return gameThread.checkFinished();
    }

    @Override
    public MDP<testState, Integer, DiscreteSpace> newInstance() {
        testEnv test = new testEnv(maxStep);
        test.setFetchable(fetchable);
        return test;
    }

    public void setFetchable(NeuralNetFetchable<IDQN> fetchable) {
        this.fetchable = fetchable;
    }
    public void placeEdge(int index) throws InterruptedException {
        ELine line = Graph.getEdgeList().get(index).getEline();
        System.out.println("NChosen: " + line.vertices.get(0).id + "--" + line.vertices.get(1).id);
        if (!line.isActivated()) {
            line.setActivated(true);
            // draw the edge in black
            line.setBackground(Color.BLACK);
            line.repaint();
            // mark the edge in the adjacency matrix: 2 == placed line, 1 == possible line
            Graph.matrix[line.vertices.get(0).getID()][line.vertices.get(1).getID()] = 2;
            Graph.matrix[line.vertices.get(1).getID()][line.vertices.get(0).getID()] = 2;
            // checkBox returns a list of the boxes this edge completes;
            // each box is an ArrayList of 4 vertices
            ArrayList<ArrayList<Vertex>> boxes = gameThread.checkBox(line);
            if (boxes != null) {
                for (ArrayList<Vertex> box : boxes) {
                    // looks through the counterBoxes list and sets the matching one visible
                    gameThread.checkMatching(box);
                    // updates the score board; once every counterBox is activated, the game is over
                    if (Graph.getPlayer1Turn()) {
                        Graph.setPlayer1Score(Graph.getPlayer1Score() + 1);
                        Graph.getScore1().setScore();
                    } else {
                        Graph.setPlayer2Score(Graph.getPlayer2Score() + 1);
                        Graph.getScore2().setScore();
                    }
                }
            } else {
                // no box completed: end the current player's turn
                // (if the random bot is active, it moves next)
                Graph.setNumOfMoves(0);
            }
        } else {
            System.out.println("ILLEGAL");
            illegal = true;
        }
    }
}
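To confirm that the illegal-move path actually fires, the environment can also be driven directly with random actions, bypassing the DQN entirely. This is only a debugging sketch; randomAction() samples uniformly from the action space:
// Debugging sketch: step the environment with random actions and watch the rewards.
public class EnvSmokeTest {
    public static void main(String[] args) throws IOException, InterruptedException {
        GameBoard board = new GameBoard(3, 3);   // same setup as in testNeural below
        testEnv env = new testEnv(100);
        env.reset();
        while (!env.isDone()) {
            Integer action = env.getActionSpace().randomAction();
            StepReply<testState> reply = env.step(action);
            System.out.println("action=" + action + " reward=" + reply.getReward());
        }
    }
}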
Here's the class I made for the states:
public class testState implements Encodable {

    int[][] matrix;
    int playerScore;
    int otherPlayerScore;
    int step;

    public testState(int[][] m, int p, int op, int step) {
        matrix = m;
        playerScore = p;
        otherPlayerScore = op;
        this.step = step;
    }

    @Override
    public double[] toArray() {
        // flatten the adjacency matrix row by row into a single vector
        double[] array = new double[matrix.length * matrix[0].length];
        int i = 0;
        for (int a = 0; a < matrix.length; a++) {
            for (int b = 0; b < matrix[0].length; b++) {
                array[i++] = matrix[a][b];
            }
        }
        return array;
    }

    @Override
    public boolean isSkipped() {
        return false;
    }

    @Override
    public INDArray getData() {
        // return the encoded observation instead of null so any caller that
        // goes through getData() rather than toArray() doesn't get an NPE
        return Nd4j.create(toArray());
    }

    @Override
    public Encodable dup() {
        // deep-copy the matrix so the duplicate is independent of this state
        int[][] copy = new int[matrix.length][];
        for (int a = 0; a < matrix.length; a++) {
            copy[a] = matrix[a].clone();
        }
        return new testState(copy, playerScore, otherPlayerScore, step);
    }
}
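One thing I'm not sure about: the observation space in testEnv is declared with shape new int[] {1}, while toArray() returns one value per matrix cell. A quick standalone check of the encoded length (the 4x4 matrix here is just a stand-in for whatever Graph.getMatrix() returns):
// Standalone check of the state encoding size (stand-in matrix, not the real board).
public class StateShapeCheck {
    public static void main(String[] args) {
        int[][] matrix = new int[4][4];           // stand-in for Graph.getMatrix()
        testState s = new testState(matrix, 0, 0, 0);
        System.out.println("encoded length: " + s.toArray().length); // prints 16, not 1
    }
}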
Here's the class where I run it:
public class testNeural {

    public static void main(String args[]) throws IOException, InterruptedException {
        GameBoard r = new GameBoard(3, 3);
        DQNPolicy<testState> t = dots();
    }

    static QLearning.QLConfiguration DOTS_QL = QLearning.QLConfiguration.builder()
            .seed(123)                 // random seed (for reproducibility)
            .maxEpochStep(10)          // max steps per epoch
            .maxStep(10000)            // max steps overall
            .expRepMaxSize(150000)     // max size of experience replay
            .batchSize(128)            // size of batches
            .targetDqnUpdateFreq(500)  // target network update frequency (hard update)
            .updateStart(10)           // number of no-op warmup steps
            .rewardFactor(0.01)        // reward scaling
            .gamma(0.99)               // discount factor
            .errorClamp(1.0)           // TD-error clipping
            .minEpsilon(0.1f)          // minimum epsilon
            .epsilonNbStep(1000)       // number of steps to anneal epsilon-greedy over
            .doubleDQN(false)          // double DQN
            .build();

    static DQNFactoryStdDense.Configuration DOTS_NET =
            DQNFactoryStdDense.Configuration.builder()
                    .l2(0)
                    .updater(new RmsProp(0.000025))
                    .numHiddenNodes(300)
                    .numLayer(10)
                    .build();

    private static DQNPolicy<testState> dots() throws IOException {
        DataManager dataManager = new DataManager(true);
        // The neural network used by the agent. There is no need to specify the
        // number of inputs/outputs; these are read from the environment at the
        // start of training.
        testEnv env = new testEnv(10000);
        QLearningDiscreteDense<testState> dql = new QLearningDiscreteDense<>(env, DOTS_NET, DOTS_QL, dataManager);
        dql.train();
        return dql.getPolicy();
    }
}
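After training, this is roughly how the returned policy can be evaluated (a sketch along the lines of the RL4J examples; I'm assuming Policy.play runs one episode on the MDP and returns its cumulative reward):
// Sketch: appended at the end of main(), after dots() returns.
testEnv evalEnv = new testEnv(10000);
double episodeReward = t.play(evalEnv);
System.out.println("evaluation episode reward: " + episodeReward);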
Source: https://stackoverflow.com/questions/65065874/qlearning-network-in-a-custom-environment-is-choosing-the-same-action-every-time