diff --git a/benchmark/torch/AlphaZero/.pic/good_moves.png b/benchmark/torch/AlphaZero/.pic/good_moves.png new file mode 100644 index 0000000000000000000000000000000000000000..f007fc4a6f2dbc9df9a6a8163de08dcf59cb82dc Binary files /dev/null and b/benchmark/torch/AlphaZero/.pic/good_moves.png differ diff --git a/benchmark/torch/AlphaZero/.pic/perfect_moves.png b/benchmark/torch/AlphaZero/.pic/perfect_moves.png new file mode 100644 index 0000000000000000000000000000000000000000..72c3913ea58498446e92d170255c71606e194fe0 Binary files /dev/null and b/benchmark/torch/AlphaZero/.pic/perfect_moves.png differ diff --git a/benchmark/torch/AlphaZero/Arena.py b/benchmark/torch/AlphaZero/Arena.py new file mode 100644 index 0000000000000000000000000000000000000000..a0791803eb1061485f2f6a647540d9bc9d4f45ee --- /dev/null +++ b/benchmark/torch/AlphaZero/Arena.py @@ -0,0 +1,105 @@ +# Third party code +# +# The following code are copied or modified from: +# https://github.com/suragnair/alpha-zero-general + +from tqdm import tqdm +from parl.utils import logger + + +class Arena(): + """ + An Arena class where any 2 agents can be pit against each other. + """ + + def __init__(self, player1, player2, game, display=None): + """ + Input: + player 1,2: two functions that takes board as input, return action + game: Game object + display: a function that takes board as input and prints it (e.g. + display in othello/OthelloGame). Is necessary for verbose + mode. + + see othello/OthelloPlayers.py for an example. See pit.py for pitting + human players/other baselines with each other. + """ + self.player1 = player1 + self.player2 = player2 + self.game = game + self.display = display + + def playGame(self, verbose=False): + """ + Executes one episode of a game. + + Returns: + either + winner: player who won the game (1 if player1, -1 if player2) + or + draw result returned from the game that is neither 1, -1, nor 0. + """ + players = [self.player2, None, self.player1] + curPlayer = 1 + board = self.game.getInitBoard() + it = 0 + while self.game.getGameEnded(board, curPlayer) == 0: + it += 1 + if verbose: + assert self.display + print("Turn ", str(it), "Player ", str(curPlayer)) + self.display(board) + action = players[curPlayer + 1](self.game.getCanonicalForm( + board, curPlayer)) + + valids = self.game.getValidMoves( + self.game.getCanonicalForm(board, curPlayer), 1) + + if valids[action] == 0: + logger.error('Action {} is not valid!'.format(action)) + logger.debug('valids = {}'.format(valids)) + assert valids[action] > 0 + board, curPlayer = self.game.getNextState(board, curPlayer, action) + if verbose: + assert self.display + print("Game over: Turn ", str(it), "Result ", + str(self.game.getGameEnded(board, 1))) + self.display(board) + return curPlayer * self.game.getGameEnded(board, curPlayer) + + def playGames(self, num, verbose=False): + """ + Plays num games in which player1 starts num/2 games and player2 starts + num/2 games. 
+ + Returns: + oneWon: games won by player1 + twoWon: games won by player2 + draws: games won by nobody + """ + + num = int(num / 2) + oneWon = 0 + twoWon = 0 + draws = 0 + for _ in tqdm(range(num), desc="Arena.playGames (1)"): + gameResult = self.playGame(verbose=verbose) + if gameResult == 1: + oneWon += 1 + elif gameResult == -1: + twoWon += 1 + else: + draws += 1 + + self.player1, self.player2 = self.player2, self.player1 + + for _ in tqdm(range(num), desc="Arena.playGames (2)"): + gameResult = self.playGame(verbose=verbose) + if gameResult == -1: + oneWon += 1 + elif gameResult == 1: + twoWon += 1 + else: + draws += 1 + + return oneWon, twoWon, draws diff --git a/benchmark/torch/AlphaZero/Coach.py b/benchmark/torch/AlphaZero/Coach.py new file mode 100644 index 0000000000000000000000000000000000000000..01394b076db969db42a7277b5d95f82bd661db3d --- /dev/null +++ b/benchmark/torch/AlphaZero/Coach.py @@ -0,0 +1,246 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import threading +import queue +import pickle +from pickle import Pickler, Unpickler +from random import shuffle +from parl.utils import tensorboard + +import numpy as np +from tqdm import tqdm + +import parl +from parl.utils import logger + +from actor import Actor +from utils import split_group, get_test_dataset +from alphazero_agent import create_agent + + +class Coach(): + """ + This class executes the self-play, learning and evaluating. 
+ """ + + def __init__(self, game, args): + self.game = game + self.args = args + + # neural network of current generation + self.current_agent = create_agent(self.game) + # neural network of previous generation + self.previous_agent = create_agent(self.game) + + # history of examples from args.numItersForTrainExamplesHistory latest iterations + self.trainExamplesHistory = [] + + self.remote_actors_signal_queues = [] + self.remote_actors_return_queue = queue.Queue() + + self.test_dataset = get_test_dataset() + + def _run_remote_tasks(self, signal_queue): + # The remote actor will actually run on the local machine or other machines of xparl cluster + remote_actor = Actor(self.game, self.args) + + while True: + # receive running task signal + # signal: specify task type and task input data (optional) + signal = signal_queue.get() + + if signal["task"] == "self-play": + episode_num_each_actor = self.args.numEps // self.args.actors_num + result = remote_actor.self_play( + self.current_agent.get_weights(), episode_num_each_actor) + self.remote_actors_return_queue.put({"self-play": result}) + + elif signal["task"] == "pitting": + games_num_each_actor = self.args.arenaCompare // self.args.actors_num + result = remote_actor.pitting( + self.previous_agent.get_weights(), + self.current_agent.get_weights(), games_num_each_actor) + self.remote_actors_return_queue.put({"pitting": result}) + + elif signal["task"] == "evaluate_test_dataset": + test_dataset = signal["test_dataset"] + result = remote_actor.evaluate_test_dataset( + self.current_agent.get_weights(), test_dataset) + self.remote_actors_return_queue.put({ + "evaluate_test_dataset": + result + }) + else: + raise NotImplementedError + + def _create_remote_actors(self): + # connect to xparl cluster to submit jobs + parl.connect(self.args.master_address) + + for i in range(self.args.actors_num): + signal_queue = queue.Queue() + self.remote_actors_signal_queues.append(signal_queue) + + remote_thread = threading.Thread( + target=self._run_remote_tasks, args=(signal_queue, )) + remote_thread.setDaemon(True) + remote_thread.start() + + def learn(self): + """Each iteration: + 1. Performs numEps episodes of self-play. + 2. Retrains neural network with examples in trainExamplesHistory + (which has a maximum length of numItersForTrainExamplesHistory). + 3. Evaluates the new neural network with the test dataset. + 4. Pits the new neural network against the old one and accepts it + only if it wins >= updateThreshold fraction of games. + """ + + # create remote actors to run tasks (self-play/pitting/evaluate_test_dataset) in parallel. 
+ self._create_remote_actors() + + for iteration in range(1, self.args.numIters + 1): + logger.info('Starting Iter #{} ...'.format(iteration)) + + #################### + logger.info('Step1: self-play in parallel...') + iterationTrainExamples = [] + # update weights of remote actors to the latest weights, and ask them to run self-play task + for signal_queue in self.remote_actors_signal_queues: + signal_queue.put({"task": "self-play"}) + # wait for all remote actors (a total of self.args.actors_num) to return the self-play results + for _ in range(self.args.actors_num): + result = self.remote_actors_return_queue.get() + iterationTrainExamples.extend(result["self-play"]) + + # save the iteration examples to the history + self.trainExamplesHistory.append(iterationTrainExamples) + if len(self.trainExamplesHistory + ) > self.args.numItersForTrainExamplesHistory: + logger.warning("Removing the oldest entry in trainExamples.") + self.trainExamplesHistory.pop(0) + self.saveTrainExamples(iteration) # backup history to a file + + #################### + logger.info('Step2: train neural network...') + # shuffle examples before training + trainExamples = [] + for e in self.trainExamplesHistory: + trainExamples.extend(e) + shuffle(trainExamples) + + # training new network, keeping a copy of the old one + self.current_agent.save( + os.path.join(self.args.checkpoint, 'temp.pth.tar')) + self.previous_agent.restore( + os.path.join(self.args.checkpoint, 'temp.pth.tar')) + + self.current_agent.learn(trainExamples) + + #################### + logger.info('Step3: evaluate test dataset in parallel...') + cnt = 0 + # update weights of remote actors to the latest weights, and ask them to evaluate assigned test dataset + for i, data in enumerate( + split_group( + self.test_dataset, + len(self.test_dataset) // self.args.actors_num)): + self.remote_actors_signal_queues[i].put({ + "task": + "evaluate_test_dataset", + "test_dataset": + data + }) + cnt += len(data) + perfect_moves_cnt, good_moves_cnt = 0, 0 + # wait for all remote actors (a total of self.args.actors_num) to return the evaluating results + for _ in range(self.args.actors_num): + (perfect_moves, + good_moves) = self.remote_actors_return_queue.get( + )["evaluate_test_dataset"] + perfect_moves_cnt += perfect_moves + good_moves_cnt += good_moves + logger.info('perfect moves rate: {}, good moves rate: {}'.format( + perfect_moves_cnt / cnt, good_moves_cnt / cnt)) + tensorboard.add_scalar('perfect_moves_rate', + perfect_moves_cnt / cnt, iteration) + tensorboard.add_scalar('good_moves_rate', good_moves_cnt / cnt, + iteration) + + #################### + logger.info( + 'Step4: pitting against previous generation in parallel...') + # transfer weights of previous generation and current generation to the remote actors, and ask them to pit. 
+ for signal_queue in self.remote_actors_signal_queues: + signal_queue.put({"task": "pitting"}) + previous_wins, current_wins, draws = 0, 0, 0 + for _ in range(self.args.actors_num): + (pwins_, cwins_, + draws_) = self.remote_actors_return_queue.get()["pitting"] + previous_wins += pwins_ + current_wins += cwins_ + draws += draws_ + + logger.info('NEW/PREV WINS : %d / %d ; DRAWS : %d' % + (current_wins, previous_wins, draws)) + if previous_wins + current_wins == 0 or float(current_wins) / ( + previous_wins + current_wins) < self.args.updateThreshold: + logger.info('REJECTING NEW MODEL') + self.current_agent.restore( + os.path.join(self.args.checkpoint, 'temp.pth.tar')) + else: + logger.info('ACCEPTING NEW MODEL') + self.current_agent.save( + os.path.join(self.args.checkpoint, 'best.pth.tar')) + self.current_agent.save( + os.path.join(self.args.checkpoint, + self.getCheckpointFile(iteration))) + + def getCheckpointFile(self, iteration): + return 'checkpoint_' + str(iteration) + '.pth.tar' + + def saveTrainExamples(self, iteration): + folder = self.args.checkpoint + if not os.path.exists(folder): + os.makedirs(folder) + filename = os.path.join( + folder, + self.getCheckpointFile(iteration) + ".examples") + with open(filename, "wb+") as f: + Pickler(f).dump(self.trainExamplesHistory) + f.closed + + def loadModel(self): + self.current_agent.restore( + os.path.join(self.args.load_folder_file[0], + self.args.load_folder_file[1])) + + def loadTrainExamples(self): + modelFile = os.path.join(self.args.load_folder_file[0], + self.args.load_folder_file[1]) + examplesFile = modelFile + ".examples" + if not os.path.isfile(examplesFile): + logger.warning( + "File {} with trainExamples not found!".format(examplesFile)) + r = input("Continue? [y|n]") + if r != "y": + sys.exit() + else: + logger.info("File with trainExamples found. Loading it...") + with open(examplesFile, "rb") as f: + self.trainExamplesHistory = Unpickler(f).load() + logger.info('Loading done!') diff --git a/benchmark/torch/AlphaZero/MCTS.py b/benchmark/torch/AlphaZero/MCTS.py new file mode 100644 index 0000000000000000000000000000000000000000..b011efe15dbdc10ccbe2c07e6d30b2e2aaa82d9d --- /dev/null +++ b/benchmark/torch/AlphaZero/MCTS.py @@ -0,0 +1,164 @@ +# Third party code +# +# The following code are copied or modified from: +# https://github.com/suragnair/alpha-zero-general + +import math +import time + +import numpy as np + +EPS = 1e-8 + + +class MCTS(): + """ + This class handles the MCTS tree. + """ + + def __init__(self, game, nn_agent, args, dirichlet_noise=False): + self.game = game + self.nn_agent = nn_agent + self.args = args + self.dirichlet_noise = dirichlet_noise + self.Qsa = {} # stores Q values for s,a (as defined in the paper) + self.Nsa = {} # stores #times edge s,a was visited + self.Ns = {} # stores #times board s was visited + self.Ps = {} # stores initial policy (returned by neural net) + + self.Es = {} # stores game.getGameEnded ended for board s + self.Vs = {} # stores game.getValidMoves for board s + + def getActionProb(self, canonicalBoard, temp=1): + """ + This function performs numMCTSSims simulations of MCTS starting from + canonicalBoard. 
+ + Returns: + probs: a policy vector where the probability of the ith action is + proportional to Nsa[(s,a)]**(1./temp) + """ + for i in range(self.args.numMCTSSims): + dir_noise = (i == 0 and self.dirichlet_noise) + self.search(canonicalBoard, dirichlet_noise=dir_noise) + + s = self.game.stringRepresentation(canonicalBoard) + counts = [ + self.Nsa[(s, a)] if (s, a) in self.Nsa else 0 + for a in range(self.game.getActionSize()) + ] + + if temp == 0: + bestAs = np.array(np.argwhere(counts == np.max(counts))).flatten() + bestA = np.random.choice(bestAs) + probs = [0] * len(counts) + probs[bestA] = 1 + return probs + + counts = [x**(1. / temp) for x in counts] + counts_sum = float(sum(counts)) + probs = [x / counts_sum for x in counts] + return probs + + def search(self, canonicalBoard, dirichlet_noise=False): + """ + This function performs one iteration of MCTS. It is recursively called + till a leaf node is found. The action chosen at each node is one that + has the maximum upper confidence bound as in the paper. + + Once a leaf node is found, the neural network is called to return an + initial policy P and a value v for the state. This value is propagated + up the search path. In case the leaf node is a terminal state, the + outcome is propagated up the search path. The values of Ns, Nsa, Qsa are + updated. + + NOTE: the return values are the negative of the value of the current + state. This is done since v is in [-1,1] and if v is the value of a + state for the current player, then its value is -v for the other player. + + Returns: + v: the negative of the value of the current canonicalBoard + """ + + s = self.game.stringRepresentation(canonicalBoard) + + if s not in self.Es: + self.Es[s] = self.game.getGameEnded(canonicalBoard, 1) + if self.Es[s] != 0: + # terminal node + return -self.Es[s] + + if s not in self.Ps: + # leaf node + self.Ps[s], v = self.nn_agent.predict(canonicalBoard) + + valids = self.game.getValidMoves(canonicalBoard, 1) + self.Ps[s] = self.Ps[s] * valids # masking invalid moves + if dirichlet_noise: + self.applyDirNoise(s, valids) + sum_Ps_s = np.sum(self.Ps[s]) + if sum_Ps_s > 0: + self.Ps[s] /= sum_Ps_s # renormalize + else: + # if all valid moves were masked make all valid moves equally probable + + # NB! All valid moves may be masked if either your NNet architecture is insufficient or you've get overfitting or something else. + # If you have got dozens or hundreds of these messages you should pay attention to your NNet and/or training process. + print("All valid moves were masked, doing a workaround.") + self.Ps[s] = self.Ps[s] + valids + self.Ps[s] /= np.sum(self.Ps[s]) + + self.Vs[s] = valids + self.Ns[s] = 0 + return -v + + valids = self.Vs[s] + if dirichlet_noise: + self.applyDirNoise(s, valids) + sum_Ps_s = np.sum(self.Ps[s]) + self.Ps[s] /= sum_Ps_s # renormalize + cur_best = -float('inf') + best_act = -1 + + # pick the action with the highest upper confidence bound + for a in range(self.game.getActionSize()): + if valids[a]: + if (s, a) in self.Qsa: + u = self.Qsa[ + (s, a)] + self.args.cpuct * self.Ps[s][a] * math.sqrt( + self.Ns[s]) / (1 + self.Nsa[(s, a)]) + else: + u = self.args.cpuct * self.Ps[s][a] * math.sqrt( + self.Ns[s] + EPS) # Q = 0 ? 
+ + if u > cur_best: + cur_best = u + best_act = a + + a = best_act + next_s, next_player = self.game.getNextState(canonicalBoard, 1, a) + next_s = self.game.getCanonicalForm(next_s, next_player) + + v = self.search(next_s) + + if (s, a) in self.Qsa: + self.Qsa[(s, a)] = (self.Nsa[(s, a)] * self.Qsa[ + (s, a)] + v) / (self.Nsa[(s, a)] + 1) + self.Nsa[(s, a)] += 1 + + else: + self.Qsa[(s, a)] = v + self.Nsa[(s, a)] = 1 + + self.Ns[s] += 1 + return -v + + def applyDirNoise(self, s, valids): + dir_values = np.random.dirichlet( + [self.args.dirichletAlpha] * np.count_nonzero(valids)) + dir_idx = 0 + for idx in range(len(self.Ps[s])): + if self.Ps[s][idx]: + self.Ps[s][idx] = (0.75 * self.Ps[s][idx]) + ( + 0.25 * dir_values[dir_idx]) + dir_idx += 1 diff --git a/benchmark/torch/AlphaZero/README.md b/benchmark/torch/AlphaZero/README.md new file mode 100644 index 0000000000000000000000000000000000000000..72d9c807fb5066c51b49520b8aca3a5e666e133c --- /dev/null +++ b/benchmark/torch/AlphaZero/README.md @@ -0,0 +1,58 @@ +## AlphaZero baseline for Connect4 game (distributed version) +- In this example, we provide a fine-tuned AlphaZero baseline to solve the Connect4 game, based on the code of [alpha-zero-general](https://github.com/suragnair/alpha-zero-general) repo. +- We take advantage of the parallelism capacity of [PARL](https://github.com/PaddlePaddle/PARL) to support running self-play and evaluating tasks in parallel. +- We also provide scripts to pack your well-trained model to a submission file, which can be submitted to the Kaggle [Connect X](https://www.kaggle.com/c/connectx/leaderboard) competition directly. + +### Dependencies +- python3 +- [parl==1.3](https://github.com/PaddlePaddle/PARL) +- torch +- tqdm + +### Training +1. Download the [1k connect4 validation set](https://www.kaggle.com/petercnudde/1k-connect4-validation-set) to the current directory. (filename: `refmoves1k_kaggle`) + +2. Start xparl cluster +```bash +# You can change following `cpu_num` and `args.actor_nums` in the main.py +# based on the CPU number of your machine. + +xparl start --port 8010 --cpu_num 25 +``` + +```bash +# [OPTIONAL] You can also run the following script in other machines to add more CPU resource +# to the xparl cluster, so you can increase the parallelism (args.actor_nums). + +xparl connect --address MASTER_IP:8010 --cpu_num [CPU_NUM] +``` + +3. Run training script +```bash +python main.py +``` + +4. Visualize (good moves rate and perfect moves rate) +``` +tensorboard --logdir . +``` + +### Submitting +To submit the well-trained model to the Kaggle, you can use our provided script to generate `submission.py`, for example: +```bash +python gen_submission.py saved_model/best.pth.tar +``` + +### Performance +- Following are `good moves rate` and `perfect moves rate` indicators in tensorbaord, please refer to the [link](https://www.kaggle.com/petercnudde/scoring-connect-x-agents) for specific meaning. + +good moves rate perfect moves rate + +> It takes about 1 day to run 25 iterations on the machine with 25 cpus. + +- It can reach about score 1368 (rank 5 on 2020/06/04) in the Kaggle [Connect X](https://www.kaggle.com/c/connectx/leaderboard) competition. 
+ + +### Reference +- [suragnair/alpha-zero-general](https://github.com/suragnair/alpha-zero-general) +- [Scoring connect-x agents](https://www.kaggle.com/petercnudde/scoring-connect-x-agents) diff --git a/benchmark/torch/AlphaZero/actor.py b/benchmark/torch/AlphaZero/actor.py new file mode 100644 index 0000000000000000000000000000000000000000..5ed719b92d292903f81f7c92a983927bf5c9cab5 --- /dev/null +++ b/benchmark/torch/AlphaZero/actor.py @@ -0,0 +1,165 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import parl +import os +from alphazero_agent import create_agent +from MCTS import MCTS +from Arena import Arena +from utils import win_loss_draw + + +@parl.remote_class +class Actor(object): + def __init__(self, game, args): + os.environ['OMP_NUM_THREADS'] = "1" + self.game = game + self.args = args + + # neural network of previous generation + self.previous_agent = create_agent(self.game, cuda=False) + # neural network of current generation + self.current_agent = create_agent(self.game, cuda=False) + + # MCTS of previous generation + self.previous_mcts = MCTS( + self.game, self.previous_agent, self.args, dirichlet_noise=True) + # MCTS of current generation + self.current_mcts = MCTS( + self.game, self.current_agent, self.args, dirichlet_noise=True) + + def self_play(self, current_weights, game_num): + """Collecting training data by self-play. 
+ + Args: + current_weights (numpy.array): latest weights of neural network + game_num (int): game number of self-play + + Returns: + train_examples (list): examples of the form (canonicalBoard, currPlayer, pi,v) + """ + + # update weights of current neural network with latest weights + self.current_agent.set_weights(current_weights) + + train_examples = [] + for _ in range(game_num): + # reset node state of MCTS + self.current_mcts = MCTS( + self.game, self.current_agent, self.args, dirichlet_noise=True) + train_examples.extend(self._executeEpisode()) + return train_examples + + def pitting(self, previous_weights, current_weights, games_num): + """Fighting between previous generation agent and current generation agent + + Args: + previous_weights (numpy.array): weights of previous generation neural network + current_weights (numpy.array): weights of current generation neural network + game_num (int): game number of fighting + + Returns: + tuple of (game number of previous agent won, game number of current agent won, game number of draw) + """ + # update weights of previous and current neural network + self.previous_agent.set_weights(previous_weights) + self.current_agent.set_weights(current_weights) + + # reset node state of MCTS + self.previous_mcts = MCTS(self.game, self.previous_agent, self.args) + self.current_mcts = MCTS(self.game, self.current_agent, self.args) + + arena = Arena( + lambda x: np.argmax(self.previous_mcts.getActionProb(x, temp=0)), + lambda x: np.argmax(self.current_mcts.getActionProb(x, temp=0)), + self.game) + previous_wins, current_wins, draws = arena.playGames(games_num) + + return (previous_wins, current_wins, draws) + + def evaluate_test_dataset(self, current_weights, test_dataset): + """Evaluate performance of latest neural nerwork + + Args: + current_weights (numpy.array): latest weights of neural network + test_dataset (list): game number of self-play + + Returns: + tuple of (number of perfect moves, number of good moves) + """ + # update weights of current neural network with latest weights + self.current_agent.set_weights(current_weights) + + perfect_move_count, good_move_count = 0, 0 + for data in test_dataset: + self.current_mcts = MCTS(self.game, self.current_agent, self.args) + + x = self.game.getCanonicalForm(data['board'], data['player']) + agent_move = int( + np.argmax(self.current_mcts.getActionProb(x, temp=0))) + + moves = data["move_score"] + perfect_score = max(moves) + perfect_moves = [i for i in range(7) if moves[i] == perfect_score] + + if agent_move in perfect_moves: + perfect_move_count += 1 + if win_loss_draw( + moves[agent_move]) == win_loss_draw(perfect_score): + good_move_count += 1 + + return (perfect_move_count, good_move_count) + + def _executeEpisode(self): + """ + + This function executes one episode of self-play, starting with player 1. + As the game goes on, each turn is added as a training example to + trainExamples. The game is played till the game ends. After the game + ends, the outcome of the game is used to assign values to each example + in trainExamples. + + It uses a temp=1 if episodeStep < tempThresholdStep, and thereafter + uses temp=0. + + Returns: + trainExamples: a list of examples of the form (canonicalBoard, currPlayer, pi,v) + pi is the MCTS informed policy vector, v is +1 if + the player eventually won the game, else -1. 
+ """ + trainExamples = [] + board = self.game.getInitBoard() + self.curPlayer = 1 + episodeStep = 0 + + while True: + episodeStep += 1 + canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer) + temp = int(episodeStep < self.args.tempThresholdStep) + + pi = self.current_mcts.getActionProb(canonicalBoard, temp=temp) + sym = self.game.getSymmetries(canonicalBoard, pi) + for b, p in sym: # board, pi + trainExamples.append([b, self.curPlayer, p, None]) + + action = np.random.choice(len(pi), p=pi) + board, self.curPlayer = self.game.getNextState( + board, self.curPlayer, action) + + r = self.game.getGameEnded(board, self.curPlayer) + + if r != 0: + return [(x[0], x[2], r * ((-1)**(x[1] != self.curPlayer))) + for x in trainExamples] diff --git a/benchmark/torch/AlphaZero/alphazero_agent.py b/benchmark/torch/AlphaZero/alphazero_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..9e7e497e4818f30ae8d71bee109f4ff6f9795962 --- /dev/null +++ b/benchmark/torch/AlphaZero/alphazero_agent.py @@ -0,0 +1,150 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import parl +import torch +import torch.optim as optim + +from tqdm import tqdm +from utils import * +from connect4_model import Connect4Model + +args = dotdict({ + 'lr': 0.001, + 'dropout': 0.3, + 'epochs': 5, + 'batch_size': 64, + 'num_channels': 64, +}) + + +class AlphaZero(parl.Algorithm): + def __init__(self, model): + self.model = model + + def learn(self, boards, target_pis, target_vs, optimizer): + self.model.train() # train mode + + # compute model output + out_log_pi, out_v = self.model(boards) + + pi_loss = -torch.sum(target_pis * out_log_pi) / target_pis.size()[0] + + v_loss = torch.sum( + (target_vs - out_v.view(-1))**2) / target_vs.size()[0] + + total_loss = pi_loss + v_loss + + # compute gradient and do SGD step + optimizer.zero_grad() + total_loss.backward() + optimizer.step() + + return total_loss, pi_loss, v_loss + + def predict(self, board): + self.model.eval() # eval mode + + with torch.no_grad(): + log_pi, v = self.model(board) + + pi = torch.exp(log_pi) + return pi, v + + +def create_agent(game, cuda=True): + cuda = cuda and torch.cuda.is_available() + + model = Connect4Model(game, args) + if cuda: + model.cuda() + + algorithm = AlphaZero(model) + + alphazero_agent = AlphaZeroAgent(algorithm, game, cuda) + return alphazero_agent + + +class AlphaZeroAgent(parl.Agent): + def __init__(self, algorithm, game, cuda): + super(AlphaZeroAgent, self).__init__(algorithm) + self.cuda = cuda + self.board_x, self.board_y = game.getBoardSize() + self.action_size = game.getActionSize() + + def learn(self, examples): + """ + Args: + examples: list of examples, each example is of form (board, pi, v) + """ + optimizer = optim.Adam(self.algorithm.model.parameters(), lr=args.lr) + + for epoch in range(args.epochs): + print('EPOCH ::: ' + str(epoch + 1)) + + batch_count = int(len(examples) / args.batch_size) + + pbar = 
tqdm(range(batch_count), desc='Training Net') + for _ in pbar: + sample_ids = np.random.randint( + len(examples), size=args.batch_size) + boards, pis, vs = list(zip(*[examples[i] for i in sample_ids])) + boards = torch.FloatTensor(np.array(boards).astype(np.float64)) + target_pis = torch.FloatTensor(np.array(pis)) + target_vs = torch.FloatTensor(np.array(vs).astype(np.float64)) + + if self.cuda: + boards, target_pis, target_vs = boards.contiguous().cuda( + ), target_pis.contiguous().cuda(), target_vs.contiguous( + ).cuda() + + total_loss, pi_loss, v_loss = self.algorithm.learn( + boards, target_pis, target_vs, optimizer) + + # record loss with tqdm + pbar.set_postfix(Loss_pi=pi_loss.item(), Loss_v=v_loss.item()) + + def predict(self, board): + """ + Args: + board (np.array): input board + + Return: + pi (np.array): probability of actions + v (np.array): estimated value of input + """ + # preparing input + board = torch.FloatTensor(board.astype(np.float64)) + if self.cuda: + board = board.contiguous().cuda() + board = board.view(1, self.board_x, self.board_y) + + pi, v = self.algorithm.predict(board) + + return pi.data.cpu().numpy()[0], v.data.cpu().numpy()[0] + + +def create_agent(game, cuda=True): + cuda = cuda and torch.cuda.is_available() + + model = Connect4Model(game, args) + if cuda: + model.cuda() + + algorithm = AlphaZero(model) + + alphazero_agent = AlphaZeroAgent(algorithm, game, cuda) + return alphazero_agent diff --git a/benchmark/torch/AlphaZero/connect4_game.py b/benchmark/torch/AlphaZero/connect4_game.py new file mode 100644 index 0000000000000000000000000000000000000000..c10e8ca4afbca839ef71b18fd8f39f7493f30a4d --- /dev/null +++ b/benchmark/torch/AlphaZero/connect4_game.py @@ -0,0 +1,239 @@ +# Third party code +# +# The following code are copied or modified from: +# https://github.com/suragnair/alpha-zero-general + +import numpy as np +from collections import namedtuple + +DEFAULT_HEIGHT = 6 +DEFAULT_WIDTH = 7 +DEFAULT_WIN_LENGTH = 4 + +WinState = namedtuple('WinState', 'is_ended winner') + + +class Board(): + """ + Connect4 Board. + """ + + def __init__(self, + height=None, + width=None, + win_length=None, + np_pieces=None): + "Set up initial board configuration." + self.height = height or DEFAULT_HEIGHT + self.width = width or DEFAULT_WIDTH + self.win_length = win_length or DEFAULT_WIN_LENGTH + + if np_pieces is None: + self.np_pieces = np.zeros([self.height, self.width], dtype=np.int) + else: + self.np_pieces = np_pieces + assert self.np_pieces.shape == (self.height, self.width) + + def add_stone(self, column, player): + "Create copy of board containing new stone." + available_idx, = np.where(self.np_pieces[:, column] == 0) + if len(available_idx) == 0: + raise ValueError( + "Can't play column %s on board %s" % (column, self)) + + self.np_pieces[available_idx[-1]][column] = player + + def get_valid_moves(self): + "Any zero value in top row in a valid move" + return self.np_pieces[0] == 0 + + def get_win_state(self): + for player in [-1, 1]: + player_pieces = self.np_pieces == -player + # Check rows & columns for win + if (self._is_straight_winner(player_pieces) + or self._is_straight_winner(player_pieces.transpose()) + or self._is_diagonal_winner(player_pieces)): + return WinState(True, -player) + + # draw has very little value. + if not self.get_valid_moves().any(): + return WinState(True, None) + + # Game is not ended yet. 
+ return WinState(False, None) + + def with_np_pieces(self, np_pieces): + """Create copy of board with specified pieces.""" + if np_pieces is None: + np_pieces = self.np_pieces + return Board(self.height, self.width, self.win_length, np_pieces) + + def _is_diagonal_winner(self, player_pieces): + """Checks if player_pieces contains a diagonal win.""" + win_length = self.win_length + for i in range(len(player_pieces) - win_length + 1): + for j in range(len(player_pieces[0]) - win_length + 1): + if all(player_pieces[i + x][j + x] for x in range(win_length)): + return True + for j in range(win_length - 1, len(player_pieces[0])): + if all(player_pieces[i + x][j - x] for x in range(win_length)): + return True + return False + + def _is_straight_winner(self, player_pieces): + """Checks if player_pieces contains a vertical or horizontal win.""" + run_lengths = [ + player_pieces[:, i:i + self.win_length].sum(axis=1) + for i in range(len(player_pieces) - self.win_length + 2) + ] + return max([x.max() for x in run_lengths]) >= self.win_length + + def __str__(self): + return str(self.np_pieces) + + +class Connect4Game(object): + """ + Connect4 Game class implementing the alpha-zero-general Game interface. + + Use 1 for player1 and -1 for player2. + """ + + def __init__(self, + height=None, + width=None, + win_length=None, + np_pieces=None): + self._base_board = Board(height, width, win_length, np_pieces) + + def getInitBoard(self): + """ + Returns: + startBoard: a representation of the board (ideally this is the form + that will be the input to your neural network) + """ + return self._base_board.np_pieces + + def getBoardSize(self): + """ + Returns: + (x,y): a tuple of board dimensions + """ + return (self._base_board.height, self._base_board.width) + + def getActionSize(self): + """ + Returns: + actionSize: number of all possible actions + """ + return self._base_board.width + + def getNextState(self, board, player, action): + """Returns a copy of the board with updated move, original board is unmodified. + + Input: + board: current board + player: current player (1 or -1) + action: action taken by current player + + Returns: + nextBoard: board after applying action + nextPlayer: player who plays in the next turn (should be -player) + + """ + b = self._base_board.with_np_pieces(np_pieces=np.copy(board)) + b.add_stone(action, player) + return b.np_pieces, -player + + def getValidMoves(self, board, player): + """Any zero value in top row in a valid move. + + Input: + board: current board + player: current player + + Returns: + validMoves: a binary vector of length self.getActionSize(), 1 for + moves that are valid from the current board and player, + 0 for invalid moves + """ + return self._base_board.with_np_pieces( + np_pieces=board).get_valid_moves() + + def getGameEnded(self, board, player): + """ + Input: + board: current board + player: current player (1 or -1) + + Returns: + r: 0 if game has not ended. 1 if player won, -1 if player lost, + small non-zero value for draw. + + """ + b = self._base_board.with_np_pieces(np_pieces=board) + winstate = b.get_win_state() + if winstate.is_ended: + if winstate.winner is None: + # draw has very little value. + return 1e-4 + elif winstate.winner == player: + return +1 + elif winstate.winner == -player: + return -1 + else: + raise ValueError('Unexpected winstate found: ', winstate) + else: + # 0 used to represent unfinished game. 
+ return 0 + + def getCanonicalForm(self, board, player): + """ + Input: + board: current board + player: current player (1 or -1) + + Returns: + canonicalBoard: returns canonical form of board. The canonical form + should be independent of player. For e.g. in chess, + the canonical form can be chosen to be from the pov + of white. When the player is white, we can return + board as is. When the player is black, we can invert + the colors and return the board. + """ + return board * player + + def getSymmetries(self, board, pi): + """Board is left/right board symmetric + + Input: + board: current board + pi: policy vector of size self.getActionSize() + + Returns: + symmForms: a list of [(board,pi)] where each tuple is a symmetrical + form of the board and the corresponding pi vector. This + is used when training the neural network from examples. + """ + return [(board, pi), + (np.array(board[:, ::-1], copy=True), + np.array(pi[::-1], copy=True))] + + def stringRepresentation(self, board): + """ + Input: + board: current board + + Returns: + boardString: a quick conversion of board to a string format. + Required by MCTS for hashing. + """ + return board.tostring() + + @staticmethod + def display(board): + print(" -----------------------") + print(' '.join(map(str, range(len(board[0]))))) + print(board) + print(" -----------------------") diff --git a/benchmark/torch/AlphaZero/connect4_model.py b/benchmark/torch/AlphaZero/connect4_model.py new file mode 100644 index 0000000000000000000000000000000000000000..6c0f7705bfc40d1645d77c79ac7e47f1f721a317 --- /dev/null +++ b/benchmark/torch/AlphaZero/connect4_model.py @@ -0,0 +1,86 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import parl + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + + +class Connect4Model(parl.Model): + def __init__(self, game, args): + # game params + self.board_x, self.board_y = game.getBoardSize() + self.action_size = game.getActionSize() + self.args = args + + super(Connect4Model, self).__init__() + self.conv1 = nn.Conv2d(1, args.num_channels, 3, stride=1, padding=1) + self.conv2 = nn.Conv2d( + args.num_channels, args.num_channels, 3, stride=1, padding=1) + self.conv3 = nn.Conv2d( + args.num_channels, args.num_channels, 3, stride=1) + self.conv4 = nn.Conv2d( + args.num_channels, args.num_channels, 3, stride=1) + + self.bn1 = nn.BatchNorm2d(args.num_channels) + self.bn2 = nn.BatchNorm2d(args.num_channels) + self.bn3 = nn.BatchNorm2d(args.num_channels) + self.bn4 = nn.BatchNorm2d(args.num_channels) + + self.fc1 = nn.Linear( + args.num_channels * (self.board_x - 4) * (self.board_y - 4), 128) + self.fc_bn1 = nn.BatchNorm1d(128) + + self.fc2 = nn.Linear(128, 64) + self.fc_bn2 = nn.BatchNorm1d(64) + + self.fc3 = nn.Linear(64, self.action_size) + + self.fc4 = nn.Linear(64, 1) + + def forward(self, s): + """ + Args: + s(torch.Tensor): batch_size x board_x x board_y + """ + # batch_size x 1 x board_x x board_y + s = s.view(-1, 1, self.board_x, self.board_y) + # batch_size x num_channels x board_x x board_y + s = F.relu(self.bn1(self.conv1(s))) + # batch_size x num_channels x board_x x board_y + s = F.relu(self.bn2(self.conv2(s))) + # batch_size x num_channels x (board_x-2) x (board_y-2) + s = F.relu(self.bn3(self.conv3(s))) + # batch_size x num_channels x (board_x-4) x (board_y-4) + s = F.relu(self.bn4(self.conv4(s))) + s = s.view( + -1, + self.args.num_channels * (self.board_x - 4) * (self.board_y - 4)) + + s = F.dropout( + F.relu(self.fc_bn1(self.fc1(s))), + p=self.args.dropout, + training=self.training) # batch_size x 128 + s = F.dropout( + F.relu(self.fc_bn2(self.fc2(s))), + p=self.args.dropout, + training=self.training) # batch_size x 64 + + pi = self.fc3(s) # batch_size x action_size + v = self.fc4(s) # batch_size x 1 + + return F.log_softmax(pi, dim=1), torch.tanh(v) diff --git a/benchmark/torch/AlphaZero/gen_submission.py b/benchmark/torch/AlphaZero/gen_submission.py new file mode 100644 index 0000000000000000000000000000000000000000..03728ec2cda4f155229ba7b4d18c7f2a22734e05 --- /dev/null +++ b/benchmark/torch/AlphaZero/gen_submission.py @@ -0,0 +1,40 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import base64 +import inspect +import os + +assert len(sys.argv) == 2, "please specify model path." 
+model_path = sys.argv[1] + +with open(model_path, 'rb') as f: + raw_bytes = f.read() + encoded_weights = base64.encodebytes(raw_bytes) + +# encode weights of model to byte string +submission_file = """ +import base64 +decoded = base64.b64decode({}) + +""".format(encoded_weights) + +# insert code snippet of loading weights +with open('submission_template.py', 'r') as f: + submission_file += ''.join(f.readlines()) + +# generate final submission file +with open('submission.py', 'w') as f: + f.write(submission_file) diff --git a/benchmark/torch/AlphaZero/main.py b/benchmark/torch/AlphaZero/main.py new file mode 100644 index 0000000000000000000000000000000000000000..433e2ff0efb35e6a39df53a845a25a8110b20993 --- /dev/null +++ b/benchmark/torch/AlphaZero/main.py @@ -0,0 +1,78 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from Coach import Coach +from connect4_game import Connect4Game +from utils import * + +from parl.utils import logger + +args = dotdict({ + # master address of xparl cluster + 'master_address': 'localhost:8010', + # number of remote actors (execute tasks [self-play/pitting/evaluate_test_dataset] in parallel). + 'actors_num': 25, + + # total number of iteration + 'numIters': 200, + # Number of complete self-play games to simulate during a new iteration. + 'numEps': 500, + # Number of games to play during arena (pitting) play to determine if new neural network will be accepted. + 'arenaCompare': 50, + # Number of games moves for MCTS to simulate. + 'numMCTSSims': 800, + # temp=1 (Temperature, τ (tau)) if episodeStep < tempThresholdStep, and thereafter uses temp=0. + 'tempThresholdStep': 15, + # During arena playoff, new neural net will be accepted if threshold or more of games are won. + 'updateThreshold': 0.6, + # CPUCT parameter + 'cpuct': 4, + # alpha parameter of dirichlet noise which is added to the policy (pi) + 'dirichletAlpha': 1.0, + # history of examples from numItersForTrainExamplesHistory latest iterations (training data) + 'numItersForTrainExamplesHistory': 20, + + # folder to save model and training examples + 'checkpoint': './saved_model/', + # whether to load saved model and training examples + 'load_model': False, + 'load_folder_file': ('./saved_model', 'checkpoint_1.pth.tar'), +}) + +# Plays arenaCompare games in which player1 starts arenaCompare/2 games and player2 starts arenaCompare/2 games. 
+assert args.arenaCompare % 2 == 0 + +# make sure the tasks can be split evenly among different remote actors +assert args.numEps % args.actors_num == 0 +assert (args.arenaCompare // 2) % args.actors_num == 0 +assert 1000 % args.actors_num == 0 # there are 1000 boards state in test_dataset + + +def main(): + game = Connect4Game() + + c = Coach(game, args) + + if args.load_model: + logger.info('Loading checkpoint {}...'.format(args.load_folder_file)) + c.loadModel() + logger.info("Loading 'trainExamples' from file {}...".format( + args.load_folder_file)) + c.loadTrainExamples() + + c.learn() + + +if __name__ == "__main__": + main() diff --git a/benchmark/torch/AlphaZero/submission_template.py b/benchmark/torch/AlphaZero/submission_template.py new file mode 100644 index 0000000000000000000000000000000000000000..864f6ce65ce5ed238498acb0e2ef99dbc9a697c2 --- /dev/null +++ b/benchmark/torch/AlphaZero/submission_template.py @@ -0,0 +1,559 @@ +# Third party code +# +# The following code are copied or modified from: +# https://github.com/suragnair/alpha-zero-general + +import os +os.environ['OMP_NUM_THREADS'] = "1" + + +# ===== utils.py ===== +class dotdict(dict): + def __getattr__(self, name): + return self[name] + + +# ===== MCTS.py ====== +import math +import time +import numpy as np + +EPS = 1e-8 + + +class MCTS(): + """ + This class handles the MCTS tree. + """ + + def __init__(self, game, nn_agent, args, dirichlet_noise=False): + self.game = game + self.nn_agent = nn_agent + self.args = args + self.dirichlet_noise = dirichlet_noise + self.Qsa = {} # stores Q values for s,a (as defined in the paper) + self.Nsa = {} # stores #times edge s,a was visited + self.Ns = {} # stores #times board s was visited + self.Ps = {} # stores initial policy (returned by neural net) + + self.Es = {} # stores game.getGameEnded ended for board s + self.Vs = {} # stores game.getValidMoves for board s + + def getActionProb(self, canonicalBoard, temp=1, timelimit=4.9): + """ + This function performs numMCTSSims simulations of MCTS starting from + canonicalBoard. + + Returns: + probs: a policy vector where the probability of the ith action is + proportional to Nsa[(s,a)]**(1./temp) + """ + dir_noise = self.dirichlet_noise + start_time = time.time() + while time.time() - start_time < timelimit: + self.search(canonicalBoard, dirichlet_noise=dir_noise) + + s = self.game.stringRepresentation(canonicalBoard) + counts = [ + self.Nsa[(s, a)] if (s, a) in self.Nsa else 0 + for a in range(self.game.getActionSize()) + ] + + if temp == 0: + bestAs = np.array(np.argwhere(counts == np.max(counts))).flatten() + bestA = np.random.choice(bestAs) + probs = [0] * len(counts) + probs[bestA] = 1 + return probs + + counts = [x**(1. / temp) for x in counts] + counts_sum = float(sum(counts)) + probs = [x / counts_sum for x in counts] + return probs + + def search(self, canonicalBoard, dirichlet_noise=False): + """ + This function performs one iteration of MCTS. It is recursively called + till a leaf node is found. The action chosen at each node is one that + has the maximum upper confidence bound as in the paper. + + Once a leaf node is found, the neural network is called to return an + initial policy P and a value v for the state. This value is propagated + up the search path. In case the leaf node is a terminal state, the + outcome is propagated up the search path. The values of Ns, Nsa, Qsa are + updated. + + NOTE: the return values are the negative of the value of the current + state. 
This is done since v is in [-1,1] and if v is the value of a + state for the current player, then its value is -v for the other player. + + Returns: + v: the negative of the value of the current canonicalBoard + """ + + s = self.game.stringRepresentation(canonicalBoard) + + if s not in self.Es: + self.Es[s] = self.game.getGameEnded(canonicalBoard, 1) + if self.Es[s] != 0: + # terminal node + return -self.Es[s] + + if s not in self.Ps: + # leaf node + self.Ps[s], v = self.nn_agent.predict(canonicalBoard) + + valids = self.game.getValidMoves(canonicalBoard, 1) + self.Ps[s] = self.Ps[s] * valids # masking invalid moves + if dirichlet_noise: + self.applyDirNoise(s, valids) + sum_Ps_s = np.sum(self.Ps[s]) + if sum_Ps_s > 0: + self.Ps[s] /= sum_Ps_s # renormalize + else: + # if all valid moves were masked make all valid moves equally probable + + # NB! All valid moves may be masked if either your NNet architecture is insufficient or you've get overfitting or something else. + # If you have got dozens or hundreds of these messages you should pay attention to your NNet and/or training process. + print("All valid moves were masked, doing a workaround.") + self.Ps[s] = self.Ps[s] + valids + self.Ps[s] /= np.sum(self.Ps[s]) + + self.Vs[s] = valids + self.Ns[s] = 0 + return -v + + valids = self.Vs[s] + if dirichlet_noise: + self.applyDirNoise(s, valids) + sum_Ps_s = np.sum(self.Ps[s]) + self.Ps[s] /= sum_Ps_s # renormalize + cur_best = -float('inf') + best_act = -1 + + # pick the action with the highest upper confidence bound + for a in range(self.game.getActionSize()): + if valids[a]: + if (s, a) in self.Qsa: + u = self.Qsa[ + (s, a)] + self.args.cpuct * self.Ps[s][a] * math.sqrt( + self.Ns[s]) / (1 + self.Nsa[(s, a)]) + else: + u = self.args.cpuct * self.Ps[s][a] * math.sqrt( + self.Ns[s] + EPS) # Q = 0 ? + + if u > cur_best: + cur_best = u + best_act = a + + a = best_act + next_s, next_player = self.game.getNextState(canonicalBoard, 1, a) + next_s = self.game.getCanonicalForm(next_s, next_player) + + v = self.search(next_s) + + if (s, a) in self.Qsa: + self.Qsa[(s, a)] = (self.Nsa[(s, a)] * self.Qsa[ + (s, a)] + v) / (self.Nsa[(s, a)] + 1) + self.Nsa[(s, a)] += 1 + + else: + self.Qsa[(s, a)] = v + self.Nsa[(s, a)] = 1 + + self.Ns[s] += 1 + return -v + + def applyDirNoise(self, s, valids): + dir_values = np.random.dirichlet( + [self.args.dirichletAlpha] * np.count_nonzero(valids)) + dir_idx = 0 + for idx in range(len(self.Ps[s])): + if self.Ps[s][idx]: + self.Ps[s][idx] = (0.75 * self.Ps[s][idx]) + ( + 0.25 * dir_values[dir_idx]) + dir_idx += 1 + + +# ===== connect4_game.py ====== +import numpy as np +from collections import namedtuple + +DEFAULT_HEIGHT = 6 +DEFAULT_WIDTH = 7 +DEFAULT_WIN_LENGTH = 4 + +WinState = namedtuple('WinState', 'is_ended winner') + + +class Board(): + """ + Connect4 Board. + """ + + def __init__(self, + height=None, + width=None, + win_length=None, + np_pieces=None): + "Set up initial board configuration." + self.height = height or DEFAULT_HEIGHT + self.width = width or DEFAULT_WIDTH + self.win_length = win_length or DEFAULT_WIN_LENGTH + + if np_pieces is None: + self.np_pieces = np.zeros([self.height, self.width], dtype=np.int) + else: + self.np_pieces = np_pieces + assert self.np_pieces.shape == (self.height, self.width) + + def add_stone(self, column, player): + "Create copy of board containing new stone." 
+ available_idx, = np.where(self.np_pieces[:, column] == 0) + if len(available_idx) == 0: + raise ValueError( + "Can't play column %s on board %s" % (column, self)) + + self.np_pieces[available_idx[-1]][column] = player + + def get_valid_moves(self): + "Any zero value in top row in a valid move" + return self.np_pieces[0] == 0 + + def get_win_state(self): + for player in [-1, 1]: + player_pieces = self.np_pieces == -player + # Check rows & columns for win + if (self._is_straight_winner(player_pieces) + or self._is_straight_winner(player_pieces.transpose()) + or self._is_diagonal_winner(player_pieces)): + return WinState(True, -player) + + # draw has very little value. + if not self.get_valid_moves().any(): + return WinState(True, None) + + # Game is not ended yet. + return WinState(False, None) + + def with_np_pieces(self, np_pieces): + """Create copy of board with specified pieces.""" + if np_pieces is None: + np_pieces = self.np_pieces + return Board(self.height, self.width, self.win_length, np_pieces) + + def _is_diagonal_winner(self, player_pieces): + """Checks if player_pieces contains a diagonal win.""" + win_length = self.win_length + for i in range(len(player_pieces) - win_length + 1): + for j in range(len(player_pieces[0]) - win_length + 1): + if all(player_pieces[i + x][j + x] for x in range(win_length)): + return True + for j in range(win_length - 1, len(player_pieces[0])): + if all(player_pieces[i + x][j - x] for x in range(win_length)): + return True + return False + + def _is_straight_winner(self, player_pieces): + """Checks if player_pieces contains a vertical or horizontal win.""" + run_lengths = [ + player_pieces[:, i:i + self.win_length].sum(axis=1) + for i in range(len(player_pieces) - self.win_length + 2) + ] + return max([x.max() for x in run_lengths]) >= self.win_length + + def __str__(self): + return str(self.np_pieces) + + +class Connect4Game(object): + """ + Connect4 Game class implementing the alpha-zero-general Game interface. + + Use 1 for player1 and -1 for player2. + """ + + def __init__(self, + height=None, + width=None, + win_length=None, + np_pieces=None): + self._base_board = Board(height, width, win_length, np_pieces) + + def getInitBoard(self): + """ + Returns: + startBoard: a representation of the board (ideally this is the form + that will be the input to your neural network) + """ + return self._base_board.np_pieces + + def getBoardSize(self): + """ + Returns: + (x,y): a tuple of board dimensions + """ + return (self._base_board.height, self._base_board.width) + + def getActionSize(self): + """ + Returns: + actionSize: number of all possible actions + """ + return self._base_board.width + + def getNextState(self, board, player, action): + """Returns a copy of the board with updated move, original board is unmodified. + + Input: + board: current board + player: current player (1 or -1) + action: action taken by current player + + Returns: + nextBoard: board after applying action + nextPlayer: player who plays in the next turn (should be -player) + + """ + b = self._base_board.with_np_pieces(np_pieces=np.copy(board)) + b.add_stone(action, player) + return b.np_pieces, -player + + def getValidMoves(self, board, player): + """Any zero value in top row in a valid move. 
+ + Input: + board: current board + player: current player + + Returns: + validMoves: a binary vector of length self.getActionSize(), 1 for + moves that are valid from the current board and player, + 0 for invalid moves + """ + return self._base_board.with_np_pieces( + np_pieces=board).get_valid_moves() + + def getGameEnded(self, board, player): + """ + Input: + board: current board + player: current player (1 or -1) + + Returns: + r: 0 if game has not ended. 1 if player won, -1 if player lost, + small non-zero value for draw. + + """ + b = self._base_board.with_np_pieces(np_pieces=board) + winstate = b.get_win_state() + if winstate.is_ended: + if winstate.winner is None: + # draw has very little value. + return 1e-4 + elif winstate.winner == player: + return +1 + elif winstate.winner == -player: + return -1 + else: + raise ValueError('Unexpected winstate found: ', winstate) + else: + # 0 used to represent unfinished game. + return 0 + + def getCanonicalForm(self, board, player): + """ + Input: + board: current board + player: current player (1 or -1) + + Returns: + canonicalBoard: returns canonical form of board. The canonical form + should be independent of player. For e.g. in chess, + the canonical form can be chosen to be from the pov + of white. When the player is white, we can return + board as is. When the player is black, we can invert + the colors and return the board. + """ + return board * player + + def getSymmetries(self, board, pi): + """Board is left/right board symmetric + + Input: + board: current board + pi: policy vector of size self.getActionSize() + + Returns: + symmForms: a list of [(board,pi)] where each tuple is a symmetrical + form of the board and the corresponding pi vector. This + is used when training the neural network from examples. + """ + return [(board, pi), + (np.array(board[:, ::-1], copy=True), + np.array(pi[::-1], copy=True))] + + def stringRepresentation(self, board): + """ + Input: + board: current board + + Returns: + boardString: a quick conversion of board to a string format. + Required by MCTS for hashing. 
+ """ + return board.tostring() + + @staticmethod + def display(board): + print(" -----------------------") + print(' '.join(map(str, range(len(board[0]))))) + print(board) + print(" -----------------------") + + +# ===== connect4_model ====== +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + + +#class Connect4Model(parl.Model): # Kaggle doesn't support parl package +class Connect4Model(nn.Module): + def __init__(self, game, args): + # game params + self.board_x, self.board_y = game.getBoardSize() + self.action_size = game.getActionSize() + self.args = args + + super(Connect4Model, self).__init__() + self.conv1 = nn.Conv2d(1, args.num_channels, 3, stride=1, padding=1) + self.conv2 = nn.Conv2d( + args.num_channels, args.num_channels, 3, stride=1, padding=1) + self.conv3 = nn.Conv2d( + args.num_channels, args.num_channels, 3, stride=1) + self.conv4 = nn.Conv2d( + args.num_channels, args.num_channels, 3, stride=1) + + self.bn1 = nn.BatchNorm2d(args.num_channels) + self.bn2 = nn.BatchNorm2d(args.num_channels) + self.bn3 = nn.BatchNorm2d(args.num_channels) + self.bn4 = nn.BatchNorm2d(args.num_channels) + + self.fc1 = nn.Linear( + args.num_channels * (self.board_x - 4) * (self.board_y - 4), 128) + self.fc_bn1 = nn.BatchNorm1d(128) + + self.fc2 = nn.Linear(128, 64) + self.fc_bn2 = nn.BatchNorm1d(64) + + self.fc3 = nn.Linear(64, self.action_size) + + self.fc4 = nn.Linear(64, 1) + + def forward(self, s): + # s: batch_size x board_x x board_y + s = s.view(-1, 1, self.board_x, + self.board_y) # batch_size x 1 x board_x x board_y + s = F.relu(self.bn1( + self.conv1(s))) # batch_size x num_channels x board_x x board_y + s = F.relu(self.bn2( + self.conv2(s))) # batch_size x num_channels x board_x x board_y + s = F.relu(self.bn3(self.conv3( + s))) # batch_size x num_channels x (board_x-2) x (board_y-2) + s = F.relu(self.bn4(self.conv4( + s))) # batch_size x num_channels x (board_x-4) x (board_y-4) + s = s.view( + -1, + self.args.num_channels * (self.board_x - 4) * (self.board_y - 4)) + + s = F.dropout( + F.relu(self.fc_bn1(self.fc1(s))), + p=self.args.dropout, + training=self.training) # batch_size x 128 + s = F.dropout( + F.relu(self.fc_bn2(self.fc2(s))), + p=self.args.dropout, + training=self.training) # batch_size x 64 + + pi = self.fc3(s) # batch_size x action_size + v = self.fc4(s) # batch_size x 1 + + return F.log_softmax(pi, dim=1), torch.tanh(v) + + +# ===== simple agent ====== +args = dotdict({ + 'dropout': 0.3, + 'num_channels': 64, +}) + + +class SimpleAgent(): + def __init__(self, game, cuda=True): + self.cuda = cuda and torch.cuda.is_available() + self.model = Connect4Model(game, args) + if self.cuda: + self.model.cuda() + + self.board_x, self.board_y = game.getBoardSize() + self.action_size = game.getActionSize() + + def predict(self, board): + """ + Args: + board (np.array): input board + + Return: + pi (np.array): probability of actions + v (np.array): estimated value of input + """ + # preparing input + board = torch.FloatTensor(board.astype(np.float64)) + if self.cuda: + board = board.contiguous().cuda() + board = board.view(1, self.board_x, self.board_y) + + self.model.eval() # eval mode + + with torch.no_grad(): + log_pi, v = self.model(board) + + pi = torch.exp(log_pi) + + return pi.data.cpu().numpy()[0], v.data.cpu().numpy()[0] + + def load_checkpoint(self, buffer): + map_location = None if self.cuda else 'cpu' + checkpoint = torch.load(buffer, map_location=map_location) + self.model.load_state_dict(checkpoint) + + +# ===== predict 
function ====== +import base64 +import io + +game = Connect4Game() + +# AlphaZero players +agent = SimpleAgent(game) +buffer = io.BytesIO(decoded) +agent.load_checkpoint(buffer) +mcts_args = dotdict({'numMCTSSims': 800, 'cpuct': 1.0}) +mcts = MCTS(game, agent, mcts_args) + + +def alphazero_agent(obs, config): + board = np.reshape(obs.board.copy(), game.getBoardSize()).astype(int) + board[np.where(board == 2)] = -1 + + player = 1 + if obs.mark == 2: + player = -1 + + x = game.getCanonicalForm(board, player) + + action = np.argmax( + mcts.getActionProb(x, temp=0, timelimit=config.timeout - 0.1)) + return int(action) diff --git a/benchmark/torch/AlphaZero/utils.py b/benchmark/torch/AlphaZero/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5ae500cdae19f002538c563b6cbae725c7b0d9af --- /dev/null +++ b/benchmark/torch/AlphaZero/utils.py @@ -0,0 +1,65 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class dotdict(dict): + def __getattr__(self, name): + try: + return self[name] + except KeyError: + raise AttributeError(name) + + +def win_loss_draw(score): + if score > 0: + return 'win' + if score < 0: + return 'loss' + return 'draw' + + +""" +split one list to multiple lists +""" +split_group = lambda the_list, group_size: zip(*(iter(the_list), ) * group_size) + +import numpy as np +import json +from connect4_game import Connect4Game + + +def get_test_dataset(): + game = Connect4Game() + test_dataset = [] + with open("refmoves1k_kaggle") as f: + for line in f: + data = json.loads(line) + + board = data["board"] + board = np.reshape(board, game.getBoardSize()).astype(int) + board[np.where(board == 2)] = -1 + + # find out how many moves are played to set the correct mark. + ply = len([x for x in data["board"] if x > 0]) + if ply & 1: + player = -1 + else: + player = 1 + + test_dataset.append({ + 'board': board, + 'player': player, + 'move_score': data['move score'], + }) + return test_dataset
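
As a quick way to see how the pieces added in this diff fit together, the following minimal sketch (not part of the diff itself) builds the Connect4 game, creates an agent with `create_agent`, wraps it in `MCTS`, and asks for a single greedy move on the empty board. It assumes the `benchmark/torch/AlphaZero` directory is the working directory; the checkpoint path in the comment is illustrative, and an untrained agent also runs (just with weak play).

```python
# Illustrative sketch: pick one MCTS-guided move with the modules from this diff.
import numpy as np

from connect4_game import Connect4Game
from alphazero_agent import create_agent
from MCTS import MCTS
from utils import dotdict

# Fewer simulations than the 800 used in training, just for a quick local check.
mcts_args = dotdict({
    'numMCTSSims': 50,
    'cpuct': 4,
    'dirichletAlpha': 1.0,
})

game = Connect4Game()
agent = create_agent(game, cuda=False)
# agent.restore('./saved_model/best.pth.tar')  # optional: load a trained checkpoint

mcts = MCTS(game, agent, mcts_args)
board = game.getInitBoard()
canonical = game.getCanonicalForm(board, 1)      # canonical form for player 1
pi = mcts.getActionProb(canonical, temp=0)       # one-hot vector over the 7 columns
action = int(np.argmax(pi))
print('chosen column:', action)
```

The same pattern (reset `MCTS`, call `getActionProb` with `temp=0`, take the argmax) is what `actor.py` uses for pitting and for scoring the validation set, and what `submission_template.py` uses at inference time on Kaggle, where the fixed simulation count is replaced by a wall-clock time limit.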