"""
Definition of the Liar's dice environment.
"""
import gymnasium as gym
import numpy as np
from pantheonrl.common.agents import Agent
from pantheonrl.common.multiagentenv import TurnBasedEnv
N = 6  # Number of sides per die
M = 6  # Number of dice per player

WIN = (1, -1)   # (ego, alt) reward tuple when the ego player wins
LOSE = (-1, 1)  # (ego, alt) reward tuple when the ego player loses

MAX_MOVES = 2 * M  # upper bound on the number of claims in one game
BLUFF = [N, 2 * M - 1]  # sentinel action: call the previous claim a bluff
DEFAULT = [N, 0]  # padding entry for unused history slots

ACTION_SPACE = gym.spaces.MultiDiscrete([N + 1, 2 * M])
OBS_SPACE = gym.spaces.MultiDiscrete([M + 1] * N + [N + 1, 2 * M] * MAX_MOVES)
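
# Encoding note (inferred from _sanitize_action and _eval_bluff below): an
# action is a pair (side, count). Sides 0..N-1 name a die face, while side N
# means "call bluff". The count field appears to be zero-indexed, so a claim
# (side, c) asserts that at least c + 1 dice of that face are in play across
# both hands.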


def _rand_roll():
    """Roll M dice and return a histogram of face counts."""
    dice = [np.random.randint(N) for _ in range(M)]
    return [dice.count(i) for i in range(N)]
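
# For example, with M = N = 6, a roll of faces [3, 0, 5, 3, 3, 1] is returned
# as the per-face histogram [1, 1, 0, 3, 0, 1].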


class LiarDefaultAgent(Agent):
    """The default Liar's dice agent."""
    def get_action(self, obs):
        obs = obs.obs.tolist()
        hand = obs[:N]  # face counts in our own hand
        best_count = max(hand)
        best_side = hand.index(best_count)
        # obs[N] and obs[N + 1] hold the most recent claim (side, count);
        # side == N means no claim has been made yet. Call a bluff whenever
        # the standing count exceeds the best count in our own hand.
        if obs[N] != N and obs[N + 1] > best_count:
            return np.array(BLUFF)
        return np.array([best_side, best_count])

    def update(self, reward, done):
        pass
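

# For example, holding the hand [1, 1, 0, 3, 0, 1] with no standing claim,
# the default agent opens with the bid [3, 3]: face 3 with zero-indexed
# count 3, i.e. (under the encoding noted above) at least four threes in play.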


class LiarEnv(TurnBasedEnv):
    """
    Definition of the Liar's dice environment.

    The observation is the player's current hand followed by the claim
    history. The valid actions are to call a bluff or to claim that at least
    a certain number of dice of a given face are in play across both hands.
    """

    def __init__(self, probegostart=0.5):
        super().__init__(
            [OBS_SPACE] * 2, [ACTION_SPACE] * 2, probegostart=probegostart
        )
        self.history = []  # flat list of past claims, newest first
        self.egohand = None
        self.althand = None

    def _get_obs(self, isego):
        # Pad the claim history with DEFAULT entries out to MAX_MOVES slots.
        prevmoves = self.history + DEFAULT * (
            MAX_MOVES - len(self.history) // 2
        )
        return np.array((self.egohand if isego else self.althand) + prevmoves)
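
    # For example, the hand [1, 1, 0, 3, 0, 1] with history [2, 5, 0, 3]
    # (claim (2, 5) made after claim (0, 3)) flattens to the observation
    # [1, 1, 0, 3, 0, 1, 2, 5, 0, 3, 6, 0, 6, 0, ...], padded with DEFAULT
    # out to N + 2 * MAX_MOVES = 30 entries.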

    def _sanitize_action(self, action):
        # A claim that fails to raise the count, or an explicit side of N,
        # counts as calling a bluff.
        if len(self.history) != 0 and (
            action[1] <= self.history[1] or action[0] == N
        ):
            return BLUFF
        # Calling a bluff before any claim exists becomes the minimal claim.
        if len(self.history) == 0 and action[0] == N:
            return [0, 0]
        return action.tolist()
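
    # For example, if the standing claim is [2, 4], a new action [5, 3] is
    # coerced to BLUFF because its count of 3 does not exceed 4.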

    def _eval_bluff(self):
        """Return True if the most recent claim overstates the true count."""
        if len(self.history) == 0:
            return False
        side = self.history[0]
        # Counts are zero-indexed: a claim of c asserts at least c + 1 dice,
        # hence the - 1 here.
        trueans = self.egohand[side] + self.althand[side] - 1
        return self.history[1] > trueans

    def _player_step(self, action, isego):
        action = self._sanitize_action(action)
        if action == BLUFF:
            # didwin is from the ego player's perspective: the bluff caller
            # wins iff the standing claim really was a lie.
            didwin = self._eval_bluff() == isego
            return self._get_obs(not isego), WIN if didwin else LOSE, True, {}
        self.history = action + self.history  # prepend: newest claim first
        return self._get_obs(not isego), (0, 0), False, {}

    def ego_step(self, action):
        return self._player_step(action, True)

    def alt_step(self, action):
        return self._player_step(action, False)

    def multi_reset(self, egofirst):
        self.history = []
        self.egohand = _rand_roll()
        self.althand = _rand_roll()
        return self._get_obs(egofirst)
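

# --- Minimal manual-rollout sketch (illustrative only) ----------------------
# In normal PantheonRL use, TurnBasedEnv's public reset/step loop drives
# ego_step/alt_step and wraps observations in its Observation type; the
# SimpleNamespace below is a hypothetical stand-in exposing only the .obs
# attribute that LiarDefaultAgent.get_action unwraps.
if __name__ == "__main__":
    import random
    from types import SimpleNamespace

    env = LiarEnv()
    agent = LiarDefaultAgent()  # both seats use the default policy here
    ego_turn = random.random() < 0.5  # stand-in for the probegostart draw
    obs = env.multi_reset(ego_turn)
    done = False
    while not done:
        action = agent.get_action(SimpleNamespace(obs=obs))
        step_fn = env.ego_step if ego_turn else env.alt_step
        obs, rewards, done, _ = step_fn(action)
        ego_turn = not ego_turn
    print("episode rewards (ego, alt):", rewards)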