from typing import Optional

import math

import numpy as np

from banditpylib.arms import PseudoArm
from banditpylib.data_pb2 import Context, Actions, Feedback
from .utils import MABLearner

class Softmax(MABLearner):
r"""Softmax policy
At time :math:`t`, sample arm :math:`i` to play with sampling weight
.. math::
\exp\left( \bar{\mu}_i(t) / \gamma \right)
where :math:`\gamma` is a parameter to control how much exploration we want.
:param int arm_num: number of arms
:param float gamma: gamma
:param Optional[str] name: alias name
.. note::
When :math:`\gamma` approaches 0, the learner will have an increasing
probability to select the arm with the maximum empirical mean rewards. When
:math:`\gamma` approaches to infinity, the policy of the learner tends to
become uniform sampling.
"""
  def __init__(self,
               arm_num: int,
               gamma: float = 1.0,
               name: Optional[str] = None):
    super().__init__(arm_num=arm_num, name=name)
    if gamma <= 0:
      raise ValueError('Gamma is expected to be greater than 0. Got %.2f.' %
                       gamma)
    self.__gamma = gamma

  def _name(self) -> str:
    return 'softmax'

  def reset(self):
    self.__pseudo_arms = [PseudoArm() for _ in range(self.arm_num)]
    # Current time step
    self.__time = 1

  def actions(self, context: Context) -> Actions:
    del context
    actions = Actions()
    arm_pull = actions.arm_pulls.add()

    if self.__time <= self.arm_num:
      # Pull each arm once during the first `arm_num` time steps.
      arm_pull.arm.id = self.__time - 1
    else:
      # Boltzmann sampling over the empirical means. The maximum is
      # subtracted before exponentiation for numerical stability; this
      # leaves the resulting probabilities unchanged.
      em_means = [
          self.__pseudo_arms[arm_id].em_mean for arm_id in range(self.arm_num)
      ]
      max_mean = max(em_means)
      weights = np.array([
          math.exp((em_mean - max_mean) / self.__gamma)
          for em_mean in em_means
      ])
      arm_pull.arm.id = np.random.choice(
          self.arm_num, 1, p=weights / weights.sum())[0]
    arm_pull.times = 1

    return actions

  def update(self, feedback: Feedback):
    arm_feedback = feedback.arm_feedbacks[0]
    self.__pseudo_arms[arm_feedback.arm.id].update(
        np.array(arm_feedback.rewards))
    self.__time += 1
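

# ---------------------------------------------------------------------------
# A minimal usage sketch, not part of banditpylib itself. It relies only on
# the message fields already used above (`Actions.arm_pulls` with `arm.id`,
# and `Feedback.arm_feedbacks` with `arm.id` / `rewards`); the Bernoulli
# means below are made-up illustration values.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
  # Illustrate the note in the docstring: Boltzmann probabilities for fixed
  # empirical means under different temperatures. gamma -> 0 concentrates on
  # the best arm; gamma -> infinity approaches uniform sampling.
  mus = np.array([0.2, 0.5, 0.9])
  for gamma in (0.05, 1.0, 100.0):
    probs = np.exp(mus / gamma)
    probs /= probs.sum()
    print('gamma=%-6g probabilities=%s' % (gamma, np.round(probs, 3)))

  # Run the learner against three Bernoulli arms for 1000 rounds, building
  # the feedback protos by hand.
  means = [0.3, 0.5, 0.8]
  learner = Softmax(arm_num=len(means), gamma=0.1)
  learner.reset()
  total_reward = 0.0
  horizon = 1000
  for _ in range(horizon):
    actions = learner.actions(Context())
    arm_id = actions.arm_pulls[0].arm.id
    reward = float(np.random.binomial(1, means[arm_id]))
    total_reward += reward

    feedback = Feedback()
    arm_feedback = feedback.arm_feedbacks.add()
    arm_feedback.arm.id = arm_id
    arm_feedback.rewards.append(reward)
    learner.update(feedback)
  print('average reward over %d rounds: %.3f' %
        (horizon, total_reward / horizon))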