Source code for banditpylib.learners.mnl_bandit_learner.eps_greedy

from typing import Optional, List, Set

import numpy as np

from banditpylib.bandits import search_best_assortment, Reward, search, \
    local_search_best_assortment
from banditpylib.data_pb2 import Context, Actions, Feedback
from .utils import MNLBanditLearner


class EpsGreedy(MNLBanditLearner):
  r"""Epsilon-Greedy policy

  With probability :math:`\frac{\epsilon}{t}` do uniform sampling and with the
  remaining probability serve the assortment with the maximum empirical reward.

  :param np.ndarray revenues: product revenues
  :param Reward reward: reward the learner wants to maximize
  :param int card_limit: cardinality constraint
  :param bool use_local_search: whether to use local search for searching the
    best assortment
  :param int random_neighbors: number of random neighbors to look up if local
    search is enabled
  :param float eps: epsilon
  :param Optional[str] name: alias name
  """
  def __init__(self,
               revenues: np.ndarray,
               reward: Reward,
               card_limit: int = np.inf,  # type: ignore
               use_local_search: bool = False,
               random_neighbors: int = 10,
               eps: float = 1.0,
               name: Optional[str] = None):
    super().__init__(revenues=revenues,
                     reward=reward,
                     card_limit=card_limit,
                     use_local_search=use_local_search,
                     random_neighbors=random_neighbors,
                     name=name)
    if eps <= 0:
      raise ValueError('Epsilon is expected to be greater than 0. Got %.2f.' %
                       eps)
    self.__eps = eps

  def _name(self) -> str:
    return 'epsilon_greedy'
  def reset(self):
    # Current time step
    self.__time = 1
    # Current episode
    # self.__episode = 1
    # Number of episodes a product is served until the current episode
    # (exclusive)
    self.__serving_episodes = np.zeros(self.product_num + 1)
    # Number of times the customer chooses a product until the current time
    # (exclusive)
    self.__customer_choices = np.zeros(self.product_num + 1)
    self.__last_actions = None
    self.__last_customer_feedback = None
  def __em_preference_params(self) -> np.ndarray:
    """
    Returns:
      empirical estimate of preference parameters
    """
    # Unbiased estimate of preference parameters
    unbiased_est = self.__customer_choices / self.__serving_episodes
    # Products that have never been served get the default estimate 1
    unbiased_est[np.isnan(unbiased_est)] = 1
    unbiased_est = np.minimum(unbiased_est, 1)
    return unbiased_est

  def __select_ramdom_assort(self) -> Set[int]:
    assortments: List[Set[int]] = []
    search(assortments=assortments,
           product_num=self.product_num,
           next_product_id=1,
           assortment=set(),
           card_limit=self.card_limit)
    # Uniformly sample an assortment from all feasible assortments
    return assortments[int(np.random.randint(0, len(assortments)))]
  def actions(self, context: Context) -> Actions:
    del context

    actions = Actions()
    arm_pull = actions.arm_pulls.add()

    # Check if the last observation is a purchase. If so, the current episode
    # has not ended yet and the same assortment is served again.
    if self.__last_customer_feedback and self.__last_customer_feedback != 0:
      return self.__last_actions

    # When a non-purchase observation happens, a new episode is started and
    # a new assortment to be served is calculated

    # With probability eps / t, randomly select an assortment to serve
    if np.random.random() <= self.__eps / self.__time:
      arm_pull.arm.set.id.extend(list(self.__select_ramdom_assort()))
      arm_pull.times = 1
      # Remember the served assortment so it can be repeated within the episode
      self.__last_actions = actions
      return actions

    self.reward.set_preference_params(self.__em_preference_params())
    # Calculate assortment with the maximum reward using empirical
    # preference parameters
    if self.use_local_search:
      _, best_assortment = local_search_best_assortment(
          reward=self.reward,
          random_neighbors=self.random_neighbors,
          card_limit=self.card_limit,
          init_assortment=(set(self.__last_actions.arm_pulls[0].arm.set.id)
                           if self.__last_actions else None))
    else:
      _, best_assortment = search_best_assortment(reward=self.reward,
                                                  card_limit=self.card_limit)
    arm_pull.arm.set.id.extend(list(best_assortment))
    arm_pull.times = 1

    self.__last_actions = actions
    return actions
  def update(self, feedback: Feedback):
    arm_feedback = feedback.arm_feedbacks[0]
    self.__customer_choices[arm_feedback.customer_feedbacks[0]] += 1
    self.__last_customer_feedback = arm_feedback.customer_feedbacks[0]
    self.__time += 1
    # A no-purchase observation (product id 0) ends the current episode
    if arm_feedback.customer_feedbacks[0] == 0:
      for product_id in arm_feedback.arm.set.id:
        self.__serving_episodes[product_id] += 1
      # self.__episode += 1
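

A minimal usage sketch follows; it is not part of the module above. It instantiates the learner, requests an assortment, and feeds back one customer choice. It assumes that EpsGreedy is re-exported by the banditpylib.learners.mnl_bandit_learner package and that MeanReward from banditpylib.bandits is available as a plain expected-revenue objective; the Context and Feedback protobuf messages are built by hand here, mirroring only the fields this module reads, whereas in the library they are normally produced by the bandit environment and protocol.

import numpy as np

from banditpylib.bandits import MeanReward  # assumed reward objective
from banditpylib.data_pb2 import Context, Feedback
from banditpylib.learners.mnl_bandit_learner import EpsGreedy  # assumed re-export

# Index 0 is the no-purchase option and carries zero revenue.
revenues = np.array([0.0, 0.45, 0.8, 0.9, 1.0])
learner = EpsGreedy(revenues=revenues, reward=MeanReward(), card_limit=2)
learner.reset()

# At t = 1 the exploration probability eps / t equals 1, so the first
# assortment is sampled uniformly from all feasible assortments.
actions = learner.actions(Context())
served = list(actions.arm_pulls[0].arm.set.id)

# Hand-craft the feedback the learner expects: the customer picked the first
# served product (a value of 0 would mean no purchase and end the episode).
feedback = Feedback()
arm_feedback = feedback.arm_feedbacks.add()
arm_feedback.arm.set.id.extend(served)
arm_feedback.customer_feedbacks.append(served[0])
learner.update(feedback)

Subsequent calls to actions() keep returning the same assortment until a feedback of 0 (no purchase) arrives; only then does a new episode start, the per-product serving counts increase, and the empirical preference parameters get re-estimated.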