Source code for banditpylib.learners.mnl_bandit_learner.ts

from typing import Optional

from absl import logging
import numpy as np

from banditpylib.bandits import search_best_assortment, Reward, \
    local_search_best_assortment
from banditpylib.data_pb2 import Context, Actions, Feedback
from .utils import MNLBanditLearner


class ThompsonSampling(MNLBanditLearner):
  """Thompson sampling policy :cite:`DBLP:conf/colt/AgrawalAGZ17`

  :param np.ndarray revenues: product revenues
  :param int horizon: total number of time steps
  :param Reward reward: reward the learner wants to maximize
  :param int card_limit: cardinality constraint
  :param bool use_local_search: whether to use local search for searching the
    best assortment
  :param int random_neighbors: number of random neighbors to look up if local
    search is enabled
  :param Optional[str] name: alias name
  """
  def __init__(self,
               revenues: np.ndarray,
               horizon: int,
               reward: Reward,
               card_limit: int = np.inf,  # type: ignore
               use_local_search: bool = False,
               random_neighbors: int = 10,
               name: Optional[str] = None):
    super().__init__(revenues=revenues,
                     reward=reward,
                     card_limit=card_limit,
                     use_local_search=use_local_search,
                     random_neighbors=random_neighbors,
                     name=name)
    if horizon < self.product_num:
      logging.warning('Horizon %d is less than number of products %d!' %
                      (horizon, self.product_num))
    self.__horizon = horizon

  def _name(self) -> str:
    """
    Returns:
      default learner name
    """
    return 'thompson_sampling'

  def reset(self):
    # Current time step
    # self.__time = 1
    # Current episode
    # self.__episode = 1
    # Number of episodes a product is served until the current episode
    # (exclusive)
    self.__serving_episodes = np.zeros(self.product_num + 1)
    # Number of times a product is picked until the current time (exclusive)
    self.__customer_choices = np.zeros(self.product_num + 1)
    self.__last_actions = None
    self.__last_customer_feedback = None
    # Flag to denote whether the initial warm start stage has finished
    self.__done_warm_start = False
    # Next product to try in the warm start stage
    self.__next_product_in_warm_start = 1

  def __warm_start(self) -> Actions:
    """Initial warm start stage

    Returns:
      assortments to serve in the warm start stage
    """
    # Check if the last observation is a purchase
    if self.__last_customer_feedback and self.__last_customer_feedback != 0:
      # Continue serving the same assortment
      return self.__last_actions
    actions = Actions()
    arm_pull = actions.arm_pulls.add()
    arm_pull.arm.set.id.append(self.__next_product_in_warm_start)
    arm_pull.times = 1
    self.__next_product_in_warm_start += 1
    return actions

  def __within_warm_start(self) -> bool:
    """
    Returns:
      `True` if the learner is still in the warm start stage
    """
    return not self.__done_warm_start

  def __correlated_sampling(self) -> np.ndarray:
    """
    Returns:
      correlated sampling of preference parameters
    """
    theta = np.max(np.random.normal(0, 1, self.card_limit))
    # Unbiased estimate of preference parameters
    unbiased_est = self.__customer_choices / self.__serving_episodes
    sampled_preference_params = unbiased_est + theta * (
        np.sqrt(50 * unbiased_est * (unbiased_est + 1) /
                self.__serving_episodes) +
        75 * np.sqrt(np.log(self.__horizon * self.card_limit)) /
        self.__serving_episodes)
    sampled_preference_params[0] = 1
    sampled_preference_params = np.minimum(sampled_preference_params, 1)
    return sampled_preference_params

  def actions(self, context: Context) -> Actions:
    del context
    actions: Actions
    # Check if still in warm start stage
    if self.__within_warm_start():
      actions = self.__warm_start()
    else:
      actions = Actions()
      arm_pull = actions.arm_pulls.add()

      # Check if the last observation is a purchase
      if self.__last_customer_feedback and self.__last_customer_feedback != 0:
        # Continue serving the same assortment
        return self.__last_actions

      # When a non-purchase observation happens, a new episode starts and a
      # new assortment is generated from a fresh estimate of the preference
      # parameters.

      # Set preference parameters generated by Thompson sampling
      self.reward.set_preference_params(self.__correlated_sampling())
      # Calculate the best assortment using the generated preference
      # parameters
      if self.use_local_search:
        # Initial assortment to start local search from
        if self.__last_actions is not None:
          init_assortment = set(self.__last_actions.arm_pulls[0].arm.set.id)
        else:
          init_assortment = None
        _, best_assortment = local_search_best_assortment(
            reward=self.reward,
            random_neighbors=self.random_neighbors,
            card_limit=self.card_limit,
            init_assortment=init_assortment)
      else:
        _, best_assortment = search_best_assortment(
            reward=self.reward, card_limit=self.card_limit)

      arm_pull.arm.set.id.extend(list(best_assortment))
      arm_pull.times = 1
      # self.__first_step_after_warm_start = False

    self.__last_actions = actions
    return actions

  def update(self, feedback: Feedback):
    arm_feedback = feedback.arm_feedbacks[0]
    self.__customer_choices[arm_feedback.customer_feedbacks[0]] += 1
    # No purchase is observed
    if arm_feedback.customer_feedbacks[0] == 0:
      for product_id in self.__last_actions.arm_pulls[0].arm.set.id:
        self.__serving_episodes[product_id] += 1
      # Check if it is the end of the initial warm start stage
      if not self.__done_warm_start and \
          self.__next_product_in_warm_start > self.product_num:
        self.__done_warm_start = True
        self.__last_actions = None
      # self.__episode += 1
    self.__last_customer_feedback = arm_feedback.customer_feedbacks[0]
    # self.__time += 1
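
A minimal usage sketch (not part of the module above): it assumes that MeanReward is a concrete Reward implementation exported from banditpylib.bandits and that ThompsonSampling is re-exported from banditpylib.learners.mnl_bandit_learner; the exact names and constructor signatures may differ across versions of the library.

import numpy as np

from banditpylib.bandits import MeanReward  # assumed concrete Reward subclass
from banditpylib.learners.mnl_bandit_learner import ThompsonSampling

# Index 0 conventionally denotes the "no purchase" option with zero revenue.
revenues = np.array([0.0, 0.7, 0.8, 0.9, 1.0])

learner = ThompsonSampling(revenues=revenues,
                           horizon=10000,
                           reward=MeanReward(),
                           card_limit=3)
learner.reset()
# The learner is then driven by alternating calls to actions() and update(),
# typically inside a simulation protocol supplied by the library.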