# Source code for banditpylib.bandits.mnl_bandit

import copy
from typing import Optional, Set

from absl import logging

import numpy as np

from banditpylib.data_pb2 import Context, Actions, Feedback, ArmPull, \
    ArmFeedback
from banditpylib.learners import Goal, MaximizeTotalRewards
from .mnl_bandit_utils import Reward, MeanReward, search_best_assortment
from .utils import Bandit


class MNLBandit(Bandit):
  r"""MNL bandit

  There are a total of :math:`N` products, where products are numbered from 1
  by default. During each time step :math:`t`, when an assortment :math:`S_t`
  which is a subset of products is served, the online customer will make a
  choice i.e., whether to buy a product or purchase nothing. The choice is
  modeled by

  .. math::
    \mathbb{P}(c_t = i) = \frac{v_i}{\sum_{i \in S_t \cup \{0\} } v_i}

  where 0 is reserved for non-purchase and :math:`v_0 = 1`. It is also assumed
  that preference parameters are within the range :math:`[0, 1]`. Suppose the
  rewards are :math:`(r_0, \dots, r_N)`, where :math:`r_0` is always 0. Let
  :math:`F(S)` be the cumulative function of the rewards when :math:`S` is
  served. Let :math:`U` be a quasiconvex function denoting the reward the
  learner wants to maximize. The regret is defined as

  .. math::
    T U(F(S^*)) - \sum_{t = 1}^T U(F(S_t))

  where :math:`S^*` is the optimal assortment.

  :param np.ndarray preference_params: preference parameters (product 0 should
    be included)
  :param np.ndarray revenues: revenues of products (product 0 should be
    included)
  :param int card_limit: cardinality constraint of an assortment meaning the
    total number of products provided at a time is no greater than this number
  :param Optional[Reward] reward: reward the learner wants to maximize. The
    default goal is mean of rewards
  :param bool zero_best_reward: whether to set the reward of the best
    assortment to 0. This is useful when data is too large to compute the best
    assortment. When best reward is set to zero, the regret equals to the
    minus total revenue.
  """
  def __init__(
      self,
      preference_params: np.ndarray,
      revenues: np.ndarray,
      card_limit: int = np.inf,  # type: ignore
      reward: Optional[Reward] = None,
      zero_best_reward: bool = False):
    # Validate that every product has both a preference parameter and a
    # revenue.
    if len(preference_params) != len(revenues):
      raise ValueError(
          'Number of preference parameters %d is expected equal to number of '
          'revenues %d.' % (len(preference_params), len(revenues)))
    for (i, param) in enumerate(preference_params):
      if param > 1 or param < 0:
        raise ValueError('The %d-th preference parameter is '
                         'expected within [0, 1].' % i)
    # Product 0 models non-purchase: its preference is fixed to 1 and its
    # revenue is fixed to 0.
    if preference_params[0] != 1:
      raise ValueError(
          'The preference parameter of product 0 is expected 1. Got %.2f.' %
          preference_params[0])
    for (i, revenue) in enumerate(revenues):
      if i > 0 and revenue <= 0:
        raise ValueError('The %d-th revenue is expected greater than 0.' % i)
    if revenues[0] != 0:
      raise ValueError('The revenue of product 0 is expected 0. Got %.2f.' %
                       revenues[0])

    self.__preference_params = preference_params
    self.__revenues = revenues
    # Product 0 is reserved for non-purchase
    self.__product_num = len(self.__preference_params) - 1
    if self.__product_num == 0:
      raise ValueError('Number of products is expected at least 1. Got 0.')
    if card_limit < 1:
      raise ValueError('Cardinality limit is expected at least 1. Got %d.' %
                       card_limit)
    # An assortment can never contain more products than exist.
    self.__card_limit = min(card_limit, self.__product_num)

    # Maximizing the rewards is the default goal
    self.__reward = MeanReward() if reward is None else copy.deepcopy(reward)
    self.__reward.set_preference_params(self.__preference_params)
    self.__reward.set_revenues(self.__revenues)

    self.__best_assort: Set[int]
    if zero_best_reward:
      # Skip the (potentially expensive) search for the best assortment.
      self.__best_reward, self.__best_assort = 0.0, set()
      logging.warning('Best reward is set to zero. Now the regret equals to the'
                      ' minus total revenue.')
    else:
      # Compute the best assortment
      self.__best_reward, self.__best_assort = search_best_assortment(
          reward=self.__reward, card_limit=self.__card_limit)
      logging.info('Assortment %s has best reward %.2f.',
                   sorted(list(self.__best_assort)), self.__best_reward)

  @property
  def name(self) -> str:
    """Bandit name"""
    return 'mnl_bandit'

  def _take_action(self, arm_pull: ArmPull) -> ArmFeedback:
    """Serve one assortment

    Args:
      arm_pull: assortment and number of serving times

    Returns:
      feedbacks of the customer
    """
    assortment = set(arm_pull.arm.set.id)
    times = arm_pull.times

    if not assortment:
      raise Exception('Empty assortment!')
    for product_id in assortment:
      if product_id < 1 or product_id > self.__product_num:
        raise Exception('Product id %d is out of range [1, %d]!' %
                        (product_id, self.__product_num))
    if len(assortment) > self.__card_limit:
      raise Exception('Assortment %s has products more than cardinality'
                      ' constraint %d!' %
                      (sorted(list(assortment)), self.__card_limit))

    # Normalizing constant of the MNL choice probabilities: the served
    # products plus the non-purchase option (product 0).
    preference_params_sum = sum(
        [self.__preference_params[product_id]
         for product_id in assortment]) + self.__preference_params[0]
    sorted_assort = sorted(list(assortment))
    # Index 0 of sample_prob is non-purchase; index i > 0 maps to
    # sorted_assort[i - 1].
    sample_prob = [self.__preference_params[0] / preference_params_sum] + \
        [self.__preference_params[product] / preference_params_sum
         for product in sorted_assort]
    sample_results = np.random.choice(len(sample_prob), times, p=sample_prob)
    choices = [
        0 if (sample == 0) else sorted_assort[sample - 1]
        for sample in sample_results
    ]

    arm_feedback = ArmFeedback()
    arm_feedback.arm.set.id.extend(list(assortment))
    arm_feedback.rewards.extend(
        np.array([self.__revenues[choice] for choice in choices]))
    arm_feedback.customer_feedbacks.extend(choices)
    # Update regret: per-serving gap between the best assortment's reward and
    # the served assortment's reward.
    self.__regret += (self.__best_reward -
                      self.__reward.calc(assortment)) * times
    return arm_feedback

  def feed(self, actions: Actions) -> Feedback:
    """Serve the assortments requested by the learner

    Args:
      actions: assortments to serve and their serving times

    Returns:
      aggregated customer feedbacks
    """
    feedback = Feedback()
    for arm_pull in actions.arm_pulls:
      arm_feedback = self._take_action(arm_pull=arm_pull)
      if arm_feedback.rewards:
        feedback.arm_feedbacks.append(arm_feedback)
    return feedback

  def reset(self):
    """Reset the bandit for a fresh simulation run"""
    self.__regret = 0.0

  @property
  def context(self) -> Context:
    """Current context of the bandit environment (always empty)"""
    return Context()

  @property
  def revenues(self) -> np.ndarray:
    """Revenues of products (product 0 is included, which is always 0.0)"""
    return self.__revenues

  @property
  def product_num(self) -> int:
    """Number of products (not including product 0)"""
    return self.__product_num

  @property
  def card_limit(self) -> float:
    """Cardinality limit"""
    return self.__card_limit

  def regret(self, goal: Goal) -> float:
    """Regret of the learner

    Args:
      goal: goal of the learner

    Returns:
      accumulated regret

    Raises:
      Exception: if the goal is not supported
    """
    if isinstance(goal, MaximizeTotalRewards):
      return self.__regret
    raise Exception('Goal %s is not supported!' % goal.name)