%% Script file to run the N-armed bandit using the softmax strategy
%
% This version shows a little movie of the convergence of the estimated
% payouts.

% Start from an empty workspace and a clean command window.
clear
clc

% ---- Problem setup ------------------------------------------------------
NumMachines = 20;                 % how many bandit arms (machines) exist
ActQ = randn(NumMachines, 1);     % true mean payout of each machine, drawn
                                  % once from a standard normal
NumPlay = 100;                    % total number of pulls in this run

% ---- Temperature schedule -----------------------------------------------
% tau is the softmax "temperature": a high value makes the choice of
% machine close to uniform (exploration); a low value concentrates the
% choice on the machine with the best current estimate (exploitation).
% The run starts at Initialtau and is cooled toward Endingtau.
Initialtau = 10;
Endingtau  = 0.5;
tau        = Initialtau;

% ---- Running statistics (one row per machine unless noted) ---------------
NumPlayed     = zeros(NumMachines, 1);  % how many times each machine was chosen
ValPlayed     = zeros(NumMachines, 1);  % summed reward collected from each machine
EstQ          = zeros(NumMachines, 1);  % current estimate of each machine's payout
PayoffHistory = zeros(NumPlay, 1);      % reward received on each play (one row per play)

%% Main loop below!

% Quantities that do not change from one play to the next.
machineAxis = 1:NumMachines;          % x-coordinates for the plot
tauRatio    = Endingtau/Initialtau;   % total cooling factor over the run

for play = 1:NumPlay
    % Choose a machine from the current estimates at the current
    % temperature. softmax is not defined in this file; it is presumably a
    % project-local softmax.m returning the index of the chosen machine
    % (the result is used below as an index) -- confirm against that file.
    arm = softmax(EstQ, tau);

    % Pull the arm: the reward is the machine's true payout plus
    % standard-normal noise.
    reward = randn + ActQ(arm);
    PayoffHistory(play) = reward;

    % Fold the reward into the running totals and refresh the
    % sample-average estimate for the machine just played.
    NumPlayed(arm) = NumPlayed(arm) + 1;
    ValPlayed(arm) = ValPlayed(arm) + reward;
    EstQ(arm)      = ValPlayed(arm)/NumPlayed(arm);

    % Cool the temperature for the next play: geometric interpolation
    % from Initialtau (exponent 0) toward Endingtau (exponent 1).
    tau = Initialtau*tauRatio^(play/NumPlay);

    % One movie frame: true payouts in black, current estimates in red.
    plot(machineAxis, ActQ, 'k', machineAxis, EstQ, 'r')
    pause(0.2);
end

%% Take a look at the results
% Index of the machine whose true payout is largest (bestPayout holds that
% payout and is not used further).
[bestPayout, winningmachine] = max(ActQ);

% Both statements below are left unterminated on purpose so that MATLAB
% echoes them: first the best machine's index, then the per-machine play
% counts laid out as a row for comparison.
winningmachine
NumPlayed.'