%% Script version of a previous function (banditE.m).
%
% Epsilon-greedy simulation of an N-trial, 10-armed bandit.
% This file can be used to create plots.

%% Initialization:
N = 100;            % Number of trials
Aq = randn(10,1);   % Mean reward for each of the machines.
E = 0.2;            % Epsilon for the epsilon-greedy algorithm

% Validate epsilon BEFORE it is used (the original checked only after the
% greedy schedule was already built, and never rejected negative values).
if E < 0 || E >= 1
    error('The epsilon should be between 0 and 1\n');
end

numbandits = length(Aq);        % Number of Bandits
ActNum = zeros(numbandits,1);   % Running count of the number of times
                                % each action is selected.
ActVal = zeros(numbandits,1);   % Running sum of the total reward
                                % obtained for each action.
Q  = zeros(1,numbandits);       % Current reward estimates
As = zeros(N,1);                % Storage for action chosen each trial
R  = zeros(N,1);                % Storage for reward received each trial

%*********************************************************************
% Set up a flag so we know when to choose at random (using epsilon)
%*********************************************************************
greedy = zeros(1,N);
if E > 0
    m = round(E*N);             % Total number of times we choose at random
    greedy(1:m) = ones(1,m);
    m = randperm(N);            % Scatter the exploratory trials uniformly
    greedy = greedy(m);
    clear m
end

%********************************************************************
%
% Now we're ready for the main loop
%********************************************************************
for j = 1:N
    % STEP ONE: SELECT AN ACTION (cQ), GET THE REWARD (cR)!
    if greedy(j) > 0
        cQ = randi(numbandits);     % Explore: pick a machine uniformly
        cR = randn + Aq(cQ);
    else
        idx = find(Q == max(Q));    % All machines tied for best estimate
        m = randi(length(idx));     % Choose a max at random (tie-break)
        cQ = idx(m);
        cR = randn + Aq(cQ);
    end

    R(j) = cR;

    % UPDATE FOR NEXT GO AROUND!
    As(j) = cQ;
    ActNum(cQ) = ActNum(cQ) + 1;
    ActVal(cQ) = ActVal(cQ) + cR;
    Q(cQ) = ActVal(cQ)/ActNum(cQ);  % Sample-average reward estimate

    % Plot of current estimates against the true means
    plot(1:numbandits, Aq, 1:numbandits, Q);
    pause(0.2);
end

%% As, Q, R are what we have computed.
% As = Which machine was played.
% Q  = Reward estimates (Recall Aq are the actual means)
% R  = Vector of payouts.
plot(R)