% random walk RL solution
% O. Krigolson, October 2005

clear all;
close all;
clc;

% set learning parameters

% the learning rate - in general when we update stateValues we say
% Vnew = Vold + PE * beta - this learning rate slows down learning
learningRate = 0.2;

% the TD discounting parameter - TD specifically has a PE of the form
% PE = reward + gamma*Value - Valueprevious; the impact of gamma is to
% ensure that stateValues are reduced the further one gets from reward -
% this is testable by setting gamma to different values
gamma = 0.95;

% the exploration rate for the eGreedy and eSoft methods - this is the
% proportion of the time you explore
explorationRate = 0.1;

% set decision method to 1 for Softmax decisions, to 2 for eGreedy, and to
% 3 for eSoft - see the functions for specific details

% set game parameters
numberOfTrials = 10000;
numberOfActors = 4;

% set output parameters
pathsPlotted = 1;
pathsToPlot = [1 10 50 100 500 1000];
nPathToPlot = length(pathsToPlot);

% initialize walk space
gridX = 20;
gridY = 20;
rewardX = 18; % randi([1 gridX],1,1);
rewardY = 18; % randi([1 gridY],1,1);

% will set up an actor even if it is not used (only used for method = 2)
stateValues = zeros(gridX,gridY);

% set the value of the reward square to 1
stateValues(rewardX,rewardY) = 1;

% set up actor values for the actor-critic method
actorValues = zeros(gridX,gridY,numberOfActors);

% create a space to remember paths
storedPathGrid = zeros(gridX,gridY,numberOfTrials);

% draw the path each trial
drawPath = 0;

% other variables
totalStepsTaken = [];
scatterDataX = [];
scatterDataY = [];

for trialCounter = 1:numberOfTrials

    % start in a fixed location each trial (switch to randi for a random start)
    currentX = 2; % randi([1 gridX],1,1);
    currentY = 2; % randi([1 gridY],1,1);

    % remember the starting position
    startX = currentX;
    startY = currentY;

    % variables to track the path
    currentPath = [];
    pathCounter = 2;
    currentPath(1,1) = currentX;
    currentPath(1,2) = currentY;

    % store the grid space
    pathGrid = zeros(gridX,gridY);

    % start a trial
    notFound = true;

    while notFound

        % establish we are not rewarded yet
        reward = 0;

        % remember where we used to be for after the move
        oldX = currentX;
        oldY = currentY;

        % create empty variables for actions and their values
        possibleActions = [];
        policyValues = [];

        % assign possible actions and their values, and do not add choices
        % that are not available (off-grid moves) for a random walk. nb,
        % another way to handle this would be to punish these moves with
        % large negative stateValues.
        % In this case, we grab the four actor values for the current
        % square (one per direction)
        if currentX ~= gridX
            possibleActions = [possibleActions 1];
            policyValues = [policyValues actorValues(currentX,currentY,1)];
        end
        if currentY ~= gridY
            possibleActions = [possibleActions 2];
            policyValues = [policyValues actorValues(currentX,currentY,2)];
        end
        if currentX ~= 1
            possibleActions = [possibleActions 3];
            policyValues = [policyValues actorValues(currentX,currentY,3)];
        end
        if currentY ~= 1
            possibleActions = [possibleActions 4];
            policyValues = [policyValues actorValues(currentX,currentY,4)];
        end

        % choose an action

        % determine the number of choices
        numberOfChoices = length(policyValues);

        % assign a position value to each choice
        possibleChoices = 1:1:numberOfChoices;

        % determine the best choice and control for ties
        [maxValue,choice] = max(policyValues);
        [tiesTrue,tieLocations] = find(policyValues == maxValue);
        if sum(tiesTrue) > 1
            % reassign the max location to one of the tied values
            choice = tieLocations(randi(length(tieLocations)));
        end

        % check to see if we go with the max choice or if we explore
        doIExplore = rand(1);
        if doIExplore <= explorationRate
            choice = randi(numberOfChoices);
        end

        % remember the old state value
        oldValue = stateValues(currentX,currentY);

        % figure out the actual move
        moveMade = possibleActions(choice);

        % move to the new state
        if moveMade == 1
            currentX = currentX + 1;
        end
        if moveMade == 2
            currentY = currentY + 1;
        end
        if moveMade == 3
            currentX = currentX - 1;
        end
        if moveMade == 4
            currentY = currentY - 1;
        end

        % actor-critic update - look up the value of the new state
        newValue = stateValues(currentX,currentY);

        % see if we have found the reward with our new choice
        if currentX == rewardX && currentY == rewardY
            reward = 1;
            notFound = false;
            stepsTaken(trialCounter) = pathCounter;
        end

        % TD update
        predictionError = (reward + gamma*newValue) - oldValue;

        % update the old state value
        stateValues(oldX,oldY) = stateValues(oldX,oldY) + learningRate*predictionError;

        % control stateValues so they do not explode
        if stateValues(oldX,oldY) < -1
            stateValues(oldX,oldY) = -1;
        end
        if stateValues(oldX,oldY) > 1
            stateValues(oldX,oldY) = 1;
        end

        % update the appropriate actor value for the actor-critic model
        actorValues(oldX,oldY,moveMade) = actorValues(oldX,oldY,moveMade) + learningRate*predictionError;

        % control actor values so they do not explode
        if actorValues(oldX,oldY,moveMade) < -1
            actorValues(oldX,oldY,moveMade) = -1;
        end
        if actorValues(oldX,oldY,moveMade) > 1
            actorValues(oldX,oldY,moveMade) = 1;
        end

        % some path bookkeeping for tracking learning and drawing
        currentPath(pathCounter,1) = currentX;
        currentPath(pathCounter,2) = currentY;
        pathCounter = pathCounter + 1;
        pathGrid(oldX,oldY) = 0.25;
        pathGrid(currentX,currentY) = 0.5;

        if drawPath == 1
            subplot(1,2,1);
            surf(pathGrid);
            view(2);
            subplot(1,2,2);
            surf(stateValues);
            view(2);
            drawnow;
        end

        storedPathGrid(:,:,trialCounter) = pathGrid;

    end

end

% output plots
emptyGrid = zeros(gridX,gridY);
emptyGrid(startX,startY) = 1;
emptyGrid(rewardX,rewardY) = 1;

subplot(3,3,5);
surf(emptyGrid);
view(2);
set(gca,'YDir','reverse');

subplot(3,3,1);
bar(stepsTaken);
title('Steps Taken');

subplot(3,3,3);
surf(stateValues);
view(2);
title('STATE VALUES');
caxis([-1 1]);
set(gca,'YDir','reverse');

subplot(3,3,6);
surf(actorValues(:,:,1));
title('EAST');
view(2);
caxis([-1 1]);
set(gca,'YDir','reverse');

subplot(3,3,8);
surf(actorValues(:,:,2));
title('SOUTH');
view(2);
caxis([-1 1]);
set(gca,'YDir','reverse');

subplot(3,3,4);
surf(actorValues(:,:,4));
title('WEST');
view(2);
caxis([-1 1]);
set(gca,'YDir','reverse');

subplot(3,3,2);
surf(actorValues(:,:,3));
title('NORTH');
view(2);
caxis([-1 1]);
set(gca,'YDir','reverse');

subplot(3,3,7);
surf(storedPathGrid(:,:,1));
view(2);
title('First Walk');
caxis([-1 1]);
set(gca,'YDir','reverse');

subplot(3,3,9);
surf(storedPathGrid(:,:,end));
view(2);
title('Last Walk');
caxis([-1 1]);
set(gca,'YDir','reverse');
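
% The decision-method comment near the top of the script refers to
% separate Softmax / eGreedy / eSoft selection functions that are not
% included here (the loop above picks the max actor value with inline
% epsilon-greedy exploration). Below is a minimal sketch of what a softmax
% chooser could look like; the function name softmaxChoice, the
% temperature argument, and the idea of calling it in place of the
% max/explore block (e.g. choice = softmaxChoice(policyValues, 0.1);) are
% illustrative assumptions, not part of the original model. Local
% functions at the end of a script require R2016b or later - on older
% versions, save this as its own softmaxChoice.m file.
function choice = softmaxChoice(policyValues, temperature)
    % convert action values into selection probabilities with the
    % Boltzmann (softmax) rule: p(a) = exp(Q(a)/T) / sum(exp(Q/T));
    % subtracting the max first keeps the exponentials numerically stable
    scaled = (policyValues - max(policyValues)) / temperature;
    probabilities = exp(scaled) ./ sum(exp(scaled));
    % sample one action index according to those probabilities
    cumulativeP = cumsum(probabilities);
    choice = find(rand(1) <= cumulativeP, 1, 'first');
end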