% random walk RL solution
% O. Krigolson, October 2005

clear all;
close all;
clc;

% set learning parameters

% the learning rate - in general when we update stateValues we say
% Vnew = Vold + PE * beta - this learning rate slows down learning
learningRate = 0.2;

% the TD discounting parameter - TD specifically has a PE of the form
% PE = reward + gamma*Value - Valueprevious; the impact of gamma is to
% ensure that stateValues are reduced the further one gets from reward -
% this is testable by setting gamma to different values
gamma = 0.95;

% the exploration rate for the eGreedy and eSoft methods - this is the
% proportion of the time you explore
explorationRate = 0.1;

% set decision method to 1 for Softmax decisions, to 2 for eGreedy, and to
% 3 for eSoft - see the functions for specific details

% set game parameters
numberOfTrials = 10000;
numberOfActors = 4;

% set output parameters
pathsPlotted = 1;
pathsToPlot = [1 10 50 100 500 1000];
nPathToPlot = length(pathsToPlot);

% initialize walk space
gridX = 20;
gridY = 20;
rewardX = 18; % randi([1 gridX],1,1);
rewardY = 18; % randi([1 gridY],1,1);

% will set up an actor even if it is not used (only used for method = 2)
stateValues = zeros(gridX,gridY);

% set the value of the reward square to 1
stateValues(rewardX,rewardY) = 1;

% set up actor values for the actor-critic method
actorValues = zeros(gridX,gridY,numberOfActors);

% create a space to remember paths
storedPathGrid = zeros(gridX,gridY,numberOfTrials);

% draw the path each trial
drawPath = 0;

% other variables
totalStepsTaken = [];
scatterDataX = [];
scatterDataY = [];

for trialCounter = 1:numberOfTrials

    % start in a fixed location each trial (switch to randi for a random start)
    currentX = 2; % randi([1 gridX],1,1);
    currentY = 2; % randi([1 gridY],1,1);

    % remember the starting position
    startX = currentX;
    startY = currentY;

    % variables to track the path
    currentPath = [];
    pathCounter = 2;
    currentPath(1,1) = currentX;
    currentPath(1,2) = currentY;

    % store the grid space
    pathGrid = zeros(gridX,gridY);

    % start a trial
    notFound = true;

    while notFound

        % establish we are not rewarded yet
        reward = 0;

        % remember where we used to be for after the move
        oldX = currentX;
        oldY = currentY;

        % create empty variables for actions and their values
        possibleActions = [];
        policyValues = [];

        % assign possible actions and their values, and do not add choices
        % that are not available (off-grid moves) for a random walk. nb,
        % another way to handle this would be to punish these moves with
        % large negative stateValues.
        % In this case, we grab the four actor values for the current
        % square (one per direction)
        if currentX ~= gridX
            possibleActions = [possibleActions 1];
            policyValues = [policyValues actorValues(currentX,currentY,1)];
        end
        if currentY ~= gridY
            possibleActions = [possibleActions 2];
            policyValues = [policyValues actorValues(currentX,currentY,2)];
        end
        if currentX ~= 1
            possibleActions = [possibleActions 3];
            policyValues = [policyValues actorValues(currentX,currentY,3)];
        end
        if currentY ~= 1
            possibleActions = [possibleActions 4];
            policyValues = [policyValues actorValues(currentX,currentY,4)];
        end

        % choose an action

        % determine the number of choices
        numberOfChoices = length(policyValues);

        % assign a position value to each choice
        possibleChoices = 1:1:numberOfChoices;

        % determine the best choice and control for ties
        [maxValue,choice] = max(policyValues);
        [tiesTrue,tieLocations] = find(policyValues == maxValue);
        if sum(tiesTrue) > 1
            % reassign the max location to one of the tied values
            choice = tieLocations(randi(length(tieLocations)));
        end

        % check to see if we go with the max choice or if we explore
        doIExplore = rand(1);
        if doIExplore <= explorationRate
            choice = randi(numberOfChoices);
        end

        % remember the old state value
        oldValue = stateValues(currentX,currentY);

        % figure out the actual move
        moveMade = possibleActions(choice);

        % move to the new state
        if moveMade == 1
            currentX = currentX + 1;
        end
        if moveMade == 2
            currentY = currentY + 1;
        end
        if moveMade == 3
            currentX = currentX - 1;
        end
        if moveMade == 4
            currentY = currentY - 1;
        end

        % actor-critic update - look up the value of the new state
        newValue = stateValues(currentX,currentY);

        % see if we have found the reward with our new choice
        if currentX == rewardX && currentY == rewardY
            reward = 1;
            notFound = false;
            stepsTaken(trialCounter) = pathCounter;
        end

        % TD update
        predictionError = (reward + gamma*newValue) - oldValue;

        % update the old state value
        stateValues(oldX,oldY) = stateValues(oldX,oldY) + learningRate*predictionError;

        % control stateValues so they do not explode
        if stateValues(oldX,oldY) < -1
            stateValues(oldX,oldY) = -1;
        end
        if stateValues(oldX,oldY) > 1
            stateValues(oldX,oldY) = 1;
        end

        % update the appropriate actor value for the actor-critic model
        actorValues(oldX,oldY,moveMade) = actorValues(oldX,oldY,moveMade) + learningRate*predictionError;

        % control actor values so they do not explode
        if actorValues(oldX,oldY,moveMade) < -1
            actorValues(oldX,oldY,moveMade) = -1;
        end
        if actorValues(oldX,oldY,moveMade) > 1
            actorValues(oldX,oldY,moveMade) = 1;
        end

        % some path bookkeeping for tracking learning and drawing
        currentPath(pathCounter,1) = currentX;
        currentPath(pathCounter,2) = currentY;
        pathCounter = pathCounter + 1;
        pathGrid(oldX,oldY) = 0.25;
        pathGrid(currentX,currentY) = 0.5;

        if drawPath == 1
            subplot(1,2,1);
            surf(pathGrid);
            view(2);
            subplot(1,2,2);
            surf(stateValues);
            view(2);
            drawnow;
        end

        storedPathGrid(:,:,trialCounter) = pathGrid;

    end

end

% output plots
emptyGrid = zeros(gridX,gridY);
emptyGrid(startX,startY) = 1;
emptyGrid(rewardX,rewardY) = 1;

subplot(3,3,5);
surf(emptyGrid);
view(2);
set(gca,'YDir','reverse');

subplot(3,3,1);
bar(stepsTaken);
title('Steps Taken');

subplot(3,3,3);
surf(stateValues);
view(2);
title('STATE VALUES');
caxis([-1 1]);
set(gca,'YDir','reverse');

subplot(3,3,6);
surf(actorValues(:,:,1));
title('EAST');
view(2);
caxis([-1 1]);
set(gca,'YDir','reverse');

subplot(3,3,8);
surf(actorValues(:,:,2));
title('SOUTH');
view(2);
caxis([-1 1]);
set(gca,'YDir','reverse');

subplot(3,3,4);
surf(actorValues(:,:,4));
title('WEST');
view(2);
caxis([-1 1]);
set(gca,'YDir','reverse');

subplot(3,3,2);
surf(actorValues(:,:,3));
title('NORTH');
view(2);
caxis([-1 1]);
set(gca,'YDir','reverse');

subplot(3,3,7);
surf(storedPathGrid(:,:,1));
view(2);
title('First Walk');
caxis([-1 1]);
set(gca,'YDir','reverse');

subplot(3,3,9);
surf(storedPathGrid(:,:,end));
view(2);
title('Last Walk');
caxis([-1 1]);
set(gca,'YDir','reverse');
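
% The decision-method comment near the top of the script refers to
% separate Softmax / eGreedy / eSoft selection functions that are not
% included here (the loop above picks the max actor value with inline
% epsilon-greedy exploration). Below is a minimal sketch of what a softmax
% chooser could look like; the function name softmaxChoice, the
% temperature argument, and the idea of calling it in place of the
% max/explore block (e.g. choice = softmaxChoice(policyValues, 0.1);) are
% illustrative assumptions, not part of the original model. Local
% functions at the end of a script require R2016b or later - on older
% versions, save this as its own softmaxChoice.m file.
function choice = softmaxChoice(policyValues, temperature)
    % convert action values into selection probabilities with the
    % Boltzmann (softmax) rule: p(a) = exp(Q(a)/T) / sum(exp(Q/T));
    % subtracting the max first keeps the exponentials numerically stable
    scaled = (policyValues - max(policyValues)) / temperature;
    probabilities = exp(scaled) ./ sum(exp(scaled));
    % sample one action index according to those probabilities
    cumulativeP = cumsum(probabilities);
    choice = find(rand(1) <= cumulativeP, 1, 'first');
end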