What is the MATLAB code for the Q-learning algorithm?

Oussama Aoun Popular answer

function qlearning
% QLEARNING  Tabular Q-learning demo on a small one-dimensional state chain.
%
%   States are the values 0..5 and the actions are a step of -1 or +1.
%   The first and last states are treated as terminals: whenever the agent
%   lands on one, the next iteration restarts from a uniformly random
%   interior state. Transitions and rewards are obtained from the local
%   function MODEL.
%
%   The function takes no inputs and returns nothing. It prints a trace of
%   every iteration, the Q table after every update, the final Q table,
%   the row-wise maxima of Q, and the greedy action for each interior state.
%
%   Only base-MATLAB random functions (rand, randi) are used, so no
%   toolbox is required.

% learning parameters
gamma   = 0.5; % discount factor
alpha   = 0.5; % learning rate (constant)          % TODO : we need learning rate schedule
epsilon = 0.9; % exploration probability            % TODO : we need exploration rate schedule
               % (1-epsilon = exploit / epsilon = explore)

% states
state = [0,1,2,3,4,5];

% actions
action = [-1,1];

n_states  = length(state);
n_actions = length(action);

% initial Q matrix: one row per state, one column per action
Q = zeros(n_states, n_actions);

K = 1000;      % maximum number of the iterations
state_idx = 3; % index (into state) of the initial state to begin from

%% the main loop of the algorithm
for k = 1:K
    disp(['iteration: ' num2str(k)]);

    % epsilon-greedy selection: exploit with probability 1-epsilon,
    % otherwise pick an action index uniformly at random
    if rand < 1 - epsilon
        [~, action_idx] = max(Q(state_idx,:)); % exploit (ties -> first action)
    else
        action_idx = randi(n_actions);         % explore
    end

    % observe the next state and next reward ** there is no reward matrix
    [next_state, next_reward] = model(state(state_idx), action(action_idx));
    next_state_idx = find(state == next_state, 1); % index of the next state

    % print the results in each iteration
    disp(['current state : ' num2str(state(state_idx)) ' next state : ' num2str(state(next_state_idx)) ' taken action : ' num2str(action(action_idx))]);
    disp([' next reward : ' num2str(next_reward)]);

    % update the Q matrix using the Q-learning rule
    td_target = next_reward + gamma * max(Q(next_state_idx,:));
    Q(state_idx,action_idx) = Q(state_idx,action_idx) + alpha * (td_target - Q(state_idx,action_idx));

    % terminals are the first and last state: restart the episode from a
    % random interior state instead of continuing from a terminal
    if next_state_idx == 1 || next_state_idx == n_states
        state_idx = randi([2, n_states - 1]);
    else
        state_idx = next_state_idx;
    end

    disp(Q); % display Q in each level
end

% display the final Q matrix
disp('Final Q matrix : ');
disp(Q)

[C,I] = max(Q,[],2); % row-wise max value and the action index achieving it

disp('Q(optimal):');
disp(C);

disp('Optimal Policy');
disp('*');
policy = action(I(2:end-1)); % greedy action for every interior state
disp(policy(:));             % printed as a column, one state per row
disp('*');

end

%% This function is used as an observer to give the next state and the next reward using the current state and action

function [next_state,r] = model(x,u)
% MODEL  One-step transition and reward for a state value and an action.
%
%   [next_state, r] = model(x, u)
%
%   x           current state value
%   u           action, added to x as a step when the state is allowed to move
%   next_state  x + u when 1 <= x <= 4, otherwise x (the state does not move)
%   r           5 when x == 4 and u == 1,
%               1 when x == 1 and u == -1,
%               0 in every other case

% Only states 1..4 move; any other state stays where it is.
if (x <= 4 && x >= 1)
    next_state = x + u;
else
    next_state = x;
end

% Reward depends only on the (state, action) pair that was taken.
if (x == 4 && u == 1)
    r = 5;
elseif (x == 1 && u == -1)
    r = 1;
else
    r = 0;
end

end

Oussama Aoun

function qlearning
% QLEARNING  Tabular Q-learning demo on a small one-dimensional state chain.
%
%   States are the values 0..5 and the actions are a step of -1 or +1.
%   The first and last states are treated as terminals: whenever the agent
%   lands on one, the next iteration restarts from a uniformly random
%   interior state. Transitions and rewards are obtained from the local
%   function MODEL.
%
%   The function takes no inputs and returns nothing. It prints a trace of
%   every iteration, the Q table after every update, the final Q table,
%   the row-wise maxima of Q, and the greedy action for each interior state.
%
%   Only base-MATLAB random functions (rand, randi) are used, so no
%   toolbox is required.

% learning parameters
gamma   = 0.5; % discount factor
alpha   = 0.5; % learning rate (constant)          % TODO : we need learning rate schedule
epsilon = 0.9; % exploration probability            % TODO : we need exploration rate schedule
               % (1-epsilon = exploit / epsilon = explore)

% states
state = [0,1,2,3,4,5];

% actions
action = [-1,1];

n_states  = length(state);
n_actions = length(action);

% initial Q matrix: one row per state, one column per action
Q = zeros(n_states, n_actions);

K = 1000;      % maximum number of the iterations
state_idx = 3; % index (into state) of the initial state to begin from

%% the main loop of the algorithm
for k = 1:K
    disp(['iteration: ' num2str(k)]);

    % epsilon-greedy selection: exploit with probability 1-epsilon,
    % otherwise pick an action index uniformly at random
    if rand < 1 - epsilon
        [~, action_idx] = max(Q(state_idx,:)); % exploit (ties -> first action)
    else
        action_idx = randi(n_actions);         % explore
    end

    % observe the next state and next reward ** there is no reward matrix
    [next_state, next_reward] = model(state(state_idx), action(action_idx));
    next_state_idx = find(state == next_state, 1); % index of the next state

    % print the results in each iteration
    disp(['current state : ' num2str(state(state_idx)) ' next state : ' num2str(state(next_state_idx)) ' taken action : ' num2str(action(action_idx))]);
    disp([' next reward : ' num2str(next_reward)]);

    % update the Q matrix using the Q-learning rule
    td_target = next_reward + gamma * max(Q(next_state_idx,:));
    Q(state_idx,action_idx) = Q(state_idx,action_idx) + alpha * (td_target - Q(state_idx,action_idx));

    % terminals are the first and last state: restart the episode from a
    % random interior state instead of continuing from a terminal
    if next_state_idx == 1 || next_state_idx == n_states
        state_idx = randi([2, n_states - 1]);
    else
        state_idx = next_state_idx;
    end

    disp(Q); % display Q in each level
end

% display the final Q matrix
disp('Final Q matrix : ');
disp(Q)

[C,I] = max(Q,[],2); % row-wise max value and the action index achieving it

disp('Q(optimal):');
disp(C);

disp('Optimal Policy');
disp('*');
policy = action(I(2:end-1)); % greedy action for every interior state
disp(policy(:));             % printed as a column, one state per row
disp('*');

end

%% This function is used as an observer to give the next state and the next reward using the current state and action

function [next_state,r] = model(x,u)
% MODEL  One-step transition and reward for a state value and an action.
%
%   [next_state, r] = model(x, u)
%
%   x           current state value
%   u           action, added to x as a step when the state is allowed to move
%   next_state  x + u when 1 <= x <= 4, otherwise x (the state does not move)
%   r           5 when x == 4 and u == 1,
%               1 when x == 1 and u == -1,
%               0 in every other case

% Only states 1..4 move; any other state stays where it is.
if (x <= 4 && x >= 1)
    next_state = x + u;
else
    next_state = x;
end

% Reward depends only on the (state, action) pair that was taken.
if (x == 4 && u == 1)
    r = 5;
elseif (x == 1 && u == -1)
    r = 1;
else
    r = 0;
end

end

Feedback defines the constitution of an organism?

Self-Organizing Superorganisms—as envisaged by Nenad Sestan (2018)?

What precautions should be taken while handling S. aureus enterotoxin Type B in the lab?

How to understand this crystallographic phenomenon of low temperature crystals in zeolite?

Measuring the Intelligence of a Species?

How can I do multivariate time series forecasting using MLP, ANFIS and LSTM?

The Curse of Evolution and Complexity?

Need help with my research project on open source SIEM and machine learning?

Swimming/space travel depends on the proprioceptive muscle spindles?

What are the limitations and challenges of using machine learning for predicting concrete compressive strength in practical applications?