# MDP

Given is a Markov Decision Process (MDP) model with three states (1, 2, and 3) and three actions (move to state 1, 2, or 3). Each action directly determines the next state (our MDP is thus actually a Markov Reward Process (MRP) with deterministic state transitions). The reward is 10.0, 2.0, or 3.0 for a transition into state 1, 2, or 3, respectively.
We want to perform Policy Optimization.
![mdp.png](mdp.png)

# Score Function Gradient Estimator


In [None]:
import torch

import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline
class Model(torch.nn.Module):
    """Tabular softmax policy for the three-state MDP.

    ``t_policy`` stores one row of unnormalised action preferences (logits)
    per state; ``forward`` turns every row into log-probabilities.
    """

    def __init__(self):
        super().__init__()
        # Registered as a Parameter (rather than the deprecated
        # ``torch.autograd.Variable``) so the table shows up in
        # ``parameters()`` and ``state_dict()``. All logits are equal, so the
        # initial policy is uniform after the softmax.
        self.t_policy = torch.nn.Parameter(
            torch.full((3, 3), 1 / 3, dtype=torch.float32)
        )

    def forward(self):
        """Return the ``(3, 3)`` matrix of log-probabilities, one row per state."""
        return torch.nn.functional.log_softmax(self.t_policy, dim=-1)

def gt(rewardlist, gamma=0.1):
    """Return the discounted return ``sum_i gamma**i * rewardlist[i]``.

    Args:
        rewardlist: Sequence of per-step rewards, earliest step first.
        gamma: Discount factor applied once per step.

    Returns:
        The discounted sum of the rewards; ``0`` for an empty sequence.
    """
    # Feeding a generator to the builtin avoids rebinding the name ``sum``.
    return sum(gamma ** step * reward for step, reward in enumerate(rewardlist))

# Train the tabular policy with the score-function (REINFORCE) estimator.
valuelist = []  # undiscounted return of every sampled trajectory (plotted below)
rewards = np.array([10., 2., 3.]) / 10  # reward per visited state, scaled by 1/10
model = Model()
optim = torch.optim.SGD([model.t_policy], lr=0.0001)
for i in range(10001):
    # NumPy snapshot of the current action probabilities, used only for sampling.
    poli = torch.nn.functional.softmax(model.t_policy, dim=-1).data.numpy()
    state_action_list = []
    start_state = random.randint(0, 2)
    next_state = start_state
    rewardlist = []

    # Roll out 40 steps: record the reward of the current state, sample an
    # action from that state's row, and move on (the action is the next state).
    for k in range(40):
        rewardlist.append(rewards[next_state])
        action = np.random.choice(np.arange(0, 3), p=poli[next_state])
        state_action_list.append((next_state, action))
        next_state = action

    # Discounted return of the whole trajectory; it weights every log-prob term.
    rew = float(gt(rewardlist[:], 0.99))
    log_policy = model()
    # Holds the per-step surrogate terms -log pi(a|s) * return; differentiating
    # their sum yields the score-function gradient estimate.
    grad_list = []
    for state, action in state_action_list:
        grad_list.append(-log_policy[int(state), int(action)] * rew)
    optim.zero_grad()
    torch.stack(grad_list).sum().backward()
    optim.step()

    value = (gt(rewardlist, 1))
    valuelist.append(value)

    # Periodically show the sampling policy, the last trajectory's rewards and
    # the return curve so far.
    if i % 100 == 0:
        print(poli)
        print(rewardlist)
        plt.plot(valuelist)
        plt.show()

# Policy Gradient for the CartPole task

In [None]:
import torch
import math
import matplotlib.pyplot as plt
%matplotlib inline
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gym
import random

# Create the CartPole environment and seed it.
env = gym.make('CartPole-v0')
env.seed(1)

# Print the action/observation spaces and the observation bounds.
print("env.action_space", env.action_space)
print("env.observation_space", env.observation_space)
print("env.observation_space.high", env.observation_space.high)
print("env.observation_space.low", env.observation_space.low)


# Module-level settings. In the code shown here only EPISODES is read again
# (as the episode count of the training loops); note that `rewards` rebinds
# the name used for the reward table in the MDP cell.
RENDER_ENV = False
EPISODES = 5000
rewards = []
RENDER_REWARD_MIN = 50

class PolicyGradient(nn.Module):
    """Two-layer policy network: 4 observation values -> 2 action log-probabilities."""

    def __init__(self):
        super(PolicyGradient, self).__init__()
        self.linear = nn.Linear(4, 8)
        self.linear2 = nn.Linear(8, 2)

    def forward(self, state):
        """Return the log-probabilities of both actions.

        Args:
            state: One observation of 4 values (array-like or tensor), or a
                batch of shape ``(N, 4)``.

        Returns:
            Tensor of shape ``(N, 2)`` holding log-probabilities (``N == 1``
            for a single observation), so callers can exponentiate row 0 for
            sampling and pass the tensor directly to ``NLLLoss``.
        """
        # Observations arrive as NumPy arrays; convert and add the batch axis.
        features = torch.as_tensor(state, dtype=torch.float32).reshape(-1, 4)
        # ReLU is a choice of this implementation; the layer sizes are fixed above.
        hidden = F.relu(self.linear(features))
        return F.log_softmax(self.linear2(hidden), dim=-1)
    
# Policy-gradient training on CartPole: sample one episode with the current
# policy, then take one optimizer step per recorded transition.
# NLLLoss is applied to the policy output below; the same output is
# exponentiated before sampling, so it is expected to hold log-probabilities.
criterion = torch.nn.NLLLoss()
policy = PolicyGradient()
optimizer = torch.optim.RMSprop(policy.parameters(), lr=0.002)
env.reset()
# Per-step discount used when accumulating the returns (1 = no discounting).
decay=1

# First-step return of every finished episode, plotted every 50 episodes.
resultlist=[]
for episode in range(EPISODES):
    # Transitions of the current episode: (observation, action, reward, next observation).
    observations = []
    observation = env.reset()
    while True:
        # Sample an action from the probabilities in row 0 of the policy output.
        action = int(np.random.choice(range(2), p=np.exp(policy(observation).data.numpy()[0])))
        observation_, reward, finished, info = env.step(action)
        observations.append((observation, action, reward, observation_))
        observation=observation_
        if finished:
            rewardlist = [x[2] for x in observations]
            # Walk the rewards backwards to build the return-to-go of each step.
            # Rewards are scaled by 1/200 — presumably the CartPole-v0 episode
            # length cap; TODO confirm.
            cumulative=0
            savelist=[]
            for rew in rewardlist[::-1]:
                cumulative=cumulative*decay + rew/200
                savelist.append(cumulative)
            # Restore chronological order so savelist[t] belongs to step t.
            savelist=savelist[::-1]
            resultlist.append(savelist[0])
            if episode%50==0:
                plt.plot(resultlist)
                plt.show()
            savelist=np.array(savelist)
            # One update per transition: loss on the taken action, weighted by
            # that step's return-to-go. The loop variables reuse the names
            # `observation`, `action` and `reward`; the `break` below ends the
            # episode before they are read again.
            for (observation, action, reward, next_observation), cum_reward in zip(observations, savelist):
                action = torch.autograd.Variable(torch.LongTensor([action]))
                result = policy(observation)
                loss = criterion(result, action)
                (loss * cum_reward).backward()
                optimizer.step()
                optimizer.zero_grad()
            break

# Actor Critic for the CartPole task

In [None]:
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline
import random
class PolicyGradient(nn.Module):
    """Two-layer policy network: 4 observation values -> 2 action log-probabilities."""

    def __init__(self):
        super(PolicyGradient, self).__init__()
        self.linear = nn.Linear(4, 8)
        self.linear2 = nn.Linear(8, 2)

    def forward(self, state):
        """Return the log-probabilities of both actions.

        Args:
            state: One observation of 4 values (array-like or tensor), or a
                batch of shape ``(N, 4)``.

        Returns:
            Tensor of shape ``(N, 2)`` holding log-probabilities (``N == 1``
            for a single observation), so callers can exponentiate row 0 for
            sampling and pass the tensor directly to ``NLLLoss``.
        """
        # Observations arrive as NumPy arrays; convert and add the batch axis.
        features = torch.as_tensor(state, dtype=torch.float32).reshape(-1, 4)
        # ReLU is a choice of this implementation; the layer sizes are fixed above.
        hidden = F.relu(self.linear(features))
        return F.log_softmax(self.linear2(hidden), dim=-1)
    
class Critic(nn.Module):
    """State-action value network for CartPole.

    The observation (4 values) and the one-hot encoded action (2 values) are
    embedded separately (8 and 4 features), concatenated, and mapped to a
    single score.
    """

    def __init__(self):
        super(Critic, self).__init__()
        self.linear = nn.Linear(4, 8)
        self.linear2 = nn.Linear(2, 4)
        self.linear3 = nn.Linear(12, 1)

    def forward(self, state, action):
        """Return the critic's score for taking ``action`` in ``state``.

        Args:
            state: One observation of 4 values (array-like or tensor), or a
                batch of shape ``(N, 4)``.
            action: Action index (0 or 1) as an int, a sequence of ints, or
                an integer tensor with one entry per observation.

        Returns:
            Tensor of shape ``(N, 1)`` (``(1, 1)`` for a single pair).
        """
        state_t = torch.as_tensor(state, dtype=torch.float32).reshape(-1, 4)
        action_idx = torch.as_tensor(action, dtype=torch.long).reshape(-1)
        # One-hot encoding matches the 2-input width of ``linear2``.
        action_onehot = F.one_hot(action_idx, num_classes=2).to(torch.float32)
        # ReLU is a choice of this implementation; the layer sizes are fixed above.
        state_feat = F.relu(self.linear(state_t))
        action_feat = F.relu(self.linear2(action_onehot))
        return self.linear3(torch.cat((state_feat, action_feat), dim=-1))
        
# Actor-critic training on CartPole: sample one episode with the current
# policy, then for every recorded transition first fit the critic to the
# observed return-to-go and afterwards update the policy, weighting its loss
# with the critic's score.
# NLLLoss is applied to the policy output below; the same output is
# exponentiated before sampling, so it is expected to hold log-probabilities.
criterion = torch.nn.NLLLoss()
critic_loss = torch.nn.MSELoss()
policy = PolicyGradient()
critic = Critic()
optimizer = torch.optim.RMSprop(policy.parameters(), lr=0.002)
optimizer_critic = torch.optim.RMSprop(critic.parameters(), lr=0.002)
env.reset()
# Per-step discount used when accumulating the returns (1 = no discounting).
decay=1

# First-step return of every finished episode, plotted every 50 episodes.
resultlist=[]
for episode in range(EPISODES):
    # Transitions of the current episode: (observation, action, reward, next observation).
    observations = []
    observation = env.reset()
    while True:
        # Sample an action from the probabilities in row 0 of the policy output.
        action = int(np.random.choice(range(2), p=np.exp(policy(observation).data.numpy()[0])))
        observation_, reward, finished, info = env.step(action)
        observations.append((observation, action, reward, observation_))
        observation=observation_
        if finished:
            rewardlist = [x[2] for x in observations]
            # Walk the rewards backwards to build the return-to-go of each step.
            # Rewards are scaled by 1/200 — presumably the CartPole-v0 episode
            # length cap; TODO confirm.
            cumulative=0
            savelist=[]
            for rew in rewardlist[::-1]:
                cumulative=cumulative*decay + rew/200
                savelist.append(cumulative)
            # Restore chronological order so savelist[t] belongs to step t.
            savelist=savelist[::-1]

            resultlist.append(savelist[0])
            if episode%50==0:
                plt.plot(resultlist)
                plt.show()
            savelist=np.array(savelist)
            # The loop variables reuse the names `observation`, `action` and
            # `reward`; the `break` below ends the episode before they are
            # read again.
            for (observation, action, reward, next_observation), cum_reward in zip(observations, savelist):
                # Critic step: regress the score towards the observed return-to-go.
                crit_score = critic(observation, action)
                loss = critic_loss(crit_score, torch.autograd.Variable(torch.FloatTensor([cum_reward])).view(1, 1))
                loss.backward()
                # NOTE(review): this clears the *policy* optimizer's gradients
                # in the middle of the critic update — looks redundant; confirm.
                optimizer.zero_grad()
                optimizer_critic.step()
                optimizer_critic.zero_grad()
                # Re-evaluate the updated critic and keep the score as a plain
                # float, so it acts as a constant weight in the policy loss.
                crit_score = float(critic(observation, action).data.numpy()[0][0])
                # Actor step: loss on the taken action, weighted by the critic score.
                action = torch.autograd.Variable(torch.LongTensor([action]))
                result = policy(observation)
                loss = criterion(result, action)
                (loss * crit_score).backward()
                optimizer.step()
                optimizer.zero_grad()
            break