Reinforcement Learning TP2 - Deep Q-Networks¶

Marco Boucas & Magali Morin¶

20/02/2022¶

Theoretical explanation¶

Deep Q-Learning uses a neural network to approximate $Q$ functions. Hence, we usually refer to this algorithm as DQN (for deep Q network).

The parameters of the neural network are denoted by $\theta$.

  • As input, the network takes a state $s$,
  • As output, the network returns $Q_\theta [a | s] = Q_\theta (s,a) = Q(s, a, \theta)$, the value of each action $a$ in state $s$, according to the parameters $\theta$.

The goal of Deep Q-Learning is to learn the parameters $\theta$ so that $Q(s, a, \theta)$ approximates well the optimal $Q$-function $Q^*(s, a) \simeq Q_{\theta^*} (s,a)$.

In addition to the network with parameters $\theta$, the algorithm keeps another network with the same architecture and parameters $\theta^-$, called the target network.

The algorithm works as follows:

  1. At each time $t$, the agent is in state $s_t$ and has observed the transitions $(s_i, a_i, r_i, s_i')_{i=1}^{t-1}$, which are stored in a replay buffer.

  2. Choose action $a_t = \arg\max_a Q_\theta(s_t, a)$ with probability $1-\varepsilon_t$, and $a_t$=random action with probability $\varepsilon_t$.

  3. Take action $a_t$, observe reward $r_t$ and next state $s_t'$.

  4. Add transition $(s_t, a_t, r_t, s_t')$ to the replay buffer.

  5. Sample a minibatch $\mathcal{B}$ containing $B$ transitions from the replay buffer. Using this minibatch, we define the loss:

$$ L(\theta) = \sum_{(s_i, a_i, r_i, s_i') \in \mathcal{B}} \left[ Q(s_i, a_i, \theta) - y_i \right]^2 $$

where the $y_i$ are the targets computed with the target network $\theta^-$:

$$ y_i = r_i + \gamma \max_{a'} Q(s_i', a', \theta^-). $$
  6. Update the parameters $\theta$ to minimize the loss, e.g., with gradient descent (keeping $\theta^-$ fixed): $$ \theta \gets \theta - \eta \nabla_\theta L(\theta) $$ where $\eta$ is the optimization learning rate.

  7. Every $N$ transitions ($t \bmod N = 0$), update the target parameters: $\theta^- \gets \theta$.

  8. $t \gets t+1$. Stop if $t = T$, otherwise go to step 2.
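
To make steps 5 and 6 concrete, here is a minimal PyTorch sketch of the target and loss computation on one minibatch. It is illustrative only: it assumes q_net and target_net map a batch of states to one Q-value per action, ignores terminal transitions for brevity (the actual update function below masks them), and averages the squared errors instead of summing them (equivalent for optimization up to a factor 1/B).

import torch
import torch.nn.functional as F

def dqn_loss(q_net, target_net, states, actions, rewards, next_states, gamma):
    # Q(s_i, a_i, theta): keep only the Q-value of the action actually taken
    q_values = q_net(states).gather(1, actions.view(-1, 1))
    # y_i = r_i + gamma * max_a' Q(s_i', a', theta^-), with theta^- kept fixed
    with torch.no_grad():
        next_values = target_net(next_states).max(dim=1, keepdim=True)[0]
        targets = rewards.view(-1, 1) + gamma * next_values
    # L(theta), averaged over the minibatch
    return F.mse_loss(q_values, targets)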

In [1]:
# Imports
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from copy import deepcopy
#!pip install gym shortprint  pyglet --quiet
import gym
from gym.wrappers import Monitor
# from shortprint import shortprint
# from pyvirtualdisplay import Display
from IPython import display as ipythondisplay
from IPython.display import clear_output
from pathlib import Path
import base64

#sns.set_palette('brg')  

Torch 101¶

"The torch package contains data structures for multi-dimensional tensors and defines mathematical operations over these tensors. Additionally, it provides many utilities for efficient serializing of Tensors and arbitrary types, and other useful utilities. [...] provides classes and functions implementing automatic differentiation of arbitrary scalar valued functions." PyTorch

gym.wrappers¶

Wrappers allow us to add functionality to environments, such as modifying the observations and rewards fed to the agent, and are a convenient way to monitor an agent's activity.
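
As a minimal illustration of the mechanism (a sketch, assuming the gym 0.21 API used in this notebook; ScaledReward is a made-up name), a custom wrapper could rescale every reward before it reaches the agent:

import gym

class ScaledReward(gym.RewardWrapper):
    """Toy wrapper that multiplies every reward by a constant factor."""
    def __init__(self, env, scale=0.1):
        super().__init__(env)
        self.scale = scale

    def reward(self, reward):
        return self.scale * reward

# wrapped_env = ScaledReward(gym.make("CartPole-v0"), scale=0.1)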

In [2]:
print(f"python --version = {sys.version}")
print(f"torch.__version__ = {torch.__version__}")
print(f"np.__version__ = {np.__version__}")
print(f"gym.__version__ = {gym.__version__}")
python --version = 3.8.10 (tags/v3.8.10:3d8993a, May  3 2021, 11:48:03) [MSC v.1928 64 bit (AMD64)]
torch.__version__ = 1.10.0+cpu
np.__version__ = 1.21.4
gym.__version__ = 0.21.0

Variable types¶

In [3]:
# Very similar syntax to numpy.
zero_torch = torch.zeros((3, 2))

print('zero_torch is of type {:s}'.format(str(type(zero_torch))))

# Torch -> Numpy: simply call the numpy() method.
zero_np = np.zeros((3, 2))
assert (zero_torch.numpy() == zero_np).all()

# Numpy -> Torch: simply call the corresponding function on the np.array.
zero_torch_float = torch.FloatTensor(zero_np)
print('\nFloat:\n', zero_torch_float)
zero_torch_int = torch.LongTensor(zero_np)
print('Int:\n', zero_torch_int)
zero_torch_bool = torch.BoolTensor(zero_np)
print('Bool:\n', zero_torch_bool)

# Reshape
print('\nView new shape...', zero_torch.view(1, 6))
# Note that print(zero_torch.reshape(1, 6)) would work too.
# The difference is in how memory is handled (view imposes contiguity).
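# For example (not run here): after a transpose the tensor is no longer contiguous,
# so view() raises a RuntimeError while reshape() still works (possibly by copying):
#   zero_torch.t().view(6)     -> RuntimeError
#   zero_torch.t().reshape(6)  -> tensor([0., 0., 0., 0., 0., 0.])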

# Algebra
a = torch.randn((3, 2))
b = torch.randn((3, 2))
print('\nAlgebraic operations are overloaded:\n', a, '\n+\n', b, '\n=\n', a+b )

# More generally, torch shares the syntax of many attributes and functions with Numpy.
zero_torch is of type <class 'torch.Tensor'>

Float:
 tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])
Int:
 tensor([[0, 0],
        [0, 0],
        [0, 0]])
Bool:
 tensor([[False, False],
        [False, False],
        [False, False]])

View new shape... tensor([[0., 0., 0., 0., 0., 0.]])

Algebraic operations are overloaded:
 tensor([[ 0.0081,  1.1010],
        [-0.0700, -0.7682],
        [ 0.7680, -0.2680]]) 
+
 tensor([[-0.3066, -0.3435],
        [-2.2915,  0.3737],
        [-0.2964,  0.2468]]) 
=
 tensor([[-0.2985,  0.7575],
        [-2.3615, -0.3944],
        [ 0.4716, -0.0212]])

Gradient management¶

In [4]:
# torch.Tensor is a similar yet more complicated data structure than np.array.
# It is basically a static array of numbers but may also contain an overlay to
# handle automatic differentiation (i.e. keeping track of the gradient and which
# tensors depend on which).
# To access the static array embedded in a tensor, simply call the detach() method.
print(zero_torch.detach())

# When inside a function performing automatic differentiation (basically when training
# a neural network), never use detach(), otherwise meta information regarding gradients
# will be lost, effectively freezing the variable and preventing backprop for it.
# However, when returning the result of training, do use detach() to save memory
# (the naked tensor data uses much less memory than the full-blown tensor with gradient
# management, and is much less prone to mistakes such as bad copies and memory leaks).

# We will solve theta * x = y in theta for x=1 and y=2
x = torch.ones(1)
y = 2 * torch.ones(1)

# Actually by default torch does not add the gradient management overlay
# when declaring tensors like this. To force it, add requires_grad=True.
theta = torch.randn(1, requires_grad=True)

# Optimisation routine
# (Adam is a sophisticated variant of SGD, with adaptive step).
optimizer = optim.Adam(params=[theta], lr=0.1)

# Loss function
print('Initial guess:', theta.detach())

for _ in range(100):
    # By default, torch accumulates gradients in memory.
    # To obtain the desired gradient descent behaviour,
    # just clean the cached gradients using the following line:
    optimizer.zero_grad()
    
    # Quadratic loss (* and ** are overloaded so that torch
    # knows how to differentiate them)
    loss = (y - theta * x) ** 2
    
    # Apply the chain rule to automatically compute gradients
    # for all relevant tensors.
    loss.backward()
    
    # Run one step of optimisation routine.
    optimizer.step()
    
print('Final estimate:', theta.detach())
print('The final estimate should be close to', y)
tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])
Initial guess: tensor([-0.2573])
Final estimate: tensor([2.0115])
The final estimate should be close to tensor([2.])

Setting the environment¶

1 - Define the GLOBAL parameters¶

In [5]:
# Environment
env = gym.make("CartPole-v0")

# Discount factor
GAMMA = 0.99

# Batch size
BATCH_SIZE = 256
# Capacity of the replay buffer
BUFFER_CAPACITY = 16384 # 10000
# Update target net every ... episodes
UPDATE_TARGET_EVERY = 5 # 20

# Initial value of epsilon
EPSILON_START = 1.0
# Parameter to decrease epsilon
DECREASE_EPSILON = 200
# Minimum value of epsilon
EPSILON_MIN = 0.05

# Number of training episodes
N_EPISODES = 400

# Learning rate
LEARNING_RATE = 0.05

2 - Replay buffer¶

In [6]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, state, action, reward, next_state):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = (state, action, reward, next_state)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
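        # Note: random.choices samples *with* replacement; random.sample(self.memory, batch_size)
        # would sample without replacement, which is the more common convention for replay buffers.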
        return random.choices(self.memory, k=batch_size)

    def __len__(self):
        return len(self.memory)

# create instance of replay buffer
replay_buffer = ReplayBuffer(BUFFER_CAPACITY)

3 - Neural Network¶

In [7]:
class Net(nn.Module):
    """
    Basic neural net.
    """
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions)
        )

    def forward(self, x):
        return self.net(x)

3.5 - Loss function and optimizer¶

In [8]:
# create network and target network
device = torch.device("cpu")

hidden_size = 128
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

q_net = Net(obs_size, hidden_size, n_actions).to(device)
target_net = Net(obs_size, hidden_size, n_actions).to(device)

# objective and optimizer
objective = nn.MSELoss()
optimizer = optim.Adam(params=q_net.parameters(), lr=LEARNING_RATE)
In [9]:
def reset():
    """Reset the model and all."""
    global q_net, target_net, replay_buffer, optimizer
    q_net = Net(obs_size, hidden_size, n_actions).to(device)
    target_net = Net(obs_size, hidden_size, n_actions).to(device)
    replay_buffer = ReplayBuffer(BUFFER_CAPACITY)
    optimizer = optim.Adam(params=q_net.parameters(), lr=LEARNING_RATE)
    torch.manual_seed(666)
    np.random.seed(666)
reset()

Question 0 (to do at home, not during the live session)¶

In your own words, explain the intuition behind DQN. Recall the main parts of the aforementioned algorithm.

Description of the DQN method:

DQN is essentially the same as Q-learning; the only thing that changes is that Q is no longer a table of values but a neural network. This is very practical when the problem has a huge (or even infinite) number of states/actions, for instance with continuous values. In the cart-pole problem, the state is represented by 4 physical quantities (positions and velocities). These variables are of course continuous (the positions being bounded by the size of the experiment).

Therefore, unless we want to discretize the values, we need to replace this "table of values" with a neural network (or, in fact, any other machine learning model that can handle continuous inputs).

The only pain point of DQN is that we need to change the way we update the values of Q: instead of assigning values directly, we make them move smoothly towards the target. Apart from that, it is not that different from basic Q-learning.

Architecture of the DQN method:

   Two neural networks: the Q network and the target network.

   Experience replay: transitions collected while interacting with the environment are stored in a buffer and later used to train the Q network.

The Q network learns parameters so that its output approximates the optimal Q values.

The target network is not trained directly. After a preconfigured number of steps, it is updated with the parameters of the Q network. It is used to compute the targets of the loss that trains the Q network.

Experience replay is important because it allows us to train the Q network with both older and more recent samples: at each update of the Q network's parameters, we take a random batch of transitions from the data stored in the replay buffer.
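
As a concrete illustration of the target-network update described above, synchronizing the two networks in PyTorch is a single state-dict copy; this is exactly what the training loop below does every UPDATE_TARGET_EVERY episodes:

# Copy the online Q network's parameters into the target network
target_net.load_state_dict(q_net.state_dict())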

4 - Implementing the DQN¶

In [10]:
sample_states = [[0. for i in range(obs_size)] for _ in range(4)]
def get_q(states):
    """
    Compute Q function for a list of states
    """
    with torch.no_grad():
        states_v = torch.FloatTensor(np.array([states])).to(device)
        output = q_net.forward(states_v).detach().cpu().numpy()  # shape (1, len(states), n_actions)
    return output[0, :, :]  # shape (len(states), n_actions)
get_q(sample_states)
Out[10]:
array([[-0.07864752,  0.23789123],
       [-0.07864755,  0.2378912 ],
       [-0.07864752,  0.23789123],
       [-0.07864755,  0.2378912 ]], dtype=float32)

Question 1¶

Implement the choose_action function.

In [11]:
def choose_action(state, epsilon):
    """    
    Return action according to an epsilon-greedy exploration policy
    """
    if random.random() < 1-epsilon:
        return get_q([state])[0].argmax()
    return random.choice(list(range(n_actions)))

epsilon_values = [0., 0.5, 1.]
fig, axes = plt.subplots(len(epsilon_values), 1, figsize=(15, 7))
for ax, epsilon in zip(axes, epsilon_values):
    actions = []
    for _ in range(100):
        actions.append(choose_action(sample_states[0], epsilon))
    ax.hist(actions, align="left", bins=100);
    ax.set_title(f"Distribution of the actions for {epsilon=}")
    ax.set_xlim(-0.2, 1.2)
plt.tight_layout()
Observations¶

We observe on the graphs three distributions of actions for different values of epsilon. The epsilon value is set so as to strike a good balance between exploration and exploitation of the environment.
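
For reference, the exploration schedule used in the training loop below decays epsilon exponentially from EPSILON_START towards EPSILON_MIN as the episode index ep grows:

# Epsilon decay schedule used in the train function below
epsilon = EPSILON_MIN + (EPSILON_START - EPSILON_MIN) * np.exp(-1. * ep / DECREASE_EPSILON)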

Question 2¶

Implement the eval_dqn function.

In [12]:
def eval_dqn(n_sim=5):
    """
    Monte Carlo evaluation of DQN agent.

    Repeat n_sim times:
        * Run the DQN policy until the environment reaches a terminal state (= one episode)
        * Compute the sum of rewards in this episode
        * Store the sum of rewards in the episode_rewards array.
    """
    env_copy = deepcopy(env)
    episode_rewards = np.zeros(n_sim)
    for episode_id in range(n_sim):
        observation = env_copy.reset()
        done = False
        while not done:
            observation, reward, done, _ = env_copy.step(choose_action(state=observation, epsilon=0.))
            episode_rewards[episode_id] += reward
    return episode_rewards
N = 10
eval_result = eval_dqn(N)
assert eval_dqn(N).shape == (N,)
#assert np.mean(eval_result)<20
print(eval_result)
[ 9. 11.  9.  9.  9.  9.  9.  9. 10.  9.]

Question 3¶

Implement the update function

In [13]:
def update(state, action, reward, next_state, done, verbose=False):
    """
    Update the parameters of the Q network.
    """
    # add data to replay buffer
    if done:
        next_state = None
    replay_buffer.push(state, action, reward, next_state)
    
    if len(replay_buffer) < BATCH_SIZE:
        return np.inf
    
    # get batch
    transitions = replay_buffer.sample(BATCH_SIZE)  # BATCH_SIZE * line (state, action, reward, next_state)

    states = [x[0] for x in transitions]
    actions = [x[1] for x in transitions]
    rewards = [x[2] for x in transitions]
    next_states = [x[3] for x in transitions if isinstance(x[3], np.ndarray)]

    next_states_mask = [isinstance(x[3], np.ndarray) for x in transitions] 
    #print("states", states)


    states = torch.FloatTensor(states)
    actions = torch.LongTensor(actions).view(-1, 1)
    rewards = torch.FloatTensor(rewards).view(-1, 1)
    next_states = torch.FloatTensor(next_states)
    mask = torch.BoolTensor(next_states_mask)

    # Compute loss

    values  = q_net(states.to(device)).cpu() # Shape (nbr_states, nbr_actions)
    values = torch.gather(values, dim=1, index=actions) # We retrieve the Q(s,a) for the selected action a
    if verbose:
        print("Values:", values.shape)

    # Compute the target
    values_next_states = torch.zeros(BATCH_SIZE)
    values_next_states[mask] = target_net(next_states.to(device)).cpu().max(dim=1)[0].detach()

    targets = rewards + GAMMA * values_next_states.view(-1, 1)
    if verbose:
        print("target", targets.shape)
    
    loss = objective(values, targets)
     
    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    return loss.detach().numpy()

update(sample_states[0], 0, 1., 1, False)
Out[13]:
inf

Question 4¶

Train a DQN on the env environment. Hint: the mean reward after training should be close to 200.

In [14]:
EVAL_EVERY = 3
REWARD_THRESHOLD = 199

def train(verbose: bool = True):
    state = env.reset()
    epsilon = EPSILON_START
    ep = 0
    losses = []
    total_time = 0
    rewards_history = []
    while ep < N_EPISODES:
        action = choose_action(state, epsilon)

        # take action and update replay buffer and networks
        next_state, reward, done, _ = env.step(action)
        loss = update(state, action, reward, next_state, done)
        losses.append(loss)
        # update state
        state = next_state

        # end episode if done
        if done:
            state = env.reset()
            ep   += 1
            if ( (ep+1)% EVAL_EVERY == 0):
                reward = np.mean(eval_dqn())
                rewards_history.append(reward)
                if verbose:
                    print(f"episode = {ep+1}, reward = {reward}, loss = {loss}")
                if reward >= REWARD_THRESHOLD:
                    break

            # update target network
            if ep % UPDATE_TARGET_EVERY == 0:
                target_net.load_state_dict(q_net.state_dict())
            # decrease epsilon
            epsilon = EPSILON_MIN + (EPSILON_START - EPSILON_MIN) * \
                            np.exp(-1. * ep / DECREASE_EPSILON )    
        total_time += 1
    return ep, losses, rewards_history

reset()

# Run the training loop
ep, losses, rewards_history = train()

# Evaluate the final policy
rewards = eval_dqn(20)
print("\nmean reward after training = ", np.mean(rewards))
episode = 3, reward = 9.6, loss = inf
episode = 6, reward = 9.2, loss = inf
episode = 9, reward = 8.8, loss = inf
episode = 12, reward = 8.8, loss = inf
episode = 15, reward = 9.4, loss = 0.00751245254650712
C:\Users\magal\AppData\Local\Temp/ipykernel_18084/731242307.py:25: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at  ..\torch\csrc\utils\tensor_new.cpp:201.)
  states = torch.FloatTensor(states)
episode = 18, reward = 9.6, loss = 0.03740710765123367
episode = 21, reward = 9.6, loss = 0.027942989021539688
episode = 24, reward = 12.6, loss = 0.07553702592849731
episode = 27, reward = 82.2, loss = 0.13352614641189575
episode = 30, reward = 67.8, loss = 0.040537238121032715
episode = 33, reward = 63.8, loss = 0.030331136658787727
episode = 36, reward = 71.6, loss = 0.03937184438109398
episode = 39, reward = 57.8, loss = 0.07890810817480087
episode = 42, reward = 17.2, loss = 0.11746294051408768
episode = 45, reward = 26.2, loss = 0.05767736956477165
episode = 48, reward = 62.2, loss = 0.21418404579162598
episode = 51, reward = 43.0, loss = 0.17651131749153137
episode = 54, reward = 35.0, loss = 0.16881264746189117
episode = 57, reward = 57.4, loss = 0.18512660264968872
episode = 60, reward = 30.4, loss = 0.18533454835414886
episode = 63, reward = 143.4, loss = 0.19766725599765778
episode = 66, reward = 24.4, loss = 0.12081369012594223
episode = 69, reward = 53.8, loss = 0.1841472089290619
episode = 72, reward = 103.8, loss = 0.2917347848415375
episode = 75, reward = 177.4, loss = 0.35494717955589294
episode = 78, reward = 95.8, loss = 0.35882389545440674
episode = 81, reward = 54.0, loss = 0.36940014362335205
episode = 84, reward = 113.6, loss = 0.21848058700561523
episode = 87, reward = 81.8, loss = 0.6922286152839661
episode = 90, reward = 152.6, loss = 0.18626204133033752
episode = 93, reward = 151.6, loss = 0.42309942841529846
episode = 96, reward = 120.4, loss = 0.30413365364074707
episode = 99, reward = 61.0, loss = 0.2578055262565613
episode = 102, reward = 173.6, loss = 0.351776659488678
episode = 105, reward = 115.2, loss = 0.947763979434967
episode = 108, reward = 200.0, loss = 0.4259825348854065

mean reward after training =  200.0
In [15]:
fig, axes = plt.subplots(1, 2, figsize=(20,7))
axes[0].scatter(list(range(len(losses))), losses, s=3)
axes[0].set_title("Evolution of the loss during training")
axes[0].set_yscale('log')
axes[1].scatter(list(range(len(rewards_history))), rewards_history, s=3);
axes[1].set_title("Evolution of the reward during training")
Out[15]:
Text(0.5, 1.0, 'Evolution of the reward during training')

Question 5¶

Experiment with the policy network.

(To show a video inside a Jupyter notebook, you may need to run this cell with Chrome/Chromium instead of Firefox. Otherwise, you may skip this question.)

You will not be able to see the video online, but I promise you it works, you have my word!

In [36]:
video_folder = "./videos"
env = Monitor(env, video_folder, force=True, video_callable=lambda episode: True)

for episode in range(1):
    done = False
    state = env.reset()
    while not done:
        action = choose_action(state, 0.0)
        state, reward, done, info = env.step(action)
env.close()

from IPython.display import Video
import os
from glob import glob
Video(glob(os.path.join(video_folder, "*.mp4"))[0])
Out[36]:
(embedded video of the trained agent running one evaluation episode)

5 - Experiments: Do It Yourself¶

Remember the set of global parameters:

# Environment
env = gym.make("CartPole-v0")

# Discount factor
GAMMA = 0.99

# Batch size
BATCH_SIZE = 256
# Capacity of the replay buffer
BUFFER_CAPACITY = 16384 # 10000
# Update target net every ... episodes
UPDATE_TARGET_EVERY = 32 # 20

# Initial value of epsilon
EPSILON_START = 1.0
# Parameter to decrease epsilon
DECREASE_EPSILON = 200
# Minimum value of epsilon
EPSILON_MIN = 0.05

# Number of training episodes
N_EPISODES = 200

# Learning rate
LEARNING_RATE = 0.1
In [16]:
def run_one():
    """Run once the training and evaluation of the model.
    
    This make sure we restart from zero each times, and retrieve meaningfull informations.
    For the losses and rewards, those are saved respectively at the end of each episode and every 3 episodes
    """
    reset()
    result = {}
    result['episode_takens'], losses, rewards_history = train(verbose=False)
    result['score'] = np.mean(eval_dqn())
    result['losses'] = losses
    result['rewards'] = rewards_history
    return result

5.1 - Influence of Buffer Capacity¶

Question 6: BUFFER_CAPACITY¶

Craft an experiment and study the influence of the BUFFER_CAPACITY on the learning process (speed of convergence, training curves...)

In [38]:
# We save the current value in a variable to avoid problems for the other studies;
# we will change it back to the default value afterwards.
# It would have been better to pass these values as arguments of a function,
# but we'll do it like this for now.
previous_value = BUFFER_CAPACITY


nbr_values = 7
nbr_each = 3  # Due to the randomness of the training, we do it several times for the same set of parameters
results = []
with tqdm(total=nbr_values*nbr_each, desc="Compute the influence", colour="green") as pbar:
    for i in np.linspace(500, 100000, nbr_values):
        BUFFER_CAPACITY = int(i)
        for _ in range(nbr_each):
            results.append({
                "Buffer Capacity": BUFFER_CAPACITY,
                **run_one()
            })

            pbar.update()
df = pd.DataFrame.from_records(results)

BUFFER_CAPACITY = previous_value
Compute the influence: 100%|█████████████████████████████████████████████████████████| 21/21 [04:58<00:00, 14.20s/it]
In [18]:
df.head()
Out[18]:
Buffer Capacity episode_takens score losses rewards
0 500 113 200.0 [inf, inf, inf, inf, inf, inf, inf, inf, inf, ... [9.2, 9.0, 9.2, 9.4, 9.0, 8.8, 9.2, 9.6, 18.6,...
1 500 113 200.0 [inf, inf, inf, inf, inf, inf, inf, inf, inf, ... [9.2, 10.0, 9.8, 9.4, 9.6, 12.8, 9.8, 9.6, 8.8...
2 500 32 199.6 [inf, inf, inf, inf, inf, inf, inf, inf, inf, ... [9.6, 9.6, 10.0, 9.4, 12.2, 10.6, 10.6, 46.8, ...
3 17083 77 200.0 [inf, inf, inf, inf, inf, inf, inf, inf, inf, ... [9.2, 9.0, 9.4, 9.4, 9.2, 9.2, 10.0, 9.8, 48.6...
4 17083 38 200.0 [inf, inf, inf, inf, inf, inf, inf, inf, inf, ... [9.0, 9.2, 9.8, 9.4, 9.4, 9.6, 10.4, 11.0, 112...

5.1.1 - Influence of the Buffer Capacity on the loss¶

In [19]:
new_df = []
for _, row in df.iterrows():
    x = pd.DataFrame({"losses":row.losses}).reset_index()
    x['buffer'] = str(row['Buffer Capacity'])
    x['losses']=x['losses'].astype(float)
    new_df.append(x)
new_df = pd.concat(new_df, ignore_index=True)
new_df.head(3)
Out[19]:
index losses buffer
0 0 inf 500
1 1 inf 500
2 2 inf 500
In [20]:
plt.figure(figsize=(14, 14))
sns.scatterplot(data=new_df, x="index", y="losses", hue="buffer", alpha=0.5, s=5)
plt.yscale('log')
plt.title("Losses over time for different values of buffer");
Out[20]:
Text(0.5, 1.0, 'Losses over time for different values of buffer')
Observations¶

The above graph shows the loss over time (after each training step). We wanted to see whether the buffer size has an influence on the losses (and thus on the training of the model). Even though the graph is a bit overwhelmed with points, the loss values do not seem to differ much between the different colours (buffer sizes). We can say that the loss is not strongly influenced by the buffer size.

5.1.2 - Influence of the Buffer Capacity on the rewards¶

In [21]:
new_df = []
for _, row in df.iterrows():
    x = pd.DataFrame({"rewards":row.rewards}).reset_index()
    x['buffer'] = str(row['Buffer Capacity'])
    x['rewards']=x['rewards'].astype(float)
    new_df.append(x)
new_df = pd.concat(new_df, ignore_index=True)
new_df.head(3)
Out[21]:
index rewards buffer
0 0 9.2 500
1 1 9.0 500
2 2 9.2 500
In [22]:
plt.figure(figsize=(14, 10))
sns.lineplot(data=new_df, x="index", y="rewards", hue="buffer")
plt.yscale('log')
plt.title("The evolution of the reward over time depending of the buffer value")
Out[22]:
Text(0.5, 1.0, 'The evolution of the reward over time depending of the buffer value')
Observations¶

On this graph, you can see the evolution of the reward over time for the different buffer sizes. The comparison might be slightly biased by the fact that the batch size is 256 (so the smallest buffer value, 500, is close to it). Still, we can say that very low buffer sizes (close to the batch size) may hurt training performance, because they reduce the number of stored transitions (and thus learning points). If you give a student only one lesson, even a very good student will not learn as much as one given dozens of lessons; the same applies to DQN models.

5.1.3 - Influence of the Buffer Capacity on the score and speed of convergence¶

In [23]:
fig, axes = plt.subplots(1, 2, figsize=(20,7))

axes[0].set_title("Evolution of the score depending on the buffer size")
sns.lineplot(x=df['Buffer Capacity'], y=df['score'], ax=axes[0])
axes[1].set_title("Steps until convergence depending on the buffer size")
sns.lineplot(x=df['Buffer Capacity'], y=df['episode_takens'], ax=axes[1]);
Observations¶

These graphs show the "mean score" at the end of the experiment and the number of episodes until convergence, depending on the buffer capacity. We can clearly see on the left that low buffer capacities lead to difficulties in reaching a perfect score.

5.2 - Influence of Update target every¶

Question 7: UPDATE_TARGET_EVERY¶

Craft an experiment and study the influence of the UPDATE_TARGET_EVERY on the learning process (speed of convergence, training curves...)

In [24]:
previous_value = UPDATE_TARGET_EVERY

nbr_values = 7
nbr_each = 3
results = []
with tqdm(total=nbr_values*nbr_each, desc="Compute the influence", colour="green") as pbar:
    for i in np.linspace(3, 200, nbr_values):
        UPDATE_TARGET_EVERY = int(i)
        for _ in range(nbr_each):
            results.append({
                "Update every value": UPDATE_TARGET_EVERY,
                **run_one()
            })

            pbar.update()
df = pd.DataFrame.from_records(results)

UPDATE_TARGET_EVERY = previous_value
Compute the influence: 100%|██████████| 21/21 [04:00<00:00, 11.44s/it]
In [25]:
df.head()
Out[25]:
Update every value episode_takens score losses rewards
0 3 110 200.0 [inf, inf, inf, inf, inf, inf, inf, inf, inf, ... [9.6, 9.4, 9.2, 9.0, 9.6, 10.4, 12.0, 19.6, 19...
1 3 47 200.0 [inf, inf, inf, inf, inf, inf, inf, inf, inf, ... [9.2, 10.0, 9.8, 9.4, 9.8, 10.2, 9.4, 16.6, 11...
2 3 35 200.0 [inf, inf, inf, inf, inf, inf, inf, inf, inf, ... [8.8, 9.0, 9.0, 9.2, 10.4, 9.4, 10.2, 9.8, 14....
3 35 248 200.0 [inf, inf, inf, inf, inf, inf, inf, inf, inf, ... [9.2, 9.0, 9.4, 9.2, 9.4, 9.0, 10.6, 10.4, 10....
4 35 206 200.0 [inf, inf, inf, inf, inf, inf, inf, inf, inf, ... [9.2, 9.6, 9.6, 8.8, 54.2, 9.4, 11.0, 13.6, 13...

5.2.1 - Influence of the Update target every on the loss¶

In [26]:
new_df = []
for _, row in df.iterrows():
    x = pd.DataFrame({"losses":row.losses}).reset_index()
    x['update_every'] = str(row['Update every value'])
    x['losses']=x['losses'].astype(float)
    new_df.append(x)
new_df = pd.concat(new_df, ignore_index=True)
new_df.head(3)
Out[26]:
index losses update_every
0 0 inf 3
1 1 inf 3
2 2 inf 3
In [27]:
plt.figure(figsize=(14, 14))
sns.scatterplot(data=new_df, x="index", y="losses", hue="update_every", alpha=0.5, s=5)
plt.yscale('log')
plt.title("Losses over time for different values of network update frequency")
Out[27]:
Text(0.5, 1.0, 'Losses over time for different values of network update frequency')
Observations¶

With lower values of update_target_every, we observe that the loss is higher.

The model learns more with lower values of update_target_every, as the target network's parameters are updated more frequently.

5.2.2 - Influence of the Update target every on the rewards¶

In [28]:
new_df = []
for _, row in df.iterrows():
    x = pd.DataFrame({"rewards":row.rewards}).reset_index()
    x['update_every'] = str(row['Update every value'])
    x['rewards']=x['rewards'].astype(float)
    new_df.append(x)
new_df = pd.concat(new_df, ignore_index=True)
new_df.head(3)
Out[28]:
index rewards update_every
0 0 9.6 3
1 1 9.4 3
2 2 9.2 3
In [29]:
plt.figure(figsize=(14, 10))
sns.lineplot(data=new_df, x="index", y="rewards", hue="update_every")
plt.yscale('log')
plt.title('Reward over time for different values of the network update frequency')
Out[29]:
Text(0.5, 1.0, 'Reward over time for different values of the network update frequency')
Observations¶

On this graph we observe that with low values of update_target_every, the reward is higher. Only the blue and orange lines reach rewards of the order of 10². Furthermore, the blue line (update_every = 3) reaches a reward of 200 much more quickly than the orange line (update_every = 35). For higher values, the algorithm fails to learn the optimal parameters properly within a short amount of time.

5.2.3 - Influence of the Update target every on the score and speed of convergence¶

In [30]:
fig, axes = plt.subplots(1, 2, figsize=(20,7))

axes[0].set_title("Evolution of the score depending on the update every")
sns.lineplot(x=df['Update every value'], y=df['score'], ax=axes[0])
axes[1].set_title("Steps until convergence depending on the update every")
sns.lineplot(x=df['Update every value'], y=df['episode_takens'], ax=axes[1]);
Observations¶

On the left, we see that we can reach an optimal score of 200 with low values of update_every.

On the right, we see that increasing update_every decreases the speed of convergence of the algorithm.

We can conclude that it is better to use low values of update_every (~5) to get both a good score and a good speed of convergence.

5.3 - Influence of the Batch size¶

Question 8¶

If you have the computing power to do so, try a grid search on those two hyper-parameters and comment on the results. Otherwise, study the influence of another hyper-parameter.

In [18]:
previous_value = BATCH_SIZE

nbr_values = 7
nbr_each = 3
results_batch_size = []
with tqdm(total=nbr_values*nbr_each, desc="Compute the influence", colour="green") as pbar:
    for i in np.linspace(10, 1000, nbr_values):
        BATCH_SIZE = int(i)
        for _ in range(nbr_each):
            results_batch_size.append({
                "BATCH_SIZE": BATCH_SIZE,
                **run_one()
            })
            pbar.update()
            
BATCH_SIZE = previous_value
Compute the influence: 100%|█████████████████████████████████████████████████████████| 21/21 [02:36<00:00,  7.44s/it]

5.3.1 - Influence of the Batch size on the loss¶

In [19]:
df_batch_size = pd.DataFrame.from_records(results_batch_size)

new_df_batch_size = []
for _,row in df_batch_size.iterrows():
    x = pd.DataFrame({'losses' : row.losses}).reset_index()
    x['batch_size'] = str(row['BATCH_SIZE'])
    x['losses']=x['losses'].astype(float)
    new_df_batch_size.append(x)
    
new_df_batch_size = pd.concat(new_df_batch_size, ignore_index=True)
new_df_batch_size.head(3)
Out[19]:
index losses batch_size
0 0 inf 10
1 1 inf 10
2 2 inf 10
In [20]:
plt.figure(figsize=(14, 14))
sns.scatterplot(data=new_df_batch_size, x="index", y="losses", hue="batch_size", alpha=0.5, s=5)
plt.yscale('log')
plt.title("Losses over time for different values of batch_size")
Out[20]:
Text(0.5, 1.0, 'Losses over time for different values of batch_size')
Observations¶

On the graph, we observe that for low values of the batch size, the loss is higher. We can interpret this as the model learning more with a small batch size.

5.3.2 - Influence of the Batch size on the rewards¶

In [21]:
new_df_batch_size = []
for _, row in df_batch_size.iterrows():
    x = pd.DataFrame({"rewards":row.rewards}).reset_index()
    x['batch_size'] = str(row['BATCH_SIZE'])
    x['rewards']=x['rewards'].astype(float)
    new_df_batch_size.append(x)
new_df_batch_size = pd.concat(new_df_batch_size, ignore_index=True)
new_df_batch_size.head(3)
Out[21]:
index rewards batch_size
0 0 9.0 10
1 1 9.0 10
2 2 10.2 10
In [35]:
plt.figure(figsize=(14, 10))
sns.lineplot(data=new_df_batch_size, x="index", y="rewards", hue="batch_size")
plt.yscale('log')
plt.title('Reward over time for different values of the batch size')
Out[35]:
Text(0.5, 1.0, 'Reward over time for different values of the batch size')
Observations¶

This graph shows that the batch size does not have a strong influence on the reward. However, we have the feeling that for very large batch sizes, the algorithm takes more time to converge.

5.3.3 - Influence of the Batch size on the score and speed of convergence¶

In [22]:
fig, axes = plt.subplots(1, 2, figsize=(20,7))

axes[0].set_title("Evolution of the score depending on the batch size")
axes[0].set_ylim(0,205)
sns.lineplot(x=df_batch_size['BATCH_SIZE'], y=df_batch_size['score'], ax=axes[0])
axes[1].set_title("Steps until convergence depending on the batch size")
sns.lineplot(x=df_batch_size['BATCH_SIZE'], y=df_batch_size['episode_takens'], ax=axes[1]);
Observations¶

We observe on the two graphs that using a smaller batch size is better for getting both a good score and a good speed of convergence. In fact, smaller batch sizes allow the model to start learning before having seen a lot of data. On the right, the graph peaks for a batch size of 800; however, for this same batch size, we observe on the left graph that the score is very low.

Map of the results¶

We decided to display the actions taken depending on the pole angle and the pole angular velocity (we noticed in the discrete approach that these were the most important parameters, which seems quite understandable).

| Num | Observation           | Min                  | Max                |
|-----|-----------------------|----------------------|--------------------|
| 0   | Cart Position         | -4.8*                 | 4.8*                |
| 1   | Cart Velocity         | -Inf                 | Inf                |
| 2   | Pole Angle            | ~ -0.418 rad (-24°)** | ~ 0.418 rad (24°)** |
| 3   | Pole Angular Velocity | -Inf                 | Inf                |
In [31]:
import math


OBS_1 = 2
OBS_2 = 3
ACTION_TO_TAKE = 0
OBSERVATIONS = ['Cart Position', 'Cart Velocity', 'Pole Angle', 'Pole Angular Velocity']

nX = 100
nY = 100
In [32]:
x_name = OBSERVATIONS[OBS_1]
y_name = OBSERVATIONS[OBS_2]

x_values = torch.tensor(
    np.linspace(env.observation_space.low[OBS_1], env.observation_space.high[OBS_1], nX)
)
y_values = torch.tensor(
    np.linspace(-math.radians(50), math.radians(50), nY)
)
In [33]:
import ipywidgets as widgets
In [34]:
obs_0 = widgets.FloatSlider(min=-0.418, max=0.418, step=0.01, value=0.)
obs_0
FloatSlider(value=0.0, max=0.418, min=-0.418, step=0.01)
In [35]:
obs_1 = widgets.FloatSlider(min=-0.418, max=0.418, step=0.01, value=0.)
obs_1
FloatSlider(value=0.0, max=0.418, min=-0.418, step=0.01)
In [47]:
values = torch.zeros((2,nX, nY))
actions_to_take = torch.zeros((nX, nY))
print(f"OBS 0: {obs_0.value}")
print(f"OBS 1: {obs_1.value}")
with torch.no_grad():
    for i,x in enumerate(x_values):
        for j,y in enumerate(y_values):

            q = q_net(torch.FloatTensor([[obs_0.value, obs_1.value,x, y]]))
            values[:,i,j] = q[0]
            actions_to_take[i,j] = q[0].argmax()

fig, axes = plt.subplots(1, 2, figsize=(20, 10))
for action in range(2):
    sns.heatmap(values[action], ax=axes[action] ,cmap="seismic", vmin=-15, vmax=15);
    axes[action].set_title(f"Q value for action {action}")
    axes[action].set_aspect('equal', 'box')
    axes[action].set_xlabel(x_name)
    axes[action].set_ylabel(y_name)
OBS 0: 0.0
OBS 1: 0.0

If we display the Q values for both actions, we do not see a lot of difference; that is why the graph below shows the difference between the Q values of the two actions.

In [49]:
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(values[0] - values[1], ax=ax,cmap="seismic", vmin=-1, vmax=1);
ax.set_title(f"Q value difference")
ax.set_aspect('equal', 'box')
ax.set_xlabel(x_name)
ax.set_ylabel(y_name);

This graph shows the difference (basically which action we take depending on the pole angle and the pole angular velocity); we had hoped for a cleaner heatmap. The separation is almost linear because of the shallow depth of the neural network. It seems quite logical: for low angular velocities, the model tends to hesitate between left and right. When the angle is clearly positive (pole leaning to the right), we want to push to the right to reduce the angle, and the same goes for the left side.

In [45]:
plt.title("Action to take based on the position and ")

sns.heatmap(actions_to_take);
plt.xlabel(x_name)
plt.ylabel(y_name);

It is the same graph, but we only look at the sign of the difference (i.e. the action taken, which amounts to the same thing). Nothing more to add here.


Discretization¶

See the other notebook