diff --git "a/Aprendizado por Refor\303\247o Profundo/Actor-Critic/A2C/A2C.ipynb" "b/Aprendizado por Refor\303\247o Profundo/Actor-Critic/A2C/A2C.ipynb" index f0e7cd3..9b7f2a4 100644 --- "a/Aprendizado por Refor\303\247o Profundo/Actor-Critic/A2C/A2C.ipynb" +++ "b/Aprendizado por Refor\303\247o Profundo/Actor-Critic/A2C/A2C.ipynb" @@ -229,7 +229,7 @@ "id": "1ZiSoaCSnX-3" }, "source": [ - "## Experience Replay" + "## Memory Buffer" ] }, { @@ -242,8 +242,8 @@ "source": [ "import numpy as np\n", "\n", - "class ExperienceReplay:\n", - " \"\"\"Experience Replay Buffer para A2C.\"\"\"\n", + "class MemoryBuffer:\n", + " \"\"\"Memory Buffer Buffer para A2C.\"\"\"\n", " def __init__(self, max_length, observation_space):\n", " \"\"\"Cria um Replay Buffer.\n", "\n", @@ -286,7 +286,7 @@ " self.dones[self.length] = dones\n", " self.length += 1\n", "\n", - " def sample(self):\n", + " def get_batch(self):\n", " \"\"\"Retorna um batch de experiências.\n", " \n", " Parâmetros\n", @@ -340,7 +340,7 @@ " self.entropy_coef = entropy_coef\n", "\n", " self.n_steps = n_steps\n", - " self.memory = ExperienceReplay(n_steps, observation_space.shape[0])\n", + " self.memory = MemoryBuffer(n_steps, observation_space.shape[0])\n", "\n", " self.actor = Actor(observation_space.shape[0], action_space.n).to(self.device)\n", " self.critic = Critic(observation_space.shape[0]).to(self.device)\n", @@ -380,7 +380,7 @@ " if self.memory.length < self.n_steps:\n", " return\n", "\n", - " (states, actions, rewards, next_states, dones) = self.memory.sample()\n", + " (states, actions, rewards, next_states, dones) = self.memory.get_batch()\n", "\n", " states = torch.FloatTensor(states).to(self.device)\n", " actions = torch.FloatTensor(actions).to(self.device)\n", @@ -664,7 +664,7 @@ " self.entropy_coef = entropy_coef\n", "\n", " self.n_steps = n_steps\n", - " self.memory = ExperienceReplay(n_steps, observation_space.shape[0])\n", + " self.memory = MemoryBuffer(n_steps, observation_space.shape[0])\n", "\n", " self.actorcritic = ActorCritic(observation_space.shape[0], action_space.n).to(self.device)\n", " self.actorcritic_optimizer = optim.Adam(self.actorcritic.parameters(), lr=lr)\n", @@ -702,7 +702,7 @@ " if self.memory.length < self.n_steps:\n", " return\n", "\n", - " (states, actions, rewards, next_states, dones) = self.memory.sample()\n", + " (states, actions, rewards, next_states, dones) = self.memory.get_batch()\n", "\n", " states = torch.FloatTensor(states).to(self.device)\n", " actions = torch.FloatTensor(actions).to(self.device)\n", @@ -869,7 +869,7 @@ "source": [ "import numpy as np\n", "\n", - "class MultipleExperienceReplay:\n", + "class MultipleMemoryBuffer:\n", " def __init__(self, max_length, env_num, observation_space):\n", " self.length = 0\n", " self.max_length = max_length\n", @@ -888,7 +888,7 @@ " self.dones[self.length] = dones\n", " self.length += 1\n", "\n", - " def sample(self):\n", + " def get_batch(self):\n", " self.length = 0\n", "\n", " return (self.states, self.actions, self.rewards, self.next_states, self.dones)" @@ -944,7 +944,7 @@ " self.entropy_coef = entropy_coef\n", "\n", " self.n_steps = n_steps\n", - " self.memory = MultipleExperienceReplay(n_steps, env_num, observation_space.shape[0])\n", + " self.memory = MultipleMemoryBuffer(n_steps, env_num, observation_space.shape[0])\n", "\n", " self.actorcritic = ActorCritic(observation_space.shape[0], action_space.n).to(self.device)\n", " self.actorcritic_optimizer = optim.Adam(self.actorcritic.parameters(), lr=lr)\n", @@ -982,7 +982,7 @@ " if 
self.memory.length < self.n_steps:\n", " return\n", "\n", - " (states, actions, rewards, next_states, dones) = self.memory.sample()\n", + " (states, actions, rewards, next_states, dones) = self.memory.get_batch()\n", "\n", " states = torch.FloatTensor(states).to(self.device)\n", " actions = torch.FloatTensor(actions).to(self.device)\n", diff --git "a/Aprendizado por Refor\303\247o Profundo/Actor-Critic/PPO/PPO.ipynb" "b/Aprendizado por Refor\303\247o Profundo/Actor-Critic/PPO/PPO.ipynb" new file mode 100644 index 0000000..94f2a6f --- /dev/null +++ "b/Aprendizado por Refor\303\247o Profundo/Actor-Critic/PPO/PPO.ipynb" @@ -0,0 +1,442 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Proximal Policy Optimization (PPO)\n", + "\n", + "Como vimos na aula de A2C, uma função objetivo muito utilizada é:\n", + "\n", + "$$\n", + " J(\\theta) = \\mathbb{E}_{s,a\\sim\\pi_\\theta} [A^{\\pi_\\theta}_w(s,a)], \\qquad\n", + " \\nabla_\\theta J(\\theta) = \\mathbb{E}_{s,a\\sim\\pi_\\theta} [\\nabla_\\theta \\log \\pi_\\theta(a|s)\\cdot A^{\\pi_\\theta}_w(s,a)].\n", + "$$\n", + "\n", + "Os índices na função _advantage_ $A$ indicam que $A$ depende tanto dos pesos $w$ utilizados para calcular a estimativa de valor de cada estado, quanto da política $\\pi_\\theta$, que determina quais trajetórias o agente vai seguir dentro do ambiente.\n", + "\n", + "> Obs: pode-se mostrar que essa formulação é equivalente à formulação que utiliza somatórias no tempo:\n", + "$$\n", + " J(\\theta) = \\mathbb{E}_{(s_0,a_0,\\dots)\\sim\\pi_\\theta} \\left[\\sum_{t=0}^\\infty \\gamma^t A^{\\pi_\\theta}_w(s_t,a_t)\\right], \\qquad\n", + " \\nabla_\\theta J(\\theta) = \\mathbb{E}_{(s_0,a_0,\\dots)\\sim\\pi_\\theta} \\left[\\sum_{t=0}^\\infty \\nabla_\\theta \\log \\pi_\\theta(a_t|s_t)\\cdot A^{\\pi_\\theta}_w(s_t,a_t)\\right].\n", + "$$\n", + "\n", + "Note que uma pequena variação no espaço de parâmetros ($\\Delta\\theta = \\alpha\\nabla_\\theta J$) pode causar uma grande variação no espaço de políticas. Isso significa que, em geral, a taxa de aprendizado $\\alpha$ não pode ser muito alta; caso contrário, corremos o risco de obter uma nova política que não funcione. Consequentemente, a eficiência amostral de A2C também é limitada.\n", + "\n", + "\n", + "## Trust Region Policy Optimization (TRPO)\n", + "\n", + "Uma maneira de resolver esse problema é limitar as variações na política. Para isso, vamos utilizar a divergência KL $KL(\\pi_1 || \\pi_2)$, que pode ser, simplificadamente, encarada como uma medida da diferença entre duas políticas (ou, em geral, duas distribuições de probabilidade).\n", + "\n", + "TRPO define uma região de confiança (trust region) para garantir que a política nova não se distancie demais da política antiga:\n", + "$$E_{s\\sim\\pi_{\\theta_{\\mathrm{old}}}}\\bigl[KL\\bigl(\\pi_{\\mathrm{old}}(\\cdot|s)\\,||\\,\\pi(\\cdot|s)\\bigr)\\bigr] \\le \\delta.$$\n", + "\n", + "No entanto, maximizar a função objetivo de A2C sujeito a essas restrições é um pouco complicado. 
Então, vamos utilizar uma aproximação da função objetivo de A2C:\n", + "\n", + "$$L(\\theta_{\\mathrm{old}},\\theta) = E_{s,a\\sim\\pi_{\\theta_{\\mathrm{old}}}} \\left[\\frac{\\pi_\\theta(a|s)}{\\pi_{\\theta_{\\mathrm{old}}}(a|s)} A^{\\pi_{\\theta_{\\mathrm{old}}}}(s,a)\\right].$$\n", + "\n", + "Ou seja, TRPO consiste em:\n", + "$$\\text{maximizar } E_{s,a\\sim\\pi_{\\theta_{\\mathrm{old}}}} \\left[\\frac{\\pi_\\theta(a|s)}{\\pi_{\\theta_{\\mathrm{old}}}(a|s)} A^{\\pi_{\\theta_{\\mathrm{old}}}}(s,a)\\right] \\text{ sujeito a } E_{s\\sim\\pi_{\\theta_{\\mathrm{old}}}}\\bigl[KL\\bigl(\\pi_{\\mathrm{old}}(\\cdot|s)\\,||\\,\\pi(\\cdot|s)\\bigr)\\bigr] \\le \\delta.$$\n", + "\n", + "> Para entender por que $L(\\theta_{\\mathrm{old}},\\theta)$ é uma aproximação de $J(\\theta)$, podemos fazer:\n", + "\\begin{align*}\n", + "J(\\theta) &= E_{\\pi_\\theta}[A^{\\pi_\\theta}(s,a)] \\\\\n", + " &\\approx E_{\\pi_\\theta}[A^{\\pi_{\\theta_{\\mathrm{old}}}}(s,a)] \\\\\n", + "\t\t&= \\sum_{s,a} \\rho_{\\pi_\\theta}(s)\\cdot \\pi_\\theta(a|s) \\cdot A^{\\pi_{\\theta_{\\mathrm{old}}}}(s,a) \\\\\n", + "\t\t&= \\sum_{s,a} \\rho_{\\pi_\\theta}(s)\\cdot \\pi_{\\theta_{\\mathrm{old}}}(a|s) \\cdot \\frac{\\pi_\\theta(a|s)}{\\pi_{\\theta_{\\mathrm{old}}}(a|s)}A^{\\pi_{\\theta_{\\mathrm{old}}}}(s,a) \\\\\n", + "\t\t&\\approx \\sum_{s,a} \\rho_{\\pi_{\\theta_{\\mathrm{old}}}}(s)\\cdot \\pi_{\\theta_{\\mathrm{old}}}(a|s) \\cdot \\frac{\\pi_\\theta(a|s)}{\\pi_{\\theta_{\\mathrm{old}}}(a|s)}A^{\\pi_{\\theta_{\\mathrm{old}}}}(s,a) \\\\\n", + "\t\t&= E_{\\pi_{\\theta_{\\mathrm{old}}}} \\left[\\frac{\\pi_\\theta(a|s)}{\\pi_{\\theta_{\\mathrm{old}}}(a|s)} A^{\\pi_{\\theta_{\\mathrm{old}}}}(s,a)\\right]\n", + "\\end{align*}\n", + "\n", + "\n", + "## Proximal Policy Optimization (PPO)\n", + "\n", + "Como já foi mencionado, a restrição ($KL < \\delta$) imposta em TRPO torna o algoritmo relativamente complicado. PPO é uma tentativa de simplificar esse algoritmo. Ao invés de utilizar trust regions, PPO mexe diretamente com a função objetivo:\n", + "\n", + "$$\n", + " L(\\theta_{\\mathrm{old}},\\theta) = E_{s,a\\sim\\pi_{\\theta_{\\mathrm{old}}}} \\Bigl[\\min\\left(r A^{\\pi_{\\theta_{\\mathrm{old}}}}(s,a),\\, \\operatorname{clip}(r,1-\\varepsilon,1+\\varepsilon) A^{\\pi_{\\theta_{\\mathrm{old}}}}(s,a)\\right)\\Bigr],\n", + " \\quad\n", + " r = \\frac{\\pi_\\theta(a|s)}{\\pi_{\\theta_{\\mathrm{old}}}(a|s)}.\n", + "$$\n", + "Essa função pode ser reescrita como:\n", + "$$\n", + " L(\\theta_{\\mathrm{old}},\\theta) = E_{s,a\\sim\\pi_{\\theta_{\\mathrm{old}}}} \\Bigl[\\min\\left(r A^{\\pi_{\\theta_{\\mathrm{old}}}}(s,a),\\, g(\\varepsilon, A^{\\pi_{\\theta_{\\mathrm{old}}}}(s,a))\\right)\\Bigr],\n", + " \\quad\n", + " g(\\varepsilon, A) = \\begin{cases}\n", + " (1+\\varepsilon) A, & A \\ge 0 \\\\\n", + " (1-\\varepsilon) A, & A < 0.\n", + " \\end{cases}\n", + "$$\n", + "\n", + "Nota-se que:\n", + "- Quando a vantagem é positiva, se $r$ aumentar, então $L$ aumenta. No entanto, esse benefício é limitado pelo clip: se $r > 1+\\varepsilon$, não há mais benefício para $r$ aumentar.\n", + "- Quando a vantagem é negativa, se $r$ diminuir, então $L$ aumenta. No entanto, esse benefício é limitado pelo clip: se $r < 1-\\varepsilon$, não há mais benefício para $r$ diminuir.\n", + "\n", + "A seguinte imagem pode te ajudar a visualizar o clip. 
Note que, fora do intervalo estipulado pelo clip, os valores permanecem constantes:\n", + "\n", + "![imagem ilustrando o clip](imgs/clip.png)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Rede Dividida" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "from torch.distributions import Categorical\n", + "\n", + "class ActorCritic(nn.Module):\n", + "    def __init__(self, observation_shape, action_shape):\n", + "        super(ActorCritic, self).__init__()\n", + "        self.policy1 = nn.Linear(observation_shape, 64)\n", + "        self.policy2 = nn.Linear(64, 64)\n", + "        self.policy3 = nn.Linear(64, action_shape)\n", + "        \n", + "        self.value1 = nn.Linear(observation_shape, 64)\n", + "        self.value2 = nn.Linear(64, 64)\n", + "        self.value3 = nn.Linear(64, 1)\n", + "\n", + "    def forward(self, state):\n", + "        dists = torch.relu(self.policy1(state))\n", + "        dists = torch.relu(self.policy2(dists))\n", + "        dists = F.softmax(self.policy3(dists), dim=-1)\n", + "        probs = Categorical(dists)\n", + "        \n", + "        v = torch.relu(self.value1(state))\n", + "        v = torch.relu(self.value2(v))\n", + "        v = self.value3(v)\n", + "\n", + "        return probs, v" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Memory Buffer" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "class MemoryBuffer:\n", + "    \"\"\"Memory Buffer para PPO.\"\"\"\n", + "    def __init__(self, max_length, observation_space):\n", + "        \"\"\"Cria um Memory Buffer.\n", + "\n", + "        Parâmetros\n", + "        ----------\n", + "        max_length: int\n", + "            Tamanho máximo do Memory Buffer.\n", + "        observation_space: int\n", + "            Tamanho do espaço de observação.\n", + "        \"\"\"\n", + "        self.length = 0\n", + "        self.max_length = max_length\n", + "\n", + "        self.states = np.zeros((max_length, observation_space), dtype=np.float32)\n", + "        self.actions = np.zeros((max_length), dtype=np.int32)\n", + "        self.rewards = np.zeros((max_length), dtype=np.float32)\n", + "        self.next_states = np.zeros((max_length, observation_space), dtype=np.float32)\n", + "        self.dones = np.zeros((max_length), dtype=np.float32)\n", + "        self.logp = np.zeros((max_length), dtype=np.float32)\n", + "\n", + "    def update(self, states, actions, rewards, next_states, dones, logp):\n", + "        \"\"\"Adiciona uma experiência ao Memory Buffer.\n", + "\n", + "        Parâmetros\n", + "        ----------\n", + "        states: np.array\n", + "            Estado da transição.\n", + "        actions: int\n", + "            Ação tomada.\n", + "        rewards: float\n", + "            Recompensa recebida.\n", + "        next_states: np.array\n", + "            Estado seguinte.\n", + "        dones: int\n", + "            Flag indicando se o episódio acabou.\n", + "        logp: float\n", + "            Log da probabilidade de acordo com a política.\n", + "        \"\"\"\n", + "        self.states[self.length] = states\n", + "        self.actions[self.length] = actions\n", + "        self.rewards[self.length] = rewards\n", + "        self.next_states[self.length] = next_states\n", + "        self.dones[self.length] = dones\n", + "        self.logp[self.length] = logp\n", + "        self.length += 1\n", + "\n", + "    def get_batch(self):\n", + "        \"\"\"Retorna todas as experiências armazenadas e reinicia o buffer.\n", + "\n", + "        Retorna\n", + "        -------\n", + "        states: np.array\n", + "            Batch de estados.\n", + "        actions: np.array\n", + "            Batch de ações.\n", + "        rewards: np.array\n", + "            Batch de 
recompensas.\n", + " next_states: np.array\n", + " Batch de estados seguintes.\n", + " dones: np.array\n", + " Batch de flags indicando se o episódio acabou.\n", + " logp: np.array\n", + " Batch do log da probabilidade de acordo com a política.\n", + " \"\"\"\n", + " self.length = 0\n", + "\n", + " return (self.states, self.actions, self.rewards, self.next_states, self.dones, self.logp)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.optim as optim\n", + "\n", + "class PPO:\n", + " def __init__(self, observation_space, action_space, lr=7e-4, gamma=0.99, lam=0.95,\n", + " vf_coef=0.5, entropy_coef=0.005, clip_param=0.2, epochs=10, memory_len=16):\n", + " self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "\n", + " self.gamma = gamma\n", + " self.lam = lam\n", + " self.vf_coef = vf_coef\n", + " self.entropy_coef = entropy_coef\n", + " self.clip_param = clip_param\n", + " self.epochs = epochs\n", + "\n", + " self.memory_len = memory_len\n", + " self.memory = MemoryBuffer(memory_len, observation_space.shape[0])\n", + "\n", + " self.actorcritic = ActorCritic(observation_space.shape[0], action_space.n).to(self.device)\n", + " self.actorcritic_optimizer = optim.Adam(self.actorcritic.parameters(), lr=lr)\n", + "\n", + " def act(self, state):\n", + " state = torch.FloatTensor(state).to(self.device).unsqueeze(0)\n", + " probs, v = self.actorcritic.forward(state)\n", + " action = probs.sample()\n", + " log_prob = probs.log_prob(action)\n", + " return action.cpu().detach().item(), log_prob.detach().cpu().numpy()\n", + "\n", + " def remember(self, state, action, reward, next_state, done, logp):\n", + " self.memory.update(state, action, reward, next_state, done, logp)\n", + "\n", + " def compute_gae(self, rewards, dones, v, v2):\n", + " T = len(rewards)\n", + "\n", + " returns = torch.zeros_like(rewards)\n", + " gaes = torch.zeros_like(rewards)\n", + " \n", + " future_gae = torch.tensor(0.0, dtype=rewards.dtype)\n", + " next_return = torch.tensor(v2[-1], dtype=rewards.dtype)\n", + "\n", + " not_dones = 1 - dones\n", + " deltas = rewards + not_dones * self.gamma * v2 - v\n", + "\n", + " for t in reversed(range(T)):\n", + " returns[t] = next_return = rewards[t] + self.gamma * not_dones[t] * next_return\n", + " gaes[t] = future_gae = deltas[t] + self.gamma * self.lam * not_dones[t] * future_gae\n", + "\n", + " gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8) # Normalização\n", + "\n", + " return gaes, returns\n", + "\n", + " def train(self):\n", + " if self.memory.length < self.memory_len:\n", + " return\n", + "\n", + " (states, actions, rewards, next_states, dones, old_logp) = self.memory.get_batch()\n", + "\n", + " states = torch.FloatTensor(states).to(self.device)\n", + " actions = torch.FloatTensor(actions).to(self.device)\n", + " rewards = torch.FloatTensor(rewards).unsqueeze(-1).to(self.device)\n", + " next_states = torch.FloatTensor(next_states).to(self.device)\n", + " dones = torch.FloatTensor(dones).unsqueeze(-1).to(self.device)\n", + " old_logp = torch.FloatTensor(old_logp).to(self.device)\n", + " \n", + " with torch.no_grad():\n", + " _, v = self.actorcritic.forward(states)\n", + " _, v2 = self.actorcritic.forward(next_states)\n", + " \n", + " advantages, returns = self.compute_gae(rewards, dones, v, v2)\n", + " \n", + " for epoch in range(self.epochs):\n", + " \n", + " probs, v = self.actorcritic.forward(states)\n", + "\n", + " new_logp = 
probs.log_prob(actions)\n", + "\n", + " #Equações principais do algoritmo\n", + " ratio = (new_logp.unsqueeze(-1) - old_logp.unsqueeze(-1)).exp() \n", + " surr1 = ratio * advantages.detach()\n", + " surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * advantages.detach()\n", + "\n", + " entropy = probs.entropy().mean()\n", + "\n", + " policy_loss = - torch.min(surr1,surr2).mean()\n", + " value_loss = self.vf_coef * F.mse_loss(v, returns.detach())\n", + " entropy_loss = -self.entropy_coef * entropy\n", + "\n", + " self.actorcritic_optimizer.zero_grad()\n", + " (policy_loss + entropy_loss + value_loss).backward()\n", + " self.actorcritic_optimizer.step()\n", + "\n", + " return policy_loss + entropy_loss + value_loss" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Treinando" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import math\n", + "from collections import deque\n", + "\n", + "def train(agent, env, total_timesteps):\n", + " total_reward = 0\n", + " episode_returns = deque(maxlen=20)\n", + " avg_returns = []\n", + "\n", + " state = env.reset()\n", + " timestep = 0\n", + " episode = 0\n", + "\n", + " while timestep < total_timesteps:\n", + " action, log_prob = agent.act(state)\n", + " next_state, reward, done, _ = env.step(action)\n", + " agent.remember(state, action, reward, next_state, done, log_prob)\n", + " loss = agent.train()\n", + " timestep += 1\n", + "\n", + " total_reward += reward\n", + "\n", + " if done:\n", + " episode_returns.append(total_reward)\n", + " episode += 1\n", + " next_state = env.reset()\n", + "\n", + " if episode_returns:\n", + " avg_returns.append(np.mean(episode_returns))\n", + "\n", + " total_reward *= 1 - done\n", + " state = next_state\n", + "\n", + " ratio = math.ceil(100 * timestep / total_timesteps)\n", + "\n", + " avg_return = avg_returns[-1] if avg_returns else np.nan\n", + " \n", + " print(f\"\\r[{ratio:3d}%] timestep = {timestep}/{total_timesteps}, episode = {episode:3d}, avg_return = {avg_return:10.4f}\", end=\"\")\n", + "\n", + " return avg_returns" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[100%] timestep = 75000/75000, episode = 295, avg_return = 256.9000" + ] + } + ], + "source": [ + "import gym\n", + "\n", + "env = gym.make(\"CartPole-v1\")\n", + "agente = PPO(env.observation_space, env.action_space)\n", + "returns = train(agente, env, 75000)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": "
", + "image/svg+xml": "\n\n\n \n \n \n \n 2021-06-25T01:15:37.844050\n image/svg+xml\n \n \n Matplotlib v3.4.2, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAoCklEQVR4nO3de7xUVf3/8dfHczgHEOUmEQIKKmlqikZ+MfOSZioa2kXT1J+Zial5iW8Xb5Vfy7yEIvTtq5KXsLymeUm0VLyngqChoqGoWCAIyFW5nHNg/f747N0McC5z5rb3zHk/H495rD179sz+HGb4zJq11l7LQgiIiEh12SzpAEREpPiU3EVEqpCSu4hIFVJyFxGpQkruIiJVqDbpAAC22mqrMGjQoKTDEBGpKNOnT18cQujT3GOpSO6DBg1i2rRpSYchIlJRzOy9lh5Ts4yISBVSchcRqUJK7iIiVUjJXUSkCim5i4hUISV3EZEqpOQuIlKFUjHOXUQ6kKeegsmTM/f79oUzzgCz5GKqQkruIlJeP/4xTJ3qyTxeT+Lww0FXqReVkrtI0pYtgzFjYPXqTR/78pfhkEPKHlJJvPMOzJgB8+bBN78Jd9wBd98NRx8NH32UdHRVR8ldJGmPPgqXXgpdu8JmWd1gq1fD009XT3I//nh44QXf7t/fy65dvTzhBLjnHth++2Riq0LqUBVJWlxrnTkTVq7M3I4+Gt5+G26+Odn4imXZMv8lMmMGXHaZ7/uv/4KRI32f5pcqKiV3kXJbvhzGjoVTT4VddoELL/T9cS02tt9+sHQpnH12+WMshdWr4ZOfhN12g7o639e7N1x9tW+vXZtcbFVIzTIi5XbffTB6dOb+yJHeHNFno5lbTz8d3n0X/vd/yxpe0f3+9/DYY/DBB9C586aP19d7qeReVEruIuW2apWX774LAwdCTU3Lx9bVQUNDeeIqlcsu807UAQPgi1/c9PE4ud9/P3z2s7DnnuWNLxe33AIXXJAZ3dOSAQPgmWcyv0wSpOQuUm5xst5yy9YTO3iSWLfOb20dm1YNDfC1r3mCbE737v4lN2mSD4/8y18KP+d993lndGzLLT05tzfpLljgvzpuvBFWrPBRPi154w34+99hyRJvfkqYkrtIucXJPZdE06WLl1ttBVOmwKc+Vbq4SqWxsfW/ta4O3nsP9t0386umvVat8lFHDQ2wfj2cey4sXuz/fk1N3t5/8MGwzz7te91LLoFrr/XtAw+E3/2u5WNvusmTe0qal5TcpXDPPw+33tq+5xx6KBxxRGniSaslSzwBPPaY388luX/rW/DWW55UZs+uzOTe0ACdOrV+jJkn4hUrPBHHX2q5uOIKuPxyH42T7ZJL4Kc/9S/F4cP93/CTn2zfcMvly2GbbeDxx2HrrVs/Nm5eSkkzmpK7FO43v4E774SePXM7/sMP4be/hV13hWuugYMOKml4qXHXXfCjH/n2dtu1nfDAx4OfeaYnpjVrShtfsT34oI9rX7Eity+ybt38i69rVx/z/rWvtX787bf7LW7G2WMP+MMf/IuipgaGDPH9gwdDjx4wcaJ/RseOzf1vWL3am3Ry+UKI/8aZMzPnTpCSuxSusRF23BFefz234ydPhuuvhz/9Ca67zn/udoR5ReImhw8+8CGAuf7N8QiTp59uO+GlyTnn+FWpnTrBZz7T9vG//CV8/vM+PcH48fDSS60fP2ECfPyxJ9I77/Tk3pxPfMJ/NfXrl3uzz/jxXvNfsQKGDs3tOfFop+9/H446KrfnlJCSuxRu/fr2dfYddJDf+vTxy8+//W0fLlftCb49HanZttrKy3Hj/JdOpVi1ysfyT5iQ2/G77OK3e+6BZ5/1W2vMfIz8WWe1/dpxs09zUzw057nnvBN71ChvQszF/vv7l9OcObkdX2K6iEkKt27dhpfN52rSJC9vucVHGlSz5cszbcLtHbHRu7c3zYB/kVaKtWsz7dDt8cIL3gna1q2xMbfEHuvSxZvGdtvNrwBuK/aBA/0ag1z7hsz8F8q6dbnHVEJK7lK4fIfp7bUXPPCAb//hD22PIa5Ud93lbb5XXOFNLPl8EcadeY2NRQ2tJBYs8E721avzS+6lcvHFftXvq6/66JzW5PvF1KmTOlSlihQyBjvueLr8cjj2WNh99+LFlbQQPIk8/7zfHzfO+ybyURv9V21qSlfCbM4BB8CsWb6dayd7ORxzDGyxhQ+ZvPxyb4MH7/M57DDffvJJH/k1Y4Z3xLZXXZ1PGdHUlHnPEqLkLoUrJLnvtJOPYjjppLZ/KleaG27wNlvwmnshc8TEI2uamgoOq+QWL/amjLPPbv+48lLbaSdfHOTee/3+mjXwyCOZ5D5+vI+++cQnmr+ati3dunn57W/DH/9YlJDzpeQuhVu/Pr+mhti223qZkp+zeZsxwzs843bxl1/2mtwNN+RfY49l19zTrqEBdtjBLxpKm8GDvdkodtJJnswvvRRGjPDRN8OGZX5ttdfZZ/som7lzixNvAdTmLoUr9NL4uIOxEtqTW3PrrT7q5+mn/bZyJXzjG3Diid6/UIg4ub/7bs
FhllxDQyrmVsnJ7rt7M8pFF/mcNlOnbjo7Z3v07u3z76fgmgTV3CV/DQ1+1eTKlbD55vm/TtzkMGtW23Ny9OrloxjSaM0ab34pRQLeYQcv//Y3r1mm0ZQpPuqpkpL76NE+4ubmmzNz3xx3XGGv2bmz/1vMnesTiSVEyV3yd/rpfjk9+E/afHXv7uU557R9bE2NzzDYt2/+5yuVfEdY5CJu/03ziKKjjso0ecQrLVWCTp28byTuHylU/Nk87bTMcN8EKLlL/ubN8xrlpZf6ijr5GjIEnnjCfx635uWX4Re/8GkLFi5Mz0VPL73kk0s99VTpknvc7JXmNvcVK+C734Wf/SzRGmvixo3zpL5kSaJhKLlL/lav9vHXxxxT+GsdcEDbxxxyiP/HWbzYk1wuc7OU2m23+YUuU6b40Lp41EWxmXmndUoukGlWQ4NfdZzWZrNy6dzZm87uv9+nmkjoV6Y6VCU/06d7p2E521a7dvU5uSEdna9Llviiz88/75eoz53b+pSwhaqtTW/Nff36yhiDXy7xF9wllyQWgpK75OfNN708/vjynjeurachucfj8q+9tjgLTLQlrcl9//394iBofhm9jmjsWH+/Erx2Q80ykp84yey7b3nPW8rkPmOGX6LeUgI1gx/+0C9hf+qpTC29R4/CxvnnqqYmXc0yDzyQWfFo33190qxvfSvpqNKhttbH1Cf4ZazkLvmJP7TlvsS6lMl90iRPVnvs0Xxn7SuveEfhfvvBmDHw8MM+KmTXXYsfS0ueeqp852rLmDHe1zB4sHd0779/0hGlS6dOif7CVHKX/CSV3OM23V13hfffL24bb3yF7PTpzSf3wYN9rpgFC3zY4157+dSw5bJypY/MCSEdI4XWrPF5WR5+OOlI0inh5K42d8lPUsn9iCN8VMqSJW0PnWyvhgb/e1pKnH36wEMP+XwpSVyoc+GFXqalaaaSLlZKQm2t/9JKqGkm5+RuZjVm9rKZPRjdH2xmU8xstpndaWZ10f766P7s6PFBJYpdkpRUcu/Tx1fsAU8uDzwA3/lO5nbaaW1P59qSthZy/tOf4Otfh/nzS3vBUkviTss0dCaHkMy/QSWprfU5/MvR2d6M9tTczwGyV1S4AhgbQtgBWAqcEu0/BVga7R8bHSfV5Mc/9su2IZmx5tlz0fz61z7W/LHHfHa/CRN8fpf2jFKYM8eHWD7xROt/z7bb+lwkq1fDtGnlr7XG50t6grXp033hi3/+U6NjWnPddV6uWJHI6XNK7mY2ADgcuCG6b8CBwN3RIROBo6LtI6P7RI8fFB0v1SK+YOe3v/Ul48otO8mtWuWzD/7rX36rr/cRL/37575e5sSJcNllvrBxWxN8HXssfO97PqXruecW8EfkIf7iSTq5z57ttfbvfx9+8pNkY0mz3r29TOiXVq6/qa8BfgxEvwvpDSwLIcSNSXOBeDKJ/sC/AUIITWa2PDp+cTEClhRYs8bXujzjjGTOHzcFDBvmCXy77fz+ZpvBX//qyfr3v/fRLcOHt/16a9f6T+hc1tccMsTHtSehSxcvTz3VR/UkJf5yOfdc2H775OJIu4SnaW4zuZvZEcDCEMJ0MzugWCc2s1HAKIBtttmmWC8rxXD99TB5csuPz5rloySScsABPt58zBi/H6/mFD/20Uee3Pfe20eX7LFH66+XglVzcvLVr/rcLe+/n2wccU00DdM/pFnCC6zk8oneBxhpZiOAzsCWwDigh5nVRrX3AcC86Ph5wEBgrpnVAt2BDzd+0RDCBGACwLBhw1I81V0HNGaMT8zV0sx+W29dujlUctG9u7e1//KXvrjCxku5HXqoL5px7rk+t0dbGhsrI1H16gWHH77hYhNJiGvuGinTurTX3EMI5wPnA0Q19x+GEI43sz8B3wDuAE4C7o+e8kB0//no8cdDSPM8pbKJhgb42td8jus0q69vfrRGbW3mytm1a9t+nbRMQpaLpCcPu/VW/2UHlfNvlpQ4uV97bfn7ZyhsnPtPgNFmNhtvU78x2n8j0DvaPxo4r7AQpeyqYfxyLiNLli+HP//Zm5kqJVElPQXBb34Db73lv4569EgujkoQ95G8+WYiI2ba1dAYQngSeDLafgfYZGhBCGENcHQRYpOkVENyj2v0xxwDb7+d6XTNNm4c/Pznvl3OKQQKUVOTWaM1CU1N3q/x4IPJxVApamr8y/CssxIZ4aQrVCXj+ON9+NaSJZV/ccqgQZlhjVOnNn/MihU+TvvVV+GZZ8oWWkGSbpZpaipsvdyOJsFrE5TcJePpp31hgbPP9qs9K1mnTn5xE7T8H6ux0b/Edt21cpoYkm6WWbeuMkYWpYWSu6RCY6PPeDhuHOy8c9LRFC77StbmVFJHaiwNzTJK7rmLP4MJTEGg5C4ZjY3V9R+3rVpTJf69apapLPE1Fo89VvZTK7lLRqWM985VnNyvuMLb3y+4IDNl7oIF3uZeaX+vmmUqy6c/7Z+9XIbkFpneJcmotuTeoweMGgVvvOEdpi++6JOLHX00nBeN0N1xx0RDbLekmmU+/BCuvtoXJ1dyb5/OnRNJ7qq5C5x4oi/ou2ZNdSV3M7/g5umn/UrWo4/2GSDvu8+v9rzuOrjllqSjbL958/zvKKdJk+BXv/Ka+557lvfcla6+Hp580mcdLSMld/HJtrp183lLqnUNzK5dPSl9+CG88AJ87nM+93tbs0CmzdChXs6eXd7zxjXPWbN8NkjJ3fe+5+WLL5b1tPp9Jd4cc/DBMH580pGU1nnn+UyWIfiXWSUaNszLck8jq8nC8jdypJdlbppRcpfqa2tvTRLzzxdTUjMNarKw/NXW+iinMid3NctIZY737qjizsxy1twXLPClBUGfk3zV1PiorTJ2hqvmLh2r5l7p4vepXMn93//25QVD8HNX+rQUSenRAxYt8s7wgQPLckrV3Duyww/3udBD0M/tShEn9yuvLM/5Fi3yz8fo0W2vMSst+/WvvSxjc5pq7h3Zs8/6bIlf/KJPGibpF9f6XnutPOeLk9GBB8I++5TnnNUogYU7VHPvyJqa4Etf8otTmpsSV9Knc2c4//zytd3GyUgXLhVGyV3KSpNAVaa6On/vypHg46kO9DkpjJK7lJWSe2WKOzVvugmWLi3tueJkpMnCCqPkLmWzfr3flNwrzzbbeHnqqZn1TEtFzTLFEf/7lXFRGCX3jko/tyvX8cfD++9780ypau4hwLRpfgN9Tgo1eLCXd9xRtlPqHeuILrwQpk/3bf2nrUz9+vkUCr/7HWy/vc9+WUyvv+7z78QqZaWqtNp5Zx+88NFHZTulau4d0ZVXwowZMHw47L9/0tFIvi66yJvWHn64+K+9fLmXY8b4Z2WnnYp/jo6mvr6sVxYruXc0TU1+O/10eP55T/BSmX7wAxgypDTrc8Zt7UOHwm67Ff/1O6JOncq6lqqSe0cSQqbNr0uXZGOR4qivL82EVOqTKb66Onj1VVi9uiynU3LvSF5+2RfmAJ8vRCpfXR1MngyvvFLc19UQyOLr2dPLUo9wiii5dxQhwFVX+faDD8IxxyQbj
xRHPAqj2AtoqOZefPH/v7g/o8T0znUUM2fCbbf5dryaj1S+3/3Of+oXexSGxrcX3+ab+7zuZepUVc29o1izxsu77oL+/ZONRYpns81gwIDiJowFC2DuXN9Ws0xxdepUtqtU9bXcUcQfqC22SDYOKb66uuKNwnj9ddhll8z9zTcvzuuKq61VzV2KTD+zq1ddna+UVIyOug8+8PKCC7xvZsiQwl9TMjbbzK8bCKH0pyr5GSQdlNyr1/DhXhuMF/8uRFyrHDHCF3MxKzw+yTDz0U1PP13yUym5dxRK7tXrjDN8Son16wtvz42frxWXSuOGG7xcsqTkp1Jy7wgWLID33vNtJffqFE8DXOgFTaoElNbOO3tZhnZ3vYPV7q234FOfytxXB1l1itfAffll2Hff/F8nTjpK7qUR/yIqw4gZvYPVLu4g+8lP4AtfgF13TTYeKY0+fbzcbz8f9hrX5HO1fj089pjPNwRK7qUS/7uWoebeZrOMmXU2s6lmNsPMZprZ/0T7B5vZFDObbWZ3mlldtL8+uj87enxQif8GaU1cQzjkEDjiCHWQVatvfhOOO86342sa2uPZZ/0zctVV/hnp1au48YmLa+5pSO7AWuDAEMLuwFDgUDMbDlwBjA0h7AAsBU6Jjj8FWBrtHxsdJ0lRB1nHUFMDn/+8b+eTOJYt8/L222HOHPjkJ4sVmWSL/x+eeiqsWlXSU7WZ3IOLr23uFN0CcCBwd7R/InBUtH1kdJ/o8YPMVF1MjNpQO444ceRzQVNc299tt8wyflJ8fftm5safP7+kp8pptIyZ1ZjZP4CFwKPA28CyEELcKzAXiK9p7w/8GyB6fDnQu5nXHGVm08xs2qJFiwr6I6QVGv3QccSdqnPm5P6chQvhhBN8AReAzp2LHpZkMYOf/cy3S9ypmlNyDyGsCyEMBQYAewEFL8sSQpgQQhgWQhjWJ+4MkuJpbIQJE+Cee/y+mmWqXzy1xD77wEMP5fac556DW2/1mQq/9CXNO1QOZepUbVd1LoSwzMyeAPYGephZbVQ7HwDMiw6bBwwE5ppZLdAd+LCIMUsupkyB007z7a5d1YbaEXzlKz5L5Kmn+gLauYibcO6/PzMGW0qrTJ2quYyW6WNmPaLtLsDBwBvAE8A3osNOAu6Pth+I7hM9/ngIZZhIQTYUd9Y88ohfDde3b7LxSOnV18ORR/p2riNm4oue4iYdKb245p6CZpl+wBNm9grwIvBoCOFB4CfAaDObjbep3xgdfyPQO9o/Gjiv+GFLm+L/tD17tn/Ms1SuuM384YfhvvtaP3bZMli82Lf1GSmfuOZ+880lPU2bzTIhhFeAPZrZ/w7e/r7x/jXA0UWJTvIX/9zWf9qOpWtXX0LxoYfg0Uf9S765wWrPPOMXPGU/T8ojvmL8zjvh//6vZKfREIpqs3AhjBoFb7/t9/Vzu2OpqfH3/vLL4aKL4C9/yXwGevfO9L1MnerlpZf6/O29NxnQJqUyeDCceaZfU1BCSu7VZvp07xz7zGdg5EgthN0R1dTAwIG+HbfBN8fMk0z37uWJSzLKsCKTknu1iT8wN98Mn/1ssrFIco4/3mvk8YiMtWt9/Ht2QhkwQIk9KbW1Su7STroiVcBr7xt/ue+/fzKxyKbKsNye5nOvNroiVST9OnVScpd20kRhIukXj2K76KKSnULJvZqsXg0rV/q2au4i6XXyyV6+8UbJTqEMUC2mT/eFkuOauyaAEkmvrbeGoUNL2qmq5F4t/vUv/6CMHu0daVtvnXREItKaEneqKrlXi3XrvDz5ZC2lJ1IJSjzWXW3u1UKjZEQqS4lr7kru1SKuudfUJBuHiOSmthaefBL++c+SvLySe7VQzV2kssQLozz5ZEleXsm90vz8580vqqCau0hluflmnxLi+ONL8vKq5lWaSy5pfr9q7iKVpba2pBP7KRNUuvffhx/9CGbO9PuquYsIapapLHPnbrrvmWfgttt8Wb3DDtO83CICqOZeWT7/+U33xc0xkybBkCHljUdEUks190qydOmm+9SRKiLNUHKvJHEtfYstNt2n5C4iWZTcK0mcyOOZHyFTc9coGRHJouReKULYcB6KELxUzV1EmqHkXiniGnq8CMf69RvuV3IXkSz6LV8Jpk2D11/37fp6n2xo7VqYMgVee833q1lGRLIoI1SCL385M1KmXz946y2491444QTfV18PXbokF5+IpI6aZSrBRx/BqFEwezaceqrv+/BDL++912eV08pLIpJFNfdK0NQEffvC9ttn2tbXrvVyr7206pKIbEI197Rbv95HxsRt6ptFb1lDg5dqaxeRZii5p93Gsz1uXHNXcheRZii5p11byT0eGikikkXJvdyOOAK++tXcj28puU+ZsuF+EZEsygzlNGaMz96Yq913h1de8e26Oi+33RbM4KmnvCM13i8ikkXJvZymT8/92BA8se+3H3zpS3DMMb7/0ENhxQqv0XftqitTRaRZbTbLmNlAM3vCzF43s5lmdk60v5eZPWpmb0Vlz2i/mdl4M5ttZq+Y2Z6l/iMqxpo1XuYyJj2eVuDgg+GnP4VPfCLzWLdu0KOHau0i0qJc2tybgP8OIewMDAfONLOdgfOAySGEIcDk6D7AYcCQ6DYKuLboUVeipqZMco+HMbZGc8aISAHaTO4hhPkhhJei7ZXAG0B/4EhgYnTYROCoaPtI4JbgXgB6mFm/YgdeUS64wEe1/PWvfn/9enjuuczMjs3RgtciUoB2jZYxs0HAHsAUoG8IYX700AKgb7TdH/h31tPmRvs6pttug8sug+22g1/8AsaO9f377OMXJHXtCgsWbPo81dxFpAA5J3cz6wbcA5wbQliR/VgIIQCtVEObfb1RZjbNzKYtWrSoPU+tHPfd5yNkACZOhIsugjPOgP7Rd92QIbB6NVx88abPVc1dRAqQU3I3s054Yr81hPDnaPcHcXNLVC6M9s8DBmY9fUC0bwMhhAkhhGEhhGF9+vTJN/70ue8+OOwwmDfPO0JfftlHu3zhC/54XR38619eM581y2dz/PjjTV9HNXcRKUAuo2UMuBF4I4RwddZDDwAnRdsnAfdn7f9/0aiZ4cDyrOab6veDH3jb+oABPtf6t78Njzyy4TGbbeY3M9hmmw07WEOAE0+EQw7x+0ruIpKHXH7z7wOcCLxqZv+I9l0AXA7cZWanAO8B0UBsHgJGALOBVcDJxQw41WbPhjlzNtx35JGexFtSX79hcl+zBv74R/jUp2DkSDjwwJKEKiLVrc3kHkJ4FmgpOx3UzPEBOLPAuCrT/dGPl+uu86aXvfby5N6aTp28KWfFCthyy8xwyTPOgHPOKWm4IlK91FtXTOef7+XBB/vomFwMGuRXrs6cCXvv7QtzgBbfEJGCaOKwYjvuuNwTO8D3vudlUxMsWeJt8ACbb1782ESkw1ByL6bNNoOBA9s+Lls81LGpCeZH/c777tt2c46ISCuU3Iupqan9o1uyk/vq1b79ox/BFlsUNzYR6VCU3Itp3br2X3SUndyvjkaadulS3LhEpMNRci+W9eu9bG/NPV5J
aeVKuP123x4ypHhxiUiHpOReLPleURrX3KdO9fI3v/EFOURECqDkXiz5JveePb286iove/QoWkgi0nEpuRdLvsl9m218SuDu3X0YZXvWVxURaYGSezGMHAk77ujb+cwFc+mlsGyZTw+s8e0iUgS6QrUYJk2CXXeFESPgqKOSjkZERMm9YOvX++2rX21+XnYRkQSoWaZQ8aIa8ZBGEZEUUHIvVGOjl0ruIpIiSu6FUnIXkRRSci/EpEmw226+XVeXbCwiIlmU3Avx3HMwd65P2/uVryQdjYjIf2i0TCFWr/Zx6ddem3QkIiIbUM09X/Pnw9ixamsXkVRScs/X3/7m5bBhycYhItIMJfd8xQtZ33JLsnGIiDRDyT1fcXKvr082DhGRZii55+Pxx+EHP/BtJXcRSSEl93w8+qiX//3fWhJPRFJJQyHbKwS4+WZfVGPMmKSjERFplmru7fXmm/DBB/nN2y4iUiZK7u21YoWXN92UbBwiIq1Qcm+vK6/0slu3ZOMQEWmFknt7Pf+8l5/5TLJxiIi0Qsm9vdatg1GjoE+fpCMREWmRknt7NTRoPhkRST0l9/ZqbFRyF5HUU3Jvj+XLldxFpCIouefq9NP9wqU1a6Br16SjERFpVZvJ3cxuMrOFZvZa1r5eZvaomb0VlT2j/WZm481stpm9YmZ7ljL4opoyBR5+uOXHX3zRy+uugzPPLE9MIiJ5yqXm/nvg0I32nQdMDiEMASZH9wEOA4ZEt1FAZSxRFAIMHw4jRmQuUtpYUxOMHAmnnaaRMiKSem0m9xDC08CSjXYfCUyMticCR2XtvyW4F4AeZtavSLGWxtKlMG1a5n5LS+Y1NmoRbBGpGPm2ufcNIcyPthcAfaPt/sC/s46bG+3bhJmNMrNpZjZt0aJFeYZRoKVLoVcv2GuvzL7zzsvM+phNHakiUkEK7lANIQQg5PG8CSGEYSGEYX2SauaYO7f5/cceu+H999+Hjz9WcheRipFvcv8gbm6JyoXR/nnAwKzjBkT70unvf/fSzCcCO/FEv79kCdx6q2+vWgU77OAJfsstk4lTRKSd8k3uDwAnRdsnAfdn7f9/0aiZ4cDyrOabdJk714c3AsyYASefDBMnwuzZvu/yy+HJJ+Gyy2D1ajj7bLj44qSiFRFplzYX6zCz24EDgK3MbC7wc+By4C4zOwV4DzgmOvwhYAQwG1gFnFyCmIvjO9/x8phjMpOAmcH22/t49tdegy9+MXP8d78LvXuXPUwRkXy0mdxDCMe18NBBzRwbgMoYBL5ypZfxFL7ZdtzRx73HfvUrzQIpIhWlY16hOn8+vPACHHIIbLvtpo9fdZWPmunRw+/vtltZwxMRKVTHXEP18ce93H335h/fZx+/XXwxLFwIAwc2f5yISEp1zJr7qlVennVW68fV1yuxi0hF6pjJ/frrvezSJdk4RERKpOMl95kzYfp03+7ePdlYRERKpOMl99df9/LBB6G2Y3Y5iEj163jJffVqLz/96WTjEBEpoY6R3Bsb/YrUhgY4KbqwVgtuiEgV6xjJ/cgjfdRLPPvjgAGw1VbJxiQiUkLV3+gcQmaFpRkzvJwzB2pqEgtJRKTUqr/mvmSjdUaee06JXUSqXvUn948/9vKEE2DxYth772TjEREpg+pP7ldf7eXhh2tWRxHpMKo7ub/zDowb59sHbTKJpYhI1aru5D5mjJeXXAJJLeUnIpKA6k3uS5bAtdf69kUXJRuLiEiZVW9y/+EPvRw61FdYEhHpQKo3ub/zjpcvvZRsHCIiCajO5N7QAE89BYcdplq7iHRI1ZncFy/2cvDgZOMQEUlIdSb3eKUlXbAkIh1U9SX3tWthyBDf7tYt2VhERBJSfcl91iwvu3eHgw9ONhYRkYRUX3KfNMnLu+6CzTdPNhYRkYRUz5S/zzwDo0fDG2/4/V12STYeEZEEVUdynzYN9tsvc//CC6F//+TiERFJWHU0y3zucxveP+20ZOIQEUmJyq+5//nPme1bboGPPvIl9UREOrDKTu6NjfD1r/v23XdntkVEOrjKbpa55prMthK7iMh/VHZyf+EFL0eMSDYOEZGUqezk/uabXp5+erJxiIikTGUn98ceg7PO0hJ6IiIbKUlyN7NDzWyWmc02s/NKcQ4A+vaF8eOhS5eSnUJEpBIVPbmbWQ3wW+AwYGfgODPbudjnERGRlpWi5r4XMDuE8E4IoQG4AziyBOcREZEWlCK59wf+nXV/brRPRETKJLEOVTMbZWbTzGzaokWLkgpDRKQqlSK5zwOyr/8fEO3bQAhhQghhWAhhWJ8+fUoQhohIx1WK5P4iMMTMBptZHXAs8EAJziMiIi0o+twyIYQmM/s+8DegBrgphDCz2OcREZGWlWTisBDCQ8BDpXhtERFpm4UQko4BM1sEvJfn07cCFhcxnFJIe4xpjw8UYzGkPT5If4xpi2/bEEKznZapSO6FMLNpIYRhScfRmrTHmPb4QDEWQ9rjg/THmPb4slX23DIiItIsJXcRkSpUDcl9QtIB5CDtMaY9PlCMxZD2+CD9MaY9vv+o+DZ3ERHZVDXU3EVEZCNK7iIiVaiik3vZFgXxc91kZgvN7LWsfb3M7FEzeysqe0b7zczGR3G9YmZ7Zj3npOj4t8zspKz9nzWzV6PnjDczyyPGgWb2hJm9bmYzzeycNMVpZp3NbKqZzYji+59o/2AzmxK95p3RtBWYWX10f3b0+KCs1zo/2j/LzA7J2l/wZ8LMaszsZTN7MKXxzYneg3+Y2bRoXyre46zX6GFmd5vZP83sDTPbO00xmtmO0b9ffFthZuemKcaChRAq8oZPbfA2sB1QB8wAdi7h+fYD9gRey9p3JXBetH0ecEW0PQJ4GDBgODAl2t8LeCcqe0bbPaPHpkbHWvTcw/KIsR+wZ7S9BfAmvmBKKuKMntMt2u4ETIle6y7g2Gj/dcDp0fYZwHXR9rHAndH2ztH7XQ8Mjj4HNcX6TACjgduAB6P7aYtvDrDVRvtS8R5nxTMR+G60XQf0SFuMG+WSBcC2aY0xr7+rnCcrauCwN/C3rPvnA+eX+JyD2DC5zwL6Rdv9gFnR9vXAcRsfBxwHXJ+1//poXz/gn1n7NziugHjvBw5OY5xAV+Al4L/wK/5qN35f8fmJ9o62a6PjbOP3Oj6uGJ8JfBbTycCBwIPR+VITX/S8OWya3FPzHgPdgXeJBmykMcaN4voy8Pc0x5jPrZKbZdKwKEjfEML8aHsB0Dfabim21vbPbWZ/3qImgj3w2nFq4oyaPP4BLAQexWuyy0IITc285n/iiB5fDvTOI+72uAb4MbA+ut87ZfEBBOARM5tuZqOifal5j/FfK4uAm6PmrRvMbPOUxZjtWOD2aDutMbZbJSf3VAn+9ZyKcaVm1g24Bzg3hLAi+7Gk4wwhrAshDMVryHsBOyUVy8bM7AhgYQhhetKxtOELIYQ98XWKzzSz/bIfTPo9xn/F7AlcG0LYA/gYb+L4jxTECEDUfzIS+NPGj6UlxnxVcnLPaVGQEvvAzPoBROXCNmJrbf+AZva3m5l1whP7rSG
EP6c1zhDCMuAJvKmih5nFM5Rmv+Z/4oge7w58mEfcudoHGGlmc/C1fw8ExqUoPgBCCPOiciFwL/4lmab3eC4wN4QwJbp/N57s0xRj7DDgpRDCB9H9NMaYn3K2ARXzhtcO3sF/AsadU7uU+JyD2LDN/dds2PlyZbR9OBt2vkyN9vfC2yJ7Rrd3gV7RYxt3vozIIz4DbgGu2Wh/KuIE+gA9ou0uwDPAEXitKbvD8oxo+0w27LC8K9rehQ07LN/BO8WK9pkADiDToZqa+IDNgS2ytp8DDk3Le5wV5zPAjtH2xVF8qYoxep07gJPT9n+lGLeynagkwXsP9pt4u+2FJT7X7cB8oBGvmZyCt69OBt4CHst6Uw34bRTXq8CwrNf5DjA7umV/qIYBr0XP+V826ozKMcYv4D8jXwH+Ed1GpCVOYDfg5Si+14CfRfu3i/4jzMYTaX20v3N0f3b0+HZZr3VhFMMsskYhFOszwYbJPTXxRbHMiG4z49dIy3uc9RpDgWnRe30fnvjSFuPm+C+t7ln7UhVjITdNPyAiUoUquc1dRERaoOQuIlKFlNxFRKqQkruISBVSchcRqUJK7iIiVUjJXUSkCv1/nJ+updLeAoMAAAAASUVORK5CYII=\n" + }, + "metadata": { + "needs_background": "light" + } + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "plt.plot(returns, 'r')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "O agente demonstra consegue solocuinar o ambiente. Porém acaba desaprendendo, para solucionar isso é possível fazer uma otimização nos hiper-parâmetros. É possível também implementar algum tipo de parada antecipada." + ] + } + ] +} \ No newline at end of file diff --git "a/Aprendizado por Refor\303\247o Profundo/Actor-Critic/PPO/README.md" "b/Aprendizado por Refor\303\247o Profundo/Actor-Critic/PPO/README.md" new file mode 100644 index 0000000..e5ada8f --- /dev/null +++ "b/Aprendizado por Refor\303\247o Profundo/Actor-Critic/PPO/README.md" @@ -0,0 +1,62 @@ +# Proximal Policy Optimization (PPO) + +Como vimos na aula de A2C, uma função objetivo muito utilizada é: + + + +Os índices na função _advantage_ **A** indicam que **A** depende tanto dos pesos **w** utilizados para calcular o estimar de cada estado, quanto da política **πθ**, que determina quais trajetórias o agente vai seguir dentro do ambiente. + +> Obs: pode-se mostrar que essa formulação é equivalente à formulação que utiliza somatórias no tempo: + + + + +Note que uma pequena variação no espaço de parâmetros (Δθ = α∇θJ) pode causar uma grande variação no espaço de políticas. Isso significa que, em geral, a taxa de aprendizado **α** não pode ser muito alta; caso contrário, corremos o risco de obter uma nova política que não funcione. Consequentemente, a eficiência amostral de A2C também é limitada. + + +## Trust Region Policy Optimization (TRPO) + +Uma maneira de resolver esse problema é limitar as variações na política. Para isso, vamos utilizar a divergência KL **KL(π1 || π2)**, que pode ser, simplificadamente, encarada como uma medida da diferença entre duas políticas (ou, em geral, duas distribuições de probabilidade). + +TRPO define uma região de confiança (trust region) para garantir que a política nova não se distancie demais da política antiga: + + + +No entanto, maximizar a função objetivo de A2C sujeito a essas restrições é um pouco complicado. Então, vamos utilizar uma aproximação da função objetivo de A2C: + + + +Ou seja, TRPO consiste em: + + + + + +> Para entender como chegamos **L(θold,θ)** é uma aproximação de **J(θ)**, podemos fazer: + + + + +## Proximal Policy Optimization (PPO) + +Como já foi mencionado, a restrição (**KL < δ**) imposta em TRPO torna o algoritmo relativamente complicado. PPO é uma tentativa de simplificar esse algoritmo. Ao invés de utilizar trust regions, PPO mexe diretamente com a função objetivo: + + + + + + +Essa função pode ser reescrita como: + + + + + + +Nota-se que: +- Quando a vantagem é positiva, se **r** aumentar, então **L** aumenta. 
No entanto, esse benefício é limitado pelo clip: se **r > 1+ε**, não há mais benefício para **r** aumentar. +- Quando a vantagem é negativa, se **r** diminuir, então **L** aumenta. No entanto, esse benefício é limitado pelo clip: se **r < 1-ε**, não há mais benefício para **r** diminuir. + +A seguinte imagem pode te ajudar a visualizar o clip. Note que, fora do intervalo estipulado pelo clip, os valores permanecem constantes: + +![imagem ilustrando o clip](imgs/clip.png) diff --git "a/Aprendizado por Refor\303\247o Profundo/Actor-Critic/PPO/imgs/clip.png" "b/Aprendizado por Refor\303\247o Profundo/Actor-Critic/PPO/imgs/clip.png" new file mode 100644 index 0000000..1c99da5 Binary files /dev/null and "b/Aprendizado por Refor\303\247o Profundo/Actor-Critic/PPO/imgs/clip.png" differ diff --git "a/Aprendizado por Refor\303\247o Profundo/Actor-Critic/README.md" "b/Aprendizado por Refor\303\247o Profundo/Actor-Critic/README.md" index eb0456d..1be4fd1 100644 --- "a/Aprendizado por Refor\303\247o Profundo/Actor-Critic/README.md" +++ "b/Aprendizado por Refor\303\247o Profundo/Actor-Critic/README.md" @@ -4,4 +4,8 @@ Os Actor Critics são algoritmos de estado da arte que combinam estimadores de f ## [Advantage Actor Critic (A2C)](A2C) -Uma das versões mais simples do modelo de Actor Critic. Combina um modelo que estima a Vantagem (A(s, a)) de uma ação com um modelo de Policy Gradient. \ No newline at end of file +Uma das versões mais simples do modelo de Actor Critic. Combina um modelo que estima a Vantagem (A(s, a)) de uma ação com um modelo de Policy Gradient. + +## [Proximal Policy Optimization (PPO)](PPO) + +Um algoritmo poderoso, que visa melhorar o modelo de A2C sem aumentar muito sua complexidade. Utiliza a ideia de Trust Region Policy Optimization (TRPO) para limitar variações na política. \ No newline at end of file