WIP: Potential rewards #94

Open · wants to merge 4 commits into master · changes from all commits
Empty file added adept/rewards/__init__.py
Empty file added adept/rewards/base/__init__.py
81 changes: 81 additions & 0 deletions adept/rewards/base/base_potential_based_rewards.py
@@ -0,0 +1,81 @@

import abc

import numpy as np


class BasePotentialBasedReward(abc.ABC):
"""
Class for applying potential reward shaping to a set of observations. Potential based rewards
are used to prevent the learning of suboptimal policies. The reward for executing a transition
between states is the difference in value between the potential function applied to each state.
This condition is sufficient to guarantee policy invariance.

This implementation provides support for potential based reward shaping over scalar observations
of length 1.

For details, see
"Policy invariance under reward transformations:
Theory and application to reward shaping"
https://people.eecs.berkeley.edu/~pabbeel/cs287-fa09/readings/NgHaradaRussell-shaping-ICML1999.pdf
"""

    def __init__(
        self,
        name: str,
        gamma: float,
        exponent_coefficient: float,
        minimum: float,
        maximum: float,
        absolute: bool,
        reward_base: float,
    ) -> None:

"""
Parameters
----------
name: str
Name of the shaped reward
gamma : float
Discount factor needed for calculating potential-based reward shaping
exponential_coefficient : float
The coefficient of the exponent value. The smaller the value, the closer to linear
minimum : float
Minimum value to be given to the agent. This should match the minimum of the gym space
maximum : float
Maximum value to be given to the agent. This should match the maximum of the gym space
absolute : bool
If the absolute value should be taken during preprocessing
reward_base : float
Reward to use for the phi calculations (before the potential--not the actual reward provided)
"""
        self._name = name
        self._gamma = gamma
        self._exponent_coefficient = exponent_coefficient
        self._minimum = minimum
        self._maximum = maximum
        self._absolute = absolute
        self._reward_base = reward_base

        # Midpoint of the observation range, used as the centre of the potential function.
        self._midpoint = (self._maximum - self._minimum) / 2 + self._minimum

    def __call__(self, observation, next_observation) -> float:
        return self._potential_shaping_function(observation, next_observation)

    def name(self) -> str:
        return f"{type(self).__name__}_{self._name}"

    def _preprocess_absolute(self, x):
        return np.abs(x) if self._absolute else x

    def _preprocess_observation(self, x):
        # Optionally take the absolute value, then clip into the [minimum, maximum] range.
        return min(max(self._minimum, self._preprocess_absolute(x)), self._maximum)

    def _potential_shaping_function(self, current_observation, next_observation) -> float:
        # Shaping term F(s, s') = gamma * phi(s') - phi(s).
        return (self._gamma * self._phi(next_observation)) - self._phi(current_observation)

    @abc.abstractmethod
    def _phi(self, x) -> float:
        """
        Potential function applied to a single observation.

        Example phi function (a sigmoid centred on the midpoint of the observation range):
            return self._reward_base / (
                1 + np.exp(self._exponent_coefficient * (self._preprocess_observation(x) - self._midpoint))
            )
        """
        raise NotImplementedError
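
Not part of the diff above: a minimal sketch of how a concrete subclass and its usage might look, using the sigmoid phi from the base-class docstring. The class name DistanceToGoalReward and all constructor values are illustrative assumptions, not something defined in this PR; only the import path follows the file added above.

```python
import numpy as np

from adept.rewards.base.base_potential_based_rewards import BasePotentialBasedReward


class DistanceToGoalReward(BasePotentialBasedReward):
    """Hypothetical subclass: sigmoid potential over a clipped scalar observation."""

    def _phi(self, x) -> float:
        # Sigmoid potential centred on the midpoint of the observation range,
        # mirroring the example in the base-class docstring.
        return self._reward_base / (
            1 + np.exp(self._exponent_coefficient * (self._preprocess_observation(x) - self._midpoint))
        )


shaper = DistanceToGoalReward(
    name="distance_to_goal",
    gamma=0.99,
    exponent_coefficient=0.1,
    minimum=0.0,
    maximum=100.0,
    absolute=True,
    reward_base=1.0,
)

# Shaping term for a transition where the observed distance drops from 80.0 to 60.0:
# F(s, s') = gamma * phi(s') - phi(s). This term is added to the environment reward.
shaped = shaper(80.0, 60.0)
```

With this phi, larger observations map to smaller potentials, so a transition that decreases the observation (e.g. distance to a goal) yields a positive shaping term while leaving the optimal policy unchanged.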