# Reinforcement Learning 1 Gym Environments

Nipun Batra  
2024-04-01

<figure>
<a
href="https://colab.research.google.com/github/nipunbatra/ml-teaching/blob/master/notebooks/rl-gym-environments.ipynb"><img
src="https://colab.research.google.com/assets/colab-badge.svg" /></a>
<figcaption>Open In Colab</figcaption>
</figure>

Reference

1.  [Detailed Explanation and Python Implementation of Q-Learning
    Algorithm in OpenAI Gym
    (Cart-Pole)](https://www.youtube.com/watch?v=KMjQmG5Uzis)

### Basic Imports

Documentation for the Mountain Car environment used below: <https://www.gymlibrary.dev/environments/classic_control/mountain_car/>

In [2]:
import matplotlib.pyplot as plt
import torch
try:
    import gymnasium as gym
except ImportError:
    %pip install gymnasium[classic-control] -q
    import gymnasium as gym
import numpy as np

%matplotlib inline
%config InlineBackend.figure_format='retina'

In [3]:
# Show the id of every environment currently registered with gymnasium
[env_id for env_id in gym.envs.registry.keys()]

['CartPole-v0',
 'CartPole-v1',
 'MountainCar-v0',
 'MountainCarContinuous-v0',
 'Pendulum-v1',
 'Acrobot-v1',
 'phys2d/CartPole-v0',
 'phys2d/CartPole-v1',
 'phys2d/Pendulum-v0',
 'LunarLander-v2',
 'LunarLanderContinuous-v2',
 'BipedalWalker-v3',
 'BipedalWalkerHardcore-v3',
 'CarRacing-v2',
 'Blackjack-v1',
 'FrozenLake-v1',
 'FrozenLake8x8-v1',
 'CliffWalking-v0',
 'Taxi-v3',
 'tabular/Blackjack-v0',
 'tabular/CliffWalking-v0',
 'Reacher-v2',
 'Reacher-v4',
 'Pusher-v2',
 'Pusher-v4',
 'InvertedPendulum-v2',
 'InvertedPendulum-v4',
 'InvertedDoublePendulum-v2',
 'InvertedDoublePendulum-v4',
 'HalfCheetah-v2',
 'HalfCheetah-v3',
 'HalfCheetah-v4',
 'Hopper-v2',
 'Hopper-v3',
 'Hopper-v4',
 'Swimmer-v2',
 'Swimmer-v3',
 'Swimmer-v4',
 'Walker2d-v2',
 'Walker2d-v3',
 'Walker2d-v4',
 'Ant-v2',
 'Ant-v3',
 'Ant-v4',
 'Humanoid-v2',
 'Humanoid-v3',
 'Humanoid-v4',
 'HumanoidStandup-v2',
 'HumanoidStandup-v4',
 'GymV21Environment-v0',
 'GymV26Environment-v0']

In [4]:
# Create the MountainCar environment. render_mode='human' asks for on-screen
# rendering of the episode.
# NOTE(review): this presumably needs a display; on a headless runtime (e.g.
# Colab) it will likely fail -- confirm, and consider 'rgb_array' there.
env = gym.make('MountainCar-v0', render_mode='human')

In [5]:
observation, info = env.reset(seed=42)

In [5]:
observation

array([-0.4452088,  0.       ], dtype=float32)

In [12]:
info

{}

In [6]:
env

<TimeLimit<OrderEnforcing<PassiveEnvChecker<MountainCarEnv<MountainCar-v0>>>>>

In [7]:
env.action_space

Discrete(3)

In [8]:
env.action_space.n

3

In [16]:
env.reward_range

(-inf, inf)

In [9]:
# Draw a handful of random actions to see what the action space produces.
n_samples = 10
for i in range(n_samples):
    sampled_action = env.action_space.sample()
    print(i, sampled_action)

0 1
1 0
2 0
3 0
4 2
5 0
6 2
7 2
8 2
9 2

In [10]:
env.observation_space  

Box([-1.2  -0.07], [0.6  0.07], (2,), float32)

In [19]:
env.observation_space.high

array([0.6 , 0.07], dtype=float32)

In [20]:
env.observation_space.sample()

array([-0.2961844, -0.034966 ], dtype=float32)

In [11]:
# Random-policy rollout: restart from a fixed seed, take uniformly random
# actions and print every transition.
env.reset(seed=42)
# reset(seed=...) seeds the environment only; the action space draws from its
# own RNG, so seed it too -- otherwise the sampled actions (and therefore the
# whole trajectory) change on every run.
env.action_space.seed(42)
for i in range(100):
    # No explicit env.render() call: the env was created with
    # render_mode='human', in which step() already draws each frame.
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)
    print(i, observation, reward, terminated, truncated, info)
    # The episode is over when the goal is reached (terminated) or when the
    # time limit expires (truncated); the env must be reset before stepping
    # again in either case.
    if terminated or truncated:
        break
    

0 [-0.4457913  -0.00058252] -1.0 False False {}
1 [-0.4469521  -0.00116079] -1.0 False False {}
2 [-0.4486827  -0.00173059] -1.0 False False {}
3 [-0.45097044 -0.00228774] -1.0 False False {}
4 [-0.45479858 -0.00382815] -1.0 False False {}
5 [-0.4601391 -0.0053405] -1.0 False False {}
6 [-0.46495268 -0.00481358] -1.0 False False {}
7 [-0.47120383 -0.00625116] -1.0 False False {}
8 [-0.47784632 -0.0066425 ] -1.0 False False {}
9 [-0.4858309  -0.00798457] -1.0 False False {}
10 [-0.4950981  -0.00926722] -1.0 False False {}
11 [-0.5045788  -0.00948072] -1.0 False False {}
12 [-0.5142021  -0.00962329] -1.0 False False {}
13 [-0.5228959  -0.00869376] -1.0 False False {}
14 [-0.5325949  -0.00969903] -1.0 False False {}
15 [-0.5432265  -0.01063157] -1.0 False False {}
16 [-0.55371094 -0.01048444] -1.0 False False {}
17 [-0.56296986 -0.00925891] -1.0 False False {}
18 [-0.57293415 -0.00996431] -1.0 False False {}
19 [-0.5825298  -0.00959565] -1.0 False False {}
20 [-0.5916858  -0.00915596] -1.

In [12]:
# Does it help to always go right?
# Restart from the same seed and push right (action 2) on every step.
env.reset(seed=42)
for i in range(100):
    # No explicit env.render() call: the env was created with
    # render_mode='human', in which step() already draws each frame.
    action = 2
    observation, reward, terminated, truncated, info = env.step(action)
    print(i, observation, reward, terminated, truncated, info)
    # Stop on either end-of-episode signal: goal reached (terminated) or
    # time limit hit (truncated). Stepping a finished episode is not valid.
    if terminated or truncated:
        break

0 [-4.4479132e-01  4.1747934e-04] -1.0 False False {}
1 [-0.4439594   0.00083191] -1.0 False False {}
2 [-0.4427191   0.00124029] -1.0 False False {}
3 [-0.4410795   0.00163962] -1.0 False False {}
4 [-0.43905246  0.00202703] -1.0 False False {}
5 [-0.43665275  0.00239971] -1.0 False False {}
6 [-0.43389776  0.00275498] -1.0 False False {}
7 [-0.43080744  0.00309032] -1.0 False False {}
8 [-0.4274041   0.00340333] -1.0 False False {}
9 [-0.42371225  0.00369185] -1.0 False False {}
10 [-0.4197584   0.00395386] -1.0 False False {}
11 [-0.41557083  0.00418759] -1.0 False False {}
12 [-0.41117933  0.00439149] -1.0 False False {}
13 [-0.40661508  0.00456424] -1.0 False False {}
14 [-0.40191033  0.00470476] -1.0 False False {}
15 [-0.3970981   0.00481224] -1.0 False False {}
16 [-0.392212    0.00488609] -1.0 False False {}
17 [-0.38728598  0.00492601] -1.0 False False {}
18 [-0.38235408  0.00493192] -1.0 False False {}
19 [-0.37745008  0.004904  ] -1.0 False False {}
20 [-0.3726074   0.00484

In [13]:
# Go left for the first K steps, then go right for the remaining N - K steps
env.reset(seed=42)
K = 50   # number of initial steps spent pushing left (action 0)
N = 250  # upper bound on the total number of steps attempted

for i in range(N):
    # No explicit env.render() call: the env was created with
    # render_mode='human', in which step() already draws each frame.
    action = 0 if i < K else 2
    observation, reward, terminated, truncated, info = env.step(action)
    print(i, observation, reward, terminated, truncated, info)
    # The env is wrapped in TimeLimit, so the episode can be cut short
    # (truncated=True) before N steps even if the goal is never reached.
    # Checking only `terminated` would keep stepping a finished episode.
    if terminated or truncated:
        break

0 [-0.44679132 -0.00158252] -1.0 False False {}
1 [-0.4499448  -0.00315349] -1.0 False False {}
2 [-0.45464623 -0.00470141] -1.0 False False {}
3 [-0.4608611  -0.00621488] -1.0 False False {}
4 [-0.46854374 -0.00768264] -1.0 False False {}
5 [-0.4776374  -0.00909367] -1.0 False False {}
6 [-0.4880747  -0.01043729] -1.0 False False {}
7 [-0.4997779  -0.01170322] -1.0 False False {}
8 [-0.51265967 -0.01288173] -1.0 False False {}
9 [-0.52662337 -0.01396375] -1.0 False False {}
10 [-0.54156446 -0.01494107] -1.0 False False {}
11 [-0.55737084 -0.01580639] -1.0 False False {}
12 [-0.5739244  -0.01655353] -1.0 False False {}
13 [-0.59110194 -0.01717752] -1.0 False False {}
14 [-0.6087766 -0.0176747] -1.0 False False {}
15 [-0.62681943 -0.0180428 ] -1.0 False False {}
16 [-0.64510036 -0.01828096] -1.0 False False {}
17 [-0.6634901  -0.01838974] -1.0 False False {}
18 [-0.6818612  -0.01837108] -1.0 False False {}
19 [-0.7000894  -0.01822821] -1.0 False False {}
20 [-0.71805495 -0.01796552] -1.

In [14]:
env.close()

In [1]:
%pip install flappy-bird-gymnasium -q

Note: you may need to restart the kernel to use updated packages.

In [15]:
# The module name is not used directly; presumably the import is needed for its
# side effect of registering "FlappyBird-v0" with gymnasium -- confirm.
import flappy_bird_gymnasium
# Rebind `env` to a FlappyBird environment (the MountainCar env was closed).
# With use_lidar=False the observation space reported for this env is a
# 12-element Box in [-1, 1].
env = gym.make("FlappyBird-v0", render_mode="human", use_lidar=False)

In [16]:
obs, _ = env.reset()

In [17]:
obs

array([ 1.       ,  0.234375 ,  0.4296875,  1.       ,  0.       ,
        1.       ,  1.       ,  0.       ,  1.       ,  0.4765625,
       -0.9      ,  0.5      ])

In [18]:
env.observation_space

Box(-1.0, 1.0, (12,), float64)

In [9]:
# Number of discrete buckets per observation dimension; also the length of each
# state axis of the Q-table allocated at the end of this cell.
n_bins = 6
import pandas as pd
# Per-dimension lower / upper bounds of the observation Box; these define the
# range that discretize_observation splits into bins.
obs_low = env.observation_space.low
obs_high = env.observation_space.high

# Discretize the observation space
def discretize_observation(observation, low=None, high=None, num_bins=None):
    """Map a continuous observation to a tuple of per-dimension bin indices.

    Each dimension's range ``[low, high]`` is split into ``num_bins``
    equal-width bins and the observation is replaced, dimension by dimension,
    by the index of the bin it falls in. Indices run from ``0`` to
    ``num_bins - 1``; values outside the range are clamped to the first or
    last bin, so the result can always index an array that has ``num_bins``
    entries per dimension.

    Parameters
    ----------
    observation : array-like
        One observation (one value per dimension).
    low, high : float or array-like, optional
        Lower / upper bound per dimension (a scalar applies to every
        dimension). Default to the module-level ``obs_low`` / ``obs_high``.
    num_bins : int, optional
        Number of bins per dimension. Defaults to the module-level ``n_bins``.

    Returns
    -------
    tuple of int
        One bin index per observation dimension.

    Raises
    ------
    ValueError
        If ``num_bins`` is smaller than 1.
    """
    observation = np.atleast_1d(np.asarray(observation, dtype=float))
    low = np.asarray(obs_low if low is None else low, dtype=float)
    high = np.asarray(obs_high if high is None else high, dtype=float)
    num_bins = n_bins if num_bins is None else int(num_bins)
    if num_bins < 1:
        raise ValueError("num_bins must be at least 1")

    low = np.broadcast_to(low, observation.shape)
    high = np.broadcast_to(high, observation.shape)

    # np.digitize only accepts 1-D bin edges, so it cannot be handed the 2-D
    # grid that linspace produces for array bounds. Instead, build the
    # num_bins - 1 interior edges of every dimension (row k holds the k-th
    # edge of each dimension) and count how many of them the value has passed.
    interior_edges = np.linspace(low, high, num_bins + 1)[1:-1]
    bin_indices = (observation >= interior_edges).sum(axis=0)
    return tuple(int(index) for index in bin_indices)

# Names of the observation components, in order (intended for the MultiIndex).
# Each of the three pipes (last, next, next-next) contributes a horizontal
# position plus the vertical positions of its top and bottom parts; the
# player's own state comes last.
pipe_prefixes = ("last", "next", "next_next")
pipe_fields = ("{}_pipe_h_pos", "{}_top_pipe_v_pos", "{}_bottom_pipe_v_pos")
player_fields = ["player_v_pos", "player_v_vel", "player_rotation"]

variables = [
    field.format(prefix) for prefix in pipe_prefixes for field in pipe_fields
] + player_fields

# Keep only the middle entries: drop the first three and the last three names.
# NOTE(review): the last three are the player's own position/velocity/rotation,
# so the Q-table state ignores them -- confirm this is intended.
var_consider = variables[3:len(variables) - 3]

# Q-table: one axis of length n_bins per considered variable, plus a final
# axis with one entry per action, initialised to zero.
# NOTE(review): discretize_observation works on the full observation, so its
# result presumably has to be sliced the same way before indexing this table.
state_axes = (n_bins,) * len(var_consider)
q_table_np = np.zeros(state_axes + (env.action_space.n,))