13.1 Reinforcement Learning

Exploring OpenAI

# Notebook setup: import each library, seed its random number generator
# where one is seeded here, and print the installed version.

# Seed NumPy's global random number generator.
import numpy as np
np.random.seed(123)
print("NumPy:{}".format(np.__version__))

# Seed TensorFlow via tf.set_random_seed (the captured output reports 1.4.1).
import tensorflow as tf
tf.set_random_seed(123)
print("TensorFlow:{}".format(tf.__version__))

import keras
print("Keras:{}".format(keras.__version__))

import gym
print('OpenAI Gym:',gym.__version__)
NumPy:1.13.1
TensorFlow:1.4.1


Using TensorFlow backend.


Keras:2.0.9
OpenAI Gym: 0.9.4

OpenAI 101

# Print how many environments are registered in this Gym installation.

# Materialize the registry contents into a list so it can be counted with
# len() here and iterated again by the following cell.
all_env = list(gym.envs.registry.all())

print('Total Environments in Gym version {} : {}'.format(gym.__version__,len(all_env)))
Total Environments in Gym version 0.9.4 : 777
# Print every registered environment spec, one per line.
# all_env is already a list, so it is iterated directly without copying.
for e in all_env:
    print(e)
EnvSpec(Assault-ramDeterministic-v0)
EnvSpec(AirRaidNoFrameskip-v4)
EnvSpec(UpNDown-ram-v4)
EnvSpec(Robotank-ramNoFrameskip-v4)
EnvSpec(Kangaroo-ram-v0)
EnvSpec(Kangaroo-ramNoFrameskip-v0)
EnvSpec(CrazyClimber-ramNoFrameskip-v4)
EnvSpec(SemisuperPendulumNoise-v0)
EnvSpec(MsPacman-ramNoFrameskip-v4)
EnvSpec(CrazyClimber-ram-v0)
EnvSpec(InvertedDoublePendulum-v1)
EnvSpec(VentureNoFrameskip-v4)
EnvSpec(ChopperCommand-ramDeterministic-v0)
EnvSpec(OffSwitchCartpoleProb-v0)
EnvSpec(AssaultDeterministic-v0)
EnvSpec(Freeway-ram-v0)
EnvSpec(BoxingDeterministic-v4)
EnvSpec(Venture-ramNoFrameskip-v0)
EnvSpec(Hero-ramNoFrameskip-v0)
EnvSpec(Assault-v0)
EnvSpec(WizardOfWorNoFrameskip-v4)
EnvSpec(Freeway-ramDeterministic-v4)
EnvSpec(Berzerk-ram-v4)
EnvSpec(Boxing-ram-v0)
EnvSpec(RoadRunner-ramDeterministic-v0)
EnvSpec(Frostbite-ram-v4)
EnvSpec(AlienNoFrameskip-v4)
EnvSpec(Bowling-ram-v4)
EnvSpec(Breakout-ramDeterministic-v4)
EnvSpec(BeamRider-ramDeterministic-v4)
EnvSpec(Zaxxon-ramDeterministic-v0)
EnvSpec(ElevatorAction-v4)
EnvSpec(Jamesbond-ramNoFrameskip-v0)
EnvSpec(AirRaid-ram-v4)
EnvSpec(Amidar-v0)
EnvSpec(PongNoFrameskip-v4)
EnvSpec(Tutankham-ram-v4)
EnvSpec(Pooyan-v4)
EnvSpec(Gravitar-v4)
EnvSpec(CentipedeNoFrameskip-v4)
EnvSpec(TennisNoFrameskip-v4)
EnvSpec(RoadRunnerNoFrameskip-v0)
EnvSpec(Robotank-ramNoFrameskip-v0)
EnvSpec(FishingDerby-ram-v0)
EnvSpec(Striker-v0)
EnvSpec(VideoPinball-ramDeterministic-v0)
EnvSpec(KungFuMaster-v0)
EnvSpec(WizardOfWorNoFrameskip-v0)
EnvSpec(Breakout-ramNoFrameskip-v0)
EnvSpec(NameThisGameNoFrameskip-v4)
EnvSpec(UpNDown-v4)
EnvSpec(PitfallNoFrameskip-v0)
EnvSpec(Pong-v0)
EnvSpec(FishingDerby-ramDeterministic-v0)
EnvSpec(DemonAttackDeterministic-v0)
EnvSpec(Solaris-ram-v4)
EnvSpec(Assault-ramNoFrameskip-v0)
EnvSpec(BeamRider-ramNoFrameskip-v4)
EnvSpec(StarGunner-ram-v0)
EnvSpec(Thrower-v0)
EnvSpec(TimePilotNoFrameskip-v0)
EnvSpec(HalfCheetah-v1)
EnvSpec(QbertDeterministic-v4)
EnvSpec(AsteroidsNoFrameskip-v0)
EnvSpec(Phoenix-v4)
EnvSpec(SpaceInvaders-v0)
EnvSpec(BattleZone-ram-v0)
EnvSpec(Jamesbond-ram-v0)
EnvSpec(RiverraidDeterministic-v4)
EnvSpec(RiverraidDeterministic-v0)
EnvSpec(ElevatorAction-ram-v0)
EnvSpec(JourneyEscape-v0)
EnvSpec(PrivateEye-ramNoFrameskip-v4)
EnvSpec(Carnival-v0)
EnvSpec(Berzerk-ramNoFrameskip-v4)
EnvSpec(SkiingNoFrameskip-v4)
EnvSpec(FrozenLake8x8-v0)
EnvSpec(Hero-ramDeterministic-v0)
EnvSpec(Centipede-ramDeterministic-v0)
EnvSpec(KrullNoFrameskip-v4)
EnvSpec(IceHockey-ram-v4)
EnvSpec(Riverraid-ramNoFrameskip-v0)
EnvSpec(Freeway-ramNoFrameskip-v4)
EnvSpec(PredictObsCartpole-v0)
EnvSpec(IceHockey-ram-v0)
EnvSpec(ChopperCommand-ramNoFrameskip-v0)
EnvSpec(BerzerkDeterministic-v0)
EnvSpec(AmidarNoFrameskip-v0)
EnvSpec(Phoenix-v0)
EnvSpec(Tennis-ramDeterministic-v4)
EnvSpec(Tennis-ramNoFrameskip-v0)
EnvSpec(MsPacman-v0)
EnvSpec(RoadRunner-v4)
EnvSpec(YarsRevengeNoFrameskip-v0)
EnvSpec(Solaris-ramNoFrameskip-v0)
EnvSpec(PitfallDeterministic-v0)
EnvSpec(Breakout-ram-v4)
EnvSpec(Assault-ram-v0)
EnvSpec(Carnival-ram-v0)
EnvSpec(Tennis-v0)
EnvSpec(Krull-v0)
EnvSpec(AtlantisNoFrameskip-v0)
EnvSpec(TimePilot-ramNoFrameskip-v0)
EnvSpec(AmidarDeterministic-v4)
EnvSpec(JourneyEscapeDeterministic-v4)
EnvSpec(Gopher-ramDeterministic-v4)
EnvSpec(Pong-ramDeterministic-v4)
EnvSpec(ZaxxonNoFrameskip-v4)
EnvSpec(IceHockeyDeterministic-v4)
EnvSpec(DemonAttack-ramDeterministic-v0)
EnvSpec(Gopher-ramNoFrameskip-v4)
EnvSpec(Roulette-v0)
EnvSpec(Pong-ramNoFrameskip-v0)
EnvSpec(BeamRider-v4)
EnvSpec(NameThisGame-v4)
EnvSpec(BeamRider-v0)
EnvSpec(JourneyEscape-ramNoFrameskip-v4)
EnvSpec(TimePilotNoFrameskip-v4)
EnvSpec(Amidar-ram-v4)
EnvSpec(BeamRiderDeterministic-v4)
EnvSpec(Hero-v4)
EnvSpec(Riverraid-v4)
EnvSpec(SkiingDeterministic-v0)
EnvSpec(HumanoidStandup-v1)
EnvSpec(Carnival-ramDeterministic-v4)
EnvSpec(Breakout-v4)
EnvSpec(SkiingNoFrameskip-v0)
EnvSpec(EnduroDeterministic-v4)
EnvSpec(BowlingDeterministic-v0)
EnvSpec(GopherDeterministic-v0)
EnvSpec(TwoRoundDeterministicReward-v0)
EnvSpec(CrazyClimberDeterministic-v0)
EnvSpec(Asteroids-v4)
EnvSpec(NameThisGameDeterministic-v0)
EnvSpec(JourneyEscape-ramDeterministic-v4)
EnvSpec(DemonAttack-ram-v4)
EnvSpec(BoxingNoFrameskip-v4)
EnvSpec(VideoPinball-ramNoFrameskip-v4)
EnvSpec(NameThisGame-ramDeterministic-v0)
EnvSpec(Taxi-v2)
EnvSpec(PhoenixNoFrameskip-v0)
EnvSpec(Pooyan-v0)
EnvSpec(DoubleDunk-ramNoFrameskip-v4)
EnvSpec(ElevatorActionNoFrameskip-v4)
EnvSpec(ConvergenceControl-v0)
EnvSpec(SpaceInvaders-ramDeterministic-v0)
EnvSpec(Enduro-ram-v0)
EnvSpec(Bowling-ramNoFrameskip-v4)
EnvSpec(RoadRunner-ramDeterministic-v4)
EnvSpec(AirRaid-ram-v0)
EnvSpec(CarRacing-v0)
EnvSpec(JourneyEscapeDeterministic-v0)
EnvSpec(Humanoid-v1)
EnvSpec(VideoPinballNoFrameskip-v4)
EnvSpec(DemonAttack-ram-v0)
EnvSpec(IceHockey-ramNoFrameskip-v4)
EnvSpec(JamesbondDeterministic-v0)
EnvSpec(BattleZone-ramNoFrameskip-v4)
EnvSpec(Boxing-ramDeterministic-v4)
EnvSpec(BattleZone-ramDeterministic-v0)
EnvSpec(ZaxxonNoFrameskip-v0)
EnvSpec(CentipedeDeterministic-v0)
EnvSpec(NameThisGame-ramNoFrameskip-v0)
EnvSpec(Phoenix-ramNoFrameskip-v0)
EnvSpec(BreakoutDeterministic-v4)
EnvSpec(DemonAttack-v4)
EnvSpec(BowlingNoFrameskip-v4)
EnvSpec(BankHeist-ramNoFrameskip-v0)
EnvSpec(KangarooDeterministic-v4)
EnvSpec(IceHockey-ramDeterministic-v4)
EnvSpec(FrozenLake-v0)
EnvSpec(Atlantis-v4)
EnvSpec(Assault-v4)
EnvSpec(QbertNoFrameskip-v0)
EnvSpec(NameThisGameDeterministic-v4)
EnvSpec(FishingDerbyDeterministic-v0)
EnvSpec(ElevatorActionDeterministic-v0)
EnvSpec(AsterixDeterministic-v4)
EnvSpec(MsPacman-ram-v4)
EnvSpec(Kangaroo-v0)
EnvSpec(Assault-ramDeterministic-v4)
EnvSpec(MontezumaRevenge-ram-v4)
EnvSpec(CarnivalDeterministic-v0)
EnvSpec(DemonAttack-ramDeterministic-v4)
EnvSpec(OneRoundDeterministicReward-v0)
EnvSpec(BowlingDeterministic-v4)
EnvSpec(HeroDeterministic-v4)
EnvSpec(CarnivalNoFrameskip-v4)
EnvSpec(Freeway-v4)
EnvSpec(Qbert-ramDeterministic-v4)
EnvSpec(FreewayDeterministic-v0)
EnvSpec(Berzerk-ramNoFrameskip-v0)
EnvSpec(SeaquestDeterministic-v4)
EnvSpec(Freeway-v0)
EnvSpec(Bowling-ramDeterministic-v0)
EnvSpec(Venture-ramNoFrameskip-v4)
EnvSpec(OneRoundNondeterministicReward-v0)
EnvSpec(FreewayNoFrameskip-v0)
EnvSpec(GravitarNoFrameskip-v4)
EnvSpec(CrazyClimber-ramNoFrameskip-v0)
EnvSpec(Pong-ramDeterministic-v0)
EnvSpec(Krull-ram-v0)
EnvSpec(PrivateEye-v0)
EnvSpec(BoxingDeterministic-v0)
EnvSpec(StarGunner-ramDeterministic-v4)
EnvSpec(BeamRider-ram-v4)
EnvSpec(ChopperCommand-ramNoFrameskip-v4)
EnvSpec(Bowling-ramDeterministic-v4)
EnvSpec(Pooyan-ramNoFrameskip-v4)
EnvSpec(IceHockey-ramNoFrameskip-v0)
EnvSpec(PrivateEye-ram-v4)
EnvSpec(Zaxxon-ram-v0)
EnvSpec(AlienNoFrameskip-v0)
EnvSpec(AsterixDeterministic-v0)
EnvSpec(Go19x19-v0)
EnvSpec(Gopher-v0)
EnvSpec(Solaris-ramDeterministic-v4)
EnvSpec(YarsRevenge-ram-v0)
EnvSpec(PongDeterministic-v0)
EnvSpec(GravitarNoFrameskip-v0)
EnvSpec(UpNDown-ram-v0)
EnvSpec(Gravitar-ramDeterministic-v4)
EnvSpec(Jamesbond-ramDeterministic-v4)
EnvSpec(TwoRoundNondeterministicReward-v0)
EnvSpec(BerzerkDeterministic-v4)
EnvSpec(Gravitar-ramNoFrameskip-v4)
EnvSpec(Robotank-ramDeterministic-v4)
EnvSpec(Centipede-v0)
EnvSpec(BattleZone-v4)
EnvSpec(Seaquest-ram-v0)
EnvSpec(EnduroDeterministic-v0)
EnvSpec(VideoPinball-ramNoFrameskip-v0)
EnvSpec(KungFuMaster-ramNoFrameskip-v4)
EnvSpec(CartPole-v0)
EnvSpec(Krull-ramNoFrameskip-v4)
EnvSpec(VideoPinball-ram-v4)
EnvSpec(Asteroids-ramDeterministic-v4)
EnvSpec(Centipede-ramNoFrameskip-v0)
EnvSpec(Enduro-ramNoFrameskip-v0)
EnvSpec(VentureDeterministic-v0)
EnvSpec(ElevatorAction-ramNoFrameskip-v0)
EnvSpec(KungFuMasterDeterministic-v0)
EnvSpec(ChopperCommandDeterministic-v4)
EnvSpec(EnduroNoFrameskip-v4)
EnvSpec(Carnival-ramNoFrameskip-v4)
EnvSpec(Pooyan-ramDeterministic-v4)
EnvSpec(ReversedAddition3-v0)
EnvSpec(Qbert-ramDeterministic-v0)
EnvSpec(DoubleDunkNoFrameskip-v0)
EnvSpec(WizardOfWor-v4)
EnvSpec(Walker2d-v1)
EnvSpec(Robotank-ramDeterministic-v0)
EnvSpec(JourneyEscape-ramNoFrameskip-v0)
EnvSpec(WizardOfWor-ramDeterministic-v0)
EnvSpec(Bowling-v0)
EnvSpec(Berzerk-ramDeterministic-v4)
EnvSpec(Tennis-ram-v0)
EnvSpec(TimePilot-ramDeterministic-v0)
EnvSpec(Go9x9-v0)
EnvSpec(Zaxxon-v0)
EnvSpec(GuessingGame-v0)
EnvSpec(AsteroidsNoFrameskip-v4)
EnvSpec(Ant-v1)
EnvSpec(AirRaidDeterministic-v4)
EnvSpec(Alien-ramNoFrameskip-v4)
EnvSpec(TutankhamDeterministic-v0)
EnvSpec(FrostbiteDeterministic-v4)
EnvSpec(Qbert-ramNoFrameskip-v0)
EnvSpec(TimePilot-ram-v4)
EnvSpec(PhoenixDeterministic-v4)
EnvSpec(Pitfall-ramNoFrameskip-v4)
EnvSpec(SeaquestDeterministic-v0)
EnvSpec(BeamRiderDeterministic-v0)
EnvSpec(Asteroids-ramNoFrameskip-v0)
EnvSpec(Phoenix-ram-v4)
EnvSpec(RepeatCopy-v0)
EnvSpec(MontezumaRevenge-ramNoFrameskip-v4)
EnvSpec(StarGunnerNoFrameskip-v4)
EnvSpec(ElevatorActionNoFrameskip-v0)
EnvSpec(ElevatorAction-ram-v4)
EnvSpec(AirRaid-v0)
EnvSpec(Pooyan-ramDeterministic-v0)
EnvSpec(PrivateEye-v4)
EnvSpec(KrullNoFrameskip-v0)
EnvSpec(Krull-ram-v4)
EnvSpec(CarnivalDeterministic-v4)
EnvSpec(OffSwitchCartpole-v0)
EnvSpec(BerzerkNoFrameskip-v0)
EnvSpec(DoubleDunkDeterministic-v0)
EnvSpec(ChopperCommandNoFrameskip-v0)
EnvSpec(WizardOfWor-ramNoFrameskip-v4)
EnvSpec(Pitfall-ramDeterministic-v0)
EnvSpec(Hero-ram-v4)
EnvSpec(KungFuMaster-ramDeterministic-v0)
EnvSpec(SolarisNoFrameskip-v0)
EnvSpec(Reacher-v1)
EnvSpec(BankHeist-ramDeterministic-v4)
EnvSpec(Atlantis-ram-v0)
EnvSpec(Gravitar-ramDeterministic-v0)
EnvSpec(IceHockeyDeterministic-v0)
EnvSpec(AmidarDeterministic-v0)
EnvSpec(DoubleDunk-v4)
EnvSpec(Kangaroo-v4)
EnvSpec(Alien-v4)
EnvSpec(KellyCoinflip-v0)
EnvSpec(Bowling-ramNoFrameskip-v0)
EnvSpec(BankHeist-ram-v0)
EnvSpec(MontezumaRevenge-ramNoFrameskip-v0)
EnvSpec(PitfallDeterministic-v4)
EnvSpec(Venture-ram-v0)
EnvSpec(NameThisGame-v0)
EnvSpec(WizardOfWor-ramNoFrameskip-v0)
EnvSpec(RoadRunner-ramNoFrameskip-v4)
EnvSpec(TutankhamDeterministic-v4)
EnvSpec(Carnival-ramDeterministic-v0)
EnvSpec(NameThisGameNoFrameskip-v0)
EnvSpec(Asterix-ramDeterministic-v0)
EnvSpec(YarsRevengeNoFrameskip-v4)
EnvSpec(HeroNoFrameskip-v4)
EnvSpec(MontezumaRevengeDeterministic-v4)
EnvSpec(BattleZone-ramNoFrameskip-v0)
EnvSpec(MsPacmanNoFrameskip-v4)
EnvSpec(BreakoutNoFrameskip-v0)
EnvSpec(Solaris-v0)
EnvSpec(ChopperCommand-v4)
EnvSpec(StarGunnerNoFrameskip-v0)
EnvSpec(Gopher-ramNoFrameskip-v0)
EnvSpec(JourneyEscapeNoFrameskip-v4)
EnvSpec(SpaceInvadersNoFrameskip-v0)
EnvSpec(Krull-ramDeterministic-v4)
EnvSpec(Gravitar-v0)
EnvSpec(Alien-ram-v4)
EnvSpec(Riverraid-ramDeterministic-v0)
EnvSpec(EnduroNoFrameskip-v0)
EnvSpec(IceHockey-v4)
EnvSpec(Seaquest-ramNoFrameskip-v0)
EnvSpec(RoadRunner-v0)
EnvSpec(SpaceInvadersNoFrameskip-v4)
EnvSpec(AirRaid-ramDeterministic-v4)
EnvSpec(ElevatorAction-ramDeterministic-v0)
EnvSpec(TutankhamNoFrameskip-v0)
EnvSpec(AsterixNoFrameskip-v0)
EnvSpec(Alien-ram-v0)
EnvSpec(Riverraid-ramNoFrameskip-v4)
EnvSpec(CrazyClimber-ram-v4)
EnvSpec(SkiingDeterministic-v4)
EnvSpec(Hero-ram-v0)
EnvSpec(RobotankDeterministic-v0)
EnvSpec(Breakout-v0)
EnvSpec(Venture-ram-v4)
EnvSpec(MontezumaRevengeNoFrameskip-v4)
EnvSpec(Zaxxon-ramNoFrameskip-v4)
EnvSpec(FrostbiteDeterministic-v0)
EnvSpec(NameThisGame-ram-v0)
EnvSpec(Seaquest-v4)
EnvSpec(BerzerkNoFrameskip-v4)
EnvSpec(Riverraid-ram-v4)
EnvSpec(UpNDown-v0)
EnvSpec(Pong-ram-v0)
EnvSpec(Pooyan-ram-v0)
EnvSpec(Pitfall-v4)
EnvSpec(KangarooDeterministic-v0)
EnvSpec(RiverraidNoFrameskip-v4)
EnvSpec(MontezumaRevengeNoFrameskip-v0)
EnvSpec(ChopperCommand-ram-v0)
EnvSpec(Reverse-v0)
EnvSpec(Frostbite-v0)
EnvSpec(Skiing-ram-v0)
EnvSpec(CrazyClimberDeterministic-v4)
EnvSpec(Centipede-ram-v0)
EnvSpec(MsPacman-ramDeterministic-v4)
EnvSpec(MountainCarContinuous-v0)
EnvSpec(TimePilotDeterministic-v4)
EnvSpec(GravitarDeterministic-v4)
EnvSpec(Acrobot-v1)
EnvSpec(FishingDerbyNoFrameskip-v0)
EnvSpec(Breakout-ramDeterministic-v0)
EnvSpec(DemonAttack-v0)
EnvSpec(DuplicatedInput-v0)
EnvSpec(Enduro-ram-v4)
EnvSpec(AtlantisDeterministic-v0)
EnvSpec(SolarisNoFrameskip-v4)
EnvSpec(Pong-ram-v4)
EnvSpec(SpaceInvaders-ram-v4)
EnvSpec(Hero-ramDeterministic-v4)
EnvSpec(Amidar-v4)
EnvSpec(YarsRevenge-v0)
EnvSpec(Gravitar-ramNoFrameskip-v0)
EnvSpec(Jamesbond-v4)
EnvSpec(BoxingNoFrameskip-v0)
EnvSpec(CartPole-v1)
EnvSpec(Assault-ram-v4)
EnvSpec(FrostbiteNoFrameskip-v0)
EnvSpec(MontezumaRevengeDeterministic-v0)
EnvSpec(Berzerk-v4)
EnvSpec(BeamRider-ramDeterministic-v0)
EnvSpec(Frostbite-v4)
EnvSpec(IceHockeyNoFrameskip-v4)
EnvSpec(Bowling-v4)
EnvSpec(KungFuMaster-v4)
EnvSpec(Skiing-ramDeterministic-v4)
EnvSpec(PooyanNoFrameskip-v0)
EnvSpec(AirRaid-v4)
EnvSpec(Tutankham-ram-v0)
EnvSpec(Asteroids-ramDeterministic-v0)
EnvSpec(Tutankham-v4)
EnvSpec(PrivateEyeNoFrameskip-v4)
EnvSpec(Amidar-ramDeterministic-v0)
EnvSpec(Berzerk-ram-v0)
EnvSpec(DoubleDunk-ram-v0)
EnvSpec(FreewayNoFrameskip-v4)
EnvSpec(TennisDeterministic-v4)
EnvSpec(DoubleDunk-v0)
EnvSpec(Asterix-ramNoFrameskip-v4)
EnvSpec(Asterix-ramDeterministic-v4)
EnvSpec(ChopperCommandNoFrameskip-v4)
EnvSpec(PrivateEyeDeterministic-v4)
EnvSpec(StarGunner-v4)
EnvSpec(PrivateEye-ramDeterministic-v4)
EnvSpec(SeaquestNoFrameskip-v4)
EnvSpec(GopherNoFrameskip-v0)
EnvSpec(SolarisDeterministic-v4)
EnvSpec(BeamRider-ram-v0)
EnvSpec(RobotankDeterministic-v4)
EnvSpec(Solaris-v4)
EnvSpec(SolarisDeterministic-v0)
EnvSpec(ElevatorAction-v0)
EnvSpec(DoubleDunk-ram-v4)
EnvSpec(BankHeistDeterministic-v0)
EnvSpec(AsteroidsDeterministic-v0)
EnvSpec(Atlantis-ram-v4)
EnvSpec(Blackjack-v0)
EnvSpec(Alien-v0)
EnvSpec(Atlantis-ramNoFrameskip-v4)
EnvSpec(FishingDerbyNoFrameskip-v4)
EnvSpec(Kangaroo-ramDeterministic-v0)
EnvSpec(TimePilotDeterministic-v0)
EnvSpec(YarsRevengeDeterministic-v0)
EnvSpec(StarGunner-ramNoFrameskip-v4)
EnvSpec(SeaquestNoFrameskip-v0)
EnvSpec(KungFuMasterNoFrameskip-v4)
EnvSpec(FishingDerby-ram-v4)
EnvSpec(Qbert-v4)
EnvSpec(MountainCar-v0)
EnvSpec(FishingDerby-ramDeterministic-v4)
EnvSpec(Enduro-v4)
EnvSpec(Centipede-ram-v4)
EnvSpec(RiverraidNoFrameskip-v0)
EnvSpec(Qbert-v0)
EnvSpec(JourneyEscape-v4)
EnvSpec(AtlantisNoFrameskip-v4)
EnvSpec(Riverraid-v0)
EnvSpec(Pong-ramNoFrameskip-v4)
EnvSpec(FishingDerby-ramNoFrameskip-v0)
EnvSpec(Gopher-ram-v0)
EnvSpec(Seaquest-ramDeterministic-v0)
EnvSpec(TimePilot-v4)
EnvSpec(Solaris-ramDeterministic-v0)
EnvSpec(DemonAttackDeterministic-v4)
EnvSpec(VideoPinballDeterministic-v0)
EnvSpec(StarGunner-ramNoFrameskip-v0)
EnvSpec(Asteroids-ramNoFrameskip-v4)
EnvSpec(LunarLanderContinuous-v2)
EnvSpec(Zaxxon-ramDeterministic-v4)
EnvSpec(YarsRevenge-v4)
EnvSpec(Swimmer-v1)
EnvSpec(AirRaid-ramDeterministic-v0)
EnvSpec(UpNDown-ramDeterministic-v0)
EnvSpec(AsterixNoFrameskip-v4)
EnvSpec(ChopperCommandDeterministic-v0)
EnvSpec(Seaquest-ram-v4)
EnvSpec(PooyanNoFrameskip-v4)
EnvSpec(Kangaroo-ramDeterministic-v4)
EnvSpec(SpaceInvaders-v4)
EnvSpec(Seaquest-ramNoFrameskip-v4)
EnvSpec(Jamesbond-v0)
EnvSpec(TimePilot-ramDeterministic-v4)
EnvSpec(Tutankham-ramNoFrameskip-v0)
EnvSpec(AssaultNoFrameskip-v4)
EnvSpec(RobotankNoFrameskip-v4)
EnvSpec(MontezumaRevenge-ramDeterministic-v4)
EnvSpec(Solaris-ram-v0)
EnvSpec(Amidar-ramDeterministic-v4)
EnvSpec(Robotank-v4)
EnvSpec(NameThisGame-ramNoFrameskip-v4)
EnvSpec(CNNClassifierTraining-v0)
EnvSpec(Pooyan-ramNoFrameskip-v0)
EnvSpec(SpaceInvadersDeterministic-v0)
EnvSpec(DemonAttackNoFrameskip-v0)
EnvSpec(Gravitar-ram-v4)
EnvSpec(Atlantis-v0)
EnvSpec(Qbert-ramNoFrameskip-v4)
EnvSpec(KungFuMaster-ramDeterministic-v4)
EnvSpec(VideoPinball-ram-v0)
EnvSpec(Tutankham-ramNoFrameskip-v4)
EnvSpec(Breakout-ram-v0)
EnvSpec(PrivateEyeNoFrameskip-v0)
EnvSpec(NameThisGame-ramDeterministic-v4)
EnvSpec(WizardOfWor-ram-v0)
EnvSpec(BattleZoneDeterministic-v4)
EnvSpec(Tutankham-ramDeterministic-v0)
EnvSpec(BowlingNoFrameskip-v0)
EnvSpec(UpNDownNoFrameskip-v4)
EnvSpec(Berzerk-v0)
EnvSpec(Hex9x9-v0)
EnvSpec(Asterix-ram-v4)
EnvSpec(DemonAttack-ramNoFrameskip-v0)
EnvSpec(HeroNoFrameskip-v0)
EnvSpec(CrazyClimber-ramDeterministic-v0)
EnvSpec(MontezumaRevenge-v0)
EnvSpec(StarGunnerDeterministic-v4)
EnvSpec(IceHockey-v0)
EnvSpec(Venture-v4)
EnvSpec(Atlantis-ramDeterministic-v4)
EnvSpec(Phoenix-ram-v0)
EnvSpec(BipedalWalker-v2)
EnvSpec(Venture-ramDeterministic-v4)
EnvSpec(AirRaid-ramNoFrameskip-v4)
EnvSpec(CarnivalNoFrameskip-v0)
EnvSpec(TimePilot-v0)
EnvSpec(KungFuMaster-ram-v4)
EnvSpec(ElevatorAction-ramDeterministic-v4)
EnvSpec(Alien-ramDeterministic-v4)
EnvSpec(Carnival-v4)
EnvSpec(BattleZoneDeterministic-v0)
EnvSpec(Carnival-ram-v4)
EnvSpec(Robotank-ram-v4)
EnvSpec(DoubleDunkNoFrameskip-v4)
EnvSpec(Venture-v0)
EnvSpec(BipedalWalkerHardcore-v2)
EnvSpec(Zaxxon-ram-v4)
EnvSpec(Kangaroo-ramNoFrameskip-v4)
EnvSpec(Enduro-ramDeterministic-v0)
EnvSpec(Skiing-ram-v4)
EnvSpec(KungFuMaster-ram-v0)
EnvSpec(FreewayDeterministic-v4)
EnvSpec(Boxing-ram-v4)
EnvSpec(SemisuperPendulumRandom-v0)
EnvSpec(Robotank-v0)
EnvSpec(Boxing-ramNoFrameskip-v0)
EnvSpec(MontezumaRevenge-ram-v0)
EnvSpec(AtlantisDeterministic-v4)
EnvSpec(MsPacman-ram-v0)
EnvSpec(VideoPinball-ramDeterministic-v4)
EnvSpec(Skiing-v0)
EnvSpec(Asterix-ramNoFrameskip-v0)
EnvSpec(TennisNoFrameskip-v0)
EnvSpec(Jamesbond-ramDeterministic-v0)
EnvSpec(KangarooNoFrameskip-v0)
EnvSpec(JourneyEscape-ram-v0)
EnvSpec(KrullDeterministic-v4)
EnvSpec(Pitfall-ramDeterministic-v4)
EnvSpec(Tutankham-ramDeterministic-v4)
EnvSpec(RoadRunner-ram-v4)
EnvSpec(Zaxxon-ramNoFrameskip-v0)
EnvSpec(BankHeist-v4)
EnvSpec(QbertDeterministic-v0)
EnvSpec(BankHeist-v0)
EnvSpec(CliffWalking-v0)
EnvSpec(Hopper-v1)
EnvSpec(Jamesbond-ram-v4)
EnvSpec(StarGunnerDeterministic-v0)
EnvSpec(VideoPinballNoFrameskip-v0)
EnvSpec(Skiing-ramDeterministic-v0)
EnvSpec(MsPacman-v4)
EnvSpec(GopherDeterministic-v4)
EnvSpec(UpNDown-ramDeterministic-v4)
EnvSpec(Phoenix-ramNoFrameskip-v4)
EnvSpec(Zaxxon-v4)
EnvSpec(Berzerk-ramDeterministic-v0)
EnvSpec(Asteroids-v0)
EnvSpec(Enduro-ramNoFrameskip-v4)
EnvSpec(RoadRunnerDeterministic-v4)
EnvSpec(BankHeist-ramDeterministic-v0)
EnvSpec(DoubleDunkDeterministic-v4)
EnvSpec(CentipedeDeterministic-v4)
EnvSpec(BattleZone-v0)
EnvSpec(ChopperCommand-ram-v4)
EnvSpec(Frostbite-ramDeterministic-v4)
EnvSpec(MontezumaRevenge-v4)
EnvSpec(PitfallNoFrameskip-v4)
EnvSpec(Amidar-ramNoFrameskip-v4)
EnvSpec(PhoenixNoFrameskip-v4)
EnvSpec(MsPacman-ramDeterministic-v0)
EnvSpec(Frostbite-ramNoFrameskip-v4)
EnvSpec(Amidar-ram-v0)
EnvSpec(JourneyEscapeNoFrameskip-v0)
EnvSpec(GravitarDeterministic-v0)
EnvSpec(YarsRevenge-ramDeterministic-v0)
EnvSpec(SemisuperPendulumDecay-v0)
EnvSpec(CrazyClimberNoFrameskip-v4)
EnvSpec(ElevatorActionDeterministic-v4)
EnvSpec(MsPacmanNoFrameskip-v0)
EnvSpec(ChopperCommand-v0)
EnvSpec(SpaceInvaders-ramDeterministic-v4)
EnvSpec(JourneyEscape-ram-v4)
EnvSpec(Tennis-ram-v4)
EnvSpec(VideoPinballDeterministic-v4)
EnvSpec(Freeway-ram-v4)
EnvSpec(AirRaid-ramNoFrameskip-v0)
EnvSpec(BankHeistNoFrameskip-v4)
EnvSpec(KrullDeterministic-v0)
EnvSpec(Freeway-ramNoFrameskip-v0)
EnvSpec(UpNDownDeterministic-v4)
EnvSpec(Asterix-v0)
EnvSpec(BankHeistNoFrameskip-v0)
EnvSpec(SpaceInvaders-ramNoFrameskip-v4)
EnvSpec(WizardOfWor-ram-v4)
EnvSpec(Venture-ramDeterministic-v0)
EnvSpec(CrazyClimber-v4)
EnvSpec(PooyanDeterministic-v0)
EnvSpec(Alien-ramDeterministic-v0)
EnvSpec(BeamRider-ramNoFrameskip-v0)
EnvSpec(JourneyEscape-ramDeterministic-v0)
EnvSpec(Pooyan-ram-v4)
EnvSpec(AirRaidDeterministic-v0)
EnvSpec(Robotank-ram-v0)
EnvSpec(Pitfall-ramNoFrameskip-v0)
EnvSpec(Gravitar-ram-v0)
EnvSpec(SpaceInvaders-ram-v0)
EnvSpec(DoubleDunk-ramDeterministic-v4)
EnvSpec(InvertedPendulum-v1)
EnvSpec(StarGunner-v0)
EnvSpec(ReversedAddition-v0)
EnvSpec(TimePilot-ram-v0)
EnvSpec(BeamRiderNoFrameskip-v0)
EnvSpec(Gopher-v4)
EnvSpec(KellyCoinflipGeneralized-v0)
EnvSpec(Jamesbond-ramNoFrameskip-v4)
EnvSpec(Phoenix-ramDeterministic-v4)
EnvSpec(Hero-v0)
EnvSpec(Skiing-v4)
EnvSpec(Carnival-ramNoFrameskip-v0)
EnvSpec(FishingDerbyDeterministic-v4)
EnvSpec(Asterix-ram-v0)
EnvSpec(ZaxxonDeterministic-v0)
EnvSpec(Solaris-ramNoFrameskip-v4)
EnvSpec(Hero-ramNoFrameskip-v4)
EnvSpec(BreakoutDeterministic-v0)
EnvSpec(RoadRunnerDeterministic-v0)
EnvSpec(Pong-v4)
EnvSpec(PongDeterministic-v4)
EnvSpec(HotterColder-v0)
EnvSpec(Gopher-ram-v4)
EnvSpec(ZaxxonDeterministic-v4)
EnvSpec(StarGunner-ram-v4)
EnvSpec(IceHockeyNoFrameskip-v0)
EnvSpec(WizardOfWor-v0)
EnvSpec(Freeway-ramDeterministic-v0)
EnvSpec(NameThisGame-ram-v4)
EnvSpec(BattleZone-ram-v4)
EnvSpec(Pitfall-ram-v0)
EnvSpec(NChain-v0)
EnvSpec(Riverraid-ramDeterministic-v4)
EnvSpec(Copy-v0)
EnvSpec(KungFuMasterNoFrameskip-v0)
EnvSpec(Qbert-ram-v0)
EnvSpec(CentipedeNoFrameskip-v0)
EnvSpec(Gopher-ramDeterministic-v0)
EnvSpec(AsteroidsDeterministic-v4)
EnvSpec(Boxing-v0)
EnvSpec(VentureDeterministic-v4)
EnvSpec(UpNDownDeterministic-v0)
EnvSpec(Asteroids-ram-v4)
EnvSpec(PredictActionsCartpole-v0)
EnvSpec(Pitfall-ram-v4)
EnvSpec(SpaceInvadersDeterministic-v4)
EnvSpec(FrostbiteNoFrameskip-v4)
EnvSpec(DoubleDunk-ramNoFrameskip-v0)
EnvSpec(TutankhamNoFrameskip-v4)
EnvSpec(Tennis-v4)
EnvSpec(TennisDeterministic-v0)
EnvSpec(Kangaroo-ram-v4)
EnvSpec(Enduro-ramDeterministic-v4)
EnvSpec(KangarooNoFrameskip-v4)
EnvSpec(AssaultDeterministic-v4)
EnvSpec(WizardOfWorDeterministic-v4)
EnvSpec(AlienDeterministic-v4)
EnvSpec(Breakout-ramNoFrameskip-v4)
EnvSpec(YarsRevenge-ramNoFrameskip-v0)
EnvSpec(Alien-ramNoFrameskip-v0)
EnvSpec(YarsRevenge-ram-v4)
EnvSpec(BankHeistDeterministic-v4)
EnvSpec(StarGunner-ramDeterministic-v0)
EnvSpec(FishingDerby-v0)
EnvSpec(UpNDown-ramNoFrameskip-v4)
EnvSpec(VideoPinball-v4)
EnvSpec(Frostbite-ram-v0)
EnvSpec(UpNDown-ramNoFrameskip-v0)
EnvSpec(Amidar-ramNoFrameskip-v0)
EnvSpec(JamesbondNoFrameskip-v0)
EnvSpec(AlienDeterministic-v0)
EnvSpec(YarsRevenge-ramDeterministic-v4)
EnvSpec(Phoenix-ramDeterministic-v0)
EnvSpec(MsPacman-ramNoFrameskip-v0)
EnvSpec(Krull-ramNoFrameskip-v0)
EnvSpec(WizardOfWorDeterministic-v0)
EnvSpec(Seaquest-ramDeterministic-v4)
EnvSpec(Centipede-ramNoFrameskip-v4)
EnvSpec(PrivateEye-ramNoFrameskip-v0)
EnvSpec(Seaquest-v0)
EnvSpec(Skiing-ramNoFrameskip-v4)
EnvSpec(RoadRunner-ram-v0)
EnvSpec(Tennis-ramDeterministic-v0)
EnvSpec(QbertNoFrameskip-v4)
EnvSpec(BankHeist-ram-v4)
EnvSpec(PrivateEye-ram-v0)
EnvSpec(VentureNoFrameskip-v0)
EnvSpec(MontezumaRevenge-ramDeterministic-v0)
EnvSpec(Bowling-ram-v0)
EnvSpec(ElevatorAction-ramNoFrameskip-v4)
EnvSpec(Boxing-v4)
EnvSpec(UpNDownNoFrameskip-v0)
EnvSpec(Assault-ramNoFrameskip-v4)
EnvSpec(GopherNoFrameskip-v4)
EnvSpec(Qbert-ram-v4)
EnvSpec(Krull-v4)
EnvSpec(Centipede-v4)
EnvSpec(Pendulum-v0)
EnvSpec(BattleZoneNoFrameskip-v4)
EnvSpec(RobotankNoFrameskip-v0)
EnvSpec(HeroDeterministic-v0)
EnvSpec(Tennis-ramNoFrameskip-v4)
EnvSpec(DemonAttack-ramNoFrameskip-v4)
EnvSpec(FishingDerby-ramNoFrameskip-v4)
EnvSpec(Boxing-ramNoFrameskip-v4)
EnvSpec(MsPacmanDeterministic-v0)
EnvSpec(TimePilot-ramNoFrameskip-v4)
EnvSpec(YarsRevengeDeterministic-v4)
EnvSpec(WizardOfWor-ramDeterministic-v4)
EnvSpec(MsPacmanDeterministic-v4)
EnvSpec(JamesbondDeterministic-v4)
EnvSpec(Pusher-v0)
EnvSpec(RoadRunnerNoFrameskip-v4)
EnvSpec(SpaceInvaders-ramNoFrameskip-v0)
EnvSpec(Krull-ramDeterministic-v0)
EnvSpec(Enduro-v0)
EnvSpec(AmidarNoFrameskip-v4)
EnvSpec(KungFuMasterDeterministic-v4)
EnvSpec(RoadRunner-ramNoFrameskip-v0)
EnvSpec(DoubleDunk-ramDeterministic-v0)
EnvSpec(Skiing-ramNoFrameskip-v0)
EnvSpec(FishingDerby-v4)
EnvSpec(VideoPinball-v0)
EnvSpec(BeamRiderNoFrameskip-v4)
EnvSpec(YarsRevenge-ramNoFrameskip-v4)
EnvSpec(PongNoFrameskip-v0)
EnvSpec(AirRaidNoFrameskip-v0)
EnvSpec(PrivateEye-ramDeterministic-v0)
EnvSpec(BattleZoneNoFrameskip-v0)
EnvSpec(Boxing-ramDeterministic-v0)
EnvSpec(JamesbondNoFrameskip-v4)
EnvSpec(IceHockey-ramDeterministic-v0)
EnvSpec(PooyanDeterministic-v4)
EnvSpec(Frostbite-ramNoFrameskip-v0)
EnvSpec(Riverraid-ram-v0)
EnvSpec(BreakoutNoFrameskip-v4)
EnvSpec(Tutankham-v0)
EnvSpec(BankHeist-ramNoFrameskip-v4)
EnvSpec(Atlantis-ramDeterministic-v0)
EnvSpec(LunarLander-v2)
EnvSpec(CrazyClimberNoFrameskip-v0)
EnvSpec(Pitfall-v0)
EnvSpec(Frostbite-ramDeterministic-v0)
EnvSpec(Asterix-v4)
EnvSpec(PhoenixDeterministic-v0)
EnvSpec(Asteroids-ram-v0)
EnvSpec(CrazyClimber-v0)
EnvSpec(PrivateEyeDeterministic-v0)
EnvSpec(Atlantis-ramNoFrameskip-v0)
EnvSpec(DemonAttackNoFrameskip-v4)
EnvSpec(BattleZone-ramDeterministic-v4)
EnvSpec(Centipede-ramDeterministic-v4)
EnvSpec(AssaultNoFrameskip-v0)
EnvSpec(CrazyClimber-ramDeterministic-v4)
EnvSpec(ChopperCommand-ramDeterministic-v4)
EnvSpec(KungFuMaster-ramNoFrameskip-v0)

Play the CartPole game with stochastic control

import matplotlib.pyplot as plt
import matplotlib.animation as anm
from matplotlib import rc
#rc('animation', html='html5')
rc('animation', ffmpeg_path='/usr/bin/ffmpeg')
%matplotlib inline

from matplotlib import animation
from JSAnimation.IPython_display import display_animation

def env_render(env_vis):
    """Show a sequence of rendered frames as a looping inline animation.

    Parameters
    ----------
    env_vis : sequence
        Frames to play back, in order. Each frame is handed to
        ``plt.imshow`` / ``AxesImage.set_data``.

    Returns
    -------
    None
        The animation is shown through ``display``; nothing is returned.
        If ``env_vis`` is empty there is nothing to draw and the function
        returns immediately without creating a figure.

    Notes
    -----
    ``display`` is not imported in the visible part of this file; it is
    assumed to be supplied by the notebook environment.
    """
    # Guard first: indexing env_vis[0] on an empty sequence would raise
    # IndexError after an empty figure had already been created.
    if len(env_vis) == 0:
        return

    fig = plt.figure()
    plot = plt.imshow(env_vis[0])
    plt.axis('off')

    def animate(i):
        # Swap the image data in place instead of redrawing the axes.
        plot.set_data(env_vis[i])

    anim = anm.FuncAnimation(fig,
                             animate,
                             frames=len(env_vis),
                             interval=20,
                             repeat=True,
                             repeat_delay=20)
    display(display_animation(anim, default_mode='loop'))

# Play CartPole with randomly sampled actions, collecting one rendered
# frame per time step so the run can be replayed as an animation.
env = gym.make('CartPole-v0')
n_episodes = 1
env_vis = []  # rendered frames, accumulated across all episodes
for i_episode in range(n_episodes):
    observation = env.reset()
    for t in range(100):  # at most 100 steps per episode
        # Render before stepping so the initial state is captured too.
        env_vis.append(env.render(mode = 'rgb_array'))
        print(observation)
        action = env.action_space.sample()  # random action
        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished at t{}".format(t+1))
            break
# NOTE(review): presumably closes the render window in this Gym version
# (0.9.4 per the captured output) — confirm before upgrading Gym.
env.render(close=True)
env_render(env_vis)

Simple Policies

def policy_logic(env,obs):
    """Hand-written rule: action 1 when obs[2] is positive, otherwise 0.

    ``env`` is unused; it is accepted so every policy shares one signature.
    obs[2] is presumably the pole angle for CartPole — TODO confirm.
    """
    if obs[2] > 0:
        return 1
    return 0
def policy_random(env,obs):
    """Ignore the observation and sample an action from env.action_space."""
    action_space = env.action_space
    return action_space.sample()

def experiment(policy, n_episodes, rewards_max):
    """Run ``policy`` on CartPole-v0 and print min/max/mean episode reward.

    policy      -- callable ``policy(env, obs)`` returning an action
    n_episodes  -- number of episodes to run
    rewards_max -- an episode is cut short once its total reward exceeds this
    """
    rewards = np.empty(shape=(n_episodes))
    env = gym.make('CartPole-v0')

    for episode_idx in range(n_episodes):
        obs = env.reset()
        total = 0
        while True:
            obs, reward, done, info = env.step(policy(env, obs))
            total += reward
            # stop on environment termination or once the cap is exceeded
            if done or total > rewards_max:
                break
        rewards[episode_idx] = total

    summary = ('Policy:{}, Min reward:{}, Max reward:{}, Average reward:{}'
               .format(policy.__name__,
                       np.min(rewards),
                       np.max(rewards),
                       np.mean(rewards)))
    print(summary)

# Evaluate both simple policies over the same number of episodes.
n_episodes = 100
# Cap on a single episode's total reward; experiment() stops an episode
# once its accumulated reward exceeds this value.
rewards_max = 10000
experiment(policy_random, n_episodes, rewards_max)
experiment(policy_logic, n_episodes, rewards_max)
Policy:policy_random, Min reward:9.0, Max reward:81.0, Average reward:22.52
Policy:policy_logic, Min reward:25.0, Max reward:58.0, Average reward:42.37

Simple Policies with Parameters

def policy_logic(theta,obs):
    """Rule-based policy: 1 if obs[2] > 0, else 0.

    ``theta`` is ignored; it exists only so this policy is call-compatible
    with the parameterized one.
    """
    return int(obs[2] > 0)

def policy_random(theta,obs):
    """Linear policy: action 0 when theta . obs is negative, otherwise 1."""
    if np.matmul(theta,obs) < 0:
        return 0
    return 1

def episode(env, policy, rewards_max):
    """Play one episode with ``policy`` and return the accumulated reward.

    When the policy function is named ``policy_random`` a fresh 4-element
    weight vector is drawn for this episode, each component uniform in
    [-1, 1); every other policy receives ``theta=None``.

    The episode ends when the environment reports ``done`` or as soon as
    the accumulated reward exceeds ``rewards_max``.
    """
    obs = env.reset()

    theta = None
    if policy.__name__ == 'policy_random':
        theta = np.random.rand(4) * 2 - 1

    total = 0
    while True:
        obs, reward, done, info = env.step(policy(theta, obs))
        total += reward
        if done or total > rewards_max:
            return total

def experiment(policy, n_episodes, rewards_max):
    """Run ``n_episodes`` CartPole-v0 episodes and print reward statistics.

    Each episode is delegated to ``episode(env, policy, rewards_max)``; the
    minimum, maximum and mean of the returned rewards are printed together
    with the policy's function name.
    """
    rewards = np.empty(shape=(n_episodes))
    env = gym.make('CartPole-v0')

    for run in range(n_episodes):
        rewards[run] = episode(env, policy, rewards_max)

    lowest, highest, average = np.min(rewards), np.max(rewards), np.mean(rewards)
    print('Policy:{}, Min reward:{}, Max reward:{}, Average reward:{}'
          .format(policy.__name__, lowest, highest, average))

# Evaluate the parameterized random policy against the fixed rule.
n_episodes = 100
# Cap on a single episode's total reward; episode() stops once the
# accumulated reward exceeds this value.
rewards_max = 10000
experiment(policy_random, n_episodes, rewards_max)
experiment(policy_logic, n_episodes, rewards_max)
Policy:policy_random, Min reward:8.0, Max reward:200.0, Average reward:36.52
Policy:policy_logic, Min reward:25.0, Max reward:66.0, Average reward:44.18

Simple Policies with Training

# train with random search

def policy_logic(theta,obs):
    """Fixed rule that needs no parameters: 1 when obs[2] > 0, else 0.

    ``theta`` is accepted for signature compatibility and never read.
    """
    is_positive = obs[2] > 0
    return 1 if is_positive else 0

def policy_random(theta,obs):
    """Pick an action from the sign of theta . obs (0 if negative, else 1)."""
    score = np.matmul(theta,obs)
    return 0 if score < 0 else 1

def episode(env,policy, rewards_max,theta):
    """Run a single episode using ``policy(theta, obs)``; return total reward.

    The loop stops when the environment signals ``done`` or once the
    accumulated reward exceeds ``rewards_max``.
    """
    obs, total = env.reset(), 0
    finished = False

    while not finished:
        obs, reward, done, info = env.step(policy(theta, obs))
        total += reward
        finished = done or total > rewards_max
    return total

def train(policy, n_episodes, rewards_max):
    """Random-search training on CartPole-v0.

    For each of ``n_episodes`` episodes a candidate weight vector is drawn
    (only when the policy function is named ``policy_random``; other
    policies get ``theta=None``), one episode is played with it, and the
    candidate with the highest episode reward is kept.

    Parameters
    ----------
    policy : callable
        ``policy(theta, obs)`` returning an action.
    n_episodes : int
        Number of candidates / episodes to try.
    rewards_max : number
        Passed to ``episode``; an episode stops once its total exceeds it.

    Returns
    -------
    (reward_best, theta_best)
        Best episode reward seen and the weights that produced it.
        ``theta_best`` is ``None`` when the winning episode used a
        parameter-free policy, and a zero vector if no episode improved on
        the initial best reward of 0.
    """
    env = gym.make('CartPole-v0')
    # Reseed the global NumPy RNG so the sampled candidates are the same
    # on every call.
    np.random.seed(0)

    # np.zeros rather than np.empty: the latter would hand back
    # uninitialised memory if no episode ever beat reward_best.
    theta_best = np.zeros(shape=[4])
    reward_best = 0

    for _ in range(n_episodes):
        if policy.__name__ == 'policy_random':
            theta = np.random.rand(4) * 2 - 1
        else:
            theta = None

        reward_episode = episode(env, policy, rewards_max, theta)
        if reward_episode > reward_best:
            reward_best = reward_episode
            # theta is None for parameter-free policies; calling .copy()
            # on it would raise AttributeError.
            theta_best = None if theta is None else theta.copy()
    return reward_best, theta_best

def experiment(policy, n_episodes, rewards_max, theta=None):
    """Evaluate ``policy`` with fixed ``theta`` over ``n_episodes`` episodes.

    Creates a CartPole-v0 environment, records the reward returned by
    ``episode`` for every run, and prints the min, max and mean reward
    alongside the policy's function name.
    """
    rewards = np.empty(shape=[n_episodes])
    env = gym.make('CartPole-v0')

    for trial in range(n_episodes):
        rewards[trial] = episode(env, policy, rewards_max, theta)

    stats = (np.min(rewards), np.max(rewards), np.mean(rewards))
    print('Policy:{}, Min reward:{}, Max reward:{}, Average reward:{}'
          .format(policy.__name__, *stats))

# Train the linear policy by random search, then evaluate the trained
# weights and compare against the fixed rule.
n_episodes = 100
rewards_max = 10000  # per-episode reward cap passed to episode()

# train() returns the best episode reward and the weights that achieved it.
reward,theta = train(policy_random, n_episodes, rewards_max)
print('trained theta: {}, rewards: {}'.format(theta,reward))
experiment(policy_random, n_episodes, rewards_max, theta)

# policy_logic ignores theta, so the default theta=None is used.
experiment(policy_logic, n_episodes, rewards_max)
trained theta: [-0.1526904   0.29178823 -0.12482558  0.783546  ], rewards: 200.0
Policy:policy_random, Min reward:200.0, Max reward:200.0, Average reward:200.0
Policy:policy_logic, Min reward:24.0, Max reward:68.0, Average reward:41.66

Simple Policies with Training until Certain Rewards

# train with random search until the best episode reward reaches rewards_max (200 below)
def policy_logic(theta,obs):
    """Parameter-free rule: return 1 if obs[2] is positive, otherwise 0.

    ``theta`` is unused and present only for a uniform policy signature.
    """
    action = 0
    if obs[2] > 0:
        action = 1
    return action

def policy_random(theta,obs):
    """Linear decision rule: 0 when theta . obs < 0, otherwise 1."""
    action = 1
    if np.matmul(theta,obs) < 0:
        action = 0
    return action

def episode(env,policy, rewards_max,theta):
    """Play one episode with ``policy(theta, obs)`` and return its reward sum.

    Terminates when the environment reports ``done`` or as soon as the
    running total exceeds ``rewards_max``.
    """
    state = env.reset()
    reward_sum = 0

    while True:
        chosen = policy(theta, state)
        state, reward, done, info = env.step(chosen)
        reward_sum += reward
        if done or reward_sum > rewards_max:
            break
    return reward_sum

def train(policy, n_episodes, rewards_max):
    """Random-search training on CartPole-v0 with early stopping.

    Candidate weight vectors are drawn one per episode (only when the
    policy function is named ``policy_random``; other policies get
    ``theta=None``). The candidate with the highest episode reward is kept,
    and the search stops as soon as the best reward reaches ``rewards_max``.

    Parameters
    ----------
    policy : callable
        ``policy(theta, obs)`` returning an action.
    n_episodes : int
        Upper bound on the number of candidates / episodes to try.
    rewards_max : number
        Target reward: passed to ``episode`` as its cap and used here as
        the stopping threshold.

    Returns
    -------
    (reward_best, theta_best)
        Best episode reward seen and the weights that produced it.
        ``theta_best`` is ``None`` when the winning episode used a
        parameter-free policy, and a zero vector if no episode improved on
        the initial best reward of 0.
    """
    env = gym.make('CartPole-v0')
    # Reseed the global NumPy RNG so the sampled candidates are the same
    # on every call.
    np.random.seed(0)

    # np.zeros rather than np.empty: the latter would hand back
    # uninitialised memory if no episode ever beat reward_best.
    theta_best = np.zeros(shape=[4])
    reward_best = 0

    # n_episodes acts as a max in this case
    for _ in range(n_episodes):
        if policy.__name__ == 'policy_random':
            theta = np.random.rand(4) * 2 - 1
        else:
            theta = None
        reward_episode = episode(env, policy, rewards_max, theta)
        if reward_episode > reward_best:
            reward_best = reward_episode
            # theta is None for parameter-free policies; calling .copy()
            # on it would raise AttributeError.
            theta_best = None if theta is None else theta.copy()
        if reward_best >= rewards_max:
            break
    return reward_best, theta_best

def experiment(policy, n_episodes, rewards_max, theta=None):
    """Run ``n_episodes`` CartPole-v0 episodes and print reward statistics.

    Every episode uses the same ``policy`` and ``theta``. The minimum,
    maximum and mean of the per-episode rewards are printed; nothing is
    returned.
    """
    env = gym.make('CartPole-v0')
    rewards = np.array(
        [episode(env, policy, rewards_max, theta) for _ in range(n_episodes)],
        dtype=np.float64)

    summary = 'Policy:{}, Min reward:{}, Max reward:{}, Average reward:{}'
    print(summary.format(policy.__name__,
                         np.min(rewards),
                         np.max(rewards),
                         np.mean(rewards)))

# Training phase: n_episodes is only an upper bound here, because train()
# stops early once the best reward reaches rewards_max.
n_episodes, rewards_max = 10000, 200
reward, theta = train(policy=policy_random,
                      n_episodes=n_episodes,
                      rewards_max=rewards_max)
print('trained theta: {}, rewards: {}'.format(theta, reward))

# Evaluation phase: 100 episodes per policy with a reward cap of 10000.
n_episodes, rewards_max = 100, 10000
experiment(policy=policy_random,
           n_episodes=n_episodes,
           rewards_max=rewards_max,
           theta=theta)
experiment(policy=policy_logic,
           n_episodes=n_episodes,
           rewards_max=rewards_max)
trained theta: [-0.1526904   0.29178823 -0.12482558  0.783546  ], rewards: 200.0
Policy:policy_random, Min reward:200.0, Max reward:200.0, Average reward:200.0
Policy:policy_logic, Min reward:24.0, Max reward:64.0, Average reward:41.96

Neural Network Policy

# train with the neural network

def policy_random(theta,obs):
    """Return action 0 if ``np.matmul(theta, obs)`` is below zero, else 1."""
    projection = np.matmul(theta, obs)
    if projection < 0:
        return 0
    return 1

def policy_naive_nn(nn,obs):
    """Pick the action whose score from ``nn.predict`` is largest.

    The single observation is wrapped into a batch of one before being
    passed to ``nn.predict``; the index of the largest output is returned.
    """
    batch = np.array([obs])
    scores = nn.predict(batch)
    return np.argmax(scores)


# returns obs -> actions -> rewards arrays
# specify t_max to run for t_max steps
# specify r_max to run until r_max is reached
# specify both t_max and r_max to run for t_max but break if r_max is reached
def episode(env, policy, theta, r_max=0, t_max=0, return_hist_reward=0):
    """Run one episode and return either its history or its total reward.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``.
        policy: callable ``policy(theta, obs)`` returning an action.
        theta: passed to ``policy`` unchanged on every step.
        r_max: if > 0, stop once the running reward becomes greater than it.
        t_max: if > 0, stop after exactly this many steps.
        return_hist_reward: if > 0, observations, actions and rewards are
            recorded while the episode runs.

    Returns:
        ``[observations, actions, rewards]`` (three NumPy arrays) when
        recording is enabled and the total reward does not exceed
        ``return_hist_reward``; otherwise the scalar total reward.
    """
    record = return_hist_reward > 0
    o_list = []
    a_list = []
    r_list = []

    episode_reward = 0

    obs = env.reset()
    done = False
    t = 0
    while not done:
        action = policy(theta, obs)
        if record:
            # Store the observation the action was chosen from, not the
            # one that results from it.
            o_list.append(obs)
            a_list.append(action)
        obs, reward, done, info = env.step(action)
        if record:
            r_list.append(reward)
        episode_reward += reward
        if r_max > 0 and episode_reward > r_max:
            break
        t += 1
        if t_max > 0 and t == t_max:
            break

    # Guard on `record`: with the default threshold of 0, a zero or negative
    # total reward would otherwise select the history branch although
    # nothing was recorded.
    # NOTE(review): history is returned for episodes whose reward does NOT
    # exceed the threshold - confirm this direction is intended.
    if record and return_hist_reward >= episode_reward:
        return_val = [np.array(o_list), np.array(a_list), np.array(r_list)]
    else:
        return_val = episode_reward
    return return_val

# experiment collect observations and rewards for each episode
def experiment(env, policy, n_episodes,theta=None, r_max=0, t_max=0, return_hist_reward=0):
    """Run several episodes and either collect their history or report rewards.

    Args:
        env: environment forwarded to ``episode``.
        policy: callable ``policy(theta, obs)``; its ``__name__`` is printed
            in reward mode.
        n_episodes: number of episodes to run.
        theta: forwarded to ``episode`` unchanged.
        r_max, t_max: stopping limits forwarded to ``episode``.
        return_hist_reward: if > 0 the function runs in history mode,
            otherwise in reward mode.

    Returns:
        History mode: ``[observations, actions]``, the per-episode arrays
        concatenated along axis 0. Only episodes for which ``episode``
        returned a history list contribute.
        Reward mode: an empty list, after printing the minimum, maximum and
        mean episode reward.

    Raises:
        ValueError: in history mode when no episode returned a history, so
            there is nothing to concatenate.
    """
    collect = return_hist_reward > 0
    obs_list = []
    action_list = []
    rewards = None if collect else np.empty(shape=[n_episodes])

    for i in range(n_episodes):
        val = episode(env, policy, theta, r_max, t_max, return_hist_reward)
        got_history = isinstance(val, list)
        if collect:
            # Episodes that came back as a bare reward carry no history and
            # are skipped.
            if got_history:
                obs_list.append(val[0])
                action_list.append(val[1])
        elif got_history:
            # Defensive: reduce a history to its total reward.
            rewards[i] = np.sum(val[2])
        else:
            rewards[i] = val

    if collect:
        if not obs_list:
            # np.concatenate([]) would fail with a message that does not
            # point at the real cause.
            raise ValueError(
                'no episode returned a history (n_episodes={}, '
                'return_hist_reward={})'.format(n_episodes, return_hist_reward))
        return [np.concatenate(obs_list, axis=0),
                np.concatenate(action_list, axis=0)]

    print('Policy:{}, Min reward:{}, Max reward:{}, Average reward:{}'
          .format(policy.__name__,
                  np.min(rewards),
                  np.max(rewards),
                  np.mean(rewards)))
    return []
# build the model
from keras.models import Sequential
from keras.layers import Dense

# 4 inputs -> 8 ReLU units -> 2 softmax outputs
model = Sequential([
    Dense(8, input_dim=4, activation='relu'),
    Dense(2, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_7 (Dense)              (None, 8)                 40        
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 18        
=================================================================
Total params: 58
Trainable params: 58
Non-trainable params: 0
_________________________________________________________________
# create training data
env = gym.make('CartPole-v0')
n_obs, n_actions = 4, 2
# random weights for policy_random, each drawn from [-1, 1)
theta = np.random.rand(4) * 2 - 1
n_episodes = 100
r_max = t_max = 0

# history mode: experiment() returns concatenated observations and actions
x_train, y_train = experiment(env=env,
                              policy=policy_random,
                              n_episodes=n_episodes,
                              theta=theta,
                              r_max=r_max,
                              t_max=t_max,
                              return_hist_reward=100)
# one-hot encode the actions by indexing rows of the identity matrix
y_train = np.eye(n_actions)[y_train]
print(x_train.shape, y_train.shape)
(5933, 4) (5933, 2)
# train the model on the recorded observations and one-hot actions
model.fit(x=x_train, y=y_train, batch_size=10, epochs=50)
Epoch 1/50
5933/5933 [==============================] - 2s 365us/step - loss: 0.5358
Epoch 2/50
5933/5933 [==============================] - 2s 292us/step - loss: 0.2386
Epoch 3/50
5933/5933 [==============================] - 2s 262us/step - loss: 0.1447
Epoch 4/50
5933/5933 [==============================] - 1s 245us/step - loss: 0.1129
Epoch 5/50
5933/5933 [==============================] - 1s 232us/step - loss: 0.0963
Epoch 6/50
5933/5933 [==============================] - 1s 252us/step - loss: 0.0858
Epoch 7/50
5933/5933 [==============================] - 2s 255us/step - loss: 0.0787
Epoch 8/50
5933/5933 [==============================] - 2s 294us/step - loss: 0.0728
Epoch 9/50
5933/5933 [==============================] - 2s 269us/step - loss: 0.0682
Epoch 10/50
5933/5933 [==============================] - 2s 330us/step - loss: 0.0644
Epoch 11/50
5933/5933 [==============================] - 2s 363us/step - loss: 0.0610
Epoch 12/50
5933/5933 [==============================] - 2s 337us/step - loss: 0.0586
Epoch 13/50
5933/5933 [==============================] - 2s 289us/step - loss: 0.0561
Epoch 14/50
5933/5933 [==============================] - 1s 230us/step - loss: 0.0539
Epoch 15/50
5933/5933 [==============================] - 1s 226us/step - loss: 0.0517
Epoch 16/50
5933/5933 [==============================] - 2s 267us/step - loss: 0.0499
Epoch 17/50
5933/5933 [==============================] - 3s 430us/step - loss: 0.0482
Epoch 18/50
5933/5933 [==============================] - 2s 321us/step - loss: 0.0465
Epoch 19/50
5933/5933 [==============================] - 2s 281us/step - loss: 0.0451
Epoch 20/50
5933/5933 [==============================] - 2s 321us/step - loss: 0.0439
Epoch 21/50
5933/5933 [==============================] - 2s 288us/step - loss: 0.0425
Epoch 22/50
5933/5933 [==============================] - 2s 267us/step - loss: 0.0416
Epoch 23/50
5933/5933 [==============================] - 2s 272us/step - loss: 0.0402
Epoch 24/50
5933/5933 [==============================] - 2s 316us/step - loss: 0.0393
Epoch 25/50
5933/5933 [==============================] - 1s 251us/step - loss: 0.0382
Epoch 26/50
5933/5933 [==============================] - 2s 284us/step - loss: 0.0372
Epoch 27/50
5933/5933 [==============================] - 2s 325us/step - loss: 0.0365
Epoch 28/50
5933/5933 [==============================] - 2s 256us/step - loss: 0.0355
Epoch 29/50
5933/5933 [==============================] - 2s 281us/step - loss: 0.0346
Epoch 30/50
5933/5933 [==============================] - 2s 399us/step - loss: 0.0338
Epoch 31/50
5933/5933 [==============================] - 2s 392us/step - loss: 0.0333
Epoch 32/50
5933/5933 [==============================] - 3s 440us/step - loss: 0.0325
Epoch 33/50
5933/5933 [==============================] - 2s 355us/step - loss: 0.0316
Epoch 34/50
5933/5933 [==============================] - 2s 359us/step - loss: 0.0311
Epoch 35/50
5933/5933 [==============================] - 2s 279us/step - loss: 0.0305
Epoch 36/50
5933/5933 [==============================] - 2s 418us/step - loss: 0.0297
Epoch 37/50
5933/5933 [==============================] - 2s 384us/step - loss: 0.0292
Epoch 38/50
5933/5933 [==============================] - 2s 301us/step - loss: 0.0291
Epoch 39/50
5933/5933 [==============================] - 2s 308us/step - loss: 0.0283
Epoch 40/50
5933/5933 [==============================] - 2s 273us/step - loss: 0.0279
Epoch 41/50
5933/5933 [==============================] - 2s 323us/step - loss: 0.0272
Epoch 42/50
5933/5933 [==============================] - 2s 281us/step - loss: 0.0267
Epoch 43/50
5933/5933 [==============================] - 2s 336us/step - loss: 0.0266
Epoch 44/50
5933/5933 [==============================] - 2s 392us/step - loss: 0.0260
Epoch 45/50
5933/5933 [==============================] - 2s 339us/step - loss: 0.0256
Epoch 46/50
5933/5933 [==============================] - 2s 266us/step - loss: 0.0253
Epoch 47/50
5933/5933 [==============================] - 2s 378us/step - loss: 0.0247
Epoch 48/50
5933/5933 [==============================] - 2s 343us/step - loss: 0.0248
Epoch 49/50
5933/5933 [==============================] - 1s 250us/step - loss: 0.0242
Epoch 50/50
5933/5933 [==============================] - 2s 289us/step - loss: 0.0239





<keras.callbacks.History at 0x7f3b48175cc0>
n_episodes = 200
r_max = t_max = 0

# Network policy: the fitted model is passed in the theta slot, because
# policy_naive_nn receives it as its first argument.
_ = experiment(env=env,
               policy=policy_naive_nn,
               n_episodes=n_episodes,
               theta=model,
               r_max=r_max,
               t_max=t_max,
               return_hist_reward=0)

# The linear policy with the same theta that generated the training data.
_ = experiment(env=env,
               policy=policy_random,
               n_episodes=n_episodes,
               theta=theta,
               r_max=r_max,
               t_max=t_max,
               return_hist_reward=0)
Policy:policy_naive_nn, Min reward:40.0, Max reward:107.0, Average reward:65.595
Policy:policy_random, Min reward:41.0, Max reward:122.0, Average reward:65.175

results matching ""

    No results matching ""