#!/usr/bin/env python
'''
Code for running experiments where RL agents interact with an MDP.
Instructions:
(1) Create an MDP.
(2) Create agents.
(3) Set experiment parameters (instances, episodes, steps).
(4) Call run_agents_on_mdp(agents, mdp) (or the lifelong/markov game equivalents).
-> Runs all experiments and opens a plot of the results when finished.
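
Example (a minimal sketch; GridWorldMDP, QLearningAgent, and RandomAgent all ship with
simple_rl, though the constructor arguments shown here are illustrative):

    from simple_rl.agents import QLearningAgent, RandomAgent
    from simple_rl.tasks import GridWorldMDP
    from simple_rl.run_experiments import run_agents_on_mdp

    mdp = GridWorldMDP(width=5, height=5, goal_locs=[(5, 5)])
    agents = [QLearningAgent(actions=mdp.get_actions()), RandomAgent(actions=mdp.get_actions())]
    run_agents_on_mdp(agents, mdp, instances=5, episodes=100, steps=50)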
Author: David Abel (cs.brown.edu/~dabel/)
'''
# Python imports.
from __future__ import print_function
import time
import argparse
import os
import math
import sys
import copy
import numpy as np
from collections import defaultdict
# Non-standard imports.
from simple_rl.planning import ValueIteration
from simple_rl.experiments import Experiment
from simple_rl.mdp import MarkovGameMDP
from simple_rl.utils import chart_utils
from simple_rl.agents import *
from simple_rl.tasks import *
def play_markov_game(agent_ls, markov_game_mdp, instances=10, episodes=100, steps=30, verbose=False, open_plot=True):
'''
Args:
        agent_ls (list of Agents): See agents/AgentClass.py (and friends).
markov_game_mdp (MarkovGameMDP): See mdp/markov_games/MarkovGameMDPClass.py.
instances (int): Number of times to run each agent (for confidence intervals).
episodes (int): Number of episodes for each learning instance.
        steps (int): Number of steps per episode.
verbose (bool)
open_plot (bool): If true opens plot.
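    Example (a minimal sketch; RockPaperScissorsMDP is a simple_rl task, and the
    agent names and arguments shown here are illustrative):
        mdp = RockPaperScissorsMDP()
        ql_agent = QLearningAgent(actions=mdp.get_actions(), name="ql-1")
        rand_agent = RandomAgent(actions=mdp.get_actions())
        play_markov_game([ql_agent, rand_agent], mdp, instances=10, episodes=1, steps=30)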
'''
# Put into dict.
agent_dict = {}
for a in agent_ls:
agent_dict[a.name] = a
# Experiment (for reproducibility, plotting).
exp_params = {"instances":instances, "episodes":episodes, "steps":steps}
experiment = Experiment(agents=agent_dict, mdp=markov_game_mdp, params=exp_params, is_episodic=episodes > 1, is_markov_game=True)
# Record how long each agent spends learning.
print("Running experiment: \n" + str(experiment))
    start = time.time()  # time.clock() was removed in Python 3.8.
# For each instance of the agent.
for instance in range(1, instances + 1):
print("\tInstance " + str(instance) + " of " + str(int(instances)) + ".")
        reward_dict = defaultdict(float)  # Rewards are numeric; default to 0.
action_dict = {}
for episode in range(1, episodes + 1):
if verbose:
sys.stdout.write("\tEpisode %s of %s" % (episode, episodes))
sys.stdout.write("\b" * len("\tEpisode %s of %s" % (episode, episodes)))
sys.stdout.flush()
# Compute initial state/reward.
state = markov_game_mdp.get_init_state()
for step in range(steps):
# Compute each agent's policy.
for a in agent_dict.values():
agent_reward = reward_dict[a.name]
agent_action = a.act(state, agent_reward)
action_dict[a.name] = agent_action
# Terminal check.
if state.is_terminal():
experiment.add_experience(agent_dict, state, action_dict, defaultdict(int), state)
continue
# Execute in MDP.
reward_dict, next_state = markov_game_mdp.execute_agent_action(action_dict)
# Record the experience.
experiment.add_experience(agent_dict, state, action_dict, reward_dict, next_state)
# Update pointer.
state = next_state
# A final update.
for a in agent_dict.values():
agent_reward = reward_dict[a.name]
agent_action = a.act(state, agent_reward)
action_dict[a.name] = agent_action
# Process that learning instance's info at end of learning.
experiment.end_of_episode(a.name)
# Reset the MDP, tell the agent the episode is over.
markov_game_mdp.reset()
# A final update.
for a in agent_dict.values():
# Reset the agent and track experiment info.
experiment.end_of_instance(a.name)
a.reset()
# Time stuff.
print("Experiment took " + str(round(time.clock() - start, 2)) + " seconds.")
experiment.make_plots(open_plot=open_plot)
def run_agents_lifelong(agents,
mdp_distr,
samples=5,
episodes=1,
steps=100,
clear_old_results=True,
open_plot=True,
verbose=False,
track_disc_reward=False,
reset_at_terminal=False,
resample_at_terminal=False,
cumulative_plot=True,
dir_for_plot="results"):
'''
Args:
agents (list)
mdp_distr (MDPDistribution)
samples (int)
episodes (int)
steps (int)
clear_old_results (bool)
open_plot (bool)
verbose (bool)
track_disc_reward (bool): If true records and plots discounted reward, discounted over episodes. So, if
each episode is 100 steps, then episode 2 will start discounting as though it's step 101.
reset_at_terminal (bool)
resample_at_terminal (bool)
cumulative_plot (bool)
dir_for_plot (str)
Summary:
Runs each agent on the MDP distribution according to the given parameters.
If @mdp_distr has a non-zero horizon, then gamma is set to 1 and @steps is ignored.
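    Example (a minimal sketch; MDPDistribution lives in simple_rl.mdp and maps MDPs
    to sampling probabilities; the two grid worlds here are illustrative):
        from simple_rl.mdp import MDPDistribution
        mdp_distr = MDPDistribution({GridWorldMDP(goal_locs=[(5, 3)]): 0.5,
                                     GridWorldMDP(goal_locs=[(1, 3)]): 0.5})
        agent = QLearningAgent(actions=mdp_distr.get_actions())
        run_agents_lifelong([agent], mdp_distr, samples=10, steps=100)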
'''
# Experiment (for reproducibility, plotting).
exp_params = {"samples":samples, "episodes":episodes, "steps":steps}
experiment = Experiment(agents=agents,
mdp=mdp_distr,
params=exp_params,
is_episodic=episodes > 1,
is_lifelong=True,
clear_old_results=clear_old_results,
track_disc_reward=track_disc_reward,
cumulative_plot=cumulative_plot,
dir_for_plot=dir_for_plot)
# Record how long each agent spends learning.
print("Running experiment: \n" + str(experiment))
    start = time.time()
times = defaultdict(float)
# Learn.
for agent in agents:
print(str(agent) + " is learning.")
        start = time.time()
# --- SAMPLE NEW MDP ---
for new_task in range(samples):
print(" Sample " + str(new_task + 1) + " of " + str(samples) + ".")
# Sample the MDP.
mdp = mdp_distr.sample()
# Run the agent.
hit_terminal, total_steps_taken, _ = run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment, verbose, track_disc_reward, reset_at_terminal, resample_at_terminal)
# If we resample at terminal, keep grabbing MDPs until we're done.
while resample_at_terminal and hit_terminal and total_steps_taken < steps:
mdp = mdp_distr.sample()
hit_terminal, steps_taken, _ = run_single_agent_on_mdp(agent, mdp, episodes, steps - total_steps_taken, experiment, verbose, track_disc_reward, reset_at_terminal, resample_at_terminal)
total_steps_taken += steps_taken
# Reset the agent.
agent.reset()
# Track how much time this agent took.
        end = time.time()
times[agent] = round(end - start, 3)
# Time stuff.
print("\n--- TIMES ---")
for agent in times.keys():
print(str(agent) + " agent took " + str(round(times[agent], 2)) + " seconds.")
print("-------------\n")
experiment.make_plots(open_plot=open_plot)
def run_agents_on_mdp(agents,
mdp,
instances=5,
episodes=100,
steps=200,
clear_old_results=True,
rew_step_count=1,
track_disc_reward=False,
open_plot=True,
verbose=False,
reset_at_terminal=False,
cumulative_plot=True,
dir_for_plot="results",
experiment_name_prefix=""):
'''
Args:
agents (list of Agents): See agents/AgentClass.py (and friends).
mdp (MDP): See mdp/MDPClass.py for the abstract class. Specific MDPs in tasks/*.
instances (int): Number of times to run each agent (for confidence intervals).
episodes (int): Number of episodes for each learning instance.
steps (int): Number of steps per episode.
clear_old_results (bool): If true, removes all results files in the relevant results dir.
rew_step_count (int): Number of steps before recording reward.
track_disc_reward (bool): If true, track (and plot) discounted reward.
open_plot (bool): If true opens the plot at the end.
verbose (bool): If true, prints status bars per episode/instance.
reset_at_terminal (bool): If true sends the agent to the start state after terminal.
cumulative_plot (bool): If true makes a cumulative plot, otherwise plots avg. reward per timestep.
        dir_for_plot (str): Path to the directory in which to store results and plots.
        experiment_name_prefix (str): Prepends this to the usual experiment name.
Summary:
Runs each agent on the given mdp according to the given parameters.
Stores results in results/<agent_name>.csv and automatically
generates a plot and opens it.
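    Example (a sketch of a continuing-task setup: one long episode per instance,
    resetting to the start state at terminals and plotting average reward):
        run_agents_on_mdp(agents, mdp, instances=10, episodes=1, steps=1000,
                          reset_at_terminal=True, cumulative_plot=False)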
'''
# Experiment (for reproducibility, plotting).
exp_params = {"instances":instances, "episodes":episodes, "steps":steps}
experiment = Experiment(agents=agents,
mdp=mdp,
params=exp_params,
                            is_episodic=episodes > 1,
clear_old_results=clear_old_results,
track_disc_reward=track_disc_reward,
count_r_per_n_timestep=rew_step_count,
cumulative_plot=cumulative_plot,
dir_for_plot=dir_for_plot,
experiment_name_prefix=experiment_name_prefix)
# Record how long each agent spends learning.
print("Running experiment: \n" + str(experiment))
time_dict = defaultdict(float)
# Learn.
for agent in agents:
print(str(agent) + " is learning.")
        start = time.time()
# For each instance.
for instance in range(1, instances + 1):
print(" Instance " + str(instance) + " of " + str(instances) + ".")
sys.stdout.flush()
run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment, verbose, track_disc_reward, reset_at_terminal=reset_at_terminal)
# Reset the agent.
agent.reset()
mdp.end_of_instance()
# Track how much time this agent took.
        end = time.time()
time_dict[agent] = round(end - start, 3)
print()
# Time stuff.
print("\n--- TIMES ---")
for agent in time_dict.keys():
print(str(agent) + " agent took " + str(round(time_dict[agent], 2)) + " seconds.")
print("-------------\n")
experiment.make_plots(open_plot=open_plot)
def run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment=None, verbose=False, track_disc_reward=False, reset_at_terminal=False, resample_at_terminal=False):
'''
Summary:
Main loop of a single MDP experiment.
Returns:
(tuple): (bool:reached terminal, int: num steps taken, float: cumulative discounted reward)
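    Example (a minimal sketch; running one agent directly, without an Experiment):
        hit_terminal, steps_taken, disc_return = run_single_agent_on_mdp(
            QLearningAgent(actions=mdp.get_actions()), mdp, episodes=1, steps=100)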
'''
if reset_at_terminal and resample_at_terminal:
raise ValueError("(simple_rl) ExperimentError: Can't have reset_at_terminal and resample_at_terminal set to True.")
value = 0
gamma = mdp.get_gamma()
# For each episode.
for episode in range(1, episodes + 1):
if verbose:
# Print episode numbers out nicely.
sys.stdout.write("\tEpisode %s of %s" % (episode, episodes))
sys.stdout.write("\b" * len("\tEpisode %s of %s" % (episode, episodes)))
sys.stdout.flush()
# Compute initial state/reward.
state = mdp.get_init_state()
reward = 0
        episode_start_time = time.time()
# Extra printing if verbose.
if verbose:
print()
sys.stdout.flush()
prog_bar_len = _make_step_progress_bar()
for step in range(1, steps + 1):
if verbose and int(prog_bar_len*float(step) / steps) > int(prog_bar_len*float(step-1) / steps):
_increment_bar()
# step time
            step_start = time.time()
# Compute the agent's policy.
action = agent.act(state, reward)
# Terminal check.
if state.is_terminal():
if episodes == 1 and not reset_at_terminal and experiment is not None and action != "terminate":
# Self loop if we're not episodic or resetting and in a terminal state.
                    experiment.add_experience(agent, state, action, 0, state, time_taken=time.time()-step_start)
continue
break
# Execute in MDP.
reward, next_state = mdp.execute_agent_action(action)
# Track value.
            value += reward * gamma ** (step - 1)  # step is 1-indexed, so the first reward is undiscounted.
# Record the experience.
if experiment is not None:
reward_to_track = mdp.get_gamma()**(step + 1 + episode*steps) * reward if track_disc_reward else reward
reward_to_track = round(reward_to_track, 5)
                experiment.add_experience(agent, state, action, reward_to_track, next_state, time_taken=time.time() - step_start)
if next_state.is_terminal():
if reset_at_terminal:
# Reset the MDP.
next_state = mdp.get_init_state()
mdp.reset()
elif resample_at_terminal and step < steps:
mdp.reset()
return True, step, value
# Update pointer.
state = next_state
# A final update.
action = agent.act(state, reward)
# Process experiment info at end of episode.
if experiment is not None:
experiment.end_of_episode(agent)
# Reset the MDP, tell the agent the episode is over.
mdp.reset()
agent.end_of_episode()
if verbose:
print("\n")
# Process that learning instance's info at end of learning.
if experiment is not None:
experiment.end_of_instance(agent)
return False, steps, value
def run_single_belief_agent_on_pomdp(belief_agent, pomdp, episodes, steps, experiment=None, verbose=False,
track_disc_reward=False, reset_at_terminal=False, resample_at_terminal=False):
    '''
    Args:
        belief_agent:
        pomdp:
        episodes:
        steps:
        experiment:
        verbose:
        track_disc_reward:
        reset_at_terminal:
        resample_at_terminal:
    Summary:
        Placeholder; not yet implemented.
    '''
pass
def _make_step_progress_bar():
'''
Summary:
Prints a step progress bar for experiments.
Returns:
(int): Length of the progress bar (in characters).
'''
progress_bar_width = 20
sys.stdout.write("\t\t[%s]" % (" " * progress_bar_width))
sys.stdout.flush()
sys.stdout.write("\b" * (progress_bar_width+1)) # return to start of line, after '['
return progress_bar_width
def _increment_bar():
sys.stdout.write("-")
sys.stdout.flush()
def evaluate_agent(agent, mdp, instances=10):
'''
Args:
agent (simple_rl.Agent)
mdp (simple_rl.MDP)
instances (int)
Returns:
(float): Avg. cumulative discounted reward.
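    Example (a sketch; comparing a learner against a random baseline on the same MDP):
        q_val = evaluate_agent(QLearningAgent(actions=mdp.get_actions()), mdp, instances=10)
        rand_val = evaluate_agent(RandomAgent(actions=mdp.get_actions()), mdp, instances=10)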
'''
total = 0.0
    # Heuristic horizon: ~10x the 1/(1-gamma) effective horizon (assumes gamma < 1).
    steps = int(1 / (1 - mdp.get_gamma())) * 10
for i in range(instances):
_, _, val = run_single_agent_on_mdp(agent, mdp, episodes=1, steps=steps)
total += val
# Reset the agent.
agent.reset()
mdp.reset()
mdp.end_of_instance()
return total / instances
def reproduce_from_exp_file(exp_name, results_dir_name="results", open_plot=True):
'''
Args:
exp_name (str)
results_dir_name (str)
open_plot (bool)
Summary:
Extracts the agents, MDP, and parameters from the file and runs the experiment.
Stores data in "results_dir_name/exp_name/reproduce_i/*", where "i" is determined
based on the existence of earlier "reproduce" files.
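    Example (a sketch; "my_gridworld" is a hypothetical experiment name whose results
    were previously written under results/my_gridworld):
        reproduce_from_exp_file("my_gridworld", results_dir_name="results", open_plot=True)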
'''
# Get dir and file.
exp_dir = os.path.join(results_dir_name, exp_name)
exp_file = Experiment.FULL_EXP_FILE_NAME
full_exp_file = os.path.join(exp_dir, exp_file)
# Check to make sure the file exists.
if not os.path.exists(full_exp_file):
raise ImportError("(simple_rl): no such experiment: " + str(full_exp_file) + ".")
# Open the file.
exp_file = open(full_exp_file, "r")
# Placeholders.
agents = []
mdp = None
experiment_param_dict = {}
actions = []
experiment_func = None
# Read the file in.
lines = exp_file.readlines()
for i, line in enumerate(lines):
if "OOMDP" in line or "POMDP" in line:
raise TypeError("(simple_rl): reproduction not yet implemented for OOMDPs and POMDPs.")
sys.exit(0)
elif "MDP:" in line:
mdp_class_str = line[line.find("'") + 1 : line.rfind("'")].split(".")[-1]
mdp_param_dict = _get_params_from_lines(lines, start_index=i + 1)
MDPClass = eval(mdp_class_str)
mdp = MDPClass(**mdp_param_dict)
elif "AGENT:" in line:
# Get class and make agent.
agent_class_str = line[line.find("'") + 1 : line.rfind("'")].split(".")[-1]
agent_param_dict = _get_params_from_lines(lines, start_index=i + 1)
AgentClass = eval(agent_class_str)
agent_param_dict["actions"] = mdp.get_actions()
agent = AgentClass(**agent_param_dict)
agents.append(agent)
elif "MISC" in line:
experiment_param_dict = _get_params_from_lines(lines, start_index=i + 1)
elif "FUNC" in line:
            func_name = lines[i + 1].strip()  # The function name sits on the following line.
experiment_func = eval(func_name)
# Prints.
print("\n" + "%"*17)
print("%"*2, "Reproducing", "%"*2)
print("%"*17, "\n")
print("MDP:", "\n " + str(mdp) + "\n")
print("Agents:")
for a in agents:
print(" ", a)
print("\n" + "%"*17)
print("%"*17, "\n")
# Reproduce.
chart_utils.CUSTOM_TITLE = "Reproduce: " + str(mdp)
experiment_func(agents, mdp, dir_for_plot=results_dir_name, experiment_name_prefix="reproduce_", open_plot=open_plot, **experiment_param_dict)
print("\n" + "%"*22)
print("%"*2, "Done Reproducing", "%"*2)
print("%"*22, "\n")
exp_file.close()
def _get_params_from_lines(lines, start_index):
'''
Args:
lines (list)
start_index (int)
Returns:
(dict)
    Summary:
        Parses a block of "name = value = type" lines (e.g., "alpha = 0.05 = <class 'float'>")
        into a parameter dict, stopping at the first blank line.
    '''
from ast import literal_eval as make_tuple
import ast
agent_param_dict = {}
# Get the class.
hit_next_agent = False
i = start_index
while not hit_next_agent:
if len(lines[i]) <= 1:
break
# Grab param name, value, and type.
next_line = [item.strip() for item in lines[i].split("=")]
param_name, param_val, param_type = next_line[0], next_line[1], next_line[2][next_line[2].find("'") + 1 : next_line[2].rfind("'")]
if param_type == "bool":
param_val = bool(param_val == "True")
elif param_type == "tuple":
param_val = make_tuple(param_val)
elif param_type == "list":
param_val = ast.literal_eval(param_val)
else:
param_val = eval(param_type)(param_val)
agent_param_dict[param_name] = param_val
i += 1
return agent_param_dict
def _parse_tuple_string(tup_str):
    '''
    Args:
        tup_str (str)
    Summary:
        Placeholder; not yet implemented.
    '''
    pass
def choose_mdp(mdp_name, env_name="Asteroids-v0"):
'''
Args:
        mdp_name (str): one of {gym, grid, chain, taxi, ...}
        env_name (str): gym environment name, like 'CartPole-v0'
Returns:
(MDP)
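    Example (a sketch):
        grid_mdp = choose_mdp("grid")
        gym_mdp = choose_mdp("gym", env_name="CartPole-v0")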
'''
# Other imports
from simple_rl.tasks import ChainMDP, GridWorldMDP, FourRoomMDP, TaxiOOMDP, RandomMDP, PrisonersDilemmaMDP, RockPaperScissorsMDP, GridGameMDP
# Taxi MDP.
agent = {"x":1, "y":1, "has_passenger":0}
passengers = [{"x":4, "y":3, "dest_x":2, "dest_y":2, "in_taxi":0}]
walls = []
if mdp_name == "gym":
# OpenAI Gym MDP.
try:
from simple_rl.tasks.gym.GymMDPClass import GymMDP
        except ImportError:
            raise ValueError("(simple_rl) Error: OpenAI gym not installed.")
return GymMDP(env_name, render=True)
else:
return {"grid":GridWorldMDP(5, 5, (1, 1), goal_locs=[(5, 3), (4,1)]),
"four_room":FourRoomMDP(),
"chain":ChainMDP(5),
"taxi":TaxiOOMDP(10, 10, slip_prob=0.0, agent=agent, walls=walls, passengers=passengers),
"random":RandomMDP(num_states=40, num_rand_trans=20),
"prison":PrisonersDilemmaMDP(),
"rps":RockPaperScissorsMDP(),
"grid_game":GridGameMDP(),
"multi":{0.5:RandomMDP(num_states=40, num_rand_trans=20), 0.5:RandomMDP(num_states=40, num_rand_trans=5)}}[mdp_name]
def parse_args():
# Add all arguments
parser = argparse.ArgumentParser()
parser.add_argument("-mdp", type = str, nargs = '?', help = "Select the mdp. Options: {atari, grid, chain, taxi}")
parser.add_argument("-env", type = str, nargs = '?', help = "Select the Gym environment.")
args = parser.parse_args()
# Fix variables based on options.
task = args.mdp if args.mdp else "grid"
env_name = args.env if args.env else "CartPole-v0"
return task, env_name
def main():
# Command line args.
task, rom = parse_args()
# Setup the MDP.
mdp = choose_mdp(task, rom)
actions = mdp.get_actions()
gamma = mdp.get_gamma()
# Setup agents.
from simple_rl.agents import RandomAgent, QLearningAgent
random_agent = RandomAgent(actions)
qlearner_agent = QLearningAgent(actions, gamma=gamma, explore="uniform")
agents = [qlearner_agent, random_agent]
# Run Agents.
if isinstance(mdp, MarkovGameMDP):
# Markov Game.
agents = {qlearner_agent.name: qlearner_agent, random_agent.name:random_agent}
play_markov_game(agents, mdp, instances=100, episodes=1, steps=500)
else:
# Regular experiment.
run_agents_on_mdp(agents, mdp, instances=50, episodes=1, steps=2000)
if __name__ == "__main__":
main()