from agent.Base_Agent import Base_Agent as Agent
from world.commons.Draw import Draw
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from scripts.commons.Server import Server
from scripts.commons.Train_Base import Train_Base
from time import sleep
import os, gym
import numpy as np

'''
Objective:
Learn how to fall (simplest example)
----------
- class Fall: implements an OpenAI custom gym
- class Train: implements algorithms to train a new model or test an existing model
'''

class Fall(gym.Env):
    def __init__(self, ip, server_p, monitor_p, r_type, enable_draw) -> None:

        self.robot_type = r_type

        # Args: Server IP, Agent Port, Monitor Port, Uniform No., Robot Type, Team Name, Enable Log, Enable Draw
        self.player = Agent(ip, server_p, monitor_p, 1, self.robot_type, "Gym", True, enable_draw)
        self.step_counter = 0 # to limit episode size

        # State space
        self.no_of_joints = self.player.world.robot.no_of_joints
        self.obs = np.zeros(self.no_of_joints + 1, np.float32) # joints + torso height
        self.observation_space = gym.spaces.Box(low=np.full(len(self.obs),-np.inf,np.float32), high=np.full(len(self.obs),np.inf,np.float32), dtype=np.float32)

        # Action space
        MAX = np.finfo(np.float32).max
        no_of_actions = self.no_of_joints
        self.action_space = gym.spaces.Box(low=np.full(no_of_actions,-MAX,np.float32), high=np.full(no_of_actions,MAX,np.float32), dtype=np.float32)

        # Check if cheats are enabled
        assert np.any(self.player.world.robot.cheat_abs_pos), "Cheats are not enabled! Run_Utils.py -> Server -> Cheats"

    def observe(self):

        r = self.player.world.robot

        for i in range(self.no_of_joints):
            self.obs[i] = r.joints_position[i] / 100 # naive scale normalization

        self.obs[self.no_of_joints] = r.cheat_abs_pos[2] # head.z (alternative: r.loc_head_z)

        return self.obs

    def sync(self):
        ''' Run a single simulation step '''
        r = self.player.world.robot
        self.player.scom.commit_and_send( r.get_command() )
        self.player.scom.receive()

    def reset(self):
        '''
        Reset and stabilize the robot
        Note: for some behaviors it would be better to reduce stabilization or add noise
        '''

        self.step_counter = 0
        r = self.player.world.robot

        for _ in range(25):
            self.player.scom.unofficial_beam((-3,0,0.50),0) # beam player continuously (floating above ground)
            self.player.behavior.execute("Zero")
            self.sync()

        # beam player to ground
        self.player.scom.unofficial_beam((-3,0,r.beam_height),0)
        r.joints_target_speed[0] = 0.01 # move head to trigger physics update (rcssserver3d bug when no joint is moving)
        self.sync()

        # stabilize on ground
        for _ in range(7):
            self.player.behavior.execute("Zero")
            self.sync()

        return self.observe()

    def render(self, mode='human', close=False):
        return

    def close(self):
        Draw.clear_all()
        self.player.terminate()

    def step(self, action):

        r = self.player.world.robot
        r.set_joints_target_position_direct( # commit actions:
            slice(self.no_of_joints), # act on all available joints
            action*10,                # scale actions up to motivate early exploration
            harmonize=False           # there is no point in harmonizing actions if the targets change at every step
        )

        self.sync() # run simulation step
        self.step_counter += 1
        self.observe()

        if self.obs[-1] < 0.15:           # terminal state: the robot has fallen successfully
            return self.obs, 1, True, {}  # Reward: 1 (this reward will motivate a fast reaction if the return is discounted)
        elif self.step_counter > 150:     # terminal state: 3s passed and robot has not fallen (may be stuck)
            return self.obs, 0, True, {}
        else:
            return self.obs, 0, False, {} # Reward: 0


class Train(Train_Base):
    def __init__(self, script) -> None:
        super().__init__(script)

    def train(self, args):

        #--------------------------------------- Learning parameters
        n_envs = min(4, os.cpu_count())
        n_steps_per_env = 128 # RolloutBuffer is of size (n_steps_per_env * n_envs) (*RV: >=2048)
        minibatch_size = 64   # should be a factor of (n_steps_per_env * n_envs)
        total_steps = 50000   # (*RV: >=10M)
        learning_rate = 30e-4 # (*RV: 3e-4)
        # *RV -> Recommended value for more complex environments
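        # For reference (assuming n_envs resolves to 4): 128 steps/env * 4 envs = 512 samples per rollout,
        # which splits evenly into 512 / 64 = 8 minibatches per optimization epoch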
        folder_name = f'Fall_R{self.robot_type}'
        model_path = f'./scripts/gyms/logs/{folder_name}/'

        print("Model path:", model_path)

        #--------------------------------------- Run algorithm
        def init_env(i_env):
            def thunk(): # SubprocVecEnv calls this constructor inside a separate worker process
                return Fall( self.ip, self.server_p + i_env, self.monitor_p_1000 + i_env, self.robot_type, False )
            return thunk

        servers = Server( self.server_p, self.monitor_p_1000, n_envs+1 ) # include 1 extra server for testing

        env = SubprocVecEnv( [init_env(i) for i in range(n_envs)] )
        eval_env = SubprocVecEnv( [init_env(n_envs)] )

        try:
            if "model_file" in args: # retrain
                model = PPO.load( args["model_file"], env=env, n_envs=n_envs, n_steps=n_steps_per_env, batch_size=minibatch_size, learning_rate=learning_rate )
            else: # train new model
                model = PPO( "MlpPolicy", env=env, verbose=1, n_steps=n_steps_per_env, batch_size=minibatch_size, learning_rate=learning_rate )

            model_path = self.learn_model( model, total_steps, model_path, eval_env=eval_env, eval_freq=n_steps_per_env*10, save_freq=n_steps_per_env*20, backup_env_file=__file__ )
        except KeyboardInterrupt:
            sleep(1) # wait for child processes
            print("\nctrl+c pressed, aborting...\n")
            servers.kill()
            return

        env.close()
        eval_env.close()
        servers.kill()

    def test(self, args):

        # Uses different server and monitor ports
        server = Server( self.server_p-1, self.monitor_p, 1 )
        env = Fall( self.ip, self.server_p-1, self.monitor_p, self.robot_type, True )
        model = PPO.load( args["model_file"], env=env )

        try:
            self.export_model( args["model_file"], args["model_file"]+".pkl", False ) # Export to pkl to create custom behavior
            self.test_model( model, env, log_path=args["folder_dir"], model_path=args["folder_dir"] )
        except KeyboardInterrupt:
            print()

        env.close()
        server.kill()


'''
The learning process takes about 5 minutes.
A video with the results can be seen at:
https://imgur.com/a/KvpXS41

State space:
- Composed of all joint positions + torso height
- The number of joint positions is different for robot type 4, so the models are not interchangeable
- For this example, this problem can be avoided by using only the first 22 joints and actuators (see the sketch below)
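  A possible sketch of that workaround (hypothetical, not part of this file): in Fall.__init__,
  cap the number of controlled joints, e.g.

      self.no_of_joints = min(22, self.player.world.robot.no_of_joints)

  so the observation and action spaces keep the same shape for every robot type.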

Reward:
- The reward for falling is 1, which means that after a while every episode will end with r=1.
- What is the incentive for the robot to fall faster? The discounted return:
  in every state, the algorithm will seek short-term rewards (worked example below).
- During training, the best model is saved according to the average return, which is almost always 1.
  Therefore, the last model will typically be superior for this example.
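  Worked example (assuming PPO's default discount factor, gamma = 0.99): a fall that ends the
  episode after 10 steps is worth on the order of 0.99^10 ≈ 0.90 from the initial state, while
  one that takes 100 steps is worth roughly 0.99^100 ≈ 0.37, so faster falls yield higher
  discounted returns even though the raw reward is 1 in both cases.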

Expected evolution of episode length:
  3s|o
    |o
    | o
    |  o
    |   oo
    |     ooooo
0.4s|          oooooooooooooooo
    |------------------------------> time


This example scales poorly with the number of CPUs because:
- It uses a small rollout buffer (n_steps_per_env * n_envs)
- The simulation workload is light
- For these reasons, the IPC overhead is significant
'''