diff --git a/mani_skill/envs/sapien_env.py b/mani_skill/envs/sapien_env.py index e70b888045..3df0ed0d46 100644 --- a/mani_skill/envs/sapien_env.py +++ b/mani_skill/envs/sapien_env.py @@ -39,6 +39,9 @@ from mani_skill.utils.structs import Actor, Articulation from mani_skill.utils.structs.pose import Pose from mani_skill.utils.structs.types import Array, SimConfig +from mani_skill.utils.visualization.viewer_camera_control import ( + ViewerCameraControlPlugin, +) from mani_skill.utils.visualization.misc import tile_images @@ -115,6 +118,9 @@ class BaseEnv(gym.Env): enhanced_determinism (bool): By default this is False and env resets will reset the episode RNG only when a seed / seed list is given. If True, the environment will reset the episode RNG upon each reset regardless of whether a seed is provided. Generally enhanced_determinisim is not needed and users are recommended to pass seeds into the env reset function instead. + + enable_camera_pose_editing (bool): If True, the human viewer exposes an in-window camera editor that lets you click camera frustums + and drag a gizmo to update their poses live. """ # fmt: off @@ -208,8 +214,10 @@ def __init__( render_backend: str = "gpu", parallel_in_single_scene: bool = False, enhanced_determinism: bool = False, + enable_camera_pose_editing: bool = False, ): self._enhanced_determinism = enhanced_determinism + self._enable_camera_pose_editing = enable_camera_pose_editing self.num_envs = num_envs self.reconfiguration_freq = reconfiguration_freq if reconfiguration_freq is not None else 0 @@ -312,6 +320,7 @@ def __init__( # Render mode self.render_mode = render_mode self._viewer = None + self._viewer_camera_control_plugin = None # Lighting self.enable_shadow = enable_shadow @@ -1250,6 +1259,7 @@ def _close_viewer(self): return self._viewer.close() self._viewer = None + self._viewer_camera_control_plugin = None @cached_property def segmentation_id_map(self): @@ -1331,6 +1341,22 @@ def set_state(self, state: Array, env_idx: torch.Tensor = None): def viewer(self): return self._viewer + def _ensure_viewer_camera_control_plugin(self): + if self._viewer is None: + return + existing_plugin = sapien_utils.get_obj_by_type( + self._viewer.plugins, ViewerCameraControlPlugin, is_unique=False + ) + if isinstance(existing_plugin, list): + self._viewer_camera_control_plugin = existing_plugin[0] + elif existing_plugin is not None: + self._viewer_camera_control_plugin = existing_plugin + if self._viewer_camera_control_plugin is None: + self._viewer_camera_control_plugin = ViewerCameraControlPlugin(self) + self._viewer.plugins.append(self._viewer_camera_control_plugin) + self._viewer_camera_control_plugin.init(self._viewer) + self._viewer_camera_control_plugin.notify_scene_change() + def _setup_viewer(self): """Setup the interactive viewer. @@ -1347,6 +1373,8 @@ def _setup_viewer(self): ) control_window.show_joint_axes = False control_window.show_camera_linesets = False + if self._enable_camera_pose_editing: + self._ensure_viewer_camera_control_plugin() if "render_camera" in self._human_render_cameras: self._viewer.set_camera_pose( self._human_render_cameras["render_camera"].camera.global_pose[0].sp diff --git a/mani_skill/examples/demo_random_action.py b/mani_skill/examples/demo_random_action.py index 90d06b0be9..7bc9c8397d 100644 --- a/mani_skill/examples/demo_random_action.py +++ b/mani_skill/examples/demo_random_action.py @@ -49,6 +49,9 @@ class Args: pause: Annotated[bool, tyro.conf.arg(aliases=["-p"])] = False """If using human render mode, auto pauses the simulation upon loading""" + edit_camera_poses: bool = False + """Enable in-viewer camera pose editing when using the human render mode""" + quiet: bool = False """Disable verbose output.""" @@ -83,6 +86,7 @@ def main(args: Args): render_backend=args.render_backend, enable_shadow=True, parallel_in_single_scene=parallel_in_single_scene, + enable_camera_pose_editing=args.edit_camera_poses, ) if args.robot_uids is not None: env_kwargs["robot_uids"] = tuple(args.robot_uids.split(",")) @@ -107,12 +111,19 @@ def main(args: Args): obs, _ = env.reset(seed=args.seed, options=dict(reconfigure=True)) if args.seed is not None and env.action_space is not None: env.action_space.seed(args.seed[0]) + pause_after_viewer_boot = False if args.render_mode == "human": viewer = env.render() - if isinstance(viewer, sapien.utils.Viewer): - viewer.paused = args.pause - env.render() + pause_after_viewer_boot = args.pause and isinstance(viewer, sapien.utils.Viewer) + if args.edit_camera_poses: + print("Camera pose editing enabled: click a camera frustum in the viewer and drag the gizmo in the Camera Editor window.") while True: + if args.render_mode == "human": + viewer = env.render() + if isinstance(viewer, sapien.utils.Viewer) and pause_after_viewer_boot: + viewer.paused = True + pause_after_viewer_boot = False + continue action = env.action_space.sample() if env.action_space is not None else None obs, reward, terminated, truncated, info = env.step(action) if verbose: @@ -120,8 +131,6 @@ def main(args: Args): print("terminated", terminated) print("truncated", truncated) print("info", info) - if args.render_mode == "human": - env.render() if args.render_mode is None or args.render_mode != "human": if (terminated | truncated).any(): break diff --git a/mani_skill/examples/teleoperation/teleop.py b/mani_skill/examples/teleoperation/teleop.py new file mode 100644 index 0000000000..4fc19aa060 --- /dev/null +++ b/mani_skill/examples/teleoperation/teleop.py @@ -0,0 +1,566 @@ +""" +Generic cross-platform teleoperation script. + +Works with any robot that defines `ee_link_name` and `arm_joint_names`. +Uses Pinocchio-based IK (no mplib dependency), so runs on macOS + Linux. + +Usage: + python -m mani_skill.examples.teleoperation.teleop -e PickCube-v1 -r panda + python -m mani_skill.examples.teleoperation.teleop -e PickCube-v1 -r xarm6_robotiq + python -m mani_skill.examples.teleoperation.teleop -e PickCube-v1 -r so100 +""" + +from dataclasses import dataclass, field +from typing import Annotated, Optional + +import platform +import gymnasium as gym +import h5py +import json +import numpy as np +import sapien.core as sapien +import sapien.internal_renderer as R +import sapien.utils.viewer +from sapien.utils.viewer.plugin import Plugin +import torch +import tyro + +_IS_MACOS = platform.system() == "Darwin" +_DEFAULT_SHADER = "default" if _IS_MACOS else "rt-fast" + +from mani_skill import format_path +from mani_skill.agents.controllers.base_controller import CombinedController +from mani_skill.agents.controllers.utils.kinematics import Kinematics +from mani_skill.envs.sapien_env import BaseEnv +from mani_skill.utils import sapien_utils +from mani_skill.utils.structs.pose import Pose as MSPose +from mani_skill.utils.wrappers.record import RecordEpisode +import mani_skill.trajectory.utils as trajectory_utils + + +class TeleopPlugin(Plugin): + """Custom viewer plugin that locks selection to the gizmo link and provides a gripper slider.""" + + def __init__(self): + self._locked_entity = None + self._ui_window = None + self.gripper_openness = 1.0 # 1.0 = fully open, 0.0 = fully closed + self._has_gripper = False + + def set_locked_entity(self, entity): + self._locked_entity = entity + + def set_has_gripper(self, has_gripper): + self._has_gripper = has_gripper + self._ui_window = None # force rebuild + + def notify_selected_entity_change(self): + if self._locked_entity is not None and self.viewer.selected_entity != self._locked_entity: + self.viewer.select_entity(self._locked_entity) + + def _build_ui(self): + children = [R.UIDisplayText().Text("Teleop Controls")] + if self._has_gripper: + children.append( + R.UISliderFloat() + .Label("Gripper Opening") + .Min(0.0) + .Max(1.0) + .Bind(self, "gripper_openness") + ) + children.append(R.UIDisplayText().Text("Keys: h=help n=move g=gripper r=reset c=next q=quit")) + self._ui_window = R.UIWindow().Label("Teleop").Pos(10, 10).Size(300, 120) + for child in children: + self._ui_window.append(child) + + def get_ui_windows(self): + if self._ui_window is None: + self._build_ui() + return [self._ui_window] + + +@dataclass +class Args: + env_id: Annotated[str, tyro.conf.arg(aliases=["-e"])] = "PickCube-v1" + robot_uid: Annotated[str, tyro.conf.arg(aliases=["-r"])] = "panda" + """The robot to use. Any robot with ee_link_name and arm_joint_names is supported.""" + ee_link: Optional[str] = None + """Override the end-effector link name. If not set, uses agent.ee_link_name.""" + obs_mode: str = "none" + record_dir: str = "demos" + """Directory to record demonstration data and optionally videos.""" + save_video: bool = False + """Whether to save videos of the demonstrations after collecting them.""" + viewer_shader: str = _DEFAULT_SHADER + """Shader for the viewer. 'default' is fast, 'rt' and 'rt-fast' are ray tracing. Defaults to 'default' on macOS.""" + video_saving_shader: str = _DEFAULT_SHADER + """Shader for saved videos. 'default' is fast, 'rt' and 'rt-fast' are ray tracing. Defaults to 'default' on macOS.""" + step_size: float = 0.01 + """Translation step size for keyboard nudge controls.""" + + +def parse_args() -> Args: + return tyro.cli(Args) + + +def _get_robot_info(env, ee_link_override=None): + """Extract robot introspection data from the environment agent.""" + agent = env.unwrapped.agent + + # EE link name + ee_link_name = ee_link_override + if ee_link_name is None: + if hasattr(agent, "ee_link_name") and agent.ee_link_name: + ee_link_name = agent.ee_link_name + else: + link_names = [l.name for l in agent.robot.links] + print("ERROR: Robot does not define ee_link_name. Use --ee-link to specify one.") + print(f"Available links: {link_names}") + raise SystemExit(1) + + # Arm joint names + if not hasattr(agent, "arm_joint_names") or not agent.arm_joint_names: + print("ERROR: Robot does not define arm_joint_names. This script requires it.") + raise SystemExit(1) + arm_joint_names = agent.arm_joint_names + + # Arm joint indices (indices into the active joints array) + arm_joint_indices = [] + for name in arm_joint_names: + joint = agent.robot.active_joints_map[name] + arm_joint_indices.append(joint.active_index[0].item()) + + # Gripper detection + has_gripper = ( + hasattr(agent, "gripper_joint_names") + and agent.gripper_joint_names + and len(agent.gripper_joint_names) > 0 + ) + gripper_joint_names = agent.gripper_joint_names if has_gripper else [] + + # Gripper open/close values from joint limits + gripper_open_values = [] + gripper_close_values = [] + if has_gripper: + qlimits = agent.robot.get_qlimits()[0] # (num_joints, 2), take first env + for name in gripper_joint_names: + joint = agent.robot.active_joints_map[name] + idx = joint.active_index[0].item() + lower = qlimits[idx, 0].item() + upper = qlimits[idx, 1].item() + gripper_open_values.append(upper) + gripper_close_values.append(lower) + + # Action layout: CombinedController vs flat + controller = agent.controller + is_combined = isinstance(controller, CombinedController) + action_mapping = controller.action_mapping if is_combined else None + + # URDF path + urdf_path = format_path(str(agent.urdf_path)) + + return dict( + ee_link_name=ee_link_name, + arm_joint_names=arm_joint_names, + arm_joint_indices=arm_joint_indices, + has_gripper=has_gripper, + gripper_joint_names=gripper_joint_names, + gripper_open_values=gripper_open_values, + gripper_close_values=gripper_close_values, + is_combined=is_combined, + action_mapping=action_mapping, + urdf_path=urdf_path, + ) + + +def _find_gizmo_link(agent, ee_link_name): + """Find a visible link for the gizmo, walking up kinematic chain if needed. + + Returns (gizmo_link, ee_in_gizmo_offset) where ee_in_gizmo_offset is the + static transform from the gizmo link frame to the EE link frame. + """ + ee_link = sapien_utils.get_obj_by_name(agent.robot.links, ee_link_name) + + def _has_visual(link): + """Check if a link has visual geometry.""" + try: + entity = link._objs[0].entity + for comp in entity.components: + # Check for render body component (visual geometry) + if hasattr(comp, "render_shapes") or "RenderBody" in type(comp).__name__: + return True + except (IndexError, AttributeError): + pass + return False + + # Walk up the kinematic chain to find a link with visuals + gizmo_link = ee_link + while gizmo_link is not None and not _has_visual(gizmo_link): + parent = gizmo_link.joint.parent_link + if parent is None: + break + gizmo_link = parent + + # If we couldn't find any visual link, just use the EE link + if gizmo_link is None: + gizmo_link = ee_link + + # Compute static offset: ee pose expressed in gizmo link frame + # Convert ManiSkill batched Pose to a sapien.Pose for use with the viewer gizmo + ms_offset = gizmo_link.pose.inv() * ee_link.pose + p = ms_offset.p[0].cpu().numpy() + q = ms_offset.q[0].cpu().numpy() + ee_in_gizmo = sapien.Pose(p=p, q=q) + + return gizmo_link, ee_in_gizmo + + +def _build_action(env, robot_info, arm_qpos, gripper_values): + """Construct a full action vector for env.step(). + + Args: + env: the gym environment + robot_info: dict from _get_robot_info + arm_qpos: numpy array of target arm joint positions + gripper_values: list of gripper joint target values (or empty if no gripper) + """ + action_dim = env.action_space.shape[0] + action = np.zeros(action_dim, dtype=np.float32) + + if robot_info["is_combined"]: + mapping = robot_info["action_mapping"] + # Fill arm action + if "arm" in mapping: + start, end = mapping["arm"] + action[start:end] = arm_qpos + # Fill gripper action(s) + if robot_info["has_gripper"]: + gripper_idx = 0 + for uid, (start, end) in mapping.items(): + if "gripper" in uid.lower() and "passive" not in uid.lower(): + size = end - start + action[start:end] = gripper_values[gripper_idx:gripper_idx + size] + gripper_idx += size + else: + # Flat controller: arm joints first, then gripper + n_arm = len(robot_info["arm_joint_names"]) + action[:n_arm] = arm_qpos + if robot_info["has_gripper"] and len(gripper_values) > 0: + action[n_arm:n_arm + len(gripper_values)] = gripper_values + + return action + + +def _compute_gripper_values(robot_info, openness): + """Compute gripper joint values from a 0-1 openness fraction.""" + open_vals = robot_info["gripper_open_values"] + close_vals = robot_info["gripper_close_values"] + return [c + openness * (o - c) for o, c in zip(open_vals, close_vals)] + + +def _snap_gizmo_to_current(agent, gizmo_link, transform_window): + """Snap the gizmo to the current gizmo link pose (e.g. after reset).""" + p = gizmo_link.pose.p[0].cpu().numpy() + q = gizmo_link.pose.q[0].cpu().numpy() + current_pose = sapien.Pose(p=p, q=q) + transform_window.gizmo_matrix = current_pose.to_transformation_matrix() + transform_window.update_ghost_objects() + + +def solve(env: BaseEnv, robot_info: dict, step_size: float, teleop_plugin: TeleopPlugin): + """Interactive teleoperation loop. + + Returns "quit" or "continue" to signal the outer loop. + """ + base_env = env.unwrapped + agent = base_env.agent + ee_link_name = robot_info["ee_link_name"] + + # Setup IK solver + kinematics = Kinematics( + urdf_path=robot_info["urdf_path"], + end_link_name=ee_link_name, + articulation=agent.robot, + active_joint_indices=torch.tensor(robot_info["arm_joint_indices"]), + ) + + viewer = base_env.render_human() + + # Find gizmo link and compute offset + gizmo_link, ee_in_gizmo = _find_gizmo_link(agent, ee_link_name) + gizmo_entity = gizmo_link._objs[0].entity + + # Lock selection to the gizmo entity — prevents accidentally clicking joints + teleop_plugin.set_locked_entity(gizmo_entity) + teleop_plugin.set_has_gripper(robot_info["has_gripper"]) + teleop_plugin.gripper_openness = 1.0 # start fully open + viewer.select_entity(gizmo_entity) + + # Find transform window plugin + transform_window = None + for plugin in viewer.plugins: + if isinstance(plugin, sapien.utils.viewer.viewer.TransformWindow): + transform_window = plugin + break + + # Store initial qpos for reset + initial_qpos = agent.robot.get_qpos()[0].cpu().numpy().copy() + + # Track previous gripper openness to detect slider changes + prev_gripper_openness = teleop_plugin.gripper_openness + + while True: + transform_window.enabled = True + base_env.render_human() + execute_current_pose = False + + # --- Gripper slider: detect changes and actuate --- + if robot_info["has_gripper"]: + cur_openness = teleop_plugin.gripper_openness + if abs(cur_openness - prev_gripper_openness) > 1e-4: + gripper_vals = _compute_gripper_values(robot_info, cur_openness) + qpos = agent.robot.get_qpos()[0].cpu().numpy() + arm_qpos = np.array([qpos[i] for i in robot_info["arm_joint_indices"]]) + for _ in range(4): + action = _build_action(env, robot_info, arm_qpos, gripper_vals) + env.step(action) + base_env.render_human() + prev_gripper_openness = cur_openness + + if viewer.window.key_press("h"): + print("""Available commands: + h: print this help menu + u: move EE gizmo up + j: move EE gizmo down + arrow keys: move EE gizmo in X/Y directions + n: execute IK motion to move robot to gizmo target pose + g: toggle gripper open/close (if robot has one) + r: reset robot to initial pose + c: save episode and start a new one + q: save and quit + Gripper slider in the Teleop panel for fine control.""") + elif viewer.window.key_press("q"): + return "quit" + elif viewer.window.key_press("c"): + return "continue" + elif viewer.window.key_press("n"): + execute_current_pose = True + elif viewer.window.key_press("r"): + # Reset robot to initial pose + arm_qpos = np.array([initial_qpos[i] for i in robot_info["arm_joint_indices"]]) + if robot_info["has_gripper"]: + gripper_vals = _compute_gripper_values(robot_info, 1.0) + teleop_plugin.gripper_openness = 1.0 + prev_gripper_openness = 1.0 + else: + gripper_vals = [] + for _ in range(20): + action = _build_action(env, robot_info, arm_qpos, gripper_vals) + env.step(action) + base_env.render_human() + # Snap gizmo to new EE position + _snap_gizmo_to_current(agent, gizmo_link, transform_window) + print("Reset to initial pose") + elif viewer.window.key_press("g") and robot_info["has_gripper"]: + # Toggle gripper via key (snap to fully open or fully closed) + if teleop_plugin.gripper_openness > 0.5: + teleop_plugin.gripper_openness = 0.0 + else: + teleop_plugin.gripper_openness = 1.0 + gripper_vals = _compute_gripper_values(robot_info, teleop_plugin.gripper_openness) + prev_gripper_openness = teleop_plugin.gripper_openness + + qpos = agent.robot.get_qpos()[0].cpu().numpy() + arm_qpos = np.array([qpos[i] for i in robot_info["arm_joint_indices"]]) + for _ in range(6): + action = _build_action(env, robot_info, arm_qpos, gripper_vals) + env.step(action) + base_env.render_human() + print(f"Gripper {'opened' if teleop_plugin.gripper_openness > 0.5 else 'closed'}") + elif viewer.window.key_press("u"): + transform_window.gizmo_matrix = ( + transform_window._gizmo_pose * sapien.Pose(p=[0, 0, -step_size]) + ).to_transformation_matrix() + transform_window.update_ghost_objects() + elif viewer.window.key_press("j"): + transform_window.gizmo_matrix = ( + transform_window._gizmo_pose * sapien.Pose(p=[0, 0, +step_size]) + ).to_transformation_matrix() + transform_window.update_ghost_objects() + elif viewer.window.key_press("down"): + transform_window.gizmo_matrix = ( + transform_window._gizmo_pose * sapien.Pose(p=[+step_size, 0, 0]) + ).to_transformation_matrix() + transform_window.update_ghost_objects() + elif viewer.window.key_press("up"): + transform_window.gizmo_matrix = ( + transform_window._gizmo_pose * sapien.Pose(p=[-step_size, 0, 0]) + ).to_transformation_matrix() + transform_window.update_ghost_objects() + elif viewer.window.key_press("right"): + transform_window.gizmo_matrix = ( + transform_window._gizmo_pose * sapien.Pose(p=[0, -step_size, 0]) + ).to_transformation_matrix() + transform_window.update_ghost_objects() + elif viewer.window.key_press("left"): + transform_window.gizmo_matrix = ( + transform_window._gizmo_pose * sapien.Pose(p=[0, +step_size, 0]) + ).to_transformation_matrix() + transform_window.update_ghost_objects() + + if execute_current_pose: + # Compute target EE pose from gizmo pose + offset (world frame) + gizmo_pose = transform_window._gizmo_pose + target_ee_pose_world = gizmo_pose * ee_in_gizmo + + # Transform from world frame to robot base frame for Pinocchio IK + robot_pose = agent.robot.pose + robot_pose_sp = sapien.Pose( + p=robot_pose.p[0].cpu().numpy(), + q=robot_pose.q[0].cpu().numpy(), + ) + target_ee_pose_base = robot_pose_sp.inv() * target_ee_pose_world + + # Run IK + q0 = agent.robot.get_qpos() + target_pose_ms = MSPose.create_from_pq( + p=torch.tensor([[target_ee_pose_base.p[0], target_ee_pose_base.p[1], target_ee_pose_base.p[2]]], dtype=torch.float32), + q=torch.tensor([[target_ee_pose_base.q[0], target_ee_pose_base.q[1], target_ee_pose_base.q[2], target_ee_pose_base.q[3]]], dtype=torch.float32), + ) + ik_result = kinematics.compute_ik( + pose=target_pose_ms, + q0=q0, + ) + + if ik_result is None: + print("IK failed — try a closer target") + else: + target_arm_qpos = ik_result[0].cpu().numpy() + + # Get current arm qpos + current_qpos = q0[0].cpu().numpy() + current_arm_qpos = np.array([current_qpos[i] for i in robot_info["arm_joint_indices"]]) + + # Current gripper from slider + if robot_info["has_gripper"]: + gripper_vals = _compute_gripper_values(robot_info, teleop_plugin.gripper_openness) + else: + gripper_vals = [] + + # Interpolate from current to target + max_delta = np.max(np.abs(target_arm_qpos - current_arm_qpos)) + n_steps = max(1, min(100, int(max_delta / 0.02))) + + for step_i in range(1, n_steps + 1): + t = step_i / n_steps + interp_arm = current_arm_qpos + t * (target_arm_qpos - current_arm_qpos) + action = _build_action(env, robot_info, interp_arm, gripper_vals) + env.step(action) + base_env.render_human() + + print(f"Moved to target in {n_steps} steps") + + execute_current_pose = False + + +def main(args: Args): + output_dir = f"{args.record_dir}/{args.env_id}/teleop/" + env = gym.make( + args.env_id, + obs_mode=args.obs_mode, + control_mode="pd_joint_pos", + render_mode="rgb_array", + reward_mode="none", + robot_uids=args.robot_uid, + enable_shadow=True, + viewer_camera_configs=dict(shader_pack=args.viewer_shader), + ) + env = RecordEpisode( + env, + output_dir=output_dir, + trajectory_name="trajectory", + save_video=False, + info_on_video=False, + source_type="teleoperation", + source_desc="teleoperation via the click+drag system", + ) + + seed = 0 + num_trajs = 0 + env.reset(seed=seed) + + # Extract robot info once after first reset + robot_info = _get_robot_info(env, ee_link_override=args.ee_link) + print(f"Robot: {args.robot_uid}") + print(f"EE link: {robot_info['ee_link_name']}") + print(f"Arm joints: {robot_info['arm_joint_names']}") + print(f"Has gripper: {robot_info['has_gripper']}") + if robot_info["is_combined"]: + print(f"Action mapping: {robot_info['action_mapping']}") + print("Press 'h' in the viewer for controls help.\n") + + # Create teleop plugin (will be registered with viewer on first render) + teleop_plugin = TeleopPlugin() + + # Do a first render to get the viewer, then register the plugin + base_env = env.unwrapped + viewer = base_env.render_human() + teleop_plugin.init(viewer) + viewer.plugins.append(teleop_plugin) + + while True: + print(f"Collecting trajectory {num_trajs + 1}, seed={seed}") + code = solve(env, robot_info, step_size=args.step_size, teleop_plugin=teleop_plugin) + if code == "quit": + num_trajs += 1 + break + elif code == "continue": + seed += 1 + num_trajs += 1 + env.reset(seed=seed) + continue + + h5_file_path = env._h5_file.filename + json_file_path = env._json_path + env.close() + del env + print(f"Trajectories saved to {h5_file_path}") + + if args.save_video: + print(f"Saving videos to {output_dir}") + trajectory_data = h5py.File(h5_file_path) + with open(json_file_path, "r") as f: + json_data = json.load(f) + env = gym.make( + args.env_id, + obs_mode=args.obs_mode, + control_mode="pd_joint_pos", + render_mode="rgb_array", + reward_mode="none", + robot_uids=args.robot_uid, + human_render_camera_configs=dict(shader_pack=args.video_saving_shader), + ) + env = RecordEpisode( + env, + output_dir=output_dir, + trajectory_name="trajectory", + save_video=True, + info_on_video=False, + save_trajectory=False, + video_fps=30, + ) + for episode in json_data["episodes"]: + traj_id = f"traj_{episode['episode_id']}" + data = trajectory_data[traj_id] + env.reset(**episode["reset_kwargs"]) + env_states_list = trajectory_utils.dict_to_list_of_dicts(data["env_states"]) + env.base_env.set_state_dict(env_states_list[0]) + for action in np.array(data["actions"]): + env.step(action) + trajectory_data.close() + env.close() + del env + + +if __name__ == "__main__": + main(parse_args()) diff --git a/mani_skill/utils/visualization/viewer_camera_control.py b/mani_skill/utils/visualization/viewer_camera_control.py new file mode 100644 index 0000000000..34e839f3f0 --- /dev/null +++ b/mani_skill/utils/visualization/viewer_camera_control.py @@ -0,0 +1,669 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Optional + +import numpy as np +import sapien +from sapien import internal_renderer as R +from sapien.utils.viewer.plugin import Plugin, copy_to_clipboard +from transforms3d.quaternions import mat2quat + +from mani_skill.sensors.camera import Camera +from mani_skill.utils import sapien_utils +from mani_skill.utils.structs.pose import Pose + +if TYPE_CHECKING: + from mani_skill.envs.sapien_env import BaseEnv + + +_VIEWER_GL_POSE = sapien.Pose([0, 0, 0], [-0.5, -0.5, 0.5, 0.5]) +_CAMERA_LINESET_VERTICES = [ + 0, + 0, + 0, + 1, + 1, + -1, + 0, + 0, + 0, + -1, + 1, + -1, + 0, + 0, + 0, + 1, + -1, + -1, + 0, + 0, + 0, + -1, + -1, + -1, + 1, + 1, + -1, + 1, + -1, + -1, + 1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + 1, + -1, + -1, + 1, + -1, + 1, + 1, + -1, + 1, + 1.2, + -1, + 0, + 2, + -1, + 0, + 2, + -1, + -1, + 1.2, + -1, + -1, + 1.2, + -1, + 1, + 1.2, + -1, +] +_INACTIVE_CAMERA_LINESET_COLORS = [0.9254901961, 0.5764705882, 0.1882352941, 1.0] * 22 +_ACTIVE_CAMERA_LINESET_COLORS = [0.1882352941, 0.7921568627, 0.3882352941, 1.0] * 22 + + +class ViewerCameraControlPlugin(Plugin): + """Edit ManiSkill cameras directly from the SAPIEN viewer.""" + + def __init__( + self, + env: "BaseEnv", + enabled: bool = True, + selection_radius_px: float = 18.0, + ): + self.env = env + self.enabled = enabled + self.selection_radius_px = selection_radius_px + self._active_camera_name: Optional[str] = None + self._line_sets = [] + self._line_set_names: list[str] = [] + self._line_set_active_name: Optional[str] = None + self._line_set_model = None + self._active_line_set_model = None + self._ui_window = None + self._gizmo = None + self._target_gizmo = None + self._camera_selector = None + self._camera_targets: dict[str, np.ndarray] = {} + self._camera_ups: dict[str, np.ndarray] = {} + self._initial_local_poses: dict[str, sapien.Pose] = {} + self._initial_targets: dict[str, np.ndarray] = {} + self._initial_ups: dict[str, np.ndarray] = {} + self.default_target_distance = 0.25 + + def init(self, viewer): + super().init(viewer) + if hasattr(self.viewer, "register_click_handler"): + self.viewer.register_click_handler(self._handle_click) + self._create_visual_models() + + def notify_scene_change(self): + self._clear_camera_linesets() + self._ui_window = None + self._camera_targets = {} + self._camera_ups = {} + self._initial_local_poses = {} + self._initial_targets = {} + self._initial_ups = {} + if self._active_camera_name not in self.camera_names: + self._active_camera_name = ( + self.camera_names[0] if self.camera_names else None + ) + self._capture_initial_camera_state() + + def clear_scene(self): + self._clear_camera_linesets() + + def close(self): + self._clear_camera_linesets() + self._line_set_model = None + self._active_line_set_model = None + self._ui_window = None + self._gizmo = None + self._target_gizmo = None + self._camera_selector = None + self._camera_targets = {} + self._camera_ups = {} + self._initial_local_poses = {} + self._initial_targets = {} + self._initial_ups = {} + + @property + def editable_cameras(self) -> dict[str, Camera]: + cameras = {} + for uid, sensor in self.env._sensors.items(): + if isinstance(sensor, Camera): + cameras[f"sensor:{uid}"] = sensor + return cameras + + @property + def camera_names(self) -> list[str]: + return list(self.editable_cameras.keys()) + + @property + def camera_items(self) -> list[str]: + return ["None"] + self.camera_names + + @property + def active_camera_name(self) -> Optional[str]: + if self._active_camera_name not in self.editable_cameras: + self._active_camera_name = ( + self.camera_names[0] if self.camera_names else None + ) + return self._active_camera_name + + @active_camera_name.setter + def active_camera_name(self, value: Optional[str]): + self._active_camera_name = value if value in self.editable_cameras else None + if self._active_camera_name is not None: + self._ensure_look_at_state(self._active_camera_name) + self.viewer.notify_render_update() + + @property + def active_camera_index(self) -> int: + active_camera_name = self.active_camera_name + if active_camera_name is None: + return 0 + return self.camera_names.index(active_camera_name) + 1 + + @active_camera_index.setter + def active_camera_index(self, index: int): + if index <= 0 or index > len(self.camera_names): + self.active_camera_name = None + else: + self.active_camera_name = self.camera_names[index - 1] + + @property + def active_camera(self) -> Optional[Camera]: + active_camera_name = self.active_camera_name + if active_camera_name is None: + return None + return self.editable_cameras[active_camera_name] + + @property + def gizmo_matrix(self): + if self.active_camera_name is None: + return np.eye(4) + self._ensure_look_at_state(self.active_camera_name) + matrix = np.eye(4, dtype=np.float32) + matrix[:3, 3] = self.active_eye_position + return matrix + + @gizmo_matrix.setter + def gizmo_matrix(self, matrix): + if self.active_camera_name is None: + return + self.active_eye_position = np.asarray(matrix[:3, 3], dtype=np.float32) + + @property + def target_gizmo_matrix(self): + matrix = np.eye(4, dtype=np.float32) + matrix[:3, 3] = self.active_target_position + return matrix + + @target_gizmo_matrix.setter + def target_gizmo_matrix(self, matrix): + self.active_target_position = np.asarray(matrix[:3, 3], dtype=np.float32) + + @property + def active_eye_position(self): + active_camera = self.active_camera + if active_camera is None: + return np.zeros(3, dtype=np.float32) + pose = self._get_camera_global_pose(active_camera) + return np.asarray(pose.p, dtype=np.float32) + + @active_eye_position.setter + def active_eye_position(self, value): + if self.active_camera_name is None: + return + self._apply_look_at(self.active_camera_name, eye=np.asarray(value, dtype=np.float32)) + + @property + def active_target_position(self): + if self.active_camera_name is None: + return np.zeros(3, dtype=np.float32) + self._ensure_look_at_state(self.active_camera_name) + return self._camera_targets[self.active_camera_name].copy() + + @active_target_position.setter + def active_target_position(self, value): + if self.active_camera_name is None: + return + self._apply_look_at( + self.active_camera_name, target=np.asarray(value, dtype=np.float32) + ) + + @property + def active_up_vector(self): + if self.active_camera_name is None: + return np.array([0, 0, 1], dtype=np.float32) + self._ensure_look_at_state(self.active_camera_name) + return self._camera_ups[self.active_camera_name].copy() + + @active_up_vector.setter + def active_up_vector(self, value): + if self.active_camera_name is None: + return + self._apply_look_at( + self.active_camera_name, up=np.asarray(value, dtype=np.float32) + ) + + def get_ui_windows(self): + self._build_ui() + if self._ui_window is None: + return [] + return [self._ui_window] + + def after_render(self): + if self.viewer.scene is None: + return + self._update_camera_linesets() + + def look_through_active_camera(self, _=None): + active_camera = self.active_camera + if active_camera is None: + return + self.viewer.set_camera_pose(self._get_camera_global_pose(active_camera)) + + def set_active_camera_from_view(self, _=None): + active_camera = self.active_camera + if active_camera is None: + return + self._set_camera_global_pose( + active_camera, self.viewer.window.get_camera_pose() + ) + if self.active_camera_name is not None: + self._sync_look_at_state_from_camera( + self.active_camera_name, preserve_distance=True + ) + self.viewer.notify_render_update() + + def copy_active_camera_config(self, _=None): + active_camera = self.active_camera + active_camera_name = self.active_camera_name + if active_camera is None or active_camera_name is None: + return + + self._ensure_look_at_state(active_camera_name) + eye = self._format_vector(self.active_eye_position) + target = self._format_vector(self.active_target_position) + up = self._format_vector(self.active_up_vector) + camera_uid = active_camera_name.split(":", 1)[1] + + config_lines = [ + f'{camera_uid}_pose = sapien_utils.look_at(eye={eye}, target={target}, up={up})', + ] + if active_camera.config.fov is not None: + config_lines.append( + 'CameraConfig(' + f'"{camera_uid}", {camera_uid}_pose, {active_camera.config.width}, ' + f'{active_camera.config.height}, {active_camera.config.fov!r}, ' + f'{active_camera.config.near!r}, {active_camera.config.far!r}' + ')' + ) + else: + intrinsic = np.asarray(active_camera.config.intrinsic).tolist() + config_lines.append( + 'CameraConfig(' + f'"{camera_uid}", pose={camera_uid}_pose, width={active_camera.config.width}, ' + f'height={active_camera.config.height}, intrinsic={intrinsic}, ' + f'near={active_camera.config.near!r}, far={active_camera.config.far!r}' + ')' + ) + copy_to_clipboard("\n".join(config_lines)) + + def reset_camera_configs(self, _=None): + for camera_name, local_pose in self._initial_local_poses.items(): + camera = self.editable_cameras.get(camera_name) + if camera is None: + continue + camera.camera.set_local_pose(local_pose) + camera.config.pose = Pose.create(local_pose) + + self._camera_targets = { + camera_name: value.copy() + for camera_name, value in self._initial_targets.items() + } + self._camera_ups = { + camera_name: value.copy() for camera_name, value in self._initial_ups.items() + } + self._active_camera_name = None + self.viewer.select_entity(None) + self._ui_window = None + self._gizmo = None + self._target_gizmo = None + self.viewer.notify_render_update() + + def _build_ui(self): + if self.viewer.scene is None or not self.camera_names: + self._ui_window = None + return + + if self._ui_window is None: + self._gizmo = R.UIGizmo().Bind(self, "gizmo_matrix") + self._target_gizmo = R.UIGizmo().Bind(self, "target_gizmo_matrix") + self._camera_selector = ( + R.UIOptions() + .Label("Camera") + .Style("select") + .BindItems(self, "camera_items") + .BindIndex(self, "active_camera_index") + ) + + self._ui_window = ( + R.UIWindow() + .Label("Camera Editor") + .Pos(10, 420) + .Size(420, 430) + .append( + R.UICheckbox().Label("Enabled").Bind(self, "enabled"), + self._camera_selector, + R.UIDisplayText().Text( + "Click a camera frustum, then drag the pose gizmo to move it." + ), + R.UISameLine().append( + R.UIButton() + .Label("Look Through") + .Callback(self.look_through_active_camera), + R.UIButton() + .Label("Set From View") + .Callback(self.set_active_camera_from_view), + R.UIButton() + .Label("Reset") + .Callback(self.reset_camera_configs), + R.UIButton() + .Label("Copy Config") + .Callback(self.copy_active_camera_config), + ), + R.UIConditional() + .Bind(lambda: self.enabled and self.active_camera is not None) + .append( + R.UISection() + .Label("Position") + .Expanded(True) + .append( + R.UIDisplayText().Text( + "Move the camera position here. Orientation stays driven by Look At." + ), + self._gizmo, + ), + R.UISection() + .Label("Look At") + .Expanded(True) + .append( + R.UIDisplayText().Text( + "Edit eye/target/up directly, or drag the target gizmo." + ), + R.UIInputFloat3() + .Label("Eye") + .Bind(self, "active_eye_position"), + R.UIInputFloat3() + .Label("Target") + .Bind(self, "active_target_position"), + R.UIInputFloat3() + .Label("Up") + .Bind(self, "active_up_vector"), + self._target_gizmo, + ), + ), + ) + ) + + projection = self.viewer.window.get_camera_projection_matrix() + view = ( + (self.viewer.window.get_camera_pose() * _VIEWER_GL_POSE) + .inv() + .to_transformation_matrix() + ) + self._gizmo.CameraMatrices(view, projection) + self._gizmo.Matrix(self.gizmo_matrix) + self._target_gizmo.CameraMatrices(view, projection) + self._target_gizmo.Matrix(self.target_gizmo_matrix) + + def _create_visual_models(self): + self._line_set_model = self.viewer.renderer_context.create_line_set( + _CAMERA_LINESET_VERTICES, + _INACTIVE_CAMERA_LINESET_COLORS, + ) + self._active_line_set_model = self.viewer.renderer_context.create_line_set( + _CAMERA_LINESET_VERTICES, + _ACTIVE_CAMERA_LINESET_COLORS, + ) + + def _clear_camera_linesets(self): + if self.viewer.render_scene is None: + self._line_sets = [] + self._line_set_names = [] + self._line_set_active_name = None + return + for node in self._line_sets: + self.viewer.render_scene.remove_node(node) + self._line_sets = [] + self._line_set_names = [] + self._line_set_active_name = None + + def _update_camera_linesets(self): + if not self.enabled or self.viewer.render_scene is None: + self._clear_camera_linesets() + return + + camera_names = self.camera_names + if ( + len(self._line_sets) != len(camera_names) + or self._line_set_names != camera_names + or self._line_set_active_name != self.active_camera_name + ): + self._clear_camera_linesets() + for camera_name in camera_names: + line_set_model = ( + self._active_line_set_model + if camera_name == self.active_camera_name + else self._line_set_model + ) + self._line_sets.append( + self.viewer.render_scene.add_line_set(line_set_model) + ) + self._line_set_names = list(camera_names) + self._line_set_active_name = self.active_camera_name + + for line_set, camera_name in zip(self._line_sets, camera_names): + camera = self.editable_cameras[camera_name] + model_matrix = self._get_camera_model_matrix(camera) + line_set.set_position(model_matrix[:3, 3]) + line_set.set_rotation(mat2quat(model_matrix[:3, :3])) + line_set.set_scale( + np.array( + [ + np.tan(camera.camera.fovx / 2), + np.tan(camera.camera.fovy / 2), + 1.0, + ] + ) + * 0.3 + ) + + def _handle_click(self, _viewer, x: int, y: int) -> bool: + if not self.enabled: + return False + + click_target = np.array([x, y], dtype=np.float32) + best_match = None + for camera_name, camera in self.editable_cameras.items(): + projected = self._project_camera_origin(camera) + if projected is None: + continue + distance = np.linalg.norm(projected[:2] - click_target) + if distance > self.selection_radius_px: + continue + candidate = (distance, projected[2], camera_name) + if best_match is None or candidate < best_match: + best_match = candidate + + if best_match is None: + return False + + self.active_camera_name = best_match[2] + self.viewer.select_entity(None) + return True + + def _project_camera_origin(self, camera: Camera): + segmentation_width, segmentation_height = self.viewer.window.get_picture_size( + "Segmentation" + ) + pose = self._get_camera_global_pose(camera) + point = np.array([pose.p[0], pose.p[1], pose.p[2], 1.0], dtype=np.float32) + view = ( + (self.viewer.window.get_camera_pose() * _VIEWER_GL_POSE) + .inv() + .to_transformation_matrix() + ) + clip = self.viewer.window.get_camera_projection_matrix() @ (view @ point) + if clip[3] <= 1e-6: + return None + ndc = clip[:3] / clip[3] + if ndc[2] < -1 or ndc[2] > 1: + return None + px = (ndc[0] * 0.5 + 0.5) * segmentation_width + py = (1 - (ndc[1] * 0.5 + 0.5)) * segmentation_height + return np.array([px, py, ndc[2]], dtype=np.float32) + + def _get_camera_global_pose(self, camera: Camera) -> sapien.Pose: + pose = camera.camera.get_global_pose() + if len(pose) > 1: + pose = pose[0] + return pose.sp + + def _get_camera_model_matrix(self, camera: Camera) -> np.ndarray: + matrix = camera.camera.get_model_matrix() + if hasattr(matrix, "detach"): + matrix = matrix.detach().cpu().numpy() + else: + matrix = np.asarray(matrix) + if matrix.ndim == 3: + matrix = matrix[0] + return matrix + + def _set_camera_global_pose(self, camera: Camera, pose: sapien.Pose): + local_pose = pose + if camera.camera.mount is not None: + mount_pose = camera.camera.mount.pose + if len(mount_pose) > 1: + mount_pose = mount_pose[0] + local_pose = mount_pose.sp.inv() * pose + camera.camera.set_local_pose(local_pose) + camera.config.pose = Pose.create(local_pose) + + def _capture_initial_camera_state(self): + for camera_name, camera in self.editable_cameras.items(): + local_pose = camera.camera.get_local_pose() + if len(local_pose) > 1: + local_pose = local_pose[0] + self._initial_local_poses[camera_name] = local_pose.sp + pose = self._get_camera_global_pose(camera) + transform = pose.to_transformation_matrix() + eye = np.asarray(pose.p, dtype=np.float32) + forward = np.asarray(transform[:3, 0], dtype=np.float32) + up = self._normalize_vector( + np.asarray(transform[:3, 2], dtype=np.float32), + fallback=np.array([0, 0, 1], dtype=np.float32), + ) + self._initial_targets[camera_name] = ( + eye + forward * self.default_target_distance + ) + self._initial_ups[camera_name] = up + + def _ensure_look_at_state(self, camera_name: str): + if camera_name in self._camera_targets and camera_name in self._camera_ups: + return + self._sync_look_at_state_from_camera(camera_name, preserve_distance=False) + + def _sync_look_at_state_from_camera( + self, camera_name: str, preserve_distance: bool = True + ): + camera = self.editable_cameras[camera_name] + pose = self._get_camera_global_pose(camera) + transform = pose.to_transformation_matrix() + eye = np.asarray(pose.p, dtype=np.float32) + forward = np.asarray(transform[:3, 0], dtype=np.float32) + up = np.asarray(transform[:3, 2], dtype=np.float32) + + target_distance = self.default_target_distance + if preserve_distance and camera_name in self._camera_targets: + target_distance = np.linalg.norm(self._camera_targets[camera_name] - eye) + if target_distance < 1e-4: + target_distance = self.default_target_distance + + self._camera_targets[camera_name] = eye + forward * target_distance + self._camera_ups[camera_name] = self._normalize_vector( + up, fallback=np.array([0, 0, 1], dtype=np.float32) + ) + + def _apply_look_at( + self, + camera_name: str, + eye: Optional[np.ndarray] = None, + target: Optional[np.ndarray] = None, + up: Optional[np.ndarray] = None, + ): + self._ensure_look_at_state(camera_name) + eye = ( + np.asarray(eye, dtype=np.float32) + if eye is not None + else self.active_eye_position + ) + target = ( + np.asarray(target, dtype=np.float32) + if target is not None + else self._camera_targets[camera_name] + ) + up = self._normalize_vector( + up if up is not None else self._camera_ups[camera_name], + fallback=self._camera_ups[camera_name], + ) + if np.linalg.norm(target - eye) < 1e-4: + return + + self._camera_targets[camera_name] = target + self._camera_ups[camera_name] = up + pose = sapien_utils.look_at(eye=eye, target=target, up=up).sp + self._set_camera_global_pose(self.editable_cameras[camera_name], pose) + self.viewer.notify_render_update() + + def _normalize_vector(self, vector, fallback: np.ndarray) -> np.ndarray: + vector = np.asarray(vector, dtype=np.float32) + norm = np.linalg.norm(vector) + if norm < 1e-6: + return np.asarray(fallback, dtype=np.float32) + return vector / norm + + def _format_vector(self, vector: np.ndarray) -> str: + values = np.asarray(vector, dtype=np.float32).tolist() + return "[" + ", ".join(f"{value:.6g}" for value in values) + "]"