Spaces:

zino36
/

lerobot-pusht-trainer

Sleeping

App Files Files Community

zino36 committed on Sep 29

Commit

7d39621

verified ·

1 Parent(s): 1735090

Create app.py

Browse files

Files changed (1) hide show

app.py +203 -0

app.py ADDED Viewed

	@@ -0,0 +1,203 @@

+import collections
+import json
+import os
+import pathlib
+import shlex
+import subprocess
+import time
+import gradio as gr
+# ---------- CONSTANTS ----------
+RUN_ROOT = "/home/user/app/runs"         # where all runs are stored (visible in App Files)
+LAST_PTR = pathlib.Path(RUN_ROOT) / "LAST"  # file that stores path to the most recent run
+os.makedirs(RUN_ROOT, exist_ok=True)
+# env helpers (with correct names)
+DEFAULT_REPO_ID = os.environ.get("REPO_ID", "")
+PUSH_DEFAULT    = os.environ.get("PUSH_TO_HUB", "true").lower() in {"1","true","yes"}
+HF_TOKEN        = os.environ.get("HF_TOKEN")
+# Optional: login with a Space secret named HF_TOKEN
+if HF_TOKEN:
+    try:
+        from huggingface_hub import login
+        login(token=HF_TOKEN)
+    except Exception as e:
+        print("HF login failed:", e)
+# ---------- LOG HELPERS ----------
+def _run(cmd: str, logfile: str):
+    os.makedirs(os.path.dirname(logfile), exist_ok=True)
+    with open(logfile, "a", buffering=1) as f:
+        f.write("\n---- CMD ----\n" + cmd + "\n--------------\n")
+        p = subprocess.Popen(cmd, shell=True,
+                             stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                             text=True, bufsize=1)
+        lines = []
+        for line in p.stdout:
+            f.write(line)
+            lines.append(line)
+        p.wait()
+        return p.returncode, "".join(lines[-200:])
+def tail_file(path: str, n=200):
+    if not os.path.exists(path):
+        return "(no log yet)"
+    with open(path, "r", errors="ignore") as f:
+        lines = f.readlines()
+    return "".join(lines[-n:])
+# ---------- RUN DIR HELPERS ----------
+def new_run_dir():
+    d = pathlib.Path(RUN_ROOT) / f"pusht_{int(time.time())}"
+    d.mkdir(parents=True, exist_ok=True)
+    LAST_PTR.write_text(str(d))
+    return str(d)
+def current_run_dir(user_override: str | None):
+    if user_override and user_override.strip():
+        return user_override.strip()
+    if LAST_PTR.exists():
+        return LAST_PTR.read_text().strip()
+    return ""  # none yet
+def has_checkpoint(run_dir: str):
+    return os.path.isdir(os.path.join(run_dir, "checkpoints", "last"))
+def train_log_path(run_dir: str):
+    return os.path.join(run_dir, "logs", "train.log")
+def eval_log_path(run_dir: str):
+    return os.path.join(run_dir, "logs", "eval.log")
+# ---------- ACTIONS ----------
+def start_training(steps, batch_size, push_to_hub, repo_id):
+    run_dir = new_run_dir()
+    log = train_log_path(run_dir)
+    push_flags = (f"--policy.push_to_hub=true --policy.repo_id='{repo_id.strip()}'"
+                  if push_to_hub and repo_id.strip() else
+                  "--policy.push_to_hub=false")
+    cmd = (
+        "lerobot-train "
+        f"--output_dir='{run_dir}' "
+        "--policy.type=diffusion "
+        "--dataset.repo_id=lerobot/pusht "
+        "--env.type=pusht "
+        f"--batch_size={batch_size} "
+        f"--steps={steps} "
+        "--eval_freq=500 "
+        "--save_freq=500 "
+        f"{push_flags}"
+    )
+    rc, tail = _run(cmd, log)
+    msg = f"Started fresh run at: {run_dir}\nTrain exited rc={rc}\n\n=== train.log tail ===\n{tail}"
+    return msg, run_dir, tail_file(log)
+def resume_training(extra_steps, push_to_hub, repo_id, run_dir_text):
+    run_dir = current_run_dir(run_dir_text)
+    if not run_dir:
+        return "No run found yet. Start a fresh training first.", "", "(no log)"
+    log = train_log_path(run_dir)
+    if not has_checkpoint(run_dir):
+        return f"No checkpoint in {run_dir}/checkpoints/last/ yet — let the run save once (>= first 500 steps).", run_dir, tail_file(log)
+    push_flags = (f"--policy.push_to_hub=true --policy.repo_id='{repo_id.strip()}'"
+                  if push_to_hub and repo_id.strip() else
+                  "--policy.push_to_hub=false")
+    cmd = (
+        "lerobot-train "
+        f"--output_dir='{run_dir}' "
+        "--resume=true "
+        f"--steps={extra_steps} "
+        "--eval_freq=500 "
+        "--save_freq=500 "
+        f"{push_flags}"
+    )
+    rc, tail = _run(cmd, log)
+    msg = f"Resumed run at: {run_dir}\nResume exited rc={rc}\n\n=== train.log tail ===\n{tail}"
+    return msg, run_dir, tail_file(log)
+def eval_latest(run_dir_text):
+    run_dir = current_run_dir(run_dir_text)
+    if not run_dir:
+        return "No run found yet. Start a fresh training first.", "", "(no log)", "(no metrics)"
+    elog = eval_log_path(run_dir)
+    if not has_checkpoint(run_dir):
+        return f"No checkpoint in {run_dir}/checkpoints/last/ to evaluate.", run_dir, tail_file(elog), "(no metrics)"
+    ckpt = os.path.join(run_dir, "checkpoints", "last", "pretrained_model")
+    eval_out_dir = os.path.join(run_dir, "eval_latest")
+    os.makedirs(eval_out_dir, exist_ok=True)
+    cmd = (
+        "lerobot-eval "
+        f"--policy.path='{ckpt}' "
+        "--env.type=pusht "
+        "--eval.n_episodes=100 "
+        "--eval.batch_size=50 "
+        f"--output_dir='{eval_out_dir}'"
+    )
+    rc, tail = _run(cmd, elog)
+    metrics_txt = "(metrics.json not found)"
+    p = pathlib.Path(eval_out_dir) / "metrics.json"
+    if p.exists():
+        try:
+            m = json.loads(p.read_text())
+            metrics_txt = f"Success rate: {m.get('success_rate')}\nAvg max overlap: {m.get('avg_max_overlap')}"
+        except Exception:
+            metrics_txt = "(could not parse metrics.json)"
+    msg = f"Evaluated run at: {run_dir}\nEval exited rc={rc}\n\n=== eval.log tail ===\n{tail}"
+    return msg, run_dir, tail_file(elog), metrics_txt
+def list_runs():
+    root = pathlib.Path(RUN_ROOT)
+    if not root.exists():
+        return "(no runs)"
+    rows = []
+    for d in sorted(root.glob("pusht_*")):
+        size = subprocess.check_output(["bash","-lc", f"du -sh {d} | cut -f1"], text=True).strip()
+        ck = "✓" if has_checkpoint(str(d)) else "—"
+        rows.append(f"{d.name}\t{size}\tcheckpoint:{ck}")
+    return "name\tsize\tcheckpoint\n" + "\n".join(rows) if rows else "(no runs)"
+# ---------- UI ----------
+# Build the Gradio page. Components appear in the order they are created, so
+# the statement order below is the page layout.
+with gr.Blocks(title="LeRobot PushT Trainer (Space)") as demo:
+    gr.Markdown("# 🤖 LeRobot PushT Trainer\nTrain / Resume / Evaluate. Files persist under `/home/user/app/runs/` (see App Files).")
+    # Hub settings shared by the start and resume actions.
+    with gr.Row():
+        repo_id = gr.Textbox(label="Hugging Face Model Repo (optional)", value=DEFAULT_REPO_ID, placeholder="username/repo-name")
+        push_to_hub = gr.Checkbox(label="Push checkpoints to Hub", value=PUSH_DEFAULT)
+    # Hyper-parameters for a fresh run.
+    with gr.Row():
+        steps = gr.Slider(200, 20000, value=2000, step=100, label="Training steps (fresh run)")
+        batch = gr.Slider(4, 64, value=16, step=2, label="Batch size")
+    start_btn = gr.Button("🚀 Start Fresh Training")
+    start_out = gr.Textbox(label="Start Output")
+    # Written by all three actions (start, resume, evaluate).
+    run_dir_view = gr.Textbox(label="Current run directory (auto-filled after start)")
+    train_log = gr.Textbox(label="train.log (tail)", lines=20)
+    gr.Markdown("### Resume / Evaluate a Specific Run")
+    # Optional override read by the resume and evaluate actions.
+    run_dir_text = gr.Textbox(label="Run directory (leave blank to use the latest)")
+    with gr.Row():
+        extra_steps = gr.Slider(200, 20000, value=2000, step=100, label="Steps to add on resume")
+        resume_btn = gr.Button("▶️ Resume from Last Checkpoint")
+    resume_out = gr.Textbox(label="Resume Output")
+    resume_log = gr.Textbox(label="train.log (tail)", lines=20)
+    gr.Markdown("### Evaluate Latest Checkpoint of Selected Run")
+    eval_btn = gr.Button("📈 Evaluate Latest")
+    eval_out = gr.Textbox(label="Eval Output")
+    eval_log = gr.Textbox(label="eval.log (tail)", lines=20)
+    metrics_box = gr.Textbox(label="Parsed metrics (if metrics.json exists)")
+    gr.Markdown("### Runs on disk")
+    list_btn = gr.Button("📂 List runs folder")
+    list_out = gr.Textbox(label="runs/ listing", lines=12)
+    # Wire buttons to handlers. The order of each `outputs` list must match
+    # the order of the tuple the handler returns.
+    start_btn.click(start_training, inputs=[steps, batch, push_to_hub, repo_id], outputs=[start_out, run_dir_view, train_log])
+    resume_btn.click(resume_training, inputs=[extra_steps, push_to_hub, repo_id, run_dir_text], outputs=[resume_out, run_dir_view, resume_log])
+    eval_btn.click(eval_latest, inputs=[run_dir_text], outputs=[eval_out, run_dir_view, eval_log, metrics_box])
+    list_btn.click(list_runs, outputs=list_out)
+# Launch the app only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()