Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,17 +1,18 @@
|
|
| 1 |
-
import os, subprocess, json, pathlib, time
|
| 2 |
import gradio as gr
|
| 3 |
|
| 4 |
-
# ---------- CONSTANTS ----------
|
| 5 |
-
RUN_ROOT = "/home/user/app/runs"
|
| 6 |
-
|
|
|
|
| 7 |
os.makedirs(RUN_ROOT, exist_ok=True)
|
|
|
|
| 8 |
|
| 9 |
-
#
|
| 10 |
-
DEFAULT_REPO_ID = os.environ.get("REPO_ID", "")
|
| 11 |
PUSH_DEFAULT = os.environ.get("PUSH_TO_HUB", "true").lower() in {"1","true","yes"}
|
| 12 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 13 |
|
| 14 |
-
# Optional: login with a Space secret named HF_TOKEN
|
| 15 |
if HF_TOKEN:
|
| 16 |
try:
|
| 17 |
from huggingface_hub import login
|
|
@@ -43,21 +44,33 @@ def tail_file(path: str, n=200):
|
|
| 43 |
|
| 44 |
# ---------- RUN DIR HELPERS ----------
|
| 45 |
def new_run_dir():
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
LAST_PTR.write_text(str(d))
|
| 49 |
return str(d)
|
| 50 |
|
| 51 |
def current_run_dir(user_override: str | None):
|
|
|
|
| 52 |
if user_override and user_override.strip():
|
| 53 |
return user_override.strip()
|
| 54 |
if LAST_PTR.exists():
|
| 55 |
return LAST_PTR.read_text().strip()
|
| 56 |
-
return ""
|
| 57 |
|
| 58 |
def has_checkpoint(run_dir: str):
    """Tell whether ``<run_dir>/checkpoints/last`` exists as a directory."""
    last_checkpoint = os.path.join(run_dir, "checkpoints", "last")
    return os.path.isdir(last_checkpoint)
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
def train_log_path(run_dir: str):
    """Build the path of the training log kept inside a run directory."""
    logs_dir = os.path.join(run_dir, "logs")
    return os.path.join(logs_dir, "train.log")
|
| 63 |
|
|
@@ -67,7 +80,7 @@ def eval_log_path(run_dir: str):
|
|
| 67 |
# ---------- ACTIONS ----------
|
| 68 |
def start_training(steps, batch_size, push_to_hub, repo_id):
|
| 69 |
run_dir = new_run_dir()
|
| 70 |
-
log =
|
| 71 |
|
| 72 |
push_flags = (f"--policy.push_to_hub=true --policy.repo_id='{repo_id.strip()}'"
|
| 73 |
if push_to_hub and repo_id.strip() else
|
|
@@ -96,7 +109,7 @@ def resume_training(extra_steps, push_to_hub, repo_id, run_dir_text):
|
|
| 96 |
log = train_log_path(run_dir)
|
| 97 |
|
| 98 |
if not has_checkpoint(run_dir):
|
| 99 |
-
return f"No checkpoint in {run_dir}/checkpoints/last/ yet β
|
| 100 |
|
| 101 |
push_flags = (f"--policy.push_to_hub=true --policy.repo_id='{repo_id.strip()}'"
|
| 102 |
if push_to_hub and repo_id.strip() else
|
|
@@ -148,17 +161,48 @@ def eval_latest(run_dir_text):
|
|
| 148 |
msg = f"Evaluated run at: {run_dir}\nEval exited rc={rc}\n\n=== eval.log tail ===\n{tail}"
|
| 149 |
return msg, run_dir, tail_file(elog), metrics_txt
|
| 150 |
|
|
|
|
| 151 |
def list_runs():
|
| 152 |
root = pathlib.Path(RUN_ROOT)
|
| 153 |
if not root.exists():
|
| 154 |
return "(no runs)"
|
| 155 |
rows = []
|
| 156 |
for d in sorted(root.glob("pusht_*")):
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
ck = "β" if has_checkpoint(str(d)) else "β"
|
| 159 |
rows.append(f"{d.name}\t{size}\tcheckpoint:{ck}")
|
| 160 |
return "name\tsize\tcheckpoint\n" + "\n".join(rows) if rows else "(no runs)"
|
| 161 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
# ---------- UI ----------
|
| 163 |
with gr.Blocks(title="LeRobot PushT Trainer (Space)") as demo:
|
| 164 |
gr.Markdown("# π€ LeRobot PushT Trainer\nTrain / Resume / Evaluate. Files persist under `/home/user/app/runs/` (see App Files).")
|
|
@@ -194,10 +238,18 @@ with gr.Blocks(title="LeRobot PushT Trainer (Space)") as demo:
|
|
| 194 |
list_btn = gr.Button("π List runs folder")
|
| 195 |
list_out = gr.Textbox(label="runs/ listing", lines=12)
|
| 196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
start_btn.click(start_training, inputs=[steps, batch, push_to_hub, repo_id], outputs=[start_out, run_dir_view, train_log])
|
| 198 |
resume_btn.click(resume_training, inputs=[extra_steps, push_to_hub, repo_id, run_dir_text], outputs=[resume_out, run_dir_view, resume_log])
|
| 199 |
eval_btn.click(eval_latest, inputs=[run_dir_text], outputs=[eval_out, run_dir_view, eval_log, metrics_box])
|
| 200 |
list_btn.click(list_runs, outputs=list_out)
|
|
|
|
|
|
|
| 201 |
|
| 202 |
if __name__ == "__main__":
|
| 203 |
demo.launch()
|
|
|
|
| 1 |
+
import os, subprocess, json, pathlib, time, shutil
|
| 2 |
import gradio as gr
|
| 3 |
|
| 4 |
+
# ---------- CONSTANTS (visible in App Files) ----------
# Directory under which every run lives.
RUN_ROOT = "/home/user/app/runs"
# Global log directory, used so that a run directory need not be pre-created.
LOG_ROOT = "/home/user/app/logs"
# File that remembers the path of the most recent run.
LAST_PTR = pathlib.Path(RUN_ROOT, "LAST")

# Both roots must exist before any handler reads or writes them.
pathlib.Path(RUN_ROOT).mkdir(parents=True, exist_ok=True)
pathlib.Path(LOG_ROOT).mkdir(parents=True, exist_ok=True)

# ---------- ENV / HUB ----------
# Repository id read from the environment, e.g. "zino36/lerobot-pusht-colab".
DEFAULT_REPO_ID = os.getenv("REPO_ID", "")
# PUSH_TO_HUB counts as enabled for "1", "true" or "yes" (case-insensitive);
# it defaults to enabled when the variable is unset.
PUSH_DEFAULT = os.getenv("PUSH_TO_HUB", "true").lower() in ("1", "true", "yes")
# None when the HF_TOKEN variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 15 |
|
|
|
|
| 16 |
if HF_TOKEN:
|
| 17 |
try:
|
| 18 |
from huggingface_hub import login
|
|
|
|
| 44 |
|
| 45 |
# ---------- RUN DIR HELPERS ----------
|
| 46 |
def new_run_dir():
    """Pick a not-yet-existing run directory path and remember it in LAST_PTR.

    The directory itself is deliberately NOT created, so that LeRobot can
    create it. The name is ``pusht_<unix seconds>``; if that path already
    exists, ``_1``, ``_2``, ... is appended until a free path is found.

    Returns:
        The chosen path as a string.
    """
    runs_root = pathlib.Path(RUN_ROOT)
    stem = f"pusht_{int(time.time())}"

    candidate = runs_root / stem
    suffix = 0
    while candidate.exists():
        suffix += 1
        candidate = runs_root / f"{stem}_{suffix}"

    chosen = str(candidate)
    LAST_PTR.write_text(chosen)
    return chosen
|
| 56 |
|
| 57 |
def current_run_dir(user_override: str | None):
|
| 58 |
+
"""Prefer user text if given, else use the LAST pointer if present."""
|
| 59 |
if user_override and user_override.strip():
|
| 60 |
return user_override.strip()
|
| 61 |
if LAST_PTR.exists():
|
| 62 |
return LAST_PTR.read_text().strip()
|
| 63 |
+
return ""
|
| 64 |
|
| 65 |
def has_checkpoint(run_dir: str):
    """Tell whether ``<run_dir>/checkpoints/last`` exists as a directory.

    That directory is what this app treats as "a checkpoint is present"
    (the first save is expected at step 500).
    """
    last_checkpoint = os.path.join(run_dir, "checkpoints", "last")
    return os.path.isdir(last_checkpoint)
|
| 68 |
|
| 69 |
+
def train_log_path_for_new(run_dir: str):
    """Return the log path for a freshly started run.

    The log goes to ``LOG_ROOT/<run folder name>.train.log`` rather than into
    the run directory, so the run directory is not created ahead of time.
    """
    log_name = pathlib.Path(run_dir).name + ".train.log"
    return os.path.join(LOG_ROOT, log_name)
|
| 73 |
+
|
| 74 |
def train_log_path(run_dir: str):
    """Build the path of the training log kept inside a run directory."""
    logs_dir = os.path.join(run_dir, "logs")
    return os.path.join(logs_dir, "train.log")
|
| 76 |
|
|
|
|
| 80 |
# ---------- ACTIONS ----------
|
| 81 |
def start_training(steps, batch_size, push_to_hub, repo_id):
|
| 82 |
run_dir = new_run_dir()
|
| 83 |
+
log = train_log_path_for_new(run_dir)
|
| 84 |
|
| 85 |
push_flags = (f"--policy.push_to_hub=true --policy.repo_id='{repo_id.strip()}'"
|
| 86 |
if push_to_hub and repo_id.strip() else
|
|
|
|
| 109 |
log = train_log_path(run_dir)
|
| 110 |
|
| 111 |
if not has_checkpoint(run_dir):
|
| 112 |
+
return f"No checkpoint in {run_dir}/checkpoints/last/ yet β run at least 500 steps once.", run_dir, tail_file(log)
|
| 113 |
|
| 114 |
push_flags = (f"--policy.push_to_hub=true --policy.repo_id='{repo_id.strip()}'"
|
| 115 |
if push_to_hub and repo_id.strip() else
|
|
|
|
| 161 |
msg = f"Evaluated run at: {run_dir}\nEval exited rc={rc}\n\n=== eval.log tail ===\n{tail}"
|
| 162 |
return msg, run_dir, tail_file(elog), metrics_txt
|
| 163 |
|
| 164 |
+
# ---------- Maintenance (list / delete runs) ----------
|
| 165 |
def list_runs():
    """Return a tab-separated table of the ``pusht_*`` entries under RUN_ROOT.

    Each row holds the entry name, its disk usage as printed by ``du -sh``
    ("?" when no size could be obtained) and a checkpoint marker based on
    ``has_checkpoint``.

    Returns:
        The table as one string with a header line, or "(no runs)" when
        RUN_ROOT does not exist or contains no matching entry.
    """
    root = pathlib.Path(RUN_ROOT)
    if not root.exists():
        return "(no runs)"

    rows = []
    for entry in sorted(root.glob("pusht_*")):
        try:
            # Argument list instead of a `bash -lc` string: the path reaches
            # du verbatim, so spaces or shell metacharacters in a folder name
            # cannot be re-interpreted, and no login shell is started per row.
            du_stdout = subprocess.run(
                ["du", "-sh", str(entry)],
                stdout=subprocess.PIPE,
                text=True,
            ).stdout
            # du prints "<size>\t<path>"; keep the first field only.
            size = du_stdout.split("\t", 1)[0].strip() or "?"
        except (OSError, ValueError):
            # du missing/not executable, or its output could not be decoded.
            size = "?"
        # The two markers must differ, otherwise the column says nothing.
        ck = "✅" if has_checkpoint(str(entry)) else "❌"
        rows.append(f"{entry.name}\t{size}\tcheckpoint:{ck}")

    if not rows:
        return "(no runs)"
    return "name\tsize\tcheckpoint\n" + "\n".join(rows)
|
| 180 |
|
| 181 |
+
def delete_run_by_name(name: str):
    """Delete one run folder that sits directly inside RUN_ROOT.

    Args:
        name: Folder name such as ``pusht_1234567890``. Surrounding
            whitespace is stripped and only the last path component is used.

    Returns:
        A ``(status_message, listing)`` tuple, where ``listing`` is the
        output of ``list_runs()`` taken after the attempt.
    """
    name = os.path.basename((name or "").strip())
    if not name:
        return "Type a folder like 'pusht_1234567890'.", list_runs()

    target = os.path.join(RUN_ROOT, name)
    # basename() lets "." and ".." through, and "RUN_ROOT/.." still starts
    # with "RUN_ROOT/", so a textual prefix test would allow wiping runs/
    # itself or its parent. Require instead that the resolved target is a
    # direct child of the resolved RUN_ROOT.
    runs_root = os.path.realpath(RUN_ROOT)
    if os.path.dirname(os.path.realpath(target)) != runs_root:
        return "Refusing to delete outside runs/.", list_runs()
    if not os.path.isdir(target):
        return f"Folder not found: {target}", list_runs()

    shutil.rmtree(target, ignore_errors=True)
    # rmtree was told to ignore errors, so verify the result before
    # claiming success.
    if os.path.exists(target):
        return f"Could not fully delete {target}", list_runs()

    # clear LAST if it pointed here
    if LAST_PTR.exists() and LAST_PTR.read_text().strip() == target:
        LAST_PTR.unlink(missing_ok=True)
    return f"Deleted {target}", list_runs()
|
| 195 |
+
|
| 196 |
+
def delete_all_runs():
    """Remove every ``pusht_*`` directory under RUN_ROOT and drop the LAST pointer.

    Removal is best-effort: errors raised while deleting a directory are
    ignored.

    Returns:
        A ``(status_message, listing)`` tuple, where ``listing`` is the
        output of ``list_runs()`` taken afterwards.
    """
    if not os.path.isdir(RUN_ROOT):
        return "(runs/ missing)", list_runs()

    for entry in os.listdir(RUN_ROOT):
        if not entry.startswith("pusht_"):
            continue
        path = os.path.join(RUN_ROOT, entry)
        if os.path.isdir(path):
            shutil.rmtree(path, ignore_errors=True)

    LAST_PTR.unlink(missing_ok=True)
    return "Deleted all pusht_* runs.", list_runs()
|
| 205 |
+
|
| 206 |
# ---------- UI ----------
|
| 207 |
with gr.Blocks(title="LeRobot PushT Trainer (Space)") as demo:
|
| 208 |
gr.Markdown("# π€ LeRobot PushT Trainer\nTrain / Resume / Evaluate. Files persist under `/home/user/app/runs/` (see App Files).")
|
|
|
|
| 238 |
list_btn = gr.Button("π List runs folder")
|
| 239 |
list_out = gr.Textbox(label="runs/ listing", lines=12)
|
| 240 |
|
| 241 |
+
# NOTE(review): these statements belong to the body of the
# `with gr.Blocks(...) as demo:` block; keep them at that block's indentation.
gr.Markdown("### Maintenance")
del_name = gr.Textbox(label="Run folder name to delete (e.g., pusht_1699999999)")
del_one_btn = gr.Button("🗑️ Delete this run")
del_all_btn = gr.Button("🧹 Delete ALL pusht_* runs")
# Separate box for the status message returned by the delete handlers.
del_out = gr.Textbox(label="Maintenance status", lines=2)

# Wiring
start_btn.click(start_training, inputs=[steps, batch, push_to_hub, repo_id], outputs=[start_out, run_dir_view, train_log])
resume_btn.click(resume_training, inputs=[extra_steps, push_to_hub, repo_id, run_dir_text], outputs=[resume_out, run_dir_view, resume_log])
eval_btn.click(eval_latest, inputs=[run_dir_text], outputs=[eval_out, run_dir_view, eval_log, metrics_box])
list_btn.click(list_runs, outputs=list_out)
# Both delete handlers return (status message, refreshed listing). Sending the
# two values to the same component would let the listing overwrite the
# message, so each value gets its own output.
del_one_btn.click(delete_run_by_name, inputs=del_name, outputs=[del_out, list_out])
del_all_btn.click(delete_all_runs, outputs=[del_out, list_out])
|
| 253 |
|
| 254 |
if __name__ == "__main__":
|
| 255 |
demo.launch()
|