zino36 committed on
Commit
69bccb6
·
verified ·
1 Parent(s): 7d39621

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -13
app.py CHANGED
@@ -1,17 +1,18 @@
1
- import os, subprocess, json, pathlib, time
2
  import gradio as gr
3
 
4
- # ---------- CONSTANTS ----------
5
- RUN_ROOT = "/home/user/app/runs" # where all runs are stored (visible in App Files)
6
- LAST_PTR = pathlib.Path(RUN_ROOT) / "LAST" # file that stores path to the most recent run
 
7
  os.makedirs(RUN_ROOT, exist_ok=True)
 
8
 
9
- # env helpers (with correct names)
10
- DEFAULT_REPO_ID = os.environ.get("REPO_ID", "")
11
  PUSH_DEFAULT = os.environ.get("PUSH_TO_HUB", "true").lower() in {"1","true","yes"}
12
  HF_TOKEN = os.environ.get("HF_TOKEN")
13
 
14
- # Optional: login with a Space secret named HF_TOKEN
15
  if HF_TOKEN:
16
  try:
17
  from huggingface_hub import login
@@ -43,21 +44,33 @@ def tail_file(path: str, n=200):
43
 
44
  # ---------- RUN DIR HELPERS ----------
45
  def new_run_dir():
46
- d = pathlib.Path(RUN_ROOT) / f"pusht_{int(time.time())}"
47
- d.mkdir(parents=True, exist_ok=True)
 
 
 
 
 
48
  LAST_PTR.write_text(str(d))
49
  return str(d)
50
 
51
  def current_run_dir(user_override: str | None):
 
52
  if user_override and user_override.strip():
53
  return user_override.strip()
54
  if LAST_PTR.exists():
55
  return LAST_PTR.read_text().strip()
56
- return "" # none yet
57
 
58
  def has_checkpoint(run_dir: str):
 
59
  return os.path.isdir(os.path.join(run_dir, "checkpoints", "last"))
60
 
 
 
 
 
 
61
  def train_log_path(run_dir: str):
62
  return os.path.join(run_dir, "logs", "train.log")
63
 
@@ -67,7 +80,7 @@ def eval_log_path(run_dir: str):
67
  # ---------- ACTIONS ----------
68
  def start_training(steps, batch_size, push_to_hub, repo_id):
69
  run_dir = new_run_dir()
70
- log = train_log_path(run_dir)
71
 
72
  push_flags = (f"--policy.push_to_hub=true --policy.repo_id='{repo_id.strip()}'"
73
  if push_to_hub and repo_id.strip() else
@@ -96,7 +109,7 @@ def resume_training(extra_steps, push_to_hub, repo_id, run_dir_text):
96
  log = train_log_path(run_dir)
97
 
98
  if not has_checkpoint(run_dir):
99
- return f"No checkpoint in {run_dir}/checkpoints/last/ yet β€” let the run save once (>= first 500 steps).", run_dir, tail_file(log)
100
 
101
  push_flags = (f"--policy.push_to_hub=true --policy.repo_id='{repo_id.strip()}'"
102
  if push_to_hub and repo_id.strip() else
@@ -148,17 +161,48 @@ def eval_latest(run_dir_text):
148
  msg = f"Evaluated run at: {run_dir}\nEval exited rc={rc}\n\n=== eval.log tail ===\n{tail}"
149
  return msg, run_dir, tail_file(elog), metrics_txt
150
 
 
151
  def list_runs():
152
  root = pathlib.Path(RUN_ROOT)
153
  if not root.exists():
154
  return "(no runs)"
155
  rows = []
156
  for d in sorted(root.glob("pusht_*")):
157
- size = subprocess.check_output(["bash","-lc", f"du -sh {d} | cut -f1"], text=True).strip()
 
 
 
 
 
158
  ck = "βœ“" if has_checkpoint(str(d)) else "β€”"
159
  rows.append(f"{d.name}\t{size}\tcheckpoint:{ck}")
160
  return "name\tsize\tcheckpoint\n" + "\n".join(rows) if rows else "(no runs)"
161
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  # ---------- UI ----------
163
  with gr.Blocks(title="LeRobot PushT Trainer (Space)") as demo:
164
  gr.Markdown("# πŸ€– LeRobot PushT Trainer\nTrain / Resume / Evaluate. Files persist under `/home/user/app/runs/` (see App Files).")
@@ -194,10 +238,18 @@ with gr.Blocks(title="LeRobot PushT Trainer (Space)") as demo:
194
  list_btn = gr.Button("πŸ“‚ List runs folder")
195
  list_out = gr.Textbox(label="runs/ listing", lines=12)
196
 
 
 
 
 
 
 
197
  start_btn.click(start_training, inputs=[steps, batch, push_to_hub, repo_id], outputs=[start_out, run_dir_view, train_log])
198
  resume_btn.click(resume_training, inputs=[extra_steps, push_to_hub, repo_id, run_dir_text], outputs=[resume_out, run_dir_view, resume_log])
199
  eval_btn.click(eval_latest, inputs=[run_dir_text], outputs=[eval_out, run_dir_view, eval_log, metrics_box])
200
  list_btn.click(list_runs, outputs=list_out)
 
 
201
 
202
  if __name__ == "__main__":
203
  demo.launch()
 
1
+ import os, subprocess, json, pathlib, time, shutil
2
  import gradio as gr
3
 
4
+ # ---------- CONSTANTS (visible in App Files) ----------
5
+ RUN_ROOT = "/home/user/app/runs" # where all runs live
6
+ LOG_ROOT = "/home/user/app/logs" # global logs (so we don't pre-create run dirs)
7
+ LAST_PTR = pathlib.Path(RUN_ROOT) / "LAST" # remembers most recent run path
8
  os.makedirs(RUN_ROOT, exist_ok=True)
9
+ os.makedirs(LOG_ROOT, exist_ok=True)
10
 
11
+ # ---------- ENV / HUB ----------
12
+ DEFAULT_REPO_ID = os.environ.get("REPO_ID", "") # e.g. "zino36/lerobot-pusht-colab"
13
  PUSH_DEFAULT = os.environ.get("PUSH_TO_HUB", "true").lower() in {"1","true","yes"}
14
  HF_TOKEN = os.environ.get("HF_TOKEN")
15
 
 
16
  if HF_TOKEN:
17
  try:
18
  from huggingface_hub import login
 
44
 
45
  # ---------- RUN DIR HELPERS ----------
46
def new_run_dir():
    """Pick a run directory path that does not exist yet and remember it.

    The directory itself is deliberately NOT created here. The chosen path
    is written to the LAST pointer file and returned as a string.
    """
    stem = pathlib.Path(RUN_ROOT) / f"pusht_{int(time.time())}"
    candidate, suffix = stem, 1
    # Same-second collisions get a numeric suffix: pusht_<ts>_1, _2, ...
    while candidate.exists():
        candidate = pathlib.Path(f"{stem}_{suffix}")
        suffix += 1
    chosen = str(candidate)
    LAST_PTR.write_text(chosen)
    return chosen
56
 
57
  def current_run_dir(user_override: str | None):
58
+ """Prefer user text if given, else use the LAST pointer if present."""
59
  if user_override and user_override.strip():
60
  return user_override.strip()
61
  if LAST_PTR.exists():
62
  return LAST_PTR.read_text().strip()
63
+ return ""
64
 
65
def has_checkpoint(run_dir: str):
    """Tell whether ``<run_dir>/checkpoints/last`` exists as a directory.

    That directory is the marker this app uses for "a checkpoint has been
    saved"; elsewhere the app tells users this needs at least 500 steps.
    """
    last_ckpt = os.path.join(run_dir, "checkpoints", "last")
    return os.path.isdir(last_ckpt)
68
 
69
def train_log_path_for_new(run_dir: str):
    """Build the log path for a fresh run inside the global LOG_ROOT.

    The log is named ``<run folder name>.train.log`` and lives outside the
    run directory, so writing it does not create the run directory early.
    """
    log_name = f"{pathlib.Path(run_dir).name}.train.log"
    return os.path.join(LOG_ROOT, log_name)
73
+
74
def train_log_path(run_dir: str):
    """Return the path of ``logs/train.log`` inside the given run directory."""
    logs_dir = os.path.join(run_dir, "logs")
    return os.path.join(logs_dir, "train.log")
76
 
 
80
  # ---------- ACTIONS ----------
81
  def start_training(steps, batch_size, push_to_hub, repo_id):
82
  run_dir = new_run_dir()
83
+ log = train_log_path_for_new(run_dir)
84
 
85
  push_flags = (f"--policy.push_to_hub=true --policy.repo_id='{repo_id.strip()}'"
86
  if push_to_hub and repo_id.strip() else
 
109
  log = train_log_path(run_dir)
110
 
111
  if not has_checkpoint(run_dir):
112
+ return f"No checkpoint in {run_dir}/checkpoints/last/ yet β€” run at least 500 steps once.", run_dir, tail_file(log)
113
 
114
  push_flags = (f"--policy.push_to_hub=true --policy.repo_id='{repo_id.strip()}'"
115
  if push_to_hub and repo_id.strip() else
 
161
  msg = f"Evaluated run at: {run_dir}\nEval exited rc={rc}\n\n=== eval.log tail ===\n{tail}"
162
  return msg, run_dir, tail_file(elog), metrics_txt
163
 
164
+ # ---------- Maintenance (list / delete runs) ----------
165
def list_runs():
    """Return a tab-separated text listing of the ``pusht_*`` entries in RUN_ROOT.

    Each row holds the entry name, its disk usage as reported by ``du -sh``
    and whether it contains a checkpoint. Returns ``"(no runs)"`` when
    RUN_ROOT is missing or has no matching entries.
    """
    root = pathlib.Path(RUN_ROOT)
    if not root.exists():
        return "(no runs)"

    rows = []
    for d in sorted(root.glob("pusht_*")):
        try:
            # Pass an argv list instead of a shell string: the path is never
            # parsed by a shell, so spaces or metacharacters in a folder name
            # can neither break the command nor inject one.
            proc = subprocess.run(
                ["du", "-sh", str(d)], stdout=subprocess.PIPE, text=True
            )
            # du prints "<size>\t<path>"; keep only the size field.
            size = proc.stdout.split("\t", 1)[0].strip() or "?"
        except (OSError, subprocess.SubprocessError):
            size = "?"
        ck = "✓" if has_checkpoint(str(d)) else "—"
        rows.append(f"{d.name}\t{size}\tcheckpoint:{ck}")

    if not rows:
        return "(no runs)"
    return "name\tsize\tcheckpoint\n" + "\n".join(rows)
180
 
181
def delete_run_by_name(name: str):
    """Delete one run folder directly under RUN_ROOT, chosen by folder name.

    Args:
        name: Folder name typed by the user; only its basename is used.

    Returns:
        A ``(status message, refreshed runs listing)`` tuple.

    NOTE(review): the click wiring visible in this file passes the same
    textbox twice as outputs, so the status message is likely overwritten
    by the listing; confirm and give the message its own component.
    """
    name = os.path.basename((name or "").strip())
    if not name:
        return "Type a folder like 'pusht_1234567890'.", list_runs()

    target = os.path.join(RUN_ROOT, name)
    # basename() leaves "." and ".." untouched, and both pass a plain prefix
    # check: ".." would delete the parent of runs/, "." would wipe runs/.
    if name in (os.curdir, os.pardir) or not target.startswith(RUN_ROOT + "/"):
        return "Refusing to delete outside runs/.", list_runs()
    if not os.path.isdir(target):
        return f"Folder not found: {target}", list_runs()

    # Best-effort removal, then verify instead of assuming it worked.
    shutil.rmtree(target, ignore_errors=True)
    if os.path.exists(target):
        return f"Could not fully delete {target}", list_runs()

    # Clear LAST if it pointed at the folder that was just removed.
    if LAST_PTR.exists() and LAST_PTR.read_text().strip() == target:
        LAST_PTR.unlink(missing_ok=True)
    return f"Deleted {target}", list_runs()
195
+
196
def delete_all_runs():
    """Remove every ``pusht_*`` directory in RUN_ROOT and drop the LAST pointer.

    Returns a ``(status message, refreshed runs listing)`` tuple. Removal is
    best-effort: errors while deleting a folder are ignored.
    """
    if not os.path.isdir(RUN_ROOT):
        return "(runs/ missing)", list_runs()

    for entry in os.listdir(RUN_ROOT):
        if not entry.startswith("pusht_"):
            continue
        run_path = os.path.join(RUN_ROOT, entry)
        if not os.path.isdir(run_path):
            continue
        shutil.rmtree(run_path, ignore_errors=True)

    LAST_PTR.unlink(missing_ok=True)
    return "Deleted all pusht_* runs.", list_runs()
205
+
206
  # ---------- UI ----------
207
  with gr.Blocks(title="LeRobot PushT Trainer (Space)") as demo:
208
  gr.Markdown("# πŸ€– LeRobot PushT Trainer\nTrain / Resume / Evaluate. Files persist under `/home/user/app/runs/` (see App Files).")
 
238
  list_btn = gr.Button("πŸ“‚ List runs folder")
239
  list_out = gr.Textbox(label="runs/ listing", lines=12)
240
 
241
+ gr.Markdown("### Maintenance")
242
+ del_name = gr.Textbox(label="Run folder name to delete (e.g., pusht_1699999999)")
243
+ del_one_btn = gr.Button("πŸ—‘οΈ Delete this run")
244
+ del_all_btn = gr.Button("🧹 Delete ALL pusht_* runs")
245
+
246
+ # Wiring
247
  start_btn.click(start_training, inputs=[steps, batch, push_to_hub, repo_id], outputs=[start_out, run_dir_view, train_log])
248
  resume_btn.click(resume_training, inputs=[extra_steps, push_to_hub, repo_id, run_dir_text], outputs=[resume_out, run_dir_view, resume_log])
249
  eval_btn.click(eval_latest, inputs=[run_dir_text], outputs=[eval_out, run_dir_view, eval_log, metrics_box])
250
  list_btn.click(list_runs, outputs=list_out)
251
+ del_one_btn.click(delete_run_by_name, inputs=del_name, outputs=[list_out, list_out])
252
+ del_all_btn.click(delete_all_runs, outputs=[list_out, list_out])
253
 
254
  if __name__ == "__main__":
255
  demo.launch()