Spaces:

Suzana
/

labelit-mini-ner

Sleeping

App Files Community

Suzana committed on May 28

Commit

efeca40

verified ·

1 Parent(s): 6b99f03

Update app.py

Browse files

Files changed (1) hide show

app.py +93 -52

app.py CHANGED Viewed

@@ -2,74 +2,115 @@ import gradio as gr
 import pandas as pd
 from pathlib import Path
-# Global store
-token_df = pd.DataFrame()
-def make_sample_data(n=100):
-    people = ["Alice","Bob","Charlie","Diane","Eve"]
-    orgs   = ["Acme","Globex","Initech","Umbrella","Stark"]
-    locs   = ["Paris","NYC","London","Tokyo","Sydney"]
-    rows = [{"text": f"{people[i%5]} visited {orgs[i%5]} in {locs[i%5]}."} for i in range(n)]
-    return pd.DataFrame(rows)
-# ────────────────────────── I/O helpers ──────────────────────────
 def load_data(file):
     global token_df
-    df = pd.read_csv(file.name) if file else make_sample_data()
-    if "text" not in df.columns:
-        return None,"❌ Need a `text` column",gr.update(visible=False)
-    records=[]
-    for sid,txt in enumerate(df["text"]):
-        for tok in txt.split():
-            records.append({"sentence_id":sid,"token":tok,"label":"O"})
-    token_df=pd.DataFrame(records)
-    return token_df,"✅ Loaded & tokenized",gr.update(visible=True)
-def save_edits(tbl):          # keep edits in memory
     global token_df
-    token_df=pd.DataFrame(tbl,columns=["sentence_id","token","label"])
-    return "💾 Saved"
-def get_tokens_csv():
-    path="raw_tokens.csv"; token_df.to_csv(path,index=False); return Path(path)
-def get_iob_csv():
-    iob,prev=[],{}
-    for _,r in token_df.iterrows():
-        sid,l=r["sentence_id"],r["label"]
-        if l=="O": iob.append("O"); prev[sid]=None
-        else: iob.append(("I-" if prev.get(sid)==l else "B-")+l); prev[sid]=l
-    out=token_df.copy(); out["iob"]=iob
-    path="ner_iob.csv"; out.to_csv(path,index=False); return Path(path)
-# ────────────────────────── UI ──────────────────────────
 with gr.Blocks() as demo:
     gr.Markdown("# 🏷️ Label It! Mini-NER")
-    gr.Markdown("Step 1 – Upload a CSV with a `text` column (or leave blank for sample).")
     with gr.Row():
-        file_in  = gr.File(label="📁 Upload CSV", file_types=[".csv"])
         load_btn = gr.Button("Load")
     status = gr.Textbox(label="Status", interactive=False)
-    table  = gr.Dataframe(headers=["sentence_id","token","label"], interactive=True, visible=False)
-    # action row
-    with gr.Row(visible=False) as actions:
-        save_btn = gr.Button("💾 Save Edits")
-        dl_tok_btn = gr.Button("⬇️ Download Tokens CSV")
-        dl_iob_btn = gr.Button("⬇️ Download IOB CSV")
-    hidden_tok = gr.File(visible=False)
-    hidden_iob = gr.File(visible=False)
-    # Bindings
-    load_btn.click(load_data, inputs=file_in, outputs=[table,status,actions])
-    save_btn.click(save_edits, inputs=table, outputs=status)
-    dl_tok_btn.click(lambda: get_tokens_csv(),  outputs=hidden_tok)
-    dl_iob_btn.click(lambda: get_iob_csv(),     outputs=hidden_iob)
-    gr.Markdown("Step 2 – Edit **label** cells (`PER`,`ORG`,`LOC`, or `O`), then Save/Download.")
 demo.launch()

 import pandas as pd
 from pathlib import Path
+token_df = pd.DataFrame()          # global store
+# ───────────────────────── helpers ──────────────────────────────
+def tokenize_df(df: pd.DataFrame) -> pd.DataFrame:
+    """Explode dataframe into token rows with default 'O' label."""
+    records = []
+    if "text" in df.columns:
+        lines = df["text"].astype(str)
+    else:  # user+assistant dialog
+        lines = df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)
+    for sid, line in enumerate(lines):
+        for tok in line.split():
+            records.append({"sentence_id": sid, "token": tok, "label": "O"})
+    return pd.DataFrame(records)
+# ───────────────────────── callbacks ────────────────────────────
 def load_data(file):
     global token_df
+    df = pd.read_csv(file.name)
+    if "text" not in df.columns and not {"user", "assistant"}.issubset(df.columns):
+        return (None, "❌ CSV must have `text` **or** `user`+`assistant` columns.",
+                gr.update(visible=False), gr.update(visible=False))
+    token_df = tokenize_df(df)
+    return (
+        # Show table with correct row_count
+        gr.update(value=token_df.values.tolist(),          # list-of-lists
+                  row_count=len(token_df),
+                  visible=True),
+        f"✅ Loaded {len(df)} rows → {len(token_df)} tokens.",
+        gr.update(visible=True),     # show action row
+        gr.update(visible=False)     # hide downloads until first export
+    )
+def save_edits(table_data):
     global token_df
+    token_df = pd.DataFrame(table_data, columns=["sentence_id", "token", "label"])
+    return "💾 Edits saved."
+def make_tokens_csv():
+    path = "raw_tokens.csv"
+    token_df.to_csv(path, index=False)
+    return Path(path)
+def make_iob_csv():
+    iob_tags, prev = [], {}
+    for _, r in token_df.iterrows():
+        sid, lbl = r["sentence_id"], r["label"]
+        if lbl == "O":
+            iob_tags.append("O")
+            prev[sid] = None
+        else:
+            prefix = "I-" if prev.get(sid) == lbl else "B-"
+            iob_tags.append(prefix + lbl)
+            prev[sid] = lbl
+    out = token_df.copy()
+    out["iob"] = iob_tags
+    path = "ner_iob.csv"
+    out.to_csv(path, index=False)
+    return Path(path)
+# ───────────────────────── UI ───────────────────────────────────
 with gr.Blocks() as demo:
     gr.Markdown("# 🏷️ Label It! Mini-NER")
+    gr.Markdown(
+        "**Step 1** – Upload a CSV containing either a `text` column or `user` + `assistant` columns."
+    )
     with gr.Row():
+        csv_file = gr.File(label="📁 Upload CSV", file_types=[".csv"])
         load_btn = gr.Button("Load")
     status = gr.Textbox(label="Status", interactive=False)
+    # Token table (hidden until data loaded)
+    tok_table = gr.Dataframe(
+        headers=["sentence_id", "token", "label"],
+        datatype=["number", "str", "str"],
+        row_count=0, col_count=3,
+        visible=False
+    )
+    # Buttons row (hidden until loaded)
+    with gr.Row(visible=False) as action_row:
+        save_btn  = gr.Button("💾 Save")
+        dl_tok_btn= gr.Button("⬇︎ Download Tokens CSV")
+        dl_iob_btn= gr.Button("⬇︎ Download IOB CSV")
+    # Hidden download files (appear only after first export)
+    dl_tokens_file = gr.File(label="Tokens CSV", visible=False)
+    dl_iob_file    = gr.File(label="IOB CSV",    visible=False)
+    # Bind events
+    load_btn.click(load_data,
+                   inputs=csv_file,
+                   outputs=[tok_table, status, action_row, dl_tokens_file])
+    save_btn.click(save_edits, inputs=tok_table, outputs=status)
+    dl_tok_btn.click(lambda: make_tokens_csv(),
+                     outputs=dl_tokens_file)
+    dl_iob_btn.click(lambda: make_iob_csv(),
+                     outputs=dl_iob_file)
+    gr.Markdown(
+        "**Step 2** – Edit the `label` column (`PER`, `ORG`, `LOC`, or `O`) → click **Save** → export."
+    )
 demo.launch()