Suzana committed on
Commit
efeca40
·
verified ·
1 Parent(s): 6b99f03

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -52
app.py CHANGED
@@ -2,74 +2,115 @@ import gradio as gr
2
  import pandas as pd
3
  from pathlib import Path
4
 
5
- # Global store
6
- token_df = pd.DataFrame()
7
 
8
- def make_sample_data(n=100):
9
- people = ["Alice","Bob","Charlie","Diane","Eve"]
10
- orgs = ["Acme","Globex","Initech","Umbrella","Stark"]
11
- locs = ["Paris","NYC","London","Tokyo","Sydney"]
12
- rows = [{"text": f"{people[i%5]} visited {orgs[i%5]} in {locs[i%5]}."} for i in range(n)]
13
- return pd.DataFrame(rows)
 
 
 
 
 
 
14
 
15
- # ────────────────────────── I/O helpers ──────────────────────────
16
  def load_data(file):
17
  global token_df
18
- df = pd.read_csv(file.name) if file else make_sample_data()
19
- if "text" not in df.columns:
20
- return None,"❌ Need a `text` column",gr.update(visible=False)
21
- records=[]
22
- for sid,txt in enumerate(df["text"]):
23
- for tok in txt.split():
24
- records.append({"sentence_id":sid,"token":tok,"label":"O"})
25
- token_df=pd.DataFrame(records)
26
- return token_df,"✅ Loaded & tokenized",gr.update(visible=True)
27
-
28
- def save_edits(tbl): # keep edits in memory
 
 
 
 
 
 
 
 
29
  global token_df
30
- token_df=pd.DataFrame(tbl,columns=["sentence_id","token","label"])
31
- return "💾 Saved"
32
-
33
- def get_tokens_csv():
34
- path="raw_tokens.csv"; token_df.to_csv(path,index=False); return Path(path)
35
-
36
- def get_iob_csv():
37
- iob,prev=[],{}
38
- for _,r in token_df.iterrows():
39
- sid,l=r["sentence_id"],r["label"]
40
- if l=="O": iob.append("O"); prev[sid]=None
41
- else: iob.append(("I-" if prev.get(sid)==l else "B-")+l); prev[sid]=l
42
- out=token_df.copy(); out["iob"]=iob
43
- path="ner_iob.csv"; out.to_csv(path,index=False); return Path(path)
44
-
45
- # ────────────────────────── UI ──────────────────────────
 
 
 
 
 
 
 
 
 
 
46
  with gr.Blocks() as demo:
47
  gr.Markdown("# 🏷️ Label It! Mini-NER")
48
- gr.Markdown("Step 1 – Upload a CSV with a `text` column (or leave blank for sample).")
 
 
 
49
 
50
  with gr.Row():
51
- file_in = gr.File(label="📁 Upload CSV", file_types=[".csv"])
52
  load_btn = gr.Button("Load")
53
 
54
  status = gr.Textbox(label="Status", interactive=False)
55
- table = gr.Dataframe(headers=["sentence_id","token","label"], interactive=True, visible=False)
56
 
57
- # action row
58
- with gr.Row(visible=False) as actions:
59
- save_btn = gr.Button("💾 Save Edits")
60
- dl_tok_btn = gr.Button("⬇️ Download Tokens CSV")
61
- dl_iob_btn = gr.Button("⬇️ Download IOB CSV")
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
- hidden_tok = gr.File(visible=False)
64
- hidden_iob = gr.File(visible=False)
 
 
65
 
66
- # Bindings
67
- load_btn.click(load_data, inputs=file_in, outputs=[table,status,actions])
68
- save_btn.click(save_edits, inputs=table, outputs=status)
69
 
70
- dl_tok_btn.click(lambda: get_tokens_csv(), outputs=hidden_tok)
71
- dl_iob_btn.click(lambda: get_iob_csv(), outputs=hidden_iob)
 
 
72
 
73
- gr.Markdown("Step 2 – Edit **label** cells (`PER`,`ORG`,`LOC`, or `O`), then Save/Download.")
 
 
74
 
75
  demo.launch()
 
2
  import pandas as pd
3
  from pathlib import Path
4
 
5
+ token_df = pd.DataFrame() # global store for the current token table; rebound by load_data and save_edits, read by the CSV export helpers
 
6
 
7
+ # ───────────────────────── helpers ──────────────────────────────
8
def tokenize_df(df: pd.DataFrame) -> pd.DataFrame:
    """Explode a dataframe into one row per whitespace token, labelled 'O'.

    If ``df`` has a ``text`` column, each cell is one sentence. Otherwise the
    ``user`` and ``assistant`` columns are combined into a single line of the
    form ``"User: <user> Assistant: <assistant>"`` per row.

    Args:
        df: Source rows; must contain ``text`` or both ``user`` and
            ``assistant``.

    Returns:
        A frame with columns ``sentence_id`` (row position in ``df``),
        ``token`` and ``label`` (always ``"O"``). The three columns are
        present even when no tokens were produced.
    """

    def _cell(value: object) -> str:
        # Missing cells become empty text instead of the literal token "nan".
        return "" if pd.isna(value) else str(value)

    if "text" in df.columns:
        lines = [_cell(value) for value in df["text"]]
    else:  # user + assistant dialog
        # zip over the columns avoids row-wise apply, whose result type on an
        # empty frame differs between pandas versions.
        lines = [
            f"User: {_cell(user)} Assistant: {_cell(assistant)}"
            for user, assistant in zip(df["user"], df["assistant"])
        ]

    records = []
    for sid, line in enumerate(lines):
        for tok in line.split():
            records.append({"sentence_id": sid, "token": tok, "label": "O"})

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(records, columns=["sentence_id", "token", "label"])
19
 
20
+ # ───────────────────────── callbacks ────────────────────────────
21
def load_data(file):
    """Read the uploaded CSV, tokenize it, and refresh the UI.

    Args:
        file: Value of the upload component. May be ``None`` when nothing has
            been uploaded, a path string, or an object exposing the path as
            ``.name``.

    Returns:
        A 4-tuple in the order of the outputs bound to the Load button:
        token-table update (or ``None`` on failure), status message,
        action-row visibility update, and tokens-download visibility update.
    """
    global token_df

    if file is None:
        # Nothing uploaded: report it instead of failing on ``file.name``.
        return (None, "❌ Please upload a CSV file first.",
                gr.update(visible=False), gr.update(visible=False))

    # Accept both a plain path and a file-like wrapper carrying ``.name``.
    csv_path = getattr(file, "name", file)
    try:
        df = pd.read_csv(csv_path)
    except (OSError, ValueError) as err:
        # pandas' empty-data/parser errors and decode errors are ValueErrors.
        return (None, f"❌ Could not read CSV: {err}",
                gr.update(visible=False), gr.update(visible=False))

    has_text = "text" in df.columns
    has_dialog = {"user", "assistant"}.issubset(df.columns)
    if not (has_text or has_dialog):
        return (None, "❌ CSV must have `text` **or** `user`+`assistant` columns.",
                gr.update(visible=False), gr.update(visible=False))

    token_df = tokenize_df(df)

    return (
        # Show the table with a row count matching the loaded tokens.
        gr.update(value=token_df.values.tolist(),  # list-of-lists
                  row_count=len(token_df),
                  visible=True),
        f"✅ Loaded {len(df)} rows → {len(token_df)} tokens.",
        gr.update(visible=True),   # show action row
        gr.update(visible=False),  # hide downloads until first export
    )
40
+
41
def save_edits(table_data):
    """Replace the in-memory token table with the edited grid contents.

    Args:
        table_data: Current rows of the token table, in the column order
            ``sentence_id``, ``token``, ``label`` (anything the
            ``pd.DataFrame`` constructor accepts).

    Returns:
        Status message shown in the UI.
    """
    global token_df
    token_df = pd.DataFrame(table_data, columns=["sentence_id", "token", "label"])
    return "💾 Edits saved."
45
+
46
def make_tokens_csv():
    """Dump the current token table to ``raw_tokens.csv`` and return its path."""
    csv_path = Path("raw_tokens.csv")
    token_df.to_csv(str(csv_path), index=False)
    return csv_path
50
+
51
+ def make_iob_csv():
52
+ iob_tags, prev = [], {}
53
+ for _, r in token_df.iterrows():
54
+ sid, lbl = r["sentence_id"], r["label"]
55
+ if lbl == "O":
56
+ iob_tags.append("O")
57
+ prev[sid] = None
58
+ else:
59
+ prefix = "I-" if prev.get(sid) == lbl else "B-"
60
+ iob_tags.append(prefix + lbl)
61
+ prev[sid] = lbl
62
+ out = token_df.copy()
63
+ out["iob"] = iob_tags
64
+ path = "ner_iob.csv"
65
+ out.to_csv(path, index=False)
66
+ return Path(path)
67
+
68
+ # ───────────────────────── UI ───────────────────────────────────
69
with gr.Blocks() as demo:
    gr.Markdown("# 🏷️ Label It! Mini-NER")

    gr.Markdown(
        "**Step 1** – Upload a CSV containing either a `text` column or `user` + `assistant` columns."
    )

    with gr.Row():
        csv_file = gr.File(label="📁 Upload CSV", file_types=[".csv"])
        load_btn = gr.Button("Load")

    status = gr.Textbox(label="Status", interactive=False)

    # Token table (hidden until data loaded). Marked editable explicitly:
    # labels are edited in place, and the table is also an event output.
    tok_table = gr.Dataframe(
        headers=["sentence_id", "token", "label"],
        datatype=["number", "str", "str"],
        row_count=0, col_count=3,
        interactive=True,
        visible=False,
    )

    # Buttons row (hidden until loaded)
    with gr.Row(visible=False) as action_row:
        save_btn = gr.Button("💾 Save")
        dl_tok_btn = gr.Button("⬇︎ Download Tokens CSV")
        dl_iob_btn = gr.Button("⬇︎ Download IOB CSV")

    # Download files: created hidden, revealed by the export callbacks below.
    dl_tokens_file = gr.File(label="Tokens CSV", visible=False)
    dl_iob_file = gr.File(label="IOB CSV", visible=False)

    # Bind events
    load_btn.click(
        load_data,
        inputs=csv_file,
        outputs=[tok_table, status, action_row, dl_tokens_file],
    ).then(
        # load_data only hides the tokens download; hide the IOB one too so a
        # stale export from the previous dataset is not left on screen.
        lambda: gr.update(visible=False),
        outputs=dl_iob_file,
    )

    save_btn.click(save_edits, inputs=tok_table, outputs=status)

    # Returning only a path would set the value but leave the component
    # hidden, so each export also flips visibility on.
    dl_tok_btn.click(
        lambda: gr.update(value=str(make_tokens_csv()), visible=True),
        outputs=dl_tokens_file,
    )
    dl_iob_btn.click(
        lambda: gr.update(value=str(make_iob_csv()), visible=True),
        outputs=dl_iob_file,
    )

    gr.Markdown(
        "**Step 2** – Edit the `label` column (`PER`, `ORG`, `LOC`, or `O`) → click **Save** → export."
    )

demo.launch()