# Label It! Mini-NER — Gradio Space app.
# (Scrape artifacts removed: Spaces status banner, file-size line,
#  per-line commit-hash blame gutter, and line-number gutter were
#  file-viewer chrome, not part of the source file.)
import shutil
from pathlib import Path

import gradio as gr
import pandas as pd
from huggingface_hub import HfApi, Repository
# Allowed entity tags; anything outside this set is flagged by save_table.
LABELS = {"PER", "ORG", "LOC", "O"}
token_df = pd.DataFrame()  # global store: exploded (example_id, token, label) table shared by all callbacks
# βββββββββββββββββββββββββ token explode βββββββββββββββββββββββ
def explode(df: pd.DataFrame) -> pd.DataFrame:
"""Return DataFrame(example_id, token, label='O')."""
if "text" in df.columns:
lines = df["text"].astype(str)
else: # user / assistant dialogs
lines = df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)
rows = []
for sid, line in enumerate(lines, start=0): # ensure unique 0,1,2,...
for tok in line.split():
rows.append({"example_id": sid, "token": tok, "label": "O"})
return pd.DataFrame(rows)
# βββββββββββββββββββββββββ callbacks βββββββββββββββββββββββββββ
def load_csv(file):
    """Load the uploaded CSV, explode it into tokens, and reveal the editor.

    Returns five updates matching [tok_table, status, buttons, file_tok,
    file_iob]: the populated token grid, a status message, visibility for
    the button row, and hidden states for both download links.
    """
    global token_df
    df = pd.read_csv(file.name)
    # Accept a single `text` column OR a user/assistant dialog pair.
    valid = ("text" in df.columns) or ({"user", "assistant"}.issubset(df.columns))
    if not valid:
        msg = "β CSV needs a `text` column **or** both `user` and `assistant` columns."
        # Keep table/buttons/links hidden (three hidden updates) on bad input.
        return None, msg, *(gr.update(visible=False),) * 3
    token_df = explode(df)
    return (
        gr.update(value=token_df, visible=True, row_count=len(token_df)),
        # BUG FIX: this f-string literal was broken across two physical
        # lines (a SyntaxError); rejoined into a single literal.
        f"β {len(df)} rows β {len(token_df)} tokens.",
        gr.update(visible=True),   # show the Save / export button row
        gr.update(visible=False),  # reset tokens download link
        gr.update(visible=False),  # reset IOB download link
    )
def save_table(tbl):
    """Persist the edited grid back into the global token_df.

    Rebuilds token_df from the Dataframe component's value and reports any
    labels outside the allowed LABELS set.
    """
    global token_df
    token_df = pd.DataFrame(tbl, columns=["example_id", "token", "label"])
    unknown = token_df.loc[~token_df["label"].isin(LABELS), "label"].unique()
    if unknown.size == 0:
        return "πΎ Saved."
    return f"β οΈ Unknown label(s): {', '.join(unknown)}"
def export_tokens():
    """Write the current token table to raw_tokens.csv and show its link."""
    out_path = "raw_tokens.csv"
    token_df.to_csv(out_path, index=False)
    return gr.update(value=out_path, visible=True)
def export_iob():
    """Convert flat labels to IOB tags and write ner_iob.csv.

    A token gets "B-<label>" when it starts a span (previous token in the
    same example had a different label or was "O") and "I-<label>" when it
    continues one; "O" tokens stay "O".
    """
    tags = []
    last_label = {}  # per-example_id: label of the previous token, None after an "O"
    for row in token_df.itertuples(index=False):
        sid = row.example_id
        label = row.label
        if label == "O":
            tags.append("O")
            last_label[sid] = None
        else:
            prefix = "I-" if last_label.get(sid) == label else "B-"
            tags.append(prefix + label)
            last_label[sid] = label
    annotated = token_df.copy()
    annotated["iob"] = tags
    out_path = "ner_iob.csv"
    annotated.to_csv(out_path, index=False)
    return gr.update(value=out_path, visible=True)
def push_to_hub(repo_id, token):
    """Create/clone a HF dataset repo, add the annotated data, and push.

    Returns a status string for the UI (a link on success, "β <error>" on
    failure) — never raises, so the Gradio callback always gets a message.
    """
    try:
        HfApi().create_repo(repo_id, token=token, repo_type="dataset", exist_ok=True)
        local = Path(repo_id.replace("/", "_"))
        # BUG FIX: the previous cleanup unlinked entries one by one, which
        # raised IsADirectoryError on the `.git` directory left behind by an
        # earlier Repository clone (and on any subdirectory), so every
        # re-push failed. rmtree removes the whole tree.
        if local.exists():
            shutil.rmtree(local)
        # NOTE(review): `Repository` is deprecated in recent huggingface_hub
        # releases in favour of HfApi.upload_file — consider migrating.
        repo = Repository(str(local), clone_from=repo_id,
                          repo_type="dataset", use_auth_token=token)
        token_df.to_csv(local / "data.csv", index=False)
        repo.push_to_hub("Add annotated NER data")
        return f"π https://huggingface.co/datasets/{repo_id}"
    except Exception as e:  # surfaced in the UI textbox rather than crashing
        return f"β {e}"
# βββββββββββββββββββββββββ UI ββββββββββββββββββββββββββββββββββ
# βββββββββββββββββββββββββ UI ββββββββββββββββββββββββββββββββββ
# Component creation order and event wiring are order-sensitive, so the
# layout is documented in place rather than restructured.
with gr.Blocks() as demo:
    gr.Markdown("# π·οΈ Label It! Mini-NER")
    gr.Markdown("**Step 1** β upload CSV (columns: `text` **or** `user`+`assistant`).")
    with gr.Row():
        csv_file = gr.File(file_types=[".csv"])
        load_btn = gr.Button("Load")
    status = gr.Textbox(interactive=False)  # status / validation messages
    # Editable token grid; hidden until a CSV is loaded successfully.
    tok_table = gr.Dataframe(headers=["example_id", "token", "label"],
                             datatype=["number", "str", "str"],
                             visible=False)
    with gr.Row(visible=False) as buttons:  # revealed by load_csv on success
        save_btn = gr.Button("πΎ Save")
        tok_btn = gr.Button("β¬οΈ Tokens CSV")
        iob_btn = gr.Button("β¬οΈ IOB CSV")
    file_tok = gr.File(visible=False)  # download link: raw tokens export
    file_iob = gr.File(visible=False)  # download link: IOB export
    with gr.Accordion("π¦ Push to Hugging Face Hub", open=False, visible=False) as acc:
        repo_in, token_in = gr.Textbox(label="repo"), gr.Textbox(label="token", type="password")
        push_btn = gr.Button("Push")
        push_out = gr.Textbox(interactive=False)  # push result / error message
    # wiring
    load_btn.click(load_csv, csv_file, [tok_table, status, buttons, file_tok, file_iob])
    # NOTE(review): this second handler reveals the Hub accordion on every
    # Load click, even when load_csv rejects the CSV — confirm intentional.
    load_btn.click(lambda: gr.update(visible=True), None, acc)
    save_btn.click(save_table, tok_table, status)
    tok_btn.click(export_tokens, outputs=file_tok)
    iob_btn.click(export_iob, outputs=file_iob)
    push_btn.click(push_to_hub, [repo_in, token_in], push_out)
    gr.Markdown("**Step 2** β label tokens (`PER`, `ORG`, `LOC`, `O`) β Save β Download / Push.")
demo.launch()