File size: 4,949 Bytes
f3b49b2
 
6b99f03
4455f2c
f3b49b2
11b95d7
44cb2f4
4455f2c
11b95d7
 
 
 
359816c
47e0f7e
 
11b95d7
47e0f7e
4455f2c
4a75b2d
11b95d7
efeca40
359816c
66bd7e3
f3b49b2
11b95d7
d1f4849
f3b49b2
11b95d7
efeca40
11b95d7
 
 
 
4a75b2d
47e0f7e
11b95d7
47e0f7e
 
11b95d7
 
47e0f7e
4a75b2d
 
f3b49b2
359816c
11b95d7
 
4a75b2d
4455f2c
47e0f7e
 
11b95d7
4a75b2d
4455f2c
47e0f7e
efeca40
359816c
efeca40
47e0f7e
efeca40
11b95d7
4a75b2d
47e0f7e
 
11b95d7
4a75b2d
4455f2c
 
47e0f7e
4a75b2d
 
47e0f7e
4a75b2d
47e0f7e
 
11b95d7
4a75b2d
 
4455f2c
47e0f7e
4a75b2d
11b95d7
6b99f03
f3b49b2
4455f2c
359816c
11b95d7
f3b49b2
11b95d7
 
f3b49b2
47e0f7e
4a75b2d
359816c
11b95d7
 
 
 
 
 
 
47e0f7e
 
 
 
11b95d7
47e0f7e
11b95d7
 
47e0f7e
 
11b95d7
 
4a75b2d
11b95d7
47e0f7e
 
 
6b99f03
44cb2f4
f3b49b2
6b99f03
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import gradio as gr
import pandas as pd
from pathlib import Path
from huggingface_hub import HfApi, Repository

# Allowed tags
LABELS = {"PER", "ORG", "LOC", "O"}

token_df = pd.DataFrame()  # global store

# ───────────────────────── token explode ───────────────────────
def explode(df: pd.DataFrame) -> pd.DataFrame:
    """Split every example into whitespace tokens, one row per token.

    Accepts either a `text` column or a `user`/`assistant` dialog pair and
    returns DataFrame(example_id, token, label) with every label preset to "O".

    The column list is passed to the DataFrame constructor explicitly so that
    an empty input still yields the expected three-column schema instead of a
    column-less frame (which would break downstream column access).
    """
    if "text" in df.columns:
        lines = df["text"].astype(str)
    else:  # user / assistant dialogs
        lines = df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)

    rows = [
        {"example_id": sid, "token": tok, "label": "O"}
        for sid, line in enumerate(lines)   # sequential unique ids 0,1,2,...
        for tok in line.split()
    ]
    return pd.DataFrame(rows, columns=["example_id", "token", "label"])

# ───────────────────────── callbacks ───────────────────────────
def load_csv(file):
    """Validate an uploaded CSV, tokenize it, and refresh the UI widgets.

    Returns updates for (tok_table, status, buttons row, tokens link, IOB link).
    On an invalid schema the table is cleared and the action widgets stay hidden.
    """
    global token_df

    frame = pd.read_csv(file.name)
    has_text = "text" in frame.columns
    has_dialog = {"user", "assistant"}.issubset(frame.columns)
    if not (has_text or has_dialog):
        msg = "❌ CSV needs a `text` column **or** both `user` and `assistant` columns."
        hidden = gr.update(visible=False)
        return None, msg, hidden, hidden, hidden

    token_df = explode(frame)

    table_update = gr.update(value=token_df, visible=True, row_count=len(token_df))
    summary = f"βœ… {len(frame)} rows β†’ {len(token_df)} tokens."
    return (table_update,
            summary,
            gr.update(visible=True),   # reveal the action buttons
            gr.update(visible=False),  # hide any stale tokens download
            gr.update(visible=False))  # hide any stale IOB download

def save_table(tbl):
    """Persist the edited grid into the global token_df; warn about unknown labels."""
    global token_df
    token_df = pd.DataFrame(tbl, columns=["example_id", "token", "label"])
    unknown = token_df.loc[~token_df["label"].isin(LABELS), "label"].unique()
    if unknown.size:
        return f"⚠️ Unknown label(s): {', '.join(unknown)}"
    return "πŸ’Ύ Saved."

def export_tokens():
    """Write the current token table to raw_tokens.csv and reveal the download link."""
    out_name = "raw_tokens.csv"
    token_df.to_csv(out_name, index=False)
    return gr.update(value=out_name, visible=True)

def export_iob():
    """Write token_df plus an IOB-scheme tag column to ner_iob.csv.

    "O" stays "O" and resets the run; an entity label gets "I-" when it
    continues the previous label within the same example, otherwise "B-".
    """
    tags = []
    last_label = {}  # example_id -> most recent non-"O" label (None after an "O")
    for _, row in token_df.iterrows():
        ex, label = row["example_id"], row["label"]
        if label == "O":
            tags.append("O")
            last_label[ex] = None
        else:
            prefix = "I-" if last_label.get(ex) == label else "B-"
            tags.append(prefix + label)
            last_label[ex] = label

    tagged = token_df.copy()
    tagged["iob"] = tags
    out_name = "ner_iob.csv"
    tagged.to_csv(out_name, index=False)
    return gr.update(value=out_name, visible=True)

def push_to_hub(repo_id, token):
    """Upload the annotated token table to a Hugging Face dataset repo.

    Parameters:
        repo_id: dataset repo id, e.g. "user/my-dataset".
        token:   HF access token with write permission.

    Returns a status string: the dataset URL on success, the error text on failure.
    """
    try:
        HfApi().create_repo(repo_id, token=token, repo_type="dataset", exist_ok=True)
        local = Path(repo_id.replace("/", "_"))
        # Remove any stale clone before re-cloning. The previous
        # iterdir()/unlink() loop raised on sub-directories — an earlier clone
        # always contains .git/ — so every push after the first one failed.
        if local.exists():
            shutil.rmtree(local)
        repo = Repository(str(local), clone_from=repo_id,
                          repo_type="dataset", use_auth_token=token)
        token_df.to_csv(local / "data.csv", index=False)
        repo.push_to_hub("Add annotated NER data")
        return f"πŸš€ https://huggingface.co/datasets/{repo_id}"
    except Exception as e:
        # Surface the failure in the UI instead of crashing the app.
        return f"❌ {e}"

# ───────────────────────── UI ──────────────────────────────────
# Gradio Blocks app: upload a CSV, edit per-token NER labels, export or push.
with gr.Blocks() as demo:
    gr.Markdown("# 🏷️ Label It! Mini-NER")

    gr.Markdown("**Step 1** – upload CSV (columns: `text` **or** `user`+`assistant`).")

    with gr.Row():
        csv_file = gr.File(file_types=[".csv"])  # upload widget (CSV only)
        load_btn = gr.Button("Load")

    # Shared one-line feedback area written by load_csv and save_table.
    status = gr.Textbox(interactive=False)

    # Editable token grid; hidden until a CSV loads successfully.
    tok_table = gr.Dataframe(headers=["example_id", "token", "label"],
                             datatype=["number", "str", "str"],
                             visible=False)

    # Action buttons revealed as a group after a successful load.
    with gr.Row(visible=False) as buttons:
        save_btn = gr.Button("πŸ’Ύ Save")
        tok_btn  = gr.Button("β¬‡οΈŽ Tokens CSV")
        iob_btn  = gr.Button("β¬‡οΈŽ IOB CSV")

    # Download targets; the export callbacks flip them visible once written.
    file_tok = gr.File(visible=False)
    file_iob = gr.File(visible=False)

    with gr.Accordion("πŸ“¦ Push to Hugging Face Hub", open=False, visible=False) as acc:
        repo_in, token_in = gr.Textbox(label="repo"), gr.Textbox(label="token", type="password")
        push_btn   = gr.Button("Push")
        push_out   = gr.Textbox(interactive=False)

    # wiring
    # NOTE(review): load_btn has two independent click handlers; the second one
    # reveals the Hub accordion even when load_csv rejects the CSV — confirm intended.
    load_btn.click(load_csv, csv_file, [tok_table, status, buttons, file_tok, file_iob])
    load_btn.click(lambda: gr.update(visible=True), None, acc)

    save_btn.click(save_table, tok_table, status)
    tok_btn.click(export_tokens, outputs=file_tok)
    iob_btn.click(export_iob,  outputs=file_iob)
    push_btn.click(push_to_hub, [repo_in, token_in], push_out)

    gr.Markdown("**Step 2** – label tokens (`PER`, `ORG`, `LOC`, `O`) ➜ Save ➜ Download / Push.")

demo.launch()