File size: 10,689 Bytes
862b801
 
 
 
 
 
 
 
 
2920ba0
 
 
 
862b801
 
 
 
2920ba0
945e8ce
862b801
2920ba0
 
 
945e8ce
2920ba0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45036b5
2920ba0
45036b5
2920ba0
 
 
 
 
945e8ce
 
 
2920ba0
 
 
 
 
 
862b801
945e8ce
2920ba0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
945e8ce
 
 
862b801
 
 
 
 
 
 
 
 
 
 
 
 
945e8ce
862b801
 
 
945e8ce
862b801
 
 
 
945e8ce
 
 
 
862b801
 
 
945e8ce
862b801
 
 
945e8ce
862b801
 
 
 
 
945e8ce
862b801
 
 
945e8ce
 
 
 
862b801
 
945e8ce
862b801
 
 
945e8ce
862b801
 
 
 
 
 
945e8ce
862b801
 
945e8ce
862b801
 
945e8ce
862b801
 
 
 
945e8ce
862b801
 
945e8ce
862b801
 
 
 
 
 
 
 
 
 
945e8ce
862b801
 
 
 
 
 
 
 
 
 
 
 
 
945e8ce
862b801
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
945e8ce
862b801
 
 
945e8ce
862b801
 
 
945e8ce
862b801
 
 
945e8ce
862b801
 
 
 
 
 
 
 
945e8ce
862b801
 
 
 
 
 
 
 
 
 
 
 
945e8ce
862b801
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
945e8ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
862b801
 
 
 
 
945e8ce
862b801
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
import gradio as gr
import csv
import uuid
import threading
from datasets import load_dataset
from collections import OrderedDict
import random
import time
from huggingface_hub import CommitScheduler, HfApi, snapshot_download
import os
import shutil
from pathlib import Path
import pandas as pd

# Hub client; indexing os.environ raises KeyError when HF_TOKEN is not set.
api = HfApi(token=os.environ["HF_TOKEN"])

# Dataset whose rows are shown to annotators, and the dataset repo that the
# local ./data folder is committed to.
DATASET_NAME = "taesiri/HumanHandsDataset"
BACKUP_REPO = "taesiri/HumanHandsDatasetFingerCounts"

# Make sure the local results directory exists before anything writes to it.
Path("./data").mkdir(parents=True, exist_ok=True)


def _read_results_csv(csv_path):
    """Load *csv_path* into a DataFrame, tolerating a missing or zero-byte file.

    Returns an empty, column-less DataFrame in both of those cases.
    """
    if not csv_path.exists():
        return pd.DataFrame()
    try:
        return pd.read_csv(csv_path)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header to parse; treat it as "no rows".
        return pd.DataFrame()


def sync_with_hub():
    """
    Synchronize local data with the hub by downloading latest dataset

    Steps:
      1. Copy the current ./data folder to ./data_backup (replacing any
         previous backup).
      2. Download a snapshot of BACKUP_REPO into ./hub_data.
      3. Merge the snapshot's "data" folder into ./data: sub-directories are
         copied only when they do not exist locally, and
         finger_count_results.csv is merged row-wise with exact duplicates
         dropped (local rows first). Other files are ignored.
      4. Remove ./hub_data, even if the download or merge failed.

    Any exception from the download or the merge propagates to the caller.
    """
    print("Starting sync with hub...")
    data_dir = Path("./data")
    if data_dir.exists():
        # Backup existing data
        backup_dir = Path("./data_backup")
        if backup_dir.exists():
            shutil.rmtree(backup_dir)
        shutil.copytree(data_dir, backup_dir)

    download_dir = Path("hub_data")
    try:
        # Download latest data from hub
        repo_path = snapshot_download(
            repo_id=BACKUP_REPO, repo_type="dataset", local_dir="hub_data"
        )

        # Merge hub data with local data
        hub_data_dir = Path(repo_path) / "data"
        if hub_data_dir.exists():
            os.makedirs(data_dir, exist_ok=True)
            for item in hub_data_dir.glob("*"):
                if item.is_dir():
                    dest = data_dir / item.name
                    if not dest.exists():
                        shutil.copytree(item, dest)
                elif item.name == "finger_count_results.csv":
                    local_csv_path = data_dir / "finger_count_results.csv"
                    local_csv = _read_results_csv(local_csv_path)
                    hub_csv = _read_results_csv(item)
                    # Only frames that actually carry a header take part in
                    # the merge; if neither does there is nothing to write,
                    # and writing would leave a bogus header line behind.
                    frames = [
                        frame
                        for frame in (local_csv, hub_csv)
                        if len(frame.columns) > 0
                    ]
                    if frames:
                        merged_csv = pd.concat(
                            frames, ignore_index=True
                        ).drop_duplicates()
                        merged_csv.to_csv(local_csv_path, index=False)
    finally:
        # Clean up downloaded repo, including after a failed download/merge
        if download_dir.exists():
            shutil.rmtree(download_dir)
    print("Finished syncing with hub!")


# Set up commit scheduler: periodically commits the local ./data folder to
# the "data" path of BACKUP_REPO.
# NOTE(review): `every` is expressed in minutes per the huggingface_hub
# CommitScheduler docs — confirm that a 1-minute cadence is intended.
# NOTE(review): the scheduler is constructed before sync_with_hub() runs;
# keep this order unless it is confirmed that the download below does not
# rely on the scheduler having set up the repo first.
scheduler = CommitScheduler(
    repo_id=BACKUP_REPO,
    repo_type="dataset",
    folder_path="./data",
    path_in_repo="data",
    every=1,
)

# Sync with hub before starting, so previously stored results are merged
# into ./data before the CSV is read further down.
sync_with_hub()

# Results CSV lives inside the data directory that the scheduler commits
RESULT_CSV = "./data/finger_count_results.csv"

# Load the dataset (rows are read by index in get_record)
ds = load_dataset(DATASET_NAME, split="train")
# Second load restricted to the "uuid" column, turned into a DataFrame that
# load_annotated_samples uses to translate stored UUIDs back to row indices
uuid_df = load_dataset(DATASET_NAME, split="train", columns=["uuid"])
uuid_df = pd.DataFrame(uuid_df)

# A thread lock to avoid concurrent writes to RESULT_CSV
# NOTE(review): this lock is separate from the scheduler's own lock —
# confirm a scheduled commit cannot pick up a half-written row.
write_lock = threading.Lock()

# Set to store annotated sample indices (dataset row positions, not UUIDs)
annotated_samples = set()

# OrderedDict to act as a TTL cache for in-progress samples
# Format: {index: (timestamp, session_id)}
# Insertion order doubles as age order: the oldest entry is always first.
in_progress_samples = OrderedDict()
IN_PROGRESS_TTL = 300  # 5 minutes in seconds
MAX_IN_PROGRESS = 1000  # Maximum number of in-progress samples to track


# Load previously annotated samples from CSV
def load_annotated_samples():
    """Populate ``annotated_samples`` from the rows already stored in RESULT_CSV.

    Each data row's second column is a sample UUID. It is translated to the
    first matching row label in ``uuid_df`` and added to ``annotated_samples``.
    UUIDs that are not in ``uuid_df`` are ignored, as are rows with fewer
    than two columns. A missing or empty file leaves the set unchanged.
    """
    # Build the UUID -> first index mapping once instead of scanning the
    # whole column for every CSV row.
    first_index_by_uuid = {}
    for idx, value in zip(uuid_df.index.tolist(), uuid_df["uuid"].tolist()):
        first_index_by_uuid.setdefault(value, idx)

    try:
        with open(RESULT_CSV, "r", newline="", encoding="utf-8") as f:
            reader = csv.reader(f)
            next(reader, None)  # Skip header (tolerates a zero-byte file)
            for row in reader:
                if len(row) < 2:
                    # Blank or truncated line: no UUID column to read.
                    continue
                idx = first_index_by_uuid.get(row[1])
                if idx is not None:
                    annotated_samples.add(idx)
    except FileNotFoundError:
        # Opening for read raises FileNotFoundError (not FileExistsError)
        # when there is no results file yet; nothing to load in that case.
        pass


# Create the results CSV with its header row on first run; when the file is
# already there, rebuild the set of annotated samples from it instead.
with write_lock:
    try:
        # Mode "x" fails with FileExistsError rather than truncating.
        f = open(RESULT_CSV, "x", newline="", encoding="utf-8")
    except FileExistsError:
        load_annotated_samples()
    else:
        with f:
            csv.writer(f).writerow(["session_id", "uuid", "prompt", "choice"])


def cleanup_in_progress():
    """Remove expired in-progress samples

    Entries of ``in_progress_samples`` are dropped from the front (oldest
    insertion first) for as long as the front entry's timestamp is more than
    IN_PROGRESS_TTL seconds old. The loop stops at the first entry that is
    still fresh.
    """
    cutoff = time.time() - IN_PROGRESS_TTL
    while in_progress_samples:
        # Peek at the oldest entry without copying the whole mapping.
        oldest_timestamp, _ = next(iter(in_progress_samples.values()))
        if oldest_timestamp >= cutoff:
            break
        in_progress_samples.popitem(last=False)


def get_random_sample(session_id):
    """Reserve and return a random dataset index that is still free.

    "Free" means the index is neither in ``annotated_samples`` nor currently
    listed in ``in_progress_samples`` (after expired entries are purged).
    The chosen index is recorded in ``in_progress_samples`` together with
    the current time and *session_id*. Returns None when nothing is free.
    """
    cleanup_in_progress()

    # Everything already labelled or currently handed out to someone.
    taken = annotated_samples.union(in_progress_samples.keys())
    candidates = list(set(range(len(ds))) - taken)
    if not candidates:
        return None

    chosen = random.choice(candidates)

    # Keep the tracking table bounded: drop the oldest reservation when full.
    if len(in_progress_samples) >= MAX_IN_PROGRESS:
        in_progress_samples.popitem(last=False)
    in_progress_samples[chosen] = (time.time(), session_id)

    return chosen


def get_record(index):
    """
    Look up row *index* of the dataset and return a 3-tuple of its
    "image", "prompt" and "uuid" fields, in that order.
    """
    row = ds[index]
    return tuple(row[field] for field in ("image", "prompt", "uuid"))


def update_session(choice, session_id, index):
    """
    This function is called whenever a user presses a button.
    - Appends the user's choice for the sample at *index* to the CSV file.
    - Marks that sample as annotated and releases its in-progress slot.
    - Picks a new random, still-unlabelled sample for the session.
    - Returns (image, prompt, new index, UUID text) for the UI.
    - If out of images, returns a "Done" placeholder whose index is None.

    *index* may be None: that is the value this function hands back once the
    images run out, so a further button press arrives here with it. Nothing
    is written in that case and the placeholder is returned again.
    """
    if index is None:
        return (None, "No more images to label. Thank you!", None, "")

    # Get the current record (the image itself is not needed for the CSV row)
    _, prompt, record_uuid = get_record(index)

    # Write to CSV
    with write_lock:
        with open(RESULT_CSV, "a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow([session_id, record_uuid, prompt, choice])

    # Add to annotated samples and remove from in-progress
    annotated_samples.add(index)
    in_progress_samples.pop(index, None)

    # Get next random sample
    new_index = get_random_sample(session_id)
    if new_index is None:
        return (None, "No more images to label. Thank you!", new_index, "")

    # Get the next record
    next_image, next_prompt, next_uuid = get_record(new_index)

    return (next_image, next_prompt, new_index, f"UUID: {next_uuid}")


# Create a Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Human Hands Finger Counting App")

    # Per-session state. Both values are filled in by start_app on page load:
    # - session_id starts empty because an id computed here would be
    #   evaluated once at import time and shared by every visitor.
    # - current_index starts as None ("no sample assigned yet"); 0 cannot be
    #   the sentinel because it is a valid dataset index.
    session_id = gr.State("")
    current_index = gr.State(None)

    image_display = gr.Image(type="pil", label="Image to Review")
    prompt_display = gr.Markdown()
    uuid_display = gr.Markdown()  # Add UUID display

    # Initialize with the first record
    def start_app(session_id, index):
        """Assign a session id and a first sample, and return what to display.

        Returns (image, prompt, UUID text, index, session_id). The index and
        session id are written back to the session state so that the first
        button press is recorded against the sample actually on screen.
        """
        if not session_id:
            session_id = str(uuid.uuid4())
        if index is None:  # Only get random sample for new sessions
            index = get_random_sample(session_id)
            if index is None:
                return (
                    None,
                    "No more images to label. Thank you!",
                    "",
                    None,
                    session_id,
                )
        img, prompt, uuid_str = get_record(index)
        return img, prompt, f"UUID: {uuid_str}", index, session_id

    with gr.Row():
        # Buttons for finger count
        btn_three = gr.Button("Three")
        btn_four = gr.Button("Four")
        btn_five = gr.Button("Five")
        btn_six = gr.Button("Six")
        btn_seven = gr.Button("Seven")
        btn_eight = gr.Button("Eight")
        btn_nine = gr.Button("Nine")
        btn_ten = gr.Button("Ten")
        # NOTE(review): the options jump from "Ten" to "More than 11", so
        # exactly eleven has no button — confirm the intended wording. Label
        # and stored value are left unchanged to stay consistent with
        # existing results.
        btn_more = gr.Button("More than 11")
        btn_cannot = gr.Button("Cannot identify", variant="stop")  # Red background

    def make_choice_handler(handler_name, choice):
        """Build the click handler that records *choice* for the shown sample."""

        def handle_click(session_id, index):
            # No sample assigned (images exhausted, or a click that arrived
            # before the initial load finished): nothing to record.
            if index is None:
                return None, "No more images to label. Thank you!", None, ""
            return update_session(choice, session_id, index)

        # Keep the original per-choice function names.
        # NOTE(review): presumably Gradio derives default endpoint names from
        # the function name — confirm before renaming.
        handle_click.__name__ = handler_name
        return handle_click

    # (button, handler name, value written to the CSV "choice" column)
    button_choices = [
        (btn_three, "choose_three", "three"),
        (btn_four, "choose_four", "four"),
        (btn_five, "choose_five", "five"),
        (btn_six, "choose_six", "six"),
        (btn_seven, "choose_seven", "seven"),
        (btn_eight, "choose_eight", "eight"),
        (btn_nine, "choose_nine", "nine"),
        (btn_ten, "choose_ten", "ten"),
        (btn_more, "choose_more", "more_than_11"),
        (btn_cannot, "choose_cannot", "cannot_identify"),
    ]

    # Link button clicks to functions
    for button, handler_name, choice in button_choices:
        button.click(
            fn=make_choice_handler(handler_name, choice),
            inputs=[session_id, current_index],
            outputs=[image_display, prompt_display, current_index, uuid_display],
        )

    # Load the first image/prompt on launch and persist the chosen index and
    # session id into the session state
    demo.load(
        fn=start_app,
        inputs=[session_id, current_index],
        outputs=[
            image_display,
            prompt_display,
            uuid_display,
            current_index,
            session_id,
        ],
    )

demo.launch()