hysts and Wauplin committed
Commit e51b1cb (1 parent: 4578dbd)

Use parquet instead of zip


Co-authored-by: Lucain Pouget <[email protected]>

Files changed (3)
  1. app.py +30 -56
  2. requirements.txt +1 -0
  3. scheduler.py +66 -14
app.py CHANGED
@@ -12,11 +12,11 @@ from typing import Any
 import gradio as gr
 from gradio_client import Client
 
-from scheduler import ZipScheduler
+from scheduler import ParquetScheduler
 
 HF_TOKEN = os.getenv('HF_TOKEN')
 UPLOAD_REPO_ID = os.getenv('UPLOAD_REPO_ID')
-UPLOAD_FREQUENCY = int(os.getenv('UPLOAD_FREQUENCY', '5'))
+UPLOAD_FREQUENCY = int(os.getenv('UPLOAD_FREQUENCY', '15'))
 USE_PUBLIC_REPO = os.getenv('USE_PUBLIC_REPO') == '1'
 LOCAL_SAVE_DIR = pathlib.Path(os.getenv('LOCAL_SAVE_DIR', 'results'))
 LOCAL_SAVE_DIR.mkdir(parents=True, exist_ok=True)
@@ -25,45 +25,18 @@ ABOUT_THIS_SPACE = '''
 This Space is a sample Space that collects user preferences for the results generated by a diffusion model.
 This demo calls the [stable diffusion Space](https://huggingface.co/spaces/stabilityai/stable-diffusion) with the [`gradio_client`](https://pypi.org/project/gradio-client/) library.
 
-The user preference data is periodically zipped and uploaded to [this dataset repo](https://huggingface.co/datasets/hysts-samples/sample-user-preferences).
-The directory structure of the zipped data is as follows:
-```
-results
-├── 11e11b01-3388-48b3-a2ab-1b58d156e466
-│   ├── 000.jpg
-│   ├── 001.jpg
-│   ├── 002.jpg
-│   ├── 003.jpg
-│   └── preferences.json
-├── 1470ec1d-67a1-47ae-ab9c-4e0e0594dadf
-│   ├── 000.jpg
-│   ├── 001.jpg
-│   ├── 002.jpg
-│   ├── 003.jpg
-│   └── preferences.json
-...
-```
-Also, each `preferences.json` looks like this:
-```
-{
-    "prompt": "an astronaut riding a horse",
-    "negative_prompt": "",
-    "guidance_scale": 9,
-    "selected_index": 2,
-    "timestamp": "2023-06-15T07:57:00.097883"
-}
-```
+The user preference data is periodically archived in parquet format and uploaded to [this dataset repo](https://huggingface.co/datasets/hysts-samples/sample-user-preferences).
 
 The periodic upload is done using [`huggingface_hub.CommitScheduler`](https://huggingface.co/docs/huggingface_hub/main/en/package_reference/hf_api#huggingface_hub.CommitScheduler).
 See [this Space](https://huggingface.co/spaces/Wauplin/space_to_dataset_saver) for more general usage.
 '''
 
-scheduler = ZipScheduler(repo_id=UPLOAD_REPO_ID,
-                         repo_type='dataset',
-                         every=UPLOAD_FREQUENCY,
-                         private=not USE_PUBLIC_REPO,
-                         token=HF_TOKEN,
-                         folder_path=LOCAL_SAVE_DIR)
+scheduler = ParquetScheduler(repo_id=UPLOAD_REPO_ID,
+                             repo_type='dataset',
+                             every=UPLOAD_FREQUENCY,
+                             private=not USE_PUBLIC_REPO,
+                             token=HF_TOKEN,
+                             folder_path=LOCAL_SAVE_DIR)
 
 client = Client('stabilityai/stable-diffusion')
 
@@ -81,12 +54,11 @@ def generate(prompt: str) -> tuple[str, list[str]]:
         'negative_prompt': negative_prompt,
         'guidance_scale': guidance_scale,
     }
-    config_file = tempfile.NamedTemporaryFile(mode='w',
-                                              suffix='.json',
-                                              delete=False)
-    json.dump(config, config_file)
+    with tempfile.NamedTemporaryFile(mode='w', suffix='.json',
+                                     delete=False) as config_file:
+        json.dump(config, config_file)
 
-    with open(pathlib.Path(out_dir) / 'captions.json') as f:
+    with (pathlib.Path(out_dir) / 'captions.json').open() as f:
         paths = list(json.load(f).keys())
     return config_file.name, paths
 
@@ -100,21 +72,23 @@ def save_preference(config_path: str, gallery: list[dict[str, Any]],
     save_dir = LOCAL_SAVE_DIR / f'{uuid.uuid4()}'
     save_dir.mkdir(parents=True, exist_ok=True)
 
-    paths = [x['name'] for x in gallery]
-    with scheduler.lock:
-        for index, path in enumerate(paths):
-            ext = pathlib.Path(path).suffix
-            shutil.move(path, save_dir / f'{index:03d}{ext}')
-
-        with open(config_path) as f:
-            config = json.load(f)
-        json_path = save_dir / 'preferences.json'
-        with json_path.open('w') as f:
-            preferences = config | {
-                'selected_index': selected_index,
-                'timestamp': datetime.datetime.utcnow().isoformat(),
-            }
-            json.dump(preferences, f)
+    # Load config
+    with open(config_path) as f:
+        data = json.load(f)
+
+    # Add selected item + timestamp
+    data['selected_index'] = selected_index
+    data['timestamp'] = datetime.datetime.utcnow().isoformat()
+
+    # Copy and add images
+    for index, path in enumerate(x['name'] for x in gallery):
+        name = f'{index:03d}'
+        dst_path = save_dir / f'{name}{pathlib.Path(path).suffix}'
+        shutil.move(path, dst_path)
+        data[f'image_{name}'] = dst_path
+
+    # Send to scheduler
+    scheduler.append(data)
 
 
 def clear() -> tuple[dict, dict, dict]:
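
With this change, each saved preference becomes one row of a parquet file in the dataset repo instead of a zipped folder. A minimal sketch of reading the collected data back (hypothetical snippet, not part of this commit; it assumes the repo is readable with your token and that `datasets` picks up the features config embedded by the scheduler):

```python
from datasets import load_dataset

# Each scheduler push commits one `<uuid>.parquet` file to the repo root;
# `load_dataset` concatenates them into a single split.
ds = load_dataset('hysts-samples/sample-user-preferences', split='train')

row = ds[0]
print(row['prompt'], row['selected_index'])

# The `image_*` columns are declared as `Image` features in the parquet
# metadata, so they should come back as PIL images rather than raw bytes.
row['image_000'].save('first-preference.jpg')
```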
requirements.txt CHANGED
@@ -1,2 +1,3 @@
 git+https://github.com/huggingface/huggingface_hub@928e138
 gradio_client==0.2.7
+pyarrow==12.0.1
scheduler.py CHANGED
@@ -1,31 +1,83 @@
-import shutil
+import json
 import tempfile
 import uuid
+from pathlib import Path
+from typing import Any, Dict, List
 
+import pyarrow as pa
+import pyarrow.parquet as pq
 from huggingface_hub import CommitScheduler
 
 
-class ZipScheduler(CommitScheduler):
+class ParquetScheduler(CommitScheduler):
+    def append(self, row: Dict[str, Any]) -> None:
+        with self.lock:
+            if not hasattr(self, 'rows') or self.rows is None:  # type: ignore
+                self.rows = []
+            self.rows.append(row)
+
     def push_to_hub(self):
+        # Check for new rows to push
         with self.lock:
-            if not any(self.folder_path.iterdir()):
-                return
-            archive_file = tempfile.NamedTemporaryFile(suffix='.zip')
-            archive_name = archive_file.name.split('.')[
-                0]  # `make_archive` automatically appends `.zip`
-            shutil.make_archive(base_name=archive_name,
-                                format='zip',
-                                root_dir=self.folder_path.parent,
-                                base_dir=self.folder_path.name)
-            shutil.rmtree(self.folder_path, ignore_errors=True)
-            self.folder_path.mkdir(parents=True, exist_ok=True)
+            rows = self.rows
+            self.rows = None
+        if not rows:
+            return
+
+        # Load images + create 'features' config for datasets library
+        hf_features: dict[str, Dict] = {}
+        path_to_cleanup: List[Path] = []
+        for row in rows:
+            for key, value in row.items():
+                if 'image' in key:
+                    # It's an image: we load the bytes, define a special schema and remember to clean up the file
+                    # Note: could do the same with "Audio"
+                    image_path = Path(value)
+                    if image_path.is_file():
+                        row[key] = {
+                            'path': image_path.name,
+                            'bytes': image_path.read_bytes()
+                        }
+                        path_to_cleanup.append(image_path)
+                    if key not in hf_features:
+                        hf_features[key] = {'_type': 'Image'}
+                else:
+                    # Otherwise, do nothing special
+                    if key not in hf_features:
+                        hf_features[key] = {
+                            '_type': 'Value',
+                            'dtype': 'string'
+                        }
+
+        # Complete rows if needed
+        for row in rows:
+            for feature in hf_features:
+                if feature not in row:
+                    row[feature] = None
+
+        # Export items to Arrow format
+        table = pa.Table.from_pylist(rows)
+
+        # Add metadata (used by datasets library)
+        table = table.replace_schema_metadata(
+            {'huggingface': json.dumps({'info': {
+                'features': hf_features
+            }})})
+
+        # Write to parquet file
+        archive_file = tempfile.NamedTemporaryFile()
+        pq.write_table(table, archive_file.name)
 
+        # Upload
         self.api.upload_file(
             repo_id=self.repo_id,
             repo_type=self.repo_type,
             revision=self.revision,
-            path_in_repo=f'{uuid.uuid4()}.zip',
+            path_in_repo=f'{uuid.uuid4()}.parquet',
             path_or_fileobj=archive_file.name,
         )
 
+        # Cleanup
         archive_file.close()
+        for path in path_to_cleanup:
+            path.unlink(missing_ok=True)
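
For context, the schema metadata written in `push_to_hub` is what lets the `datasets` library reconstruct the column types. A quick local round trip (hypothetical snippet with a made-up row, not part of this commit) shows the embedded features config surviving the parquet write:

```python
import json
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

# A made-up row and features config mirroring what the scheduler builds.
rows = [{'prompt': 'an astronaut riding a horse', 'selected_index': '2'}]
features = {
    'prompt': {'_type': 'Value', 'dtype': 'string'},
    'selected_index': {'_type': 'Value', 'dtype': 'string'},
}

table = pa.Table.from_pylist(rows)
table = table.replace_schema_metadata(
    {'huggingface': json.dumps({'info': {'features': features}})})

with tempfile.NamedTemporaryFile(suffix='.parquet') as f:
    pq.write_table(table, f.name)
    # Reading the file back recovers the features config that `datasets`
    # uses to type the columns (e.g. decoding `Image` columns to PIL).
    meta = pq.read_table(f.name).schema.metadata
    print(json.loads(meta[b'huggingface']))
```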