hysts (HF Staff) committed
Commit 1bb0264 · 1 Parent(s): e820b78
Files changed (2):
  1. app.py +4 -15
  2. scheduler.py +78 -34
app.py CHANGED
@@ -4,9 +4,7 @@ import datetime
 import json
 import os
 import pathlib
-import shutil
 import tempfile
-import uuid
 from typing import Any
 
 import gradio as gr
@@ -15,11 +13,9 @@ from gradio_client import Client
 from scheduler import ParquetScheduler
 
 HF_TOKEN = os.getenv('HF_TOKEN')
-UPLOAD_REPO_ID = os.getenv('UPLOAD_REPO_ID')
+UPLOAD_REPO_ID = os.environ['UPLOAD_REPO_ID']
 UPLOAD_FREQUENCY = int(os.getenv('UPLOAD_FREQUENCY', '15'))
 USE_PUBLIC_REPO = os.getenv('USE_PUBLIC_REPO') == '1'
-LOCAL_SAVE_DIR = pathlib.Path(os.getenv('LOCAL_SAVE_DIR', 'results'))
-LOCAL_SAVE_DIR.mkdir(parents=True, exist_ok=True)
 
 ABOUT_THIS_SPACE = '''
 This Space is a sample Space that collects user preferences for the results generated by a diffusion model.
@@ -35,8 +31,7 @@ scheduler = ParquetScheduler(repo_id=UPLOAD_REPO_ID,
                              repo_type='dataset',
                              every=UPLOAD_FREQUENCY,
                              private=not USE_PUBLIC_REPO,
-                             token=HF_TOKEN,
-                             folder_path=LOCAL_SAVE_DIR)
+                             token=HF_TOKEN)
 
 client = Client('stabilityai/stable-diffusion')
 
@@ -69,9 +64,6 @@ def get_selected_index(evt: gr.SelectData) -> int:
 
 def save_preference(config_path: str, gallery: list[dict[str, Any]],
                     selected_index: int) -> None:
-    save_dir = LOCAL_SAVE_DIR / f'{uuid.uuid4()}'
-    save_dir.mkdir(parents=True, exist_ok=True)
-
     # Load config
     with open(config_path) as f:
         data = json.load(f)
@@ -80,12 +72,9 @@ def save_preference(config_path: str, gallery: list[dict[str, Any]],
     data['selected_index'] = selected_index
     data['timestamp'] = datetime.datetime.utcnow().isoformat()
 
-    # Copy and add images
+    # Add images
     for index, path in enumerate(x['name'] for x in gallery):
-        name = f'{index:03d}'
-        dst_path = save_dir / f'{name}{pathlib.Path(path).suffix}'
-        shutil.move(path, dst_path)
-        data[f'image_{name}'] = dst_path
+        data[f'image_{index:03d}'] = path
 
     # Send to scheduler
    scheduler.append(data)
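
Two things change in `app.py`: `UPLOAD_REPO_ID` is now read with `os.environ['UPLOAD_REPO_ID']`, so a missing variable raises a `KeyError` at startup instead of silently passing `None` to the scheduler, and images are no longer moved into a local `LOCAL_SAVE_DIR` folder. The gallery's temporary file paths are appended directly, and the scheduler reads the bytes itself at push time. A minimal sketch of the kind of row `save_preference` now builds (all field values here are hypothetical):

```py
row = {
    'prompt': 'an astronaut riding a horse',    # from the loaded config (hypothetical)
    'selected_index': 2,                        # index the user picked in the gallery
    'timestamp': '2023-07-01T12:34:56.789012',  # datetime.datetime.utcnow().isoformat()
    'image_000': '/tmp/gradio/.../image.png',   # local path; bytes are read at push time
}
scheduler.append(row)  # one .append() call becomes one row in the parquet dataset
```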
scheduler.py CHANGED
@@ -1,71 +1,114 @@
 import json
-import tempfile
 import uuid
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional, Union
 
 import pyarrow as pa
 import pyarrow.parquet as pq
-from huggingface_hub import CommitScheduler
+from huggingface_hub import CommitScheduler, HfApi
 
 
 class ParquetScheduler(CommitScheduler):
+    """
+    Usage: configure the scheduler with a repo id. Once started, you can add data to be uploaded to the Hub. 1 `.append`
+    call will result in 1 row in your final dataset.
+
+    ```py
+    # Start scheduler
+    >>> scheduler = ParquetScheduler(repo_id="my-parquet-dataset")
+
+    # Append some data to be uploaded
+    >>> scheduler.append({...})
+    >>> scheduler.append({...})
+    >>> scheduler.append({...})
+    ```
+
+    The scheduler will automatically infer the schema from the data it pushes.
+    Optionally, you can manually set the schema yourself:
+
+    ```py
+    >>> scheduler = ParquetScheduler(
+    ...     repo_id="my-parquet-dataset",
+    ...     schema={
+    ...         "prompt": {"_type": "Value", "dtype": "string"},
+    ...         "negative_prompt": {"_type": "Value", "dtype": "string"},
+    ...         "guidance_scale": {"_type": "Value", "dtype": "int64"},
+    ...         "image": {"_type": "Image"},
+    ...     },
+    ... )
+
+    See https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Value for the list of
+    possible values.
+    """
+    def __init__(
+        self,
+        *,
+        repo_id: str,
+        schema: Optional[Dict[str, Dict[str, str]]] = None,
+        every: Union[int, float] = 5,
+        path_in_repo: Optional[str] = 'data',
+        repo_type: Optional[str] = 'dataset',
+        revision: Optional[str] = None,
+        private: bool = False,
+        token: Optional[str] = None,
+        allow_patterns: Union[List[str], str, None] = None,
+        ignore_patterns: Union[List[str], str, None] = None,
+        hf_api: Optional[HfApi] = None,
+    ) -> None:
+        super().__init__(
+            repo_id=repo_id,
+            folder_path='dummy',  # not used by the scheduler
+            every=every,
+            path_in_repo=path_in_repo,
+            repo_type=repo_type,
+            revision=revision,
+            private=private,
+            token=token,
+            allow_patterns=allow_patterns,
+            ignore_patterns=ignore_patterns,
+            hf_api=hf_api,
+        )
+
+        self._rows: List[Dict[str, Any]] = []
+        self._schema = schema
+
     def append(self, row: Dict[str, Any]) -> None:
+        """Add a new item to be uploaded."""
         with self.lock:
-            if not hasattr(self, 'rows') or self.rows is None:  # type: ignore
-                self.rows = []
-            self.rows.append(row)
-
-    def set_schema(self, schema: Dict[str, Dict[str, str]]) -> None:
-        """
-        Define a schema to help `datasets` load the generated library.
-        This method is optional and can be called once just after the scheduler had been created. If it is not called,
-        the schema is automatically inferred before pushing the data to the Hub.
-        See https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Value for the list of
-        possible values.
-        Example:
-        ```py
-        scheduler.set_schema({
-            "prompt": {"_type": "Value", "dtype": "string"},
-            "negative_prompt": {"_type": "Value", "dtype": "string"},
-            "guidance_scale": {"_type": "Value", "dtype": "int64"},
-            "image": {"_type": "Image"},
-        })
-        ```
-        """
-        self._schema = schema
+            self._rows.append(row)
 
     def push_to_hub(self):
         # Check for new rows to push
         with self.lock:
-            rows = self.rows
-            self.rows = None
+            rows = self._rows
+            self._rows = []
         if not rows:
             return
+        print(f'Got {len(rows)} item(s) to commit.')
 
         # Load images + create 'features' config for datasets library
-        hf_features: Dict[str, Dict] = getattr(self, '_schema', None) or {}
+        schema: Dict[str, Dict] = self._schema or {}
         path_to_cleanup: List[Path] = []
         for row in rows:
             for key, value in row.items():
                 # Infer schema (for `datasets` library)
-                if key not in hf_features:
-                    hf_features[key] = _infer_schema(key, value)
+                if key not in schema:
+                    schema[key] = _infer_schema(key, value)
 
                 # Load binary files if necessary
-                if hf_features[key]['_type'] in ('Image', 'Audio'):
+                if schema[key]['_type'] in ('Image', 'Audio'):
                     # It's an image or audio: we load the bytes and remember to cleanup the file
                     file_path = Path(value)
                     if file_path.is_file():
                         row[key] = {
                             'path': file_path.name,
-                            'bytes': file_path.read_bytes()
+                            'bytes': file_path.read_bytes(),
                         }
                         path_to_cleanup.append(file_path)
 
         # Complete rows if needed
         for row in rows:
-            for feature in hf_features:
+            for feature in schema:
                 if feature not in row:
                     row[feature] = None
 
@@ -75,7 +118,7 @@ class ParquetScheduler(CommitScheduler):
         # Add metadata (used by datasets library)
         table = table.replace_schema_metadata(
             {'huggingface': json.dumps({'info': {
-                'features': hf_features
+                'features': schema
             }})})
 
         # Write to parquet file
@@ -90,6 +133,7 @@ class ParquetScheduler(CommitScheduler):
             path_in_repo=f'{uuid.uuid4()}.parquet',
             path_or_fileobj=archive_file.name,
         )
+        print(f'Commit completed.')
 
         # Cleanup
        archive_file.close()
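
Both versions of `push_to_hub` call a module-level helper `_infer_schema(key, value)` that sits outside the changed hunks, so it does not appear in this diff. A minimal sketch of what such a helper can look like, assuming it maps a column name and a sample value to a `datasets` feature descriptor (the name-based Image/Audio heuristic matches how keys like `image_000` are built in `app.py`):

```py
from typing import Any, Dict

def _infer_schema(key: str, value: Any) -> Dict[str, str]:
    """Guess a `datasets` feature descriptor from a column name and a sample value."""
    if 'image' in key:
        return {'_type': 'Image'}
    if 'audio' in key:
        return {'_type': 'Audio'}
    if isinstance(value, bool):  # check bool before int: bool is a subclass of int
        return {'_type': 'Value', 'dtype': 'bool'}
    if isinstance(value, int):
        return {'_type': 'Value', 'dtype': 'int64'}
    if isinstance(value, float):
        return {'_type': 'Value', 'dtype': 'float64'}
    if isinstance(value, bytes):
        return {'_type': 'Value', 'dtype': 'binary'}
    # Last resort: treat everything else as a string
    return {'_type': 'Value', 'dtype': 'string'}
```

The elided context between the hunks presumably builds `table` from `rows` (e.g. with `pa.Table.from_pylist(rows)`) before the metadata is attached and the parquet file is uploaded.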