arcleife committed on
Commit beb81ec · verified · 1 Parent(s): a8268e2

Update app.py

Files changed (1)
  1. app.py +6 -279
app.py CHANGED
@@ -7,9 +7,9 @@ import polars as pl
  import re
  import json
  from datetime import datetime, timezone, timedelta
- from transformers import pipeline
- from transformers import AutoModelForSequenceClassification
- from transformers import AutoTokenizer, DistilBertTokenizerFast
+ from optimum.pipelines import pipeline
+ from optimum.onnxruntime import ORTModelForSequenceClassification
+ from transformers import AutoTokenizer

  # version: 0.2.1

@@ -24,279 +24,7 @@ import uuid
  import filelock
  import csv

- # TODO move to separate file for cleaner code
- class HuggingFaceDatasetSaver(FlaggingCallback):
-     """
-     A callback that saves each flagged sample (both the input and output data) to a HuggingFace dataset.
-
-     Example:
-         import gradio as gr
-         hf_writer = gr.HuggingFaceDatasetSaver(HF_API_TOKEN, "image-classification-mistakes")
-         def image_classifier(inp):
-             return {'cat': 0.3, 'dog': 0.7}
-         demo = gr.Interface(fn=image_classifier, inputs="image", outputs="label",
-                             allow_flagging="manual", flagging_callback=hf_writer)
-     Guides: using-flagging
-     """
-
-     def __init__(
-         self,
-         hf_token: str,
-         dataset_name: str,
-         private: bool = False,
-         info_filename: str = "dataset_info.json",
-         separate_dirs: bool = False,
-     ):
-         """
-         Parameters:
-             hf_token: The HuggingFace token to use to create (and write the flagged sample to) the HuggingFace dataset (defaults to the registered one).
-             dataset_name: The repo_id of the dataset to save the data to, e.g. "image-classifier-1" or "username/image-classifier-1".
-             private: Whether the dataset should be private (defaults to False).
-             info_filename: The name of the file to save the dataset info (defaults to "dataset_infos.json").
-             separate_dirs: If True, each flagged item will be saved in a separate directory. This makes the flagging more robust to concurrent editing, but may be less convenient to use.
-         """
-         self.hf_token = hf_token
-         self.dataset_id = dataset_name  # TODO: rename parameter (but ensure backward compatibility somehow)
-         self.dataset_private = private
-         self.info_filename = info_filename
-         self.separate_dirs = separate_dirs
-
-     def setup(self, components: Sequence[Component], flagging_dir: str):
-         """
-         Params:
-         flagging_dir (str): local directory where the dataset is cloned,
-         updated, and pushed from.
-         """
-         # Setup dataset on the Hub
-         self.dataset_id = huggingface_hub.create_repo(
-             repo_id=self.dataset_id,
-             token=self.hf_token,
-             private=self.dataset_private,
-             repo_type="dataset",
-             exist_ok=True,
-         ).repo_id
-         path_glob = "**/*.jsonl" if self.separate_dirs else "data.csv"
-         huggingface_hub.metadata_update(
-             repo_id=self.dataset_id,
-             repo_type="dataset",
-             metadata={
-                 "configs": [
-                     {
-                         "config_name": "default",
-                         "data_files": [{"split": "train", "path": path_glob}],
-                     }
-                 ]
-             },
-             overwrite=True,
-             token=self.hf_token,
-         )
-
-         # Setup flagging dir
-         self.components = components
-         self.dataset_dir = (
-             Path(flagging_dir).absolute() / self.dataset_id.split("/")[-1]
-         )
-         self.dataset_dir.mkdir(parents=True, exist_ok=True)
-         self.infos_file = self.dataset_dir / self.info_filename
-
-         # Download remote files to local
-         remote_files = [self.info_filename]
-         if not self.separate_dirs:
-             # No separate dirs => means all data is in the same CSV file => download it to get its current content
-             remote_files.append("data.csv")
-
-         for filename in remote_files:
-             try:
-                 huggingface_hub.hf_hub_download(
-                     repo_id=self.dataset_id,
-                     repo_type="dataset",
-                     filename=filename,
-                     local_dir=self.dataset_dir,
-                     token=self.hf_token,
-                 )
-             except huggingface_hub.utils.EntryNotFoundError:
-                 pass
-
-     def flag(
-         self,
-         flag_data: list[Any],
-         flag_option: str = "",
-         username: str | None = None,
-     ) -> int:
-         if self.separate_dirs:
-             # JSONL files to support dataset preview on the Hub
-             unique_id = str(uuid.uuid4())
-             components_dir = self.dataset_dir / unique_id
-             data_file = components_dir / "metadata.jsonl"
-             path_in_repo = unique_id  # upload in sub folder (safer for concurrency)
-         else:
-             # Unique CSV file
-             components_dir = self.dataset_dir
-             data_file = components_dir / "data.csv"
-             path_in_repo = None  # upload at root level
-
-         return self._flag_in_dir(
-             data_file=data_file,
-             components_dir=components_dir,
-             path_in_repo=path_in_repo,
-             flag_data=flag_data,
-             flag_option=flag_option,
-             username=username or "",
-         )
-
-     def _flag_in_dir(
-         self,
-         data_file: Path,
-         components_dir: Path,
-         path_in_repo: str | None,
-         flag_data: list[Any],
-         flag_option: str = "",
-         username: str = "",
-     ) -> int:
-         # Deserialize components (write images/audio to files)
-         features, row = self._deserialize_components(
-             components_dir, flag_data, flag_option, username
-         )
-
-         # Write generic info to dataset_infos.json + upload
-         with filelock.FileLock(str(self.infos_file) + ".lock"):
-             if not self.infos_file.exists():
-                 self.infos_file.write_text(
-                     json.dumps({"flagged": {"features": features}})
-                 )
-
-             huggingface_hub.upload_file(
-                 repo_id=self.dataset_id,
-                 repo_type="dataset",
-                 token=self.hf_token,
-                 path_in_repo=self.infos_file.name,
-                 path_or_fileobj=self.infos_file,
-             )
-
-         headers = list(features.keys())
-
-         if not self.separate_dirs:
-             with filelock.FileLock(components_dir / ".lock"):
-                 sample_nb = self._save_as_csv(data_file, headers=headers, row=row)
-                 sample_name = str(sample_nb)
-                 huggingface_hub.upload_folder(
-                     repo_id=self.dataset_id,
-                     repo_type="dataset",
-                     commit_message=f"Flagged sample #{sample_name}",
-                     path_in_repo=path_in_repo,
-                     ignore_patterns="*.lock",
-                     folder_path=components_dir,
-                     token=self.hf_token,
-                 )
-         else:
-             sample_name = self._save_as_jsonl(data_file, headers=headers, row=row)
-             sample_nb = len(
-                 [path for path in self.dataset_dir.iterdir() if path.is_dir()]
-             )
-             huggingface_hub.upload_folder(
-                 repo_id=self.dataset_id,
-                 repo_type="dataset",
-                 commit_message=f"Flagged sample #{sample_name}",
-                 path_in_repo=path_in_repo,
-                 ignore_patterns="*.lock",
-                 folder_path=components_dir,
-                 token=self.hf_token,
-             )
-
-         return sample_nb
-
-     @staticmethod
-     def _save_as_csv(data_file: Path, headers: list[str], row: list[Any]) -> int:
-         """Save data as CSV and return the sample name (row number)."""
-         is_new = not data_file.exists()
-
-         with data_file.open("a", newline="", encoding="utf-8") as csvfile:
-             writer = csv.writer(csvfile)
-
-             # Write CSV headers if new file
-             if is_new:
-                 writer.writerow(utils.sanitize_list_for_csv(headers))
-
-             # Write CSV row for flagged sample
-             writer.writerow(utils.sanitize_list_for_csv(row))
-
-         with data_file.open(encoding="utf-8") as csvfile:
-             return sum(1 for _ in csv.reader(csvfile)) - 1
-
-     @staticmethod
-     def _save_as_jsonl(data_file: Path, headers: list[str], row: list[Any]) -> str:
-         """Save data as JSONL and return the sample name (uuid)."""
-         Path.mkdir(data_file.parent, parents=True, exist_ok=True)
-         with open(data_file, "w", encoding="utf-8") as f:
-             json.dump(dict(zip(headers, row)), f)
-         return data_file.parent.name
-
-     def _deserialize_components(
-         self,
-         data_dir: Path,
-         flag_data: list[Any],
-         flag_option: str = "",
-         username: str = "",
-     ) -> tuple[dict[Any, Any], list[Any]]:
-         """Deserialize components and return the corresponding row for the flagged sample.
-
-         Images/audio are saved to disk as individual files.
-         """
-         # Components that can have a preview on dataset repos
-         file_preview_types = {gr.Audio: "Audio", gr.Image: "Image"}
-
-         # Generate the row corresponding to the flagged sample
-         features = OrderedDict()
-         row = []
-         for component, sample in zip(self.components, flag_data):
-             # Get deserialized object (will save sample to disk if applicable -file, audio, image,...-)
-             label = component.label or ""
-             save_dir = data_dir / client_utils.strip_invalid_filename_characters(label)
-             save_dir.mkdir(exist_ok=True, parents=True)
-             deserialized = utils.simplify_file_data_in_str(
-                 component.flag(sample, save_dir)
-             )
-
-             # Add deserialized object to row
-             features[label] = {"dtype": "string", "_type": "Value"}
-             try:
-                 deserialized_path = Path(deserialized)
-                 if not deserialized_path.exists():
-                     raise FileNotFoundError(f"File {deserialized} not found")
-                 row.append(str(deserialized_path.relative_to(self.dataset_dir)))
-             except (FileNotFoundError, TypeError, ValueError, OSError):
-                 deserialized = "" if deserialized is None else str(deserialized)
-                 row.append(deserialized)
-
-             # If component is eligible for a preview, add the URL of the file
-             # Be mindful that images and audio can be None
-             if isinstance(component, tuple(file_preview_types)):  # type: ignore
-                 for _component, _type in file_preview_types.items():
-                     if isinstance(component, _component):
-                         features[label + " file"] = {"_type": _type}
-                         break
-                 if deserialized:
-                     path_in_repo = str(  # returned filepath is absolute, we want it relative to compute URL
-                         Path(deserialized).relative_to(self.dataset_dir)
-                     ).replace("\\", "/")
-                     row.append(
-                         huggingface_hub.hf_hub_url(
-                             repo_id=self.dataset_id,
-                             filename=path_in_repo,
-                             repo_type="dataset",
-                         )
-                     )
-                 else:
-                     row.append("")
-
-         timestamp = datetime.now(timezone(timedelta(hours=9))).isoformat()
-         features["flag"] = {"dtype": "string", "_type": "Value"}
-         features["username"] = {"dtype": "string", "_type": "Value"}
-         features["timestamp"] = {"dtype": "string", "_type": "Value"}
-         row.append(flag_option)
-         row.append(username)
-         row.append(timestamp)
-         return features, row
+ from .hf_dataset_saver import HuggingFaceDatasetSaver

  # Get environment variable
  hf_token = os.getenv('HF_TOKEN')
@@ -312,11 +40,10 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  hf_writer = HuggingFaceDatasetSaver(hf_token, "crowdsourced-sentiment_analysis")

  # Prepare model
- # TODO convert the model to ONNX
  tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base", token=hf_token)
- model = AutoModelForSequenceClassification.from_pretrained("arcleife/roberta-sentiment-id", num_labels=3, token=hf_token).to(device)
+ model = ORTModelForSequenceClassification.from_pretrained("arcleife/roberta-sentiment-id-onnx", num_labels=3, token=hf_token).to(device)

- pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device, return_token_type_ids=False)
+ pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device, return_token_type_ids=False, accelerator="ort")

  def get_label(result):
      if result[0]['label'] == "LABEL_0":
 
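
For context on the hunk above: the new ORTModelForSequenceClassification line expects a checkpoint that has already been exported to ONNX ("arcleife/roberta-sentiment-id-onnx"). A minimal sketch of how such an export could be produced with optimum, assuming the original PyTorch repo named in the removed line and an output folder chosen here only for illustration; this step is not part of the commit itself:

# Hypothetical export sketch (not part of this commit): convert the PyTorch
# checkpoint to ONNX so ORTModelForSequenceClassification can load it directly.
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

# export=True converts the transformers weights to ONNX while loading;
# a Hub token may be required if the source repo is private.
ort_model = ORTModelForSequenceClassification.from_pretrained(
    "arcleife/roberta-sentiment-id", export=True
)
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Save the exported model and tokenizer so they can be pushed as a separate
# model repo, e.g. the "arcleife/roberta-sentiment-id-onnx" repo referenced above.
ort_model.save_pretrained("roberta-sentiment-id-onnx")
tokenizer.save_pretrained("roberta-sentiment-id-onnx")

Loading a pre-exported repo at startup means the Space only has to create the ONNX Runtime session, which is presumably why the "convert the model to ONNX" TODO was dropped in this commit.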