giskard-evaluator

Running

App Files Community

200

weixuan-giskard committed on Dec 1, 2023

Commit

77961b6

1 Parent(s): 85095eb

Move text classification column mapping

Browse files

Files changed (2) hide show

app.py +2 -110
text_classification.py +112 -0

app.py CHANGED Viewed

@@ -7,12 +7,11 @@ import time
 from pathlib import Path
 import json
-import logging
-import pandas as pd
 from transformers.pipelines import TextClassificationPipeline
 HF_REPO_ID = 'HF_REPO_ID'
 HF_SPACE_ID = 'SPACE_ID'
@@ -61,113 +60,6 @@ def check_dataset(dataset_id, dataset_config="default", dataset_split="test"):
     return dataset_id, dataset_config, dataset_split
-def text_classificaiton_match_label_case_unsensative(id2label_mapping, label):
-    for model_label in id2label_mapping.keys():
-        if model_label.upper() == label.upper():
-            return model_label, label
-    return None, label
-def text_classification_map_model_and_dataset_labels(id2label, dataset_features):
-    id2label_mapping = {id2label[k]: None for k in id2label.keys()}
-    dataset_labels = None
-    for feature in dataset_features.values():
-        if not isinstance(feature, datasets.ClassLabel):
-            continue
-        if len(feature.names) != len(id2label_mapping.keys()):
-            continue
-        dataset_labels = feature.names
-        # Try to match labels
-        for label in feature.names:
-            if label in id2label_mapping.keys():
-                model_label = label
-            else:
-                # Try to find case unsensative
-                model_label, label = text_classificaiton_match_label_case_unsensative(id2label_mapping, label)
-            if model_label is not None:
-                id2label_mapping[model_label] = label
-    return id2label_mapping, dataset_labels
-def text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, split):
-    # We assume dataset is ok here
-    ds = datasets.load_dataset(d_id, config)[split]
-    try:
-        dataset_features = ds.features
-    except AttributeError:
-        # Dataset does not have features, need to provide everything
-        return None, None, None
-    # Check whether we need to infer the text input column
-    infer_text_input_column = True
-    if "text" in column_mapping.keys():
-        dataset_text_column = column_mapping["text"]
-        if dataset_text_column in dataset_features.keys():
-            infer_text_input_column = False
-        else:
-            logging.warning(f"Provided {dataset_text_column} is not in Dataset columns")
-    if infer_text_input_column:
-        # Try to retrieve one
-        candidates = [f for f in dataset_features if dataset_features[f].dtype == "string"]
-        if len(candidates) > 0:
-            logging.debug(f"Candidates are {candidates}")
-            column_mapping["text"] = candidates[0]
-        else:
-            # Not found a text feature
-            return column_mapping, None, None
-    # Load dataset as DataFrame
-    df = ds.to_pandas()
-    # Retrieve all labels
-    id2label_mapping = {}
-    id2label = ppl.model.config.id2label
-    label2id = {v: k for k, v in id2label.items()}
-    prediction_result = None
-    try:
-        # Use the first item to test prediction
-        results = ppl({"text": df.head(1).at[0, column_mapping["text"]]}, top_k=None)
-        prediction_result = {
-            f'{result["label"]}({label2id[result["label"]]})': result["score"] for result in results
-        }
-    except Exception:
-        # Pipeline prediction failed, need to provide labels
-        return column_mapping, None, None
-    # Infer labels
-    id2label_mapping, dataset_labels = text_classification_map_model_and_dataset_labels(id2label, dataset_features)
-    if "label" in column_mapping.keys():
-        if not isinstance(column_mapping["label"], dict) or set(column_mapping["label"].values()) != set(dataset_labels):
-            logging.warning(f'Provided {column_mapping["label"]} does not match labels in Dataset')
-            return column_mapping, prediction_result, None
-        if isinstance(column_mapping["label"], dict):
-            for model_label in id2label_mapping.keys():
-                id2label_mapping[model_label] = column_mapping["label"][str(label2id[model_label])]
-    elif None in id2label_mapping.values():
-        column_mapping["label"] = {
-            i: None for i in id2label.keys()
-        }
-        return column_mapping, prediction_result, None
-    id2label_df = pd.DataFrame({
-        "ID": [i for i in id2label.keys()],
-        "Model labels": [id2label[label] for label in id2label.keys()],
-        "Dataset labels": [id2label_mapping[id2label[label]] for label in id2label.keys()],
-    })
-    if "label" not in column_mapping.keys():
-        column_mapping["label"] = {
-            i: id2label_mapping[id2label[i]] for i in id2label.keys()
-        }
-    return column_mapping, prediction_result, id2label_df
 def try_validate(model_id, dataset_id, dataset_config, dataset_split, column_mapping):
     # Validate model
     m_id, ppl = check_model(model_id=model_id)

 from pathlib import Path
 import json
 from transformers.pipelines import TextClassificationPipeline
+from text_classification import text_classification_fix_column_mapping
 HF_REPO_ID = 'HF_REPO_ID'
 HF_SPACE_ID = 'SPACE_ID'
     return dataset_id, dataset_config, dataset_split
 def try_validate(model_id, dataset_id, dataset_config, dataset_split, column_mapping):
     # Validate model
     m_id, ppl = check_model(model_id=model_id)

text_classification.py ADDED Viewed

	@@ -0,0 +1,112 @@

+import datasets
+import logging
+import pandas as pd
+def text_classificaiton_match_label_case_unsensative(id2label_mapping, label):
+    for model_label in id2label_mapping.keys():
+        if model_label.upper() == label.upper():
+            return model_label, label
+    return None, label
+def text_classification_map_model_and_dataset_labels(id2label, dataset_features):
+    id2label_mapping = {id2label[k]: None for k in id2label.keys()}
+    dataset_labels = None
+    for feature in dataset_features.values():
+        if not isinstance(feature, datasets.ClassLabel):
+            continue
+        if len(feature.names) != len(id2label_mapping.keys()):
+            continue
+        dataset_labels = feature.names
+        # Try to match labels
+        for label in feature.names:
+            if label in id2label_mapping.keys():
+                model_label = label
+            else:
+                # Try to find case unsensative
+                model_label, label = text_classificaiton_match_label_case_unsensative(id2label_mapping, label)
+            if model_label is not None:
+                id2label_mapping[model_label] = label
+    return id2label_mapping, dataset_labels
+def text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, split):
+    # We assume dataset is ok here
+    ds = datasets.load_dataset(d_id, config)[split]
+    try:
+        dataset_features = ds.features
+    except AttributeError:
+        # Dataset does not have features, need to provide everything
+        return None, None, None
+    # Check whether we need to infer the text input column
+    infer_text_input_column = True
+    if "text" in column_mapping.keys():
+        dataset_text_column = column_mapping["text"]
+        if dataset_text_column in dataset_features.keys():
+            infer_text_input_column = False
+        else:
+            logging.warning(f"Provided {dataset_text_column} is not in Dataset columns")
+    if infer_text_input_column:
+        # Try to retrieve one
+        candidates = [f for f in dataset_features if dataset_features[f].dtype == "string"]
+        if len(candidates) > 0:
+            logging.debug(f"Candidates are {candidates}")
+            column_mapping["text"] = candidates[0]
+        else:
+            # Not found a text feature
+            return column_mapping, None, None
+    # Load dataset as DataFrame
+    df = ds.to_pandas()
+    # Retrieve all labels
+    id2label_mapping = {}
+    id2label = ppl.model.config.id2label
+    label2id = {v: k for k, v in id2label.items()}
+    prediction_result = None
+    try:
+        # Use the first item to test prediction
+        results = ppl({"text": df.head(1).at[0, column_mapping["text"]]}, top_k=None)
+        prediction_result = {
+            f'{result["label"]}({label2id[result["label"]]})': result["score"] for result in results
+        }
+    except Exception:
+        # Pipeline prediction failed, need to provide labels
+        return column_mapping, None, None
+    # Infer labels
+    id2label_mapping, dataset_labels = text_classification_map_model_and_dataset_labels(id2label, dataset_features)
+    if "label" in column_mapping.keys():
+        if not isinstance(column_mapping["label"], dict) or set(column_mapping["label"].values()) != set(dataset_labels):
+            logging.warning(f'Provided {column_mapping["label"]} does not match labels in Dataset')
+            return column_mapping, prediction_result, None
+        if isinstance(column_mapping["label"], dict):
+            for model_label in id2label_mapping.keys():
+                id2label_mapping[model_label] = column_mapping["label"][str(label2id[model_label])]
+    elif None in id2label_mapping.values():
+        column_mapping["label"] = {
+            i: None for i in id2label.keys()
+        }
+        return column_mapping, prediction_result, None
+    id2label_df = pd.DataFrame({
+        "ID": [i for i in id2label.keys()],
+        "Model labels": [id2label[label] for label in id2label.keys()],
+        "Dataset labels": [id2label_mapping[id2label[label]] for label in id2label.keys()],
+    })
+    if "label" not in column_mapping.keys():
+        column_mapping["label"] = {
+            i: id2label_mapping[id2label[i]] for i in id2label.keys()
+        }
+    return column_mapping, prediction_result, id2label_df