Spaces:

0408happyfeet
/

cheese-texture-tabular-gradio

Sleeping

App Files Files Community

cheese-texture-tabular-gradio / app.py

0408happyfeet

Update app.py

5ee4fa6 verified about 1 month ago

raw

history blame

18.8 kB

	import os, shutil, zipfile, pickle
	from typing import List, Tuple
	import pandas as pd
	import gradio as gr

	from datasets import load_dataset
	from huggingface_hub import hf_hub_download, snapshot_download
	from autogluon.tabular import TabularPredictor

	# Hub repo id handed to load_predictor_from_hub() at startup.
	MODEL_REPO_ID = "rlogh/cheese-texture-autogluon-classifier"
	# Dataset id handed to load_cheese_dataset(); its columns drive the UI ranges and choices.
	DATASET_ID = "aslan-ng/cheese-tabular"

	import os, json
	import urllib.request

	# Environment-driven switches for the optional explanation engines.
	_TRUTHY_ENV_VALUES = {"1", "true", "yes"}


	def _env_flag(var_name: str) -> bool:
	    """Return True when the variable is set to 1/true/yes (case-insensitive)."""
	    return os.getenv(var_name, "0").lower() in _TRUTHY_ENV_VALUES


	USE_HF_LOCAL = _env_flag("USE_HF")  # run a local transformers pipeline
	USE_HF_API = _env_flag("USE_HF_API")  # call the hosted inference API
	HF_MODEL = os.getenv("HF_MODEL", "google/flan-t5-small")
	HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

	def _extract_generated_text(data) -> str:
	    """Pull the generated text out of a decoded inference response.

	    Recognised shapes are a non-empty list whose first item is a dict with a
	    'generated_text' key, or a dict with that key. Any other payload is
	    returned as its stripped ``str()`` form.
	    """
	    candidate = data[0] if isinstance(data, list) and data else data
	    # Only dicts are searched by key: on a str/list element the ``in`` test
	    # would be a substring/element check and the lookup would then fail.
	    if isinstance(candidate, dict) and "generated_text" in candidate:
	        text = candidate["generated_text"]
	        if isinstance(text, str):
	            return text.strip()
	    # Fallback if provider returns a different schema
	    return str(data).strip()


	def _hf_inference_api(prompt: str) -> str:
	    """Call HF Inference API for text2text; avoids local downloads.

	    Posts ``{"inputs": prompt}`` as JSON to the endpoint for ``HF_MODEL``,
	    authenticated with ``HF_TOKEN``, and returns the generated text.

	    Args:
	        prompt: Text sent as the model input.

	    Returns:
	        The stripped generated text, or the stringified payload when the
	        response does not match a recognised shape.

	    Raises:
	        Whatever ``urllib.request.urlopen`` or ``json.loads`` raise on
	        transport or decoding failure; the 60 s timeout applies to the request.
	    """
	    url = f"https://api-inference.huggingface.co/models/{HF_MODEL}"
	    req = urllib.request.Request(
	        url,
	        data=json.dumps({"inputs": prompt}).encode("utf-8"),
	        headers={"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"},
	        method="POST",
	    )
	    with urllib.request.urlopen(req, timeout=60) as resp:
	        data = json.loads(resp.read().decode("utf-8"))
	    return _extract_generated_text(data)

	# Local pipelines keyed by model name, so the model is loaded once rather
	# than on every explain() call.
	_LOCAL_PIPELINES = {}


	def _get_local_pipeline(model_name: str):
	    """Return the cached text2text pipeline for ``model_name``, building it on first use."""
	    pipe = _LOCAL_PIPELINES.get(model_name)
	    if pipe is None:
	        # Imported lazily: transformers is only needed when USE_HF_LOCAL is on.
	        from transformers import pipeline
	        pipe = pipeline("text2text-generation", model=model_name)
	        _LOCAL_PIPELINES[model_name] = pipe
	    return pipe


	def explain(structured: dict, engine: str = "auto") -> str:
	    """Produce an explanation for ``structured``, preferring remote, then local, then template.

	    Args:
	        structured: Data passed to ``build_prompt`` and ``explain_template``.
	        engine: Accepted for interface compatibility; not consulted here.

	    Returns:
	        The generated text, or the template text. When an engine raises, the
	        template text is returned with a note carrying the error.

	    NOTE(review): ``build_prompt`` and ``explain_template`` are not defined
	    in the visible part of this file - confirm they are provided elsewhere.
	    """
	    prompt = build_prompt(structured)
	    # 1) Remote API (preferred for Spaces)
	    if USE_HF_API and HF_TOKEN:
	        try:
	            return _hf_inference_api(prompt)
	        except Exception as e:
	            return explain_template(structured) + f"\n\n_Explanation engine fell back to template: {e}_"
	    # 2) Local transformers (optional)
	    if USE_HF_LOCAL:
	        try:
	            pipe = _get_local_pipeline(HF_MODEL)
	            out = pipe(prompt, max_new_tokens=180, do_sample=False)
	            return out[0]["generated_text"].strip()
	        except Exception as e:
	            return explain_template(structured) + f"\n\n_Explanation engine fell back to template: {e}_"
	    # 3) Deterministic template
	    return explain_template(structured)

	def _safe_concat_splits(ds):
	    """Concatenate every convertible split of ``ds`` into one DataFrame.

	    Args:
	        ds: Mapping of split name to an object exposing ``to_pandas()``.

	    Returns:
	        A single DataFrame with a fresh 0..n-1 index.

	    Raises:
	        ValueError: If no split could be converted; the message lists the
	            per-split failures.
	    """
	    frames = []
	    failures = []
	    for split in ds.keys():
	        try:
	            frames.append(ds[split].to_pandas())
	        except Exception as e:
	            # Best effort: skip the split, but leave a trace instead of
	            # dropping the data silently.
	            failures.append(f"{split}: {e}")
	            print(f"[loader] Split '{split}' could not be converted: {e}")
	    if not frames:
	        detail = ""
	        if failures:
	            detail = " (" + "; ".join(failures) + ")"
	        raise ValueError("Could not load any splits from the dataset." + detail)
	    return pd.concat(frames, ignore_index=True)

	def load_cheese_dataset(dataset_id: str) -> pd.DataFrame:
	    """Load all splits of ``dataset_id`` into one DataFrame.

	    The 'default' config is tried first; if loading or concatenating it
	    raises, the dataset is loaded again without a config name.
	    """
	    try:
	        return _safe_concat_splits(load_dataset(dataset_id, "default"))
	    except Exception:
	        # Fallback: let the loader pick the config itself.
	        return _safe_concat_splits(load_dataset(dataset_id))

	def _find_dir_with_any_predictor_marker(start_dir: str) -> str:
	    '''Return the first directory containing 'learner.pkl' (preferred) or 'predictor.pkl'.

	    The tree under ``start_dir`` is walked top-down with sub-directories in
	    sorted order, so the result does not depend on filesystem listing order.
	    A directory holding 'learner.pkl' wins; otherwise the first directory
	    holding 'predictor.pkl' is returned. Returns "" when neither is found.
	    '''
	    fallback = ""
	    for root, dirs, files in os.walk(start_dir):
	        dirs.sort()  # in-place sort steers os.walk's descent order
	        if "learner.pkl" in files:
	            return root
	        if not fallback and "predictor.pkl" in files:
	            fallback = root
	    return fallback

	def _symlink_or_copytree(src: str, dst: str):
	    """Make ``src`` available at ``dst`` via a symlink, copying the tree if linking fails.

	    Does nothing when ``dst`` already resolves to something. A dangling
	    symlink left at ``dst`` is removed first so it can be recreated.

	    Args:
	        src: Existing directory to expose.
	        dst: Path at which it should appear.
	    """
	    if os.path.exists(dst):
	        return
	    if os.path.islink(dst):
	        # exists() was False, so this link points nowhere; both symlink()
	        # and copytree() would fail on it.
	        os.unlink(dst)
	    # Link to an absolute target: a relative one is resolved against dst's
	    # directory, not the current working directory.
	    src = os.path.abspath(src)
	    try:
	        os.symlink(src, dst)
	    except (OSError, NotImplementedError):
	        # Symlinks unavailable or not permitted here; fall back to a copy.
	        shutil.copytree(src, dst)

	def _materialize_flat_model_layout(predictor_dir: str, extract_root: str):
	    '''Mirror each model sub-directory so it is reachable under several roots:
	    - predictor_dir/models/<name>/... (the source, left as is)
	    - extract_root/<name>/... (flat layout some predictors still reference)
	    - predictor_dir/<name>/... (defensive)

	    Returns without doing anything when predictor_dir has no 'models' directory.
	    '''
	    models_root = os.path.join(predictor_dir, "models")
	    if not os.path.isdir(models_root):
	        return
	    mirror_roots = (extract_root, predictor_dir)
	    for entry in os.listdir(models_root):
	        model_src = os.path.join(models_root, entry)
	        # Plain files directly under models/ are not mirrored.
	        if os.path.isdir(model_src):
	            for base_dir in mirror_roots:
	                _symlink_or_copytree(model_src, os.path.join(base_dir, entry))

	def _extract_first_predictor_zip(repo_id: str, extract_root: str, zip_candidates) -> str:
	    '''Unpack zip candidates from the repo until one yields a predictor directory.

	    Each candidate is downloaded and extracted into ``extract_root``; the
	    directory found there by ``_find_dir_with_any_predictor_marker`` is
	    returned, or "" when no candidate produced one. Failures are printed
	    and the next candidate is tried.
	    '''
	    for fname in zip_candidates:
	        try:
	            zpath = hf_hub_download(repo_id=repo_id, filename=fname)
	            with zipfile.ZipFile(zpath, "r") as zf:
	                zf.extractall(extract_root)
	            found = _find_dir_with_any_predictor_marker(extract_root)
	            if found:
	                return found
	        except Exception as e:
	            print(f"[loader] Zip candidate '{fname}' not usable: {e}")
	    return ""


	def _load_pickled_predictor(repo_id: str, pkl_candidates):
	    '''Try to unpickle a predictor object from the repo.

	    Returns ``(obj, local_dir)`` for the first candidate that unpickles to a
	    ``TabularPredictor`` or to any object with a ``predict`` attribute, or
	    ``None`` when no candidate works. Failures are printed and skipped.
	    '''
	    for fname in pkl_candidates:
	        try:
	            pkl_path = hf_hub_download(repo_id=repo_id, filename=fname)
	            with open(pkl_path, "rb") as f:
	                # SECURITY: unpickling runs arbitrary code from the downloaded
	                # file; only point this loader at a repository you trust.
	                obj = pickle.load(f)
	            if isinstance(obj, TabularPredictor) or hasattr(obj, "predict"):
	                # Mirror structure under a local dir
	                local_dir = os.path.join(os.getcwd(), "ag_predictor_from_pkl")
	                os.makedirs(local_dir, exist_ok=True)
	                # Can't reconstruct 'models' tree automatically here; best effort only.
	                return obj, local_dir
	        except Exception as e:
	            print(f"[loader] PKL candidate '{fname}' not usable: {e}")
	    return None


	def load_predictor_from_hub(repo_id: str) -> Tuple[TabularPredictor, str, str]:
	    '''Download and load the AutoGluon predictor stored in ``repo_id``.

	    Sources are tried in order: known zip file names, a full repo snapshot
	    searched for a predictor directory, then known pickle file names.

	    Args:
	        repo_id: Hub repository id holding the predictor artifacts.

	    Returns:
	        ``(predictor, predictor_dir, extract_root)``. ``extract_root`` is
	        ``<cwd>/ag_predictor_unpack``, recreated empty on every call.

	    Raises:
	        FileNotFoundError: If none of the sources yields a predictor.
	    '''
	    # Extract zip into a known root, then load from the inner predictor dir.
	    extract_root = os.path.join(os.getcwd(), "ag_predictor_unpack")
	    if os.path.exists(extract_root):
	        shutil.rmtree(extract_root)
	    os.makedirs(extract_root, exist_ok=True)

	    zip_candidates = ["cheese_texture_predictor_dir.zip", "predictor_dir.zip", "agModels-predictor.zip"]
	    predictor_dir = _extract_first_predictor_zip(repo_id, extract_root, zip_candidates)

	    if not predictor_dir:
	        # Snapshot and search
	        repo_path = snapshot_download(repo_id=repo_id)
	        predictor_dir = _find_dir_with_any_predictor_marker(repo_path)

	    if not predictor_dir:
	        # Try PKL fallback
	        loaded = _load_pickled_predictor(repo_id, ("cheese_texture_predictor.pkl", "predictor.pkl"))
	        if loaded is None:
	            raise FileNotFoundError("Could not locate an AutoGluon predictor directory.")
	        obj, pkl_dir = loaded
	        return obj, pkl_dir, extract_root

	    # Load predictor with relaxed checks
	    predictor = TabularPredictor.load(
	        predictor_dir,
	        require_version_match=False,
	        require_py_version_match=False,
	        check_packages=False,
	    )

	    # Create flat model layout to satisfy predictors that look for '/extract_root/<model>/model.pkl'
	    _materialize_flat_model_layout(predictor_dir, extract_root)

	    return predictor, predictor_dir, extract_root

	# ===== Load dataset & predictor on startup =====
	df_all = load_cheese_dataset(DATASET_ID)

	# Fail fast if the dataset schema is not the one the UI and model inputs assume.
	expected_cols = {"fat", "origin", "holed", "price", "protein", "texture"}
	missing = expected_cols.difference(df_all.columns)
	if missing:
	    raise ValueError(f"Dataset missing expected columns: {missing}")


	def _column_range(column: str) -> Tuple[float, float]:
	    """Return (min, max) of a dataset column as plain floats."""
	    series = df_all[column]
	    return float(series.min()), float(series.max())


	# Observed ranges; used for slider bounds and for clamping user input.
	FAT_MIN, FAT_MAX = _column_range("fat")
	PRICE_MIN, PRICE_MAX = _column_range("price")
	PROTEIN_MIN, PROTEIN_MAX = _column_range("protein")
	ORIGINS: List[str] = sorted(
	    value for value in df_all["origin"].dropna().unique().tolist() if isinstance(value, str)
	)

	PREDICTOR, PREDICTOR_DIR, EXTRACT_ROOT = load_predictor_from_hub(MODEL_REPO_ID)
	# Prefer the predictor's own label list; otherwise use the labels seen in the dataset.
	CLASSES = list(getattr(PREDICTOR, "class_labels", []))
	if not CLASSES:
	    CLASSES = sorted(df_all["texture"].dropna().unique().tolist())

	# Build base-model name list robustly (reading leaderboard should not require loading submodels)
	MODEL_NAMES = ["best"]
	try:
	    # Newer AutoGluon releases expose model_names(); get_model_names() is the
	    # older, deprecated spelling. Take the first one that is callable.
	    _names_fn = None
	    for _attr in ("model_names", "get_model_names"):
	        _candidate = getattr(PREDICTOR, _attr, None)
	        if callable(_candidate):
	            _names_fn = _candidate
	            break
	    if _names_fn is not None:
	        names = _names_fn()
	        if names:
	            MODEL_NAMES += list(names)
	    else:
	        # Neither accessor exists: fall back to the leaderboard's 'model' column.
	        lb = PREDICTOR.leaderboard(silent=True)
	        if hasattr(lb, "columns") and "model" in lb.columns:
	            MODEL_NAMES += lb["model"].tolist()
	except Exception as e:
	    print("Warning: couldn't fetch base model names; using ['best'] only. Details:", e)

	# --- CSS and helpers ---
	# Stylesheet passed to gr.Blocks(css=...). It targets the component created
	# with elem_id="summary_table" and lets header/body cells wrap and grow
	# instead of being clipped.
	SUMMARY_CSS = '''#summary_table table { table-layout: auto !important; width: 100% !important; }
	#summary_table table th,
	#summary_table table td {
	white-space: normal !important;
	overflow: visible !important;
	text-overflow: clip !important;
	max-width: none !important;
	}
	'''

	def _make_df(**kwargs):
	    """Build a ``gr.Dataframe``, retrying without optional layout arguments.

	    If the constructor raises ``TypeError``, 'wrap' and 'column_widths' are
	    dropped and the constructor is called once more with the rest.
	    """
	    try:
	        return gr.Dataframe(**kwargs)
	    except TypeError:
	        optional_keys = ("wrap", "column_widths")
	        reduced = {key: val for key, val in kwargs.items() if key not in optional_keys}
	        return gr.Dataframe(**reduced)

	def _coerce_and_validate(fat, origin, holed, price, protein, top_k):
	    """Coerce raw UI values into model-ready inputs, warning on every adjustment.

	    Numeric fields are converted to float and clamped to the dataset range;
	    non-numeric or NaN values fall back to the midpoint of that range. An
	    origin outside ``ORIGINS`` is replaced by the first known origin (kept
	    as is when ``ORIGINS`` is empty). ``top_k`` is converted to int
	    (default 3) and limited to ``1..len(CLASSES)``.

	    Returns:
	        ``(fat, origin, holed, price, protein, k)`` with ``holed`` as bool
	        and ``k`` as int.
	    """
	    import math  # local import: only needed for the NaN check below

	    def clamp(val, lo, hi, name):
	        try:
	            v = float(val)
	        except (TypeError, ValueError, OverflowError):
	            v = float("nan")
	        # NaN compares False against both bounds, so without this check it
	        # would pass through unclamped.
	        if math.isnan(v):
	            gr.Warning(f"{name} must be numeric; falling back to dataset default.")
	            return float((lo + hi) / 2.0)
	        if v < lo:
	            gr.Warning(f"{name} below dataset min ({lo:.2f}); clamping to min.")
	            return lo
	        if v > hi:
	            gr.Warning(f"{name} above dataset max ({hi:.2f}); clamping to max.")
	            return hi
	        return v

	    fat_s = clamp(fat, FAT_MIN, FAT_MAX, "fat (g/100g)")
	    price_s = clamp(price, PRICE_MIN, PRICE_MAX, "price (unit)")
	    protein_s = clamp(protein, PROTEIN_MIN, PROTEIN_MAX, "protein (g/100g)")

	    origin_s = origin if origin in ORIGINS else (ORIGINS[0] if ORIGINS else origin)
	    if origin_s != origin:
	        gr.Warning("origin not in dataset categories; resetting to a valid choice.")

	    try:
	        k = int(top_k)
	    except (TypeError, ValueError, OverflowError):
	        gr.Warning("Top‑k must be an integer; using 3.")
	        k = 3
	    k = max(1, min(k, len(CLASSES)))
	    if k != top_k:
	        gr.Info(f"Top‑k adjusted to {k}.")
	    return fat_s, origin_s, bool(holed), price_s, protein_s, k

	def _predict_with_fallback(X, base_model):
	    '''Try requested/best, then non-NN models as fallback.

	    Args:
	        X: Single-row feature frame passed to the predictor.
	        base_model: Model name, or None/""/"best" for the predictor's default.

	    Returns:
	        ``(label_pred, proba_df)`` from the first model that succeeds:
	        the first predicted label and the ``predict_proba`` result.

	    Raises:
	        RuntimeError: If every attempted model fails; the message joins the
	            individual error texts.
	    '''
	    try_order = []
	    if base_model in (None, "", "best"):
	        try_order.append(None)  # best/ensemble
	    else:
	        try_order.append(base_model)
	    # Build a non-NN list (skip NN/WeightedEnsemble)
	    non_nn = [
	        m for m in MODEL_NAMES
	        if m not in (None, "", "best")
	        and not (m.lower().startswith("nn") or "neuralnet" in m.lower() or "weightedensemble" in m.lower())
	    ]
	    try_order.extend([m for m in non_nn if m not in try_order])
	    errors = []
	    for m in try_order:
	        # None means "let the predictor choose", i.e. no model= argument.
	        model_kwargs = {} if m is None else {"model": m}
	        try:
	            label_pred = PREDICTOR.predict(X, **model_kwargs).iloc[0]
	            proba_df = PREDICTOR.predict_proba(X, **model_kwargs)
	            if m is not None:
	                gr.Info(f"Using base model: {m}")
	            return label_pred, proba_df
	        except Exception as e:
	            errors.append(str(e))
	            gr.Warning(f"Model '{m or 'best'}' failed; trying a fallback...")
	    # Plain " | " separator: "\|" is an invalid escape sequence and would
	    # put a literal backslash into the message.
	    raise RuntimeError("All model attempts failed: " + " | ".join(errors))

	def do_predict(fat, origin, holed, price, protein, base_model, output_mode, top_k):
	    """Run one prediction and shape the three UI outputs.

	    Returns:
	        ``(label_text, probability_table, summary_table)``. The label is ""
	        in "Probabilities only" mode and the probability table is empty in
	        "Label only" mode. On any exception a warning is shown and all
	        three outputs are returned empty.
	    """
	    proba_columns = ["texture", "probability (%)"]
	    try:
	        fat_s, origin_s, holed_s, price_s, protein_s, k = _coerce_and_validate(
	            fat, origin, holed, price, protein, top_k
	        )
	        features = {
	            "fat": float(fat_s),
	            "origin": origin_s,
	            "holed": int(1 if holed_s else 0),
	            "price": float(price_s),
	            "protein": float(protein_s),
	        }
	        X = pd.DataFrame([features])

	        label_pred, proba_df = _predict_with_fallback(X, base_model)

	        # Keep the k most probable classes, highest first.
	        ranked = proba_df.iloc[0].sort_values(ascending=False)
	        top = ranked.head(k)
	        proba_table_df = pd.DataFrame({
	            "texture": top.index,
	            "probability (%)": (top.values * 100).round(2),
	        }).reset_index(drop=True)

	        topk_parts = [f"{cls}: {(prob*100):.2f}%" for cls, prob in top.items()]
	        summary_row = {
	            "fat (g/100g)": float(fat_s),
	            "price (unit)": float(price_s),
	            "protein (g/100g)": float(protein_s),
	            "origin": origin_s,
	            "holed (Swiss-style holes)": bool(holed_s),
	            "Base model to use": (base_model or "best"),
	            "Output": output_mode,
	            "Top‑k": int(k),
	            "Predicted texture": str(label_pred),
	            "Top‑k probabilities": ", ".join(topk_parts),
	        }
	        summary_df = pd.DataFrame([summary_row])

	        if output_mode == "Label only":
	            return str(label_pred), pd.DataFrame(columns=proba_columns), summary_df
	        if output_mode == "Probabilities only":
	            return "", proba_table_df, summary_df
	        return str(label_pred), proba_table_df, summary_df
	    except Exception as e:
	        gr.Warning(f"Something went wrong while predicting: {e}")
	        summary_columns = [
	            "fat (g/100g)", "price (unit)", "protein (g/100g)", "origin",
	            "holed (Swiss-style holes)", "Base model to use", "Output", "Top‑k",
	            "Predicted texture", "Top‑k probabilities",
	        ]
	        return "", pd.DataFrame(columns=proba_columns), pd.DataFrame(columns=summary_columns)

	def toggle_visibility(output_mode):
	    """Return three ``gr.update`` objects setting visibility for the chosen output mode.

	    The first is hidden only for "Probabilities only"; the second and third
	    are hidden only for "Label only".
	    """
	    label_visible = output_mode != "Probabilities only"
	    probs_visible = output_mode != "Label only"
	    flags = (label_visible, probs_visible, probs_visible)
	    return tuple(gr.update(visible=flag) for flag in flags)

	# UI definition: inputs and inference options in the first column, results in the second.
	# NOTE(review): nesting was reconstructed from a flattened listing; the summary
	# table is assumed to sit in the results column - confirm against the intended layout.
	with gr.Blocks(title="Cheese Texture Classifier", css=SUMMARY_CSS) as demo:
	    gr.Markdown('''## Cheese Texture (Tabular) — AutoGluon model
	Predicts texture from nutritional/origin features.
	Model: [`rlogh/cheese-texture-autogluon-classifier`](https://huggingface.co/rlogh/cheese-texture-autogluon-classifier).
	Set inputs on the left; results on the right.''')

	    with gr.Row():
	        with gr.Column():
	            fat_slider = gr.Slider(
	                minimum=float(max(0.0, FAT_MIN)),
	                maximum=float(FAT_MAX),
	                step=0.1,
	                value=float(min(30.0, FAT_MAX)),
	                label="fat (g/100g)",
	                info="Fat content per 100g of cheese (from dataset range)"
	            )
	            price_slider = gr.Slider(
	                minimum=float(max(0.0, PRICE_MIN)),
	                maximum=float(PRICE_MAX),
	                step=0.01,
	                value=float(min(3.0, PRICE_MAX)),
	                label="price (unit)",
	                info="Price per unit (dataset units)"
	            )
	            protein_slider = gr.Slider(
	                minimum=float(max(0.0, PROTEIN_MIN)),
	                maximum=float(PROTEIN_MAX),
	                step=0.1,
	                value=float(min(22.0, PROTEIN_MAX)),
	                label="protein (g/100g)",
	                info="Protein content per 100g of cheese (from dataset range)"
	            )
	            # Guard the empty case the same way _coerce_and_validate does;
	            # indexing ORIGINS[0] on an empty list would stop the app at startup.
	            if "Italy" in ORIGINS:
	                default_origin = "Italy"
	            else:
	                default_origin = ORIGINS[0] if ORIGINS else None
	            origin_dd = gr.Dropdown(
	                choices=ORIGINS,
	                value=default_origin,
	                label="origin",
	                info="Country or region of origin"
	            )
	            holed_cb = gr.Checkbox(value=False, label="holed (Swiss-style holes)", info="Typical Emmental-style holes?")

	            gr.Markdown("### Inference parameters")
	            base_model_dd = gr.Dropdown(
	                choices=MODEL_NAMES,
	                value=MODEL_NAMES[0] if MODEL_NAMES else "best",
	                label="Base model to use",
	                info="Pick the best AutoGluon-ensembled model or a specific base model (if available)"
	            )
	            output_mode_radio = gr.Radio(
	                choices=["Label only", "Probabilities only", "Label + Probabilities"],
	                value="Label + Probabilities",
	                label="Output",
	                info="Choose which outputs to display"
	            )
	            topk_slider = gr.Slider(
	                minimum=1,
	                maximum=max(1, len(CLASSES)),
	                step=1,
	                value=min(3, max(1, len(CLASSES))),
	                label="Top‑k probabilities to show",
	                info="How many top classes to show in the probability table"
	            )

	        with gr.Column():
	            pred_label = gr.Textbox(label="Predicted texture", interactive=False, visible=True)
	            proba_table = _make_df(
	                headers=["texture", "probability (%)"],
	                datatype=["str", "number"],
	                row_count=(1, "dynamic"),
	                type="pandas",
	                interactive=False,
	                label="Predicted probabilities (sorted)",
	                visible=True,
	                wrap=True,
	                column_widths=[220, 160]
	            )

	            gr.Markdown("### Summary (inputs and results)")
	            summary_table = _make_df(
	                headers=[
	                    "fat (g/100g)","price (unit)","protein (g/100g)","origin",
	                    "holed (Swiss-style holes)","Base model to use","Output","Top‑k",
	                    "Predicted texture","Top‑k probabilities"
	                ],
	                datatype=[
	                    "number","number","number","str","bool","str","str","number","str","str"
	                ],
	                row_count=(1, "dynamic"),
	                type="pandas",
	                interactive=False,
	                visible=True,
	                elem_id="summary_table",
	                wrap=True,
	                column_widths=[140, 120, 160, 160, 200, 170, 130, 90, 170, 420]
	            )

	    # Order must match do_predict's parameter order and its returned triple.
	    inputs = [fat_slider, origin_dd, holed_cb, price_slider, protein_slider, base_model_dd, output_mode_radio, topk_slider]
	    outputs = [pred_label, proba_table, summary_table]

	    # Sliders re-run the prediction on release; the other controls on change.
	    for s in [fat_slider, price_slider, protein_slider, topk_slider]:
	        s.release(fn=do_predict, inputs=inputs, outputs=outputs)
	    for c in [origin_dd, holed_cb, base_model_dd, output_mode_radio]:
	        c.change(fn=do_predict, inputs=inputs, outputs=outputs)

	    output_mode_radio.change(fn=toggle_visibility, inputs=[output_mode_radio], outputs=[pred_label, proba_table, topk_slider], queue=False)

	    # Run one prediction with the default inputs when the page loads.
	    demo.load(fn=do_predict, inputs=inputs, outputs=outputs)

	    gr.Examples(
	        label="Examples",
	        examples=[
	            [27.8, "Switzerland", True, 2.20, 26.9, "best", "Label + Probabilities", 3],
	            [4.3, "USA", False, 1.31, 11.1, "best", "Label + Probabilities", 3],
	            [29.0, "Italy", False, 5.73, 28.4, "best", "Label + Probabilities", 3],
	            [27.7, "France", False, 4.63, 20.8, "best", "Label + Probabilities", 3],
	        ],
	        inputs=inputs
	    )

	# Start serving the Blocks app built above.
	demo.launch()