Spaces:

Schrieffer
/

SARM-Demo

Running on Zero

App Files Community

Schrieffer2sy committed 11 days ago

Commit

1748050

1 Parent(s): 05a9ebf

init

Browse files

Files changed (3) hide show

.gitattributes +1 -0
app.py +43 -19
assets/framework-v4.png +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -3,16 +3,16 @@ import torch
 from transformers import AutoTokenizer
 from sarm_llama import LlamaSARM
-# --- 1. 加载模型和Tokenizer ---
-# 这一步会自动从Hugging Face Hub下载你的模型文件
-# 确保你的模型仓库是公开的
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MODEL_ID = "schrieffer/SARM-4B"
 print(f"Loading model: {MODEL_ID} on {DEVICE}...")
-# 加载模型时必须信任远程代码，因为SARM有自定义架构
 model = LlamaSARM.from_pretrained(
     MODEL_ID,
     sae_hidden_state_source_layer=16,
@@ -26,18 +26,18 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
 print("Model loaded successfully!")
-# --- 2. 定义推理函数 ---
-# 这个函数会被Gradio调用
 def get_reward_score(prompt: str, response: str) -> float:
     """
-    接收prompt和response，返回SARM模型计算出的奖励分数。
     """
     if not prompt or not response:
         return 0.0
     try:
-        # 使用与模型训练时相同的聊天模板
         messages = [{"role": "user", "content": prompt}, {"role": "assistant", "content": response}]
         input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to(DEVICE)
@@ -47,20 +47,43 @@ def get_reward_score(prompt: str, response: str) -> float:
         return round(score, 4)
     except Exception as e:
         print(f"Error: {e}")
-        # 在界面上返回一个错误提示可能更好，但这里我们简单返回0
         return 0.0
-# --- 3. 创建并启动Gradio界面 ---
-# 使用gr.Blocks()可以获得更灵活的布局
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
-        # SARM-4B: Interpretable Reward Model Demo
-        This is an interactive demo for the SARM-4B model, an interpretable reward model enhanced by a Sparse Autoencoder.
-        Enter a prompt (question) and a corresponding response below to get a reward score. A higher score indicates a better quality response according to the model.
-        For more details, check out our [Tech Report](https://arxiv.org/abs/submit/6699218) and [Model Card](https://huggingface.co/schrieffer/SARM-4B).
         """
     )
@@ -71,7 +94,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     calculate_btn = gr.Button("Calculate Reward Score", variant="primary")
     score_output = gr.Number(label="Reward Score", info="A higher score is better.")
-    # 定义按钮点击时的行为
     calculate_btn.click(
         fn=get_reward_score,
         inputs=[prompt_input, response_input],
@@ -88,8 +111,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         inputs=[prompt_input, response_input],
         outputs=score_output,
         fn=get_reward_score,
-        cache_examples=True # 缓存示例结果，加快加载速度
     )
-# 启动应用
-demo.launch()

 from transformers import AutoTokenizer
 from sarm_llama import LlamaSARM
+# --- 1. Load Model and Tokenizer ---
+# This step automatically downloads your model files from the Hugging Face Hub.
+# Ensure your model repository is public.
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MODEL_ID = "schrieffer/SARM-4B"
 print(f"Loading model: {MODEL_ID} on {DEVICE}...")
+# trust_remote_code=True is required because SARM has a custom architecture.
 model = LlamaSARM.from_pretrained(
     MODEL_ID,
     sae_hidden_state_source_layer=16,
 print("Model loaded successfully!")
+# --- 2. Define the Inference Function ---
+# This function will be called by Gradio.
 def get_reward_score(prompt: str, response: str) -> float:
     """
+    Receives a prompt and a response, and returns the reward score calculated by the SARM model.
     """
     if not prompt or not response:
         return 0.0
     try:
+        # Use the same chat template as used during model training.
         messages = [{"role": "user", "content": prompt}, {"role": "assistant", "content": response}]
         input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to(DEVICE)
         return round(score, 4)
     except Exception as e:
         print(f"Error: {e}")
+        # It might be better to return an error message on the UI, but here we simply return 0.
         return 0.0
+# --- 3. Create and Launch the Gradio Interface ---
+# Use gr.Blocks() for a more flexible layout.
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
+        # SARM: Interpretable Reward Model Demo
+        This is an interactive demo for the **SARM-4B** model (Sparse Autoencoder-enhanced Reward Model).
+        SARM is a novel reward model architecture that enhances interpretability by integrating a pretrained Sparse Autoencoder (SAE). It maps the internal hidden states of a large language model into a sparse and human-understandable feature space, making the resulting reward scores transparent and conceptually meaningful.
+        **How to use this Demo:**
+        1.  Enter a **Prompt** (e.g., a question) in the left textbox below.
+        2.  Enter a corresponding **Response** in the right textbox.
+        3.  Click the "Calculate Reward Score" button.
+        The model will output a scalar score that evaluates the quality of the response. **A higher score indicates that the SARM model considers the response to be of better quality.**
+        ---
+        *SARM Architecture*
+        ![](https://huggingface.co/schrieffer/SARM-4B/resolve/main/sarm-framework.png?raw=true)
+        + **Authors** (* indicates equal contribution)
+            Shuyi Zhang\*, Wei Shi\*, Sihang Li\*, Jiayi Liao, Tao Liang, Hengxing Cai, Xiang Wang
+        + **Paper**: [Interpretable Reward Model via Sparse Autoencoder](https://arxiv.org/abs/2508.08746)
+        + **Model**: [schrieffer/SARM-4B](https://huggingface.co/schrieffer/SARM-4B)
+            + Finetuned from model: [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
+        + **Code Repository:** [https://github.com/schrieffer-z/sarm](https://github.com/schrieffer-z/sarm)
         """
     )
     calculate_btn = gr.Button("Calculate Reward Score", variant="primary")
     score_output = gr.Number(label="Reward Score", info="A higher score is better.")
+    # Define the button's click behavior.
     calculate_btn.click(
         fn=get_reward_score,
         inputs=[prompt_input, response_input],
         inputs=[prompt_input, response_input],
         outputs=score_output,
         fn=get_reward_score,
+        cache_examples=True # Cache the results of the examples to speed up loading.
     )
+# Launch the application.
+demo.launch()

assets/framework-v4.png ADDED Viewed

Git LFS Details

SHA256: 60a2e71aff1390a841c34a3f0c17290388251a61ff5395bed240047105cbed40
Pointer size: 131 Bytes
Size of remote file: 712 kB