import gradio as gr
from huggingface_hub import InferenceClient
import os

# Get your Hugging Face token from environment variables
HF_TOKEN = os.getenv("HF_TOKEN")

# Initialize the inference client with a code-specialized instruct model
client = InferenceClient(
    model="Qwen/Qwen2.5-Coder-1.5B-Instruct",  # Qwen2.5-Coder is tuned for code generation
    token=HF_TOKEN
)
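
# Note (assumption): this expects HF_TOKEN to be set in the environment,
# e.g. as a Space secret or an exported shell variable. If it is unset,
# the client may fall back to a locally saved token or unauthenticated
# requests, which can be rate-limited or rejected for gated models.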

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """
    Generate coding-focused responses using the selected model.
    
    Args:
        message (str): The current user input message
        history (list): List of previous conversation turns
        system_message (str): System prompt to guide the model's behavior
        max_tokens (int): Maximum number of tokens to generate
        temperature (float): Controls randomness in generation
        top_p (float): Controls nucleus sampling
    """
    # Format the conversation history into messages
    messages = [{"role": "system", "content": system_message}]
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})
    messages.append({"role": "user", "content": message})
    
    response = ""
    # Stream the response tokens
    for message in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = message.choices[0].delta.content
        response += token
        yield response

# Example prompts for the chat box
example_prompts = [
    "Write a Python function to find the longest palindromic substring",
    "Create a React component that displays a color picker",
    "How do I implement quicksort in JavaScript?",
    "Explain the difference between Promise.all and Promise.allSettled in JavaScript",
    "Generate a Python script to download and process CSV data from an API"
]

# ChatInterface expects each example as a list (one entry per input)
examples = [[prompt] for prompt in example_prompts]

# Create the Gradio interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            value="You are an expert coding assistant. Provide detailed, correct, and efficient code solutions with explanations.",
            label="System message"
        ),
        gr.Slider(
            minimum=1,
            maximum=2048,
            value=1024,
            step=1,
            label="Max new tokens"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.5,
            step=0.1,
            label="Temperature"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.9,
            step=0.05,
            label="Top-p (nucleus sampling)"
        ),
    ],
    title="Coding Expert Assistant",
    description="A specialized coding assistant powered by StarCoder2, a model trained on code repositories",
    examples=examples
)

if __name__ == "__main__":
    demo.launch(share=True)
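
# To run locally (assuming this file is saved as app.py and gradio plus
# huggingface_hub are installed):
#   HF_TOKEN=<your token> python app.py
# On Hugging Face Spaces, share=True is not needed; demo.launch() is enough.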