Spaces: Running on Zero
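The listing below is the app.py for this Space: it loads the ibm-granite/granite-4.0-h-1b model with transformers and serves it through a Gradio ChatInterface. On a ZeroGPU Space, GPU-backed work has to happen inside a function decorated with @spaces.GPU; the chat_with_model function below carries that decorator so a GPU is attached only while a response is being generated.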
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr
import spaces
import os

# Model configuration
MODEL_PATH = "ibm-granite/granite-4.0-h-1b"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Global variables to store model and tokenizer
tokenizer = None
model = None
def load_model():
    """Load the model and tokenizer"""
    global tokenizer, model
    if tokenizer is None or model is None:
        print("Loading model and tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map=DEVICE)
        model.eval()
        print("Model loaded successfully!")
# Use GPU for inference: on a ZeroGPU Space, @spaces.GPU attaches a GPU only
# while this function is executing
@spaces.GPU
def chat_with_model(message, history):
| """ | |
| Chat function that processes user input and generates responses | |
| Args: | |
| message (str): Current user message | |
| history (list): Previous conversation history | |
| Returns: | |
| str: Model response | |
| """ | |
    try:
        # Load model if not already loaded
        load_model()

        # Prepare chat format
        messages = []

        # Add system message for better performance
        messages.append({
            "role": "system",
            "content": "You are a helpful AI assistant. Provide clear, accurate, and helpful responses."
        })

        # Add conversation history
        for user_msg, assistant_msg in history:
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": assistant_msg})

        # Add current message
        messages.append({"role": "user", "content": message})

        # Apply chat template
        chat = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Tokenize input
        input_tokens = tokenizer(chat, return_tensors="pt").to(DEVICE)
        # Generate response
        with torch.no_grad():
            output = model.generate(
                **input_tokens,
                max_new_tokens=200,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )
        # Decode response (special tokens are kept so the role markers can be located)
        full_response = tokenizer.batch_decode(output)[0]

        # Extract only the assistant's response.
        # Use rfind so the newly generated assistant turn is taken rather than an
        # earlier assistant message replayed from the conversation history.
        assistant_start = full_response.rfind('<|start_of_role|>assistant<|end_of_role|>')
        if assistant_start != -1:
            assistant_start += len('<|start_of_role|>assistant<|end_of_role|>')
            assistant_response = full_response[assistant_start:].strip()
        else:
            # Fallback if the Granite role markers are not present
            response_start = full_response.rfind('<|assistant|>')
            if response_start != -1:
                response_start += len('<|assistant|>')
                assistant_response = full_response[response_start:].strip()
            else:
                assistant_response = full_response.strip()
        # Clean up the response - remove end markers
        assistant_response = assistant_response.replace('<|endoftext|>', '').replace('<|end_of_text|>', '').strip()

        return assistant_response

    except Exception as e:
        print(f"Error generating response: {e}")
        return f"I apologize, but I encountered an error: {str(e)}. Please try again."


def clear_chat():
    """Clear the chat history"""
    return []
# Create the Gradio chat interface
def create_chat_app():
    with gr.Blocks(title="IBM Granite Chat", css="""
        .header {
            text-align: center;
            padding: 10px;
            background: linear-gradient(90deg, #0066cc, #004499);
            color: white;
            margin-bottom: 20px;
            border-radius: 10px;
        }
        .header a {
            color: #ffffff;
            text-decoration: none;
            font-weight: bold;
        }
        .header a:hover {
            text-decoration: underline;
        }
    """) as demo:
        # Header with attribution
        gr.HTML("""
        <div class="header">
            <h1>IBM Granite 4.0 Chat</h1>
            <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">anycoder</a></p>
        </div>
        """)
        # Chat interface
        chatbot = gr.ChatInterface(
            fn=chat_with_model,
            title="Chat with IBM Granite 4.0",
            description="Chat with the IBM Granite 4.0 1B parameter language model. Ask questions, get help, or have a conversation!",
            examples=[
                "What is machine learning?",
                "Explain quantum computing in simple terms",
                "How can I improve my programming skills?",
                "What are the latest developments in AI?",
                "Tell me about IBM Research"
            ],
        )
        # Additional info
        with gr.Accordion("Model Information", open=False):
            gr.Markdown(f"""
            ## Model Details
            - **Model**: {MODEL_PATH}
            - **Parameters**: 1B
            - **Device**: {DEVICE.upper()}
            - **Max Tokens**: 200 per response
            - **Temperature**: 0.7 (for balanced creativity and accuracy)

            ## Tips
            - Ask specific questions for better results
            - The model works best with clear, concise prompts
            - Try asking follow-up questions to dive deeper into topics
            - The model can help with programming, explanations, and general knowledge
            """)

    return demo
if __name__ == "__main__":
    # Create and launch the app
    app = create_chat_app()

    # Launch configuration
    app.launch()
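
For a quick check without opening the UI, the chat function can also be called directly. The snippet below is a minimal sketch, assuming the listing above is saved as app.py (the Spaces convention) and that torch, transformers, gradio, and spaces are installed; outside a ZeroGPU Space the @spaces.GPU decorator is expected to be a no-op, so the call simply runs on whatever DEVICE resolves to.

# local_test.py - illustrative smoke test for the chat function (hypothetical file name)
from app import chat_with_model

# An empty history means this is the first turn of the conversation
reply = chat_with_model("What is machine learning?", [])
print(reply)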