from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from huggingface_hub import snapshot_download
from safetensors.torch import load_file


class ModelInput(BaseModel):
    prompt: str
    max_new_tokens: int = 50


app = FastAPI()

# Define model paths
base_model_path = "HuggingFaceTB/SmolLM2-135M-Instruct"
adapter_path = "khurrameycon/SmolLM-135M-Instruct-qa_pairs_converted.json-25epochs"

try:
    # First load the base model
    print("Loading base model...")
    model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype=torch.float16,
        trust_remote_code=True,
        device_map="auto",
    )

    # Load tokenizer from base model
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(base_model_path)

    # Download adapter weights
    print("Downloading adapter weights...")
    adapter_path_local = snapshot_download(adapter_path)

    # Load the safetensors file
    print("Loading adapter weights...")
    state_dict = load_file(f"{adapter_path_local}/adapter_model.safetensors")

    # Load state dict into model
    model.load_state_dict(state_dict, strict=False)
    print("Model and adapter loaded successfully!")
except Exception as e:
    print(f"Error during model loading: {e}")
    raise


def generate_response(model, tokenizer, instruction, max_new_tokens=128):
    """Generate a response from the model based on an instruction."""
    try:
        messages = [{"role": "user", "content": instruction}]
        input_text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer.encode(input_text, return_tensors="pt").to(model.device)
        outputs = model.generate(
            inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.2,
            top_p=0.9,
            do_sample=True,
        )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response
    except Exception as e:
        raise ValueError(f"Error generating response: {e}")
@app.post("/generate")
async def generate_text(input: ModelInput):
    try:
        response = generate_response(
            model=model,
            tokenizer=tokenizer,
            instruction=input.prompt,
            max_new_tokens=input.max_new_tokens,
        )
        return {"generated_text": response}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/")
async def root():
    return {"message": "Welcome to the Model API!"}