transformers + openai_gpt_oss on modal to run
#51
by weege007
import io
import math
import requests
import os
import asyncio
import subprocess
from threading import Thread
from time import perf_counter, time
import modal
app = modal.App("openai_gpt_oss")
IMAGE_GPU = os.getenv("IMAGE_GPU", None)
img = (
# https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda/tags
modal.Image.from_registry(
"nvcr.io/nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04",
add_python="3.10",
)
.apt_install("git", "git-lfs")
.run_commands(
# Install transformers, accelerate, as well as the Triton kernels for MXFP4 compatibility
"pip install -U transformers accelerate torch kernels",
)
.run_commands(
"pip install -U git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels",
)
.pip_install("openai-harmony")
.env(
{
"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
# "TQDM_DISABLE": "1",
"LLM_MODEL": os.getenv("LLM_MODEL", "openai/gpt-oss-20b"),
}
)
)
if IMAGE_GPU in ["H100", "B100", "H200", "B200"]:
img = img.pip_install("triton==3.4.0")
else:
img = img.pip_install("triton==3.3.1")
HF_MODEL_DIR = "/root/.achatbot/models"
hf_model_vol = modal.Volume.from_name("models", create_if_missing=True)
with img.imports():
import torch
from transformers import (
pipeline,
AutoModelForCausalLM,
AutoTokenizer,
TextIteratorStreamer,
AutoConfig,
)
MODEL_PATH = os.getenv("LLM_MODEL", "openai/gpt-oss-20b")
model_path = os.path.join(HF_MODEL_DIR, MODEL_PATH)
def print_model_params(model: torch.nn.Module, extra_info="", f=None):
# print the number of parameters in the model
model_million_params = sum(p.numel() for p in model.parameters()) / 1e6
print(model, file=f)
print(f"{extra_info} {model_million_params} M parameters", file=f)
@app.function(
gpu=IMAGE_GPU,
cpu=2.0,
retries=1,
image=img,
volumes={
HF_MODEL_DIR: hf_model_vol,
},
timeout=1200, # default 300s
scaledown_window=1200,
max_containers=1,
)
async def run(func, **kwargs):
subprocess.run("nvidia-smi --version", shell=True)
subprocess.run("nvcc --version", shell=True)
if torch.cuda.is_available():
gpu_prop = torch.cuda.get_device_properties("cuda")
print(gpu_prop)
if asyncio.iscoroutinefunction(func):
await func(**kwargs)
else:
func(**kwargs)
def dump_model(**kwargs):
for model_name in [
"openai/gpt-oss-20b",
"openai/gpt-oss-120b",
]:
cur_model_path = os.path.join(HF_MODEL_DIR, model_name)
model = AutoModelForCausalLM.from_pretrained(
cur_model_path,
torch_dtype="auto",
device_map="auto",
)
model = model.eval()
print(f"{model.config=}")
tokenizer = AutoTokenizer.from_pretrained(cur_model_path)
file_path = os.path.join(cur_model_path, "model.txt")
with open(file_path, "w") as f:
print(f"text tokenizer: {tokenizer}", file=f)
print_model_params(model, f"{model_name}", f)
del model
torch.cuda.empty_cache()
def tokenize(**kwargs):
tokenizer = AutoTokenizer.from_pretrained(model_path)
reasoning = kwargs.get("reasoning", "medium")
model_identity = kwargs.get(
"model_identity", "You are ChatGPT, a large language model trained by OpenAI."
)
device = "cuda" if torch.cuda.is_available() else "cpu"
messages = [
{"role": "user", "content": "Explain what MXFP4 quantization is."},
]
inputs = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
return_dict=True,
return_tensors="pt",
reasoning_effort=reasoning,
model_identity=model_identity,
).to(device)
for key, value in inputs.items():
print(f"{key}: {value.shape=}")
input_ids = inputs["input_ids"]
print(input_ids)
prompt = tokenizer.decode(input_ids[0])
print(f"{prompt=}")
def pipe(**kwargs):
pipe = pipeline(
"text-generation",
model=model_path,
torch_dtype="auto",
device_map="auto",
)
messages = [
{"role": "user", "content": "Explain quantum mechanics clearly and concisely."},
]
outputs = pipe(
messages,
max_new_tokens=256,
)
print(outputs[0]["generated_text"][-1])
def generate(**kwargs):
tokenizer = AutoTokenizer.from_pretrained(model_path)
gpu_prop = torch.cuda.get_device_properties("cuda")
model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype="auto",
device_map="auto",
attn_implementation="kernels-community/vllm-flash-attn3" if gpu_prop.major > 8 else None,
)
messages = [
{"role": "user", "content": "Explain what MXFP4 quantization is."},
]
inputs = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
return_tensors="pt",
return_dict=True,
).to(model.device)
print(inputs)
outputs = model.generate(**inputs, max_new_tokens=128, temperature=0.7)
print(tokenizer.decode(outputs[0]))
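# Hedged addition (not in the original): also decode only the newly generated tokens,
# slicing off the rendered prompt so just the model's reply is printed.
new_token_ids = outputs[0][inputs["input_ids"].shape[-1]:]
print(tokenizer.decode(new_token_ids, skip_special_tokens=True))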
def generate_stream(**kwargs):
gpu_prop = torch.cuda.get_device_properties("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype="auto",
device_map="auto",
attn_implementation="kernels-community/vllm-flash-attn3" if gpu_prop.major > 8 else None,
)
reasoning = kwargs.get("reasoning", "medium")
model_identity = kwargs.get(
"model_identity", "You are ChatGPT, a large language model trained by OpenAI."
)
messages = [
{"role": "user", "content": "Explain what MXFP4 quantization is."},
]
for i in range(2):
inputs = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
return_dict=True,
return_tensors="pt",
reasoning_effort=reasoning,
model_identity=model_identity,
).to(model.device)
for key, value in inputs.items():
print(f"{key}: {value.shape=}")
input_ids = inputs["input_ids"]
prompt = tokenizer.decode(input_ids[0])
print(f"{prompt=}")
# kv cache
# cache_position = torch.arange(input_ids.shape[1], dtype=torch.int64, device=model.device)
# past_key_values = DynamicCache()
streamer = TextIteratorStreamer(
tokenizer=tokenizer, skip_prompt=True, skip_special_tokens=True
)
generation_kwargs = dict(
**inputs,
# do_sample=False,
# cache_position=cache_position,
# past_key_values=past_key_values,
# https://huggingface.co/docs/transformers/kv_cache
# https://huggingface.co/docs/transformers/cache_explanation
cache_implementation="dynamic",
# cache_implementation="offloaded",
do_sample=True,
temperature=0.6,
top_k=10,
top_p=0.95,
# num_beams=1,
repetition_penalty=1.1,
max_new_tokens=2048,
use_cache=True,
streamer=streamer,
)
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
generated_text = ""
start = perf_counter()
times = []
with torch.inference_mode():
for new_text in streamer:
times.append(perf_counter() - start)
print(new_text, end="", flush=True)
generated_text += new_text
start = perf_counter()
print(f"\n{i}. {generated_text=} TTFT: {times[0]:.2f}s total time: {sum(times):.2f}s")
def openai_harmony_stream_decode_unicode(**kwargs):
from openai_harmony import (
HarmonyEncodingName,
load_harmony_encoding,
StreamableParser,
Role,
)
encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
# StreamableParser parses and decodes tokens incrementally as the model generates them.
# This is useful, for example, to stream output and handle unicode characters during decoding;
# see collect_final_channel_text() sketched after this function for a channel-aware variant.
stream = StreamableParser(encoding, role=Role.ASSISTANT)
tokens = [
200005,
35644,
200008,
1844,
31064,
25,
392,
4827,
382,
220,
17,
659,
220,
17,
16842,
12295,
81645,
13,
51441,
6052,
13,
200007,
200006,
173781,
200005,
17196,
200008,
17,
659,
220,
17,
314,
220,
19,
13,
200002,
]
for token in tokens:
stream.process(token)
print("--------------------------------")
print("current_role", stream.current_role)
print("current_channel", stream.current_channel)
print("last_content_delta", stream.last_content_delta)
print("current_content_type", stream.current_content_type)
print("current_recipient", stream.current_recipient)
print("current_content", stream.current_content)
def openai_harmony_generate_tool(**kwargs):
import json
from openai_harmony import (
Author,
HarmonyEncodingName,
load_harmony_encoding,
Conversation,
Message,
Role,
SystemContent,
DeveloperContent,
ToolDescription,
)
reasoning: str = kwargs.get("reasoning", "medium")
model_identity = kwargs.get(
"model_identity", "You are ChatGPT, a large language model trained by OpenAI."
)
encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
# Build conversation
# https://cookbook.openai.com/articles/openai-harmony
system_message = (
SystemContent.new()
.with_model_identity(model_identity)
# built-in tools
# .with_browser_tool()
# .with_python_tool()
.with_reasoning_effort(reasoning[0].upper() + reasoning[1:])
.with_conversation_start_date(
"2025-06-28"
) # NOTE: to reuse more of the KV cache, keep the system message fixed across turns
.with_knowledge_cutoff("2024-06")
.with_required_channels(["analysis", "commentary", "final"])
)
# https://cookbook.openai.com/articles/openai-harmony#function-calling
developer_message = (
DeveloperContent.new()
.with_instructions("Always respond in riddles")
.with_function_tools(
[
ToolDescription.new(
"get_location",
"Gets the location of the user.",
),
ToolDescription.new(
"get_current_weather",
"Gets the current weather in the provided location.",
parameters={
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"default": "celsius",
},
},
"required": ["location"],
},
),
]
)
)
convo = Conversation.from_messages(
[
Message.from_role_and_content(Role.SYSTEM, system_message),
Message.from_role_and_content(Role.DEVELOPER, developer_message),
Message.from_role_and_content(Role.USER, "What is the weather in Tokyo?"),
Message.from_role_and_content(
Role.ASSISTANT,
'User asks: "What is the weather in Tokyo?" We need to use get_weather tool.',
).with_channel("analysis"),
Message.from_role_and_content(Role.ASSISTANT, '{"location": "Tokyo"}')
.with_channel("commentary")
.with_recipient("functions.get_weather")
.with_content_type("json"),
Message.from_author_and_content(
Author.new(Role.TOOL, "functions.lookup_weather"),
'{ "temperature": 20, "sunny": true }',
)
.with_recipient("assistant")
.with_channel("commentary"),
]
)
prefill_ids: list = encoding.render_conversation_for_completion(convo, Role.ASSISTANT)
print(f"{prefill_ids=}")
tokenizer = AutoTokenizer.from_pretrained(model_path)
prefill_tokens = tokenizer.decode(prefill_ids)
print(f"{prefill_tokens=}")
stop_token_ids = encoding.stop_tokens_for_assistant_actions()
print(f"{stop_token_ids=}")
# Load model
gpu_prop = torch.cuda.get_device_properties("cuda")
model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype="auto",
device_map="auto",
attn_implementation="kernels-community/vllm-flash-attn3" if gpu_prop.major > 8 else None,
)
# Generate
outputs = model.generate(
input_ids=torch.tensor([prefill_ids], dtype=torch.int, device=model.device),
max_new_tokens=128 if reasoning == "low" else 2048,
eos_token_id=stop_token_ids,
)
print(outputs)
tokenizer = AutoTokenizer.from_pretrained(model_path)
output_tokens = tokenizer.decode(outputs[0])
print(f"{output_tokens=}")
"""output_tokens=
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: 2025-06-28
Reasoning: low
# Valid channels: analysis, commentary, final. Channel must be included for every message.
Calls to these tools must go to the commentary channel: \'functions\'.<|end|><|start|>developer<|message|># Instructions
Always respond in riddles
# Tools
## functions
namespace functions {
// Gets the location of the user.
type get_location = () => any;
// Gets the current weather in the provided location.
type get_current_weather = (_: {
// The city and state, e.g. San Francisco, CA
location: string,
format?: "celsius" | "fahrenheit", // default: celsius
}) => any;
} // namespace functions<|end|><|start|>user<|message|>What is the weather in Tokyo?<|end|><|start|>assistant<|channel|>analysis<|message|>User asks: "What is the weather in Tokyo?" We need to use get_weather tool.<|end|><|start|>assistant to=functions.get_weather<|channel|>commentary json<|message|>{"location": "Tokyo"}<|call|><|start|>functions.lookup_weather to=assistant<|channel|>commentary<|message|>{ "temperature": 20, "sunny": true }<|end|><|start|>assistant<|channel|>final<|message|>In the land where cranes dance, the sky wears a warm, sunlit smile, about twenty degrees, with clouds whispering few secrets.<|return|>
"""
# Parse completion tokens
completion_ids = outputs[0][len(prefill_ids) :]
print(f"{completion_ids=}")
# Parse the completion tokens back into messages.
# Do not include the stop token when parsing.
parsed_response = encoding.parse_messages_from_completion_tokens(completion_ids, Role.ASSISTANT)
for message in parsed_response:
print(json.dumps(message.to_dict(), indent=2))
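# Hedged sketch (an addition, not part of the original): one way to continue the
# tool-calling loop once the model emits a commentary-channel function call. The tool
# name and result JSON below are placeholders; the Message/Author builders mirror the
# conversation constructed above.
def append_tool_result(messages: list, tool_name: str, result_json: str):
    from openai_harmony import Author, Message, Role

    # e.g. tool_name="functions.get_weather", result_json='{"temperature": 20, "sunny": true}'
    messages.append(
        Message.from_author_and_content(Author.new(Role.TOOL, tool_name), result_json)
        .with_recipient("assistant")
        .with_channel("commentary")
    )
    # Re-render with encoding.render_conversation_for_completion(
    #     Conversation.from_messages(messages), Role.ASSISTANT) and call generate again.
    return messages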
def openai_harmony_generate(**kwargs):
import json
from openai_harmony import (
HarmonyEncodingName,
load_harmony_encoding,
Conversation,
Message,
Role,
SystemContent,
DeveloperContent,
ReasoningEffort,
)
reasoning = kwargs.get("reasoning", "medium")
model_identity = kwargs.get(
"model_identity", "You are ChatGPT, a large language model trained by OpenAI."
)
encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
# Build conversation
# https://cookbook.openai.com/articles/openai-harmony
system_message = (
SystemContent.new()
.with_model_identity(model_identity)
.with_reasoning_effort(reasoning[0].upper() + reasoning[1:])
.with_conversation_start_date("2025-06-28")
.with_knowledge_cutoff("2024-06")
.with_required_channels(["analysis", "commentary", "final"])
)
convo = Conversation.from_messages(
[
Message.from_role_and_content(Role.SYSTEM, system_message),
Message.from_role_and_content(
Role.DEVELOPER,
DeveloperContent.new().with_instructions("Always respond in riddles"),
),
Message.from_role_and_content(Role.USER, "What is the weather like in SF?"),
]
)
# Render prompt
prefill_ids = encoding.render_conversation_for_completion(convo, Role.ASSISTANT)
print(f"{prefill_ids=}")
stop_token_ids = encoding.stop_tokens_for_assistant_actions()
print(f"{stop_token_ids=}")
# Load model
gpu_prop = torch.cuda.get_device_properties("cuda")
model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype="auto",
device_map="auto",
attn_implementation="kernels-community/vllm-flash-attn3" if gpu_prop.major > 8 else None,
)
# Generate
outputs = model.generate(
input_ids=torch.tensor([prefill_ids], dtype=torch.int, device=model.device),
max_new_tokens=128,
eos_token_id=stop_token_ids,
)
print(outputs)
tokenizer = AutoTokenizer.from_pretrained(model_path)
output_tokens = tokenizer.decode(outputs[0])
print(f"{output_tokens=}")
# Parse completion tokens
completion_ids = outputs[0][len(prefill_ids) :]
print(f"{completion_ids=}")
entries = encoding.parse_messages_from_completion_tokens(completion_ids, Role.ASSISTANT)
print(entries)
for message in entries:
print(json.dumps(message.to_dict(), indent=2))
def split_model(**kwargs):
device_map = {}
world_size = torch.cuda.device_count()
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
print(config)
num_layers = config.num_hidden_layers
num_layers_per_gpu = math.ceil(num_layers / world_size)
num_layers_per_gpus = [num_layers_per_gpu] * world_size
# GPU 0 also hosts the embeddings, final norm and lm_head, so give it ~5% fewer
# decoder layers and spread the difference across the remaining GPUs.
num_layers_per_gpus[0] = int(num_layers_per_gpu * 0.95)
for i in range(world_size - 1):
num_layers_per_gpus[i + 1] = num_layers_per_gpu + int(num_layers_per_gpu * (1 - 0.95))
print(num_layers_per_gpus)
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpus):
for j in range(num_layer):
if layer_cnt >= num_layers:
break
device_map[f"model.layers.{layer_cnt}"] = i
layer_cnt += 1
device_map["model.embed_tokens"] = 0
device_map["model.norm"] = 0
device_map["model.rotary_emb"] = 0
device_map["lm_head"] = 0
device_map[f"model.layers.{num_layers - 1}"] = 0
print(device_map)
return device_map
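# Hedged sanity check (an addition, not part of the original): confirm the hand-built
# device_map assigns every decoder layer before passing it to from_pretrained. Assumes
# the checkpoint names decoder layers "model.layers.<i>", as split_model() does above.
def check_device_map(device_map: dict, num_layers: int):
    missing = [i for i in range(num_layers) if f"model.layers.{i}" not in device_map]
    assert not missing, f"layers without a device assignment: {missing}"
    modules_per_gpu = {}
    for module, gpu in device_map.items():
        modules_per_gpu[gpu] = modules_per_gpu.get(gpu, 0) + 1
    print(f"modules per GPU: {modules_per_gpu}")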
def multi_gpu_generate(**kwargs):
tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left")
device_map = split_model()
gpu_prop = torch.cuda.get_device_properties("cuda")
model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype="auto",
attn_implementation="kernels-community/vllm-flash-attn3" if gpu_prop.major > 8 else None,
device_map=device_map,
)
reasoning = kwargs.get("reasoning", "medium")
model_identity = kwargs.get(
"model_identity", "You are ChatGPT, a large language model trained by OpenAI."
)
messages = [
{
"role": "user",
"content": "Explain how expert parallelism works in large language models.",
}
]
inputs = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
return_tensors="pt",
return_dict=True,
reasoning_effort=reasoning,
model_identity=model_identity,
).to(model.device)
for key, value in inputs.items():
print(f"{key}: {value.shape=}")
input_ids = inputs["input_ids"]
prompt = tokenizer.decode(input_ids[0])
print(f"{prompt=}")
start = perf_counter()
outputs = model.generate(**inputs, max_new_tokens=1000)
print(f"generate cost:{(perf_counter() - start):.3f} s")
# Decode and print
response = tokenizer.decode(outputs[0])
print(f"{response=}")
print("Model response:", response.split("<|channel|>final<|message|>")[-1].strip())
"""
- https://cookbook.openai.com/articles/gpt-oss/run-transformers
- https://cookbook.openai.com/articles/openai-harmony
- https://github.com/huggingface/transformers/releases/tag/v4.55.0
- https://huggingface.co/blog/welcome-openai-gpt-oss
- https://huggingface.co/openai/gpt-oss-20b/blob/main/chat_template.jinja
- https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/models/gpt_oss/modeling_gpt_oss.py
- https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/integrations/mxfp4.py
# NOTE: you can pair a text tokenizer library (tiktoken via openai-harmony, or the HF tokenizers lib) with the gpt-oss LLM generator
modal run src/download_models.py --repo-ids "openai/gpt-oss-20b"
modal run src/download_models.py --repo-ids "openai/gpt-oss-120b"
modal run src/llm/transformers/openai_gpt_oss.py --task tokenize --reasoning low
modal run src/llm/transformers/openai_gpt_oss.py --task tokenize --reasoning medium
modal run src/llm/transformers/openai_gpt_oss.py --task tokenize --reasoning high
modal run src/llm/transformers/openai_gpt_oss.py --task tokenize --reasoning low --model-identity "you are a helpful assistant."
IMAGE_GPU=L40s modal run src/llm/transformers/openai_gpt_oss.py --task dump_model
IMAGE_GPU=H100 LLM_MODEL="openai/gpt-oss-120b" modal run src/llm/transformers/openai_gpt_oss.py --task dump_model
IMAGE_GPU=L40s modal run src/llm/transformers/openai_gpt_oss.py --task pipe
IMAGE_GPU=H100 modal run src/llm/transformers/openai_gpt_oss.py --task pipe
IMAGE_GPU=L40s modal run src/llm/transformers/openai_gpt_oss.py --task generate
IMAGE_GPU=H100 modal run src/llm/transformers/openai_gpt_oss.py --task generate
IMAGE_GPU=L40s modal run src/llm/transformers/openai_gpt_oss.py --task generate_stream --reasoning low
IMAGE_GPU=L40s modal run src/llm/transformers/openai_gpt_oss.py --task generate_stream --reasoning medium
IMAGE_GPU=L40s modal run src/llm/transformers/openai_gpt_oss.py --task generate_stream --reasoning high
IMAGE_GPU=H100 modal run src/llm/transformers/openai_gpt_oss.py --task generate_stream --reasoning low
IMAGE_GPU=H100 modal run src/llm/transformers/openai_gpt_oss.py --task generate_stream --reasoning medium
IMAGE_GPU=H100 modal run src/llm/transformers/openai_gpt_oss.py --task generate_stream --reasoning high
modal run src/llm/transformers/openai_gpt_oss.py --task openai_harmony_stream_decode_unicode
IMAGE_GPU=L40s modal run src/llm/transformers/openai_gpt_oss.py --task openai_harmony_generate --reasoning low
IMAGE_GPU=L40s modal run src/llm/transformers/openai_gpt_oss.py --task openai_harmony_generate --reasoning medium
IMAGE_GPU=L40s modal run src/llm/transformers/openai_gpt_oss.py --task openai_harmony_generate --reasoning high
IMAGE_GPU=L40s modal run src/llm/transformers/openai_gpt_oss.py --task openai_harmony_generate_tool --reasoning low
IMAGE_GPU=L40s modal run src/llm/transformers/openai_gpt_oss.py --task openai_harmony_generate_tool --reasoning medium
IMAGE_GPU=L40s modal run src/llm/transformers/openai_gpt_oss.py --task openai_harmony_generate_tool --reasoning high
IMAGE_GPU=L4:3 modal run src/llm/transformers/openai_gpt_oss.py --task split_model
IMAGE_GPU=L4:3 modal run src/llm/transformers/openai_gpt_oss.py --task multi_gpu_generate --reasoning low
IMAGE_GPU=L4:3 modal run src/llm/transformers/openai_gpt_oss.py --task multi_gpu_generate --reasoning medium
IMAGE_GPU=L4:3 modal run src/llm/transformers/openai_gpt_oss.py --task multi_gpu_generate --reasoning high
IMAGE_GPU=L40:1 modal run src/llm/transformers/openai_gpt_oss.py --task multi_gpu_generate
IMAGE_GPU=L40:2 modal run src/llm/transformers/openai_gpt_oss.py --task multi_gpu_generate --reasoning high
IMAGE_GPU=H100:1 modal run src/llm/transformers/openai_gpt_oss.py --task multi_gpu_generate
IMAGE_GPU=H100:2 modal run src/llm/transformers/openai_gpt_oss.py --task multi_gpu_generate --reasoning high
"""
@app.local_entrypoint()
def main(
task: str = "dump_model",
reasoning="medium", # low medium high
model_identity="You are ChatGPT, a large language model trained by OpenAI.",
):
print(task)
tasks = {
"dump_model": dump_model,
"tokenize": tokenize,
"pipe": pipe,
"generate": generate,
"generate_stream": generate_stream,
"openai_harmony_stream_decode_unicode": openai_harmony_stream_decode_unicode,
"openai_harmony_generate": openai_harmony_generate,
"openai_harmony_generate_tool": openai_harmony_generate_tool,
"split_model": split_model,
"multi_gpu_generate": multi_gpu_generate,
}
if task not in tasks:
raise ValueError(f"task {task} not found")
print(f"running task {task}")
run.remote(
tasks[task],
reasoning=reasoning.lower(),
model_identity=model_identity,
)