#!/usr/bin/env python3
"""
Performance monitor for GPT-OSS-120B
"""

import logging
import time

import psutil
from mlx_lm import load, generate

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


def monitor_performance():
    """Monitor model performance and resource usage."""
    logger.info("📊 GPT-OSS-120B Performance Monitor")
    logger.info("=" * 50)

    # Check available system memory before loading the model
    ram = psutil.virtual_memory()
    logger.info(f"💾 System RAM: {ram.available / (1024**3):.1f}GB available")

    # Load the model and time how long it takes
    load_start = time.time()
    model, tokenizer = load("mlx-community/gpt-oss-120b-MXFP4-Q4")
    load_time = time.time() - load_start
    logger.info(f"⏱️ Model load time: {load_time:.2f}s")

    # Short prompts covering a mix of tasks for the generation benchmark
    test_prompts = [
        "Hello, how are you?",
        "Explain machine learning",
        "What is the meaning of life?",
        "Write a haiku about technology",
        "Describe quantum physics",
    ]

    total_tokens = 0
    total_time = 0.0

    for i, prompt in enumerate(test_prompts):
        logger.info(f"\n🧪 Test {i + 1}: {prompt}")

        # Time the generation call
        gen_start = time.time()
        response = generate(
            model,
            tokenizer,
            prompt=prompt,
            max_tokens=50,
            verbose=False,
        )
        gen_time = time.time() - gen_start

        # Count output tokens exactly with the model's own tokenizer
        # (more reliable than the rough word-count * 1.3 heuristic)
        tokens = len(tokenizer.encode(response))
        total_tokens += tokens
        total_time += gen_time

        tokens_per_sec = tokens / gen_time if gen_time > 0 else 0
        logger.info(f"  ⏱️ Time: {gen_time:.2f}s")
        logger.info(f"  📈 Speed: {tokens_per_sec:.1f} tokens/sec")
        logger.info(f"  📝 Response: {response[:100]}...")

    # Summary across all prompts
    avg_speed = total_tokens / total_time if total_time > 0 else 0
    logger.info("\n📊 Summary:")
    logger.info(f"  Total tokens generated: {total_tokens}")
    logger.info(f"  Total time: {total_time:.2f}s")
    logger.info(f"  Average speed: {avg_speed:.1f} tokens/sec")

    # Report measured process memory instead of a hard-coded estimate
    rss = psutil.Process().memory_info().rss
    logger.info(f"  Current process RSS: {rss / (1024**3):.1f}GB")


if __name__ == "__main__":
    monitor_performance()
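

# --- Optional: peak-RSS sampling (a minimal sketch, not part of the original
# script). psutil only reports memory at the instant it is called, so a single
# reading after the run can miss the true peak reached during generation. One
# common workaround is to sample the process RSS from a background thread.
# The PeakRssSampler name below is a hypothetical helper introduced here for
# illustration; if you use it, define it above the __main__ guard and wrap
# monitor_performance() there.
import threading


class PeakRssSampler:
    """Samples this process's RSS in a background thread and records the max."""

    def __init__(self, interval=0.2):
        self.interval = interval
        self.peak = 0
        self._stop = threading.Event()
        self._thread = threading.Thread(target=self._run, daemon=True)

    def _run(self):
        proc = psutil.Process()
        while not self._stop.is_set():
            self.peak = max(self.peak, proc.memory_info().rss)
            # Sleep until the next sample, waking early if stopped
            self._stop.wait(self.interval)

    def __enter__(self):
        self._thread.start()
        return self

    def __exit__(self, *exc):
        self._stop.set()
        self._thread.join()


# Example usage (hypothetical): capture a measured peak instead of an estimate.
# with PeakRssSampler() as sampler:
#     monitor_performance()
# logger.info(f"Peak RSS during run: {sampler.peak / (1024**3):.1f}GB")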