kfoughali committed
Commit 9c6e956 · verified · 1 Parent(s): b3bb89e

Update benchmark.py

Files changed (1)
  1. benchmark.py +424 -191
benchmark.py CHANGED
@@ -2,6 +2,7 @@
2
  Benchmarking, metrics, and proof generation for Enhanced SPG.
3
  Supports LongBench, NIAH, RULER, SCBench benchmarks.
4
  MEASURED VALUES ONLY - no estimations. FAIL FAST on errors.
 
5
  """
6
 
7
  import torch
@@ -234,6 +235,113 @@ class BenchmarkMetrics:
234
 
235
  return (0.0, 0.0)
236
237
  def create_niah_haystack(context_length: int, needle: str, depth_percent: float) -> str:
238
  """Create Needle-in-a-Haystack test context - NO HARDCODING."""
239
  # Generate haystack text
@@ -255,8 +363,9 @@ def create_niah_haystack(context_length: int, needle: str, depth_percent: float)
255
 
256
  return haystack_with_needle
257
 
258
- def evaluate_niah(model, tokenizer, config: CompressionConfig, cache_manager: Optional[QuantizedKVCache] = None) -> float:
259
- """Evaluate Needle-in-a-Haystack performance - MEASURED ONLY."""
 
260
  context = create_niah_haystack(
261
  config.prefill_length,
262
  config.niah_needle,
@@ -267,46 +376,32 @@ def evaluate_niah(model, tokenizer, config: CompressionConfig, cache_manager: Op
267
 
268
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=config.prefill_length)
269
  input_ids = inputs.input_ids.to(model.device)
270
271
  with torch.inference_mode():
272
- if cache_manager:
273
- # Compress KV cache
274
- outputs = model(input_ids, use_cache=True, return_dict=True)
275
- past_key_values = outputs.past_key_values
276
-
277
- # Store compressed
278
- kv_tuple = past_key_values.to_legacy_cache() if hasattr(past_key_values, 'to_legacy_cache') else past_key_values
279
- for layer_idx, (keys, values) in enumerate(kv_tuple):
280
- cache_manager.compress_and_store(layer_idx, keys, values)
281
-
282
- # Reconstruct for generation
283
- reconstructed_kv = []
284
- for layer_idx in range(len(kv_tuple)):
285
- dec_keys, dec_values = cache_manager.get_decompressed(layer_idx)
286
- if dec_keys is not None and dec_values is not None:
287
- reconstructed_kv.append((dec_keys, dec_values))
288
-
289
- if hasattr(DynamicCache, 'from_legacy_cache'):
290
- past_key_values = DynamicCache.from_legacy_cache(tuple(reconstructed_kv))
291
- else:
292
- past_key_values = tuple(reconstructed_kv)
293
-
294
- # Generate with compressed cache
295
- output = model.generate(
296
- input_ids,
297
- past_key_values=past_key_values,
298
- max_new_tokens=20,
299
- temperature=0.0,
300
- do_sample=False
301
- )
302
- else:
303
- # Generate without compression
304
- output = model.generate(
305
- input_ids,
306
- max_new_tokens=20,
307
- temperature=0.0,
308
- do_sample=False
309
- )
310
 
311
  generated_text = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
312
 
@@ -314,59 +409,20 @@ def evaluate_niah(model, tokenizer, config: CompressionConfig, cache_manager: Op
314
  accuracy = 1.0 if config.niah_needle.split()[-1] in generated_text else 0.0
315
 
316
  logger.info(f"NIAH accuracy: {accuracy}, Generated: {generated_text[:50]}")
317
- return accuracy
318
 
319
- def evaluate_longbench_task(model, tokenizer, config: CompressionConfig,
320
- task: str, cache_manager: Optional[QuantizedKVCache] = None) -> Dict[str, float]:
321
- """Evaluate LongBench task - MEASURED METRICS ONLY."""
322
- try:
323
- dataset = load_dataset("THUDM/LongBench", task, split="test")
324
-
325
- # Sample evaluation examples
326
- n_samples = min(config.eval_samples, len(dataset))
327
- samples = dataset.select(range(n_samples))
328
-
329
- scores = []
330
- for sample in samples:
331
- context = sample.get("context", "")
332
- question = sample.get("input", sample.get("question", ""))
333
- answer = sample.get("answers", [sample.get("answer", "")])
334
-
335
- if isinstance(answer, list) and answer:
336
- answer = answer[0]
337
-
338
- prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
339
-
340
- inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
341
- max_length=config.prefill_length)
342
- input_ids = inputs.input_ids.to(model.device)
343
-
344
- with torch.inference_mode():
345
- output = model.generate(
346
- input_ids,
347
- max_new_tokens=50,
348
- temperature=0.0,
349
- do_sample=False
350
- )
351
-
352
- generated = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
353
-
354
- # Simple accuracy metric - check if answer appears in generation
355
- score = 1.0 if str(answer).lower() in generated.lower() else 0.0
356
- scores.append(score)
357
-
358
- return {
359
- "accuracy": float(np.mean(scores)),
360
- "n_samples": n_samples
361
- }
362
-
363
- except Exception as e:
364
- logger.error(f"Error evaluating LongBench task {task}: {e}")
365
- return {"accuracy": 0.0, "n_samples": 0}
366
 
367
- def evaluate_ruler(model, tokenizer, config: CompressionConfig,
368
- cache_manager: Optional[QuantizedKVCache] = None) -> float:
369
- """Evaluate RULER benchmark - MEASURED ONLY."""
370
  # Create synthetic RULER-like task
371
  seq_len = min(config.ruler_max_seq_length, config.prefill_length)
372
 
@@ -383,14 +439,31 @@ def evaluate_ruler(model, tokenizer, config: CompressionConfig,
383
 
384
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=seq_len)
385
  input_ids = inputs.input_ids.to(model.device)
386
387
  with torch.inference_mode():
388
  output = model.generate(
389
  input_ids,
 
390
  max_new_tokens=10,
391
  temperature=0.0,
392
- do_sample=False
 
393
  )
394
 
395
  generated = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
396
 
@@ -399,11 +472,20 @@ def evaluate_ruler(model, tokenizer, config: CompressionConfig,
399
  exact_match = 1.0 if expected in generated else 0.0
400
 
401
  logger.info(f"RULER exact match: {exact_match}, Generated: {generated[:50]}")
402
- return exact_match
403
 
404
- def evaluate_scbench(model, tokenizer, config: CompressionConfig,
405
- cache_manager: Optional[QuantizedKVCache] = None) -> float:
406
- """Evaluate SCBench multi-turn conversation - MEASURED ONLY."""
407
  # Create multi-turn conversation
408
  conversation = []
409
  facts = {}
@@ -428,14 +510,31 @@ def evaluate_scbench(model, tokenizer, config: CompressionConfig,
428
  inputs = tokenizer(full_conversation, return_tensors="pt", truncation=True,
429
  max_length=config.prefill_length)
430
  input_ids = inputs.input_ids.to(model.device)
431
432
  with torch.inference_mode():
433
  output = model.generate(
434
  input_ids,
 
435
  max_new_tokens=20,
436
  temperature=0.0,
437
- do_sample=False
 
438
  )
439
 
440
  generated = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
441
 
@@ -444,7 +543,107 @@ def evaluate_scbench(model, tokenizer, config: CompressionConfig,
444
  accuracy = 1.0 if expected_value in generated else 0.0
445
 
446
  logger.info(f"SCBench accuracy: {accuracy}, Generated: {generated[:50]}")
447
- return accuracy
448
 
449
  def load_model_and_tokenizer(model_name: str, config: CompressionConfig):
450
  """Load model and tokenizer with proper configuration - NO HARDCODING."""
@@ -496,11 +695,12 @@ def load_model_and_tokenizer(model_name: str, config: CompressionConfig):
496
 
497
  return model, tokenizer
498
 
 
499
  def load_real_dataset_samples(config: CompressionConfig, tokenizer) -> List[str]:
500
  """Load dataset samples based on benchmark type - NO HARDCODING."""
501
  logger.info(f"Loading samples for benchmark: {config.benchmark_type}")
502
 
503
- if config.benchmark_type == "perplexity":
504
  # Original WikiText loading
505
  texts = []
506
  min_tokens = config.prefill_length + config.generation_length
@@ -568,8 +768,9 @@ def load_real_dataset_samples(config: CompressionConfig, tokenizer) -> List[str]
568
  logger.info(f"Loaded {len(texts)} text samples")
569
  return texts
570
 
 
571
  def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_texts: Optional[List[str]] = None) -> Tuple[BenchmarkMetrics, Dict, List[Dict], List[Dict]]:
572
- """Research-grade benchmark with support for multiple benchmarks."""
573
  logger.info(f"Starting benchmark: {model_name} with {config.compression_type.value}")
574
  logger.info(f"Benchmark type: {config.benchmark_type}")
575
  logger.info(f"Config hash: {config.get_hash()}")
@@ -611,57 +812,117 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
611
 
612
  metrics = BenchmarkMetrics()
613
 
614
- # Run benchmark-specific evaluation
615
  if config.benchmark_type == "niah":
616
- # NIAH evaluation
617
  for depth in BENCHMARK_CONFIGS["niah"]["depths"]:
618
  config.niah_depth_percent = depth
619
  for idx in range(min(config.eval_samples, 10)):
620
  cache_manager = QuantizedKVCache(config)
621
  cache_manager.n_layers = n_layers
622
 
623
- accuracy = evaluate_niah(model, tokenizer, config, cache_manager)
624
- metrics.niah_retrieval_accuracy.append(accuracy)
 
626
- compressed_size = cache_manager.get_memory_footprint()
627
- metrics.kv_cache_memory_samples_mb.append(compressed_size / (1024 * 1024))
 
629
  elif config.benchmark_type == "ruler":
630
- # RULER evaluation
631
  for idx in range(config.eval_samples):
632
  cache_manager = QuantizedKVCache(config)
633
  cache_manager.n_layers = n_layers
634
 
635
- exact_match = evaluate_ruler(model, tokenizer, config, cache_manager)
636
- metrics.ruler_exact_match.append(exact_match)
637
 
638
- compressed_size = cache_manager.get_memory_footprint()
639
- metrics.kv_cache_memory_samples_mb.append(compressed_size / (1024 * 1024))
640
 
641
  elif config.benchmark_type == "scbench":
642
- # SCBench evaluation
643
  for idx in range(config.eval_samples):
644
  cache_manager = QuantizedKVCache(config)
645
  cache_manager.n_layers = n_layers
646
 
647
- accuracy = evaluate_scbench(model, tokenizer, config, cache_manager)
648
- metrics.scbench_turn_accuracy.append(accuracy)
649
 
650
- compressed_size = cache_manager.get_memory_footprint()
651
- metrics.kv_cache_memory_samples_mb.append(compressed_size / (1024 * 1024))
652
 
653
  elif config.benchmark_type == "longbench":
654
- # LongBench evaluation
655
  if config.benchmark_subset:
656
  cache_manager = QuantizedKVCache(config)
657
  cache_manager.n_layers = n_layers
658
 
659
- scores = evaluate_longbench_task(model, tokenizer, config,
660
  config.benchmark_subset, cache_manager)
661
- metrics.longbench_scores.append(scores)
662
 
663
  else:
664
- # Standard perplexity evaluation
665
  for idx in range(config.eval_samples):
666
  logger.info(f"Sample {idx+1}/{config.eval_samples}")
667
 
@@ -682,68 +943,27 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
682
  input_ids = inputs.input_ids.to(device)
683
  attention_mask = inputs.attention_mask.to(device)
684
 
685
- if torch.cuda.is_available():
686
- torch.cuda.empty_cache()
687
- torch.cuda.reset_peak_memory_stats()
688
- torch.cuda.synchronize()
689
-
690
- # Prefill
691
- if torch.cuda.is_available():
692
- torch.cuda.synchronize()
693
- start_time_sample = time.perf_counter()
694
-
695
- with torch.inference_mode():
696
- outputs = model(
697
- input_ids,
698
- attention_mask=attention_mask,
699
- use_cache=True,
700
- return_dict=True
701
- )
702
- past_key_values = outputs.past_key_values
703
-
704
- if torch.cuda.is_available():
705
- torch.cuda.synchronize()
706
-
707
- prefill_time = time.perf_counter() - start_time_sample
708
 
709
- if torch.cuda.is_available():
710
- prefill_peak_mem = _peak_mem_bytes_all_gpus()
711
- metrics.prefill_peak_memories.append(prefill_peak_mem)
712
-
713
- metrics.prefill_times.append(prefill_time)
714
-
715
- # Compression
716
- original_cache_size = 0
717
- if past_key_values:
718
- kv_tuple = past_key_values.to_legacy_cache() if hasattr(past_key_values, 'to_legacy_cache') else past_key_values
719
- for layer_idx, (keys, values) in enumerate(kv_tuple):
720
- original_cache_size += keys.nelement() * keys.element_size()
721
- original_cache_size += values.nelement() * values.element_size()
722
- if config.compression_type != CompressionType.NONE:
723
- cache_manager.compress_and_store(layer_idx, keys, values)
724
-
725
- if config.compression_type != CompressionType.NONE:
726
- reconstructed_kv = []
727
- for layer_idx in range(len(kv_tuple)):
728
- dec_keys, dec_values = cache_manager.get_decompressed(layer_idx)
729
- if dec_keys is not None and dec_values is not None:
730
- reconstructed_kv.append((dec_keys, dec_values))
731
-
732
- if hasattr(DynamicCache, 'from_legacy_cache'):
733
- past_key_values = DynamicCache.from_legacy_cache(tuple(reconstructed_kv))
734
- else:
735
- past_key_values = tuple(reconstructed_kv)
736
 
737
- compressed_size = original_cache_size if config.compression_type == CompressionType.NONE else cache_manager.get_memory_footprint()
738
- comp_ratio = original_cache_size / compressed_size if compressed_size > 0 else 1.0
739
 
740
- metrics.compression_ratios.append(comp_ratio)
741
- metrics.kv_cache_memory_samples_mb.append(compressed_size / (1024 * 1024))
 
742
 
743
- # Generation
744
  generated_ids = input_ids.clone()
745
  decode_times = []
746
  generation_losses = []
 
747
 
748
  for gen_step in range(config.generation_length):
749
  if torch.cuda.is_available():
@@ -778,11 +998,21 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
778
  if generation_losses:
779
  generation_perplexity = np.exp(np.mean(generation_losses))
780
  metrics.generation_perplexities.append(min(generation_perplexity, 1000))
781
 
782
  metrics.calculate_statistics(config)
783
  all_metrics.append(metrics)
784
 
785
- # Aggregate results
786
  final_metrics = BenchmarkMetrics()
787
  for m in all_metrics:
788
  final_metrics.prefill_times.extend(m.prefill_times)
@@ -826,15 +1056,18 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
826
  else:
827
  summary['prefill_perplexity'] = final_metrics.prefill_perplexity_mean
828
  summary['generation_perplexity'] = final_metrics.generation_perplexity_mean
829
- summary['prefill_time_ms'] = final_metrics.prefill_time_mean * 1000
830
- summary['decode_time_ms'] = final_metrics.decode_time_per_token_mean_ms
831
- summary['throughput_tokens_sec'] = final_metrics.decode_tokens_per_sec
832
- summary['end_to_end_throughput'] = final_metrics.end_to_end_throughput
833
- summary['end_to_end_latency_ms'] = final_metrics.end_to_end_latency_ms
834
- summary['peak_memory_mb'] = final_metrics.prefill_peak_memory_mean_mb
 
 
835
 
836
  return final_metrics, summary, per_sample_records, per_layer_fingerprints
837
 
 
838
  def export_proof_bundle(bundle_dir: str, config: CompressionConfig,
839
  metrics: BenchmarkMetrics, summary: Dict[str, Any],
840
  per_sample_records: List[Dict[str, Any]],
@@ -889,6 +1122,7 @@ def export_proof_bundle(bundle_dir: str, config: CompressionConfig,
889
  logger.info(f"Proof bundle exported: {zip_path}")
890
  return zip_path
891
 
 
892
  def verify_proof_bundle(bundle_root: str, config: CompressionConfig, proving: ProvingConfig) -> Dict[str, Any]:
893
  """Verify proof bundle - recompute metrics and check tolerances."""
894
  try:
@@ -924,27 +1158,26 @@ def verify_proof_bundle(bundle_root: str, config: CompressionConfig, proving: Pr
924
  # Verify based on benchmark type
925
  if config.benchmark_type == "niah":
926
  if "niah_accuracy" in summary:
927
- recomputed["niah_accuracy"] = mean_of("niah_accuracy")
928
  elif config.benchmark_type == "ruler":
929
  if "ruler_exact_match" in summary:
930
- recomputed["ruler_exact_match"] = mean_of("ruler_exact_match")
931
  elif config.benchmark_type == "scbench":
932
  if "scbench_accuracy" in summary:
933
- recomputed["scbench_accuracy"] = mean_of("scbench_accuracy")
934
  elif config.benchmark_type == "longbench":
935
  if "longbench_accuracy" in summary:
936
- recomputed["longbench_accuracy"] = mean_of("longbench_accuracy")
937
  elif config.benchmark_type == "wikitext":
938
  # WikiText benchmark metrics
939
- recomputed["compression_ratio"] = mean_of("compression_ratio")
940
- recomputed["kv_cache_memory_mb"] = mean_of("kv_cache_memory_mb")
941
  if "prefill_perplexity" in summary:
942
  recomputed["prefill_perplexity"] = mean_of("prefill_perplexity")
943
  if "generation_perplexity" in summary:
944
  recomputed["generation_perplexity"] = mean_of("generation_perplexity")
945
- else:
946
- recomputed["compression_ratio"] = mean_of("compression_ratio")
947
- recomputed["kv_cache_memory_mb"] = mean_of("kv_cache_memory_mb")
 
948
 
949
  for k, v in recomputed.items():
950
  s = summary.get(k)
 
2
  Benchmarking, metrics, and proof generation for Enhanced SPG.
3
  Supports LongBench, NIAH, RULER, SCBench benchmarks.
4
  MEASURED VALUES ONLY - no estimations. FAIL FAST on errors.
5
+ ALL BENCHMARKS USE SAME COMPRESSION PIPELINE AS WIKITEXT.
6
  """
7
 
8
  import torch
 
235
 
236
  return (0.0, 0.0)
237
 
238
+
239
+ def apply_compression_pipeline(model, tokenizer, input_ids, attention_mask,
240
+ cache_manager: QuantizedKVCache, config: CompressionConfig,
241
+ measure_memory: bool = True) -> Dict[str, Any]:
242
+ """
243
+ Unified compression pipeline for ALL benchmarks.
244
+ Returns compressed cache, metrics, and reconstructed KV pairs.
245
+ """
246
+ device = input_ids.device
247
+
248
+ # Clear GPU cache if requested
249
+ if torch.cuda.is_available() and measure_memory:
250
+ torch.cuda.empty_cache()
251
+ torch.cuda.reset_peak_memory_stats()
252
+ torch.cuda.synchronize()
253
+
254
+ # Measure prefill time
255
+ if torch.cuda.is_available():
256
+ torch.cuda.synchronize()
257
+ start_time = time.perf_counter()
258
+
259
+ # Prefill phase
260
+ with torch.inference_mode():
261
+ outputs = model(
262
+ input_ids,
263
+ attention_mask=attention_mask,
264
+ use_cache=True,
265
+ return_dict=True
266
+ )
267
+ past_key_values = outputs.past_key_values
268
+ logits = outputs.logits
269
+
270
+ if torch.cuda.is_available():
271
+ torch.cuda.synchronize()
272
+
273
+ prefill_time = time.perf_counter() - start_time
274
+
275
+ # Measure peak memory
276
+ prefill_peak_mem = 0
277
+ if torch.cuda.is_available() and measure_memory:
278
+ prefill_peak_mem = _peak_mem_bytes_all_gpus()
279
+
280
+ # Calculate prefill perplexity if we have logits
281
+ prefill_loss = None
282
+ if logits is not None and input_ids.shape[1] > 1:
283
+ shift_logits = logits[..., :-1, :].contiguous()
284
+ shift_labels = input_ids[..., 1:].contiguous()
285
+ loss = F.cross_entropy(
286
+ shift_logits.view(-1, shift_logits.size(-1)),
287
+ shift_labels.view(-1),
288
+ reduction='mean',
289
+ ignore_index=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -100
290
+ )
291
+ prefill_loss = loss.item()
292
+
293
+ # Compression phase - same as WikiText
294
+ original_cache_size = 0
295
+ compressed_cache_size = 0
296
+ compression_ratio = 1.0
297
+
298
+ if past_key_values:
299
+ # Convert to legacy format for processing
300
+ kv_tuple = past_key_values.to_legacy_cache() if hasattr(past_key_values, 'to_legacy_cache') else past_key_values
301
+
302
+ # Calculate original size
303
+ for layer_idx, (keys, values) in enumerate(kv_tuple):
304
+ original_cache_size += keys.nelement() * keys.element_size()
305
+ original_cache_size += values.nelement() * values.element_size()
306
+
307
+ # Apply compression if enabled
308
+ if config.compression_type != CompressionType.NONE:
309
+ cache_manager.compress_and_store(layer_idx, keys, values)
310
+
311
+ # Reconstruct compressed cache
312
+ if config.compression_type != CompressionType.NONE:
313
+ reconstructed_kv = []
314
+ for layer_idx in range(len(kv_tuple)):
315
+ dec_keys, dec_values = cache_manager.get_decompressed(layer_idx)
316
+ if dec_keys is not None and dec_values is not None:
317
+ reconstructed_kv.append((dec_keys, dec_values))
318
+
319
+ # Convert back to DynamicCache format
320
+ if hasattr(DynamicCache, 'from_legacy_cache'):
321
+ past_key_values = DynamicCache.from_legacy_cache(tuple(reconstructed_kv))
322
+ else:
323
+ past_key_values = tuple(reconstructed_kv)
324
+
325
+ # Measure compressed size
326
+ compressed_cache_size = cache_manager.get_memory_footprint()
327
+ else:
328
+ compressed_cache_size = original_cache_size
329
+
330
+ # Calculate compression ratio
331
+ compression_ratio = original_cache_size / compressed_cache_size if compressed_cache_size > 0 else 1.0
332
+
333
+ return {
334
+ 'past_key_values': past_key_values,
335
+ 'prefill_time': prefill_time,
336
+ 'prefill_peak_mem': prefill_peak_mem,
337
+ 'prefill_loss': prefill_loss,
338
+ 'original_cache_size': original_cache_size,
339
+ 'compressed_cache_size': compressed_cache_size,
340
+ 'compression_ratio': compression_ratio,
341
+ 'logits': logits
342
+ }
343
+
344
+
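For reference, a minimal usage sketch of the new unified pipeline outside the benchmark loop. It only uses calls that appear in this diff (QuantizedKVCache, apply_compression_pipeline, model.generate with the reconstructed cache); the helper name run_single_prompt, the prompt argument, and reading the layer count from model.config are illustrative assumptions, not part of the commit.

```python
# Usage sketch (not part of the commit). Assumes `model`, `tokenizer`, and
# `config` come from load_model_and_tokenizer / CompressionConfig in this module.
import torch

def run_single_prompt(model, tokenizer, config, prompt: str):
    inputs = tokenizer(prompt, return_tensors="pt",
                       truncation=True, max_length=config.prefill_length)
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)

    cache_manager = QuantizedKVCache(config)
    cache_manager.n_layers = model.config.num_hidden_layers  # assumption: layer count taken from model config

    # Prefill, compress, and reconstruct the KV cache exactly as the evaluators do.
    result = apply_compression_pipeline(
        model, tokenizer, input_ids, attention_mask, cache_manager, config
    )

    with torch.inference_mode():
        output = model.generate(
            input_ids,
            past_key_values=result['past_key_values'],
            attention_mask=attention_mask,
            max_new_tokens=20,
            do_sample=False,
        )
    text = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
    return text, result['compression_ratio'], result['compressed_cache_size'] / (1024 * 1024)
```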
345
  def create_niah_haystack(context_length: int, needle: str, depth_percent: float) -> str:
346
  """Create Needle-in-a-Haystack test context - NO HARDCODING."""
347
  # Generate haystack text
 
363
 
364
  return haystack_with_needle
365
 
366
+
367
+ def evaluate_niah(model, tokenizer, config: CompressionConfig, cache_manager: Optional[QuantizedKVCache] = None) -> Dict[str, Any]:
368
+ """Evaluate NIAH with SAME compression pipeline as WikiText."""
369
  context = create_niah_haystack(
370
  config.prefill_length,
371
  config.niah_needle,
 
376
 
377
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=config.prefill_length)
378
  input_ids = inputs.input_ids.to(model.device)
379
+ attention_mask = inputs.attention_mask.to(model.device)
380
 
381
+ # Apply SAME compression pipeline as WikiText
382
+ compression_result = apply_compression_pipeline(
383
+ model, tokenizer, input_ids, attention_mask, cache_manager, config
384
+ )
385
+
386
+ # Generate with compressed cache
387
  with torch.inference_mode():
388
+ # Measure generation time
389
+ if torch.cuda.is_available():
390
+ torch.cuda.synchronize()
391
+ gen_start = time.perf_counter()
392
+
393
+ output = model.generate(
394
+ input_ids,
395
+ past_key_values=compression_result['past_key_values'],
396
+ max_new_tokens=20,
397
+ temperature=0.0,
398
+ do_sample=False,
399
+ attention_mask=attention_mask
400
+ )
401
+
402
+ if torch.cuda.is_available():
403
+ torch.cuda.synchronize()
404
+ gen_time = time.perf_counter() - gen_start
405
 
406
  generated_text = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
407
 
 
409
  accuracy = 1.0 if config.niah_needle.split()[-1] in generated_text else 0.0
410
 
411
  logger.info(f"NIAH accuracy: {accuracy}, Generated: {generated_text[:50]}")
412
+ logger.info(f"NIAH compression ratio: {compression_result['compression_ratio']:.1f}x")
413
+
414
+ return {
415
+ 'accuracy': accuracy,
416
+ 'compression_ratio': compression_result['compression_ratio'],
417
+ 'kv_cache_memory_mb': compression_result['compressed_cache_size'] / (1024 * 1024),
418
+ 'prefill_time': compression_result['prefill_time'],
419
+ 'generation_time': gen_time,
420
+ 'prefill_peak_mem': compression_result['prefill_peak_mem']
421
+ }
422
423
 
424
+ def evaluate_ruler(model, tokenizer, config: CompressionConfig, cache_manager: Optional[QuantizedKVCache] = None) -> Dict[str, Any]:
425
+ """Evaluate RULER with SAME compression pipeline as WikiText."""
 
426
  # Create synthetic RULER-like task
427
  seq_len = min(config.ruler_max_seq_length, config.prefill_length)
428
 
 
439
 
440
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=seq_len)
441
  input_ids = inputs.input_ids.to(model.device)
442
+ attention_mask = inputs.attention_mask.to(model.device)
443
 
444
+ # Apply SAME compression pipeline as WikiText
445
+ compression_result = apply_compression_pipeline(
446
+ model, tokenizer, input_ids, attention_mask, cache_manager, config
447
+ )
448
+
449
+ # Generate with compressed cache
450
  with torch.inference_mode():
451
+ if torch.cuda.is_available():
452
+ torch.cuda.synchronize()
453
+ gen_start = time.perf_counter()
454
+
455
  output = model.generate(
456
  input_ids,
457
+ past_key_values=compression_result['past_key_values'],
458
  max_new_tokens=10,
459
  temperature=0.0,
460
+ do_sample=False,
461
+ attention_mask=attention_mask
462
  )
463
+
464
+ if torch.cuda.is_available():
465
+ torch.cuda.synchronize()
466
+ gen_time = time.perf_counter() - gen_start
467
 
468
  generated = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
469
 
 
472
  exact_match = 1.0 if expected in generated else 0.0
473
 
474
  logger.info(f"RULER exact match: {exact_match}, Generated: {generated[:50]}")
475
+ logger.info(f"RULER compression ratio: {compression_result['compression_ratio']:.1f}x")
476
+
477
+ return {
478
+ 'exact_match': exact_match,
479
+ 'compression_ratio': compression_result['compression_ratio'],
480
+ 'kv_cache_memory_mb': compression_result['compressed_cache_size'] / (1024 * 1024),
481
+ 'prefill_time': compression_result['prefill_time'],
482
+ 'generation_time': gen_time,
483
+ 'prefill_peak_mem': compression_result['prefill_peak_mem']
484
+ }
485
+
486
 
487
+ def evaluate_scbench(model, tokenizer, config: CompressionConfig, cache_manager: Optional[QuantizedKVCache] = None) -> Dict[str, Any]:
488
+ """Evaluate SCBench with SAME compression pipeline as WikiText."""
 
489
  # Create multi-turn conversation
490
  conversation = []
491
  facts = {}
 
510
  inputs = tokenizer(full_conversation, return_tensors="pt", truncation=True,
511
  max_length=config.prefill_length)
512
  input_ids = inputs.input_ids.to(model.device)
513
+ attention_mask = inputs.attention_mask.to(model.device)
514
 
515
+ # Apply SAME compression pipeline as WikiText
516
+ compression_result = apply_compression_pipeline(
517
+ model, tokenizer, input_ids, attention_mask, cache_manager, config
518
+ )
519
+
520
+ # Generate with compressed cache
521
  with torch.inference_mode():
522
+ if torch.cuda.is_available():
523
+ torch.cuda.synchronize()
524
+ gen_start = time.perf_counter()
525
+
526
  output = model.generate(
527
  input_ids,
528
+ past_key_values=compression_result['past_key_values'],
529
  max_new_tokens=20,
530
  temperature=0.0,
531
+ do_sample=False,
532
+ attention_mask=attention_mask
533
  )
534
+
535
+ if torch.cuda.is_available():
536
+ torch.cuda.synchronize()
537
+ gen_time = time.perf_counter() - gen_start
538
 
539
  generated = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
540
 
 
543
  accuracy = 1.0 if expected_value in generated else 0.0
544
 
545
  logger.info(f"SCBench accuracy: {accuracy}, Generated: {generated[:50]}")
546
+ logger.info(f"SCBench compression ratio: {compression_result['compression_ratio']:.1f}x")
547
+
548
+ return {
549
+ 'accuracy': accuracy,
550
+ 'compression_ratio': compression_result['compression_ratio'],
551
+ 'kv_cache_memory_mb': compression_result['compressed_cache_size'] / (1024 * 1024),
552
+ 'prefill_time': compression_result['prefill_time'],
553
+ 'generation_time': gen_time,
554
+ 'prefill_peak_mem': compression_result['prefill_peak_mem']
555
+ }
556
+
557
+
558
+ def evaluate_longbench_task(model, tokenizer, config: CompressionConfig,
559
+ task: str, cache_manager: Optional[QuantizedKVCache] = None) -> Dict[str, Any]:
560
+ """Evaluate LongBench with SAME compression pipeline as WikiText."""
561
+ try:
562
+ dataset = load_dataset("THUDM/LongBench", task, split="test")
563
+
564
+ # Sample evaluation examples
565
+ n_samples = min(config.eval_samples, len(dataset))
566
+ samples = dataset.select(range(n_samples))
567
+
568
+ scores = []
569
+ compression_ratios = []
570
+ kv_memories = []
571
+ prefill_times = []
572
+ gen_times = []
573
+
574
+ for sample in samples:
575
+ context = sample.get("context", "")
576
+ question = sample.get("input", sample.get("question", ""))
577
+ answer = sample.get("answers", [sample.get("answer", "")])
578
+
579
+ if isinstance(answer, list) and answer:
580
+ answer = answer[0]
581
+
582
+ prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
583
+
584
+ inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
585
+ max_length=config.prefill_length)
586
+ input_ids = inputs.input_ids.to(model.device)
587
+ attention_mask = inputs.attention_mask.to(model.device)
588
+
589
+ # Apply SAME compression pipeline as WikiText
590
+ compression_result = apply_compression_pipeline(
591
+ model, tokenizer, input_ids, attention_mask, cache_manager, config,
592
+ measure_memory=False # Don't measure memory for each sample
593
+ )
594
+
595
+ # Generate with compressed cache
596
+ with torch.inference_mode():
597
+ if torch.cuda.is_available():
598
+ torch.cuda.synchronize()
599
+ gen_start = time.perf_counter()
600
+
601
+ output = model.generate(
602
+ input_ids,
603
+ past_key_values=compression_result['past_key_values'],
604
+ max_new_tokens=50,
605
+ temperature=0.0,
606
+ do_sample=False,
607
+ attention_mask=attention_mask
608
+ )
609
+
610
+ if torch.cuda.is_available():
611
+ torch.cuda.synchronize()
612
+ gen_time = time.perf_counter() - gen_start
613
+
614
+ generated = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
615
+
616
+ # Simple accuracy metric
617
+ score = 1.0 if str(answer).lower() in generated.lower() else 0.0
618
+ scores.append(score)
619
+ compression_ratios.append(compression_result['compression_ratio'])
620
+ kv_memories.append(compression_result['compressed_cache_size'] / (1024 * 1024))
621
+ prefill_times.append(compression_result['prefill_time'])
622
+ gen_times.append(gen_time)
623
+
624
+ avg_compression = float(np.mean(compression_ratios)) if compression_ratios else 1.0
625
+ logger.info(f"LongBench {task} avg compression: {avg_compression:.1f}x")
626
+
627
+ return {
628
+ 'accuracy': float(np.mean(scores)),
629
+ 'n_samples': n_samples,
630
+ 'compression_ratio': avg_compression,
631
+ 'kv_cache_memory_mb': float(np.mean(kv_memories)) if kv_memories else 0.0,
632
+ 'prefill_time': float(np.mean(prefill_times)) if prefill_times else 0.0,
633
+ 'generation_time': float(np.mean(gen_times)) if gen_times else 0.0
634
+ }
635
+
636
+ except Exception as e:
637
+ logger.error(f"Error evaluating LongBench task {task}: {e}")
638
+ return {
639
+ 'accuracy': 0.0,
640
+ 'n_samples': 0,
641
+ 'compression_ratio': 1.0,
642
+ 'kv_cache_memory_mb': 0.0,
643
+ 'prefill_time': 0.0,
644
+ 'generation_time': 0.0
645
+ }
646
+
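All four evaluators now return the same flat dictionary, which is what lets run_research_benchmark below append metrics uniformly. A hypothetical schema (not part of the diff) summarising that shared shape:

```python
# Hypothetical summary of the result dict returned by evaluate_niah / evaluate_ruler /
# evaluate_scbench / evaluate_longbench_task; the TypedDict itself is not in the diff.
from typing import TypedDict

class EvalResult(TypedDict, total=False):
    accuracy: float            # NIAH / SCBench / LongBench
    exact_match: float         # RULER
    n_samples: int             # LongBench only
    compression_ratio: float   # original KV bytes / compressed KV bytes
    kv_cache_memory_mb: float  # compressed cache footprint in MB
    prefill_time: float        # seconds
    generation_time: float     # seconds spent in generate()
    prefill_peak_mem: int      # bytes; 0 when CUDA memory is not measured
```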
647
 
648
  def load_model_and_tokenizer(model_name: str, config: CompressionConfig):
649
  """Load model and tokenizer with proper configuration - NO HARDCODING."""
 
695
 
696
  return model, tokenizer
697
 
698
+
699
  def load_real_dataset_samples(config: CompressionConfig, tokenizer) -> List[str]:
700
  """Load dataset samples based on benchmark type - NO HARDCODING."""
701
  logger.info(f"Loading samples for benchmark: {config.benchmark_type}")
702
 
703
+ if config.benchmark_type == "wikitext":
704
  # Original WikiText loading
705
  texts = []
706
  min_tokens = config.prefill_length + config.generation_length
 
768
  logger.info(f"Loaded {len(texts)} text samples")
769
  return texts
770
 
771
+
772
  def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_texts: Optional[List[str]] = None) -> Tuple[BenchmarkMetrics, Dict, List[Dict], List[Dict]]:
773
+ """Research-grade benchmark with UNIFIED compression for ALL benchmarks."""
774
  logger.info(f"Starting benchmark: {model_name} with {config.compression_type.value}")
775
  logger.info(f"Benchmark type: {config.benchmark_type}")
776
  logger.info(f"Config hash: {config.get_hash()}")
 
812
 
813
  metrics = BenchmarkMetrics()
814
 
815
+ # Run benchmark-specific evaluation with UNIFIED compression
816
  if config.benchmark_type == "niah":
817
+ # NIAH evaluation with unified compression
818
  for depth in BENCHMARK_CONFIGS["niah"]["depths"]:
819
  config.niah_depth_percent = depth
820
  for idx in range(min(config.eval_samples, 10)):
821
  cache_manager = QuantizedKVCache(config)
822
  cache_manager.n_layers = n_layers
823
 
824
+ result = evaluate_niah(model, tokenizer, config, cache_manager)
825
+
826
+ metrics.niah_retrieval_accuracy.append(result['accuracy'])
827
+ metrics.compression_ratios.append(result['compression_ratio'])
828
+ metrics.kv_cache_memory_samples_mb.append(result['kv_cache_memory_mb'])
829
+ metrics.prefill_times.append(result['prefill_time'])
830
+ metrics.decode_times.append(result['generation_time'] / 20) # Per token
831
 
832
+ if result['prefill_peak_mem'] > 0:
833
+ metrics.prefill_peak_memories.append(result['prefill_peak_mem'])
834
+
835
+ # Record per-sample data
836
+ per_sample_records.append({
837
+ 'benchmark': 'niah',
838
+ 'depth_percent': depth,
839
+ 'sample_idx': idx,
840
+ 'accuracy': result['accuracy'],
841
+ 'compression_ratio': result['compression_ratio'],
842
+ 'kv_cache_memory_mb': result['kv_cache_memory_mb'],
843
+ 'compression_type': config.compression_type.value
844
+ })
845
 
846
  elif config.benchmark_type == "ruler":
847
+ # RULER evaluation with unified compression
848
  for idx in range(config.eval_samples):
849
  cache_manager = QuantizedKVCache(config)
850
  cache_manager.n_layers = n_layers
851
 
852
+ result = evaluate_ruler(model, tokenizer, config, cache_manager)
853
+
854
+ metrics.ruler_exact_match.append(result['exact_match'])
855
+ metrics.compression_ratios.append(result['compression_ratio'])
856
+ metrics.kv_cache_memory_samples_mb.append(result['kv_cache_memory_mb'])
857
+ metrics.prefill_times.append(result['prefill_time'])
858
+ metrics.decode_times.append(result['generation_time'] / 10) # Per token
859
+
860
+ if result['prefill_peak_mem'] > 0:
861
+ metrics.prefill_peak_memories.append(result['prefill_peak_mem'])
862
 
863
+ per_sample_records.append({
864
+ 'benchmark': 'ruler',
865
+ 'sample_idx': idx,
866
+ 'exact_match': result['exact_match'],
867
+ 'compression_ratio': result['compression_ratio'],
868
+ 'kv_cache_memory_mb': result['kv_cache_memory_mb'],
869
+ 'compression_type': config.compression_type.value
870
+ })
871
 
872
  elif config.benchmark_type == "scbench":
873
+ # SCBench evaluation with unified compression
874
  for idx in range(config.eval_samples):
875
  cache_manager = QuantizedKVCache(config)
876
  cache_manager.n_layers = n_layers
877
 
878
+ result = evaluate_scbench(model, tokenizer, config, cache_manager)
 
879
 
880
+ metrics.scbench_turn_accuracy.append(result['accuracy'])
881
+ metrics.compression_ratios.append(result['compression_ratio'])
882
+ metrics.kv_cache_memory_samples_mb.append(result['kv_cache_memory_mb'])
883
+ metrics.prefill_times.append(result['prefill_time'])
884
+ metrics.decode_times.append(result['generation_time'] / 20) # Per token
885
+
886
+ if result['prefill_peak_mem'] > 0:
887
+ metrics.prefill_peak_memories.append(result['prefill_peak_mem'])
888
+
889
+ per_sample_records.append({
890
+ 'benchmark': 'scbench',
891
+ 'sample_idx': idx,
892
+ 'accuracy': result['accuracy'],
893
+ 'compression_ratio': result['compression_ratio'],
894
+ 'kv_cache_memory_mb': result['kv_cache_memory_mb'],
895
+ 'compression_type': config.compression_type.value
896
+ })
897
 
898
  elif config.benchmark_type == "longbench":
899
+ # LongBench evaluation with unified compression
900
  if config.benchmark_subset:
901
  cache_manager = QuantizedKVCache(config)
902
  cache_manager.n_layers = n_layers
903
 
904
+ result = evaluate_longbench_task(model, tokenizer, config,
905
  config.benchmark_subset, cache_manager)
906
+
907
+ metrics.longbench_scores.append(result)
908
+ metrics.compression_ratios.append(result['compression_ratio'])
909
+ metrics.kv_cache_memory_samples_mb.append(result['kv_cache_memory_mb'])
910
+ metrics.prefill_times.append(result['prefill_time'])
911
+
912
+ if result['generation_time'] > 0:
913
+ metrics.decode_times.append(result['generation_time'] / 50) # Per token
914
+
915
+ per_sample_records.append({
916
+ 'benchmark': 'longbench',
917
+ 'subset': config.benchmark_subset,
918
+ 'accuracy': result['accuracy'],
919
+ 'compression_ratio': result['compression_ratio'],
920
+ 'kv_cache_memory_mb': result['kv_cache_memory_mb'],
921
+ 'compression_type': config.compression_type.value
922
+ })
923
 
924
  else:
925
+ # Standard WikiText perplexity evaluation with existing compression
926
  for idx in range(config.eval_samples):
927
  logger.info(f"Sample {idx+1}/{config.eval_samples}")
928
 
 
943
  input_ids = inputs.input_ids.to(device)
944
  attention_mask = inputs.attention_mask.to(device)
945
 
946
+ # Apply unified compression pipeline
947
+ compression_result = apply_compression_pipeline(
948
+ model, tokenizer, input_ids, attention_mask, cache_manager, config
949
+ )
950
 
951
+ metrics.prefill_times.append(compression_result['prefill_time'])
952
+ metrics.compression_ratios.append(compression_result['compression_ratio'])
953
+ metrics.kv_cache_memory_samples_mb.append(compression_result['compressed_cache_size'] / (1024 * 1024))
954
 
955
+ if compression_result['prefill_peak_mem'] > 0:
956
+ metrics.prefill_peak_memories.append(compression_result['prefill_peak_mem'])
957
 
958
+ if compression_result['prefill_loss'] is not None:
959
+ prefill_perplexity = np.exp(compression_result['prefill_loss'])
960
+ metrics.prefill_perplexities.append(min(prefill_perplexity, 1000))
961
 
962
+ # Generation phase with timing
963
  generated_ids = input_ids.clone()
964
  decode_times = []
965
  generation_losses = []
966
+ past_key_values = compression_result['past_key_values']
967
 
968
  for gen_step in range(config.generation_length):
969
  if torch.cuda.is_available():
 
998
  if generation_losses:
999
  generation_perplexity = np.exp(np.mean(generation_losses))
1000
  metrics.generation_perplexities.append(min(generation_perplexity, 1000))
1001
+
1002
+ per_sample_records.append({
1003
+ 'benchmark': 'wikitext',
1004
+ 'sample_idx': idx,
1005
+ 'prefill_perplexity': metrics.prefill_perplexities[-1] if metrics.prefill_perplexities else None,
1006
+ 'generation_perplexity': metrics.generation_perplexities[-1] if metrics.generation_perplexities else None,
1007
+ 'compression_ratio': compression_result['compression_ratio'],
1008
+ 'kv_cache_memory_mb': compression_result['compressed_cache_size'] / (1024 * 1024),
1009
+ 'compression_type': config.compression_type.value
1010
+ })
1011
 
1012
  metrics.calculate_statistics(config)
1013
  all_metrics.append(metrics)
1014
 
1015
+ # Aggregate results across seeds
1016
  final_metrics = BenchmarkMetrics()
1017
  for m in all_metrics:
1018
  final_metrics.prefill_times.extend(m.prefill_times)
 
1056
  else:
1057
  summary['prefill_perplexity'] = final_metrics.prefill_perplexity_mean
1058
  summary['generation_perplexity'] = final_metrics.generation_perplexity_mean
1059
+
1060
+ # Always add timing and memory metrics
1061
+ summary['prefill_time_ms'] = final_metrics.prefill_time_mean * 1000
1062
+ summary['decode_time_ms'] = final_metrics.decode_time_per_token_mean_ms
1063
+ summary['throughput_tokens_sec'] = final_metrics.decode_tokens_per_sec
1064
+ summary['end_to_end_throughput'] = final_metrics.end_to_end_throughput
1065
+ summary['end_to_end_latency_ms'] = final_metrics.end_to_end_latency_ms
1066
+ summary['peak_memory_mb'] = final_metrics.prefill_peak_memory_mean_mb
1067
 
1068
  return final_metrics, summary, per_sample_records, per_layer_fingerprints
1069
 
1070
+
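Taken together, a hedged end-to-end sketch of how these pieces chain: run a benchmark, export the proof bundle, then verify it. The model name, the constructor keyword arguments, and ProvingConfig's defaults are assumptions; only the function names, the return tuple, and the config attributes come from this diff.

```python
# Illustrative driver (not part of the commit); argument names mirror attributes
# used in the diff, but the actual constructors may differ.
config = CompressionConfig(benchmark_type="niah",
                           compression_type=CompressionType.NONE)  # assumed kwargs
metrics, summary, records, fingerprints = run_research_benchmark("gpt2", config)  # model name is a placeholder

bundle = export_proof_bundle("proof_bundle", config, metrics, summary,
                             records, fingerprints)  # last argument inferred from the return tuple
report = verify_proof_bundle("proof_bundle", config, ProvingConfig())  # assumed default-constructible
print(summary.get("compression_ratio"), report)
```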
1071
  def export_proof_bundle(bundle_dir: str, config: CompressionConfig,
1072
  metrics: BenchmarkMetrics, summary: Dict[str, Any],
1073
  per_sample_records: List[Dict[str, Any]],
 
1122
  logger.info(f"Proof bundle exported: {zip_path}")
1123
  return zip_path
1124
 
1125
+
1126
  def verify_proof_bundle(bundle_root: str, config: CompressionConfig, proving: ProvingConfig) -> Dict[str, Any]:
1127
  """Verify proof bundle - recompute metrics and check tolerances."""
1128
  try:
 
1158
  # Verify based on benchmark type
1159
  if config.benchmark_type == "niah":
1160
  if "niah_accuracy" in summary:
1161
+ recomputed["niah_accuracy"] = mean_of("accuracy")
1162
  elif config.benchmark_type == "ruler":
1163
  if "ruler_exact_match" in summary:
1164
+ recomputed["ruler_exact_match"] = mean_of("exact_match")
1165
  elif config.benchmark_type == "scbench":
1166
  if "scbench_accuracy" in summary:
1167
+ recomputed["scbench_accuracy"] = mean_of("accuracy")
1168
  elif config.benchmark_type == "longbench":
1169
  if "longbench_accuracy" in summary:
1170
+ recomputed["longbench_accuracy"] = mean_of("accuracy")
1171
  elif config.benchmark_type == "wikitext":
1172
  # WikiText benchmark metrics
 
 
1173
  if "prefill_perplexity" in summary:
1174
  recomputed["prefill_perplexity"] = mean_of("prefill_perplexity")
1175
  if "generation_perplexity" in summary:
1176
  recomputed["generation_perplexity"] = mean_of("generation_perplexity")
1177
+
1178
+ # Always verify compression metrics
1179
+ recomputed["compression_ratio"] = mean_of("compression_ratio")
1180
+ recomputed["kv_cache_memory_mb"] = mean_of("kv_cache_memory_mb")
1181
 
1182
  for k, v in recomputed.items():
1183
  s = summary.get(k)
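For orientation, a hedged reconstruction of what the recompute-and-compare step around mean_of amounts to: take the mean of each key over the per-sample records, then check it against the exported summary within some tolerance. Only mean_of and the summary.get(k) pattern appear in the diff; the record handling, the helper names, and the tolerance source (presumably ProvingConfig) are assumptions.

```python
# Conceptual sketch only; not the module's actual implementation.
import numpy as np

def mean_of_records(records, key):
    vals = [r[key] for r in records if r.get(key) is not None]
    return float(np.mean(vals)) if vals else None

def check_tolerances(recomputed, summary, rel_tol=0.05):  # rel_tol stands in for ProvingConfig's setting
    failures = {}
    for k, v in recomputed.items():
        s = summary.get(k)
        if v is None or s is None:
            continue
        if abs(v - s) / max(abs(s), 1e-9) > rel_tol:
            failures[k] = {"summary": s, "recomputed": v}
    return failures
```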