Spaces:

kfoughali
/

serpent

Sleeping

App Files Files

xet

Community

kfoughali commited on Sep 6

Commit

52797d8

verified ·

1 Parent(s): 0129ff5

Update compression.py

Browse files

Files changed (1) hide show

compression.py +248 -96

compression.py CHANGED Viewed

@@ -1,6 +1,8 @@
 """
 Enhanced SPG compression algorithms with RocketKV-style 450x compression.
 NO ESTIMATIONS - only measured values. FAIL FAST on errors.
 """
 import torch
@@ -17,10 +19,50 @@ from config import (
 logger = logging.getLogger(__name__)
 class EnhancedSlidingPrecisionGradient:
     """
     Research-grade Enhanced SPG with RocketKV-style 450x compression capability.
     NO ESTIMATIONS OR HARDCODED VALUES - all parameters from validated config.
     """
     def __init__(self, config: EnhancedSPGConfig):
@@ -160,34 +202,50 @@ class EnhancedSlidingPrecisionGradient:
     def snapkv_plus_plus(self, keys: torch.Tensor, values: torch.Tensor,
                         compression_ratio: float) -> Tuple[torch.Tensor, torch.Tensor, List[int]]:
-        """SnapKV++ with GQA support and adaptive pooling - no hardcoded values."""
         batch_size, n_heads, seq_len, head_dim = keys.shape
         # Adaptive kernel size based on sequence length (from config)
         kernel_size = self.config.get_adaptive_kernel_size(seq_len)
         # Compute importance scores with adaptive pooling
-        key_norms = keys.norm(dim=-1)  # [batch, heads, seq]
-        value_norms = values.norm(dim=-1)
-        combined_importance = (key_norms + value_norms) / 2.0
-        # Multi-head aggregation with adaptive pooling
-        if kernel_size > 1:
-            # Apply 1D pooling along sequence dimension
-            pooled_importance = F.avg_pool1d(
-                combined_importance.mean(dim=1).unsqueeze(1),  # [batch, 1, seq]
-                kernel_size=kernel_size,
-                stride=1,
-                padding=kernel_size // 2
-            ).squeeze(1)  # [batch, seq]
-            # Ensure pooled output matches original sequence length
-            if pooled_importance.shape[-1] != seq_len:
-                pooled_importance = pooled_importance[:, :seq_len]
-        else:
-            pooled_importance = combined_importance.mean(dim=1)
-        # Aggregate across batch
-        final_importance = pooled_importance.mean(dim=0)  # [seq]
         # Ensure importance tensor matches sequence length
         if final_importance.shape[0] != seq_len:
@@ -195,14 +253,18 @@ class EnhancedSlidingPrecisionGradient:
         # Preserve sink and recent tokens
         preserve_mask = torch.zeros(seq_len, dtype=torch.bool, device=keys.device)
-        preserve_mask[:min(self.config.sink_tokens, seq_len)] = True
-        preserve_mask[-min(self.config.recent_window, seq_len):] = True
-        # Top-k selection for remaining tokens
-        n_keep = max(self.config.sink_tokens + self.config.recent_window,
-                    int(seq_len / compression_ratio))
-        n_keep = min(n_keep, seq_len)  # Ensure we don't exceed sequence length
-        remaining_slots = n_keep - preserve_mask.sum().item()
         if remaining_slots > 0:
             masked_importance = final_importance.clone()
@@ -212,36 +274,58 @@ class EnhancedSlidingPrecisionGradient:
             if len(available_indices) > 0:
                 k = min(remaining_slots, len(available_indices))
                 if k > 0:
-                    _, relative_top_indices = torch.topk(masked_importance[available_indices], k)
-                    absolute_top_indices = available_indices[relative_top_indices]
-                    preserve_mask[absolute_top_indices] = True
-        # Extract retained tokens with bounds checking
-        retained_indices = torch.where(preserve_mask)[0]
-        retained_indices = retained_indices[retained_indices < seq_len]  # Safety check
-        keys_compressed = keys[:, :, retained_indices, :]
-        values_compressed = values[:, :, retained_indices, :]
-        actual_ratio = seq_len / len(retained_indices) if len(retained_indices) > 0 else float('inf')
-        logger.debug(f"SnapKV++: {seq_len} → {len(retained_indices)} tokens ({actual_ratio:.1f}x)")
         return keys_compressed, values_compressed, retained_indices.tolist()
     def hybrid_sparse_attention(self, keys: torch.Tensor, values: torch.Tensor,
                                head_budget: int, seq_budget: int) -> Dict[str, Any]:
-        """RocketKV-style Hybrid Sparse Attention for Stage 2 - no hardcoded values."""
         batch_size, n_heads, seq_len, head_dim = keys.shape
-        # 1. Head-wise importance scoring
-        head_importance = (
-            keys.float().pow(2).sum(dim=(-1, -2)).sum(dim=0) +  # Sum over batch, seq, hidden
-            values.float().pow(2).sum(dim=(-1, -2)).sum(dim=0)
-        )  # [n_heads]
-        # Select top heads
-        actual_head_budget = min(head_budget, n_heads)
-        _, top_head_indices = torch.topk(head_importance, actual_head_budget)
         compressed_data = {
             'keys': {},
@@ -255,33 +339,49 @@ class EnhancedSlidingPrecisionGradient:
         # 2. Sequence-wise top-k selection per selected head
         for head_idx in top_head_indices:
-            head_keys = keys[:, head_idx:head_idx+1, :, :]  # Keep head dimension
-            head_values = values[:, head_idx:head_idx+1, :, :]
             # Compute sequence importance for this head
-            seq_importance = (
-                head_keys.norm(dim=-1).squeeze(1).mean(dim=0) +  # [seq]
-                head_values.norm(dim=-1).squeeze(1).mean(dim=0)
-            ) / 2.0
             # Apply position-based boost (from research constants)
             position_boost = torch.ones_like(seq_importance)
-            position_boost[:self.config.sink_tokens] *= self.constants.POSITION_BOOST_SINK
-            position_boost[-self.config.recent_window:] *= self.constants.POSITION_BOOST_RECENT
             boosted_importance = seq_importance * position_boost
             # Select top tokens for this head
-            actual_seq_budget = min(seq_budget, seq_len)
-            _, top_token_indices = torch.topk(boosted_importance, actual_seq_budget)
             # Store compressed data
-            head_key = f'head_{head_idx.item()}'
             compressed_data['keys'][head_key] = {
-                'data': head_keys[:, :, top_token_indices, :].clone(),
                 'indices': top_token_indices.tolist()
             }
             compressed_data['values'][head_key] = {
-                'data': head_values[:, :, top_token_indices, :].clone(),
                 'indices': top_token_indices.tolist()
             }
@@ -315,7 +415,7 @@ class EnhancedSlidingPrecisionGradient:
         # Calculate retention based on compression ratio
         retention_ratio = 1.0 / compression_ratio
-        min_retain = self.config.sink_tokens + self.config.recent_window
         n_retain = max(min_retain, int(seq_len * retention_ratio))
         # Apply layer-specific constraints (from research constants)
@@ -325,7 +425,7 @@ class EnhancedSlidingPrecisionGradient:
         else:  # Late layers
             max_retain = int(seq_len * self.constants.LATE_LAYER_MAX_RETENTION)
-        n_retain = min(n_retain, max_retain)
         # Compute magnitude-based importance
         importance_scores = self.compute_magnitude_importance(keys, values)
@@ -333,13 +433,18 @@ class EnhancedSlidingPrecisionGradient:
         # Quality preservation: boost recent tokens (explicit formula from config)
         recent_boost = torch.zeros_like(importance_scores)
         if self.config.recent_window > 0:
-            recent_boost[-self.config.recent_window:] = importance_scores.max() * self.config.recent_boost_factor
         importance_scores = importance_scores + recent_boost
         # Initialize preservation mask
         preserve_mask = torch.zeros(seq_len, dtype=torch.bool, device=keys.device)
-        preserve_mask[:self.config.sink_tokens] = True
-        preserve_mask[-self.config.recent_window:] = True
         # Select additional tokens based on importance
         remaining_slots = n_retain - preserve_mask.sum().item()
@@ -359,15 +464,22 @@ class EnhancedSlidingPrecisionGradient:
             available = (masked_importance > -float('inf')).sum().item()
             k = min(remaining_slots, available)
             if k > 0:
-                _, top_indices = torch.topk(masked_importance, k)
-                preserve_mask[top_indices] = True
         # Extract retained tokens
-        retained_indices = torch.where(preserve_mask)[0]
-        keys_stage1 = keys[:, :, retained_indices, :]
-        values_stage1 = values[:, :, retained_indices, :]
-        actual_ratio = seq_len / len(retained_indices) if len(retained_indices) > 0 else float('inf')
         logger.debug(f"Stage 1 Layer {layer_idx}: {seq_len} → {len(retained_indices)} tokens ({actual_ratio:.1f}x)")
         return keys_stage1, values_stage1, retained_indices.tolist()
@@ -382,7 +494,10 @@ class EnhancedSlidingPrecisionGradient:
         if self.use_hybrid_sparse_attention:
             # RocketKV-style compression with adaptive budgets
-            sparsity = self.estimate_attention_sparsity(keys, values)  # May raise if fails
             if self.use_adaptive_decomposition:
                 _, stage2_ratio = self.adaptive_stage_split(
@@ -462,7 +577,11 @@ class EnhancedSlidingPrecisionGradient:
                 values.float().pow(2).sum(dim=(-1, -2)).sum(dim=0)
             )
-            _, important_head_indices = torch.topk(head_importance, n_important_heads)
             other_head_indices = torch.tensor(
                 [h for h in range(n_heads) if h not in important_head_indices.tolist()],
                 device=keys.device, dtype=torch.long
@@ -470,19 +589,19 @@ class EnhancedSlidingPrecisionGradient:
             # Store important heads at full precision
             compressed_data['keys']['heads_fp16'] = {
-                'data': keys[:, important_head_indices, :, :].clone(),
                 'indices': important_head_indices.tolist()
             }
             compressed_data['values']['heads_fp16'] = {
-                'data': values[:, important_head_indices, :, :].clone(),
                 'indices': important_head_indices.tolist()
             }
             if other_head_indices.numel() == 0:
                 return compressed_data
-            seq_keys = keys[:, other_head_indices, :, :]
-            seq_values = values[:, other_head_indices, :, :]
         else:
             seq_keys = keys
             seq_values = values
@@ -492,10 +611,13 @@ class EnhancedSlidingPrecisionGradient:
         # Explicit top-K selection for FP16
         keep_fp16 = max(0, int(seq_len * self.config.sequence_compression_ratio))
-        top_fp16 = torch.topk(combined_importance, k=keep_fp16).indices if keep_fp16 > 0 else torch.empty(0, dtype=torch.long, device=keys.device)
-        is_fp16 = torch.zeros(seq_len, dtype=torch.bool, device=keys.device)
         if keep_fp16 > 0:
-            is_fp16[top_fp16] = True
         # Vectorized token binning
         thresh = torch.tensor([pl.threshold for pl in levels], device=keys.device)
@@ -542,8 +664,8 @@ class EnhancedSlidingPrecisionGradient:
                 continue
             idx_tensor = torch.tensor(indices, device=keys.device, dtype=torch.long)
-            k_slice = seq_keys.index_select(2, idx_tensor)
-            v_slice = seq_values.index_select(2, idx_tensor)
             # Store with aggressive precision - only FP16 for ultra-selective tokens
             compressed_data['keys'][precision_key]['data'] = k_slice.clone()
@@ -589,7 +711,8 @@ class EnhancedSlidingPrecisionGradient:
         except Exception as e:
             logger.error(f"Error in enhanced compression for layer {layer_idx}: {e}")
-            raise
     def _fallback_to_original_spg(self, keys: torch.Tensor, values: torch.Tensor,
                                  layer_idx: int, current_position: Optional[int]) -> Dict[str, Any]:
@@ -668,8 +791,8 @@ class EnhancedSlidingPrecisionGradient:
                 continue
             level_indices = torch.tensor(indices, device=device, dtype=torch.long)
-            k_slice = keys.index_select(2, level_indices)
-            v_slice = values.index_select(2, level_indices)
             # Store with FP16 precision (simplified for original SPG)
             compressed_data['keys'][precision_key]['data'] = k_slice.clone()
@@ -750,8 +873,16 @@ class EnhancedSlidingPrecisionGradient:
         if 'heads_fp16' in compressed_data['keys']:
             head_indices = compressed_data['keys']['heads_fp16']['indices']
             head_idx_tensor = torch.tensor(head_indices, device=device, dtype=torch.long)
-            keys_full[:, head_idx_tensor, :, :] = compressed_data['keys']['heads_fp16']['data']
-            values_full[:, head_idx_tensor, :, :] = compressed_data['values']['heads_fp16']['data']
             if self.config.enable_head_compression:
                 n_heads = original_shape[1]
@@ -768,13 +899,22 @@ class EnhancedSlidingPrecisionGradient:
                 continue
             indices = compressed_data['keys'][precision_key]['indices']
             idx_tensor = torch.tensor(indices, device=device, dtype=torch.long)
             # All data stored as FP16 in this simplified version
-            keys_full[:, other_head_indices, :, :].index_copy_(2, idx_tensor,
-                compressed_data['keys'][precision_key]['data'])
-            values_full[:, other_head_indices, :, :].index_copy_(2, idx_tensor,
-                compressed_data['values'][precision_key]['data'])
         return keys_full, values_full
@@ -806,8 +946,11 @@ class EnhancedSlidingPrecisionGradient:
             token_indices = head_data_k['indices']
             # Place data in the correct head and token positions
-            keys_full[:, head_idx:head_idx+1, token_indices, :] = head_data_k['data']
-            values_full[:, head_idx:head_idx+1, token_indices, :] = head_data_v['data']
         return keys_full, values_full
@@ -825,11 +968,20 @@ class EnhancedSlidingPrecisionGradient:
             data_dict = compressed_data['keys'][precision_key]
             if 'data' in data_dict and 'indices' in data_dict:
                 indices = data_dict['indices']
                 idx_tensor = torch.tensor(indices, device=device, dtype=torch.long)
                 # All data stored as original precision
-                keys_full.index_copy_(2, idx_tensor, data_dict['data'])
-                values_full.index_copy_(2, idx_tensor, compressed_data['values'][precision_key]['data'])
         return keys_full, values_full

+# compression.py
 """
 Enhanced SPG compression algorithms with RocketKV-style 450x compression.
 NO ESTIMATIONS - only measured values. FAIL FAST on errors.
+FIXED: CUDA assert errors, safe tensor operations, bounds checking.
 """
 import torch
 logger = logging.getLogger(__name__)
+def safe_topk(tensor, k, dim=-1):
+    """Safe version of topk that handles edge cases."""
+    if tensor.numel() == 0:
+        logger.warning("Empty tensor in topk operation")
+        return torch.empty(0, dtype=torch.long, device=tensor.device), torch.empty(0, device=tensor.device)
+    # Ensure k doesn't exceed tensor size
+    max_k = tensor.shape[dim]
+    actual_k = min(k, max_k)
+    if actual_k <= 0:
+        logger.warning(f"Invalid k={k} for tensor with shape {tensor.shape}")
+        return torch.empty(0, dtype=torch.long, device=tensor.device), torch.empty(0, device=tensor.device)
+    return torch.topk(tensor, actual_k, dim=dim)
+def safe_index_select(tensor, dim, indices):
+    """Safe version of index_select that validates indices."""
+    if indices.numel() == 0:
+        # Return empty tensor with correct shape
+        shape = list(tensor.shape)
+        shape[dim] = 0
+        return torch.empty(shape, dtype=tensor.dtype, device=tensor.device)
+    # Validate indices are within bounds
+    max_idx = tensor.shape[dim] - 1
+    if indices.max() > max_idx:
+        logger.warning(f"Index {indices.max()} exceeds max {max_idx}, clamping")
+        indices = indices.clamp(0, max_idx)
+    if indices.min() < 0:
+        logger.warning(f"Negative index {indices.min()}, clamping to 0")
+        indices = indices.clamp(0, max_idx)
+    return tensor.index_select(dim, indices)
 class EnhancedSlidingPrecisionGradient:
     """
     Research-grade Enhanced SPG with RocketKV-style 450x compression capability.
     NO ESTIMATIONS OR HARDCODED VALUES - all parameters from validated config.
+    FIXED: Safe tensor operations with bounds checking.
     """
     def __init__(self, config: EnhancedSPGConfig):
     def snapkv_plus_plus(self, keys: torch.Tensor, values: torch.Tensor,
                         compression_ratio: float) -> Tuple[torch.Tensor, torch.Tensor, List[int]]:
+        """SnapKV++ with GQA support and adaptive pooling - FIXED with safe operations."""
         batch_size, n_heads, seq_len, head_dim = keys.shape
+        # CRITICAL: Ensure minimum tokens retained
+        min_tokens = max(8, self.config.min_tokens_for_stability)  # At least 8 tokens
+        n_keep = max(min_tokens, int(seq_len / compression_ratio))
+        n_keep = min(n_keep, seq_len)  # Can't keep more than we have
+        logger.debug(f"SnapKV++: seq_len={seq_len}, compression_ratio={compression_ratio:.1f}, n_keep={n_keep}")
+        if n_keep >= seq_len:
+            # No compression needed
+            return keys, values, list(range(seq_len))
         # Adaptive kernel size based on sequence length (from config)
         kernel_size = self.config.get_adaptive_kernel_size(seq_len)
         # Compute importance scores with adaptive pooling
+        try:
+            key_norms = keys.norm(dim=-1)  # [batch, heads, seq]
+            value_norms = values.norm(dim=-1)
+            combined_importance = (key_norms + value_norms) / 2.0
+            # Multi-head aggregation with adaptive pooling
+            if kernel_size > 1 and seq_len > kernel_size:
+                # Apply 1D pooling along sequence dimension
+                pooled_importance = F.avg_pool1d(
+                    combined_importance.mean(dim=1).unsqueeze(1),  # [batch, 1, seq]
+                    kernel_size=kernel_size,
+                    stride=1,
+                    padding=kernel_size // 2
+                ).squeeze(1)  # [batch, seq]
+                # Ensure pooled output matches original sequence length
+                if pooled_importance.shape[-1] != seq_len:
+                    pooled_importance = pooled_importance[:, :seq_len]
+            else:
+                pooled_importance = combined_importance.mean(dim=1)
+            # Aggregate across batch
+            final_importance = pooled_importance.mean(dim=0)  # [seq]
+        except Exception as e:
+            logger.error(f"Error computing importance: {e}")
+            # Fallback to uniform importance
+            final_importance = torch.ones(seq_len, device=keys.device)
         # Ensure importance tensor matches sequence length
         if final_importance.shape[0] != seq_len:
         # Preserve sink and recent tokens
         preserve_mask = torch.zeros(seq_len, dtype=torch.bool, device=keys.device)
+        # Recent tokens
+        recent_window = min(self.config.recent_window, seq_len // 2)  # Don't preserve more than half
+        preserve_mask[-recent_window:] = True
+        # Sink tokens
+        if self.config.sink_tokens > 0:
+            sink_count = min(self.config.sink_tokens, seq_len // 4)  # Don't preserve more than quarter
+            preserve_mask[:sink_count] = True
+        preserved_count = preserve_mask.sum().item()
+        remaining_slots = max(0, n_keep - preserved_count)
         if remaining_slots > 0:
             masked_importance = final_importance.clone()
             if len(available_indices) > 0:
                 k = min(remaining_slots, len(available_indices))
                 if k > 0:
+                    available_importance = masked_importance[available_indices]
+                    _, relative_top_indices = safe_topk(available_importance, k)
+                    if relative_top_indices.numel() > 0:
+                        absolute_indices = available_indices[relative_top_indices]
+                        preserve_mask[absolute_indices] = True
+        # Get final retained indices
+        retained_indices = preserve_mask.nonzero(as_tuple=True)[0]
+        if retained_indices.numel() == 0:
+            logger.error("No indices retained! Keeping at least recent tokens")
+            # Emergency fallback - keep last few tokens
+            retained_indices = torch.arange(max(0, seq_len - min_tokens), seq_len,
+                                          device=keys.device, dtype=torch.long)
+        # Safe indexing
+        keys_compressed = safe_index_select(keys, 2, retained_indices)
+        values_compressed = safe_index_select(values, 2, retained_indices)
+        actual_ratio = seq_len / len(retained_indices) if len(retained_indices) > 0 else 1.0
+        logger.debug(f"SnapKV++ compressed: {seq_len} → {len(retained_indices)} tokens ({actual_ratio:.1f}x)")
         return keys_compressed, values_compressed, retained_indices.tolist()
     def hybrid_sparse_attention(self, keys: torch.Tensor, values: torch.Tensor,
                                head_budget: int, seq_budget: int) -> Dict[str, Any]:
+        """RocketKV-style Hybrid Sparse Attention for Stage 2 - FIXED with safe operations."""
         batch_size, n_heads, seq_len, head_dim = keys.shape
+        # Ensure minimum budgets
+        head_budget = max(1, min(head_budget, n_heads))
+        seq_budget = max(self.config.min_tokens_for_stability, min(seq_budget, seq_len))
+        logger.debug(f"HSA: n_heads={n_heads}, seq_len={seq_len}, head_budget={head_budget}, seq_budget={seq_budget}")
+        # 1. Head-wise importance scoring with safe computation
+        try:
+            head_importance = (
+                keys.float().pow(2).sum(dim=(-1, -2)).mean(dim=0) +  # Average over batch
+                values.float().pow(2).sum(dim=(-1, -2)).mean(dim=0)
+            )  # [n_heads]
+        except Exception as e:
+            logger.error(f"Error computing head importance: {e}")
+            head_importance = torch.ones(n_heads, device=keys.device)
+        # Select top heads safely
+        _, top_head_indices = safe_topk(head_importance, head_budget)
+        if top_head_indices.numel() == 0:
+            # Fallback - keep first head
+            top_head_indices = torch.tensor([0], device=keys.device, dtype=torch.long)
         compressed_data = {
             'keys': {},
         # 2. Sequence-wise top-k selection per selected head
         for head_idx in top_head_indices:
+            head_idx_int = head_idx.item()
+            # Extract head data safely
+            head_keys = keys[:, head_idx_int:head_idx_int+1, :, :]
+            head_values = values[:, head_idx_int:head_idx_int+1, :, :]
             # Compute sequence importance for this head
+            try:
+                seq_importance = (
+                    head_keys.norm(dim=-1).squeeze(1).mean(dim=0) +
+                    head_values.norm(dim=-1).squeeze(1).mean(dim=0)
+                ) / 2.0
+            except Exception as e:
+                logger.error(f"Error computing seq importance for head {head_idx_int}: {e}")
+                seq_importance = torch.ones(seq_len, device=keys.device)
             # Apply position-based boost (from research constants)
             position_boost = torch.ones_like(seq_importance)
+            if self.config.sink_tokens > 0:
+                sink_count = min(self.config.sink_tokens, seq_len // 4)
+                position_boost[:sink_count] *= self.constants.POSITION_BOOST_SINK
+            if self.config.recent_window > 0:
+                recent_count = min(self.config.recent_window, seq_len // 2)
+                position_boost[-recent_count:] *= self.constants.POSITION_BOOST_RECENT
             boosted_importance = seq_importance * position_boost
             # Select top tokens for this head
+            _, top_token_indices = safe_topk(boosted_importance, seq_budget)
+            if top_token_indices.numel() == 0:
+                # Fallback - keep last few tokens
+                top_token_indices = torch.arange(max(0, seq_len - seq_budget), seq_len,
+                                               device=keys.device, dtype=torch.long)
             # Store compressed data
+            head_key = f'head_{head_idx_int}'
             compressed_data['keys'][head_key] = {
+                'data': safe_index_select(head_keys, 2, top_token_indices),
                 'indices': top_token_indices.tolist()
             }
             compressed_data['values'][head_key] = {
+                'data': safe_index_select(head_values, 2, top_token_indices),
                 'indices': top_token_indices.tolist()
             }
         # Calculate retention based on compression ratio
         retention_ratio = 1.0 / compression_ratio
+        min_retain = max(8, self.config.sink_tokens + self.config.recent_window, self.config.min_tokens_for_stability)
         n_retain = max(min_retain, int(seq_len * retention_ratio))
         # Apply layer-specific constraints (from research constants)
         else:  # Late layers
             max_retain = int(seq_len * self.constants.LATE_LAYER_MAX_RETENTION)
+        n_retain = min(n_retain, max_retain, seq_len)
         # Compute magnitude-based importance
         importance_scores = self.compute_magnitude_importance(keys, values)
         # Quality preservation: boost recent tokens (explicit formula from config)
         recent_boost = torch.zeros_like(importance_scores)
         if self.config.recent_window > 0:
+            recent_window = min(self.config.recent_window, seq_len // 2)
+            recent_boost[-recent_window:] = importance_scores.max() * self.config.recent_boost_factor
         importance_scores = importance_scores + recent_boost
         # Initialize preservation mask
         preserve_mask = torch.zeros(seq_len, dtype=torch.bool, device=keys.device)
+        if self.config.sink_tokens > 0:
+            sink_count = min(self.config.sink_tokens, seq_len // 4)
+            preserve_mask[:sink_count] = True
+        if self.config.recent_window > 0:
+            recent_count = min(self.config.recent_window, seq_len // 2)
+            preserve_mask[-recent_count:] = True
         # Select additional tokens based on importance
         remaining_slots = n_retain - preserve_mask.sum().item()
             available = (masked_importance > -float('inf')).sum().item()
             k = min(remaining_slots, available)
             if k > 0:
+                _, top_indices = safe_topk(masked_importance, k)
+                if top_indices.numel() > 0:
+                    preserve_mask[top_indices] = True
         # Extract retained tokens
+        retained_indices = preserve_mask.nonzero(as_tuple=True)[0]
+        if retained_indices.numel() == 0:
+            logger.error(f"No tokens retained in stage 1 layer {layer_idx}! Using fallback")
+            min_keep = max(8, self.config.min_tokens_for_stability)
+            retained_indices = torch.arange(seq_len - min_keep, seq_len, device=keys.device, dtype=torch.long)
+        keys_stage1 = safe_index_select(keys, 2, retained_indices)
+        values_stage1 = safe_index_select(values, 2, retained_indices)
+        actual_ratio = seq_len / len(retained_indices) if len(retained_indices) > 0 else 1.0
         logger.debug(f"Stage 1 Layer {layer_idx}: {seq_len} → {len(retained_indices)} tokens ({actual_ratio:.1f}x)")
         return keys_stage1, values_stage1, retained_indices.tolist()
         if self.use_hybrid_sparse_attention:
             # RocketKV-style compression with adaptive budgets
+            try:
+                sparsity = self.estimate_attention_sparsity(keys, values)
+            except:
+                sparsity = 0.5  # Default if estimation fails
             if self.use_adaptive_decomposition:
                 _, stage2_ratio = self.adaptive_stage_split(
                 values.float().pow(2).sum(dim=(-1, -2)).sum(dim=0)
             )
+            _, important_head_indices = safe_topk(head_importance, n_important_heads)
+            if important_head_indices.numel() == 0:
+                important_head_indices = torch.tensor([0], device=keys.device, dtype=torch.long)
             other_head_indices = torch.tensor(
                 [h for h in range(n_heads) if h not in important_head_indices.tolist()],
                 device=keys.device, dtype=torch.long
             # Store important heads at full precision
             compressed_data['keys']['heads_fp16'] = {
+                'data': safe_index_select(keys, 1, important_head_indices).clone(),
                 'indices': important_head_indices.tolist()
             }
             compressed_data['values']['heads_fp16'] = {
+                'data': safe_index_select(values, 1, important_head_indices).clone(),
                 'indices': important_head_indices.tolist()
             }
             if other_head_indices.numel() == 0:
                 return compressed_data
+            seq_keys = safe_index_select(keys, 1, other_head_indices)
+            seq_values = safe_index_select(values, 1, other_head_indices)
         else:
             seq_keys = keys
             seq_values = values
         # Explicit top-K selection for FP16
         keep_fp16 = max(0, int(seq_len * self.config.sequence_compression_ratio))
         if keep_fp16 > 0:
+            top_fp16, _ = safe_topk(combined_importance, k=keep_fp16)
+            is_fp16 = torch.zeros(seq_len, dtype=torch.bool, device=keys.device)
+            if top_fp16.numel() > 0:
+                is_fp16[top_fp16] = True
+        else:
+            is_fp16 = torch.zeros(seq_len, dtype=torch.bool, device=keys.device)
         # Vectorized token binning
         thresh = torch.tensor([pl.threshold for pl in levels], device=keys.device)
                 continue
             idx_tensor = torch.tensor(indices, device=keys.device, dtype=torch.long)
+            k_slice = safe_index_select(seq_keys, 2, idx_tensor)
+            v_slice = safe_index_select(seq_values, 2, idx_tensor)
             # Store with aggressive precision - only FP16 for ultra-selective tokens
             compressed_data['keys'][precision_key]['data'] = k_slice.clone()
         except Exception as e:
             logger.error(f"Error in enhanced compression for layer {layer_idx}: {e}")
+            # Fallback to original SPG on error
+            return self._fallback_to_original_spg(keys, values, layer_idx, current_position)
     def _fallback_to_original_spg(self, keys: torch.Tensor, values: torch.Tensor,
                                  layer_idx: int, current_position: Optional[int]) -> Dict[str, Any]:
                 continue
             level_indices = torch.tensor(indices, device=device, dtype=torch.long)
+            k_slice = safe_index_select(keys, 2, level_indices)
+            v_slice = safe_index_select(values, 2, level_indices)
             # Store with FP16 precision (simplified for original SPG)
             compressed_data['keys'][precision_key]['data'] = k_slice.clone()
         if 'heads_fp16' in compressed_data['keys']:
             head_indices = compressed_data['keys']['heads_fp16']['indices']
             head_idx_tensor = torch.tensor(head_indices, device=device, dtype=torch.long)
+            # Safe assignment
+            head_data_k = compressed_data['keys']['heads_fp16']['data']
+            head_data_v = compressed_data['values']['heads_fp16']['data']
+            if head_data_k is not None and head_data_v is not None:
+                for i, idx in enumerate(head_indices):
+                    if idx < keys_full.shape[1]:
+                        keys_full[:, idx, :, :] = head_data_k[:, i, :, :]
+                        values_full[:, idx, :, :] = head_data_v[:, i, :, :]
             if self.config.enable_head_compression:
                 n_heads = original_shape[1]
                 continue
             indices = compressed_data['keys'][precision_key]['indices']
+            if not indices:
+                continue
             idx_tensor = torch.tensor(indices, device=device, dtype=torch.long)
             # All data stored as FP16 in this simplified version
+            k_data = compressed_data['keys'][precision_key]['data']
+            v_data = compressed_data['values'][precision_key]['data']
+            if k_data is not None and v_data is not None:
+                for head_idx in other_head_indices:
+                    if head_idx < keys_full.shape[1]:
+                        for i, seq_idx in enumerate(indices):
+                            if seq_idx < keys_full.shape[2]:
+                                keys_full[:, head_idx, seq_idx, :] = k_data[:, :, i, :].squeeze(1)
+                                values_full[:, head_idx, seq_idx, :] = v_data[:, :, i, :].squeeze(1)
         return keys_full, values_full
             token_indices = head_data_k['indices']
             # Place data in the correct head and token positions
+            if head_idx < keys_full.shape[1]:
+                for i, token_idx in enumerate(token_indices):
+                    if token_idx < keys_full.shape[2]:
+                        keys_full[:, head_idx, token_idx, :] = head_data_k['data'][:, 0, i, :]
+                        values_full[:, head_idx, token_idx, :] = head_data_v['data'][:, 0, i, :]
         return keys_full, values_full
             data_dict = compressed_data['keys'][precision_key]
             if 'data' in data_dict and 'indices' in data_dict:
                 indices = data_dict['indices']
+                if not indices:
+                    continue
                 idx_tensor = torch.tensor(indices, device=device, dtype=torch.long)
                 # All data stored as original precision
+                k_data = data_dict['data']
+                v_data = compressed_data['values'][precision_key]['data']
+                if k_data is not None and v_data is not None:
+                    for i, seq_idx in enumerate(indices):
+                        if seq_idx < keys_full.shape[2]:
+                            keys_full[:, :, seq_idx, :] = k_data[:, :, i, :]
+                            values_full[:, :, seq_idx, :] = v_data[:, :, i, :]
         return keys_full, values_full