alozowski (HF staff) · Commit fb82c68 · Parent: 04e1112

Correct gptq approach [wip]

Files changed (1):
  1. backend/app/utils/model_validation.py (+108 −19)
backend/app/utils/model_validation.py CHANGED
@@ -17,6 +17,8 @@ class ModelValidator:
         self.token = HF_TOKEN
         self.api = HfApi(token=self.token)
         self.headers = {"Authorization": f"Bearer {self.token}"} if self.token else {}
+        self.logger = logger
+        self.config_cache = {}
 
     async def check_model_card(self, model_id: str) -> Tuple[bool, str, Optional[Dict[str, Any]]]:
         """Check if model has a valid model card"""
@@ -86,48 +88,135 @@ class ModelValidator:
         base_model: str,
         revision: str
     ) -> Tuple[Optional[float], Optional[str]]:
-        """Get model size in billions of parameters"""
         try:
-            logger.info(LogFormatter.info(f"Checking model size for {model_info.modelId}"))
-
+            self.logger.info(LogFormatter.info(f"Checking model size for {model_info.modelId}"))
+
             # Check if model is adapter
-            is_adapter = any(s.rfilename == "adapter_config.json" for s in model_info.siblings if hasattr(s, 'rfilename'))
+            is_adapter = any(
+                s.rfilename == "adapter_config.json"
+                for s in model_info.siblings
+                if hasattr(s, 'rfilename')
+            )
 
-            # Try to get size from safetensors first
+            # Get model size from safetensors
             model_size = None
-
             if is_adapter and base_model:
-                # For adapters, we need both adapter and base model sizes
-                adapter_meta = await self.get_safetensors_metadata(model_info.id, is_adapter=True, revision=revision)
-                base_meta = await self.get_safetensors_metadata(base_model, revision="main")
-
+                # For adapters, combine adapter and base model sizes
+                adapter_meta = await self.get_safetensors_metadata(
+                    model_info.id,
+                    is_adapter=True,
+                    revision=revision
+                )
+                base_meta = await self.get_safetensors_metadata(
+                    base_model,
+                    revision="main"
+                )
+
                 if adapter_meta and base_meta:
                     adapter_size = sum(adapter_meta.parameter_count.values())
                     base_size = sum(base_meta.parameter_count.values())
                     model_size = adapter_size + base_size
             else:
-                # For regular models, just get the model size
-                meta = await self.get_safetensors_metadata(model_info.id, revision=revision)
+                # For regular models
+                meta = await self.get_safetensors_metadata(
+                    model_info.id,
+                    revision=revision
+                )
                 if meta:
-                    model_size = sum(meta.parameter_count.values())  # total params
+                    model_size = sum(meta.parameter_count.values())
 
             if model_size is None:
-                # If model size could not be determined, return an error
                 return None, "Model size could not be determined"
 
-            # Adjust size for GPTQ models
-            size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.id.lower()) else 1
-            model_size = model_size / 1e9  # Convert to billions, assuming float16
+            if model_size <= 0:
+                return None, "Invalid model size: must be positive"
+
+            # Only proceed with GPTQ adjustments if necessary
+            if precision == "GPTQ" or "gptq" in model_info.id.lower():
+                precision_bits = await self._get_precision_bits(
+                    model_info.id,
+                    revision
+                )
+
+                if precision_bits is None:
+                    return None, "Failed to determine precision bits"
+
+                # FIXED: scale by bits/32, since quantization shrinks the stored size
+                size_factor = precision_bits / 32  # For 2-bit this is 2/32 = 1/16
+                self.logger.info(LogFormatter.info(
+                    f"Applying quantization factor: {size_factor}x (bits={precision_bits})"
+                ))
+            else:
+                size_factor = 1
+
+            # Convert to billions and apply the quantization factor
+            model_size = model_size / 1e9  # Convert to billions
             model_size = round(size_factor * model_size, 3)
 
-            logger.info(LogFormatter.success(f"Model size: {model_size}B parameters"))
+            self.logger.info(LogFormatter.success(f"Model size: {model_size}B parameters"))
             return model_size, None
 
         except Exception as e:
-            logger.error(LogFormatter.error(f"Error while determining model size: {e}"))
+            self.logger.error(LogFormatter.error(f"Error while determining model size: {e}"))
             return None, str(e)
 
+    async def _get_precision_bits(
+        self,
+        model_id: str,
+        revision: str
+    ) -> Optional[int]:
+        """Get the precision bits from config.json, with caching."""
+        # Check cache first
+        cache_key = f"{model_id}_{revision}"
+        if cache_key in self.config_cache:
+            config_data = self.config_cache[cache_key]
+        else:
+            # Fetch config.json
+            config_url = f"https://huggingface.co/{model_id}/raw/{revision}/config.json"
+            try:
+                async with aiohttp.ClientSession() as session:
+                    async with session.get(config_url, headers=self.headers) as response:
+                        if response.status != 200:
+                            self.logger.warning(LogFormatter.warning(
+                                f"Failed to fetch config.json from {config_url}. Defaulting to 4 bits for GPTQ."
+                            ))
+                            return 4
+
+                        # Try to parse response as JSON regardless of content type
+                        try:
+                            text = await response.text()
+                            config_data = json.loads(text)
+                            self.config_cache[cache_key] = config_data
+                        except json.JSONDecodeError:
+                            self.logger.warning(LogFormatter.warning(
+                                f"Failed to parse config.json from {config_url}. Defaulting to 4 bits for GPTQ."
+                            ))
+                            return 4
+
+            except Exception as e:
+                self.logger.error(LogFormatter.error(
+                    f"Error fetching config.json: {e}. Defaulting to 4 bits."
+                ))
+                return 4
+
+        # Get precision bits from config
+        try:
+            precision_bits = config_data.get("quantization_config", {}).get("bits", 4)
+
+            # Validate precision bits
+            if precision_bits not in [2, 3, 4, 8]:
+                self.logger.error(LogFormatter.error(
+                    f"Unsupported precision_bits: {precision_bits}"
+                ))
+                return None
+
+            return precision_bits
+        except Exception as e:
+            self.logger.error(LogFormatter.error(
+                f"Error extracting precision bits from config: {e}. Defaulting to 4 bits."
+            ))
+            return 4
+
     async def check_chat_template(
         self,
         model_id: str,
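Note on the arithmetic above: the commit replaces the old flat `size_factor = 8` for GPTQ with `size_factor = precision_bits / 32`, so the reported size now shrinks with the bit width instead of growing. A minimal standalone sketch of that computation with illustrative numbers (the helper name `effective_size_b` is ours, not part of the repo):

```python
from typing import Optional

def effective_size_b(param_count: int, precision_bits: Optional[int]) -> float:
    """Mirror the commit's arithmetic: convert to billions, then scale by bits/32."""
    size_factor = precision_bits / 32 if precision_bits is not None else 1
    return round(size_factor * (param_count / 1e9), 3)

print(effective_size_b(7_000_000_000, 4))     # 4-bit GPTQ: 4/32 = 0.125 -> 0.875
print(effective_size_b(7_000_000_000, 2))     # 2-bit GPTQ: 2/32 = 1/16 -> 0.438
print(effective_size_b(7_000_000_000, None))  # non-GPTQ: unscaled -> 7.0
```

Under this scheme a quantized checkpoint is sized by its compressed footprint rather than its logical parameter count, which appears to be the behavioral change the [wip] commit message flags.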
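The new `_get_precision_bits` fetches `config.json` over raw HTTP with `aiohttp` and memoizes the parsed config in `self.config_cache`; for this hunk to run, `aiohttp` and `json` must already be imported at the top of `model_validation.py`, since the diff does not touch the imports. As a hedged alternative (not what the commit does), the same lookup could go through `huggingface_hub`, which handles auth and local file caching itself; the helper name `read_gptq_bits` is ours:

```python
import json
from huggingface_hub import hf_hub_download

def read_gptq_bits(model_id: str, revision: str = "main", default: int = 4) -> int:
    """Read quantization_config.bits from config.json, defaulting to 4 like the commit."""
    try:
        path = hf_hub_download(model_id, filename="config.json", revision=revision)
        with open(path) as f:
            config = json.load(f)
        return config.get("quantization_config", {}).get("bits", default)
    except Exception:
        # Mirror the commit's fallback: assume 4-bit GPTQ when the config is unavailable
        return default
```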
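Finally, a hypothetical end-to-end driver for the changed path. The name of the enclosing method is cut off in this hunk; the sketch assumes it is `get_model_size`, that `model_info` and `precision` are parameters alongside the visible `base_model` and `revision`, and that `ModelValidator()` takes no constructor arguments (consistent with the `__init__` hunk above):

```python
import asyncio
from huggingface_hub import HfApi

from app.utils.model_validation import ModelValidator  # import path assumed from the repo layout

async def main() -> None:
    info = HfApi().model_info("TheBloke/Llama-2-7B-GPTQ")  # a public GPTQ checkpoint
    validator = ModelValidator()
    # Method name and the model_info/precision parameters are assumptions; only
    # base_model and revision appear in the visible hunk.
    size_b, err = await validator.get_model_size(
        model_info=info,
        precision="GPTQ",
        base_model="",
        revision="main",
    )
    print(size_b, err)

asyncio.run(main())
```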