Update handler.py

handler.py · +6 −15 · CHANGED
@@ -42,12 +42,6 @@ class EndpointHandler:
                     new_message["content"] += content["text"]
                 elif content["type"] == "image_url":
                     images.append(load_image(content["image_url"]["url"]))
-                    logger.info(
-                        "Loaded image using `transformers.image_utils.load_image`"
-                    )
-                    logger.info(
-                        f"Current {new_message['content']} text if any contains {new_message['content'].count(IMAGE_TOKENS)} image tokens"
-                    )
             if new_message["content"].count(
                 f"{IMAGE_TOKENS}{SEPARATOR}"
             ) < len(images):
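
For context, this hunk sits inside the loop that flattens OpenAI-style chat messages: text parts are concatenated and image parts are fetched with transformers.image_utils.load_image. Below is a minimal sketch of that pattern. The IMAGE_TOKENS and SEPARATOR values are stand-ins (the real constants are defined elsewhere in handler.py), and the prepend-missing-placeholders step is an assumption about what the truncated "if" branch goes on to do:

from transformers.image_utils import load_image

# Stand-in values; the real constants are defined elsewhere in handler.py.
IMAGE_TOKENS = "<image>"
SEPARATOR = "\n"

def flatten_message(message):
    """Concatenate text parts and load any referenced images."""
    new_message = {"role": message["role"], "content": ""}
    images = []
    for content in message["content"]:
        if content["type"] == "text":
            new_message["content"] += content["text"]
        elif content["type"] == "image_url":
            # load_image accepts URLs, local file paths, and PIL images.
            images.append(load_image(content["image_url"]["url"]))
    # Assumed behaviour: if the text carries fewer image placeholders than
    # images were supplied, prepend one placeholder per missing image.
    missing = len(images) - new_message["content"].count(f"{IMAGE_TOKENS}{SEPARATOR}")
    if missing > 0:
        new_message["content"] = f"{IMAGE_TOKENS}{SEPARATOR}" * missing + new_message["content"]
    return new_message, images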
@@ -72,27 +66,24 @@ class EndpointHandler:
             inputs["pixel_values"] = inputs["pixel_values"].unsqueeze(0)
             inputs["image_sizes"] = inputs["image_sizes"].unsqueeze(0)
         inputs = inputs.to("cuda").to(torch.bfloat16)
-        logger.info(f"Inputs contains {inputs=}")

         generation_args = {
             "max_new_tokens": data.get("max_new_tokens", data.get("max_tokens", 128)),
-            "temperature": data.get("temperature", 0.0),
-            "do_sample": False,
-            "use_cache": True,
+            "temperature": data.get("temperature", 0.0),
+            "do_sample": False,  # temperature won't really work unless this is set to True
+            "use_cache": False,  # disabled as otherwise the same prompt with different images won't download the image again
             "num_beams": 1,
         }
-        logger.info(
+        logger.info(
+            f"Running text generation with the following {generation_args=} (skipped {set(data.keys()) - set(generation_args.keys())})"
+        )

         with torch.inference_mode():
-            logger.info(f"Inputs contains {inputs['input_ids']=}")
             generate_ids = self.model.generate(**inputs, **generation_args)
-            logger.info(f"Generate IDs contains {generate_ids=}")

-            logger.info(f"Generated {generate_ids=}")
         generate_ids = generate_ids[:, inputs["input_ids"].shape[-1] :]
         response = self.processor.decode(
             generate_ids[0], skip_special_tokens=True
         ).strip()
-        logger.info(f"Generated the {response=}")

         return {"generated_text": response}
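
A note on the input-preparation lines at the top of this hunk: the processor output gets a leading batch dimension via unsqueeze(0) before the whole batch is moved to CUDA and cast to bfloat16. A rough sketch, assuming a LLaVA-style processor that returns pixel_values and image_sizes; the dim() guard is an assumption, since the diff only shows that the unsqueeze lines sit inside some conditional:

import torch

def prepare_inputs(processor, prompt, images):
    """Tokenize text and images, add a batch dim, move to GPU in bfloat16."""
    inputs = processor(text=prompt, images=images, return_tensors="pt")
    # Assumed guard: some processors return unbatched image tensors here.
    if inputs["pixel_values"].dim() == 4:
        inputs["pixel_values"] = inputs["pixel_values"].unsqueeze(0)
        inputs["image_sizes"] = inputs["image_sizes"].unsqueeze(0)
    # .to(dtype) on a BatchFeature casts only floating-point tensors,
    # so input_ids stay integer-typed.
    return inputs.to("cuda").to(torch.bfloat16)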
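On the generation_args change itself: in transformers, temperature only applies in sampling mode, so with do_sample=False decoding is greedy and the value is ignored (newer versions warn about this, which is what the new inline comment alludes to). One hedged alternative that lets a client-supplied temperature actually take effect; build_generation_args is a hypothetical helper, not code from this handler:

def build_generation_args(data):
    """Hypothetical helper: map request fields onto generate() kwargs."""
    args = {
        "max_new_tokens": data.get("max_new_tokens", data.get("max_tokens", 128)),
        "num_beams": 1,
    }
    temperature = data.get("temperature", 0.0)
    if temperature > 0:
        # Sampling mode: temperature takes effect.
        args["do_sample"] = True
        args["temperature"] = temperature
    else:
        # Greedy decoding; transformers ignores temperature in this mode.
        args["do_sample"] = False
    return args

One caveat on the new use_cache comment: in generate(), use_cache toggles the key/value attention cache, so disabling it mainly slows down decoding; it does not control whether images are fetched again.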
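Finally, the decode path kept by this hunk: generate() returns the prompt tokens followed by the completion, so slicing off the first input_ids.shape[-1] positions keeps only the newly generated tokens before decoding. A compact sketch of the pattern (names are illustrative):

import torch

def generate_response(model, processor, inputs, generation_args):
    """Run generation and decode only the newly produced tokens."""
    with torch.inference_mode():
        generate_ids = model.generate(**inputs, **generation_args)
    # generate() echoes the prompt first; keep only the completion tokens.
    new_tokens = generate_ids[:, inputs["input_ids"].shape[-1]:]
    return processor.decode(new_tokens[0], skip_special_tokens=True).strip()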