Spaces: Running on L40S
Akash Garg committed · Commit f6a2f50 · 1 Parent(s): 0c10674

adding variance slider for top_p
Browse files:
- app.py (+8 -4)
- cube/cube3d/generate.py (+7 -7)
- cube/cube3d/inference/engine.py (+14 -13)
- cube/cube3d/inference/logits_postprocesses.py (+32 -18)
app.py
CHANGED
@@ -39,9 +39,10 @@ def gen_save_folder(max_size=200):
 …
     return new_folder
 
-def handle_text_prompt(input_prompt):
-    print(f"prompt: {input_prompt}")
-    …
+def handle_text_prompt(input_prompt, variance):
+    print(f"prompt: {input_prompt}, variance: {variance}")
+    top_p = None if variance == 0 else (100 - variance) / 100.0
+    mesh_v_f = GLOBAL_STATE["engine_fast"].t2s([input_prompt], use_kv_cache=True, resolution_base=8.0, top_p=top_p)
     # save output
     vertices, faces = mesh_v_f[0][0], mesh_v_f[0][1]
     save_folder = gen_save_folder()
@@ -57,6 +58,7 @@ def build_interface():
         gr.Markdown(
             f"""
             # {title}
+            # Check out our [Github](https://github.com/Roblox/cube) to try it on your own machine!
             """
         )
 
@@ -74,11 +76,13 @@ def build_interface():
             model3d = gr.Model3D(
                 label="Output", height="45em", interactive=False
             )
+            variance = gr.Slider(minimum=0, maximum=99, step=1, value=0, label="Variance")
 
             submit_button.click(
                 handle_text_prompt,
                 inputs=[
-                    input_text_box
+                    input_text_box,
+                    variance
                 ],
                 outputs=[
                     model3d
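In the UI, the new slider is labelled "Variance" and is converted into a nucleus-sampling threshold before it reaches the engine. A minimal sketch of that mapping (the helper name variance_to_top_p is hypothetical; the expression itself is copied from handle_text_prompt above):

    def variance_to_top_p(variance: int):
        # Same expression as in handle_text_prompt: 0 disables sampling,
        # any other slider value becomes a top_p threshold in (0, 1).
        return None if variance == 0 else (100 - variance) / 100.0

    variance_to_top_p(0)    # None -> deterministic argmax decoding
    variance_to_top_p(10)   # 0.9  -> nucleus sampling with a wide token set
    variance_to_top_p(99)   # 0.01 -> nucleus sampling restricted to the very top tokens

Note that larger slider values map to smaller top_p values, i.e. a tighter nucleus.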
cube/cube3d/generate.py
CHANGED
@@ -20,13 +20,13 @@ def generate_mesh(
     output_name,
     resolution_base=8.0,
     disable_postprocess=False,
-    …
+    top_p=None,
 ):
     mesh_v_f = engine.t2s(
         [prompt],
         use_kv_cache=True,
         resolution_base=resolution_base,
-        …
+        top_p=top_p,
     )
     vertices, faces = mesh_v_f[0][0], mesh_v_f[0][1]
     obj_path = os.path.join(output_dir, f"{output_name}.obj")
@@ -87,10 +87,10 @@ if __name__ == "__main__":
         help="Text prompt for generating a 3D mesh",
     )
     parser.add_argument(
-        "--top-…
-        type=…
-        default=…
-        help="…
+        "--top-p",
+        type=float,
+        default=None,
+        help="Float < 1: Keep smallest set of tokens with cumulative probability ≥ top_p. Default None: deterministic generation.",
     )
     parser.add_argument(
         "--render-gif",
@@ -136,7 +136,7 @@ if __name__ == "__main__":
         "output",
         args.resolution_base,
         args.disable_postprocessing,
-        args.…
+        args.top_p,
     )
     if args.render_gif:
         gif_path = renderer.render_turntable(obj_path, args.output_dir)
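The same parameter is now exposed to callers of generate_mesh (and, via the new --top-p flag, on the command line). A hedged usage sketch; the leading engine/prompt/output-directory arguments are assumed from the surrounding script and do not appear in this diff, only the trailing parameters do:

    obj_path = generate_mesh(
        engine,                      # assumed: an Engine/EngineFast built elsewhere in the script
        "a weathered wooden chest",  # assumed prompt text, for illustration only
        "outputs",                   # assumed output directory
        "output",                    # output_name, as in the script's __main__ call
        resolution_base=8.0,
        disable_postprocess=False,
        top_p=0.9,                   # new: nucleus sampling; None restores deterministic decoding
    )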
cube/cube3d/inference/engine.py
CHANGED
@@ -160,7 +160,7 @@ class Engine:
         prompts: list[str],
         use_kv_cache: bool,
         guidance_scale: float = 3.0,
-        …
+        top_p: float = None,
     ):
         """
         Generates text using a GPT model based on the provided prompts.
@@ -168,7 +168,8 @@
             prompts (list[str]): A list of input prompts to generate text from.
             use_kv_cache (bool): Whether to use key-value caching for faster generation.
             guidance_scale (float, optional): The scale for guidance during generation. Default is 3.0.
-            …
+            top_p (float, optional): The cumulative probability threshold for nucleus sampling.
+                If None, argmax selection is performed (deterministic generation). Otherwise, the smallest set of tokens with cumulative probability ≥ top_p is kept (stochastic generation).
         Returns:
             torch.Tensor: A tensor containing the generated token IDs.
         """
@@ -215,11 +216,10 @@
                 guidance_scale * (self.max_new_tokens - i) / self.max_new_tokens
             )
             logits = (1 + gamma) * logits - gamma * uncond_logits
-            …
+            next_id = process_logits(
                 logits,
-                …
+                top_p=top_p,
             )
-            next_id = torch.multinomial(probs, num_samples=1, replacement=True)
             output_ids.append(next_id)
             next_embed = self.gpt_model.encode_token(next_id)
             if guidance_scale > 0.0:
@@ -266,7 +266,7 @@
         guidance_scale: float = 3.0,
         resolution_base: float = 8.0,
         chunk_size: int = 100_000,
-        …
+        top_p: float = None,
     ):
         """
         Generates a 3D mesh from text prompts using a GPT model and shape decoder.
@@ -276,10 +276,12 @@
             guidance_scale (float, optional): The scale of guidance for the GPT model. Default is 3.0.
             resolution_base (float, optional): The base resolution for the shape decoder. Default is 8.0.
             chunk_size (int, optional): The chunk size for processing the shape decoding. Default is 100,000.
+            top_p (float, optional): The cumulative probability threshold for nucleus sampling.
+                If None, argmax selection is performed (deterministic generation). Otherwise, the smallest set of tokens with cumulative probability ≥ top_p is kept (stochastic generation).
         Returns:
             mesh_v_f: The generated 3D mesh vertices and faces.
         """
-        output_ids = self.run_gpt(prompts, use_kv_cache, guidance_scale, …
+        output_ids = self.run_gpt(prompts, use_kv_cache, guidance_scale, top_p)
         with torch.autocast(self.device.type, dtype=torch.bfloat16):
             mesh_v_f = self.run_shape_decode(output_ids, resolution_base, chunk_size)
         return mesh_v_f
@@ -426,7 +428,7 @@ class EngineFast(Engine):
         prompts: list[str],
         use_kv_cache: bool,
         guidance_scale: float = 3.0,
-        …
+        top_p: float = None
     ):
         """
         Runs the GPT model to generate text based on the provided prompts.
@@ -434,6 +436,8 @@
             prompts (list[str]): A list of input prompts for the GPT model. Only a single prompt is supported.
             use_kv_cache (bool): Flag indicating whether to use key-value caching. (Currently not used)
             guidance_scale (float, optional): The scale factor for guidance. Default is 3.0.
+            top_p (float, optional): The cumulative probability threshold for nucleus sampling.
+                If None, argmax selection is performed. Otherwise, the smallest set of tokens with cumulative probability ≥ top_p is kept.
         Returns:
             torch.Tensor: A tensor containing the generated output token IDs.
         Raises:
@@ -464,9 +468,7 @@
         logits, uncond_logits = logits.float().chunk(2, dim=0)
         gamma = guidance_scale
         logits = (1 + gamma) * logits - gamma * uncond_logits
-        …
-        probs = process_logits(logits, top_k=top_k)
-        next_id = torch.multinomial(probs, num_samples=1, replacement=True)
+        next_id = process_logits(logits, top_p=top_p)
 
         output_ids[:, 0] = next_id.squeeze()
         next_embed = self.gpt_model.encode_token(next_id)
@@ -488,8 +490,7 @@
             guidance_scale * (self.max_new_tokens - i) / self.max_new_tokens
             )
             logits = (1 + gamma) * logits - gamma * uncond_logits
-            …
-            next_id = torch.multinomial(probs, num_samples=1, replacement=True)
+            next_id = process_logits(logits, top_p=top_p)
 
             output_ids[:, i] = next_id.squeeze()
             next_embed = self.gpt_model.encode_token(next_id)
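End to end, top_p now flows from t2s into run_gpt and then into process_logits at every decoding step, so a caller can switch between deterministic and stochastic decoding per request. A minimal sketch of both modes; the engine_fast instance, its construction and checkpoints are assumed (not part of this diff), while the keyword arguments are the ones shown above and in app.py:

    # Deterministic: top_p=None makes process_logits fall back to argmax at each step.
    mesh_v_f = engine_fast.t2s(
        ["a small toy robot"],   # illustrative prompt
        use_kv_cache=True,
        resolution_base=8.0,
        top_p=None,
    )

    # Stochastic: nucleus sampling with top_p=0.9 at each step, so repeated calls
    # with the same prompt can produce different meshes.
    mesh_v_f = engine_fast.t2s(
        ["a small toy robot"],
        use_kv_cache=True,
        resolution_base=8.0,
        top_p=0.9,
    )
    vertices, faces = mesh_v_f[0][0], mesh_v_f[0][1]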
cube/cube3d/inference/logits_postprocesses.py
CHANGED
@@ -2,22 +2,28 @@ import torch
 import torch.nn.functional as F
 
 
-def …
+def top_p_filtering(logits, top_p: float = 1.0):
     """
-    Filter a distribution of logits using top-…
+    Filter a distribution of logits using top-p filtering.
     The input logits tensor is modified in-place.
 
     Args:
-        logits: A tensor of logits to be filtered. Expected shape is [..., vocab_size].
-        …
+        logits (torch.Tensor): A tensor of logits to be filtered. Expected shape is [..., vocab_size].
+        top_p (float, optional): The cumulative probability threshold for top-p sampling.
+            If < 1.0, only keep the smallest set of tokens whose
+            cumulative probability does not exceed this threshold.
 
     Returns:
-        …
+        torch.Tensor: logits where values outside the top-p threshold are set to -∞.
     """
-    if …
-        …
-        …
-        ]
+    if top_p < 1.0:
+        sorted_logits, sorted_idx = logits.sort(dim=-1, descending=True)
+        sorted_idx_to_remove = sorted_logits.softmax(dim=-1).cumsum(dim=-1) > top_p
+        sorted_idx_to_remove[..., 0] = False
+
+        idx_to_remove = sorted_idx_to_remove.scatter(
+            -1, sorted_idx, sorted_idx_to_remove
+        )
         logits.masked_fill_(idx_to_remove, -torch.inf)
 
     return logits
@@ -25,19 +31,27 @@ def top_k_filtering(logits, top_k: int = 1):
 
 def process_logits(
     logits,
-    …
+    top_p: float = None,
 ):
     """
-    Process logits by optionally applying top-…
-    …
+    Process logits by optionally applying nucleus (top-p) filtering and token selection.
+
+    If `top_p` is None, the token with the highest probability (argmax) is selected.
+    If `top_p` is provided, the smallest set of tokens with cumulative probability ≥ top_p is kept, then softmax is applied to obtain
+    probabilities. A token is sampled from this filtered distribution using `torch.multinomial`.
 
     Args:
-        logits: A tensor of logits to process.
-        …
+        logits (torch.Tensor): A tensor of logits to process.
+        top_p (float, optional): The cumulative probability threshold for nucleus sampling.
+            If None, argmax selection is performed (deterministic generation). Otherwise, the smallest set of tokens with cumulative probability ≥ top_p is kept (stochastic generation).
 
     Returns:
-        …
+        torch.Tensor: the selected token index.
     """
-    …
-    …
-    …
+    if top_p is None:
+        next_id = torch.argmax(logits, dim=-1, keepdim=True)
+    else:
+        logits = top_p_filtering(logits, top_p=top_p)
+        probs = F.softmax(logits, dim=-1)
+        next_id = torch.multinomial(probs, num_samples=1, replacement=True)
+    return next_id
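For reference, a self-contained sketch of what the new selection path computes on toy logits. It re-implements the nucleus step inline so it runs without the cube3d package; the numbers and tensor shapes are illustrative, not taken from the commit:

    import torch
    import torch.nn.functional as F

    torch.manual_seed(0)
    logits = torch.tensor([[2.0, 1.0, 0.5, -1.0, -3.0]])    # one decoding "step" over a 5-token vocabulary

    # top_p is None -> deterministic argmax, as in process_logits above.
    next_id = torch.argmax(logits, dim=-1, keepdim=True)     # tensor([[0]])

    # top_p = 0.9 -> drop tokens once cumulative probability exceeds 0.9, then sample.
    top_p = 0.9
    sorted_logits, sorted_idx = logits.sort(dim=-1, descending=True)
    remove = sorted_logits.softmax(dim=-1).cumsum(dim=-1) > top_p
    remove[..., 0] = False                                    # always keep the most likely token
    mask = remove.scatter(-1, sorted_idx, remove)             # map the mask back to vocabulary order
    probs = F.softmax(logits.masked_fill(mask, -torch.inf), dim=-1)
    next_id = torch.multinomial(probs, num_samples=1)         # stochastic pick from the kept tokens

With these logits only the first two tokens survive the filter, and sampling happens over their renormalised probabilities (≈0.73 and ≈0.27).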