aswerdlow committed on
Commit
3a60a49
·
1 Parent(s): 2f30910

Fixed demo instructions & yaml config

Browse files
README.md CHANGED
@@ -45,19 +45,13 @@ See [TRAIN.md](docs/TRAIN.md) for training commands.
45
 
46
  ## Inference
47
 
48
- <!-- Inference demo for **TODO**.
49
- ```
50
- TODO
51
- ``` -->
52
- <!-- <img src="docs/todo.png" width="1000"> -->
53
-
54
-
55
  Interactive demo:
 
 
 
 
 
56
  ```
57
- python demo/server.py
58
- python demo/client_simple_fasthtml.py
59
- ```
60
-
61
 
62
  ## Training
63
 
@@ -71,11 +65,12 @@ See [EVAL.md](docs/EVAL.md) for details.
71
  ### Citation
72
  To cite our work, please use the following:
73
  ```
74
- @article{TODO,
75
- title={TODO},
76
- author={TODO},
77
- journal={arXiv preprint arXiv:TODO},
78
- year={TODO}
 
79
  }
80
  ```
81
 
 
45
 
46
  ## Inference
47
 
 
 
 
 
 
 
 
48
  Interactive demo:
49
+ ```bash
50
+ mkdir -p ./ckpts/unidisc_interleaved
51
+ huggingface-cli download aswerdlow/unidisc_interleaved --local-dir ./ckpts/unidisc_interleaved
52
+ uv run demo/server.py experiments='[large_scale_train,large_scale_train_high_res_interleaved,eval_unified,large_scale_high_res_interleaved_inference]' trainer.load_from_state_dict="./ckpts/unidisc_interleaved/unidisc_interleaved.pt"
53
+ uv run demo/client.py
54
  ```
 
 
 
 
55
 
56
  ## Training
57
 
 
65
  ### Citation
66
  To cite our work, please use the following:
67
  ```
68
+ @article{swerdlow2025unidisc,
69
+ title = {Unified Multimodal Discrete Diffusion},
70
+ author = {Swerdlow, Alexander and Prabhudesai, Mihir and Gandhi, Siddharth and Pathak, Deepak and Fragkiadaki, Katerina},
71
+ journal = {arXiv preprint arXiv:2503.20853},
72
+ year = {2025},
73
+ doi = {10.48550/arXiv.2503.20853},
74
  }
75
  ```
76
 
configs/config.yaml CHANGED
@@ -293,103 +293,6 @@ hydra:
293
  subdir: ${hydra.job.id}
294
  job:
295
  chdir: true
296
- # launcher:
297
- # name: ${get_slurm_name:}
298
- # # See https://hydra.cc/docs/configure_hydra/workdir/
299
- # submitit_folder: ${hydra.sweep.dir}/%j
300
- # nodes: ${nodes} # Number of nodes. This value is *per* node
301
- # mem_gb: ${eval:'${mem_per_gpu} * ${trainer.devices}'} # 40GB per gpu. This value is *per* node
302
- # gpus_per_node: ${trainer.devices}
303
- # partition: ${partition}
304
- # constraint: ${constraint}
305
- # exclude: ${exclude_nodes:}
306
-
307
- # timeout_min: ${timeout_min}
308
- # max_num_timeout: 12 # Num requeue excluding pre-emptions
309
- # comment: aswerdlo
310
- # stderr_to_stdout: true
311
-
312
- # # Be careful with changing anything below.
313
- # # see: https://github.com/stas00/ml-engineering/tree/master/training/fault-tolerance#approach-b2-choosing-which-process-to-send-the-signal-to
314
- # # see: https://github.com/huggingface/accelerate/issues/1918
315
-
316
- # # The accelerate launcher w/1 initial process and then spawn 1 per GPU
317
- # tasks_per_node: 1
318
- # cpus_per_task: ${eval:'${cpus_per_gpu} * ${trainer.devices}'}
319
- # python: |
320
- # bash -c "torchrun --nnodes $SLURM_NNODES --nproc_per_node $SLURM_GPUS_PER_NODE --role \$(hostname -s|tr -dc '0-9'): --node_rank \$SLURM_PROCID --max-restarts=2 --rdzv_id $RANDOM --rdzv_backend c10d --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
321
-
322
- # # python: "${getpythoncmd:}"
323
- # # tasks_per_node: ${devices}
324
- # # cpus_per_task: 8
325
- # # python: 'python'
326
-
327
- # python_suffix: ' --dummy-arg $SLURM_JOB_ID" &'
328
- # signal: 'B:USR2@360'
329
- # post_srun_commands:
330
- # - ''
331
- # - wait
332
-
333
- # srun_args:
334
- # - '--jobid $SLURM_JOB_ID'
335
-
336
- # setup:
337
- # - |
338
- # export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
339
- # export MASTER_PORT=$(( ($SLURM_JOB_ID % 20001) + 30000 ))
340
- # export NUM_PROCESSES=$((SLURM_NNODES * SLURM_GPUS_PER_NODE))
341
- # export NCCL_DEBUG=INFO
342
- # export NCCL_NSOCKS_PERTHREAD=4
343
- # export NCCL_SOCKET_NTHREADS=2
344
- # export OMP_NUM_THREADS=2
345
- # export PYTHONUNBUFFERED=1
346
- # export STDOUT_PATH=$(scontrol show job $SLURM_JOB_ID | grep -oP "StdOut=\K[^ ]+")
347
- # export LOCAL_JOB_FOLDER=$(dirname $STDOUT_PATH)
348
- # export NCCL_TOPO_DUMP_FILE="$LOCAL_JOB_FOLDER/nccl_topo.xml"
349
- # if [ -n "$SLURM_RESTART_COUNT" ]; then
350
- # export RESTART_COUNT=$SLURM_RESTART_COUNT
351
- # else
352
- # export RESTART_COUNT=0
353
- # fi
354
- # export MAIN_LOG_PATH="$LOCAL_JOB_FOLDER/log_$RESTART_COUNT.txt"
355
-
356
- # mkdir -p $LOCAL_JOB_FOLDER
357
- # printenv > "$LOCAL_JOB_FOLDER"/env_"$SLURM_LOCALID_$RESTART_COUNT.txt"
358
-
359
- # echo "ibstatus: $(ibstatus)"
360
- # echo "ibdev2netdev: $(ibdev2netdev)"
361
- # echo "rdma device: $(rdma link)"
362
- # echo "environment: $(env | grep NCCL)"
363
- # echo "NUM_PROCESSES: $NUM_PROCESSES, SLURM_NNODES: $SLURM_NNODES SLURM_GPUS_PER_NODE: $SLURM_GPUS_PER_NODE"
364
- # echo "NODE_ID: $SLURM_NODEID, SLURM_PROCID: $SLURM_PROCID, MASTER_ADDR: $MASTER_ADDR, MASTER_PORT: $MASTER_PORT"
365
- # echo "PWD: $PWD, LOCAL_JOB_FOLDER: $LOCAL_JOB_FOLDER, MAIN_LOG_PATH: $MAIN_LOG_PATH"
366
-
367
- # trap 'echo "SIGUSR2 received for $SLURM_JOB_ID"; \
368
- # if [ -n "$SLURM_ARRAY_JOB_ID" ]; then echo "SLURM_ARRAY_JOB_ID: $SLURM_ARRAY_JOB_ID"; fi; \
369
- # if [ -n "$SLURM_ARRAY_TASK_ID" ]; then echo "SLURM_ARRAY_TASK_ID: $SLURM_ARRAY_TASK_ID"; fi; \
370
- # # ps auxww | grep $USER; \
371
- # pid=$(pgrep -u $USER -f "python.*(accelerate|torchrun|deepspeed|distributed\.run).*dummy-arg $SLURM_JOB_ID"); \
372
- # echo "Found parent PIDs: $pid"; \
373
- # for p in $pid; do \
374
- # echo "Parent PID has cmd: $(ps -p $p -o cmd=)"; \
375
- # children=$(pgrep -P $p); \
376
- # echo "Children: $children"; \
377
- # if [ -n "$children" ]; then \
378
- # for child in $children; do \
379
- # ppid=$(ps -o ppid= -p $child | tr -d " ")
380
- # if [ "$ppid" -eq "$p" ]; then
381
- # echo "Killing direct child process: PID $child with cmd: $(ps -p $child -o cmd=)"
382
- # kill -USR2 $child &
383
- # else
384
- # echo "Skipping non-direct child process: PID $child with PPID $ppid"
385
- # fi
386
- # done; \
387
- # echo "Sent kill signals to children of $p"; \
388
- # else \
389
- # echo "No children found for $p"; \
390
- # fi; \
391
- # done; \
392
- # wait;' SIGUSR2
393
 
394
  checkpointing:
395
  # Use custom `save_dir` if, e.g., saving to S3 bucket, otherwise leave this parameter as is
@@ -447,5 +350,8 @@ data:
447
  add_image_gen_tokens: false
448
  use_slow_tokenizer: false
449
  add_image_token: false
 
 
 
450
 
451
  dummyarg: null
 
293
  subdir: ${hydra.job.id}
294
  job:
295
  chdir: true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
 
297
  checkpointing:
298
  # Use custom `save_dir` if, e.g., saving to S3 bucket, otherwise leave this parameter as is
 
350
  add_image_gen_tokens: false
351
  use_slow_tokenizer: false
352
  add_image_token: false
353
+ train: "unset_dataset"
354
+ val: "unset_dataset"
355
+ tokenizer_name_or_path: "NousResearch/Llama-2-7b-hf"
356
 
357
  dummyarg: null
configs/config_empty.yaml DELETED
@@ -1,8 +0,0 @@
1
- defaults:
2
- - _self_
3
- - /model: small
4
- - /experiments: []
5
-
6
- # from omegaconf import OmegaConf
7
- # with open("config.yaml", "w") as fp:
8
- # OmegaConf.save(config=config, f=fp.name)
 
 
 
 
 
 
 
 
 
configs/slurm_example.yaml ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is an example slurm launcher config that should be added to the main config.yaml file under the hydra section. This cannot be run directly.
2
+ hydra:
3
+ launcher:
4
+ name: ${get_slurm_name:}
5
+ # See https://hydra.cc/docs/configure_hydra/workdir/
6
+ submitit_folder: ${hydra.sweep.dir}/%j
7
+ nodes: ${nodes} # Number of nodes. This value is *per* node
8
+ mem_gb: ${eval:'${mem_per_gpu} * ${trainer.devices}'} # 40GB per gpu. This value is *per* node
9
+ gpus_per_node: ${trainer.devices}
10
+ partition: ${partition}
11
+ constraint: ${constraint}
12
+ exclude: ${exclude_nodes:}
13
+
14
+ timeout_min: ${timeout_min}
15
+ max_num_timeout: 12 # Num requeue excluding pre-emptions
16
+ comment: aswerdlo
17
+ stderr_to_stdout: true
18
+
19
+ # Be careful with changing anything below.
20
+ # see: https://github.com/stas00/ml-engineering/tree/master/training/fault-tolerance#approach-b2-choosing-which-process-to-send-the-signal-to
21
+ # see: https://github.com/huggingface/accelerate/issues/1918
22
+
23
+ # The accelerate launcher w/1 initial process and then spawn 1 per GPU
24
+ tasks_per_node: 1
25
+ cpus_per_task: ${eval:'${cpus_per_gpu} * ${trainer.devices}'}
26
+ python: |
27
+ bash -c "torchrun --nnodes $SLURM_NNODES --nproc_per_node $SLURM_GPUS_PER_NODE --role \$(hostname -s|tr -dc '0-9'): --node_rank \$SLURM_PROCID --max-restarts=2 --rdzv_id $RANDOM --rdzv_backend c10d --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
28
+
29
+ python_suffix: ' --dummy-arg $SLURM_JOB_ID" &'
30
+ signal: 'B:USR2@360'
31
+ post_srun_commands:
32
+ - ''
33
+ - wait
34
+
35
+ srun_args:
36
+ - '--jobid $SLURM_JOB_ID'
37
+
38
+ setup:
39
+ - |
40
+ export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
41
+ export MASTER_PORT=$(( ($SLURM_JOB_ID % 20001) + 30000 ))
42
+ export NUM_PROCESSES=$((SLURM_NNODES * SLURM_GPUS_PER_NODE))
43
+ export NCCL_DEBUG=INFO
44
+ export NCCL_NSOCKS_PERTHREAD=4
45
+ export NCCL_SOCKET_NTHREADS=2
46
+ export OMP_NUM_THREADS=2
47
+ export PYTHONUNBUFFERED=1
48
+ export STDOUT_PATH=$(scontrol show job $SLURM_JOB_ID | grep -oP "StdOut=\K[^ ]+")
49
+ export LOCAL_JOB_FOLDER=$(dirname $STDOUT_PATH)
50
+ export NCCL_TOPO_DUMP_FILE="$LOCAL_JOB_FOLDER/nccl_topo.xml"
51
+ if [ -n "$SLURM_RESTART_COUNT" ]; then
52
+ export RESTART_COUNT=$SLURM_RESTART_COUNT
53
+ else
54
+ export RESTART_COUNT=0
55
+ fi
56
+ export MAIN_LOG_PATH="$LOCAL_JOB_FOLDER/log_$RESTART_COUNT.txt"
57
+
58
+ mkdir -p $LOCAL_JOB_FOLDER
59
+ printenv > "$LOCAL_JOB_FOLDER"/env_"$SLURM_LOCALID_$RESTART_COUNT.txt"
60
+
61
+ echo "ibstatus: $(ibstatus)"
62
+ echo "ibdev2netdev: $(ibdev2netdev)"
63
+ echo "rdma device: $(rdma link)"
64
+ echo "environment: $(env | grep NCCL)"
65
+ echo "NUM_PROCESSES: $NUM_PROCESSES, SLURM_NNODES: $SLURM_NNODES SLURM_GPUS_PER_NODE: $SLURM_GPUS_PER_NODE"
66
+ echo "NODE_ID: $SLURM_NODEID, SLURM_PROCID: $SLURM_PROCID, MASTER_ADDR: $MASTER_ADDR, MASTER_PORT: $MASTER_PORT"
67
+ echo "PWD: $PWD, LOCAL_JOB_FOLDER: $LOCAL_JOB_FOLDER, MAIN_LOG_PATH: $MAIN_LOG_PATH"
68
+
69
+ trap 'echo "SIGUSR2 received for $SLURM_JOB_ID"; \
70
+ if [ -n "$SLURM_ARRAY_JOB_ID" ]; then echo "SLURM_ARRAY_JOB_ID: $SLURM_ARRAY_JOB_ID"; fi; \
71
+ if [ -n "$SLURM_ARRAY_TASK_ID" ]; then echo "SLURM_ARRAY_TASK_ID: $SLURM_ARRAY_TASK_ID"; fi; \
72
+ # ps auxww | grep $USER; \
73
+ pid=$(pgrep -u $USER -f "python.*(accelerate|torchrun|deepspeed|distributed\.run).*dummy-arg $SLURM_JOB_ID"); \
74
+ echo "Found parent PIDs: $pid"; \
75
+ for p in $pid; do \
76
+ echo "Parent PID has cmd: $(ps -p $p -o cmd=)"; \
77
+ children=$(pgrep -P $p); \
78
+ echo "Children: $children"; \
79
+ if [ -n "$children" ]; then \
80
+ for child in $children; do \
81
+ ppid=$(ps -o ppid= -p $child | tr -d " ")
82
+ if [ "$ppid" -eq "$p" ]; then
83
+ echo "Killing direct child process: PID $child with cmd: $(ps -p $child -o cmd=)"
84
+ kill -USR2 $child &
85
+ else
86
+ echo "Skipping non-direct child process: PID $child with PPID $ppid"
87
+ fi
88
+ done; \
89
+ echo "Sent kill signals to children of $p"; \
90
+ else \
91
+ echo "No children found for $p"; \
92
+ fi; \
93
+ done; \
94
+ wait;' SIGUSR2
demo/assets/boat.jpg ADDED

Git LFS Details

  • SHA256: 76b5ab9ce3c9fb282d3eb53f812d0afd4f972fb8b2b6d8ce771022fbda928f39
  • Pointer size: 131 Bytes
  • Size of remote file: 274 kB
demo/assets/building.jpg ADDED

Git LFS Details

  • SHA256: c3b8fe94b65f17ea90b6b158c7e78ef80155aeaae341de3f76ca00f8fb763eb9
  • Pointer size: 130 Bytes
  • Size of remote file: 14.2 kB
demo/assets/dog.jpg ADDED

Git LFS Details

  • SHA256: 030ca382b90b831bdaa1b52db905dd1ae98beef57c7d1504e85ca1ffc2b5f23f
  • Pointer size: 129 Bytes
  • Size of remote file: 9.89 kB
demo/assets/dog_grass.jpg ADDED

Git LFS Details

  • SHA256: 2d9e610f8a7dee65e894ad964cd5a75707053475d076c23b82b5adab5f7adf1e
  • Pointer size: 131 Bytes
  • Size of remote file: 108 kB
demo/assets/mountain.jpg ADDED

Git LFS Details

  • SHA256: 14cbc3df4f8c9b4b0681fdda773cadb2867379116370eceb988ebf5482d4279b
  • Pointer size: 130 Bytes
  • Size of remote file: 10 kB
demo/assets/pickup.jpg ADDED

Git LFS Details

  • SHA256: 30daa8b78d8eee141ba9691f36e25d69d54627378e75200b750ac21de20766c7
  • Pointer size: 131 Bytes
  • Size of remote file: 582 kB
demo/assets/tajmahal.jpg ADDED

Git LFS Details

  • SHA256: c5d233d65f537bff66cd7e523d5ba3f2b1fdfc30c6302c2e05498c52e0b97258
  • Pointer size: 131 Bytes
  • Size of remote file: 423 kB
demo/assets/venice.jpg ADDED

Git LFS Details

  • SHA256: f929ae5c19233571960d81d4ba36aebbb75acb1a07d4ea61c2d581c4d826eeba
  • Pointer size: 131 Bytes
  • Size of remote file: 487 kB
demo/client.py CHANGED
@@ -571,16 +571,17 @@ def post(
571
  port: int | None = 8001,
572
  reward_models: str | None = "False"
573
  ):
574
- messages = []
575
  if user_input:
576
- messages.append({"type": "text", "text": user_input})
577
 
 
578
  current_image = None
579
  if uploaded_file is not None and uploaded_file.filename != "No image":
580
  current_image = process(Image.open(io.BytesIO(uploaded_file.file.read())), int(resolution))
581
  img_data = encode_image(current_image)["url"]
582
 
583
- messages.append({
584
  "type": "image_url",
585
  "image_url": {"url": img_data},
586
  "is_mask": False
@@ -589,12 +590,15 @@ def post(
589
  if mask_data is not None and len(mask_data) > 0:
590
  mask_array = get_boolean_mask(mask_data)
591
  mask_data_url = encode_array_image(mask_array)["url"]
592
- messages.append({
593
  "type": "image_url",
594
  "image_url": {"url": mask_data_url},
595
  "is_mask": True
596
  })
597
 
 
 
 
598
  config_payload = {
599
  "max_tokens": int(max_tokens),
600
  "resolution": int(resolution),
@@ -608,7 +612,7 @@ def post(
608
  }
609
 
610
  payload = {
611
- "messages": [{"role": "user", "content": messages}],
612
  "model": "unidisc",
613
  **config_payload
614
  }
 
571
  port: int | None = 8001,
572
  reward_models: str | None = "False"
573
  ):
574
+ payload_messages = []
575
  if user_input:
576
+ payload_messages.append({"role": "user", "content": [{"type": "text", "text": user_input}]})
577
 
578
+ image_message_content = []
579
  current_image = None
580
  if uploaded_file is not None and uploaded_file.filename != "No image":
581
  current_image = process(Image.open(io.BytesIO(uploaded_file.file.read())), int(resolution))
582
  img_data = encode_image(current_image)["url"]
583
 
584
+ image_message_content.append({
585
  "type": "image_url",
586
  "image_url": {"url": img_data},
587
  "is_mask": False
 
590
  if mask_data is not None and len(mask_data) > 0:
591
  mask_array = get_boolean_mask(mask_data)
592
  mask_data_url = encode_array_image(mask_array)["url"]
593
+ image_message_content.append({
594
  "type": "image_url",
595
  "image_url": {"url": mask_data_url},
596
  "is_mask": True
597
  })
598
 
599
+ if image_message_content:
600
+ payload_messages.append({"role": "assistant", "content": image_message_content})
601
+
602
  config_payload = {
603
  "max_tokens": int(max_tokens),
604
  "resolution": int(resolution),
 
612
  }
613
 
614
  payload = {
615
+ "messages": payload_messages,
616
  "model": "unidisc",
617
  **config_payload
618
  }
demo/inference.py CHANGED
@@ -386,7 +386,16 @@ def inference(
386
  disable_mask_after_eos=True
387
  )
388
 
389
- img_samples_list = torch.cat(img_samples_list, dim=0)
 
 
 
 
 
 
 
 
 
390
  reward_config = config.eval.auto_enhance_reward_config
391
  rewards, raw_rewards = model.get_rewards(reward_config, img_samples_list, text_samples_list, batch=gen_batch, return_raw_rewards=True)
392
 
 
386
  disable_mask_after_eos=True
387
  )
388
 
389
+ text_samples_list = [x.replace("You are a highly intelligent multimodal AI with the ability to analyze and generate images.", "").removeprefix(" ") for x in text_samples_list]
390
+ if isinstance(img_samples_list[0], Image.Image):
391
+ img_tensors = []
392
+ for img in img_samples_list:
393
+ img_tensor = torch.tensor(np.array(img)).permute(2, 0, 1).float() / 255.0
394
+ img_tensors.append(img_tensor.unsqueeze(0))
395
+ img_samples_list = torch.cat(img_tensors, dim=0)
396
+ else:
397
+ img_samples_list = torch.cat(img_samples_list, dim=0)
398
+
399
  reward_config = config.eval.auto_enhance_reward_config
400
  rewards, raw_rewards = model.get_rewards(reward_config, img_samples_list, text_samples_list, batch=gen_batch, return_raw_rewards=True)
401
 
model_setup.py CHANGED
@@ -12,7 +12,7 @@ from types import FrameType
12
  from contextlib import nullcontext
13
 
14
  import transformers
15
- from constants import HF_TOKEN, HF_CACHE_DIR
16
  import hydra
17
  import hydra.utils
18
  import torch
@@ -599,6 +599,12 @@ def set_accelerator(self, accelerator, ckpt_path=None):
599
 
600
  def _load(obj, path, update_fn=None, key="model"):
601
  _ckpt_path = Path(path)
 
 
 
 
 
 
602
  if _ckpt_path.is_dir() and (_ckpt_path / "model.safetensors").exists():
603
  _ckpt_path = _ckpt_path / "model.safetensors"
604
  path = str(_ckpt_path)
@@ -635,7 +641,7 @@ def set_accelerator(self, accelerator, ckpt_path=None):
635
  gprint(f"Loaded state dict from {path}")
636
  # obj.load_state_dict(state_dict[key])
637
  else:
638
- state_dict = torch.load(path)
639
 
640
  if 'model' in state_dict and len(state_dict) < 10:
641
  state_dict = state_dict['model']
 
12
  from contextlib import nullcontext
13
 
14
  import transformers
15
+ from constants import HF_TOKEN, HF_CACHE_DIR, UNIDISC_DIR
16
  import hydra
17
  import hydra.utils
18
  import torch
 
599
 
600
  def _load(obj, path, update_fn=None, key="model"):
601
  _ckpt_path = Path(path)
602
+
603
+ if not _ckpt_path.is_absolute() and not _ckpt_path.exists():
604
+ potential_path = UNIDISC_DIR / _ckpt_path
605
+ rprint(f"Relative path '{_ckpt_path}' not found. Trying path relative to script directory: '{potential_path}'")
606
+ _ckpt_path = potential_path
607
+
608
  if _ckpt_path.is_dir() and (_ckpt_path / "model.safetensors").exists():
609
  _ckpt_path = _ckpt_path / "model.safetensors"
610
  path = str(_ckpt_path)
 
641
  gprint(f"Loaded state dict from {path}")
642
  # obj.load_state_dict(state_dict[key])
643
  else:
644
+ state_dict = torch.load(_ckpt_path)
645
 
646
  if 'model' in state_dict and len(state_dict) < 10:
647
  state_dict = state_dict['model']