Commit
·
725fd2e
1
Parent(s):
6d46028
reorganize files
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- cosmos1/models/autoregressive/configs/base/model.py → ar_config_base_model.py +1 -1
- cosmos1/models/autoregressive/configs/base/model_config.py → ar_config_base_model_config.py +6 -6
- cosmos1/models/autoregressive/configs/base/tokenizer.py → ar_config_base_tokenizer.py +4 -4
- cosmos1/models/autoregressive/configs/inference/inference_config.py → ar_config_inference_inference_config.py +1 -1
- cosmos1/models/autoregressive/diffusion_decoder/config/base/conditioner.py → ar_diffusion_decoder_config_base_conditioner.py +4 -4
- cosmos1/models/autoregressive/diffusion_decoder/config/config_latent_diffusion_decoder.py → ar_diffusion_decoder_config_config_latent_diffusion_decoder.py +5 -5
- cosmos1/models/autoregressive/diffusion_decoder/config/inference/cosmos_diffusiondecoder_7b.py → ar_diffusion_decoder_config_inference_cosmos_diffusiondecoder_7b.py +3 -3
- cosmos1/models/autoregressive/diffusion_decoder/config/registry.py → ar_diffusion_decoder_config_registry.py +4 -4
- cosmos1/models/autoregressive/diffusion_decoder/inference.py → ar_diffusion_decoder_inference.py +4 -4
- cosmos1/models/autoregressive/diffusion_decoder/model.py → ar_diffusion_decoder_model.py +5 -5
- cosmos1/models/autoregressive/diffusion_decoder/network.py → ar_diffusion_decoder_network.py +2 -2
- cosmos1/models/autoregressive/diffusion_decoder/utils.py → ar_diffusion_decoder_utils.py +0 -0
- cosmos1/models/autoregressive/model.py → ar_model.py +10 -9
- cosmos1/models/autoregressive/modules/attention.py → ar_module_attention.py +2 -2
- cosmos1/models/autoregressive/modules/embedding.py → ar_module_embedding.py +0 -0
- cosmos1/models/autoregressive/modules/mlp.py → ar_module_mlp.py +0 -0
- cosmos1/models/autoregressive/modules/mm_projector.py → ar_module_mm_projector.py +0 -0
- cosmos1/models/autoregressive/modules/normalization.py → ar_module_normalization.py +0 -0
- cosmos1/models/autoregressive/networks/transformer.py → ar_network_transformer.py +7 -7
- cosmos1/models/autoregressive/networks/vit.py → ar_network_vit.py +3 -3
- cosmos1/models/autoregressive/tokenizer/discrete_video.py → ar_tokenizer_discrete_video.py +1 -1
- cosmos1/models/autoregressive/tokenizer/image_text_tokenizer.py → ar_tokenizer_image_text_tokenizer.py +2 -2
- cosmos1/models/autoregressive/tokenizer/modules.py → ar_tokenizer_modules.py +3 -3
- cosmos1/models/autoregressive/tokenizer/networks.py → ar_tokenizer_networks.py +3 -3
- cosmos1/models/autoregressive/tokenizer/patching.py → ar_tokenizer_patching.py +0 -0
- cosmos1/models/autoregressive/tokenizer/quantizers.py → ar_tokenizer_quantizers.py +1 -1
- cosmos1/models/autoregressive/tokenizer/text_tokenizer.py → ar_tokenizer_text_tokenizer.py +1 -1
- cosmos1/models/autoregressive/tokenizer/tokenizer.py → ar_tokenizer_tokenizer.py +2 -2
- cosmos1/models/autoregressive/tokenizer/utils.py → ar_tokenizer_utils.py +0 -0
- cosmos1/models/autoregressive/utils/checkpoint.py → ar_utils_checkpoint.py +0 -0
- cosmos1/models/autoregressive/utils/inference.py → ar_utils_inference.py +2 -2
- cosmos1/models/autoregressive/utils/misc.py → ar_utils_misc.py +0 -0
- cosmos1/models/autoregressive/utils/sampling.py → ar_utils_sampling.py +1 -1
- cosmos1/models/autoregressive/inference/base.py → base.py +3 -3
- cosmos1/models/common/base_world_generation_pipeline.py → base_world_generation_pipeline.py +2 -2
- config.json +10 -0
- cosmos1/utils/config.py → config.py +2 -2
- cosmos1/utils/config_helper.py → config_helper.py +2 -2
- cosmos1/scripts/convert_pixtral_ckpt.py → convert_pixtral_ckpt.py +0 -0
- cosmos1/models/autoregressive/nemo/cosmos.py +1 -1
- cosmos1/models/autoregressive/nemo/inference/general.py +4 -4
- cosmos1/models/autoregressive/nemo/post_training/prepare_dataset.py +2 -2
- cosmos1/models/autoregressive/nemo/utils.py +6 -6
- cosmos1/models/diffusion/config/config.py +4 -4
- cosmos1/models/diffusion/config/inference/cosmos-1-diffusion-text2world.py +1 -1
- cosmos1/models/diffusion/config/inference/cosmos-1-diffusion-video2world.py +2 -2
- cosmos1/models/diffusion/inference/text2world.py +4 -3
- cosmos1/models/diffusion/inference/video2world.py +4 -3
- cosmos1/models/diffusion/inference/world_generation_pipeline.py +5 -5
- cosmos1/models/diffusion/nemo/inference/general.py +1 -1
cosmos1/models/autoregressive/configs/base/model.py → ar_config_base_model.py
RENAMED
|
@@ -17,7 +17,7 @@ from typing import Optional
|
|
| 17 |
|
| 18 |
import attrs
|
| 19 |
|
| 20 |
-
from
|
| 21 |
|
| 22 |
|
| 23 |
@attrs.define
|
|
|
|
| 17 |
|
| 18 |
import attrs
|
| 19 |
|
| 20 |
+
from AutoregressiveVideo2WorldGeneration.ar_config_base_tokenizer import TokenizerConfig
|
| 21 |
|
| 22 |
|
| 23 |
@attrs.define
|
cosmos1/models/autoregressive/configs/base/model_config.py → ar_config_base_model_config.py
RENAMED
|
@@ -16,17 +16,17 @@
|
|
| 16 |
import copy
|
| 17 |
from typing import Callable, List, Optional
|
| 18 |
|
| 19 |
-
from
|
| 20 |
-
from
|
| 21 |
TextTokenizerConfig,
|
| 22 |
TokenizerConfig,
|
| 23 |
VideoTokenizerConfig,
|
| 24 |
create_discrete_video_fsq_tokenizer_state_dict_config,
|
| 25 |
)
|
| 26 |
-
from
|
| 27 |
-
from
|
| 28 |
-
from
|
| 29 |
-
from
|
| 30 |
|
| 31 |
# Common architecture specifications
|
| 32 |
BASE_CONFIG = {"n_kv_heads": 8, "norm_type": "rmsnorm", "norm_eps": 1e-5, "ffn_hidden_size": 14336}
|
|
|
|
| 16 |
import copy
|
| 17 |
from typing import Callable, List, Optional
|
| 18 |
|
| 19 |
+
from AutoregressiveVideo2WorldGeneration.ar_config_base_model import ModelConfig
|
| 20 |
+
from AutoregressiveVideo2WorldGeneration.ar_config_base_tokenizer import (
|
| 21 |
TextTokenizerConfig,
|
| 22 |
TokenizerConfig,
|
| 23 |
VideoTokenizerConfig,
|
| 24 |
create_discrete_video_fsq_tokenizer_state_dict_config,
|
| 25 |
)
|
| 26 |
+
from AutoregressiveVideo2WorldGeneration.ar_tokenizer_image_text_tokenizer import ImageTextTokenizer
|
| 27 |
+
from AutoregressiveVideo2WorldGeneration.ar_tokenizer_text_tokenizer import TextTokenizer
|
| 28 |
+
from AutoregressiveVideo2WorldGeneration import log
|
| 29 |
+
from AutoregressiveVideo2WorldGeneration.lazy_config_init import LazyCall as L
|
| 30 |
|
| 31 |
# Common architecture specifications
|
| 32 |
BASE_CONFIG = {"n_kv_heads": 8, "norm_type": "rmsnorm", "norm_eps": 1e-5, "ffn_hidden_size": 14336}
|
cosmos1/models/autoregressive/configs/base/tokenizer.py → ar_config_base_tokenizer.py
RENAMED
|
@@ -17,10 +17,10 @@ from typing import Optional
|
|
| 17 |
|
| 18 |
import attrs
|
| 19 |
|
| 20 |
-
from
|
| 21 |
-
from
|
| 22 |
-
from
|
| 23 |
-
from
|
| 24 |
|
| 25 |
|
| 26 |
def create_discrete_video_fsq_tokenizer_state_dict_config(
|
|
|
|
| 17 |
|
| 18 |
import attrs
|
| 19 |
|
| 20 |
+
from AutoregressiveVideo2WorldGeneration.ar_tokenizer_discrete_video import DiscreteVideoFSQStateDictTokenizer
|
| 21 |
+
from AutoregressiveVideo2WorldGeneration.ar_tokenizer_networks import CausalDiscreteVideoTokenizer
|
| 22 |
+
from AutoregressiveVideo2WorldGeneration.lazy_config_init import LazyCall as L
|
| 23 |
+
from AutoregressiveVideo2WorldGeneration.lazy_config_init import LazyDict
|
| 24 |
|
| 25 |
|
| 26 |
def create_discrete_video_fsq_tokenizer_state_dict_config(
|
cosmos1/models/autoregressive/configs/inference/inference_config.py → ar_config_inference_inference_config.py
RENAMED
|
@@ -17,7 +17,7 @@ from typing import Any, List, Union
|
|
| 17 |
|
| 18 |
import attrs
|
| 19 |
|
| 20 |
-
from
|
| 21 |
|
| 22 |
|
| 23 |
@attrs.define(slots=False)
|
|
|
|
| 17 |
|
| 18 |
import attrs
|
| 19 |
|
| 20 |
+
from AutoregressiveVideo2WorldGeneration.ar_config_base_model import ModelConfig, TokenizerConfig
|
| 21 |
|
| 22 |
|
| 23 |
@attrs.define(slots=False)
|
cosmos1/models/autoregressive/diffusion_decoder/config/base/conditioner.py → ar_diffusion_decoder_config_base_conditioner.py
RENAMED
|
@@ -18,8 +18,8 @@ from typing import Dict, Optional
|
|
| 18 |
|
| 19 |
import torch
|
| 20 |
|
| 21 |
-
from
|
| 22 |
-
from
|
| 23 |
FPSConfig,
|
| 24 |
ImageSizeConfig,
|
| 25 |
LatentConditionConfig,
|
|
@@ -28,8 +28,8 @@ from cosmos1.models.diffusion.config.base.conditioner import (
|
|
| 28 |
PaddingMaskConfig,
|
| 29 |
TextConfig,
|
| 30 |
)
|
| 31 |
-
from
|
| 32 |
-
from
|
| 33 |
|
| 34 |
|
| 35 |
@dataclass
|
|
|
|
| 18 |
|
| 19 |
import torch
|
| 20 |
|
| 21 |
+
from AutoregressiveVideo2WorldGeneration.df_conditioner import BaseVideoCondition, GeneralConditioner
|
| 22 |
+
from AutoregressiveVideo2WorldGeneration.df_config_base_conditioner import (
|
| 23 |
FPSConfig,
|
| 24 |
ImageSizeConfig,
|
| 25 |
LatentConditionConfig,
|
|
|
|
| 28 |
PaddingMaskConfig,
|
| 29 |
TextConfig,
|
| 30 |
)
|
| 31 |
+
from AutoregressiveVideo2WorldGeneration.lazy_config_init import LazyCall as L
|
| 32 |
+
from AutoregressiveVideo2WorldGeneration.lazy_config_init import LazyDict
|
| 33 |
|
| 34 |
|
| 35 |
@dataclass
|
cosmos1/models/autoregressive/diffusion_decoder/config/config_latent_diffusion_decoder.py → ar_diffusion_decoder_config_config_latent_diffusion_decoder.py
RENAMED
|
@@ -17,11 +17,11 @@ from typing import Any, List
|
|
| 17 |
|
| 18 |
import attrs
|
| 19 |
|
| 20 |
-
from
|
| 21 |
-
from
|
| 22 |
-
from
|
| 23 |
-
from
|
| 24 |
-
from
|
| 25 |
|
| 26 |
|
| 27 |
@attrs.define(slots=False)
|
|
|
|
| 17 |
|
| 18 |
import attrs
|
| 19 |
|
| 20 |
+
from AutoregressiveVideo2WorldGeneration.ar_diffusion_decoder_config_registry import register_configs as register_dd_configs
|
| 21 |
+
from AutoregressiveVideo2WorldGeneration.df_config_base_model import LatentDiffusionDecoderModelConfig
|
| 22 |
+
from AutoregressiveVideo2WorldGeneration.df_config_registry import register_configs
|
| 23 |
+
from AutoregressiveVideo2WorldGeneration import config
|
| 24 |
+
from AutoregressiveVideo2WorldGeneration.config_helper import import_all_modules_from_package
|
| 25 |
|
| 26 |
|
| 27 |
@attrs.define(slots=False)
|
cosmos1/models/autoregressive/diffusion_decoder/config/inference/cosmos_diffusiondecoder_7b.py → ar_diffusion_decoder_config_inference_cosmos_diffusiondecoder_7b.py
RENAMED
|
@@ -15,9 +15,9 @@
|
|
| 15 |
|
| 16 |
from hydra.core.config_store import ConfigStore
|
| 17 |
|
| 18 |
-
from
|
| 19 |
-
from
|
| 20 |
-
from
|
| 21 |
|
| 22 |
num_frames = 57
|
| 23 |
Cosmos_DiffusionDecoder_7B_INFERENCE_ONLY: LazyDict = LazyDict(
|
|
|
|
| 15 |
|
| 16 |
from hydra.core.config_store import ConfigStore
|
| 17 |
|
| 18 |
+
from AutoregressiveVideo2WorldGeneration.ar_diffusion_decoder_network import DiffusionDecoderGeneralDIT
|
| 19 |
+
from AutoregressiveVideo2WorldGeneration.lazy_config_init import LazyCall as L
|
| 20 |
+
from AutoregressiveVideo2WorldGeneration.lazy_config_init import LazyDict
|
| 21 |
|
| 22 |
num_frames = 57
|
| 23 |
Cosmos_DiffusionDecoder_7B_INFERENCE_ONLY: LazyDict = LazyDict(
|
cosmos1/models/autoregressive/diffusion_decoder/config/registry.py → ar_diffusion_decoder_config_registry.py
RENAMED
|
@@ -15,12 +15,12 @@
|
|
| 15 |
|
| 16 |
from hydra.core.config_store import ConfigStore
|
| 17 |
|
| 18 |
-
from
|
| 19 |
VideoLatentDiffusionDecoderConditionerConfig,
|
| 20 |
)
|
| 21 |
-
from
|
| 22 |
-
from
|
| 23 |
-
from
|
| 24 |
|
| 25 |
|
| 26 |
def get_cosmos_video_discrete_tokenizer_comp8x16x16(
|
|
|
|
| 15 |
|
| 16 |
from hydra.core.config_store import ConfigStore
|
| 17 |
|
| 18 |
+
from AutoregressiveVideo2WorldGeneration.ar_diffusion_decoder_config_base_conditioner import (
|
| 19 |
VideoLatentDiffusionDecoderConditionerConfig,
|
| 20 |
)
|
| 21 |
+
from AutoregressiveVideo2WorldGeneration.ar_tokenizer_discrete_video import DiscreteVideoFSQJITTokenizer
|
| 22 |
+
from AutoregressiveVideo2WorldGeneration.df_module_pretrained_vae import JITVAE, JointImageVideoSharedJITTokenizer, VideoJITTokenizer
|
| 23 |
+
from AutoregressiveVideo2WorldGeneration.lazy_config_init import LazyCall as L
|
| 24 |
|
| 25 |
|
| 26 |
def get_cosmos_video_discrete_tokenizer_comp8x16x16(
|
cosmos1/models/autoregressive/diffusion_decoder/inference.py → ar_diffusion_decoder_inference.py
RENAMED
|
@@ -19,10 +19,10 @@ from typing import List
|
|
| 19 |
|
| 20 |
import torch
|
| 21 |
|
| 22 |
-
from
|
| 23 |
-
from
|
| 24 |
-
from
|
| 25 |
-
from
|
| 26 |
|
| 27 |
|
| 28 |
def diffusion_decoder_process_tokens(
|
|
|
|
| 19 |
|
| 20 |
import torch
|
| 21 |
|
| 22 |
+
from AutoregressiveVideo2WorldGeneration.ar_config_inference_inference_config import DiffusionDecoderSamplingConfig
|
| 23 |
+
from AutoregressiveVideo2WorldGeneration.ar_diffusion_decoder_model import LatentDiffusionDecoderModel
|
| 24 |
+
from AutoregressiveVideo2WorldGeneration.ar_diffusion_decoder_utils import linear_blend_video_list, split_with_overlap
|
| 25 |
+
from AutoregressiveVideo2WorldGeneration import log
|
| 26 |
|
| 27 |
|
| 28 |
def diffusion_decoder_process_tokens(
|
cosmos1/models/autoregressive/diffusion_decoder/model.py → ar_diffusion_decoder_model.py
RENAMED
|
@@ -19,11 +19,11 @@ from typing import Callable, Dict, Optional, Tuple
|
|
| 19 |
import torch
|
| 20 |
from torch import Tensor
|
| 21 |
|
| 22 |
-
from
|
| 23 |
-
from
|
| 24 |
-
from
|
| 25 |
-
from
|
| 26 |
-
from
|
| 27 |
|
| 28 |
|
| 29 |
@dataclass
|
|
|
|
| 19 |
import torch
|
| 20 |
from torch import Tensor
|
| 21 |
|
| 22 |
+
from AutoregressiveVideo2WorldGeneration.df_conditioner import BaseVideoCondition
|
| 23 |
+
from AutoregressiveVideo2WorldGeneration.df_df_functional_batch_ops import batch_mul
|
| 24 |
+
from AutoregressiveVideo2WorldGeneration.df_df_module_res_sampler import COMMON_SOLVER_OPTIONS
|
| 25 |
+
from AutoregressiveVideo2WorldGeneration.df_model_model_t2w import DiffusionT2WModel as VideoDiffusionModel
|
| 26 |
+
from AutoregressiveVideo2WorldGeneration.lazy_config_init import instantiate as lazy_instantiate
|
| 27 |
|
| 28 |
|
| 29 |
@dataclass
|
cosmos1/models/autoregressive/diffusion_decoder/network.py → ar_diffusion_decoder_network.py
RENAMED
|
@@ -20,8 +20,8 @@ from einops import rearrange
|
|
| 20 |
from torch import nn
|
| 21 |
from torchvision import transforms
|
| 22 |
|
| 23 |
-
from
|
| 24 |
-
from
|
| 25 |
|
| 26 |
|
| 27 |
class DiffusionDecoderGeneralDIT(GeneralDIT):
|
|
|
|
| 20 |
from torch import nn
|
| 21 |
from torchvision import transforms
|
| 22 |
|
| 23 |
+
from AutoregressiveVideo2WorldGeneration.df_module_blocks import PatchEmbed
|
| 24 |
+
from AutoregressiveVideo2WorldGeneration.df_network_general_dit import GeneralDIT
|
| 25 |
|
| 26 |
|
| 27 |
class DiffusionDecoderGeneralDIT(GeneralDIT):
|
cosmos1/models/autoregressive/diffusion_decoder/utils.py → ar_diffusion_decoder_utils.py
RENAMED
|
File without changes
|
cosmos1/models/autoregressive/model.py → ar_model.py
RENAMED
|
@@ -19,23 +19,24 @@ import time
|
|
| 19 |
from pathlib import Path
|
| 20 |
from typing import Any, Dict, List, Optional, Set
|
| 21 |
|
|
|
|
| 22 |
import torch
|
| 23 |
from safetensors.torch import load_file
|
| 24 |
from torch.nn.modules.module import _IncompatibleKeys
|
| 25 |
|
| 26 |
-
from
|
| 27 |
-
from
|
| 28 |
-
from
|
| 29 |
-
from
|
| 30 |
-
from
|
| 31 |
-
from
|
| 32 |
-
from
|
| 33 |
get_partial_state_dict,
|
| 34 |
process_state_dict,
|
| 35 |
substrings_to_ignore,
|
| 36 |
)
|
| 37 |
-
from
|
| 38 |
-
from
|
| 39 |
|
| 40 |
|
| 41 |
class AutoRegressiveModel(torch.nn.Module):
|
|
|
|
| 19 |
from pathlib import Path
|
| 20 |
from typing import Any, Dict, List, Optional, Set
|
| 21 |
|
| 22 |
+
from AutoregressiveVideo2WorldGeneration import misc
|
| 23 |
import torch
|
| 24 |
from safetensors.torch import load_file
|
| 25 |
from torch.nn.modules.module import _IncompatibleKeys
|
| 26 |
|
| 27 |
+
from AutoregressiveVideo2WorldGeneration.ar_config_base_model import ModelConfig
|
| 28 |
+
from AutoregressiveVideo2WorldGeneration.ar_config_base_tokenizer import TokenizerConfig
|
| 29 |
+
from AutoregressiveVideo2WorldGeneration.ar_module_mm_projector import MultimodalProjector
|
| 30 |
+
from AutoregressiveVideo2WorldGeneration.ar_network_transformer import Transformer
|
| 31 |
+
from AutoregressiveVideo2WorldGeneration.ar_network_vit import VisionTransformer, get_vit_config
|
| 32 |
+
from AutoregressiveVideo2WorldGeneration.ar_tokenizer_tokenizer import DiscreteMultimodalTokenizer, update_vocab_size
|
| 33 |
+
from AutoregressiveVideo2WorldGeneration.ar_utils_checkpoint import (
|
| 34 |
get_partial_state_dict,
|
| 35 |
process_state_dict,
|
| 36 |
substrings_to_ignore,
|
| 37 |
)
|
| 38 |
+
from AutoregressiveVideo2WorldGeneration.ar_utils_sampling import decode_n_tokens, decode_one_token, prefill
|
| 39 |
+
from AutoregressiveVideo2WorldGeneration import log
|
| 40 |
|
| 41 |
|
| 42 |
class AutoRegressiveModel(torch.nn.Module):
|
cosmos1/models/autoregressive/modules/attention.py → ar_module_attention.py
RENAMED
|
@@ -19,8 +19,8 @@ from typing import Optional, Union
|
|
| 19 |
import torch
|
| 20 |
from torch import nn
|
| 21 |
|
| 22 |
-
from
|
| 23 |
-
from
|
| 24 |
|
| 25 |
|
| 26 |
class Attention(nn.Module):
|
|
|
|
| 19 |
import torch
|
| 20 |
from torch import nn
|
| 21 |
|
| 22 |
+
from AutoregressiveVideo2WorldGeneration.ar_module_embedding import RotaryPositionEmbedding
|
| 23 |
+
from AutoregressiveVideo2WorldGeneration.ar_module_normalization import create_norm
|
| 24 |
|
| 25 |
|
| 26 |
class Attention(nn.Module):
|
cosmos1/models/autoregressive/modules/embedding.py → ar_module_embedding.py
RENAMED
|
File without changes
|
cosmos1/models/autoregressive/modules/mlp.py → ar_module_mlp.py
RENAMED
|
File without changes
|
cosmos1/models/autoregressive/modules/mm_projector.py → ar_module_mm_projector.py
RENAMED
|
File without changes
|
cosmos1/models/autoregressive/modules/normalization.py → ar_module_normalization.py
RENAMED
|
File without changes
|
cosmos1/models/autoregressive/networks/transformer.py → ar_network_transformer.py
RENAMED
|
@@ -19,17 +19,17 @@ import torch
|
|
| 19 |
import torch.nn as nn
|
| 20 |
from torch.nn.modules.module import _IncompatibleKeys
|
| 21 |
|
| 22 |
-
from
|
| 23 |
-
from
|
| 24 |
RotaryPositionEmbeddingPytorchV1,
|
| 25 |
RotaryPositionEmbeddingPytorchV2,
|
| 26 |
SinCosPosEmbAxisTE,
|
| 27 |
)
|
| 28 |
-
from
|
| 29 |
-
from
|
| 30 |
-
from
|
| 31 |
-
from
|
| 32 |
-
from
|
| 33 |
|
| 34 |
|
| 35 |
class TransformerBlock(nn.Module):
|
|
|
|
| 19 |
import torch.nn as nn
|
| 20 |
from torch.nn.modules.module import _IncompatibleKeys
|
| 21 |
|
| 22 |
+
from AutoregressiveVideo2WorldGeneration.ar_module_attention import Attention
|
| 23 |
+
from AutoregressiveVideo2WorldGeneration.ar_module_embedding import (
|
| 24 |
RotaryPositionEmbeddingPytorchV1,
|
| 25 |
RotaryPositionEmbeddingPytorchV2,
|
| 26 |
SinCosPosEmbAxisTE,
|
| 27 |
)
|
| 28 |
+
from AutoregressiveVideo2WorldGeneration.ar_module_mlp import MLP
|
| 29 |
+
from AutoregressiveVideo2WorldGeneration.ar_module_normalization import create_norm
|
| 30 |
+
from AutoregressiveVideo2WorldGeneration.ar_utils_checkpoint import process_state_dict, substrings_to_ignore
|
| 31 |
+
from AutoregressiveVideo2WorldGeneration.ar_utils_misc import maybe_convert_to_namespace
|
| 32 |
+
from AutoregressiveVideo2WorldGeneration import log
|
| 33 |
|
| 34 |
|
| 35 |
class TransformerBlock(nn.Module):
|
cosmos1/models/autoregressive/networks/vit.py → ar_network_vit.py
RENAMED
|
@@ -26,9 +26,9 @@ from typing import Any, Callable, Mapping, Optional, Tuple
|
|
| 26 |
import torch
|
| 27 |
import torch.nn as nn
|
| 28 |
|
| 29 |
-
from
|
| 30 |
-
from
|
| 31 |
-
from
|
| 32 |
|
| 33 |
|
| 34 |
def get_vit_config(model_name: str) -> Mapping[str, Any]:
|
|
|
|
| 26 |
import torch
|
| 27 |
import torch.nn as nn
|
| 28 |
|
| 29 |
+
from AutoregressiveVideo2WorldGeneration.ar_module_normalization import create_norm
|
| 30 |
+
from AutoregressiveVideo2WorldGeneration.ar_network_transformer import TransformerBlock
|
| 31 |
+
from AutoregressiveVideo2WorldGeneration import log
|
| 32 |
|
| 33 |
|
| 34 |
def get_vit_config(model_name: str) -> Mapping[str, Any]:
|
cosmos1/models/autoregressive/tokenizer/discrete_video.py → ar_tokenizer_discrete_video.py
RENAMED
|
@@ -18,7 +18,7 @@ from typing import Optional
|
|
| 18 |
import torch
|
| 19 |
from einops import rearrange
|
| 20 |
|
| 21 |
-
from
|
| 22 |
|
| 23 |
# Make sure jit model output consistenly during consecutive calls
|
| 24 |
# Check here: https://github.com/pytorch/pytorch/issues/74534
|
|
|
|
| 18 |
import torch
|
| 19 |
from einops import rearrange
|
| 20 |
|
| 21 |
+
from AutoregressiveVideo2WorldGeneration.ar_tokenizer_quantizers import FSQuantizer
|
| 22 |
|
| 23 |
# Make sure jit model output consistenly during consecutive calls
|
| 24 |
# Check here: https://github.com/pytorch/pytorch/issues/74534
|
cosmos1/models/autoregressive/tokenizer/image_text_tokenizer.py → ar_tokenizer_image_text_tokenizer.py
RENAMED
|
@@ -21,8 +21,8 @@ import transformers
|
|
| 21 |
from transformers import AutoImageProcessor
|
| 22 |
from transformers.image_utils import ImageInput, is_valid_image, load_image
|
| 23 |
|
| 24 |
-
from
|
| 25 |
-
from
|
| 26 |
|
| 27 |
# Configuration for different vision-language models
|
| 28 |
IMAGE_CONFIGS = {
|
|
|
|
| 21 |
from transformers import AutoImageProcessor
|
| 22 |
from transformers.image_utils import ImageInput, is_valid_image, load_image
|
| 23 |
|
| 24 |
+
from AutoregressiveVideo2WorldGeneration.ar_tokenizer_text_tokenizer import TextTokenizer
|
| 25 |
+
from AutoregressiveVideo2WorldGeneration import log
|
| 26 |
|
| 27 |
# Configuration for different vision-language models
|
| 28 |
IMAGE_CONFIGS = {
|
cosmos1/models/autoregressive/tokenizer/modules.py → ar_tokenizer_modules.py
RENAMED
|
@@ -29,8 +29,8 @@ import torch
|
|
| 29 |
import torch.nn as nn
|
| 30 |
import torch.nn.functional as F
|
| 31 |
|
| 32 |
-
from
|
| 33 |
-
from
|
| 34 |
CausalNormalize,
|
| 35 |
batch2space,
|
| 36 |
batch2time,
|
|
@@ -41,7 +41,7 @@ from cosmos1.models.autoregressive.tokenizer.utils import (
|
|
| 41 |
space2batch,
|
| 42 |
time2batch,
|
| 43 |
)
|
| 44 |
-
from
|
| 45 |
|
| 46 |
|
| 47 |
class CausalConv3d(nn.Module):
|
|
|
|
| 29 |
import torch.nn as nn
|
| 30 |
import torch.nn.functional as F
|
| 31 |
|
| 32 |
+
from AutoregressiveVideo2WorldGeneration.ar_tokenizer_patching import Patcher3D, UnPatcher3D
|
| 33 |
+
from AutoregressiveVideo2WorldGeneration.ar_tokenizer_utils import (
|
| 34 |
CausalNormalize,
|
| 35 |
batch2space,
|
| 36 |
batch2time,
|
|
|
|
| 41 |
space2batch,
|
| 42 |
time2batch,
|
| 43 |
)
|
| 44 |
+
from AutoregressiveVideo2WorldGeneration import log
|
| 45 |
|
| 46 |
|
| 47 |
class CausalConv3d(nn.Module):
|
cosmos1/models/autoregressive/tokenizer/networks.py → ar_tokenizer_networks.py
RENAMED
|
@@ -18,9 +18,9 @@ from collections import namedtuple
|
|
| 18 |
import torch
|
| 19 |
from torch import nn
|
| 20 |
|
| 21 |
-
from
|
| 22 |
-
from
|
| 23 |
-
from
|
| 24 |
|
| 25 |
NetworkEval = namedtuple("NetworkEval", ["reconstructions", "quant_loss", "quant_info"])
|
| 26 |
|
|
|
|
| 18 |
import torch
|
| 19 |
from torch import nn
|
| 20 |
|
| 21 |
+
from AutoregressiveVideo2WorldGeneration.ar_tokenizer_modules import CausalConv3d, DecoderFactorized, EncoderFactorized
|
| 22 |
+
from AutoregressiveVideo2WorldGeneration.ar_tokenizer_quantizers import FSQuantizer
|
| 23 |
+
from AutoregressiveVideo2WorldGeneration import log
|
| 24 |
|
| 25 |
NetworkEval = namedtuple("NetworkEval", ["reconstructions", "quant_loss", "quant_info"])
|
| 26 |
|
cosmos1/models/autoregressive/tokenizer/patching.py → ar_tokenizer_patching.py
RENAMED
|
File without changes
|
cosmos1/models/autoregressive/tokenizer/quantizers.py → ar_tokenizer_quantizers.py
RENAMED
|
@@ -21,7 +21,7 @@ import torch
|
|
| 21 |
import torch.nn as nn
|
| 22 |
from einops import rearrange
|
| 23 |
|
| 24 |
-
from
|
| 25 |
|
| 26 |
|
| 27 |
class FSQuantizer(nn.Module):
|
|
|
|
| 21 |
import torch.nn as nn
|
| 22 |
from einops import rearrange
|
| 23 |
|
| 24 |
+
from AutoregressiveVideo2WorldGeneration.ar_tokenizer_utils import default, pack_one, round_ste, unpack_one
|
| 25 |
|
| 26 |
|
| 27 |
class FSQuantizer(nn.Module):
|
cosmos1/models/autoregressive/tokenizer/text_tokenizer.py → ar_tokenizer_text_tokenizer.py
RENAMED
|
@@ -19,7 +19,7 @@ import numpy as np
|
|
| 19 |
import torch
|
| 20 |
from transformers import AutoTokenizer
|
| 21 |
|
| 22 |
-
from
|
| 23 |
|
| 24 |
|
| 25 |
def get_tokenizer_path(model_family: str, is_instruct_model: bool = False):
|
|
|
|
| 19 |
import torch
|
| 20 |
from transformers import AutoTokenizer
|
| 21 |
|
| 22 |
+
from AutoregressiveVideo2WorldGeneration import log
|
| 23 |
|
| 24 |
|
| 25 |
def get_tokenizer_path(model_family: str, is_instruct_model: bool = False):
|
cosmos1/models/autoregressive/tokenizer/tokenizer.py → ar_tokenizer_tokenizer.py
RENAMED
|
@@ -19,8 +19,8 @@ from typing import Optional
|
|
| 19 |
import torch
|
| 20 |
from einops import rearrange
|
| 21 |
|
| 22 |
-
from
|
| 23 |
-
from
|
| 24 |
|
| 25 |
|
| 26 |
def update_vocab_size(
|
|
|
|
| 19 |
import torch
|
| 20 |
from einops import rearrange
|
| 21 |
|
| 22 |
+
from AutoregressiveVideo2WorldGeneration.ar_config_base_tokenizer import TokenizerConfig
|
| 23 |
+
from AutoregressiveVideo2WorldGeneration.lazy_config_init import instantiate as lazy_instantiate
|
| 24 |
|
| 25 |
|
| 26 |
def update_vocab_size(
|
cosmos1/models/autoregressive/tokenizer/utils.py → ar_tokenizer_utils.py
RENAMED
|
File without changes
|
cosmos1/models/autoregressive/utils/checkpoint.py → ar_utils_checkpoint.py
RENAMED
|
File without changes
|
cosmos1/models/autoregressive/utils/inference.py → ar_utils_inference.py
RENAMED
|
@@ -25,8 +25,8 @@ import torch
|
|
| 25 |
import torchvision
|
| 26 |
from PIL import Image
|
| 27 |
|
| 28 |
-
from
|
| 29 |
-
from
|
| 30 |
|
| 31 |
_IMAGE_EXTENSIONS = [".png", ".jpg", ".jpeg", "webp"]
|
| 32 |
_VIDEO_EXTENSIONS = [".mp4"]
|
|
|
|
| 25 |
import torchvision
|
| 26 |
from PIL import Image
|
| 27 |
|
| 28 |
+
from AutoregressiveVideo2WorldGeneration.ar_config_inference_inference_config import SamplingConfig
|
| 29 |
+
from AutoregressiveVideo2WorldGeneration import log
|
| 30 |
|
| 31 |
_IMAGE_EXTENSIONS = [".png", ".jpg", ".jpeg", "webp"]
|
| 32 |
_VIDEO_EXTENSIONS = [".mp4"]
|
cosmos1/models/autoregressive/utils/misc.py → ar_utils_misc.py
RENAMED
|
File without changes
|
cosmos1/models/autoregressive/utils/sampling.py → ar_utils_sampling.py
RENAMED
|
@@ -17,7 +17,7 @@ from typing import Optional, Tuple
|
|
| 17 |
|
| 18 |
import torch
|
| 19 |
|
| 20 |
-
from
|
| 21 |
|
| 22 |
|
| 23 |
def sample_top_p(logits, temperature, top_p, return_probs: bool = False):
|
|
|
|
| 17 |
|
| 18 |
import torch
|
| 19 |
|
| 20 |
+
from AutoregressiveVideo2WorldGeneration.ar_network_transformer import Transformer
|
| 21 |
|
| 22 |
|
| 23 |
def sample_top_p(logits, temperature, top_p, return_probs: bool = False):
|
cosmos1/models/autoregressive/inference/base.py → base.py
RENAMED
|
@@ -19,9 +19,9 @@ import os
|
|
| 19 |
import imageio
|
| 20 |
import torch
|
| 21 |
|
| 22 |
-
from
|
| 23 |
-
from
|
| 24 |
-
from
|
| 25 |
|
| 26 |
|
| 27 |
def parse_args():
|
|
|
|
| 19 |
import imageio
|
| 20 |
import torch
|
| 21 |
|
| 22 |
+
from AutoregressiveVideo2WorldGeneration.world_generation_pipeline import ARBaseGenerationPipeline
|
| 23 |
+
from AutoregressiveVideo2WorldGeneration.ar_utils_inference import add_common_arguments, load_vision_input, validate_args
|
| 24 |
+
from AutoregressiveVideo2WorldGeneration import log
|
| 25 |
|
| 26 |
|
| 27 |
def parse_args():
|
cosmos1/models/common/base_world_generation_pipeline.py → base_world_generation_pipeline.py
RENAMED
|
@@ -21,8 +21,8 @@ from typing import Any
|
|
| 21 |
import numpy as np
|
| 22 |
import torch
|
| 23 |
|
| 24 |
-
from
|
| 25 |
-
from
|
| 26 |
|
| 27 |
|
| 28 |
class BaseWorldGenerationPipeline(ABC):
|
|
|
|
| 21 |
import numpy as np
|
| 22 |
import torch
|
| 23 |
|
| 24 |
+
from AutoregressiveVideo2WorldGeneration.t5_text_encoder import CosmosT5TextEncoder
|
| 25 |
+
from AutoregressiveVideo2WorldGeneration import guardrail_common_presets as guardrail_presets
|
| 26 |
|
| 27 |
|
| 28 |
class BaseWorldGenerationPipeline(ABC):
|
config.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"ARVideo2World"
|
| 4 |
+
],
|
| 5 |
+
"auto_map": {
|
| 6 |
+
"AutoConfig": "video2world_hf.ARVideo2WorldConfig",
|
| 7 |
+
"AutoModel": "video2world_hf.ARVideo2World"
|
| 8 |
+
},
|
| 9 |
+
"model_type": "AutoModel"
|
| 10 |
+
}
|
cosmos1/utils/config.py → config.py
RENAMED
|
@@ -19,8 +19,8 @@ from typing import Any, TypeVar
|
|
| 19 |
|
| 20 |
import attrs
|
| 21 |
|
| 22 |
-
from
|
| 23 |
-
from
|
| 24 |
|
| 25 |
T = TypeVar("T")
|
| 26 |
|
|
|
|
| 19 |
|
| 20 |
import attrs
|
| 21 |
|
| 22 |
+
from AutoregressiveVideo2WorldGeneration.lazy_config_init import LazyDict
|
| 23 |
+
from AutoregressiveVideo2WorldGeneration.misc import Color
|
| 24 |
|
| 25 |
T = TypeVar("T")
|
| 26 |
|
cosmos1/utils/config_helper.py → config_helper.py
RENAMED
|
@@ -27,8 +27,8 @@ from hydra import compose, initialize
|
|
| 27 |
from hydra.core.config_store import ConfigStore
|
| 28 |
from omegaconf import DictConfig, OmegaConf
|
| 29 |
|
| 30 |
-
from
|
| 31 |
-
from
|
| 32 |
|
| 33 |
|
| 34 |
def is_attrs_or_dataclass(obj) -> bool:
|
|
|
|
| 27 |
from hydra.core.config_store import ConfigStore
|
| 28 |
from omegaconf import DictConfig, OmegaConf
|
| 29 |
|
| 30 |
+
from AutoregressiveVideo2WorldGeneration import log
|
| 31 |
+
from AutoregressiveVideo2WorldGeneration.config import Config
|
| 32 |
|
| 33 |
|
| 34 |
def is_attrs_or_dataclass(obj) -> bool:
|
cosmos1/scripts/convert_pixtral_ckpt.py → convert_pixtral_ckpt.py
RENAMED
|
File without changes
|
cosmos1/models/autoregressive/nemo/cosmos.py
CHANGED
|
@@ -29,7 +29,7 @@ from nemo.lightning import OptimizerModule, io
|
|
| 29 |
from nemo.lightning.base import teardown
|
| 30 |
from torch import Tensor, nn
|
| 31 |
|
| 32 |
-
from
|
| 33 |
|
| 34 |
|
| 35 |
class RotaryEmbedding3D(RotaryEmbedding):
|
|
|
|
| 29 |
from nemo.lightning.base import teardown
|
| 30 |
from torch import Tensor, nn
|
| 31 |
|
| 32 |
+
from AutoregressiveVideo2WorldGeneration import log
|
| 33 |
|
| 34 |
|
| 35 |
class RotaryEmbedding3D(RotaryEmbedding):
|
cosmos1/models/autoregressive/nemo/inference/general.py
CHANGED
|
@@ -34,10 +34,10 @@ from nemo.lightning import io
|
|
| 34 |
from nemo.lightning.ckpt_utils import ckpt_to_context_subdir
|
| 35 |
|
| 36 |
from cosmos1.models.autoregressive.nemo.utils import run_diffusion_decoder_model
|
| 37 |
-
from
|
| 38 |
-
from
|
| 39 |
-
from
|
| 40 |
-
from
|
| 41 |
|
| 42 |
torch._C._jit_set_texpr_fuser_enabled(False)
|
| 43 |
|
|
|
|
| 34 |
from nemo.lightning.ckpt_utils import ckpt_to_context_subdir
|
| 35 |
|
| 36 |
from cosmos1.models.autoregressive.nemo.utils import run_diffusion_decoder_model
|
| 37 |
+
from AutoregressiveVideo2WorldGeneration.ar_tokenizer_discrete_video import DiscreteVideoFSQJITTokenizer
|
| 38 |
+
from AutoregressiveVideo2WorldGeneration.ar_utils_inference import load_vision_input
|
| 39 |
+
from AutoregressiveVideo2WorldGeneration import guardrail_common_presets as guardrail_presets
|
| 40 |
+
from AutoregressiveVideo2WorldGeneration import log
|
| 41 |
|
| 42 |
torch._C._jit_set_texpr_fuser_enabled(False)
|
| 43 |
|
cosmos1/models/autoregressive/nemo/post_training/prepare_dataset.py
CHANGED
|
@@ -23,8 +23,8 @@ from huggingface_hub import snapshot_download
|
|
| 23 |
from nemo.collections.nlp.data.language_modeling.megatron import indexed_dataset
|
| 24 |
|
| 25 |
from cosmos1.models.autoregressive.nemo.utils import read_input_videos
|
| 26 |
-
from
|
| 27 |
-
from
|
| 28 |
|
| 29 |
TOKENIZER_COMPRESSION_FACTOR = [8, 16, 16]
|
| 30 |
DATA_RESOLUTION_SUPPORTED = [640, 1024]
|
|
|
|
| 23 |
from nemo.collections.nlp.data.language_modeling.megatron import indexed_dataset
|
| 24 |
|
| 25 |
from cosmos1.models.autoregressive.nemo.utils import read_input_videos
|
| 26 |
+
from AutoregressiveVideo2WorldGeneration.ar_tokenizer_discrete_video import DiscreteVideoFSQJITTokenizer
|
| 27 |
+
from AutoregressiveVideo2WorldGeneration import log
|
| 28 |
|
| 29 |
TOKENIZER_COMPRESSION_FACTOR = [8, 16, 16]
|
| 30 |
DATA_RESOLUTION_SUPPORTED = [640, 1024]
|
cosmos1/models/autoregressive/nemo/utils.py
CHANGED
|
@@ -23,16 +23,16 @@ import torch
|
|
| 23 |
import torchvision
|
| 24 |
from huggingface_hub import snapshot_download
|
| 25 |
|
| 26 |
-
from
|
| 27 |
-
from
|
| 28 |
-
from
|
| 29 |
-
from
|
| 30 |
load_network_model,
|
| 31 |
load_tokenizer_model,
|
| 32 |
skip_init_linear,
|
| 33 |
)
|
| 34 |
-
from
|
| 35 |
-
from
|
| 36 |
|
| 37 |
TOKENIZER_COMPRESSION_FACTOR = [8, 16, 16]
|
| 38 |
DATA_RESOLUTION_SUPPORTED = [640, 1024]
|
|
|
|
| 23 |
import torchvision
|
| 24 |
from huggingface_hub import snapshot_download
|
| 25 |
|
| 26 |
+
from AutoregressiveVideo2WorldGeneration.ar_config_inference_inference_config import DiffusionDecoderSamplingConfig
|
| 27 |
+
from AutoregressiveVideo2WorldGeneration.ar_diffusion_decoder_inference import diffusion_decoder_process_tokens
|
| 28 |
+
from AutoregressiveVideo2WorldGeneration.ar_diffusion_decoder_model import LatentDiffusionDecoderModel
|
| 29 |
+
from AutoregressiveVideo2WorldGeneration.df_inference_inference_utils import (
|
| 30 |
load_network_model,
|
| 31 |
load_tokenizer_model,
|
| 32 |
skip_init_linear,
|
| 33 |
)
|
| 34 |
+
from AutoregressiveVideo2WorldGeneration import log
|
| 35 |
+
from AutoregressiveVideo2WorldGeneration.config_helper import get_config_module, override
|
| 36 |
|
| 37 |
TOKENIZER_COMPRESSION_FACTOR = [8, 16, 16]
|
| 38 |
DATA_RESOLUTION_SUPPORTED = [640, 1024]
|
cosmos1/models/diffusion/config/config.py
CHANGED
|
@@ -17,10 +17,10 @@ from typing import Any, List
|
|
| 17 |
|
| 18 |
import attrs
|
| 19 |
|
| 20 |
-
from
|
| 21 |
-
from
|
| 22 |
-
from
|
| 23 |
-
from
|
| 24 |
|
| 25 |
|
| 26 |
@attrs.define(slots=False)
|
|
|
|
| 17 |
|
| 18 |
import attrs
|
| 19 |
|
| 20 |
+
from AutoregressiveVideo2WorldGeneration.df_config_base_model import DefaultModelConfig
|
| 21 |
+
from AutoregressiveVideo2WorldGeneration.df_config_registry import register_configs
|
| 22 |
+
from AutoregressiveVideo2WorldGeneration import config
|
| 23 |
+
from AutoregressiveVideo2WorldGeneration.config_helper import import_all_modules_from_package
|
| 24 |
|
| 25 |
|
| 26 |
@attrs.define(slots=False)
|
cosmos1/models/diffusion/config/inference/cosmos-1-diffusion-text2world.py
CHANGED
|
@@ -15,7 +15,7 @@
|
|
| 15 |
|
| 16 |
from hydra.core.config_store import ConfigStore
|
| 17 |
|
| 18 |
-
from
|
| 19 |
|
| 20 |
Cosmos_1_0_Diffusion_Text2World_7B: LazyDict = LazyDict(
|
| 21 |
dict(
|
|
|
|
| 15 |
|
| 16 |
from hydra.core.config_store import ConfigStore
|
| 17 |
|
| 18 |
+
from AutoregressiveVideo2WorldGeneration.lazy_config_init import LazyDict
|
| 19 |
|
| 20 |
Cosmos_1_0_Diffusion_Text2World_7B: LazyDict = LazyDict(
|
| 21 |
dict(
|
cosmos1/models/diffusion/config/inference/cosmos-1-diffusion-video2world.py
CHANGED
|
@@ -16,8 +16,8 @@
|
|
| 16 |
from hydra.core.config_store import ConfigStore
|
| 17 |
|
| 18 |
from cosmos1.models.diffusion.networks.general_dit_video_conditioned import VideoExtendGeneralDIT
|
| 19 |
-
from
|
| 20 |
-
from
|
| 21 |
|
| 22 |
Cosmos_1_0_Diffusion_Video2World_7B: LazyDict = LazyDict(
|
| 23 |
dict(
|
|
|
|
| 16 |
from hydra.core.config_store import ConfigStore
|
| 17 |
|
| 18 |
from cosmos1.models.diffusion.networks.general_dit_video_conditioned import VideoExtendGeneralDIT
|
| 19 |
+
from AutoregressiveVideo2WorldGeneration.lazy_config_init import LazyCall as L
|
| 20 |
+
from AutoregressiveVideo2WorldGeneration.lazy_config_init import LazyDict
|
| 21 |
|
| 22 |
Cosmos_1_0_Diffusion_Video2World_7B: LazyDict = LazyDict(
|
| 23 |
dict(
|
cosmos1/models/diffusion/inference/text2world.py
CHANGED
|
@@ -16,12 +16,13 @@
|
|
| 16 |
import argparse
|
| 17 |
import os
|
| 18 |
|
|
|
|
| 19 |
import torch
|
| 20 |
|
| 21 |
-
from
|
| 22 |
from cosmos1.models.diffusion.inference.world_generation_pipeline import DiffusionText2WorldGenerationPipeline
|
| 23 |
-
from
|
| 24 |
-
from
|
| 25 |
|
| 26 |
torch.enable_grad(False)
|
| 27 |
|
|
|
|
| 16 |
import argparse
|
| 17 |
import os
|
| 18 |
|
| 19 |
+
from AutoregressiveVideo2WorldGeneration import misc
|
| 20 |
import torch
|
| 21 |
|
| 22 |
+
from AutoregressiveVideo2WorldGeneration.df_inference_inference_utils import add_common_arguments, validate_args
|
| 23 |
from cosmos1.models.diffusion.inference.world_generation_pipeline import DiffusionText2WorldGenerationPipeline
|
| 24 |
+
from AutoregressiveVideo2WorldGeneration import log
|
| 25 |
+
from AutoregressiveVideo2WorldGeneration.io import read_prompts_from_file, save_video
|
| 26 |
|
| 27 |
torch.enable_grad(False)
|
| 28 |
|
cosmos1/models/diffusion/inference/video2world.py
CHANGED
|
@@ -16,12 +16,13 @@
|
|
| 16 |
import argparse
|
| 17 |
import os
|
| 18 |
|
|
|
|
| 19 |
import torch
|
| 20 |
|
| 21 |
-
from
|
| 22 |
from cosmos1.models.diffusion.inference.world_generation_pipeline import DiffusionVideo2WorldGenerationPipeline
|
| 23 |
-
from
|
| 24 |
-
from
|
| 25 |
|
| 26 |
torch.enable_grad(False)
|
| 27 |
|
|
|
|
| 16 |
import argparse
|
| 17 |
import os
|
| 18 |
|
| 19 |
+
from AutoregressiveVideo2WorldGeneration import misc
|
| 20 |
import torch
|
| 21 |
|
| 22 |
+
from AutoregressiveVideo2WorldGeneration.df_inference_inference_utils import add_common_arguments, check_input_frames, validate_args
|
| 23 |
from cosmos1.models.diffusion.inference.world_generation_pipeline import DiffusionVideo2WorldGenerationPipeline
|
| 24 |
+
from AutoregressiveVideo2WorldGeneration import log
|
| 25 |
+
from AutoregressiveVideo2WorldGeneration.io import read_prompts_from_file, save_video
|
| 26 |
|
| 27 |
torch.enable_grad(False)
|
| 28 |
|
cosmos1/models/diffusion/inference/world_generation_pipeline.py
CHANGED
|
@@ -20,8 +20,8 @@ from typing import Any, Optional
|
|
| 20 |
import numpy as np
|
| 21 |
import torch
|
| 22 |
|
| 23 |
-
from
|
| 24 |
-
from
|
| 25 |
generate_world_from_text,
|
| 26 |
generate_world_from_video,
|
| 27 |
get_condition_latent,
|
|
@@ -30,8 +30,8 @@ from cosmos1.models.diffusion.inference.inference_utils import (
|
|
| 30 |
load_network_model,
|
| 31 |
load_tokenizer_model,
|
| 32 |
)
|
| 33 |
-
from
|
| 34 |
-
from
|
| 35 |
from cosmos1.models.diffusion.prompt_upsampler.text2world_prompt_upsampler_inference import (
|
| 36 |
create_prompt_upsampler,
|
| 37 |
run_chat_completion,
|
|
@@ -43,7 +43,7 @@ from cosmos1.models.diffusion.prompt_upsampler.video2world_prompt_upsampler_infe
|
|
| 43 |
from cosmos1.models.diffusion.prompt_upsampler.video2world_prompt_upsampler_inference import (
|
| 44 |
run_chat_completion as run_chat_completion_vlm,
|
| 45 |
)
|
| 46 |
-
from
|
| 47 |
|
| 48 |
MODEL_NAME_DICT = {
|
| 49 |
"Cosmos-1.0-Diffusion-7B-Text2World": "Cosmos_1_0_Diffusion_Text2World_7B",
|
|
|
|
| 20 |
import numpy as np
|
| 21 |
import torch
|
| 22 |
|
| 23 |
+
from AutoregressiveVideo2WorldGeneration.base_world_generation_pipeline import BaseWorldGenerationPipeline
|
| 24 |
+
from AutoregressiveVideo2WorldGeneration.df_inference_inference_utils import (
|
| 25 |
generate_world_from_text,
|
| 26 |
generate_world_from_video,
|
| 27 |
get_condition_latent,
|
|
|
|
| 30 |
load_network_model,
|
| 31 |
load_tokenizer_model,
|
| 32 |
)
|
| 33 |
+
from AutoregressiveVideo2WorldGeneration.df_model_model_t2w import DiffusionT2WModel
|
| 34 |
+
from AutoregressiveVideo2WorldGeneration.df_model_model_v2w import DiffusionV2WModel
|
| 35 |
from cosmos1.models.diffusion.prompt_upsampler.text2world_prompt_upsampler_inference import (
|
| 36 |
create_prompt_upsampler,
|
| 37 |
run_chat_completion,
|
|
|
|
| 43 |
from cosmos1.models.diffusion.prompt_upsampler.video2world_prompt_upsampler_inference import (
|
| 44 |
run_chat_completion as run_chat_completion_vlm,
|
| 45 |
)
|
| 46 |
+
from AutoregressiveVideo2WorldGeneration import log
|
| 47 |
|
| 48 |
MODEL_NAME_DICT = {
|
| 49 |
"Cosmos-1.0-Diffusion-7B-Text2World": "Cosmos_1_0_Diffusion_Text2World_7B",
|
cosmos1/models/diffusion/nemo/inference/general.py
CHANGED
|
@@ -37,7 +37,7 @@ from nemo.collections.diffusion.sampler.cosmos.cosmos_diffusion_pipeline import
|
|
| 37 |
from transformers import T5EncoderModel, T5TokenizerFast
|
| 38 |
|
| 39 |
from cosmos1.models.diffusion.nemo.inference.inference_utils import process_prompt, save_video
|
| 40 |
-
from
|
| 41 |
|
| 42 |
EXAMPLE_PROMPT = (
|
| 43 |
"The teal robot is cooking food in a kitchen. Steam rises from a simmering pot "
|
|
|
|
| 37 |
from transformers import T5EncoderModel, T5TokenizerFast
|
| 38 |
|
| 39 |
from cosmos1.models.diffusion.nemo.inference.inference_utils import process_prompt, save_video
|
| 40 |
+
from AutoregressiveVideo2WorldGeneration import log
|
| 41 |
|
| 42 |
EXAMPLE_PROMPT = (
|
| 43 |
"The teal robot is cooking food in a kitchen. Steam rises from a simmering pot "
|