howard-hou committed (verified)
Commit 7fabc1b · 1 Parent(s): 98ba582

Update modeling_rwkv.py

Files changed (1):
  1. modeling_rwkv.py  +23 -10
modeling_rwkv.py CHANGED
@@ -320,14 +320,16 @@ class RWKV(MyModule):
                 w['emb.weight'] = F.layer_norm(w['emb.weight'], (args.n_embd,), weight=w['blocks.0.ln0.weight'], bias=w['blocks.0.ln0.bias'])
             except:
                 w['emb.weight'] = F.layer_norm(w['emb.weight'].float(), (args.n_embd,), weight=w['blocks.0.ln0.weight'].float(), bias=w['blocks.0.ln0.bias'].float())
-            # del w['blocks.0.ln0.weight']
-            # del w['blocks.0.ln0.bias']
+            #del w['blocks.0.ln0.weight']
+            #del w['blocks.0.ln0.bias']

             print_need_newline = False

             REAL_TIME_FIRST = False
+            args.time_state = False
             for x in list(w.keys()):
                 if '.time_faaaa' in x: REAL_TIME_FIRST = True
+                if '.time_state' in x: args.time_state = True
             if REAL_TIME_FIRST:
                 w = {k.replace('.time_faaaa','.time_first') if '.time_faaaa' in k else k: v for k, v in w.items()}
             self.w = w
@@ -377,7 +379,7 @@ class RWKV(MyModule):
                     elif '.ln_x' in x: # need fp32 for group_norm
                         w[x] = w[x].float()
                     else:
-                        if (len(w[x].shape) == 2) and ('emb' not in x):
+                        if (len(w[x].shape) == 2) and ('emb' not in x) and ('_w1' not in x) and ('_w2' not in x):
                             if WTYPE != torch.uint8:
                                 w[x] = w[x].to(dtype=WTYPE)
                             else:
@@ -436,10 +438,12 @@ class RWKV(MyModule):
                         torch.cuda.empty_cache()

                 shape = [i for i in w[x].shape if i != 1]
-                if len(shape) > 1:
-                    shape = f" {str(shape[0]).rjust(5)} {str(shape[1]).rjust(5)}"
+                if len(shape) > 2:
+                    shape = f" {str(shape[0]).rjust(5)} {str(shape[1]).rjust(5)} {str(shape[2]).rjust(5)}"
+                elif len(shape) > 1:
+                    shape = f" {str(shape[0]).rjust(5)} {str(shape[1]).rjust(5)} "
                 else:
-                    shape = f" {str(shape[0]).rjust(5)} "
+                    shape = f" {str(shape[0]).rjust(5)} "
                 if layer_id == 0 or layer_id >= args.n_layer-1:
                     if print_need_newline:
                         prxxx('\n', end = '')
@@ -498,7 +502,7 @@ class RWKV(MyModule):
         if self.version == 6.0 and os.environ["RWKV_CUDA_ON"] == '1':
             HEAD_SIZE = args.n_att // args.n_head
             rwkv6 = load(name="rwkv6", sources=[f"{current_path}/cuda/rwkv6_op.cpp", f"{current_path}/cuda/rwkv6.cu"],
-                            verbose=True, extra_cuda_cflags=["-res-usage", "--use_fast_math", "-O3", "-Xptxas -O3", "--extra-device-vectorization", f"-D_N_={HEAD_SIZE}", f"-D_T_={4096}"])
+                            verbose=True, extra_cuda_cflags=["-res-usage", "--use_fast_math", "-O3", "-Xptxas -O3" if os.name != "nt" else "", "--extra-device-vectorization", f"-D_N_={HEAD_SIZE}", f"-D_T_={4096}"])

             class RWKV_6(torch.autograd.Function):
                 @staticmethod
@@ -1024,15 +1028,24 @@ class RWKV(MyModule):
                         dev = dd.device
                         atype = dd.atype
                         state[i*3+0] = torch.zeros(args.n_embd, dtype=atype, requires_grad=False, device=dev).contiguous()
-                        state[i*3+1] = torch.zeros((args.n_head, args.n_att//args.n_head, args.n_att//args.n_head), dtype=torch.float, requires_grad=False, device=dev).contiguous()
+                        if args.time_state:
+                            state[i*3+1] = w[f'blocks.{i}.att.time_state'].transpose(1,2).to(dtype=torch.float, device=dev).requires_grad_(False).contiguous()
+                        else:
+                            state[i*3+1] = torch.zeros((args.n_head, args.n_att//args.n_head, args.n_att//args.n_head), dtype=torch.float, requires_grad=False, device=dev).contiguous()
                         state[i*3+2] = torch.zeros(args.n_embd, dtype=atype, requires_grad=False, device=dev).contiguous()

-            if embs is None:
+            if embs is None and tokens is not None:
                 seq_mode = len(tokens) > 1
                 x = w['emb.weight'][tokens if seq_mode else tokens[0]]
-            else:
+            elif embs is not None and tokens is None:
                 x = embs
                 seq_mode = True
+            elif embs is not None and tokens is not None:
+                seq_mode = len(tokens) > 1
+                x = w['emb.weight'][tokens if seq_mode else tokens[0]]
+                x = torch.cat([x, embs], dim=0)
+            else:
+                raise ValueError('Either tokens or embs must be provided')

             for i in range(args.n_layer):
                 bbb = f'blocks.{i}.'
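For readers of the first hunk: the new `args.time_state` flag records whether the checkpoint ships `blocks.{i}.att.time_state` tensors, and the last hunk above then seeds each layer's WKV state slot from those tensors instead of zeros. A minimal sketch for checking whether a given checkpoint carries them, assuming nothing beyond PyTorch; the file name is a placeholder, not part of this commit:

import torch

# Placeholder checkpoint path; any RWKV-6 checkpoint with a tuned initial state would do.
sd = torch.load("model.pth", map_location="cpu")
time_state_keys = sorted(k for k in sd if ".time_state" in k)
print(f"{len(time_state_keys)} time_state tensors found")
for k in time_state_keys[:3]:
    # Expected to be (n_head, head_size, head_size), matching the WKV state slot it seeds.
    print(k, tuple(sd[k].shape))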
 
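The reworked input branch (new lines 1037-1048) accepts token ids, precomputed embeddings, or both; when both are given, the token embeddings are looked up first and `embs` is concatenated after them along the sequence dimension. A hedged usage sketch, assuming the upstream `(logits, state)` return convention, an `embs` keyword argument on `forward` (the signature itself is outside this hunk), and an already constructed `model`:

import torch

# model = RWKV(model="path/to/checkpoint", strategy="cuda fp16")  # construction as in the upstream rwkv package; path is a placeholder

tokens = [510, 4342, 338]                     # example token ids; more than one keeps seq_mode True
embs = torch.randn(16, model.args.n_embd)     # e.g. 16 projected image embeddings; n_embd via model.args follows the upstream rwkv package

out, state = model.forward(tokens, None, embs=embs)   # token prefix + embedding suffix
out, state = model.forward(tokens, None)              # tokens only
out, state = model.forward(None, None, embs=embs)     # embeddings only

Note that with a single token the embedding lookup returns a 1-D vector, so the concatenation path is effectively meant for multi-token prompts.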