vince62s committed 5d78781 (parent: 2be41b7)

Upload 5 files

mistral-finetune.yaml ADDED
@@ -0,0 +1,118 @@
+ # Corpus opts:
+ data:
+     alpaca:
+         path_src: "alpaca_clean.txt"
+         transforms: [sentencepiece, filtertoolong]
+         weight: 10
+     sharegpt:
+         path_src: "sharegpt.txt"
+         transforms: [sentencepiece, filtertoolong]
+         weight: 10
+     osst1:
+         path_src: "osst1.flattened.txt"
+         transforms: [sentencepiece, filtertoolong]
+         weight: 10
+     valid:
+         path_src: "dataAI/valid.txt"
+         transforms: [sentencepiece]
+
+ ### Transform related opts:
+ #### Subword
+ src_subword_model: "tokenizer.model"
+ tgt_subword_model: "tokenizer.model"
+
+ #### Filter
+ src_seq_length: 1792
+ tgt_seq_length: 1792
+
+ #truncated_decoder: 32
+
+ # silently ignore empty lines in the data
+ skip_empty_level: silent
+
+ # General opts
+ train_from: "mistral-onmt.pt"
+ save_model: "mistral-onmt-sft"
+ save_format: pytorch
+ keep_checkpoint: 10
+ save_checkpoint_steps: 100
+ seed: 1234
+ report_every: 10
+ train_steps: 1000
+ valid_steps: 100
+
+ # Batching
+ bucket_size: 32768
+ num_workers: 1
+ world_size: 2
+ gpu_ranks: [0,1]
+ parallel_mode: tensor_parallel
+ batch_type: "tokens"
+ batch_size: 1792
+ valid_batch_size: 512
+ batch_size_multiple: 1
+ accum_count: [8]
+ accum_steps: [0]
+
+ override_opts: true # CAREFUL: this requires all settings to be defined below
+
+ share_vocab: true
+ save_data: "mistral-7B"
+ src_vocab: "mistral.vocab"
+ src_vocab_size: 32000
+ tgt_vocab_size: 32000
+
+ decoder_start_token: '<s>'
+ # Optimization
+ model_dtype: "fp8"
+ apex_opt_level: ""
+ optim: "fusedadam"
+ learning_rate: 0.0001
+ warmup_steps: 100
+ decay_method: "none"
+ #learning_rate_decay: 0.98
+ #start_decay_steps: 100
+ #decay_steps: 10
+ adam_beta2: 0.998
+ max_grad_norm: 0
+ label_smoothing: 0.0
+ param_init: 0
+ param_init_glorot: true
+ normalization: "tokens"
+
+ #4/8bit
+ quant_layers: ['w_1', 'w_2', 'w_3', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
+ quant_type: "bnb_FP4"
+
+ # LoRA
+ lora_layers: ['linear_values', 'linear_query', 'linear_keys', 'final_linear']
+ lora_rank: 4
+ lora_dropout: 0.05
+ lora_alpha: 8
+ lora_embedding: false
+
+ # Checkpointing
+ #use_ckpting: ['ffn', 'lora']
+
+ # Model
+ model_task: lm
+ encoder_type: transformer_lm
+ decoder_type: transformer_lm
+ layer_norm: rms
+ norm_eps: 1e-5
+ pos_ffn_activation_fn: 'silu'
+ max_relative_positions: -1
+ position_encoding: false
+ add_qkvbias: False
+ add_ffnbias: False
+ parallel_residual: false
+ dec_layers: 32
+ heads: 32
+ num_kv: 8
+ sliding_window: 128
+ hidden_size: 4096
+ word_vec_size: 4096
+ transformer_ff: 14336
+ dropout_steps: [0]
+ dropout: [0.0]
+ attention_dropout: [0.0]
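mistral-finetune.yaml fine-tunes the converted Mistral-7B checkpoint (mistral-onmt.pt) with 4-bit bnb_FP4-quantized linear layers plus LoRA adapters, split across two GPUs in tensor-parallel mode. A minimal launch sketch, assuming OpenNMT-py is installed (providing the onmt_train entry point) and the checkpoint, vocab, tokenizer and corpora referenced in the YAML sit in the working directory:

# Minimal sketch: start the finetuning run from the config above.
# Assumes OpenNMT-py is installed and the referenced files are local.
import subprocess

subprocess.run(["onmt_train", "-config", "mistral-finetune.yaml"], check=True)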
mistral-inference.yaml ADDED
@@ -0,0 +1,30 @@
+ transforms: [sentencepiece]
+
+ #### Subword
+ src_subword_model: "tokenizer.model"
+ tgt_subword_model: "tokenizer.model"
+
+ # Model info
+ model: "mistral-onmt.pt"
+
+ # Inference
+ seed: 42
+ max_length: 256
+ gpu: 0
+ batch_type: sents
+ batch_size: 4
+ world_size: 1
+ gpu_ranks: [0]
+ #parallel_mode: "tensor_parallel"
+ #quant_layers: ['w_1', 'w_2', 'w_3', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
+ #quant_type: "bnb_NF4"
+ precision: fp16
+ #random_sampling_topk: 1
+ #random_sampling_topp: 0.6
+ #random_sampling_temp: 0.9
+ beam_size: 1
+ n_best: 1
+ profile: false
+ report_time: true
+ src: None
+
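mistral-inference.yaml runs single-GPU greedy decoding (beam_size: 1) in fp16 from the same checkpoint; since src is left as None in the YAML, the prompt file has to be supplied at launch time. A minimal sketch, assuming OpenNMT-py is installed and prompts.txt (a hypothetical file with one prompt per line) exists:

# Minimal sketch: generate with the inference config above.
# "prompts.txt" and "preds.txt" are hypothetical input/output paths.
import subprocess

subprocess.run(
    ["onmt_translate", "-config", "mistral-inference.yaml",
     "-src", "prompts.txt", "-output", "preds.txt"],
    check=True,
)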
mistral-onmt.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:24c8f66e973cff52e1827fea6d9a2d3f7fffc0709fe6bcfd400d34babefd30ec
+ size 14485167003
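mistral-onmt.pt is stored through Git LFS (about 14.5 GB), so the pointer above only resolves to the real file after a git lfs pull. A quick post-download sanity check, as a sketch; the exact key layout of an OpenNMT-py checkpoint is an assumption here:

# Minimal sketch: inspect the pulled checkpoint on CPU.
# The top-level dict keys (e.g. model / opt / vocab) are an assumption.
import torch

ckpt = torch.load("mistral-onmt.pt", map_location="cpu")
print(list(ckpt.keys()))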
mistral.vocab ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+ size 493443
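tokenizer.model is the SentencePiece model that both configs reference as src_subword_model / tgt_subword_model. A small sketch to check that it matches the 32000-entry vocabulary declared in the finetune config, assuming the sentencepiece Python package is installed:

# Minimal sketch: load and probe the SentencePiece tokenizer.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="tokenizer.model")
print(sp.get_piece_size())                     # expected: 32000, matching src_vocab_size
print(sp.encode("Hello world", out_type=str))  # subword pieces for a sample string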