Upload 5 files
- mistral-finetune.yaml +118 -0
- mistral-inference.yaml +30 -0
- mistral-onmt.pt +3 -0
- mistral.vocab +0 -0
- tokenizer.model +3 -0
mistral-finetune.yaml
ADDED
@@ -0,0 +1,118 @@
# Corpus opts:
data:
    alpaca:
        path_src: "alpaca_clean.txt"
        transforms: [sentencepiece, filtertoolong]
        weight: 10
    sharegpt:
        path_src: "sharegpt.txt"
        transforms: [sentencepiece, filtertoolong]
        weight: 10
    osst1:
        path_src: "osst1.flattened.txt"
        transforms: [sentencepiece, filtertoolong]
        weight: 10
    valid:
        path_src: "dataAI/valid.txt"
        transforms: [sentencepiece]

### Transform related opts:
#### Subword
src_subword_model: "tokenizer.model"
tgt_subword_model: "tokenizer.model"

#### Filter
src_seq_length: 1792
tgt_seq_length: 1792

#truncated_decoder: 32

# silently ignore empty lines in the data
skip_empty_level: silent

# General opts
train_from: "mistral-onmt.pt"
save_model: "mistral-onmt-sft"
save_format: pytorch
keep_checkpoint: 10
save_checkpoint_steps: 100
seed: 1234
report_every: 10
train_steps: 1000
valid_steps: 100

# Batching
bucket_size: 32768
num_workers: 1
world_size: 2
gpu_ranks: [0, 1]
parallel_mode: tensor_parallel
batch_type: "tokens"
batch_size: 1792
valid_batch_size: 512
batch_size_multiple: 1
accum_count: [8]
accum_steps: [0]

override_opts: true  # CAREFUL: this requires all settings to be defined below

share_vocab: true
save_data: "mistral-7B"
src_vocab: "mistral.vocab"
src_vocab_size: 32000
tgt_vocab_size: 32000

decoder_start_token: '<s>'

# Optimization
model_dtype: "fp8"
apex_opt_level: ""
optim: "fusedadam"
learning_rate: 0.0001
warmup_steps: 100
decay_method: "none"
#learning_rate_decay: 0.98
#start_decay_steps: 100
#decay_steps: 10
adam_beta2: 0.998
max_grad_norm: 0
label_smoothing: 0.0
param_init: 0
param_init_glorot: true
normalization: "tokens"

# 4/8-bit quantization
quant_layers: ['w_1', 'w_2', 'w_3', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
quant_type: "bnb_FP4"

# LoRA
lora_layers: ['linear_values', 'linear_query', 'linear_keys', 'final_linear']
lora_rank: 4
lora_dropout: 0.05
lora_alpha: 8
lora_embedding: false

# Checkpointing
#use_ckpting: ['ffn', 'lora']

# Model
model_task: lm
encoder_type: transformer_lm
decoder_type: transformer_lm
layer_norm: rms
norm_eps: 1e-5
pos_ffn_activation_fn: 'silu'
max_relative_positions: -1
position_encoding: false
add_qkvbias: false
add_ffnbias: false
parallel_residual: false
dec_layers: 32
heads: 32
num_kv: 8
sliding_window: 128
hidden_size: 4096
word_vec_size: 4096
transformer_ff: 14336
dropout_steps: [0]
dropout: [0.0]
attention_dropout: [0.0]
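This file is an OpenNMT-py training config, so once the corpora, tokenizer.model and mistral-onmt.pt referenced above are in place, a plausible launch is the standard trainer entry point. A minimal sketch, assuming a recent OpenNMT-py install and two visible GPUs (matching world_size: 2 and gpu_ranks: [0, 1]):

    # launch tensor-parallel LoRA fine-tuning with the settings defined in this file
    onmt_train -config mistral-finetune.yaml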
mistral-inference.yaml
ADDED
@@ -0,0 +1,30 @@
transforms: [sentencepiece]

#### Subword
src_subword_model: "tokenizer.model"
tgt_subword_model: "tokenizer.model"

# Model info
model: "mistral-onmt.pt"

# Inference
seed: 42
max_length: 256
gpu: 0
batch_type: sents
batch_size: 4
world_size: 1
gpu_ranks: [0]
#parallel_mode: "tensor_parallel"
#quant_layers: ['w_1', 'w_2', 'w_3', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
#quant_type: "bnb_NF4"
precision: fp16
#random_sampling_topk: 1
#random_sampling_topp: 0.6
#random_sampling_temp: 0.9
beam_size: 1
n_best: 1
profile: false
report_time: true
src: None
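Because src is left as None in the config, the prompt file would typically be supplied on the command line. A hedged usage sketch (prompts.txt and answers.txt are hypothetical file names, not part of this upload):

    # generate up to max_length tokens for each line of prompts.txt on GPU 0
    onmt_translate -config mistral-inference.yaml -src prompts.txt -output answers.txt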
mistral-onmt.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:24c8f66e973cff52e1827fea6d9a2d3f7fffc0709fe6bcfd400d34babefd30ec
size 14485167003
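Note that this entry (like tokenizer.model below) is a Git LFS pointer, not the ~14.5 GB checkpoint itself. A minimal sketch for fetching the real binaries after cloning the repository, assuming Git LFS is available:

    git lfs install   # one-time setup of the LFS filters
    git lfs pull      # download the actual .pt checkpoint and tokenizer.model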
mistral.vocab
ADDED
The diff for this file is too large to render.
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
size 493443
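Once pulled, the SentencePiece model can be sanity-checked with the sentencepiece command-line tools; a hedged example, assuming spm_encode is installed and sample.txt is any hypothetical text file:

    # print the subword pieces the tokenizer produces for each input line
    spm_encode --model=tokenizer.model --output_format=piece < sample.txt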