Emilio407 committed on
Commit
59558e7
·
verified ·
1 Parent(s): a597103

Delete README.md

Browse files
Files changed (1) hide show
  1. README.md +0 -634
README.md DELETED
@@ -1,634 +0,0 @@
1
- ---
2
- base_model: google/madlad400-3b-mt
3
- license: apache-2.0
4
- language:
5
- - multilingual
6
- - en
7
- - ru
8
- - es
9
- - fr
10
- - de
11
- - it
12
- - pt
13
- - pl
14
- - nl
15
- - vi
16
- - tr
17
- - sv
18
- - id
19
- - ro
20
- - cs
21
- - zh
22
- - hu
23
- - ja
24
- - th
25
- - fi
26
- - fa
27
- - uk
28
- - da
29
- - el
30
- - "no"
31
- - bg
32
- - sk
33
- - ko
34
- - ar
35
- - lt
36
- - ca
37
- - sl
38
- - he
39
- - et
40
- - lv
41
- - hi
42
- - sq
43
- - ms
44
- - az
45
- - sr
46
- - ta
47
- - hr
48
- - kk
49
- - is
50
- - ml
51
- - mr
52
- - te
53
- - af
54
- - gl
55
- - fil
56
- - be
57
- - mk
58
- - eu
59
- - bn
60
- - ka
61
- - mn
62
- - bs
63
- - uz
64
- - ur
65
- - sw
66
- - yue
67
- - ne
68
- - kn
69
- - kaa
70
- - gu
71
- - si
72
- - cy
73
- - eo
74
- - la
75
- - hy
76
- - ky
77
- - tg
78
- - ga
79
- - mt
80
- - my
81
- - km
82
- - tt
83
- - so
84
- - ku
85
- - ps
86
- - pa
87
- - rw
88
- - lo
89
- - ha
90
- - dv
91
- - fy
92
- - lb
93
- - ckb
94
- - mg
95
- - gd
96
- - am
97
- - ug
98
- - ht
99
- - grc
100
- - hmn
101
- - sd
102
- - jv
103
- - mi
104
- - tk
105
- - ceb
106
- - yi
107
- - ba
108
- - fo
109
- - or
110
- - xh
111
- - su
112
- - kl
113
- - ny
114
- - sm
115
- - sn
116
- - co
117
- - zu
118
- - ig
119
- - yo
120
- - pap
121
- - st
122
- - haw
123
- - as
124
- - oc
125
- - cv
126
- - lus
127
- - tet
128
- - gsw
129
- - sah
130
- - br
131
- - rm
132
- - sa
133
- - bo
134
- - om
135
- - se
136
- - ce
137
- - cnh
138
- - ilo
139
- - hil
140
- - udm
141
- - os
142
- - lg
143
- - ti
144
- - vec
145
- - ts
146
- - tyv
147
- - kbd
148
- - ee
149
- - iba
150
- - av
151
- - kha
152
- - to
153
- - tn
154
- - nso
155
- - fj
156
- - zza
157
- - ak
158
- - ada
159
- - otq
160
- - dz
161
- - bua
162
- - cfm
163
- - ln
164
- - chm
165
- - gn
166
- - krc
167
- - wa
168
- - hif
169
- - yua
170
- - srn
171
- - war
172
- - rom
173
- - bik
174
- - pam
175
- - sg
176
- - lu
177
- - ady
178
- - kbp
179
- - syr
180
- - ltg
181
- - myv
182
- - iso
183
- - kac
184
- - bho
185
- - ay
186
- - kum
187
- - qu
188
- - za
189
- - pag
190
- - ngu
191
- - ve
192
- - pck
193
- - zap
194
- - tyz
195
- - hui
196
- - bbc
197
- - tzo
198
- - tiv
199
- - ksd
200
- - gom
201
- - min
202
- - ang
203
- - nhe
204
- - bgp
205
- - nzi
206
- - nnb
207
- - nv
208
- - zxx
209
- - bci
210
- - kv
211
- - new
212
- - mps
213
- - alt
214
- - meu
215
- - bew
216
- - fon
217
- - iu
218
- - abt
219
- - mgh
220
- - mnw
221
- - tvl
222
- - dov
223
- - tlh
224
- - ho
225
- - kw
226
- - mrj
227
- - meo
228
- - crh
229
- - mbt
230
- - emp
231
- - ace
232
- - ium
233
- - mam
234
- - gym
235
- - mai
236
- - crs
237
- - pon
238
- - ubu
239
- - fip
240
- - quc
241
- - gv
242
- - kj
243
- - btx
244
- - ape
245
- - chk
246
- - rcf
247
- - shn
248
- - tzh
249
- - mdf
250
- - ppk
251
- - ss
252
- - gag
253
- - cab
254
- - kri
255
- - seh
256
- - ibb
257
- - tbz
258
- - bru
259
- - enq
260
- - ach
261
- - cuk
262
- - kmb
263
- - wo
264
- - kek
265
- - qub
266
- - tab
267
- - bts
268
- - kos
269
- - rwo
270
- - cak
271
- - tuc
272
- - bum
273
- - cjk
274
- - gil
275
- - stq
276
- - tsg
277
- - quh
278
- - mak
279
- - arn
280
- - ban
281
- - jiv
282
- - sja
283
- - yap
284
- - tcy
285
- - toj
286
- - twu
287
- - xal
288
- - amu
289
- - rmc
290
- - hus
291
- - nia
292
- - kjh
293
- - bm
294
- - guh
295
- - mas
296
- - acf
297
- - dtp
298
- - ksw
299
- - bzj
300
- - din
301
- - zne
302
- - mad
303
- - msi
304
- - mag
305
- - mkn
306
- - kg
307
- - lhu
308
- - ch
309
- - qvi
310
- - mh
311
- - djk
312
- - sus
313
- - mfe
314
- - srm
315
- - dyu
316
- - ctu
317
- - gui
318
- - pau
319
- - inb
320
- - bi
321
- - mni
322
- - guc
323
- - jam
324
- - wal
325
- - jac
326
- - bas
327
- - gor
328
- - skr
329
- - nyu
330
- - noa
331
- - sda
332
- - gub
333
- - nog
334
- - cni
335
- - teo
336
- - tdx
337
- - sxn
338
- - rki
339
- - nr
340
- - frp
341
- - alz
342
- - taj
343
- - lrc
344
- - cce
345
- - rn
346
- - jvn
347
- - hvn
348
- - nij
349
- - dwr
350
- - izz
351
- - msm
352
- - bus
353
- - ktu
354
- - chr
355
- - maz
356
- - tzj
357
- - suz
358
- - knj
359
- - bim
360
- - gvl
361
- - bqc
362
- - tca
363
- - pis
364
- - prk
365
- - laj
366
- - mel
367
- - qxr
368
- - niq
369
- - ahk
370
- - shp
371
- - hne
372
- - spp
373
- - koi
374
- - krj
375
- - quf
376
- - luz
377
- - agr
378
- - tsc
379
- - mqy
380
- - gof
381
- - gbm
382
- - miq
383
- - dje
384
- - awa
385
- - bjj
386
- - qvz
387
- - sjp
388
- - tll
389
- - raj
390
- - kjg
391
- - bgz
392
- - quy
393
- - cbk
394
- - akb
395
- - oj
396
- - ify
397
- - mey
398
- - ks
399
- - cac
400
- - brx
401
- - qup
402
- - syl
403
- - jax
404
- - ff
405
- - ber
406
- - tks
407
- - trp
408
- - mrw
409
- - adh
410
- - smt
411
- - srr
412
- - ffm
413
- - qvc
414
- - mtr
415
- - ann
416
- - kaa
417
- - aa
418
- - noe
419
- - nut
420
- - gyn
421
- - kwi
422
- - xmm
423
- - msb
424
- library_name: transformers
425
- tags:
426
- - text2text-generation
427
- - text-generation-inference
428
- datasets:
429
- - allenai/MADLAD-400
430
- pipeline_tag: translation
431
-
432
- widget:
433
- - text: "<2en> Como vai, amigo?"
434
- example_title: "Translation to English"
435
- - text: "<2de> Do you speak German?"
436
- example_title: "Translation to German"
437
-
438
- ---
439
-
440
- # Model Card for MADLAD-400-3B-MT
441
-
442
- # Table of Contents
443
-
444
- 0. [TL;DR](#tldr)
445
- 1. [Model Details](#model-details)
446
- 2. [Usage](#usage)
447
- 3. [Uses](#uses)
448
- 4. [Bias, Risks, and Limitations](#bias-risks-and-limitations)
449
- 5. [Training Details](#training-details)
450
- 6. [Evaluation](#evaluation)
451
- 7. [Environmental Impact](#environmental-impact)
452
- 8. [Citation](#citation)
453
-
454
- # TL;DR
455
-
456
- MADLAD-400-3B-MT is a multilingual machine translation model based on the T5 architecture that was
457
- trained on 1 trillion tokens covering over 450 languages using publicly available data.
458
- It is competitive with models that are significantly larger.
459
-
460
- **Disclaimer**: [Juarez Bochi](https://huggingface.co/jbochi), who was not involved in this research, converted
461
- the original weights and wrote the contents of this model card based on the original paper and Flan-T5.
462
-
463
- # Model Details
464
-
465
- ## Model Description
466
-
467
- - **Model type:** Language model
468
- - **Language(s) (NLP):** Multilingual (400+ languages)
469
- - **License:** Apache 2.0
470
- - **Related Models:** [All MADLAD-400 Checkpoints](https://huggingface.co/models?search=madlad)
471
- - **Original Checkpoints:** [All Original MADLAD-400 Checkpoints](https://github.com/google-research/google-research/tree/master/madlad_400)
472
- - **Resources for more information:**
473
- - [Research paper](https://arxiv.org/abs/2309.04662)
474
- - [GitHub Repo](https://github.com/google-research/t5x)
475
- - [Hugging Face MADLAD-400 Docs (Similar to T5) ](https://huggingface.co/docs/transformers/model_doc/MADLAD-400) - [Pending PR](https://github.com/huggingface/transformers/pull/27471)
476
-
477
- # Usage
478
-
479
- Find below some example scripts on how to use the model:
480
-
481
- ## Using the Pytorch model with `transformers`
482
-
483
- ### Running the model on a CPU or GPU
484
-
485
- <details>
486
- <summary> Click to expand </summary>
487
-
488
- First, install the Python packages that are required:
489
-
490
- `pip install transformers accelerate sentencepiece`
491
-
492
- ```python
493
- from transformers import T5ForConditionalGeneration, T5Tokenizer
494
-
495
- model_name = 'jbochi/madlad400-3b-mt'
496
- model = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto")
497
- tokenizer = T5Tokenizer.from_pretrained(model_name)
498
-
499
- text = "<2pt> I love pizza!"
500
- input_ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)
501
- outputs = model.generate(input_ids=input_ids)
502
-
503
- tokenizer.decode(outputs[0], skip_special_tokens=True)
504
- # Eu adoro pizza!
505
- ```
506
-
507
- </details>
508
-
509
- ## Running the model with Candle
510
-
511
- <details>
512
- <summary> Click to expand </summary>
513
-
514
- Usage with [candle](https://github.com/huggingface/candle):
515
-
516
- ```bash
517
- $ cargo run --example t5 --release -- \
518
- --model-id "jbochi/madlad400-3b-mt" \
519
- --prompt "<2de> How are you, my friend?" \
520
- --decode --temperature 0
521
- ```
522
-
523
- We also provide a quantized model (1.65 GB vs the original 11.8 GB file):
524
-
525
- ```
526
- cargo run --example quantized-t5 --release -- \
527
- --model-id "jbochi/madlad400-3b-mt" --weight-file "model-q4k.gguf" \
528
- --prompt "<2de> How are you, my friend?" \
529
- --temperature 0
530
- ...
531
- Wie geht es dir, mein Freund?
532
- ```
533
-
534
- </details>
535
-
536
-
537
- # Uses
538
-
539
- ## Direct Use and Downstream Use
540
-
541
- > Primary intended uses: Machine Translation and multilingual NLP tasks on over 400 languages.
542
- > Primary intended users: Research community.
543
-
544
- ## Out-of-Scope Use
545
-
546
- > These models are trained on general domain data and are therefore not meant to
547
- > work on domain-specific models out-of-the box. Moreover, these research models have not been assessed
548
- > for production usecases.
549
-
550
- # Bias, Risks, and Limitations
551
-
552
- > We note that we evaluate on only 204 of the languages supported by these models and on machine translation
553
- > and few-shot machine translation tasks. Users must consider use of this model carefully for their own
554
- > use case.
555
-
556
- ## Ethical considerations and risks
557
-
558
- > We trained these models with MADLAD-400 and publicly available data to create baseline models that
559
- > support NLP for over 400 languages, with a focus on languages underrepresented in large-scale corpora.
560
- > Given that these models were trained with web-crawled datasets that may contain sensitive, offensive or
561
- > otherwise low-quality content despite extensive preprocessing, it is still possible that these issues to the
562
- > underlying training data may cause differences in model performance and toxic (or otherwise problematic)
563
- > output for certain domains. Moreover, large models are dual use technologies that have specific risks
564
- > associated with their use and development. We point the reader to surveys such as those written by
565
- > Weidinger et al. or Bommasani et al. for a more detailed discussion of these risks, and to Liebling
566
- > et al. for a thorough discussion of the risks of machine translation systems.
567
-
568
- ## Known Limitations
569
-
570
- More information needed
571
-
572
- ## Sensitive Use:
573
-
574
- More information needed
575
-
576
- # Training Details
577
-
578
- > We train models of various sizes: a 3B, 32-layer parameter model,
579
- > a 7.2B 48-layer parameter model and a 10.7B 32-layer parameter model.
580
- > We share all parameters of the model across language pairs,
581
- > and use a Sentence Piece Model with 256k tokens shared on both the encoder and decoder
582
- > side. Each input sentence has a <2xx> token prepended to the source sentence to indicate the target
583
- > language.
584
-
585
- See the [research paper](https://arxiv.org/pdf/2309.04662.pdf) for further details.
586
-
587
- ## Training Data
588
-
589
- > For both the machine translation and language model, MADLAD-400 is used. For the machine translation
590
- > model, a combination of parallel datasources covering 157 languages is also used. Further details are
591
- > described in the [paper](https://arxiv.org/pdf/2309.04662.pdf).
592
-
593
- ## Training Procedure
594
-
595
- See the [research paper](https://arxiv.org/pdf/2309.04662.pdf) for further details.
596
-
597
- # Evaluation
598
-
599
- ## Testing Data, Factors & Metrics
600
-
601
- > For evaluation, we used WMT, NTREX, Flores-200 and Gatones datasets as described in Section 4.3 in the [paper](https://arxiv.org/pdf/2309.04662.pdf).
602
-
603
- > The translation quality of this model varies based on language, as seen in the paper, and likely varies on
604
- > domain, though we have not assessed this.
605
-
606
- ## Results
607
-
608
- ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64b7f632037d6452a321fa15/EzsMD1AwCuFH0S0DeD-n8.png)
609
-
610
- ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64b7f632037d6452a321fa15/CJ5zCUVy7vTU76Lc8NZcK.png)
611
-
612
- ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64b7f632037d6452a321fa15/NK0S-yVeWuhKoidpLYh3m.png)
613
-
614
- See the [research paper](https://arxiv.org/pdf/2309.04662.pdf) for further details.
615
-
616
- # Environmental Impact
617
-
618
- More information needed
619
-
620
- # Citation
621
-
622
- **BibTeX:**
623
-
624
- ```bibtex
625
- @misc{kudugunta2023madlad400,
626
- title={MADLAD-400: A Multilingual And Document-Level Large Audited Dataset},
627
- author={Sneha Kudugunta and Isaac Caswell and Biao Zhang and Xavier Garcia and Christopher A. Choquette-Choo and Katherine Lee and Derrick Xin and Aditya Kusupati and Romi Stella and Ankur Bapna and Orhan Firat},
628
- year={2023},
629
- eprint={2309.04662},
630
- archivePrefix={arXiv},
631
- primaryClass={cs.CL}
632
- }
633
- ```
634
-