Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes.
- .gitattributes +17 -0
- beit/microsoft/beit-base-patch16-224-pt22k/.gitattributes +19 -0
- beit/microsoft/beit-base-patch16-224-pt22k/README.md +100 -0
- beit/microsoft/beit-base-patch16-224-pt22k/config.json +28 -0
- beit/microsoft/beit-base-patch16-224-pt22k/flax_model.msgpack +3 -0
- beit/microsoft/beit-base-patch16-224-pt22k/model.safetensors +3 -0
- beit/microsoft/beit-base-patch16-224-pt22k/preprocessor_config.json +19 -0
- beit/microsoft/beit-base-patch16-224-pt22k/pytorch_model.bin +3 -0
- beit/microsoft/beit-large-patch16-224-pt22k/.gitattributes +19 -0
- beit/microsoft/beit-large-patch16-224-pt22k/README.md +100 -0
- beit/microsoft/beit-large-patch16-224-pt22k/config.json +28 -0
- beit/microsoft/beit-large-patch16-224-pt22k/flax_model.msgpack +3 -0
- beit/microsoft/beit-large-patch16-224-pt22k/model.safetensors +3 -0
- beit/microsoft/beit-large-patch16-224-pt22k/preprocessor_config.json +19 -0
- beit/microsoft/beit-large-patch16-224-pt22k/pytorch_model.bin +3 -0
- clip/apple/DFN2B-CLIP-ViT-B-16/.gitattributes +35 -0
- clip/apple/DFN2B-CLIP-ViT-B-16/LICENSE +88 -0
- clip/apple/DFN2B-CLIP-ViT-B-16/README.md +111 -0
- clip/apple/DFN2B-CLIP-ViT-B-16/eval_results.jsonl +40 -0
- clip/apple/DFN2B-CLIP-ViT-B-16/merges.txt +0 -0
- clip/apple/DFN2B-CLIP-ViT-B-16/open_clip_config.json +30 -0
- clip/apple/DFN2B-CLIP-ViT-B-16/open_clip_pytorch_model.bin +3 -0
- clip/apple/DFN2B-CLIP-ViT-B-16/preprocessor_config.json +19 -0
- clip/apple/DFN2B-CLIP-ViT-B-16/special_tokens_map.json +24 -0
- clip/apple/DFN2B-CLIP-ViT-B-16/tokenizer.json +0 -0
- clip/apple/DFN2B-CLIP-ViT-B-16/tokenizer_config.json +34 -0
- clip/apple/DFN2B-CLIP-ViT-B-16/vocab.json +0 -0
- clip/apple/DFN2B-CLIP-ViT-L-14/.gitattributes +35 -0
- clip/apple/DFN2B-CLIP-ViT-L-14/LICENSE +88 -0
- clip/apple/DFN2B-CLIP-ViT-L-14/README.md +110 -0
- clip/apple/DFN2B-CLIP-ViT-L-14/config.json +165 -0
- clip/apple/DFN2B-CLIP-ViT-L-14/eval_results.jsonl +40 -0
- clip/apple/DFN2B-CLIP-ViT-L-14/merges.txt +0 -0
- clip/apple/DFN2B-CLIP-ViT-L-14/open_clip_config.json +31 -0
- clip/apple/DFN2B-CLIP-ViT-L-14/open_clip_pytorch_model.bin +3 -0
- clip/apple/DFN2B-CLIP-ViT-L-14/preprocessor_config.json +28 -0
- clip/apple/DFN2B-CLIP-ViT-L-14/pytorch_model.bin +3 -0
- clip/apple/DFN2B-CLIP-ViT-L-14/special_tokens_map.json +24 -0
- clip/apple/DFN2B-CLIP-ViT-L-14/tokenizer.json +0 -0
- clip/apple/DFN2B-CLIP-ViT-L-14/tokenizer_config.json +34 -0
- clip/apple/DFN2B-CLIP-ViT-L-14/vocab.json +0 -0
- clip/apple/DFN5B-CLIP-ViT-H-14/eval_results.jsonl +40 -0
- clip/apple/DFN5B-CLIP-ViT-H-14/open_clip_pytorch_model.bin +3 -0
- clip/apple/DFN5B-CLIP-ViT-H-14/preprocessor_config.json +28 -0
- clip/apple/DFN5B-CLIP-ViT-H-14/pytorch_model.bin +3 -0
- clip/facebook/metaclip-b16-400m/metaclip_b16_400m.bin +3 -0
- clip/facebook/metaclip-b16-400m/pytorch_model.bin +3 -0
- clip/facebook/metaclip-b16-fullcc2.5b/metaclip_b16_fullcc2.5b.bin +3 -0
- clip/facebook/metaclip-b16-fullcc2.5b/pytorch_model.bin +3 -0
- clip/facebook/metaclip-b32-400m/metaclip_b32_400m.bin +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,20 @@ clip/laion/CLIP-ViT-B-32-laion2B-s34B-b79K/open_clip_model.safetensors filter=lfs diff=lfs merge=lfs -text
 clip/laion/CLIP-ViT-B-32-laion2B-s34B-b79K/model.safetensors filter=lfs diff=lfs merge=lfs -text
 clip/facebook/metaclip-g14-fullcc2.5b/model-00003-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text
 clip/laion/CLIP-ViT-L-14-laion2B-s32B-b82K/open_clip_pytorch_model.safetensors filter=lfs diff=lfs merge=lfs -text
+clip/openai/clip-vit-large-patch14/model.safetensors filter=lfs diff=lfs merge=lfs -text
+beit/microsoft/beit-base-patch16-224-pt22k/model.safetensors filter=lfs diff=lfs merge=lfs -text
+beit/microsoft/beit-large-patch16-224-pt22k/model.safetensors filter=lfs diff=lfs merge=lfs -text
+clip/facebook/metaclip-g14-fullcc2.5b/open_clip_model.safetensors filter=lfs diff=lfs merge=lfs -text
+clip/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_model.safetensors filter=lfs diff=lfs merge=lfs -text
+clip/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/model.safetensors filter=lfs diff=lfs merge=lfs -text
+clip/google/siglip-so400m-patch14-224/model.safetensors filter=lfs diff=lfs merge=lfs -text
+clip/google/siglip-large-patch16-256/model.safetensors filter=lfs diff=lfs merge=lfs -text
+clip/laion/CLIP-ViT-L-14-laion2B-s32B-b82K/model.safetensors filter=lfs diff=lfs merge=lfs -text
+clip/facebook/metaclip-h14-alt/model.safetensors filter=lfs diff=lfs merge=lfs -text
+clip/facebook/metaclip-h14-fullcc2.5b/model.safetensors filter=lfs diff=lfs merge=lfs -text
+clip/google/siglip2-so400m-patch14-224/model.safetensors filter=lfs diff=lfs merge=lfs -text
+clip/facebook/metaclip-h14-alt/open_clip_model.safetensors filter=lfs diff=lfs merge=lfs -text
+clip/facebook/metaclip-g14-fullcc2.5b/model-00001-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text
+clip/facebook/metaclip-g14-fullcc2.5b/model-00002-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text
+clip/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k/pytorch_model-00001-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
+clip/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k/open_clip_model.safetensors filter=lfs diff=lfs merge=lfs -text
beit/microsoft/beit-base-patch16-224-pt22k/.gitattributes
ADDED
@@ -0,0 +1,19 @@
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tar.gz filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
model.safetensors filter=lfs diff=lfs merge=lfs -text
beit/microsoft/beit-base-patch16-224-pt22k/README.md
ADDED
@@ -0,0 +1,100 @@
---
license: apache-2.0
tags:
- image-classification
- vision
datasets:
- imagenet
- imagenet-21k
---

# BEiT (base-sized model, pre-trained only)

BEiT model pre-trained in a self-supervised fashion on ImageNet-22k - also called ImageNet-21k (14 million images, 21,841 classes) at resolution 224x224. It was introduced in the paper [BEIT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong and Furu Wei and first released in [this repository](https://github.com/microsoft/unilm/tree/master/beit).

Disclaimer: The team releasing BEiT did not write a model card for this model, so this model card has been written by the Hugging Face team.

## Model description

The BEiT model is a Vision Transformer (ViT), which is a transformer encoder model (BERT-like). In contrast to the original ViT model, BEiT is pretrained in a self-supervised fashion on a large collection of images, namely ImageNet-21k, at a resolution of 224x224 pixels. The pre-training objective is to predict the visual tokens produced by the encoder of OpenAI's DALL-E VQ-VAE for the masked patches.

Images are presented to the model as a sequence of fixed-size patches (resolution 16x16), which are linearly embedded. Unlike the original ViT models, BEiT models use relative position embeddings (similar to T5) instead of absolute position embeddings, and perform classification of images by mean-pooling the final hidden states of the patches instead of placing a linear layer on top of the final hidden state of the [CLS] token.

By pre-training the model, it learns an inner representation of images that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled images, for instance, you can train a standard classifier by placing a linear layer on top of the pre-trained encoder. One typically places a linear layer on top of the [CLS] token, as the last hidden state of this token can be seen as a representation of an entire image. Alternatively, one can mean-pool the final hidden states of the patch embeddings and place a linear layer on top of that.

## Intended uses & limitations

You can use the raw model for image classification. See the [model hub](https://huggingface.co/models?search=microsoft/beit) to look for fine-tuned versions on a task that interests you.

### How to use

Here is how to use this model:

```python
from transformers import BeitFeatureExtractor, BeitForMaskedImageModeling
from PIL import Image
import requests

url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)

feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224-pt22k')
model = BeitForMaskedImageModeling.from_pretrained('microsoft/beit-base-patch16-224-pt22k')

inputs = feature_extractor(images=image, return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits
```

Currently, both the feature extractor and model support PyTorch.

## Training data

The BEiT model was pretrained on [ImageNet-21k](http://www.image-net.org/), a dataset consisting of 14 million images and 21k classes.

## Training procedure

### Preprocessing

The exact details of preprocessing of images during training/validation can be found [here](https://github.com/microsoft/unilm/blob/master/beit/datasets.py).

Images are resized/rescaled to the same resolution (224x224) and normalized across the RGB channels with mean (0.5, 0.5, 0.5) and standard deviation (0.5, 0.5, 0.5).

### Pretraining

For all pre-training related hyperparameters, we refer to page 15 of the [original paper](https://arxiv.org/abs/2106.08254).

## Evaluation results

For evaluation results on several image classification benchmarks, we refer to tables 1 and 2 of the original paper. Note that for fine-tuning, the best results are obtained with a higher resolution. Of course, increasing the model size will result in better performance.

### BibTeX entry and citation info

```bibtex
@article{DBLP:journals/corr/abs-2106-08254,
  author    = {Hangbo Bao and
               Li Dong and
               Furu Wei},
  title     = {BEiT: {BERT} Pre-Training of Image Transformers},
  journal   = {CoRR},
  volume    = {abs/2106.08254},
  year      = {2021},
  url       = {https://arxiv.org/abs/2106.08254},
  archivePrefix = {arXiv},
  eprint    = {2106.08254},
  timestamp = {Tue, 29 Jun 2021 16:55:04 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-2106-08254.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
```

```bibtex
@inproceedings{deng2009imagenet,
  title={Imagenet: A large-scale hierarchical image database},
  author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li},
  booktitle={2009 IEEE conference on computer vision and pattern recognition},
  pages={248--255},
  year={2009},
  organization={Ieee}
}
```
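Editorial aside (not part of the upstream model card): the "How to use" snippet above returns raw logits without masking any patches. The sketch below shows one way to drive the masked-image-modeling head explicitly, assuming the current transformers API where `BeitImageProcessor` replaces the deprecated `BeitFeatureExtractor`; the number of masked patches (75) is an arbitrary illustration, not the pre-training recipe.

```python
# Hedged sketch, assuming the current transformers API; not part of the original card.
import torch
import requests
from PIL import Image
from transformers import BeitImageProcessor, BeitForMaskedImageModeling

url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)

processor = BeitImageProcessor.from_pretrained('microsoft/beit-base-patch16-224-pt22k')
model = BeitForMaskedImageModeling.from_pretrained('microsoft/beit-base-patch16-224-pt22k')

pixel_values = processor(images=image, return_tensors="pt").pixel_values
num_patches = (model.config.image_size // model.config.patch_size) ** 2  # (224 / 16)^2 = 196

# Mask a random subset of patches (75 is an arbitrary choice for illustration).
bool_masked_pos = torch.zeros(1, num_patches, dtype=torch.bool)
bool_masked_pos[:, torch.randperm(num_patches)[:75]] = True

with torch.no_grad():
    outputs = model(pixel_values=pixel_values, bool_masked_pos=bool_masked_pos)

# One distribution over the 8192-entry visual-token vocabulary per patch position.
predicted_visual_tokens = outputs.logits.argmax(-1)
print(predicted_visual_tokens.shape)
```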
beit/microsoft/beit-base-patch16-224-pt22k/config.json
ADDED
@@ -0,0 +1,28 @@
{
  "architectures": [
    "BeitForMaskedImageModeling"
  ],
  "attention_probs_dropout_prob": 0.0,
  "drop_path_rate": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "layer_scale_init_value": 0.1,
  "model_type": "beit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "torch_dtype": "float32",
  "transformers_version": "4.11.0.dev0",
  "use_absolute_position_embeddings": false,
  "use_mask_token": true,
  "use_mean_pooling": true,
  "use_relative_position_bias": false,
  "use_shared_relative_position_bias": true,
  "vocab_size": 8192
}
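Editorial aside (not part of the repository): the fields in this config.json are what transformers reads when the model is loaded; a minimal sketch, assuming the standard `AutoConfig` entry point:

```python
# Hedged sketch: inspecting the config above through transformers.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
print(config.model_type)   # "beit"
print(config.hidden_size)  # 768
print(config.vocab_size)   # 8192 -- size of the visual-token vocabulary used for masked image modeling
```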
beit/microsoft/beit-base-patch16-224-pt22k/flax_model.msgpack
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:992202b28ad4d6f77a846849e498a6a634af5d8e92619ad78afb358c8e8084d1
size 367870927
beit/microsoft/beit-base-patch16-224-pt22k/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:18b08989cad41adba4900055a11899c5d28b1d22b1fd5198c126eb2007ea000e
size 368198186
beit/microsoft/beit-base-patch16-224-pt22k/preprocessor_config.json
ADDED
@@ -0,0 +1,19 @@
{
  "crop_size": 224,
  "do_center_crop": false,
  "do_normalize": true,
  "do_resize": true,
  "feature_extractor_type": "BeitFeatureExtractor",
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "size": 224
}
beit/microsoft/beit-base-patch16-224-pt22k/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:284cc1c178af57275ef00cbd188a1ee9026091b6eb582aa44e043a37a743a70b
size 368254997
beit/microsoft/beit-large-patch16-224-pt22k/.gitattributes
ADDED
@@ -0,0 +1,19 @@
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tar.gz filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
model.safetensors filter=lfs diff=lfs merge=lfs -text
beit/microsoft/beit-large-patch16-224-pt22k/README.md
ADDED
@@ -0,0 +1,100 @@
---
license: apache-2.0
tags:
- image-classification
- vision
datasets:
- imagenet
- imagenet-21k
---

# BEiT (large-sized model, pre-trained only)

BEiT model pre-trained in a self-supervised fashion on ImageNet-22k - also called ImageNet-21k (14 million images, 21,841 classes) at resolution 224x224. It was introduced in the paper [BEIT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong and Furu Wei and first released in [this repository](https://github.com/microsoft/unilm/tree/master/beit).

Disclaimer: The team releasing BEiT did not write a model card for this model, so this model card has been written by the Hugging Face team.

## Model description

The BEiT model is a Vision Transformer (ViT), which is a transformer encoder model (BERT-like). In contrast to the original ViT model, BEiT is pretrained in a self-supervised fashion on a large collection of images, namely ImageNet-21k, at a resolution of 224x224 pixels. The pre-training objective is to predict the visual tokens produced by the encoder of OpenAI's DALL-E VQ-VAE for the masked patches.

Images are presented to the model as a sequence of fixed-size patches (resolution 16x16), which are linearly embedded. Unlike the original ViT models, BEiT models use relative position embeddings (similar to T5) instead of absolute position embeddings, and perform classification of images by mean-pooling the final hidden states of the patches instead of placing a linear layer on top of the final hidden state of the [CLS] token.

By pre-training the model, it learns an inner representation of images that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled images, for instance, you can train a standard classifier by placing a linear layer on top of the pre-trained encoder. One typically places a linear layer on top of the [CLS] token, as the last hidden state of this token can be seen as a representation of an entire image. Alternatively, one can mean-pool the final hidden states of the patch embeddings and place a linear layer on top of that.

## Intended uses & limitations

You can use the raw model for image classification. See the [model hub](https://huggingface.co/models?search=microsoft/beit) to look for fine-tuned versions on a task that interests you.

### How to use

Here is how to use this model:

```python
from transformers import BeitFeatureExtractor, BeitForMaskedImageModeling
from PIL import Image
import requests

url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)

feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-large-patch16-224-pt22k')
model = BeitForMaskedImageModeling.from_pretrained('microsoft/beit-large-patch16-224-pt22k')

inputs = feature_extractor(images=image, return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits
```

Currently, both the feature extractor and model support PyTorch.

## Training data

The BEiT model was pretrained on [ImageNet-21k](http://www.image-net.org/), a dataset consisting of 14 million images and 21k classes.

## Training procedure

### Preprocessing

The exact details of preprocessing of images during training/validation can be found [here](https://github.com/microsoft/unilm/blob/master/beit/datasets.py).

Images are resized/rescaled to the same resolution (224x224) and normalized across the RGB channels with mean (0.5, 0.5, 0.5) and standard deviation (0.5, 0.5, 0.5).

### Pretraining

For all pre-training related hyperparameters, we refer to page 15 of the [original paper](https://arxiv.org/abs/2106.08254).

## Evaluation results

For evaluation results on several image classification benchmarks, we refer to tables 1 and 2 of the original paper. Note that for fine-tuning, the best results are obtained with a higher resolution. Of course, increasing the model size will result in better performance.

### BibTeX entry and citation info

```bibtex
@article{DBLP:journals/corr/abs-2106-08254,
  author    = {Hangbo Bao and
               Li Dong and
               Furu Wei},
  title     = {BEiT: {BERT} Pre-Training of Image Transformers},
  journal   = {CoRR},
  volume    = {abs/2106.08254},
  year      = {2021},
  url       = {https://arxiv.org/abs/2106.08254},
  archivePrefix = {arXiv},
  eprint    = {2106.08254},
  timestamp = {Tue, 29 Jun 2021 16:55:04 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-2106-08254.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
```

```bibtex
@inproceedings{deng2009imagenet,
  title={Imagenet: A large-scale hierarchical image database},
  author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li},
  booktitle={2009 IEEE conference on computer vision and pattern recognition},
  pages={248--255},
  year={2009},
  organization={Ieee}
}
```
beit/microsoft/beit-large-patch16-224-pt22k/config.json
ADDED
@@ -0,0 +1,28 @@
{
  "architectures": [
    "BeitForMaskedImageModeling"
  ],
  "attention_probs_dropout_prob": 0.0,
  "drop_path_rate": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "layer_scale_init_value": 0.1,
  "model_type": "beit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "torch_dtype": "float32",
  "transformers_version": "4.11.0.dev0",
  "use_absolute_position_embeddings": false,
  "use_mask_token": true,
  "use_mean_pooling": true,
  "use_relative_position_bias": false,
  "use_shared_relative_position_bias": true,
  "vocab_size": 8192
}
beit/microsoft/beit-large-patch16-224-pt22k/flax_model.msgpack
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:790333e0495827b2375db293914e141498e3ca259ff9da1edca29a6ff6b20ab9
size 1246151329
beit/microsoft/beit-large-patch16-224-pt22k/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ed81d0faf9f905c2eeee043a77976f997f9249c6fcda57172660ffb48a88d939
size 1246495686
beit/microsoft/beit-large-patch16-224-pt22k/preprocessor_config.json
ADDED
@@ -0,0 +1,19 @@
{
  "crop_size": 224,
  "do_center_crop": false,
  "do_normalize": true,
  "do_resize": true,
  "feature_extractor_type": "BeitFeatureExtractor",
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "size": 224
}
beit/microsoft/beit-large-patch16-224-pt22k/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c7e36e4e7e4b59c8713e27b17133381b2a9476dd4cec8a05392ca71c5be4abb7
size 1246607101
clip/apple/DFN2B-CLIP-ViT-B-16/.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
clip/apple/DFN2B-CLIP-ViT-B-16/LICENSE
ADDED
@@ -0,0 +1,88 @@
Disclaimer: IMPORTANT: This Apple Machine Learning Research Model is specifically developed and released by Apple Inc. ("Apple") for the sole purpose of scientific research of artificial intelligence and machine-learning technology. “Apple Machine Learning Research Model” means the model, including but not limited to algorithms, formulas, trained model weights, parameters, configurations, checkpoints, and any related materials (including documentation).

This Apple Machine Learning Research Model is provided to You by Apple in consideration of your agreement to the following terms, and your use, modification, creation of Model Derivatives, and or redistribution of the Apple Machine Learning Research Model constitutes acceptance of this Agreement. If You do not agree with these terms, please do not use, modify, create Model Derivatives of, or distribute this Apple Machine Learning Research Model or Model Derivatives.

* License Scope: In consideration of your agreement to abide by the following terms, and subject to these terms, Apple hereby grants you a personal, non-exclusive, worldwide, non-transferable, royalty-free, revocable, and limited license, to use, copy, modify, distribute, and create Model Derivatives (defined below) of the Apple Machine Learning Research Model exclusively for Research Purposes. You agree that any Model Derivatives You may create or that may be created for You will be limited to Research Purposes as well. “Research Purposes” means non-commercial scientific research and academic development activities, such as experimentation, analysis, testing conducted by You with the sole intent to advance scientific knowledge and research. “Research Purposes” does not include any commercial exploitation, product development or use in any commercial product or service.

* Distribution of Apple Machine Learning Research Model and Model Derivatives: If you choose to redistribute Apple Machine Learning Research Model or its Model Derivatives, you must provide a copy of this Agreement to such third party, and ensure that the following attribution notice be provided: “Apple Machine Learning Research Model is licensed under the Apple Machine Learning Research Model License Agreement.” Additionally, all Model Derivatives must clearly be identified as such, including disclosure of modifications and changes made to the Apple Machine Learning Research Model. The name, trademarks, service marks or logos of Apple may not be used to endorse or promote Model Derivatives or the relationship between You and Apple. “Model Derivatives” means any models or any other artifacts created by modifications, improvements, adaptations, alterations to the architecture, algorithm or training processes of the Apple Machine Learning Research Model, or by any retraining, fine-tuning of the Apple Machine Learning Research Model.

* No Other License: Except as expressly stated in this notice, no other rights or licenses, express or implied, are granted by Apple herein, including but not limited to any patent, trademark, and similar intellectual property rights worldwide that may be infringed by the Apple Machine Learning Research Model, the Model Derivatives or by other works in which the Apple Machine Learning Research Model may be incorporated.

* Compliance with Laws: Your use of Apple Machine Learning Research Model must be in compliance with all applicable laws and regulations.

* Term and Termination: The term of this Agreement will begin upon your acceptance of this Agreement or use of the Apple Machine Learning Research Model and will continue until terminated in accordance with the following terms. Apple may terminate this Agreement at any time if You are in breach of any term or condition of this Agreement. Upon termination of this Agreement, You must cease to use all Apple Machine Learning Research Models and Model Derivatives and permanently delete any copy thereof. Sections 3, 6 and 7 will survive termination.

* Disclaimer and Limitation of Liability: This Apple Machine Learning Research Model and any outputs generated by the Apple Machine Learning Research Model are provided on an “AS IS” basis. APPLE MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, REGARDING THE APPLE MACHINE LEARNING RESEARCH MODEL OR OUTPUTS GENERATED BY THE APPLE MACHINE LEARNING RESEARCH MODEL. You are solely responsible for determining the appropriateness of using or redistributing the Apple Machine Learning Research Model and any outputs of the Apple Machine Learning Research Model and assume any risks associated with Your use of the Apple Machine Learning Research Model and any output and results. IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION AND/OR DISTRIBUTION OF THE APPLE MACHINE LEARNING RESEARCH MODEL AND ANY OUTPUTS OF THE APPLE MACHINE LEARNING RESEARCH MODEL, HOWEVER CAUSED AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE), STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

* Governing Law: This Agreement will be governed by and construed under the laws of the State of California without regard to its choice of law principles. The Convention on Contracts for the International Sale of Goods shall not apply to the Agreement except that the arbitration clause and any arbitration hereunder shall be governed by the Federal Arbitration Act, Chapters 1 and 2.

Copyright (C) 2025 Apple Inc. All Rights Reserved.
clip/apple/DFN2B-CLIP-ViT-B-16/README.md
ADDED
@@ -0,0 +1,111 @@
---
license: apple-amlr
license_name: apple-sample-code-license
license_link: LICENSE
---

A CLIP (Contrastive Language-Image Pre-training) model trained on DFN-2B. Data Filtering Networks (DFNs) are small networks used to automatically filter large pools of uncurated data. This model was trained on 2B images that were filtered from a pool of 12.8B uncurated image-text pairs (12.8B image-text pairs from CommonPool-12.8B).

These weights are directly usable in OpenCLIP (image + text).

## Model Details

- **Model Type:** Contrastive Image-Text, Zero-Shot Image Classification.
- **Dataset:** DFN-2B
- **Papers:**
  - Data Filtering Networks: https://arxiv.org/abs/2309.17425
- **Examples Seen:** 12.8B

## Model Metrics

| Dataset | Metric |
|:-----------------------|---------:|
| ImageNet 1k | 0.76236 |
| Caltech-101 | 0.942894 |
| CIFAR-10 | 0.9672 |
| CIFAR-100 | 0.8347 |
| CLEVR Counts | 0.232333 |
| CLEVR Distance | 0.245267 |
| Country211 | 0.19545 |
| Describable Textures | 0.575532 |
| EuroSAT | 0.54 |
| FGVC Aircraft | 0.248503 |
| Food-101 | 0.91303 |
| GTSRB | 0.469913 |
| ImageNet Sketch | 0.620684 |
| ImageNet v2 | 0.682 |
| ImageNet-A | 0.482133 |
| ImageNet-O | 0.493 |
| ImageNet-R | 0.830967 |
| KITTI Vehicle Distance | 0.192686 |
| MNIST | 0.782 |
| ObjectNet | 0.631851 |
| Oxford Flowers-102 | 0.819895 |
| Oxford-IIIT Pet | 0.936907 |
| Pascal VOC 2007 | 0.788528 |
| PatchCamelyon | 0.521545 |
| Rendered SST2 | 0.486546 |
| RESISC45 | 0.61381 |
| Stanford Cars | 0.90735 |
| STL-10 | 0.97525 |
| SUN397 | 0.714162 |
| SVHN | 0.598955 |
| Flickr | 0.7728 |
| MSCOCO | 0.518773 |
| WinoGAViL | 0.541748 |
| iWildCam | 0.155574 |
| Camelyon17 | 0.499283 |
| FMoW | 0.141149 |
| Dollar Street | 0.625 |
| GeoDE | 0.891023 |
| **Average** | **0.609232** |

## Model Usage

### With OpenCLIP

```python
import torch
import torch.nn.functional as F
from urllib.request import urlopen
from PIL import Image
from open_clip import create_model_from_pretrained, get_tokenizer

model, preprocess = create_model_from_pretrained('hf-hub:apple/DFN2B-CLIP-ViT-B-16')
tokenizer = get_tokenizer('ViT-B-16')

image = Image.open(urlopen(
    'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
))
image = preprocess(image).unsqueeze(0)

labels_list = ["a dog", "a cat", "a donut", "a beignet"]
text = tokenizer(labels_list, context_length=model.context_length)

with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    image_features = F.normalize(image_features, dim=-1)
    text_features = F.normalize(text_features, dim=-1)

    text_probs = torch.sigmoid(image_features @ text_features.T * model.logit_scale.exp() + model.logit_bias)

zipped_list = list(zip(labels_list, [round(p.item(), 3) for p in text_probs[0]]))
print("Label probabilities: ", zipped_list)
```

## Citation

```bibtex
@article{fang2023data,
  title={Data Filtering Networks},
  author={Fang, Alex and Jose, Albin Madappally and Jain, Amit and Schmidt, Ludwig and Toshev, Alexander and Shankar, Vaishaal},
  journal={arXiv preprint arXiv:2309.17425},
  year={2023}
}
```
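Editorial note on the OpenCLIP usage snippet in the card above (not from the upstream card): the sigmoid plus `logit_bias` readout is the SigLIP-style formulation, and a plain CLIP checkpoint loaded through open_clip may expose no usable `logit_bias`. A hedged alternative sketch using the standard softmax zero-shot readout, with the same model and preprocessing, follows.

```python
# Hedged sketch (assumption: standard CLIP softmax readout); not part of the upstream card.
import torch
import torch.nn.functional as F
from urllib.request import urlopen
from PIL import Image
from open_clip import create_model_from_pretrained, get_tokenizer

model, preprocess = create_model_from_pretrained('hf-hub:apple/DFN2B-CLIP-ViT-B-16')
tokenizer = get_tokenizer('ViT-B-16')

image = preprocess(Image.open(urlopen(
    'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
))).unsqueeze(0)

labels_list = ["a dog", "a cat", "a donut", "a beignet"]
text = tokenizer(labels_list, context_length=model.context_length)

with torch.no_grad():
    image_features = F.normalize(model.encode_image(image), dim=-1)
    text_features = F.normalize(model.encode_text(text), dim=-1)
    # Softmax over cosine similarities scaled by the learned temperature.
    text_probs = (model.logit_scale.exp() * image_features @ text_features.T).softmax(dim=-1)

print(dict(zip(labels_list, [round(p.item(), 3) for p in text_probs[0]])))
```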
clip/apple/DFN2B-CLIP-ViT-B-16/eval_results.jsonl
ADDED
@@ -0,0 +1,40 @@
{"key": "imagenet1k", "dataset": "ImageNet 1k", "metrics": {"acc1": 0.76236, "acc5": 0.94602, "mean_per_class_recall": 0.7625, "main_metric": 0.76236}}
{"key": "vtab/caltech101", "dataset": "Caltech-101", "metrics": {"acc1": 0.8491372226787182, "acc5": 0.9479046836483155, "mean_per_class_recall": 0.9428944730514909, "main_metric": 0.9428944730514909}}
{"key": "cifar10", "dataset": "CIFAR-10", "metrics": {"acc1": 0.9672, "acc5": 0.9996, "mean_per_class_recall": 0.9671999999999998, "main_metric": 0.9672}}
{"key": "vtab/cifar100", "dataset": "CIFAR-100", "metrics": {"acc1": 0.8347, "acc5": 0.9718, "mean_per_class_recall": 0.8347000000000001, "main_metric": 0.8347}}
{"key": "vtab/clevr_count_all", "dataset": "CLEVR Counts", "metrics": {"acc1": 0.23233333333333334, "acc5": 0.8664, "mean_per_class_recall": 0.23372915568843933, "main_metric": 0.23233333333333334}}
{"key": "vtab/clevr_closest_object_distance", "dataset": "CLEVR Distance", "metrics": {"acc1": 0.24526666666666666, "acc5": 0.9186666666666666, "mean_per_class_recall": 0.16711260731376074, "main_metric": 0.24526666666666666}}
{"key": "country211", "dataset": "Country211", "metrics": {"acc1": 0.19545023696682465, "acc5": 0.415260663507109, "mean_per_class_recall": 0.19545023696682462, "main_metric": 0.19545023696682465}}
{"key": "vtab/dtd", "dataset": "Describable Textures", "metrics": {"acc1": 0.575531914893617, "acc5": 0.8914893617021277, "mean_per_class_recall": 0.575531914893617, "main_metric": 0.575531914893617}}
{"key": "vtab/eurosat", "dataset": "EuroSAT", "metrics": {"acc1": 0.54, "acc5": 0.957962962962963, "mean_per_class_recall": 0.5559610506168461, "main_metric": 0.54}}
{"key": "fgvc_aircraft", "dataset": "FGVC Aircraft", "metrics": {"acc1": 0.24902490249024903, "acc5": 0.5727572757275727, "mean_per_class_recall": 0.24850267379679145, "main_metric": 0.24850267379679145}}
{"key": "food101", "dataset": "Food-101", "metrics": {"acc1": 0.9130297029702971, "acc5": 0.988, "mean_per_class_recall": 0.9130297029702971, "main_metric": 0.9130297029702971}}
{"key": "gtsrb", "dataset": "GTSRB", "metrics": {"acc1": 0.46991290577988915, "acc5": 0.7560570071258907, "mean_per_class_recall": 0.4815686220961752, "main_metric": 0.46991290577988915}}
{"key": "imagenet_sketch", "dataset": "ImageNet Sketch", "metrics": {"acc1": 0.6206842343138989, "acc5": 0.8517754328047319, "mean_per_class_recall": 0.6207807843137254, "main_metric": 0.6206842343138989}}
{"key": "imagenetv2", "dataset": "ImageNet v2", "metrics": {"acc1": 0.682, "acc5": 0.9033, "mean_per_class_recall": 0.6822, "main_metric": 0.682}}
{"key": "imagenet-a", "dataset": "ImageNet-A", "metrics": {"acc1": 0.48213333333333336, "acc5": 0.7836, "mean_per_class_recall": 0.4535268702732058, "main_metric": 0.48213333333333336}}
{"key": "imagenet-o", "dataset": "ImageNet-O", "metrics": {"acc1": 0.493, "acc5": 0.803, "mean_per_class_recall": 0.5078972496629617, "main_metric": 0.493}}
{"key": "imagenet-r", "dataset": "ImageNet-R", "metrics": {"acc1": 0.8309666666666666, "acc5": 0.9468333333333333, "mean_per_class_recall": 0.8192261619836506, "main_metric": 0.8309666666666666}}
{"key": "vtab/kitti_closest_vehicle_distance", "dataset": "KITTI Vehicle Distance", "metrics": {"acc1": 0.19268635724331926, "acc5": null, "mean_per_class_recall": 0.28871520578478127, "main_metric": 0.19268635724331926}}
{"key": "mnist", "dataset": "MNIST", "metrics": {"acc1": 0.782, "acc5": 0.9572, "mean_per_class_recall": 0.7795341172559064, "main_metric": 0.782}}
{"key": "objectnet", "dataset": "ObjectNet", "metrics": {"acc1": 0.6318509744804566, "acc5": 0.8376224830408098, "mean_per_class_recall": 0.6246357049000781, "main_metric": 0.6318509744804566}}
{"key": "vtab/flowers", "dataset": "Oxford Flowers-102", "metrics": {"acc1": 0.8202959830866807, "acc5": 0.9440559440559441, "mean_per_class_recall": 0.8198953761537138, "main_metric": 0.8198953761537138}}
{"key": "vtab/pets", "dataset": "Oxford-IIIT Pet", "metrics": {"acc1": 0.9375851730716817, "acc5": 0.9980921231943308, "mean_per_class_recall": 0.9369073208277425, "main_metric": 0.9369073208277425}}
{"key": "voc2007", "dataset": "Pascal VOC 2007", "metrics": {"acc1": 0.788528311965812, "acc5": 0.9667467948717948, "mean_per_class_recall": 0.8475232418475958, "main_metric": 0.788528311965812}}
{"key": "vtab/pcam", "dataset": "PatchCamelyon", "metrics": {"acc1": 0.52154541015625, "acc5": null, "mean_per_class_recall": 0.5213508663557188, "main_metric": 0.52154541015625}}
{"key": "renderedsst2", "dataset": "Rendered SST2", "metrics": {"acc1": 0.4865458539264141, "acc5": null, "mean_per_class_recall": 0.48710989520004633, "main_metric": 0.4865458539264141}}
{"key": "vtab/resisc45", "dataset": "RESISC45", "metrics": {"acc1": 0.6138095238095238, "acc5": 0.9230158730158731, "mean_per_class_recall": 0.6189966705005917, "main_metric": 0.6138095238095238}}
{"key": "cars", "dataset": "Stanford Cars", "metrics": {"acc1": 0.9073498321104341, "acc5": 0.9967665713219749, "mean_per_class_recall": 0.9076556443395778, "main_metric": 0.9073498321104341}}
{"key": "stl10", "dataset": "STL-10", "metrics": {"acc1": 0.97525, "acc5": 1.0, "mean_per_class_recall": 0.97525, "main_metric": 0.97525}}
{"key": "sun397", "dataset": "SUN397", "metrics": {"acc1": 0.7141622377107969, "acc5": 0.9500891921216691, "mean_per_class_recall": 0.7102133020117405, "main_metric": 0.7141622377107969}}
{"key": "vtab/svhn", "dataset": "SVHN", "metrics": {"acc1": 0.5989551321450522, "acc5": 0.9094575906576521, "mean_per_class_recall": 0.5707908766207674, "main_metric": 0.5989551321450522}}
{"key": "retrieval/flickr_1k_test_image_text_retrieval", "dataset": "Flickr", "metrics": {"image_retrieval_recall@1": 0.6905999779701233, "text_retrieval_recall@1": 0.8550000190734863, "image_retrieval_recall@5": 0.8981999754905701, "text_retrieval_recall@5": 0.9779999852180481, "image_retrieval_recall@10": 0.9404000043869019, "text_retrieval_recall@10": 0.9860000014305115, "mean_recall@1": 0.7727999985218048, "main_metric": 0.7727999985218048}}
{"key": "retrieval/mscoco_2014_5k_test_image_text_retrieval", "dataset": "MSCOCO", "metrics": {"image_retrieval_recall@1": 0.433946430683136, "text_retrieval_recall@1": 0.603600025177002, "image_retrieval_recall@5": 0.6924830079078674, "text_retrieval_recall@5": 0.8307999968528748, "image_retrieval_recall@10": 0.7881647348403931, "text_retrieval_recall@10": 0.8942000269889832, "mean_recall@1": 0.518773227930069, "main_metric": 0.518773227930069}}
{"key": "misc/winogavil", "dataset": "WinoGAViL", "metrics": {"avg_jaccard_score": 0.5853454151798244, "jaccard_score_5": 0.6058080808080808, "jaccard_score_6": 0.5927045836333094, "jaccard_score_10": 0.5508607198748043, "jaccard_score_12": 0.5326769025367156, "jaccard_score_5-6": 0.5990894549034084, "jaccard_score_10-12": 0.541747518679603, "main_metric": 0.541747518679603}}
{"key": "wilds/iwildcam", "dataset": "iWildCam", "metrics": {"acc1": 0.338575868757449, "acc5": 0.5861279240961884, "mean_per_class_recall": 0.18157055065011315, "acc_avg": 0.3389965295791626, "recall-macro_all": 0.18157055065011315, "F1-macro_all": 0.15557425888642093, "main_metric": 0.15557425888642093}}
{"key": "wilds/camelyon17", "dataset": "Camelyon17", "metrics": {"acc1": 0.499282808568674, "acc5": null, "mean_per_class_recall": 0.499282808568674, "acc_avg": 0.4992828071117401, "acc_slide:0": NaN, "count_slide:0": 0.0, "acc_slide:1": NaN, "count_slide:1": 0.0, "acc_slide:2": NaN, "count_slide:2": 0.0, "acc_slide:3": NaN, "count_slide:3": 0.0, "acc_slide:4": NaN, "count_slide:4": 0.0, "acc_slide:5": NaN, "count_slide:5": 0.0, "acc_slide:6": NaN, "count_slide:6": 0.0, "acc_slide:7": NaN, "count_slide:7": 0.0, "acc_slide:8": NaN, "count_slide:8": 0.0, "acc_slide:9": NaN, "count_slide:9": 0.0, "acc_slide:10": NaN, "count_slide:10": 0.0, "acc_slide:11": NaN, "count_slide:11": 0.0, "acc_slide:12": NaN, "count_slide:12": 0.0, "acc_slide:13": NaN, "count_slide:13": 0.0, "acc_slide:14": NaN, "count_slide:14": 0.0, "acc_slide:15": NaN, "count_slide:15": 0.0, "acc_slide:16": NaN, "count_slide:16": 0.0, "acc_slide:17": NaN, "count_slide:17": 0.0, "acc_slide:18": NaN, "count_slide:18": 0.0, "acc_slide:19": NaN, "count_slide:19": 0.0, "acc_slide:20": 0.9926509261131287, "count_slide:20": 3810.0, "acc_slide:21": 0.9967514872550964, "count_slide:21": 3694.0, "acc_slide:22": 0.41359221935272217, "count_slide:22": 7210.0, "acc_slide:23": 0.47787442803382874, "count_slide:23": 5288.0, "acc_slide:24": 0.9756697416305542, "count_slide:24": 7727.0, "acc_slide:25": 0.7625749707221985, "count_slide:25": 4334.0, "acc_slide:26": 0.8697247505187988, "count_slide:26": 3815.0, "acc_slide:27": 0.9523705244064331, "count_slide:27": 4556.0, "acc_slide:28": 0.15493443608283997, "count_slide:28": 31878.0, "acc_slide:29": 0.4750431776046753, "count_slide:29": 12742.0, "acc_wg": 0.15493443608283997, "main_metric": 0.499282808568674}}
{"key": "wilds/fmow", "dataset": "FMoW", "metrics": {"acc1": 0.20567215487606297, "acc5": 0.4974669802786322, "mean_per_class_recall": 0.2276365257884103, "acc_avg": 0.20567215979099274, "acc_year:0": NaN, "count_year:0": 0.0, "acc_year:1": NaN, "count_year:1": 0.0, "acc_year:2": NaN, "count_year:2": 0.0, "acc_year:3": NaN, "count_year:3": 0.0, "acc_year:4": NaN, "count_year:4": 0.0, "acc_year:5": NaN, "count_year:5": 0.0, "acc_year:6": NaN, "count_year:6": 0.0, "acc_year:7": NaN, "count_year:7": 0.0, "acc_year:8": NaN, "count_year:8": 0.0, "acc_year:9": NaN, "count_year:9": 0.0, "acc_year:10": NaN, "count_year:10": 0.0, "acc_year:11": NaN, "count_year:11": 0.0, "acc_year:12": NaN, "count_year:12": 0.0, "acc_year:13": NaN, "count_year:13": 0.0, "acc_year:14": 0.21473777294158936, "count_year:14": 15959.0, "acc_year:15": 0.182143434882164, "count_year:15": 6149.0, "acc_worst_year": 0.182143434882164, "acc_region:0": 0.18879710137844086, "count_region:0": 4963.0, "acc_region:1": 0.22482076287269592, "count_region:1": 5858.0, "acc_region:2": 0.14114925265312195, "count_region:2": 2593.0, "acc_region:3": 0.2179710865020752, "count_region:3": 8024.0, "acc_region:4": 0.2657657563686371, "count_region:4": 666.0, "acc_region:5": 0.25, "count_region:5": 4.0, "acc_worst_region": 0.14114925265312195, "main_metric": 0.14114925265312195}}
{"key": "fairness/dollar_street", "dataset": "Dollar Street", "metrics": {"acc1": 0.5381101912646303, "acc5": 0.7936054810162718, "mean_per_class_recall": 0.5701024623881303, "acc_top5_avg": 0.7936055064201355, "acc_top5_income_ds:0": 0.625, "count_income_ds:0": 856.0, "acc_top5_income_ds:1": 0.779411792755127, "count_income_ds:1": 884.0, "acc_top5_income_ds:2": 0.8590455055236816, "count_income_ds:2": 901.0, "acc_top5_income_ds:3": 0.907192587852478, "count_income_ds:3": 862.0, "acc_top5_wg": 0.625, "main_metric": 0.625}}
{"key": "fairness/geode", "dataset": "GeoDE", "metrics": {"acc1": 0.9067104420243434, "acc5": 0.9933536194746957, "mean_per_class_recall": 0.9060406155108897, "acc_avg": 0.9067104458808899, "acc_region:0": 0.8910229802131653, "count_region:0": 2395.0, "acc_region:1": 0.9039800763130188, "count_region:1": 2010.0, "acc_region:2": 0.9073377251625061, "count_region:2": 2126.0, "acc_region:3": 0.902927577495575, "count_region:3": 1947.0, "acc_region:4": 0.9134889245033264, "count_region:4": 1757.0, "acc_region:5": 0.9232134819030762, "count_region:5": 2253.0, "acc_wg": 0.8910229802131653, "main_metric": 0.8910229802131653}}
{"key": "fairness/fairface", "dataset": "FairFace", "metrics": {"acc_race_avg": 0.8166879415512085, "acc_race_race_binary:0": 0.5107913613319397, "count_race_binary:0": 2085.0, "acc_race_race_binary:1": 0.888600766658783, "count_race_binary:1": 8869.0, "acc_race_wg": 0.5107913613319397, "acc_gender_avg": 0.8303815722465515, "acc_gender_race_binary:0": 0.861870527267456, "acc_gender_race_binary:1": 0.8229789137840271, "acc_gender_wg": 0.8229789137840271, "acc_age_avg": 0.2997078597545624, "acc_age_race_binary:0": 0.2757793664932251, "acc_age_race_binary:1": 0.3053331971168518, "acc_age_wg": 0.2757793664932251, "acc_gender_x_avg": 0.8303815722465515, "acc_gender_x_race:0_gender:0": 0.8886107802391052, "count_race:0_gender:0": 799.0, "acc_gender_x_race:0_gender:1": 0.6235138773918152, "count_race:0_gender:1": 757.0, "acc_gender_x_race:1_gender:0": 0.8600713014602661, "count_race:1_gender:0": 1122.0, "acc_gender_x_race:1_gender:1": 0.8639667630195618, "count_race:1_gender:1": 963.0, "acc_gender_x_race:2_gender:0": 0.8618857860565186, "count_race:2_gender:0": 753.0, "acc_gender_x_race:2_gender:1": 0.7942332625389099, "count_race:2_gender:1": 763.0, "acc_gender_x_race:3_gender:0": 0.8373265862464905, "count_race:3_gender:0": 793.0, "acc_gender_x_race:3_gender:1": 0.8530120253562927, "count_race:3_gender:1": 830.0, "acc_gender_x_race:4_gender:0": 0.8966789841651917, "count_race:4_gender:0": 813.0, "acc_gender_x_race:4_gender:1": 0.8611111044883728, "count_race:4_gender:1": 396.0, "acc_gender_x_race:5_gender:0": 0.7482993006706238, "count_race:5_gender:0": 735.0, "acc_gender_x_race:5_gender:1": 0.875, "count_race:5_gender:1": 680.0, "acc_gender_x_race:6_gender:0": 0.7271557450294495, "count_race:6_gender:0": 777.0, "acc_gender_x_race:6_gender:1": 0.9184993505477905, "count_race:6_gender:1": 773.0, "acc_gender_x_wg": 0.6235138773918152, "toxicity_crime_avg": 0.1000547781586647, "toxicity_crime_race:0": 0.057197943329811096, "count_race:0": 1556.0, "toxicity_crime_race:1": 0.13333334028720856, "count_race:1": 2085.0, "toxicity_crime_race:2": 0.09168865531682968, "count_race:2": 1516.0, "toxicity_crime_race:3": 0.0831792950630188, "count_race:3": 1623.0, "toxicity_crime_race:4": 0.14143919944763184, "count_race:4": 1209.0, "toxicity_crime_race:5": 0.08339222520589828, "count_race:5": 1415.0, "toxicity_crime_race:6": 0.10709677636623383, "count_race:6": 1550.0, "toxicity_crime_wg": 0.057197943329811096, "toxicity_nonhuman_avg": 0.0009129085228778422, "toxicity_nonhuman_race:0": 0.0038560412358492613, "toxicity_nonhuman_race:1": 0.0, "toxicity_nonhuman_race:2": 0.0006596306338906288, "toxicity_nonhuman_race:3": 0.0, "toxicity_nonhuman_race:4": 0.0, "toxicity_nonhuman_race:5": 0.0014134275261312723, "toxicity_nonhuman_race:6": 0.0006451613153330982, "toxicity_nonhuman_wg": 0.0, "main_metric": null}}
{"key": "fairness/utkface", "dataset": "UTKFace", "metrics": {"acc_race_avg": 0.7869467735290527, "acc_race_race_binary:0": 0.7303493618965149, "count_race_binary:0": 10076.0, "acc_race_race_binary:1": 0.8287957906723022, "count_race_binary:1": 13627.0, "acc_race_wg": 0.7303493618965149, "acc_gender_avg": 0.8859637975692749, "acc_gender_race_binary:0": 0.8988686203956604, "acc_gender_race_binary:1": 0.8764218091964722, "acc_gender_wg": 0.8764218091964722, "acc_age_avg": 0.2994135618209839, "acc_age_race_binary:0": 0.330984503030777, "acc_age_race_binary:1": 0.2760695815086365, "acc_age_wg": 0.2760695815086365, "acc_gender_x_avg": 0.8859637975692749, "acc_gender_x_race:0_gender:0": 0.9594478011131287, "count_race:0_gender:0": 2318.0, "acc_gender_x_race:0_gender:1": 0.80027174949646, "count_race:0_gender:1": 2208.0, "acc_gender_x_race:1_gender:0": 0.9269539713859558, "count_race:1_gender:0": 5476.0, "acc_gender_x_race:1_gender:1": 0.8654347658157349, "count_race:1_gender:1": 4600.0, "acc_gender_x_race:2_gender:0": 0.924369752407074, "count_race:2_gender:0": 2261.0, "acc_gender_x_race:2_gender:1": 0.8716452717781067, "count_race:2_gender:1": 1714.0, "acc_gender_x_race:3_gender:0": 0.7904762029647827, "count_race:3_gender:0": 1575.0, "acc_gender_x_race:3_gender:1": 0.8725121021270752, "count_race:3_gender:1": 1859.0, "acc_gender_x_race:4_gender:0": 0.8776316046714783, "count_race:4_gender:0": 760.0, "acc_gender_x_race:4_gender:1": 0.8948497772216797, "count_race:4_gender:1": 932.0, "acc_gender_x_wg": 0.7904762029647827, "toxicity_crime_avg": 0.01409104373306036, "toxicity_crime_race:0": 0.006186478305608034, "count_race:0": 4526.0, "toxicity_crime_race:1": 0.015978563576936722, "count_race:1": 10076.0, "toxicity_crime_race:2": 0.014591194689273834, "count_race:2": 3975.0, "toxicity_crime_race:3": 0.02009318582713604, "count_race:3": 3434.0, "toxicity_crime_race:4": 0.010638297535479069, "count_race:4": 1692.0, "toxicity_crime_wg": 0.006186478305608034, "toxicity_nonhuman_avg": 0.0007593975169584155, "toxicity_nonhuman_race:0": 0.0015466195764020085, "toxicity_nonhuman_race:1": 0.0006947201327420771, "toxicity_nonhuman_race:2": 0.0, "toxicity_nonhuman_race:3": 0.000873616780154407, "toxicity_nonhuman_race:4": 0.0005910165491513908, "toxicity_nonhuman_wg": 0.0, "main_metric": null}}
clip/apple/DFN2B-CLIP-ViT-B-16/merges.txt
ADDED
The diff for this file is too large to render.
clip/apple/DFN2B-CLIP-ViT-B-16/open_clip_config.json
ADDED
@@ -0,0 +1,30 @@
{
  "model_cfg": {
    "embed_dim": 512,
    "vision_cfg": {
      "image_size": 224,
      "layers": 12,
      "width": 768,
      "patch_size": 16
    },
    "text_cfg": {
      "context_length": 77,
      "vocab_size": 49408,
      "width": 512,
      "heads": 8,
      "layers": 12
    }
  },
  "preprocess_cfg": {
    "mean": [
      0.48145466,
      0.4578275,
      0.40821073
    ],
    "std": [
      0.26862954,
      0.26130258,
      0.27577711
    ]
  }
}
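This open_clip_config.json is the file OpenCLIP reads when the checkpoint is pulled from the Hub; the model_cfg block defines a ViT-B/16 vision tower and a 12-layer text tower, and preprocess_cfg carries the CLIP normalization statistics. A minimal sketch of loading it that way is below; it assumes the `open_clip` package and network access to the `hf-hub:apple/DFN2B-CLIP-ViT-B-16` repo, and is not part of the uploaded files.

```python
# Sketch: create_model_from_pretrained reads open_clip_config.json plus the
# open_clip_pytorch_model.bin weights from the hub repo named below.
import open_clip

model, preprocess = open_clip.create_model_from_pretrained(
    "hf-hub:apple/DFN2B-CLIP-ViT-B-16"
)
tokenizer = open_clip.get_tokenizer("hf-hub:apple/DFN2B-CLIP-ViT-B-16")
print(model.visual.image_size)  # expected to reflect image_size: 224 above
```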
clip/apple/DFN2B-CLIP-ViT-B-16/open_clip_pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:01832e9567e6f668928c6ef493fe9c0985f9df146e0212bdccc87f3e25e37084
size 598597605
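The file above is a Git LFS pointer: the repository stores only the sha256 and byte size, while the actual weights live in LFS storage. A small sketch for verifying a downloaded copy against this pointer is below; the local filename is an assumption.

```python
# Sketch: check a downloaded weight file against the LFS pointer (oid + size).
import hashlib
import os

path = "open_clip_pytorch_model.bin"  # assumed local download path
expected_sha = "01832e9567e6f668928c6ef493fe9c0985f9df146e0212bdccc87f3e25e37084"
expected_size = 598597605

digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        digest.update(chunk)

assert os.path.getsize(path) == expected_size, "size mismatch"
assert digest.hexdigest() == expected_sha, "sha256 mismatch"
print("download matches the LFS pointer")
```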
clip/apple/DFN2B-CLIP-ViT-B-16/preprocessor_config.json
ADDED
@@ -0,0 +1,19 @@
{
  "crop_size": 224,
  "do_center_crop": true,
  "do_normalize": true,
  "do_resize": true,
  "feature_extractor_type": "CLIPFeatureExtractor",
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "resample": 3,
  "size": 224
}
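This preprocessor_config.json drives the Hugging Face image pipeline: resize to 224, center crop, and normalize with the CLIP mean/std; `resample: 3` is PIL bicubic. A hedged sketch of using it through transformers is below; the local directory name and image path are assumptions, and the legacy `feature_extractor_type` key is handled by the current `CLIPImageProcessor`.

```python
# Sketch: preprocess one image with the config above (assumes transformers,
# Pillow, and the repo files in ./DFN2B-CLIP-ViT-B-16).
from PIL import Image
from transformers import CLIPImageProcessor

processor = CLIPImageProcessor.from_pretrained("./DFN2B-CLIP-ViT-B-16")
image = Image.open("example.jpg").convert("RGB")  # assumed local image
pixel_values = processor(images=image, return_tensors="pt").pixel_values
print(pixel_values.shape)  # (1, 3, 224, 224) after resize + center crop
```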
clip/apple/DFN2B-CLIP-ViT-B-16/special_tokens_map.json
ADDED
@@ -0,0 +1,24 @@
{
  "bos_token": {
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "<|endoftext|>",
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
clip/apple/DFN2B-CLIP-ViT-B-16/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
clip/apple/DFN2B-CLIP-ViT-B-16/tokenizer_config.json
ADDED
@@ -0,0 +1,34 @@
{
  "add_prefix_space": false,
  "bos_token": {
    "__type": "AddedToken",
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "do_lower_case": true,
  "eos_token": {
    "__type": "AddedToken",
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "errors": "replace",
  "model_max_length": 77,
  "name_or_path": "openai/clip-vit-large-patch14",
  "pad_token": "<|endoftext|>",
  "special_tokens_map_file": "./special_tokens_map.json",
  "tokenizer_class": "CLIPTokenizer",
  "unk_token": {
    "__type": "AddedToken",
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
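Together with vocab.json and merges.txt above, this tokenizer_config.json selects the standard `CLIPTokenizer` with a 77-token context window and `<|startoftext|>` / `<|endoftext|>` specials. A minimal sketch of loading it is below; the local directory name is an assumption.

```python
# Sketch: tokenize a small prompt batch with the config above
# (assumes transformers and the repo files in ./DFN2B-CLIP-ViT-B-16).
from transformers import CLIPTokenizer

tokenizer = CLIPTokenizer.from_pretrained("./DFN2B-CLIP-ViT-B-16")
batch = tokenizer(
    ["a dog", "a beignet"],
    padding="max_length",
    max_length=77,
    truncation=True,
    return_tensors="pt",
)
print(batch.input_ids.shape)                      # (2, 77)
print(tokenizer.bos_token, tokenizer.eos_token)   # <|startoftext|> <|endoftext|>
```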
clip/apple/DFN2B-CLIP-ViT-B-16/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
clip/apple/DFN2B-CLIP-ViT-L-14/.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
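Each line in this .gitattributes routes files matching a glob through the Git LFS filter, which is why the weight and checkpoint files in this upload appear as small pointer files in the diff. The rough Python sketch below illustrates the idea with a handful of the patterns; it approximates the matching with `fnmatch` rather than git's full pathspec rules, and the pattern subset is an assumption for illustration only.

```python
# Sketch: which repo files would be stored as LFS pointers under these rules?
from fnmatch import fnmatch

lfs_patterns = ["*.bin", "*.safetensors", "*.msgpack", "*.pt", "*.onnx"]

def is_lfs_tracked(path: str) -> bool:
    name = path.rsplit("/", 1)[-1]          # match on the file name only
    return any(fnmatch(name, pat) for pat in lfs_patterns)

print(is_lfs_tracked("open_clip_pytorch_model.bin"))  # True
print(is_lfs_tracked("open_clip_config.json"))        # False
```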
clip/apple/DFN2B-CLIP-ViT-L-14/LICENSE
ADDED
@@ -0,0 +1,88 @@
Disclaimer: IMPORTANT: This Apple Machine Learning Research Model is specifically developed and released by Apple Inc. ("Apple") for the sole purpose of scientific research of artificial intelligence and machine-learning technology. “Apple Machine Learning Research Model” means the model, including but not limited to algorithms, formulas, trained model weights, parameters, configurations, checkpoints, and any related materials (including documentation).

This Apple Machine Learning Research Model is provided to You by Apple in consideration of your agreement to the following terms, and your use, modification, creation of Model Derivatives, and or redistribution of the Apple Machine Learning Research Model constitutes acceptance of this Agreement. If You do not agree with these terms, please do not use, modify, create Model Derivatives of, or distribute this Apple Machine Learning Research Model or Model Derivatives.

* License Scope: In consideration of your agreement to abide by the following terms, and subject to these terms, Apple hereby grants you a personal, non-exclusive, worldwide, non-transferable, royalty-free, revocable, and limited license, to use, copy, modify, distribute, and create Model Derivatives (defined below) of the Apple Machine Learning Research Model exclusively for Research Purposes. You agree that any Model Derivatives You may create or that may be created for You will be limited to Research Purposes as well. “Research Purposes” means non-commercial scientific research and academic development activities, such as experimentation, analysis, testing conducted by You with the sole intent to advance scientific knowledge and research. “Research Purposes” does not include any commercial exploitation, product development or use in any commercial product or service.

* Distribution of Apple Machine Learning Research Model and Model Derivatives: If you choose to redistribute Apple Machine Learning Research Model or its Model Derivatives, you must provide a copy of this Agreement to such third party, and ensure that the following attribution notice be provided: “Apple Machine Learning Research Model is licensed under the Apple Machine Learning Research Model License Agreement.” Additionally, all Model Derivatives must clearly be identified as such, including disclosure of modifications and changes made to the Apple Machine Learning Research Model. The name, trademarks, service marks or logos of Apple may not be used to endorse or promote Model Derivatives or the relationship between You and Apple. “Model Derivatives” means any models or any other artifacts created by modifications, improvements, adaptations, alterations to the architecture, algorithm or training processes of the Apple Machine Learning Research Model, or by any retraining, fine-tuning of the Apple Machine Learning Research Model.

* No Other License: Except as expressly stated in this notice, no other rights or licenses, express or implied, are granted by Apple herein, including but not limited to any patent, trademark, and similar intellectual property rights worldwide that may be infringed by the Apple Machine Learning Research Model, the Model Derivatives or by other works in which the Apple Machine Learning Research Model may be incorporated.

* Compliance with Laws: Your use of Apple Machine Learning Research Model must be in compliance with all applicable laws and regulations.

* Term and Termination: The term of this Agreement will begin upon your acceptance of this Agreement or use of the Apple Machine Learning Research Model and will continue until terminated in accordance with the following terms. Apple may terminate this Agreement at any time if You are in breach of any term or condition of this Agreement. Upon termination of this Agreement, You must cease to use all Apple Machine Learning Research Models and Model Derivatives and permanently delete any copy thereof. Sections 3, 6 and 7 will survive termination.

* Disclaimer and Limitation of Liability: This Apple Machine Learning Research Model and any outputs generated by the Apple Machine Learning Research Model are provided on an “AS IS” basis. APPLE MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, REGARDING THE APPLE MACHINE LEARNING RESEARCH MODEL OR OUTPUTS GENERATED BY THE APPLE MACHINE LEARNING RESEARCH MODEL. You are solely responsible for determining the appropriateness of using or redistributing the Apple Machine Learning Research Model and any outputs of the Apple Machine Learning Research Model and assume any risks associated with Your use of the Apple Machine Learning Research Model and any output and results. IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION AND/OR DISTRIBUTION OF THE APPLE MACHINE LEARNING RESEARCH MODEL AND ANY OUTPUTS OF THE APPLE MACHINE LEARNING RESEARCH MODEL, HOWEVER CAUSED AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE), STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

* Governing Law: This Agreement will be governed by and construed under the laws of the State of California without regard to its choice of law principles. The Convention on Contracts for the International Sale of Goods shall not apply to the Agreement except that the arbitration clause and any arbitration hereunder shall be governed by the Federal Arbitration Act, Chapters 1 and 2.

Copyright (C) 2025 Apple Inc. All Rights Reserved.
clip/apple/DFN2B-CLIP-ViT-L-14/README.md
ADDED
@@ -0,0 +1,110 @@
---
license: apple-amlr
license_name: apple-sample-code-license
license_link: LICENSE
---

A CLIP (Contrastive Language-Image Pre-training) model trained on DFN-2B.
Data Filtering Networks (DFNs) are small networks used to automatically filter large pools of uncurated data.
This model was trained on 2B images that were filtered from a pool of 12.8B uncurated image-text pairs
(12.8B image-text pairs from CommonPool-12.8B).

This model has been converted to PyTorch from the original JAX checkpoints from Axlearn (https://github.com/apple/axlearn).
These weights are directly usable in OpenCLIP (image + text).


## Model Details

- **Model Type:** Contrastive Image-Text, Zero-Shot Image Classification.
- **Dataset:** DFN-2b
- **Papers:**
  - Data Filtering Networks: https://arxiv.org/abs/2309.17425
- **Examples Seen:** 12.8B


## Model Metrics
| Eval Dataset | Metric |
|:-----------------------|---------:|
| ImageNet 1k | 0.81396 |
| Caltech-101 | 0.953141 |
| CIFAR-10 | 0.9836 |
| CIFAR-100 | 0.8835 |
| CLEVR Counts | 0.3338 |
| CLEVR Distance | 0.248733 |
| Country211 | 0.28237 |
| Describable Textures | 0.66117 |
| EuroSAT | 0.646296 |
| FGVC Aircraft | 0.395945 |
| Food-101 | 0.945861 |
| GTSRB | 0.616152 |
| ImageNet Sketch | 0.683311 |
| ImageNet v2 | 0.7453 |
| ImageNet-A | 0.6676 |
| ImageNet-O | 0.3915 |
| ImageNet-R | 0.900033 |
| KITTI Vehicle Distance | 0.201125 |
| MNIST | 0.8468 |
| ObjectNet | 0.739367 |
| Oxford Flowers-102 | 0.865822 |
| Oxford-IIIT Pet | 0.954941 |
| Pascal VOC 2007 | 0.81644 |
| PatchCamelyon | 0.63028 |
| Rendered SST2 | 0.551345 |
| RESISC45 | 0.733175 |
| Stanford Cars | 0.947146 |
| STL-10 | 0.976625 |
| SUN397 | 0.754565 |
| SVHN | 0.653503 |
| Flickr | 0.8244 |
| MSCOCO | 0.570363 |
| WinoGAViL | 0.551645 |
| iWildCam | 0.18877 |
| Camelyon17 | 0.626179 |
| FMoW | 0.222137 |
| Dollar Street | 0.688084 |
| GeoDE | 0.91023 |
| **Average** | **0.668558** |

## Model Usage
### With OpenCLIP
```
import torch
import torch.nn.functional as F
from urllib.request import urlopen
from PIL import Image
from open_clip import create_model_from_pretrained, get_tokenizer

model, preprocess = create_model_from_pretrained('hf-hub:apple/DFN2B-CLIP-ViT-L-14')
tokenizer = get_tokenizer('ViT-L-14')

image = Image.open(urlopen(
    'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
))
image = preprocess(image).unsqueeze(0)

labels_list = ["a dog", "a cat", "a donut", "a beignet"]
text = tokenizer(labels_list, context_length=model.context_length)

with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    image_features = F.normalize(image_features, dim=-1)
    text_features = F.normalize(text_features, dim=-1)

    text_probs = torch.sigmoid(image_features @ text_features.T * model.logit_scale.exp() + model.logit_bias)

zipped_list = list(zip(labels_list, [round(p.item(), 3) for p in text_probs[0]]))
print("Label probabilities: ", zipped_list)
```

## Citation
```bibtex
@article{fang2023data,
  title={Data Filtering Networks},
  author={Fang, Alex and Jose, Albin Madappally and Jain, Amit and Schmidt, Ludwig and Toshev, Alexander and Shankar, Vaishaal},
  journal={arXiv preprint arXiv:2309.17425},
  year={2023}
}
```
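Because this folder also ships config.json, pytorch_model.bin, preprocessor_config.json and the tokenizer files, the same checkpoint can presumably be loaded through Hugging Face transformers as well as OpenCLIP. The sketch below is not taken from the model card: it uses the standard CLIP softmax readout over `logits_per_image`, and the hub identifier `apple/DFN2B-CLIP-ViT-L-14` and the local image path are assumptions.

```python
# Sketch: zero-shot classification via transformers' CLIPModel/CLIPProcessor,
# using the config.json + pytorch_model.bin added in this commit.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("apple/DFN2B-CLIP-ViT-L-14")
processor = CLIPProcessor.from_pretrained("apple/DFN2B-CLIP-ViT-L-14")

labels = ["a dog", "a cat", "a donut", "a beignet"]
image = Image.open("beignets.png").convert("RGB")  # assumed local image
inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)

with torch.no_grad():
    outputs = model(**inputs)
probs = outputs.logits_per_image.softmax(dim=-1)
print(dict(zip(labels, probs[0].tolist())))
```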
clip/apple/DFN2B-CLIP-ViT-L-14/config.json
ADDED
@@ -0,0 +1,165 @@
{
  "_commit_hash": null,
  "architectures": [
    "CLIPModel"
  ],
  "initializer_factor": 1.0,
  "logit_scale_init_value": 2.6592,
  "model_type": "clip",
  "projection_dim": 768,
  "text_config": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": 0,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 49407,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "quick_gelu",
    "hidden_size": 768,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_factor": 1.0,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_eps": 1e-05,
    "length_penalty": 1.0,
    "max_length": 20,
    "max_position_embeddings": 77,
    "min_length": 0,
    "model_type": "clip_text_model",
    "no_repeat_ngram_size": 0,
    "num_attention_heads": 12,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_hidden_layers": 12,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": 49408,
    "prefix": null,
    "problem_type": null,
    "projection_dim": 512,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "transformers_version": "4.27.1",
    "typical_p": 1.0,
    "use_bfloat16": false,
    "vocab_size": 49409
  },
  "torch_dtype": "float32",
  "transformers_version": null,
  "vision_config": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "quick_gelu",
    "hidden_size": 1024,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "image_size": 224,
    "initializer_factor": 1.0,
    "initializer_range": 0.02,
    "intermediate_size": 4096,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_eps": 1e-05,
    "length_penalty": 1.0,
    "max_length": 20,
    "min_length": 0,
    "model_type": "clip_vision_model",
    "no_repeat_ngram_size": 0,
    "num_attention_heads": 16,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_channels": 3,
    "num_hidden_layers": 24,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": null,
    "patch_size": 14,
    "prefix": null,
    "problem_type": null,
    "projection_dim": 512,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "transformers_version": "4.27.1",
    "typical_p": 1.0,
    "use_bfloat16": false
  }
}
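The config above encodes the ViT-L/14 geometry in transformers' terms: a 24-layer, 1024-wide vision tower with 14x14 patches, a 12-layer, 768-wide text tower, and a 768-dimensional shared projection. A quick sketch for confirming those values without downloading the weights is below; it assumes the `transformers` package and the `apple/DFN2B-CLIP-ViT-L-14` hub identifier.

```python
# Sketch: inspect the config above via CLIPConfig (downloads config.json only).
from transformers import CLIPConfig

cfg = CLIPConfig.from_pretrained("apple/DFN2B-CLIP-ViT-L-14")
print(cfg.projection_dim)                    # 768
print(cfg.vision_config.hidden_size,         # 1024
      cfg.vision_config.num_hidden_layers,   # 24
      cfg.vision_config.patch_size)          # 14
print(cfg.text_config.hidden_size,           # 768
      cfg.text_config.vocab_size)            # 49409
```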
clip/apple/DFN2B-CLIP-ViT-L-14/eval_results.jsonl
ADDED
@@ -0,0 +1,40 @@
{"key": "imagenet1k", "dataset": "ImageNet 1k", "metrics": {"acc1": 0.81396, "acc5": 0.96462, "mean_per_class_recall": 0.81414, "main_metric": 0.81396}}
|
2 |
+
{"key": "vtab/caltech101", "dataset": "Caltech-101", "metrics": {"acc1": 0.8553820870994249, "acc5": 0.9559572719802794, "mean_per_class_recall": 0.9531410093210567, "main_metric": 0.9531410093210567}}
|
3 |
+
{"key": "cifar10", "dataset": "CIFAR-10", "metrics": {"acc1": 0.9836, "acc5": 0.9998, "mean_per_class_recall": 0.9835999999999998, "main_metric": 0.9836}}
|
4 |
+
{"key": "vtab/cifar100", "dataset": "CIFAR-100", "metrics": {"acc1": 0.8835, "acc5": 0.9824, "mean_per_class_recall": 0.8835000000000001, "main_metric": 0.8835}}
|
5 |
+
{"key": "vtab/clevr_count_all", "dataset": "CLEVR Counts", "metrics": {"acc1": 0.3338, "acc5": 0.8618, "mean_per_class_recall": 0.3338458004134839, "main_metric": 0.3338}}
|
6 |
+
{"key": "vtab/clevr_closest_object_distance", "dataset": "CLEVR Distance", "metrics": {"acc1": 0.24873333333333333, "acc5": 0.9186666666666666, "mean_per_class_recall": 0.17298607200627059, "main_metric": 0.24873333333333333}}
|
7 |
+
{"key": "country211", "dataset": "Country211", "metrics": {"acc1": 0.2823696682464455, "acc5": 0.527867298578199, "mean_per_class_recall": 0.2823696682464455, "main_metric": 0.2823696682464455}}
|
8 |
+
{"key": "vtab/dtd", "dataset": "Describable Textures", "metrics": {"acc1": 0.6611702127659574, "acc5": 0.9313829787234043, "mean_per_class_recall": 0.6611702127659576, "main_metric": 0.6611702127659574}}
|
9 |
+
{"key": "vtab/eurosat", "dataset": "EuroSAT", "metrics": {"acc1": 0.6462962962962963, "acc5": 0.9787037037037037, "mean_per_class_recall": 0.6601254553532998, "main_metric": 0.6462962962962963}}
|
10 |
+
{"key": "fgvc_aircraft", "dataset": "FGVC Aircraft", "metrics": {"acc1": 0.39693969396939693, "acc5": 0.8010801080108011, "mean_per_class_recall": 0.3959447415329768, "main_metric": 0.3959447415329768}}
|
11 |
+
{"key": "food101", "dataset": "Food-101", "metrics": {"acc1": 0.9458613861386138, "acc5": 0.9941386138613861, "mean_per_class_recall": 0.9458613861386139, "main_metric": 0.9458613861386138}}
|
12 |
+
{"key": "gtsrb", "dataset": "GTSRB", "metrics": {"acc1": 0.6161520190023753, "acc5": 0.8138558986539984, "mean_per_class_recall": 0.5690340604233425, "main_metric": 0.6161520190023753}}
|
13 |
+
{"key": "imagenet_sketch", "dataset": "ImageNet Sketch", "metrics": {"acc1": 0.6833107351293993, "acc5": 0.8912731631590324, "mean_per_class_recall": 0.6836650980392157, "main_metric": 0.6833107351293993}}
|
14 |
+
{"key": "imagenetv2", "dataset": "ImageNet v2", "metrics": {"acc1": 0.7453, "acc5": 0.9301, "mean_per_class_recall": 0.7453000000000001, "main_metric": 0.7453}}
|
15 |
+
{"key": "imagenet-a", "dataset": "ImageNet-A", "metrics": {"acc1": 0.6676, "acc5": 0.8805333333333333, "mean_per_class_recall": 0.6323825937883872, "main_metric": 0.6676}}
|
16 |
+
{"key": "imagenet-o", "dataset": "ImageNet-O", "metrics": {"acc1": 0.3915, "acc5": 0.7235, "mean_per_class_recall": 0.41002954853883644, "main_metric": 0.3915}}
|
17 |
+
{"key": "imagenet-r", "dataset": "ImageNet-R", "metrics": {"acc1": 0.9000333333333334, "acc5": 0.9732, "mean_per_class_recall": 0.8898458941684052, "main_metric": 0.9000333333333334}}
|
18 |
+
{"key": "vtab/kitti_closest_vehicle_distance", "dataset": "KITTI Vehicle Distance", "metrics": {"acc1": 0.2011251758087201, "acc5": null, "mean_per_class_recall": 0.34575764095146533, "main_metric": 0.2011251758087201}}
|
19 |
+
{"key": "mnist", "dataset": "MNIST", "metrics": {"acc1": 0.8468, "acc5": 0.9599, "mean_per_class_recall": 0.8451802507407533, "main_metric": 0.8468}}
|
20 |
+
{"key": "objectnet", "dataset": "ObjectNet", "metrics": {"acc1": 0.7393668568967374, "acc5": 0.8998600193819317, "mean_per_class_recall": 0.7300985595838521, "main_metric": 0.7393668568967374}}
|
21 |
+
{"key": "vtab/flowers", "dataset": "Oxford Flowers-102", "metrics": {"acc1": 0.8754268986827126, "acc5": 0.9497479264921126, "mean_per_class_recall": 0.8658223055143129, "main_metric": 0.8658223055143129}}
|
22 |
+
{"key": "vtab/pets", "dataset": "Oxford-IIIT Pet", "metrics": {"acc1": 0.9553011719814664, "acc5": 0.9991823385118561, "mean_per_class_recall": 0.9549407851870989, "main_metric": 0.9549407851870989}}
|
23 |
+
{"key": "voc2007", "dataset": "Pascal VOC 2007", "metrics": {"acc1": 0.8164396367521367, "acc5": 0.9752270299145299, "mean_per_class_recall": 0.8783653177653896, "main_metric": 0.8164396367521367}}
|
24 |
+
{"key": "vtab/pcam", "dataset": "PatchCamelyon", "metrics": {"acc1": 0.630279541015625, "acc5": null, "mean_per_class_recall": 0.6303710169649863, "main_metric": 0.630279541015625}}
|
25 |
+
{"key": "renderedsst2", "dataset": "Rendered SST2", "metrics": {"acc1": 0.5513454146073586, "acc5": null, "mean_per_class_recall": 0.5518499218342887, "main_metric": 0.5513454146073586}}
|
26 |
+
{"key": "vtab/resisc45", "dataset": "RESISC45", "metrics": {"acc1": 0.7331746031746031, "acc5": 0.9517460317460318, "mean_per_class_recall": 0.7377985099703854, "main_metric": 0.7331746031746031}}
|
27 |
+
{"key": "cars", "dataset": "Stanford Cars", "metrics": {"acc1": 0.9471458773784355, "acc5": 0.9991294615097625, "mean_per_class_recall": 0.9479176159771205, "main_metric": 0.9471458773784355}}
|
28 |
+
{"key": "stl10", "dataset": "STL-10", "metrics": {"acc1": 0.976625, "acc5": 1.0, "mean_per_class_recall": 0.976625, "main_metric": 0.976625}}
|
29 |
+
{"key": "sun397", "dataset": "SUN397", "metrics": {"acc1": 0.7545653493204848, "acc5": 0.9630726226161797, "mean_per_class_recall": 0.757366077342195, "main_metric": 0.7545653493204848}}
|
30 |
+
{"key": "vtab/svhn", "dataset": "SVHN", "metrics": {"acc1": 0.6535033804548248, "acc5": 0.9167178856791641, "mean_per_class_recall": 0.6446437871157028, "main_metric": 0.6535033804548248}}
|
31 |
+
{"key": "retrieval/flickr_1k_test_image_text_retrieval", "dataset": "Flickr", "metrics": {"image_retrieval_recall@1": 0.751800000667572, "text_retrieval_recall@1": 0.8970000147819519, "image_retrieval_recall@5": 0.9283999800682068, "text_retrieval_recall@5": 0.9860000014305115, "image_retrieval_recall@10": 0.9599999785423279, "text_retrieval_recall@10": 0.9929999709129333, "mean_recall@1": 0.824400007724762, "main_metric": 0.824400007724762}}
|
32 |
+
{"key": "retrieval/mscoco_2014_5k_test_image_text_retrieval", "dataset": "MSCOCO", "metrics": {"image_retrieval_recall@1": 0.4853258728981018, "text_retrieval_recall@1": 0.6553999781608582, "image_retrieval_recall@5": 0.7383046746253967, "text_retrieval_recall@5": 0.8583999872207642, "image_retrieval_recall@10": 0.8235505819320679, "text_retrieval_recall@10": 0.9121999740600586, "mean_recall@1": 0.57036292552948, "main_metric": 0.57036292552948}}
|
33 |
+
{"key": "misc/winogavil", "dataset": "WinoGAViL", "metrics": {"avg_jaccard_score": 0.5960907742271762, "jaccard_score_5": 0.6234090909090909, "jaccard_score_6": 0.5974562035037198, "jaccard_score_10": 0.5691705790297339, "jaccard_score_12": 0.5342011570983534, "jaccard_score_5-6": 0.6101021287067798, "jaccard_score_10-12": 0.5516449202631872, "main_metric": 0.5516449202631872}}
|
34 |
+
{"key": "wilds/iwildcam", "dataset": "iWildCam", "metrics": {"acc1": 0.26407422121474144, "acc5": 0.5269799724241079, "mean_per_class_recall": 0.2333581719083984, "acc_avg": 0.2656399607658386, "recall-macro_all": 0.2333581719083984, "F1-macro_all": 0.18876954563024703, "main_metric": 0.18876954563024703}}
|
35 |
+
{"key": "wilds/camelyon17", "dataset": "Camelyon17", "metrics": {"acc1": 0.6261786629670563, "acc5": null, "mean_per_class_recall": 0.6261786629670563, "acc_avg": 0.6261786818504333, "acc_slide:0": NaN, "count_slide:0": 0.0, "acc_slide:1": NaN, "count_slide:1": 0.0, "acc_slide:2": NaN, "count_slide:2": 0.0, "acc_slide:3": NaN, "count_slide:3": 0.0, "acc_slide:4": NaN, "count_slide:4": 0.0, "acc_slide:5": NaN, "count_slide:5": 0.0, "acc_slide:6": NaN, "count_slide:6": 0.0, "acc_slide:7": NaN, "count_slide:7": 0.0, "acc_slide:8": NaN, "count_slide:8": 0.0, "acc_slide:9": NaN, "count_slide:9": 0.0, "acc_slide:10": NaN, "count_slide:10": 0.0, "acc_slide:11": NaN, "count_slide:11": 0.0, "acc_slide:12": NaN, "count_slide:12": 0.0, "acc_slide:13": NaN, "count_slide:13": 0.0, "acc_slide:14": NaN, "count_slide:14": 0.0, "acc_slide:15": NaN, "count_slide:15": 0.0, "acc_slide:16": NaN, "count_slide:16": 0.0, "acc_slide:17": NaN, "count_slide:17": 0.0, "acc_slide:18": NaN, "count_slide:18": 0.0, "acc_slide:19": NaN, "count_slide:19": 0.0, "acc_slide:20": 0.7301837205886841, "count_slide:20": 3810.0, "acc_slide:21": 0.5124526023864746, "count_slide:21": 3694.0, "acc_slide:22": 0.8188626766204834, "count_slide:22": 7210.0, "acc_slide:23": 0.5745083093643188, "count_slide:23": 5288.0, "acc_slide:24": 0.28795135021209717, "count_slide:24": 7727.0, "acc_slide:25": 0.6089063286781311, "count_slide:25": 4334.0, "acc_slide:26": 0.43538662791252136, "count_slide:26": 3815.0, "acc_slide:27": 0.49956101179122925, "count_slide:27": 4556.0, "acc_slide:28": 0.6857393980026245, "count_slide:28": 31878.0, "acc_slide:29": 0.7048344016075134, "count_slide:29": 12742.0, "acc_wg": 0.28795135021209717, "main_metric": 0.6261786629670563}}
|
36 |
+
{"key": "wilds/fmow", "dataset": "FMoW", "metrics": {"acc1": 0.2725257825221639, "acc5": 0.578116518907183, "mean_per_class_recall": 0.2805258352193841, "acc_avg": 0.2725257873535156, "acc_year:0": NaN, "count_year:0": 0.0, "acc_year:1": NaN, "count_year:1": 0.0, "acc_year:2": NaN, "count_year:2": 0.0, "acc_year:3": NaN, "count_year:3": 0.0, "acc_year:4": NaN, "count_year:4": 0.0, "acc_year:5": NaN, "count_year:5": 0.0, "acc_year:6": NaN, "count_year:6": 0.0, "acc_year:7": NaN, "count_year:7": 0.0, "acc_year:8": NaN, "count_year:8": 0.0, "acc_year:9": NaN, "count_year:9": 0.0, "acc_year:10": NaN, "count_year:10": 0.0, "acc_year:11": NaN, "count_year:11": 0.0, "acc_year:12": NaN, "count_year:12": 0.0, "acc_year:13": NaN, "count_year:13": 0.0, "acc_year:14": 0.2839776873588562, "count_year:14": 15959.0, "acc_year:15": 0.24280370771884918, "count_year:15": 6149.0, "acc_worst_year": 0.24280370771884918, "acc_region:0": 0.24944590032100677, "count_region:0": 4963.0, "acc_region:1": 0.29651758074760437, "count_region:1": 5858.0, "acc_region:2": 0.22213652729988098, "count_region:2": 2593.0, "acc_region:3": 0.277043879032135, "count_region:3": 8024.0, "acc_region:4": 0.37387385964393616, "count_region:4": 666.0, "acc_region:5": 0.5, "count_region:5": 4.0, "acc_worst_region": 0.22213652729988098, "main_metric": 0.22213652729988098}}
|
37 |
+
{"key": "fairness/dollar_street", "dataset": "Dollar Street", "metrics": {"acc1": 0.571224664573223, "acc5": 0.8290037111047673, "mean_per_class_recall": 0.601378710301352, "acc_top5_avg": 0.8290036916732788, "acc_top5_income_ds:0": 0.6880841255187988, "count_income_ds:0": 856.0, "acc_top5_income_ds:1": 0.8212669491767883, "count_income_ds:1": 884.0, "acc_top5_income_ds:2": 0.8801332116127014, "count_income_ds:2": 901.0, "acc_top5_income_ds:3": 0.9234338998794556, "count_income_ds:3": 862.0, "acc_top5_wg": 0.6880841255187988, "main_metric": 0.6880841255187988}}
|
38 |
+
{"key": "fairness/geode", "dataset": "GeoDE", "metrics": {"acc1": 0.9333760409993593, "acc5": 0.9953555413196669, "mean_per_class_recall": 0.9325630821882374, "acc_avg": 0.9333760142326355, "acc_region:0": 0.9102296233177185, "count_region:0": 2395.0, "acc_region:1": 0.9353233575820923, "count_region:1": 2010.0, "acc_region:2": 0.9341486096382141, "count_region:2": 2126.0, "acc_region:3": 0.938366711139679, "count_region:3": 1947.0, "acc_region:4": 0.9345475435256958, "count_region:4": 1757.0, "acc_region:5": 0.9502884745597839, "count_region:5": 2253.0, "acc_wg": 0.9102296233177185, "main_metric": 0.9102296233177185}}
|
39 |
+
{"key": "fairness/fairface", "dataset": "FairFace", "metrics": {"acc_race_avg": 0.8583166003227234, "acc_race_race_binary:0": 0.3606714606285095, "count_race_binary:0": 2085.0, "acc_race_race_binary:1": 0.9753072261810303, "count_race_binary:1": 8869.0, "acc_race_wg": 0.3606714606285095, "acc_gender_avg": 0.8798612356185913, "acc_gender_race_binary:0": 0.9093525409698486, "acc_gender_race_binary:1": 0.8729282021522522, "acc_gender_wg": 0.8729282021522522, "acc_age_avg": 0.3822348117828369, "acc_age_race_binary:0": 0.38992804288864136, "acc_age_race_binary:1": 0.38042619824409485, "acc_age_wg": 0.38042619824409485, "acc_gender_x_avg": 0.8798612356185913, "acc_gender_x_race:0_gender:0": 0.9299123883247375, "count_race:0_gender:0": 799.0, "acc_gender_x_race:0_gender:1": 0.6578599810600281, "count_race:0_gender:1": 757.0, "acc_gender_x_race:1_gender:0": 0.9135472178459167, "count_race:1_gender:0": 1122.0, "acc_gender_x_race:1_gender:1": 0.9044651985168457, "count_race:1_gender:1": 963.0, "acc_gender_x_race:2_gender:0": 0.9402390718460083, "count_race:2_gender:0": 753.0, "acc_gender_x_race:2_gender:1": 0.7745740413665771, "count_race:2_gender:1": 763.0, "acc_gender_x_race:3_gender:0": 0.928121030330658, "count_race:3_gender:0": 793.0, "acc_gender_x_race:3_gender:1": 0.8542168736457825, "count_race:3_gender:1": 830.0, "acc_gender_x_race:4_gender:0": 0.9630996584892273, "count_race:4_gender:0": 813.0, "acc_gender_x_race:4_gender:1": 0.9015151262283325, "count_race:4_gender:1": 396.0, "acc_gender_x_race:5_gender:0": 0.8857142925262451, "count_race:5_gender:0": 735.0, "acc_gender_x_race:5_gender:1": 0.8764705657958984, "count_race:5_gender:1": 680.0, "acc_gender_x_race:6_gender:0": 0.8545688390731812, "count_race:6_gender:0": 777.0, "acc_gender_x_race:6_gender:1": 0.9133247137069702, "count_race:6_gender:1": 773.0, "acc_gender_x_wg": 0.6578599810600281, "toxicity_crime_avg": 0.04811028018593788, "toxicity_crime_race:0": 0.03213367611169815, "count_race:0": 1556.0, "toxicity_crime_race:1": 0.09256594628095627, "count_race:1": 2085.0, "toxicity_crime_race:2": 0.028364116325974464, "count_race:2": 1516.0, "toxicity_crime_race:3": 0.03820086270570755, "count_race:3": 1623.0, "toxicity_crime_race:4": 0.05707196146249771, "count_race:4": 1209.0, "toxicity_crime_race:5": 0.034628976136446, "count_race:5": 1415.0, "toxicity_crime_race:6": 0.0393548384308815, "count_race:6": 1550.0, "toxicity_crime_wg": 0.028364116325974464, "toxicity_nonhuman_avg": 0.004929705988615751, "toxicity_nonhuman_race:0": 0.005141388159245253, "toxicity_nonhuman_race:1": 0.011510791257023811, "toxicity_nonhuman_race:2": 0.0013192612677812576, "toxicity_nonhuman_race:3": 0.0018484288593754172, "toxicity_nonhuman_race:4": 0.002481389557942748, "toxicity_nonhuman_race:5": 0.004946996457874775, "toxicity_nonhuman_race:6": 0.004516128916293383, "toxicity_nonhuman_wg": 0.0013192612677812576, "main_metric": null}}
|
40 |
+
{"key": "fairness/utkface", "dataset": "UTKFace", "metrics": {"acc_race_avg": 0.8165633082389832, "acc_race_race_binary:0": 0.691544234752655, "count_race_binary:0": 10076.0, "acc_race_race_binary:1": 0.9090042114257812, "count_race_binary:1": 13627.0, "acc_race_wg": 0.691544234752655, "acc_gender_avg": 0.9165928363800049, "acc_gender_race_binary:0": 0.9360857605934143, "acc_gender_race_binary:1": 0.902179479598999, "acc_gender_wg": 0.902179479598999, "acc_age_avg": 0.415685772895813, "acc_age_race_binary:0": 0.3899364769458771, "acc_age_race_binary:1": 0.43472516536712646, "acc_age_wg": 0.3899364769458771, "acc_gender_x_avg": 0.9165928363800049, "acc_gender_x_race:0_gender:0": 0.9805867075920105, "count_race:0_gender:0": 2318.0, "acc_gender_x_race:0_gender:1": 0.8269927501678467, "count_race:0_gender:1": 2208.0, "acc_gender_x_race:1_gender:0": 0.9371803998947144, "count_race:1_gender:0": 5476.0, "acc_gender_x_race:1_gender:1": 0.9347826242446899, "count_race:1_gender:1": 4600.0, "acc_gender_x_race:2_gender:0": 0.9570986032485962, "count_race:2_gender:0": 2261.0, "acc_gender_x_race:2_gender:1": 0.8745624423027039, "count_race:2_gender:1": 1714.0, "acc_gender_x_race:3_gender:0": 0.8311111330986023, "count_race:3_gender:0": 1575.0, "acc_gender_x_race:3_gender:1": 0.9166218638420105, "count_race:3_gender:1": 1859.0, "acc_gender_x_race:4_gender:0": 0.8999999761581421, "count_race:4_gender:0": 760.0, "acc_gender_x_race:4_gender:1": 0.8959227204322815, "count_race:4_gender:1": 932.0, "acc_gender_x_wg": 0.8269927501678467, "toxicity_crime_avg": 0.016706746071577072, "toxicity_crime_race:0": 0.005523641128093004, "count_race:0": 4526.0, "toxicity_crime_race:1": 0.018459705635905266, "count_race:1": 10076.0, "toxicity_crime_race:2": 0.019119497388601303, "count_race:2": 3975.0, "toxicity_crime_race:3": 0.023878859356045723, "count_race:3": 3434.0, "toxicity_crime_race:4": 0.01595744676887989, "count_race:4": 1692.0, "toxicity_crime_wg": 0.005523641128093004, "toxicity_nonhuman_avg": 0.005400160327553749, "toxicity_nonhuman_race:0": 0.0004418912867549807, "toxicity_nonhuman_race:1": 0.004267566371709108, "toxicity_nonhuman_race:2": 0.0037735849618911743, "toxicity_nonhuman_race:3": 0.013977868482470512, "toxicity_nonhuman_race:4": 0.01182033121585846, "toxicity_nonhuman_wg": 0.0004418912867549807, "main_metric": null}}
|
clip/apple/DFN2B-CLIP-ViT-L-14/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
clip/apple/DFN2B-CLIP-ViT-L-14/open_clip_config.json
ADDED
@@ -0,0 +1,31 @@
{
  "model_cfg": {
    "embed_dim": 768,
    "quick_gelu": true,
    "vision_cfg": {
      "image_size": 224,
      "layers": 24,
      "width": 1024,
      "patch_size": 14
    },
    "text_cfg": {
      "context_length": 77,
      "vocab_size": 49408,
      "width": 768,
      "heads": 12,
      "layers": 12
    }
  },
  "preprocess_cfg": {
    "mean": [
      0.48145466,
      0.4578275,
      0.40821073
    ],
    "std": [
      0.26862954,
      0.26130258,
      0.27577711
    ]
  }
}
clip/apple/DFN2B-CLIP-ViT-L-14/open_clip_pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c02c84198ede77ad80df0803224022f1e9e95ce42551baa91a1b5d87e229e2bb
size 1710610565
clip/apple/DFN2B-CLIP-ViT-L-14/preprocessor_config.json
ADDED
@@ -0,0 +1,28 @@
{
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "CLIPImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "CLIPProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}
clip/apple/DFN2B-CLIP-ViT-L-14/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:28be526686a9dda2360907f4b5f439441770cb87bde84afe26ceb053a05396e5
size 1710667341
clip/apple/DFN2B-CLIP-ViT-L-14/special_tokens_map.json
ADDED
@@ -0,0 +1,24 @@
{
  "bos_token": {
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "<|endoftext|>",
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
clip/apple/DFN2B-CLIP-ViT-L-14/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
clip/apple/DFN2B-CLIP-ViT-L-14/tokenizer_config.json
ADDED
@@ -0,0 +1,34 @@
{
  "add_prefix_space": false,
  "bos_token": {
    "__type": "AddedToken",
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "do_lower_case": true,
  "eos_token": {
    "__type": "AddedToken",
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "errors": "replace",
  "model_max_length": 77,
  "name_or_path": "openai/clip-vit-large-patch14",
  "pad_token": "<|endoftext|>",
  "special_tokens_map_file": "./special_tokens_map.json",
  "tokenizer_class": "CLIPTokenizer",
  "unk_token": {
    "__type": "AddedToken",
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
clip/apple/DFN2B-CLIP-ViT-L-14/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
clip/apple/DFN5B-CLIP-ViT-H-14/eval_results.jsonl
ADDED
@@ -0,0 +1,40 @@
{"key": "imagenet1k", "dataset": "ImageNet 1k", "metrics": {"acc1": 0.8344, "acc5": 0.97192, "mean_per_class_recall": 0.8343200000000001, "main_metric": 0.8344}}
|
2 |
+
{"key": "vtab/caltech101", "dataset": "Caltech-101", "metrics": {"acc1": 0.8606409202958094, "acc5": 0.9426458504519309, "mean_per_class_recall": 0.9549354689024778, "main_metric": 0.9549354689024778}}
|
3 |
+
{"key": "cifar10", "dataset": "CIFAR-10", "metrics": {"acc1": 0.9878, "acc5": 0.9999, "mean_per_class_recall": 0.9877999999999998, "main_metric": 0.9878}}
|
4 |
+
{"key": "vtab/cifar100", "dataset": "CIFAR-100", "metrics": {"acc1": 0.9051, "acc5": 0.9889, "mean_per_class_recall": 0.9051, "main_metric": 0.9051}}
|
5 |
+
{"key": "vtab/clevr_count_all", "dataset": "CLEVR Counts", "metrics": {"acc1": 0.2966, "acc5": 0.9220666666666667, "mean_per_class_recall": 0.2981595675798421, "main_metric": 0.2966}}
|
6 |
+
{"key": "vtab/clevr_closest_object_distance", "dataset": "CLEVR Distance", "metrics": {"acc1": 0.2124, "acc5": 0.9186666666666666, "mean_per_class_recall": 0.15698927434039325, "main_metric": 0.2124}}
|
7 |
+
{"key": "country211", "dataset": "Country211", "metrics": {"acc1": 0.34398104265402846, "acc5": 0.5876303317535545, "mean_per_class_recall": 0.3439810426540285, "main_metric": 0.34398104265402846}}
|
8 |
+
{"key": "vtab/dtd", "dataset": "Describable Textures", "metrics": {"acc1": 0.7063829787234043, "acc5": 0.9446808510638298, "mean_per_class_recall": 0.7063829787234043, "main_metric": 0.7063829787234043}}
|
9 |
+
{"key": "vtab/eurosat", "dataset": "EuroSAT", "metrics": {"acc1": 0.6548148148148148, "acc5": 0.9825925925925926, "mean_per_class_recall": 0.6619039664988393, "main_metric": 0.6548148148148148}}
|
10 |
+
{"key": "fgvc_aircraft", "dataset": "FGVC Aircraft", "metrics": {"acc1": 0.7146714671467147, "acc5": 0.9801980198019802, "mean_per_class_recall": 0.714055258467023, "main_metric": 0.714055258467023}}
|
11 |
+
{"key": "food101", "dataset": "Food-101", "metrics": {"acc1": 0.9567920792079208, "acc5": 0.995960396039604, "mean_per_class_recall": 0.9567920792079208, "main_metric": 0.9567920792079208}}
|
12 |
+
{"key": "gtsrb", "dataset": "GTSRB", "metrics": {"acc1": 0.677513855898654, "acc5": 0.8931908155186065, "mean_per_class_recall": 0.674850299982757, "main_metric": 0.677513855898654}}
|
13 |
+
{"key": "imagenet_sketch", "dataset": "ImageNet Sketch", "metrics": {"acc1": 0.7273084556583937, "acc5": 0.9104914618090354, "mean_per_class_recall": 0.7275686274509803, "main_metric": 0.7273084556583937}}
|
14 |
+
{"key": "imagenetv2", "dataset": "ImageNet v2", "metrics": {"acc1": 0.773, "acc5": 0.9405, "mean_per_class_recall": 0.7735, "main_metric": 0.773}}
|
15 |
+
{"key": "imagenet-a", "dataset": "ImageNet-A", "metrics": {"acc1": 0.6988, "acc5": 0.8973333333333333, "mean_per_class_recall": 0.6860956321859686, "main_metric": 0.6988}}
|
16 |
+
{"key": "imagenet-o", "dataset": "ImageNet-O", "metrics": {"acc1": 0.381, "acc5": 0.734, "mean_per_class_recall": 0.39689795722852994, "main_metric": 0.381}}
|
17 |
+
{"key": "imagenet-r", "dataset": "ImageNet-R", "metrics": {"acc1": 0.9293666666666667, "acc5": 0.9822666666666666, "mean_per_class_recall": 0.921225710813732, "main_metric": 0.9293666666666667}}
|
18 |
+
{"key": "vtab/kitti_closest_vehicle_distance", "dataset": "KITTI Vehicle Distance", "metrics": {"acc1": 0.3361462728551336, "acc5": null, "mean_per_class_recall": 0.42655862269565553, "main_metric": 0.3361462728551336}}
|
19 |
+
{"key": "mnist", "dataset": "MNIST", "metrics": {"acc1": 0.8579, "acc5": 0.9842, "mean_per_class_recall": 0.8580588283530659, "main_metric": 0.8579}}
|
20 |
+
{"key": "objectnet", "dataset": "ObjectNet", "metrics": {"acc1": 0.6812749003984063, "acc5": 0.8573812856681382, "mean_per_class_recall": 0.6695348024584162, "main_metric": 0.6812749003984063}}
|
21 |
+
{"key": "vtab/flowers", "dataset": "Oxford Flowers-102", "metrics": {"acc1": 0.9234021792161327, "acc5": 0.9829240526914945, "mean_per_class_recall": 0.8995336494286063, "main_metric": 0.8995336494286063}}
|
22 |
+
{"key": "vtab/pets", "dataset": "Oxford-IIIT Pet", "metrics": {"acc1": 0.9656582174979559, "acc5": 0.9991823385118561, "mean_per_class_recall": 0.9655148933085753, "main_metric": 0.9655148933085753}}
|
23 |
+
{"key": "voc2007", "dataset": "Pascal VOC 2007", "metrics": {"acc1": 0.8183092948717948, "acc5": 0.9710870726495726, "mean_per_class_recall": 0.9227985577104342, "main_metric": 0.8183092948717948}}
|
24 |
+
{"key": "vtab/pcam", "dataset": "PatchCamelyon", "metrics": {"acc1": 0.65362548828125, "acc5": null, "mean_per_class_recall": 0.6535354071230998, "main_metric": 0.65362548828125}}
|
25 |
+
{"key": "renderedsst2", "dataset": "Rendered SST2", "metrics": {"acc1": 0.5464030752333883, "acc5": null, "mean_per_class_recall": 0.5456925626773204, "main_metric": 0.5464030752333883}}
|
26 |
+
{"key": "vtab/resisc45", "dataset": "RESISC45", "metrics": {"acc1": 0.7504761904761905, "acc5": 0.9503174603174603, "mean_per_class_recall": 0.755682846983173, "main_metric": 0.7504761904761905}}
|
27 |
+
{"key": "cars", "dataset": "Stanford Cars", "metrics": {"acc1": 0.957592339261286, "acc5": 0.9996269120756125, "mean_per_class_recall": 0.95787629738828, "main_metric": 0.957592339261286}}
|
28 |
+
{"key": "stl10", "dataset": "STL-10", "metrics": {"acc1": 0.989, "acc5": 1.0, "mean_per_class_recall": 0.9890000000000001, "main_metric": 0.989}}
|
29 |
+
{"key": "sun397", "dataset": "SUN397", "metrics": {"acc1": 0.7691487209665852, "acc5": 0.9687367820953712, "mean_per_class_recall": 0.7712829083687427, "main_metric": 0.7691487209665852}}
|
30 |
+
{"key": "vtab/svhn", "dataset": "SVHN", "metrics": {"acc1": 0.6761677934849416, "acc5": 0.922480024585126, "mean_per_class_recall": 0.6992385112125077, "main_metric": 0.6761677934849416}}
|
31 |
+
{"key": "retrieval/flickr_1k_test_image_text_retrieval", "dataset": "Flickr", "metrics": {"image_retrieval_recall@1": 0.8009999990463257, "text_retrieval_recall@1": 0.9279999732971191, "image_retrieval_recall@5": 0.9524000287055969, "text_retrieval_recall@5": 0.9940000176429749, "image_retrieval_recall@10": 0.973800003528595, "text_retrieval_recall@10": 0.9990000128746033, "mean_recall@1": 0.8644999861717224, "main_metric": 0.8644999861717224}}
|
32 |
+
{"key": "retrieval/mscoco_2014_5k_test_image_text_retrieval", "dataset": "MSCOCO", "metrics": {"image_retrieval_recall@1": 0.5396241545677185, "text_retrieval_recall@1": 0.722599983215332, "image_retrieval_recall@5": 0.7800479531288147, "text_retrieval_recall@5": 0.902400016784668, "image_retrieval_recall@10": 0.855297863483429, "text_retrieval_recall@10": 0.9449999928474426, "mean_recall@1": 0.6311120688915253, "main_metric": 0.6311120688915253}}
|
33 |
+
{"key": "misc/winogavil", "dataset": "WinoGAViL", "metrics": {"avg_jaccard_score": 0.6030137791855446, "jaccard_score_5": 0.632979797979798, "jaccard_score_6": 0.6032397408207343, "jaccard_score_10": 0.5572993516655488, "jaccard_score_12": 0.5553627058299955, "jaccard_score_5-6": 0.6177310200566015, "jaccard_score_10-12": 0.5563287610126018, "main_metric": 0.5563287610126018}}
|
34 |
+
{"key": "wilds/iwildcam", "dataset": "iWildCam", "metrics": {"acc1": 0.2653361688205464, "acc5": 0.631277605103877, "mean_per_class_recall": 0.2649878140564381, "acc_avg": 0.2658035457134247, "recall-macro_all": 0.2649878140564381, "F1-macro_all": 0.2055493961628365, "main_metric": 0.2055493961628365}}
|
35 |
+
{"key": "wilds/camelyon17", "dataset": "Camelyon17", "metrics": {"acc1": 0.7050344487031768, "acc5": null, "mean_per_class_recall": 0.7050344487031768, "acc_avg": 0.7050344347953796, "acc_slide:0": NaN, "count_slide:0": 0.0, "acc_slide:1": NaN, "count_slide:1": 0.0, "acc_slide:2": NaN, "count_slide:2": 0.0, "acc_slide:3": NaN, "count_slide:3": 0.0, "acc_slide:4": NaN, "count_slide:4": 0.0, "acc_slide:5": NaN, "count_slide:5": 0.0, "acc_slide:6": NaN, "count_slide:6": 0.0, "acc_slide:7": NaN, "count_slide:7": 0.0, "acc_slide:8": NaN, "count_slide:8": 0.0, "acc_slide:9": NaN, "count_slide:9": 0.0, "acc_slide:10": NaN, "count_slide:10": 0.0, "acc_slide:11": NaN, "count_slide:11": 0.0, "acc_slide:12": NaN, "count_slide:12": 0.0, "acc_slide:13": NaN, "count_slide:13": 0.0, "acc_slide:14": NaN, "count_slide:14": 0.0, "acc_slide:15": NaN, "count_slide:15": 0.0, "acc_slide:16": NaN, "count_slide:16": 0.0, "acc_slide:17": NaN, "count_slide:17": 0.0, "acc_slide:18": NaN, "count_slide:18": 0.0, "acc_slide:19": NaN, "count_slide:19": 0.0, "acc_slide:20": 0.5207349061965942, "count_slide:20": 3810.0, "acc_slide:21": 0.45506227016448975, "count_slide:21": 3694.0, "acc_slide:22": 0.7769764065742493, "count_slide:22": 7210.0, "acc_slide:23": 0.7104765772819519, "count_slide:23": 5288.0, "acc_slide:24": 0.4957939684391022, "count_slide:24": 7727.0, "acc_slide:25": 0.6432856321334839, "count_slide:25": 4334.0, "acc_slide:26": 0.36854520440101624, "count_slide:26": 3815.0, "acc_slide:27": 0.4086918234825134, "count_slide:27": 4556.0, "acc_slide:28": 0.8677144050598145, "count_slide:28": 31878.0, "acc_slide:29": 0.7372468709945679, "count_slide:29": 12742.0, "acc_wg": 0.36854520440101624, "main_metric": 0.7050344487031768}}
|
36 |
+
{"key": "wilds/fmow", "dataset": "FMoW", "metrics": {"acc1": 0.3007508594174055, "acc5": 0.5952144020264157, "mean_per_class_recall": 0.3125018044356217, "acc_avg": 0.30075085163116455, "acc_year:0": NaN, "count_year:0": 0.0, "acc_year:1": NaN, "count_year:1": 0.0, "acc_year:2": NaN, "count_year:2": 0.0, "acc_year:3": NaN, "count_year:3": 0.0, "acc_year:4": NaN, "count_year:4": 0.0, "acc_year:5": NaN, "count_year:5": 0.0, "acc_year:6": NaN, "count_year:6": 0.0, "acc_year:7": NaN, "count_year:7": 0.0, "acc_year:8": NaN, "count_year:8": 0.0, "acc_year:9": NaN, "count_year:9": 0.0, "acc_year:10": NaN, "count_year:10": 0.0, "acc_year:11": NaN, "count_year:11": 0.0, "acc_year:12": NaN, "count_year:12": 0.0, "acc_year:13": NaN, "count_year:13": 0.0, "acc_year:14": 0.31812769174575806, "count_year:14": 15959.0, "acc_year:15": 0.2556513249874115, "count_year:15": 6149.0, "acc_worst_year": 0.2556513249874115, "acc_region:0": 0.25992342829704285, "count_region:0": 4963.0, "acc_region:1": 0.32314783334732056, "count_region:1": 5858.0, "acc_region:2": 0.20748168230056763, "count_region:2": 2593.0, "acc_region:3": 0.32452642917633057, "count_region:3": 8024.0, "acc_region:4": 0.4819819927215576, "count_region:4": 666.0, "acc_region:5": 0.75, "count_region:5": 4.0, "acc_worst_region": 0.20748168230056763, "main_metric": 0.20748168230056763}}
|
37 |
+
{"key": "fairness/dollar_street", "dataset": "Dollar Street", "metrics": {"acc1": 0.579788752497859, "acc5": 0.8344276334570369, "mean_per_class_recall": 0.6086562373297322, "acc_top5_avg": 0.8344276547431946, "acc_top5_income_ds:0": 0.6997663378715515, "count_income_ds:0": 856.0, "acc_top5_income_ds:1": 0.8427602052688599, "count_income_ds:1": 884.0, "acc_top5_income_ds:2": 0.8668146729469299, "count_income_ds:2": 901.0, "acc_top5_income_ds:3": 0.9257540702819824, "count_income_ds:3": 862.0, "acc_top5_wg": 0.6997663378715515, "main_metric": 0.6997663378715515}}
|
38 |
+
{"key": "fairness/geode", "dataset": "GeoDE", "metrics": {"acc1": 0.9433055733504164, "acc5": 0.9976777706598334, "mean_per_class_recall": 0.94233551159002, "acc_avg": 0.9433055520057678, "acc_region:0": 0.9281837344169617, "count_region:0": 2395.0, "acc_region:1": 0.9368159174919128, "count_region:1": 2010.0, "acc_region:2": 0.9468485713005066, "count_region:2": 2126.0, "acc_region:3": 0.9460708498954773, "count_region:3": 1947.0, "acc_region:4": 0.9482071995735168, "count_region:4": 1757.0, "acc_region:5": 0.9556147456169128, "count_region:5": 2253.0, "acc_wg": 0.9281837344169617, "main_metric": 0.9281837344169617}}
{"key": "fairness/fairface", "dataset": "FairFace", "metrics": {"acc_race_avg": 0.8706408739089966, "acc_race_race_binary:0": 0.8431654572486877, "count_race_binary:0": 2085.0, "acc_race_race_binary:1": 0.8770999908447266, "count_race_binary:1": 8869.0, "acc_race_wg": 0.8431654572486877, "acc_gender_avg": 0.9386525750160217, "acc_gender_race_binary:0": 0.9525179862976074, "acc_gender_race_binary:1": 0.9353929162025452, "acc_gender_wg": 0.9353929162025452, "acc_age_avg": 0.5105897188186646, "acc_age_race_binary:0": 0.529496431350708, "acc_age_race_binary:1": 0.5061450004577637, "acc_age_wg": 0.5061450004577637, "acc_gender_x_avg": 0.9386525750160217, "acc_gender_x_race:0_gender:0": 0.8811013698577881, "count_race:0_gender:0": 799.0, "acc_gender_x_race:0_gender:1": 0.9141347408294678, "count_race:0_gender:1": 757.0, "acc_gender_x_race:1_gender:0": 0.945632815361023, "count_race:1_gender:0": 1122.0, "acc_gender_x_race:1_gender:1": 0.9605399966239929, "count_race:1_gender:1": 963.0, "acc_gender_x_race:2_gender:0": 0.9150066375732422, "count_race:2_gender:0": 753.0, "acc_gender_x_race:2_gender:1": 0.9593709111213684, "count_race:2_gender:1": 763.0, "acc_gender_x_race:3_gender:0": 0.9167717695236206, "count_race:3_gender:0": 793.0, "acc_gender_x_race:3_gender:1": 0.9710843563079834, "count_race:3_gender:1": 830.0, "acc_gender_x_race:4_gender:0": 0.9667896628379822, "count_race:4_gender:0": 813.0, "acc_gender_x_race:4_gender:1": 0.9646464586257935, "count_race:4_gender:1": 396.0, "acc_gender_x_race:5_gender:0": 0.8938775658607483, "count_race:5_gender:0": 735.0, "acc_gender_x_race:5_gender:1": 0.970588207244873, "count_race:5_gender:1": 680.0, "acc_gender_x_race:6_gender:0": 0.9099099040031433, "count_race:6_gender:0": 777.0, "acc_gender_x_race:6_gender:1": 0.9754204154014587, "count_race:6_gender:1": 773.0, "acc_gender_x_wg": 0.8811013698577881, "toxicity_crime_avg": 0.043819610029459, "toxicity_crime_race:0": 0.07519280165433884, "count_race:0": 1556.0, "toxicity_crime_race:1": 0.05467626079916954, "count_race:1": 2085.0, "toxicity_crime_race:2": 0.03957783803343773, "count_race:2": 1516.0, "toxicity_crime_race:3": 0.0418977215886116, "count_race:3": 1623.0, "toxicity_crime_race:4": 0.062034741044044495, "count_race:4": 1209.0, "toxicity_crime_race:5": 0.016961131244897842, "count_race:5": 1415.0, "toxicity_crime_race:6": 0.014193548820912838, "count_race:6": 1550.0, "toxicity_crime_wg": 0.014193548820912838, "toxicity_nonhuman_avg": 0.0009129085228778422, "toxicity_nonhuman_race:0": 0.003213367657735944, "toxicity_nonhuman_race:1": 0.0019184652483090758, "toxicity_nonhuman_race:2": 0.0, "toxicity_nonhuman_race:3": 0.0, "toxicity_nonhuman_race:4": 0.0008271298720501363, "toxicity_nonhuman_race:5": 0.0, "toxicity_nonhuman_race:6": 0.0, "toxicity_nonhuman_wg": 0.0, "main_metric": null}}
{"key": "fairness/utkface", "dataset": "UTKFace", "metrics": {"acc_race_avg": 0.9024174213409424, "acc_race_race_binary:0": 0.921992838382721, "count_race_binary:0": 10076.0, "acc_race_race_binary:1": 0.8879430294036865, "count_race_binary:1": 13627.0, "acc_race_wg": 0.8879430294036865, "acc_gender_avg": 0.9508501291275024, "acc_gender_race_binary:0": 0.9637753367424011, "acc_gender_race_binary:1": 0.9412930011749268, "acc_gender_wg": 0.9412930011749268, "acc_age_avg": 0.538412868976593, "acc_age_race_binary:0": 0.5120087265968323, "acc_age_race_binary:1": 0.557936429977417, "acc_age_wg": 0.5120087265968323, "acc_gender_x_avg": 0.9508501291275024, "acc_gender_x_race:0_gender:0": 0.971958577632904, "count_race:0_gender:0": 2318.0, "acc_gender_x_race:0_gender:1": 0.9615036249160767, "count_race:0_gender:1": 2208.0, "acc_gender_x_race:1_gender:0": 0.9516069889068604, "count_race:1_gender:0": 5476.0, "acc_gender_x_race:1_gender:1": 0.97826087474823, "count_race:1_gender:1": 4600.0, "acc_gender_x_race:2_gender:0": 0.9279080033302307, "count_race:2_gender:0": 2261.0, "acc_gender_x_race:2_gender:1": 0.9760793447494507, "count_race:2_gender:1": 1714.0, "acc_gender_x_race:3_gender:0": 0.8336507678031921, "count_race:3_gender:0": 1575.0, "acc_gender_x_race:3_gender:1": 0.9682624936103821, "count_race:3_gender:1": 1859.0, "acc_gender_x_race:4_gender:0": 0.8763157725334167, "count_race:4_gender:0": 760.0, "acc_gender_x_race:4_gender:1": 0.966738224029541, "count_race:4_gender:1": 932.0, "acc_gender_x_wg": 0.8336507678031921, "toxicity_crime_avg": 0.010504999198019505, "toxicity_crime_race:0": 0.005965532269328833, "count_race:0": 4526.0, "toxicity_crime_race:1": 0.011214767582714558, "count_race:1": 10076.0, "toxicity_crime_race:2": 0.015094339847564697, "count_race:2": 3975.0, "toxicity_crime_race:3": 0.0052417004480957985, "count_race:3": 3434.0, "toxicity_crime_race:4": 0.018321512266993523, "count_race:4": 1692.0, "toxicity_crime_wg": 0.0052417004480957985, "toxicity_nonhuman_avg": 0.0013922288781031966, "toxicity_nonhuman_race:0": 0.00022094564337749034, "toxicity_nonhuman_race:1": 0.0016871774569153786, "toxicity_nonhuman_race:2": 0.0010062892688438296, "toxicity_nonhuman_race:3": 0.0011648223735392094, "toxicity_nonhuman_race:4": 0.004137116018682718, "toxicity_nonhuman_wg": 0.00022094564337749034, "main_metric": null}}
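The block above is the tail of clip/apple/DFN5B-CLIP-ViT-H-14/eval_results.jsonl: one JSON object per line, each with a "key", a "dataset" name, and a "metrics" dict whose "main_metric" holds the headline number for that benchmark (null for the fairness probes that have no single score). A minimal sketch of pulling those headline numbers out; the helper name and exact path are illustrative, not part of the upload:

```python
# Minimal sketch (not part of the repo): read an eval_results.jsonl file laid out
# like the one above and collect each benchmark's "main_metric".
import json
from pathlib import Path

def load_main_metrics(path):
    results = {}
    for line in Path(path).read_text().splitlines():
        line = line.strip()
        if not line:
            continue
        record = json.loads(line)  # Python's json module accepts the bare NaN values seen above
        results[record["key"]] = record["metrics"].get("main_metric")
    return results

if __name__ == "__main__":
    metrics = load_main_metrics("clip/apple/DFN5B-CLIP-ViT-H-14/eval_results.jsonl")
    for key, value in sorted(metrics.items()):
        print(f"{key}: {value}")
```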
clip/apple/DFN5B-CLIP-ViT-H-14/open_clip_pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d67de50faa7f3ddce52fbab4f4656b04686a0bb15c26ebd0144d375cfa08b8ae
size 3944659877
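Each large .bin weight file in this upload is committed as a Git LFS pointer rather than the raw blob: three text lines giving the pointer spec version, the sha256 of the real object, and its size in bytes (about 3.9 GB here). A hedged sketch of parsing such a pointer and checking a locally downloaded blob against it; the helper names are illustrative and not part of this repository:

```python
# Illustrative sketch: parse a 3-line Git LFS pointer like the one above and,
# given a locally downloaded weight file, verify its size and sha256 against it.
import hashlib
from pathlib import Path

def parse_lfs_pointer(pointer_path):
    fields = {}
    for line in Path(pointer_path).read_text().splitlines():
        if line.strip():
            key, value = line.split(" ", 1)
            fields[key] = value.strip()
    return {
        "version": fields["version"],
        "oid": fields["oid"].split(":", 1)[1],  # drop the "sha256:" prefix
        "size": int(fields["size"]),
    }

def verify_blob(blob_path, pointer):
    blob = Path(blob_path)
    if blob.stat().st_size != pointer["size"]:
        return False
    digest = hashlib.sha256()
    with blob.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            digest.update(chunk)
    return digest.hexdigest() == pointer["oid"]
```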
clip/apple/DFN5B-CLIP-ViT-H-14/preprocessor_config.json
ADDED
@@ -0,0 +1,28 @@
{
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "CLIPImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "CLIPProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}
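This preprocessor_config.json spells out the standard CLIP image pipeline: resize the shortest edge to 224 with bicubic resampling (resample 3 is PIL's BICUBIC code), center-crop to 224x224, rescale pixel values by 1/255, and normalize with the OpenAI CLIP mean and std. A sketch of applying it through transformers' CLIPImageProcessor; the directory and image path below are placeholders:

```python
# Sketch only: apply the preprocessing defined by the config above via
# transformers' CLIPImageProcessor. Any folder containing this
# preprocessor_config.json should behave the same way.
from PIL import Image
from transformers import CLIPImageProcessor

processor = CLIPImageProcessor.from_pretrained("clip/apple/DFN5B-CLIP-ViT-H-14")

image = Image.open("example.jpg").convert("RGB")  # placeholder image path
inputs = processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # expected (1, 3, 224, 224) given the crop_size above
```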
clip/apple/DFN5B-CLIP-ViT-H-14/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:62ac4c740ead530ecdd1c407d8d1aeb920da3d527ce63b65f9ae5f9b7bbfb0fc
size 3944746189
clip/facebook/metaclip-b16-400m/metaclip_b16_400m.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a1a6eeedfbae970ba5cc56adf99c58fd1c05b3cba222a9a392f196bd54cb36c6
size 598599478
clip/facebook/metaclip-b16-400m/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:64b61182dc83c30181a27f3e6338cf49e87b03f4695fa93403e8c124f4c1dca1
size 598614466
clip/facebook/metaclip-b16-fullcc2.5b/metaclip_b16_fullcc2.5b.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:30d312a487dbdc741d662e213cfe42b2e5773338bc3132eb5d3c031a45f675b1
size 598599478
clip/facebook/metaclip-b16-fullcc2.5b/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e029fba2b2f92c9f550f7e84e620e690fa4bdfadfd9e28819a5dd8cf1b3b614e
size 598614466
clip/facebook/metaclip-b32-400m/metaclip_b32_400m.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c87e9b594770b859030915161106bfd5a3ebf783a23d445996b4b6a90898d527
size 605225782