omarelshehy commited on
Commit
749f1cf
·
verified ·
1 Parent(s): 34e3db4

Upload folder using huggingface_hub

Browse files
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
.idea/aws.xml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="accountSettings">
4
+ <option name="activeRegion" value="us-east-1" />
5
+ <option name="recentlyUsedRegions">
6
+ <list>
7
+ <option value="us-east-1" />
8
+ </list>
9
+ </option>
10
+ </component>
11
+ </project>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="Black">
4
+ <option name="sdkName" value="Python 3.12 (pythonProject)" />
5
+ </component>
6
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (pythonProject)" project-jdk-type="Python SDK" />
7
+ </project>
.idea/modernbert_mlm_sts.iml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="inheritedJdk" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ </component>
8
+ </module>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/modernbert_mlm_sts.iml" filepath="$PROJECT_DIR$/.idea/modernbert_mlm_sts.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="" vcs="Git" />
5
+ </component>
6
+ </project>
.idea/workspace.xml ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="AutoImportSettings">
4
+ <option name="autoReloadType" value="SELECTIVE" />
5
+ </component>
6
+ <component name="ChangeListManager">
7
+ <list default="true" id="59391769-f550-47d2-b479-30410ad79396" name="Changes" comment="">
8
+ <change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" />
9
+ <change beforePath="$PROJECT_DIR$/config.json" beforeDir="false" afterPath="$PROJECT_DIR$/config.json" afterDir="false" />
10
+ <change beforePath="$PROJECT_DIR$/optimizer.pt" beforeDir="false" />
11
+ <change beforePath="$PROJECT_DIR$/rng_state.pth" beforeDir="false" />
12
+ <change beforePath="$PROJECT_DIR$/scaler.pt" beforeDir="false" />
13
+ <change beforePath="$PROJECT_DIR$/scheduler.pt" beforeDir="false" />
14
+ <change beforePath="$PROJECT_DIR$/trainer_state.json" beforeDir="false" />
15
+ <change beforePath="$PROJECT_DIR$/training_args.bin" beforeDir="false" />
16
+ </list>
17
+ <option name="SHOW_DIALOG" value="false" />
18
+ <option name="HIGHLIGHT_CONFLICTS" value="true" />
19
+ <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
20
+ <option name="LAST_RESOLUTION" value="IGNORE" />
21
+ </component>
22
+ <component name="Git.Settings">
23
+ <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
24
+ </component>
25
+ <component name="ProjectColorInfo"><![CDATA[{
26
+ "associatedIndex": 8
27
+ }]]></component>
28
+ <component name="ProjectId" id="2u5uX6wr1oiVSfgNfNopm0F96iP" />
29
+ <component name="ProjectViewState">
30
+ <option name="hideEmptyMiddlePackages" value="true" />
31
+ <option name="showLibraryContents" value="true" />
32
+ </component>
33
+ <component name="PropertiesComponent"><![CDATA[{
34
+ "keyToString": {
35
+ "RunOnceActivity.ShowReadmeOnStart": "true",
36
+ "git-widget-placeholder": "main",
37
+ "last_opened_file_path": "/Users/omarelshehy/modernbert_mlm_sts"
38
+ }
39
+ }]]></component>
40
+ <component name="SharedIndexes">
41
+ <attachedChunks>
42
+ <set>
43
+ <option value="bundled-python-sdk-abccbe9d933a-ba05f1cad1b1-com.jetbrains.pycharm.community.sharedIndexes.bundled-PC-242.20224.428" />
44
+ </set>
45
+ </attachedChunks>
46
+ </component>
47
+ <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
48
+ <component name="TaskManager">
49
+ <task active="true" id="Default" summary="Default task">
50
+ <changelist id="59391769-f550-47d2-b479-30410ad79396" name="Changes" comment="" />
51
+ <created>1741550792489</created>
52
+ <option name="number" value="Default" />
53
+ <option name="presentableId" value="Default" />
54
+ <updated>1741550792489</updated>
55
+ </task>
56
+ <servers />
57
+ </component>
58
+ </project>
1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false,
9
+ "include_prompt": true
10
+ }
README.md CHANGED
@@ -1,3 +1,190 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - sentence-transformers
4
+ - sentence-similarity
5
+ - feature-extraction
6
+ - generated_from_trainer
7
+ - loss:MultipleNegativesRankingLoss
8
+ - mteb
9
+ base_model: NAMAA-Space/AraModernBert-Base-V1.0
10
+ pipeline_tag: sentence-similarity
11
+ library_name: sentence-transformers
12
+ metrics:
13
+ - pearson_cosine
14
+ - spearman_cosine
15
+ model-index:
16
+ - name: NAMAA-Space/AraModernBert-Base-V1.0
17
+ results:
18
+ - dataset:
19
+ config: ar-ar
20
+ name: MTEB STS17 (ar-ar)
21
+ revision: faeb762787bd10488a50c8b5be4a3b82e411949c
22
+ split: test
23
+ type: mteb/sts17-crosslingual-sts
24
+ metrics:
25
+ - type: pearson
26
+ value: 82.4888
27
+ - type: spearman
28
+ value: 83.0981
29
+ - type: cosine_pearson
30
+ value: 82.4888
31
+ - type: cosine_spearman
32
+ value: 83.1109
33
+ - type: manhattan_pearson
34
+ value: 81.2051
35
+ - type: manhattan_spearman
36
+ value: 83.0197
37
+ - type: euclidean_pearson
38
+ value: 81.1013
39
+ - type: euclidean_spearman
40
+ value: 82.8922
41
+ - type: main_score
42
+ value: 83.1109
43
+ task:
44
+ type: STS
45
+ - dataset:
46
+ config: ar
47
+ name: MTEB STS22.v2 (ar)
48
+ revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd
49
+ split: test
50
+ type: mteb/sts22-crosslingual-sts
51
+ metrics:
52
+ - type: pearson
53
+ value: 52.58540000000001
54
+ - type: spearman
55
+ value: 61.7371
56
+ - type: cosine_pearson
57
+ value: 52.58540000000001
58
+ - type: cosine_spearman
59
+ value: 61.7371
60
+ - type: manhattan_pearson
61
+ value: 55.887299999999996
62
+ - type: manhattan_spearman
63
+ value: 61.3654
64
+ - type: euclidean_pearson
65
+ value: 55.633500000000005
66
+ - type: euclidean_spearman
67
+ value: 61.2124
68
+ - type: main_score
69
+ value: 61.7371
70
+ task:
71
+ type: STS
72
+ ---
73
+
74
+ # SentenceTransformer based on NAMAA-Space/AraModernBert-Base-V1.0
75
+
76
+ This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [NAMAA-Space/AraModernBert-Base-V1.0](https://huggingface.co/NAMAA-Space/AraModernBert-Base-V1.0). It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
77
+
78
+ ## Model Details
79
+
80
+ ### Model Description
81
+ - **Model Type:** Sentence Transformer
82
+ - **Base model:** [NAMAA-Space/AraModernBert-Base-V1.0](https://huggingface.co/NAMAA-Space/AraModernBert-Base-V1.0) <!-- at revision b2db86686c6c03497db32a63187be8954cc34013 -->
83
+ - **Maximum Sequence Length:** 512 tokens
84
+ - **Output Dimensionality:** 768 dimensions
85
+ - **Similarity Function:** Cosine Similarity
86
+
87
+ ### Full Model Architecture
88
+
89
+ ```
90
+ SentenceTransformer(
91
+ (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: ModernBertModel
92
+ (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
93
+ )
94
+ ```
95
+
96
+ ## Usage
97
+
98
+ ### Direct Usage (Sentence Transformers)
99
+
100
+ First install the Sentence Transformers library:
101
+
102
+ ```bash
103
+ pip install -U sentence-transformers
104
+ ```
105
+
106
+ Then you can load this model and run inference.
107
+ ```python
108
+ from sentence_transformers import SentenceTransformer
109
+
110
+ # Download from the 🤗 Hub (replace the placeholder ID below with this model's Hub repository ID)
111
+ model = SentenceTransformer("sentence_transformers_model_id")
112
+ # Run inference
113
+ sentences = [
114
+ 'الذكاء الاصطناعي يغير طريقة تفاعلنا مع التكنولوجيا.',
115
+ 'التكنولوجيا تتطور بسرعة بفضل الذكاء الاصطناعي.',
116
+ 'الذكاء الاصطناعي يسهم في تطوير التطبيقات الذكية.',
117
+ ]
118
+ embeddings = model.encode(sentences)
119
+ print(embeddings.shape)
120
+ # [3, 768]
121
+
122
+ # Get the similarity scores for the embeddings
123
+ similarities = model.similarity(embeddings, embeddings)
124
+ print(similarities.shape)
125
+ # [3, 3]
126
+ ```
127
+
128
+ ## Evaluation
129
+
130
+ ### Metrics
131
+
132
+ #### Semantic Similarity
133
+
134
+ * Datasets: `STS17` and `STS22.v2`
135
+ * Evaluated with [<code>EmbeddingSimilarityEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.EmbeddingSimilarityEvaluator)
136
+
137
+ | Metric | STS17 | STS22.v2 |
138
+ |:--------------------|:----------|:-----------|
139
+ | pearson_cosine | 0.8249 | 0.5259 |
140
+ | **spearman_cosine** | **0.831** | **0.6169** |
141
+
142
+ <!--
143
+ ## Bias, Risks and Limitations
144
+
145
+ *What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
146
+ -->
147
+
148
+ <!--
149
+ ### Recommendations
150
+
151
+ *What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
152
+ -->
153
+
154
+ ### Framework Versions
155
+ - Python: 3.10.12
156
+ - Sentence Transformers: 3.4.1
157
+ - Transformers: 4.49.0
158
+ - PyTorch: 2.1.0+cu118
159
+ - Accelerate: 1.4.0
160
+ - Datasets: 2.21.0
161
+ - Tokenizers: 0.21.0
162
+
163
+ ## Citation
164
+
165
+ ### BibTeX
166
+
167
+ #### Sentence Transformers
168
+ ```bibtex
169
+ @inproceedings{reimers-2019-sentence-bert,
170
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
171
+ author = "Reimers, Nils and Gurevych, Iryna",
172
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
173
+ month = "11",
174
+ year = "2019",
175
+ publisher = "Association for Computational Linguistics",
176
+ url = "https://arxiv.org/abs/1908.10084",
177
+ }
178
+ ```
179
+
180
+ #### MultipleNegativesRankingLoss
181
+ ```bibtex
182
+ @misc{henderson2017efficient,
183
+ title={Efficient Natural Language Response Suggestion for Smart Reply},
184
+ author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
185
+ year={2017},
186
+ eprint={1705.00652},
187
+ archivePrefix={arXiv},
188
+ primaryClass={cs.CL}
189
+ }
190
+ ```
config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "NAMAA-Space/AraModernBert-Base-V1.0",
3
+ "additional_special_tokens_ids": [],
4
+ "architectures": [
5
+ "ModernBertModel"
6
+ ],
7
+ "attention_bias": false,
8
+ "attention_dropout": 0.0,
9
+ "bos_token_id": null,
10
+ "classifier_activation": "gelu",
11
+ "classifier_bias": false,
12
+ "classifier_dropout": 0.0,
13
+ "classifier_pooling": "mean",
14
+ "cls_token_id": 3,
15
+ "decoder_bias": true,
16
+ "deterministic_flash_attn": false,
17
+ "embedding_dropout": 0.0,
18
+ "eos_token_id": null,
19
+ "global_attn_every_n_layers": 3,
20
+ "global_rope_theta": 160000.0,
21
+ "gradient_checkpointing": false,
22
+ "hidden_activation": "gelu",
23
+ "hidden_size": 768,
24
+ "initializer_cutoff_factor": 2.0,
25
+ "initializer_range": 0.02,
26
+ "intermediate_size": 1152,
27
+ "layer_norm_eps": 1e-05,
28
+ "local_attention": 128,
29
+ "local_rope_theta": 10000.0,
30
+ "mask_token_id": 6,
31
+ "max_position_embeddings": 8192,
32
+ "mlp_bias": false,
33
+ "mlp_dropout": 0.0,
34
+ "model_type": "modernbert",
35
+ "norm_bias": false,
36
+ "norm_eps": 1e-05,
37
+ "num_attention_heads": 12,
38
+ "num_hidden_layers": 22,
39
+ "pad_token_id": 5,
40
+ "position_embedding_type": "absolute",
41
+ "reference_compile": false,
42
+ "repad_logits_with_grad": false,
43
+ "sep_token_id": 4,
44
+ "sparse_pred_ignore_index": -100,
45
+ "sparse_prediction": false,
46
+ "tokenizer_class": "PreTrainedTokenizerFast",
47
+ "torch_dtype": "float32",
48
+ "transformers_version": "4.49.0",
49
+ "unk_token_id": 2,
50
+ "vocab_size": 50280
51
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "3.4.1",
4
+ "transformers": "4.49.0",
5
+ "pytorch": "2.1.0+cu118"
6
+ },
7
+ "prompts": {},
8
+ "default_prompt_name": null,
9
+ "similarity_fn_name": "cosine"
10
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a840401f05761949351c98fd7d02602b0684be881786711e14591238e60bd4dd
3
+ size 595799800
modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ }
14
+ ]
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 512,
3
+ "do_lower_case": false
4
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": true,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<|padding|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<|endoftext|>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[UNK]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[CLS]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[SEP]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "[PAD]",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "6": {
52
+ "content": "[MASK]",
53
+ "lstrip": true,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ }
59
+ },
60
+ "clean_up_tokenization_spaces": true,
61
+ "cls_token": "[CLS]",
62
+ "extra_special_tokens": {},
63
+ "mask_token": "[MASK]",
64
+ "model_input_names": [
65
+ "input_ids",
66
+ "attention_mask"
67
+ ],
68
+ "model_max_length": 8192,
69
+ "pad_token": "[PAD]",
70
+ "sep_token": "[SEP]",
71
+ "tokenizer_class": "PreTrainedTokenizer",
72
+ "unk_token": "[UNK]"
73
+ }