|
|
"""
|
|
|
ZamAI Simple Multilingual Embeddings Demo

This script demonstrates embedding sentences in multiple languages, including
Pashto, and comparing them by cosine similarity.
|
|
|
"""
|
|
|
from sentence_transformers import SentenceTransformer
|
|
|
import numpy as np
|
|
|
|
|
|
def cosine_similarity(vec1, vec2):
    """Calculate the cosine similarity between two vectors.

    Args:
        vec1: First vector (any input accepted by ``np.dot`` and
            ``np.linalg.norm``).
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        The dot product of the vectors divided by the product of their
        norms. If either vector has zero norm the similarity is undefined;
        ``0.0`` is returned in that case instead of ``nan``.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard against 0/0, which would otherwise yield nan plus a RuntimeWarning.
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator
|
|
|
|
|
|
def print_similarities(model, sentences, query_idx=0):
    """Print similarity scores between a query sentence and all other sentences.

    Args:
        model: Object whose ``encode(sentences)`` returns one embedding per
            sentence, indexable by position (a ``SentenceTransformer`` in
            this script).
        sentences: Sequence of sentences to embed and compare.
        query_idx: Position of the query sentence within ``sentences``.
            Negative values count from the end, as in normal indexing.

    Raises:
        IndexError: If ``query_idx`` is out of range for ``sentences``.
    """
    # Resolve a negative index to its non-negative position so that the
    # "skip the query itself" test in the loop below actually matches.
    # Indexing the range also rejects out-of-range values before encoding.
    query_idx = range(len(sentences))[query_idx]

    embeddings = model.encode(sentences)
    query_embedding = embeddings[query_idx]

    print(f"Query: '{sentences[query_idx]}'")
    print("Similarities:")

    for i, sentence in enumerate(sentences):
        if i == query_idx:
            continue
        similarity = cosine_similarity(query_embedding, embeddings[i])
        print(f"- {similarity:.4f}: '{sentence}'")
    print()
|
|
|
|
|
|
def main():
    """Load the multilingual model and print each similarity demo section."""
    print("Loading multilingual embedding model...")
    model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    model = SentenceTransformer(model_name)
    print(f"Model loaded: {model_name}")

    english_sentences = [
        "This is a sample sentence in English.",
        "This sentence is similar to the first one.",
        "This sentence has nothing to do with the others.",
    ]
    pashto_sentences = [
        "دا په پښتو کې یوه نمونه جمله ده.",
        "دا جمله د لومړۍ جملې سره ورته ده.",
        "دا جمله د نورو سره هېڅ تړاو نلري.",
    ]
    cross_lingual_sentences = [
        "This is a sample sentence in English.",
        "دا په پښتو کې یوه نمونه جمله ده.",
        "I'm learning to speak Pashto.",
        "زه د پښتو ژبې زده کړه کوم.",
    ]

    # Each entry: (section heading, sentences to embed, index of the query).
    # The mixed-language list is used twice, once per query language.
    sections = (
        ("\n===== English-English Similarity =====", english_sentences, 0),
        ("\n===== Pashto-Pashto Similarity =====", pashto_sentences, 0),
        (
            "\n===== Cross-lingual Similarity (English-Pashto) =====",
            cross_lingual_sentences,
            0,
        ),
        (
            "\n===== Cross-lingual Similarity (Pashto-English) =====",
            cross_lingual_sentences,
            1,
        ),
    )

    for heading, section_sentences, query_position in sections:
        print(heading)
        print_similarities(model, section_sentences, query_idx=query_position)
|
|
|
|
|
|
# Run the demo only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|