In [1]:
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
modelPath = "embedding_model"

model = SentenceTransformer('all-mpnet-base-v2')
model.save(modelPath)
model = SentenceTransformer(modelPath)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [5]:
from spark_setup import create_spark_session, load_data, file_paths
from pyspark.sql import functions as F

In [6]:
spark = create_spark_session()

# Load Spark data
dataframes = load_data(spark, file_paths)
spark_details = {
    "SparkSession": "In-Memory",
    "SparkContext": str(spark.sparkContext),
    "Spark UI": spark.sparkContext.uiWebUrl,
    "Version": spark.version,
    "Master": spark.sparkContext.master,
    "AppName": spark.sparkContext.appName
}

In [16]:
df = dataframes["geospatial_clustering_data"]
titles = df.select("title_x").rdd.flatMap(lambda x: x).collect()

In [15]:
df.head(5)

[Row(_c0=0, Unnamed: 0_x=0, publication_id='998a43f1-b808-46c9-8a11-e0edddcadb8a', affiliation_id=60029445, header='University of Minnesota Twin Cities', city='MN', country='United States', title_x='Wer Viewership and Queer Imag(in)ing: Thai Soap Opera Shadow of Love and Boys Love Media', publication_year='2024', Unnamed: 0_y='0', title_y='Wer Viewership and Queer Imag(in)ing: Thai Soap Opera Shadow of Love and Boys Love Media', abstract='This article brings film/media theory into Southeast Asian research through a revisionist queer approach. It contains two goals: addressing some recent developments about queer imag(in)ing in Thai media whilst reappraising the fundamental question of spectatorship via screen theory. Taking into account the more general issue of media specificity and the particular textual device of identity/gender-switch in several recent Thai television serials, we propose the notion of wer viewership: a mode of viewing practice that features viewer-text interaction 

In [35]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances


embeddings = np.load("embeddings.npy")

input = "Game"

# Encode the input query
input_embedding = model.encode(input)

# Calculate similarity scores
cos_similarities = cosine_similarity([input_embedding], embeddings)[0]

threshold = 0.38  # Set your desired threshold

# Find indices where cosine similarity exceeds the threshold
selected_indices = np.where(cos_similarities >= threshold)[0]

# Sort the selected indices based on cosine similarity in descending order
sorted_indices = selected_indices[np.argsort(cos_similarities[selected_indices])[::-1]]

# Get the top 10 and last 10 titles
top_10_titles = [titles[i] for i in sorted_indices[:10]]
last_10_titles = [titles[i] for i in sorted_indices[-10:]]

print("Number of Selected Indices:", sorted_indices.shape)
print("Top 10 Titles:", top_10_titles)
print("Last 10 Titles:", last_10_titles)

Number of Selected Indices: (3,)
Top 10 Titles: ['Factors affecting repositioning policy compliance: an integrative review', 'Identification, Management, and Outcomes of Combination Antiretroviral Treatment Failure in Adolescents with Perinatal Human Immunodeficiency Virus Infection in Asia', 'Development and Validation of a Pan-Genotypic Real-Time Quantitative Reverse Transcription-PCR Assay To Detect Canine Distemper Virus and Phocine Distemper Virus in Domestic Animals and Wildlife']
Last 10 Titles: ['Factors affecting repositioning policy compliance: an integrative review', 'Identification, Management, and Outcomes of Combination Antiretroviral Treatment Failure in Adolescents with Perinatal Human Immunodeficiency Virus Infection in Asia', 'Development and Validation of a Pan-Genotypic Real-Time Quantitative Reverse Transcription-PCR Assay To Detect Canine Distemper Virus and Phocine Distemper Virus in Domestic Animals and Wildlife']


In [36]:
publications_df["publication_id"].shape

(19999,)

In [37]:
index_to_pub_id

{0: '998a43f1-b808-46c9-8a11-e0edddcadb8a',
 1: '44389cca-52a8-4efb-b501-dff66d4c8e04',
 2: 'f2d4527d-88d4-4f91-9857-fd60d347af84',
 3: '0b554e30-f32a-4e26-8dcf-6831bab2875e',
 4: '93e8dd90-5e13-423f-b2d2-c362a7da2cb9',
 5: '4ba78171-27b1-4af1-94c7-87845c85b922',
 6: '9ace2de5-6995-45e1-9861-834ec02e37e4',
 7: '129403dd-3450-4011-a77e-6365cbdec2e8',
 8: 'caa341a0-5c4b-42de-a5e8-61f1893e3bdf',
 9: '185e7421-8d00-49a2-929e-7607a75e07de',
 10: '9d5d32f8-feba-429e-b525-d178d376f040',
 11: '3e1424e2-e64c-4721-ba2a-7ebf392d3313',
 12: '708719c2-044f-4cfd-ab72-ad2d992f054e',
 13: '49113f2d-b03e-442b-a637-c6d9fe7a5d12',
 14: '565112d0-d8e8-4d01-bbfc-539c343060ee',
 15: 'caa7f034-0ca7-4f66-a20e-75353f98c803',
 16: 'f00407ca-49f6-4ebd-ab4c-d707126bf076',
 17: '57e721b4-2e7f-495e-8d6e-cfa3a5648d4c',
 18: '170e550d-2cab-4c79-a40f-ef7312275a46',
 19: 'dfcfaab9-4281-4d44-a90c-2c0ad23514bd',
 20: 'a7a9d7f7-574e-4a1b-9203-78b9a0f9bf36',
 21: 'd2f5a5ef-509c-4150-903f-c12d3e56a04a',
 22: '22e9b950-c185-

In [54]:
import numpy as np
import pandas as pd

# Load embeddings
embeddings = np.load("embeddings.npy")

# Load publications.csv into a Pandas DataFrame
publications_df = pd.read_csv("data/clustering.csv")

# Create a dictionary mapping embedding index to publication ID
index_to_pub_id = {index: pub_id for index, pub_id in enumerate(publications_df["publication_id"].tolist())}

# Input query
input = "Computer and Human Interaction"

# Encode the input query
input_embedding = model.encode(input)  # Ensure `model` is properly defined

# Calculate similarity scores
cos_similarities = cosine_similarity([input_embedding], embeddings)[0]

# Set the similarity threshold
threshold = 0.38

# Find indices where cosine similarity exceeds the threshold
selected_indices = np.where(cos_similarities >= threshold)[0]

# Sort the selected indices based on cosine similarity in descending order
sorted_indices = selected_indices[np.argsort(cos_similarities[selected_indices])[::-1]]

# Map sorted indices to publication IDs
sorted_pub_ids = [index_to_pub_id[i] for i in sorted_indices]

# Query the Spark DataFrame `df` using the publication IDs
from pyspark.sql import SparkSession

# Assuming `spark` is your SparkSession and `df` is your Spark DataFrame
# spark = SparkSession.builder.appName("SimilaritySearch").getOrCreate()

# Filter `df` by publication IDs
print(len(sorted_pub_ids))
# publication_ids_to_query = sorted_pub_ids[:10]  # Example: Top 10 results
filtered_df = df.filter(df["publication_id"].isin(sorted_pub_ids))

# Collect results for further processing (e.g., displaying titles)
result_titles = filtered_df.select("title_x").limit(10).rdd.flatMap(lambda x: x).collect()

# Print results
print("Top 10 Titles:",result_titles)


51
Top 10 Titles: ['Development of a Virtual Reality Cognitive Stimulation Game for Elderly Patients with Cognitive Impairment', 'Development of a Game Play in Chulalongkorn University Central Library Metaverse', "Experience NECTEC's Virtual 3D Mini-Exhibition Hall: A Digital Showcase for Online Visitors", "Experience NECTEC's Virtual 3D Mini-Exhibition Hall: A Digital Showcase for Online Visitors", 'The Virtual Service Center For Telecommunication Business Enhancement Using Metaverse', 'Unlocking the power of robots: enhancing computational thinking through innovative teaching methods', 'Unlocking the power of robots: enhancing computational thinking through innovative teaching methods', 'Unlocking the power of robots: enhancing computational thinking through innovative teaching methods', 'Unlocking the power of robots: enhancing computational thinking through innovative teaching methods', 'CiRA-Core: The Connector for Developer Teachers and User Teachers to Artificial Intelligence']


In [51]:
sorted_pub_ids

['dd91027b-f496-4494-be8b-1103433cfa5b',
 'e46aba8b-5d83-49ee-9519-94196add256a',
 'e3a69891-399d-42a1-90bf-06e13f396b07',
 '656b6501-008f-456f-8f4d-30c60104c299',
 '7aa62682-3793-4327-a5d7-7963bf3f4489',
 '79ee2a1e-4201-44f6-b222-132ca7c1b1eb',
 '3f90f3e5-26a9-44a5-a999-6791c4bc2fae',
 '9d67c504-fe4b-45e9-bab7-789edbcccaa0',
 'bb7b1900-b8c3-45c9-b905-910f4809caef',
 '7790a0c7-ad93-4e94-96f8-e49a1b16d98a',
 'b216d728-1310-4ca2-ab65-bdc90d8a2bfd',
 '3db6d058-bb23-48a1-aa83-d21729b295ec',
 '18cdd275-5dae-45cb-9ad9-c5f6d1da6eac',
 'e8fd65ee-5c93-4dcb-a9f8-d29bf0b04c84',
 'a7644211-de41-4bcd-a545-6f5cf151ff3a',
 'ca523234-0863-452c-948e-70eeaa3894a4',
 '281a71e5-52f9-448c-89fa-d1fcb0ed3483',
 'a696c3e8-29ae-4299-8a63-5749ca2a5963',
 '1d9ea7d2-2baf-4237-9d1d-ad74b251014d',
 'ec73f1d9-190b-4b50-98d9-8e009e6251cd',
 '1fe32560-497c-4dcd-99f6-6674c8fc7c4f',
 'd467cf00-7eca-45fd-8bbc-2ee87583d5ad',
 'f1784e5f-a6f2-4e1a-8e46-9442492a2a13',
 '7c2b45e1-1a86-4c36-9ca8-ecba3aef62b6',
 '18a6cf04-0ff6-

In [55]:
filtered_df.show()

+-----+------------+--------------------+--------------+--------------------+--------------------+-------------+--------------------+----------------+------------+--------------------+--------------------+--------------------+-------+
|  _c0|Unnamed: 0_x|      publication_id|affiliation_id|              header|                city|      country|             title_x|publication_year|Unnamed: 0_y|             title_y|            abstract|       combined_text|cluster|
+-----+------------+--------------------+--------------+--------------------+--------------------+-------------+--------------------+----------------+------------+--------------------+--------------------+--------------------+-------+
|  497|         813|3deada49-20f3-441...|      60103462|King Mongkut's Un...|             Bangkok|     Thailand|Development of a ...|            2024|         327|Development of a ...|Mild Cognitive Im...|Development of a ...|     16|
|  626|        1002|1ff8f044-1078-45e...|      60103462|King

In [34]:
df.show()

+---+------------+--------------------+--------------+--------------------+--------------------+--------------+--------------------+----------------+------------+--------------------+--------------------+--------------------+-------+
|_c0|Unnamed: 0_x|      publication_id|affiliation_id|              header|                city|       country|             title_x|publication_year|Unnamed: 0_y|             title_y|            abstract|       combined_text|cluster|
+---+------------+--------------------+--------------+--------------------+--------------------+--------------+--------------------+----------------+------------+--------------------+--------------------+--------------------+-------+
|  0|           0|998a43f1-b808-46c...|      60029445|University of Min...|                  MN| United States|Wer Viewership an...|            2024|           0|Wer Viewership an...|This article brin...|Wer Viewership an...|      6|
|  1|           2|0b554e30-f32a-4e2...|      60104301|Nazarbayev

In [25]:
print(f"Number of embeddings: {len(embeddings)}")
print(f"Number of mappings: {len(index_to_pub_id)}")


Number of embeddings: 19999
Number of mappings: 18545
