You shall know a piece by the company it keeps. Chess plays as a data for word2vec models
Paper
•
2407.19600
•
Published
digital humanities, NLP
import gensim
from sklearn.decomposition import PCA
import matplotlib
import matplotlib.pyplot as plt
# Load the trained word2vec model from 'white_moves.model'.
model = gensim.models.Word2Vec.load('white_moves.model')

# gensim >= 4.0 exposes the vocabulary mapping as `wv.key_to_index`; earlier
# releases expose it as `wv.vocab`. Both are keyed by token, and only the
# keys are used below.
dict_moves = getattr(model.wv, 'key_to_index', None)
if dict_moves is None:
    dict_moves = model.wv.vocab

# Keep only the tokens that start with '->'.
dict_moves_appr = {k: v for k, v in dict_moves.items() if k.startswith('->')}
lab = list(dict_moves_appr)
if len(lab) < 2:
    # A 2-component PCA needs at least two samples.
    raise ValueError("need at least two tokens starting with '->' to plot")

# Fetch the vectors for exactly the labelled tokens, in label order, so that
# row i of the projection belongs to lab[i]. (Previously the vectors came from
# the whole vocabulary while the labels came from the filtered subset, so the
# annotations landed on the wrong points.)
X = model.wv[lab]

# Project the vectors onto their first two principal components.
pca = PCA(n_components=2)
result = pca.fit_transform(X)

# Scatter the projected tokens and annotate each point with its token.
# The plotting code previously read from an undefined name `Y`; the PCA
# output lives in `result`.
fig, ax = plt.subplots()
ax.plot(result[:, 0], result[:, 1], 'o')
ax.set_title('White moves')
for (x_coord, y_coord), lb in zip(result, lab):
    ax.annotate(lb, xy=(x_coord, y_coord))
plt.show()
<div style="min-height:494px"><script type="text/javascript" defer src="https://datawrapper.dwcdn.net/q3waH/embed.js?v=2" charset="utf-8"></script><noscript><img src="https://datawrapper.dwcdn.net/q3waH/full.png" alt="" /></noscript></div>
import gensim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("darkgrid")
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# `Word2Vec` is not imported by name in this snippet, so reach it through the
# imported `gensim` package.
modelLNT2 = gensim.models.Word2Vec.load("cbow_300_10.model")
# Some code is skipped here (presumably including the definition of
# `tsnescatterplot`); for the full version see the model's card.
# NOTE(review): `most_similar(negative=[...])` ranks words against the negated
# vector rather than the word itself; confirm this is intended and not
# `positive=`.
tsnescatterplot(modelLNT2, 'жизнь_S', [i[0] for i in modelLNT2.wv.most_similar(negative=["жизнь_S"])])
import numpy as np
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
# Load the word2vec model stored in "w2vlemmas.model".
modell = Word2Vec.load("w2vlemmas.model")

# Seed words whose neighbourhoods are collected below.
keys = ['Шекспир', 'Пушкин', 'Гоголь', 'матрос', 'кот', 'роман']

# For every seed word, gather its most similar words (topn=30) and their
# vectors; entry i of each list corresponds to keys[i].
embedding_clusters = []
word_clusters = []
for seed in keys:
    neighbours = [hit for hit, _score in modell.wv.most_similar(seed, topn=30)]
    word_clusters.append(neighbours)
    embedding_clusters.append([modell.wv[hit] for hit in neighbours])

# NOTE(review): recent scikit-learn releases rename TSNE's `n_iter` argument
# to `max_iter`; confirm against the installed version.
tsne_model_en_2d = TSNE(
    perplexity=15,
    n_components=2,
    init='pca',
    n_iter=3500,
    random_state=32,
)

# Stack the clusters into one 3-D array, flatten the first two axes so t-SNE
# sees a single 2-D matrix, then restore the per-seed grouping on the 2-D
# output.
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape
flattened = embedding_clusters.reshape(n * m, k)
projected = tsne_model_en_2d.fit_transform(flattened)
embeddings_en_2d = np.array(projected).reshape(n, m, 2)