Codenames is a party game where players need to hint at words by providing other words that have something in common. Can we use Graph Spectral Clustering to find groups of similar words?
We use GloVE embeddings of Wikipedia as vectorized representations of words.
from lib.datasets import load_wikipedia_wordvecs from lib.spectral_clustering import similarity_matrix, laplacian_matrix, spectral_clustering import numpy as np import matplotlib.pyplot as plt
words, vecs = load_wikipedia_wordvecs().values()
def similarity_matrix(data, s=1): n = len(data) scale = [np.sum(np.linalg.norm( data - x.reshape((1, -1)), axis=0)) for x in data ] similarity_matrix = np.zeros((n, n)) for i in range(n): for j in range(n): similarity_matrix[i][j] = s * np.linalg.norm(data[i] - data[j]) / (scale[i]+scale[j]) return similarity_matrix
codenames = "penguin,Germany,spy,battery,stadium,opera,shop,ambulance,brush,forest,Mexico,beat,fire,whip,\ switch,horse,band,deck,concert,horn,link,charge,row,line,lock".split(",") indices = np.array([words.index(c) for c in codenames]) word_vecs = vecs[indices] s = similarity_matrix(word_vecs)
k = 14 assns, (evals, evecs) = spectral_clustering(word_vecs, k=k, lform = "rw", metric="g", s=0.5, with_eigen=True)
[[codenames[j] for j in range(len(codenames)) if assns[j] == i] for i in range(k)]
[['Germany'], ['penguin'], ['whip'], ['Mexico'], ['horse'], ['forest'], ['horn'], ['spy', 'battery', 'brush', 'beat', 'switch', 'band', 'concert', 'link', 'charge', 'row', 'lock'], ['deck'], ['shop'], ['stadium'], ['opera'], ['line'], ['ambulance', 'fire']]
def avg_vec(words, candidates, vocabulary): # words is a list of word strings # candidates is a Nx300 dataset of all candidate word vectors # vocabulary is a list of word strings, 1 per vector in candidate mean = np.mean([candidates[vocabulary.index(word)] for word in words], axis=0).reshape((1, -1)) diff = np.linalg.norm(candidates - mean, axis=0) return vocabulary[np.argmin(diff)]
avg_vec(('fly', 'Moscow'), vecs, words)