Graph Spectral Clustering for Codenames¶

Codenames is a party game where players need to hint at words by providing other words that have something in common. Can we use Graph Spectral Clustering to find groups of similar words?

We use GloVE embeddings of Wikipedia as vectorized representations of words.

from lib.datasets import load_wikipedia_wordvecs
from lib.spectral_clustering import similarity_matrix, laplacian_matrix, spectral_clustering
import numpy as np
import matplotlib.pyplot as plt

words, vecs = load_wikipedia_wordvecs().values()

def similarity_matrix(data, s=1):
    n = len(data)
    scale = [np.sum(np.linalg.norm( data - x.reshape((1, -1)), axis=0)) for x in data ]
    similarity_matrix = np.zeros((n, n))

    for i in range(n):
        for j in range(n):
            similarity_matrix[i][j] = s * np.linalg.norm(data[i] - data[j]) / (scale[i]+scale[j])
    return similarity_matrix

codenames = "penguin,Germany,spy,battery,stadium,opera,shop,ambulance,brush,forest,Mexico,beat,fire,whip,\
switch,horse,band,deck,concert,horn,link,charge,row,line,lock".split(",")
indices = np.array([words.index(c) for c in codenames])


word_vecs = vecs[indices]

s = similarity_matrix(word_vecs)

k = 14
assns, (evals, evecs) = spectral_clustering(word_vecs, k=k, lform = "rw", metric="g", s=0.5, with_eigen=True)

[[codenames[j] for j in range(len(codenames)) if assns[j] == i] for i in range(k)]

[['Germany'],
 ['penguin'],
 ['whip'],
 ['Mexico'],
 ['horse'],
 ['forest'],
 ['horn'],
 ['spy',
  'battery',
  'brush',
  'beat',
  'switch',
  'band',
  'concert',
  'link',
  'charge',
  'row',
  'lock'],
 ['deck'],
 ['shop'],
 ['stadium'],
 ['opera'],
 ['line'],
 ['ambulance', 'fire']]

def avg_vec(words, candidates, vocabulary):
    # words is a list of word strings
    # candidates is a Nx300 dataset of all candidate word vectors
    # vocabulary is a list of word strings, 1 per vector in candidate
    mean = np.mean([candidates[vocabulary.index(word)] for word in words], axis=0).reshape((1, -1))
    diff = np.linalg.norm(candidates - mean, axis=0)
    return vocabulary[np.argmin(diff)]

avg_vec(('fly', 'Moscow'), vecs, words)

'hold'