## Imports

In [None]:
import pandas as pd
import pickle
import os
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

## Configuration

*seed_words_dir:* The path to the directory containing the seed words. Please make sure the path ends with a '/' (slash). For example: path/to/input/.

*w2v_model_filename:* The filename for the word2vec model.

*extended_words_dir:* The path to the directory where the resulting dictionaries are saved. Please make sure the path ends with a '/' (slash). For example: path/to/output/.

In [None]:
# Directory holding the seed word lists. The file names negative.txt,
# positive.txt and neutral.txt are appended directly to this string, so the
# trailing '/' must be kept.
seed_words_dir = "../seed_words/results/selected/"
# Path of the pickled word2vec model.
w2v_model_filename = "../word_embeddings/best_model.p"
# Directory the extended word lists are written to. File names are appended
# directly to this string as well, so the trailing '/' must be kept.
extended_words_dir = "result/"

## Directory Setup (Optional)
Creates directories according to the configuration if not already created manually.

In [None]:
# Create the output directory, including any missing parent directories.
# exist_ok=True makes this a no-op when the directory is already there and
# removes the window between a separate existence check and the creation call
# in which another process could create the directory first.
os.makedirs(extended_words_dir, exist_ok=True)

## Classification

### Load seed words

In [None]:
# Map each sentiment label ("neg", "pos", "neu") to the lines of its seed word
# file. The files are read in the order negative, positive, neutral.
sentiment_dictionary = {}
for label, path_template in (
    ("neg", "{}negative.txt"),
    ("pos", "{}positive.txt"),
    ("neu", "{}neutral.txt"),
):
    seed_path = path_template.format(seed_words_dir)
    with open(seed_path, "r", encoding="utf-8") as seed_file:
        sentiment_dictionary[label] = seed_file.read().splitlines()

# Report how many seed words were read for each class.
for label, message in (
    ("pos", "loaded {} positive seed words"),
    ("neg", "loaded {} negative seed words"),
    ("neu", "loaded {} neutral seed words"),
):
    print(message.format(len(sentiment_dictionary[label])))

### Load word2vec model

In [None]:
# Deserialize the word2vec model from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file should
# only ever come from a trusted source.
with open(w2v_model_filename, mode="rb") as model_file:
    w2v_model = pickle.load(model_file)

### Train KNN model

In [None]:
# Keep only the seed words that have an embedding. The vocabulary set is built
# once and shared by all three classes.
vocab_words = set(w2v_model.wv.vocab)
neg_words = set(sentiment_dictionary["neg"]) & vocab_words
pos_words = set(sentiment_dictionary["pos"]) & vocab_words
neu_words = set(sentiment_dictionary["neu"]) & vocab_words

# Assign every seed word to exactly one class: negative takes precedence over
# positive, and positive over neutral. A word listed in several seed files
# therefore yields a single training row, which keeps the index of X unique.
# Sorting makes the row order independent of set iteration order, so repeated
# runs train on identically ordered data.
neg_train = sorted(neg_words)
pos_train = sorted(pos_words - neg_words)
neu_train = sorted(neu_words - neg_words - pos_words)
train_words = neg_train + pos_train + neu_train
if not train_words:
    raise ValueError("none of the seed words are in the word2vec vocabulary")

# Training matrix: one embedding per row, indexed by the word itself. Looking
# up all words in a single call also works when one of the classes is empty.
X = pd.DataFrame(w2v_model.wv[train_words], index=train_words)
# Class labels aligned with the rows of X: -1 negative, 1 positive, 0 neutral.
y = pd.Series(
    [-1] * len(neg_train) + [1] * len(pos_train) + [0] * len(neu_train),
    index=train_words,
)

# Distance-weighted 5-nearest-neighbour classifier using cosine distance.
neigh = KNeighborsClassifier(n_neighbors=5, metric="cosine", weights="distance")
neigh.fit(X, y)

### Transfer sentiment

In [None]:
# Embeddings of every vocabulary word, indexed by word, with the rows that
# were used as training data removed.
X_new = pd.DataFrame(
    w2v_model.wv.vectors,
    index=w2v_model.wv.index2word,
).drop(index=X.index)

# Predict a label for each remaining word and split the predictions by class
# (-1 negative, 1 positive, 0 neutral).
predicted_labels = neigh.predict(X_new)
neigh_pred = pd.Series(predicted_labels, index=X_new.index)
neigh_pred_neg = neigh_pred.loc[neigh_pred.eq(-1)]
neigh_pred_pos = neigh_pred.loc[neigh_pred.eq(1)]
neigh_pred_neu = neigh_pred.loc[neigh_pred.eq(0)]

### Save extended words

In [None]:
# Write the words of each predicted class to its own file, one word per line
# and without a trailing newline. Files are written in the order positive,
# negative, neutral.
for path_template, predicted in (
    ("{}positive.txt", neigh_pred_pos),
    ("{}negative.txt", neigh_pred_neg),
    ("{}neutral.txt", neigh_pred_neu),
):
    out_path = path_template.format(extended_words_dir)
    with open(out_path, mode="wt", encoding="utf-8") as out_file:
        out_file.write("\n".join(predicted.index))