{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sentiment Dictionary Extension\n",
    "\n",
    "Labels every word of a word2vec vocabulary as positive, negative or neutral by training a k-nearest-neighbours classifier on the embedding vectors of the seed words, then writes one word list per polarity."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pickle\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.neighbors import KNeighborsClassifier"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration\n",
    "\n",
    "*seed_words_dir:* The path to the directory containing the seed word files (`positive.txt`, `negative.txt`, `neutral.txt`). A trailing '/' (slash) is optional. For example: path/to/input/.\n",
    "\n",
    "*w2v_model_filename:* The filename of the pickled word2vec model.\n",
    "\n",
    "*extended_words_dir:* The path to the directory where the resulting dictionaries are saved. A trailing '/' (slash) is optional. For example: path/to/output/.\n",
    "\n",
    "*knn_n_neighbors:* The number of nearest seed words that vote on the label of each vocabulary word."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "seed_words_dir = \"../seed_words/results/selected/\"\n",
    "w2v_model_filename = \"../word_embeddings/best_model.p\"\n",
    "extended_words_dir = \"result/\"\n",
    "knn_n_neighbors = 5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Directory Setup (Optional)\n",
    "Creates the output directory according to the configuration if it does not exist yet."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# exist_ok avoids the check-then-create race of a separate os.path.exists() test.\n",
    "os.makedirs(extended_words_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Classification"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load seed words\n",
    "\n",
    "Each seed file holds one word per line. Blank lines and surrounding whitespace are ignored."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "seed_files = {\"neg\": \"negative.txt\", \"pos\": \"positive.txt\", \"neu\": \"neutral.txt\"}\n",
    "\n",
    "sentiment_dictionary = {}\n",
    "for label, filename in seed_files.items():\n",
    "    # os.path.join works whether or not seed_words_dir ends with a slash.\n",
    "    with open(os.path.join(seed_words_dir, filename), \"r\", encoding=\"utf-8\") as fr:\n",
    "        stripped_lines = (line.strip() for line in fr.read().splitlines())\n",
    "        # Drop blank lines so they are neither counted nor treated as seed words.\n",
    "        sentiment_dictionary[label] = [word for word in stripped_lines if word]\n",
    "\n",
    "print(\"loaded {} positive seed words\".format(len(sentiment_dictionary[\"pos\"])))\n",
    "print(\"loaded {} negative seed words\".format(len(sentiment_dictionary[\"neg\"])))\n",
    "print(\"loaded {} neutral seed words\".format(len(sentiment_dictionary[\"neu\"])))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load word2vec model\n",
    "\n",
    "**Note:** `pickle.load` can execute arbitrary code. Only load model files from a source you trust."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SECURITY: unpickling runs arbitrary code from the file; only load trusted models.\n",
    "with open(w2v_model_filename, \"rb\") as handle:\n",
    "    w2v_model = pickle.load(handle)\n",
    "\n",
    "keyed_vectors = w2v_model.wv\n",
    "# gensim 4 renamed `index2word` to `index_to_key` and removed `vocab`; support both APIs.\n",
    "if hasattr(keyed_vectors, \"index_to_key\"):\n",
    "    vocab_words = list(keyed_vectors.index_to_key)\n",
    "else:\n",
    "    vocab_words = list(keyed_vectors.index2word)\n",
    "vocab_set = set(vocab_words)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train KNN model\n",
    "\n",
    "Seed words missing from the embedding vocabulary are skipped. A word listed under several polarities is used once, with priority negative > positive > neutral. Labels are -1 (negative), 1 (positive) and 0 (neutral)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the three seed sets disjoint so no word appears twice in the training index.\n",
    "neg_words = set(sentiment_dictionary[\"neg\"]) & vocab_set\n",
    "pos_words = (set(sentiment_dictionary[\"pos\"]) & vocab_set) - neg_words\n",
    "neu_words = (set(sentiment_dictionary[\"neu\"]) & vocab_set) - neg_words - pos_words\n",
    "\n",
    "# Sets iterate in a hash-dependent order; sorting makes the row order, and with it\n",
    "# the fitted model, reproducible across kernel restarts.\n",
    "neg_list = sorted(neg_words)\n",
    "pos_list = sorted(pos_words)\n",
    "neu_list = sorted(neu_words)\n",
    "seed_words = neg_list + pos_list + neu_list\n",
    "if not seed_words:\n",
    "    raise ValueError(\"none of the seed words occur in the word2vec vocabulary\")\n",
    "\n",
    "X = pd.DataFrame(keyed_vectors[seed_words], index=seed_words)\n",
    "# Labels follow the row order of X: negatives first, then positives, then neutrals.\n",
    "y = pd.Series(\n",
    "    [-1] * len(neg_list) + [1] * len(pos_list) + [0] * len(neu_list),\n",
    "    index=X.index,\n",
    ")\n",
    "\n",
    "neigh = KNeighborsClassifier(n_neighbors=knn_n_neighbors, metric=\"cosine\", weights=\"distance\")\n",
    "neigh.fit(X, y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Transfer sentiment\n",
    "\n",
    "Predicts a label for every vocabulary word that was not used as a seed word."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_all = pd.DataFrame(keyed_vectors.vectors, index=vocab_words)\n",
    "# Build a new frame instead of dropping in place so this cell can be re-run safely.\n",
    "X_new = X_all.drop(index=X.index)\n",
    "\n",
    "neigh_pred = pd.Series(neigh.predict(X_new), index=X_new.index)\n",
    "neigh_pred_neu = neigh_pred[neigh_pred == 0]\n",
    "neigh_pred_pos = neigh_pred[neigh_pred == 1]\n",
    "neigh_pred_neg = neigh_pred[neigh_pred == -1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Save extended words\n",
    "\n",
    "Writes one word per line. The seed words themselves are not included in the output files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions_by_file = (\n",
    "    (\"positive.txt\", neigh_pred_pos),\n",
    "    (\"negative.txt\", neigh_pred_neg),\n",
    "    (\"neutral.txt\", neigh_pred_neu),\n",
    ")\n",
    "for filename, predicted in predictions_by_file:\n",
    "    # os.path.join works whether or not extended_words_dir ends with a slash.\n",
    "    with open(os.path.join(extended_words_dir, filename), mode=\"wt\", encoding=\"utf-8\") as fw:\n",
    "        fw.write(\"\\n\".join(predicted.index))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}