# %% [markdown]
# ## Imports

# %%
import glob
import os

import pandas as pd

# %% [markdown]
# ## Configuration
#
# *input_dir:* The path to the directory containing the annotated seed words,
# one CSV file per annotator. For example: `path/to/annotated/seed_words/`.
#
# *output_dir:* The path to the directory where the selected seed words are
# saved. For example: `path/to/output/`.
#
# A trailing '/' (slash) is accepted but not required.

# %%
input_dir = "results/annotated/"
output_dir = "results/selected/"

# Sentiment labels an annotation may carry; each selected label is written to
# "<label>.txt" in output_dir.
LABELS = ("positive", "negative", "neutral")

# %% [markdown]
# ## Directory Setup (Optional)
# Creates directories according to the configuration if not already created manually.

# %%
# exist_ok=True makes the cell safe to re-run and avoids a check-then-create race.
os.makedirs(input_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

# %% [markdown]
# ## Seed Word Selection

# %% [markdown]
# ### Helper functions

# %%
def select_seed_words(annotations):
    """Pick seed words by majority vote over several annotation tables.

    Parameters
    ----------
    annotations : list of pandas.DataFrame
        One frame per annotation file, indexed by word, holding label columns.

    Returns
    -------
    dict
        Maps each label in ``LABELS`` to the list of words whose single most
        frequent label is that label. Words with a tied vote, or whose winning
        label is not in ``LABELS``, are left out. An empty ``annotations`` list
        yields empty lists for every label.
    """
    selected = {label: [] for label in LABELS}
    if not annotations:
        # pd.concat raises ValueError on an empty list; nothing to vote on.
        return selected

    # A word missing from one annotator's table counts as a "neutral" vote.
    votes = pd.concat(annotations, axis=1).fillna("neutral")

    # mode(axis=1) yields one column per tied mode, padded with NaN.
    for word, modes in votes.mode(axis=1).iterrows():
        modes = modes.dropna()
        if len(modes) != 1:
            # More than one mode means the vote is tied: skip the word.
            continue
        winner = modes.iloc[0]
        if winner in selected:
            selected[winner].append(word)
    return selected


def save_seed_words(words, file_name, directory):
    """Write ``words`` to ``directory/file_name``, one word per line (UTF-8).

    The words are joined with newlines; no trailing newline is written.
    """
    path = os.path.join(directory, file_name)
    with open(path, mode="wt", encoding="utf-8") as word_file:
        word_file.write("\n".join(words))

# %% [markdown]
# ### Load annotated seed words

# %%
# Sorted so the load order does not depend on the file system's listing order.
annotation_file_names = sorted(glob.glob(os.path.join(input_dir, "*.csv")))
print("found {} annotations".format(len(annotation_file_names)))
annotations = [
    pd.read_csv(annotation_file_name, index_col="word")
    for annotation_file_name in annotation_file_names
]
print("loaded {} annotations".format(len(annotations)))

# %% [markdown]
# ### Select seed words
# This is based on a majority vote.

# %%
selected_words = select_seed_words(annotations)
pos_words = selected_words["positive"]
neg_words = selected_words["negative"]
neu_words = selected_words["neutral"]
print("number of positive:", len(pos_words))
print("number of negative:", len(neg_words))
print("number of neutral:", len(neu_words))

# %% [markdown]
# ### Save selected seed words

# %%
if annotations:
    for label, words in selected_words.items():
        save_seed_words(words, "{}.txt".format(label), output_dir)
else:
    # Keep any previously saved selection instead of overwriting it with empty files.
    print("no annotations loaded; nothing saved")