{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import CountVectorizer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration\n",
    "\n",
    "*input_dir:* The path to the directory that contains your text files (`*.txt`, UTF-8 encoded). A trailing '/' (slash) is optional. For example: `path/to/texts/`.\n",
    "\n",
    "*output_dir:* The path to the directory where you want to save the extracted seed words. A trailing '/' (slash) is optional. For example: `path/to/output/`.\n",
    "\n",
    "*seed_words_filename:* The filename for the resulting list of seed words. This must use the **.txt** extension.\n",
    "\n",
    "*max_df & min_df:* Please refer to the [CountVectorizer documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) for these parameters.\n",
    "\n",
    "*num_words:* The number of most frequent words to extract."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_dir = \"../data/texts/\"\n",
    "output_dir = \"results/raw/\"\n",
    "seed_words_filename = \"seed_words.txt\"\n",
    "max_df = 0.8\n",
    "min_df = 1\n",
    "num_words = 3000"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Directory Setup (Optional)\n",
    "Creates the input and output directories from the configuration if they do not already exist."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# exist_ok=True makes this cell safe to re-run when the directories already exist.\n",
    "os.makedirs(input_dir, exist_ok=True)\n",
    "os.makedirs(output_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Seed Word Extraction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load texts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# os.path.join works whether or not input_dir ends with a slash;\n",
    "# sorting makes the load order independent of the file system.\n",
    "text_file_names = sorted(glob.glob(os.path.join(input_dir, \"*.txt\")))\n",
    "print(\"found {} texts\".format(len(text_file_names)))\n",
    "texts = []\n",
    "for text_file_name in text_file_names:\n",
    "    with open(text_file_name, \"r\", encoding=\"utf-8\") as input_file:\n",
    "        texts.append(input_file.read())\n",
    "print(\"loaded {} texts\".format(len(texts)))\n",
    "# Fail here with a clear message instead of letting the vectorizer\n",
    "# complain about an empty vocabulary in a later cell.\n",
    "if not texts:\n",
    "    raise FileNotFoundError(\"no .txt files found in {!r}\".format(input_dir))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Extract seed words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cv = CountVectorizer(max_df=max_df, min_df=min_df, token_pattern=r\"\\b[^\\d\\W]{3,}\\b\")\n",
    "tf_raw = cv.fit_transform(texts)\n",
    "# get_feature_names() was removed in scikit-learn 1.2; only fall back to it\n",
    "# on versions older than 1.0, which lack get_feature_names_out().\n",
    "if hasattr(cv, \"get_feature_names_out\"):\n",
    "    feature_names = cv.get_feature_names_out()\n",
    "else:\n",
    "    feature_names = cv.get_feature_names()\n",
    "# Sum each term's count over all texts directly on the sparse matrix rather\n",
    "# than densifying it first, which allocates one cell per (text, term) pair.\n",
    "term_counts = np.asarray(tf_raw.sum(axis=0)).ravel()\n",
    "sorted_words = (\n",
    "    pd.Series(term_counts, index=feature_names)\n",
    "    .sort_values(ascending=False)\n",
    "    .head(num_words)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Save seed words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One seed word per line, most frequent first.\n",
    "with open(os.path.join(output_dir, seed_words_filename), \"w\", encoding=\"utf-8\") as textfile:\n",
    "    for sw in sorted_words.index:\n",
    "        textfile.write(\"{}\\n\".format(sw))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}