{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "import os\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration\n",
    "*input_dir:* The path to the directory that contains your text files, for example `path/to/texts/`. A trailing '/' (slash) is optional. Leave it empty to use the current working directory.\n",
    "\n",
    "*dataframe_filename:* The filename for the resulting pandas DataFrame. You may use the **.p** extension indicating a pickled file, but you are free to use whatever you like. Just make sure it matches the filename used in the subsequent sentiment analysis step."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_dir = \"\"\n",
    "dataframe_filename = \"\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Directory Setup (Optional)\n",
    "Creates the input directory according to the configuration if it does not exist yet."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# An empty input_dir refers to the current working directory, which needs no setup.\n",
    "if input_dir:\n",
    "    os.makedirs(input_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Preparation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load texts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# os.path.join makes the trailing slash of input_dir optional.\n",
    "# glob returns files in arbitrary order; sort so the row order is reproducible.\n",
    "text_file_names = sorted(glob.glob(os.path.join(input_dir, \"*.txt\")))\n",
    "print(\"found {} texts\".format(len(text_file_names)))\n",
    "\n",
    "texts = []\n",
    "for text_file_name in text_file_names:\n",
    "    # utf-8-sig decodes plain UTF-8 and additionally drops a leading byte order\n",
    "    # mark, which would otherwise become part of the first attribute name.\n",
    "    with open(text_file_name, \"r\", encoding=\"utf-8-sig\") as input_file:\n",
    "        # basename also works for paths without any directory part.\n",
    "        texts.append([os.path.basename(text_file_name), input_file.read()])\n",
    "print(\"loaded {} texts\".format(len(texts)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create DataFrame\n",
    "Each text file is expected to start with optional `name=value` attribute lines (numeric values are stored as floats), followed by a line starting with `text=` that begins the actual text. Every file becomes one row, indexed by its filename."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_text_file(filename, text):\n",
    "    \"\"\"Split the content of one .txt file into its attributes and its text.\n",
    "\n",
    "    The content is expected to hold zero or more `name=value` attribute lines\n",
    "    followed by a line that starts with `text=`; that line and everything after\n",
    "    it form the text.\n",
    "\n",
    "    Returns a tuple `(record, attribute_count)`. `record` maps \"filename\", each\n",
    "    attribute name and \"text\" to their values, with numeric attribute values\n",
    "    converted to float. `attribute_count` is the number of attribute lines read.\n",
    "\n",
    "    Raises ValueError if a non-blank line before the text contains no \"=\".\n",
    "    \"\"\"\n",
    "    lines = text.split(\"\\n\")\n",
    "    record = {\"filename\": filename}\n",
    "    attribute_count = 0\n",
    "    text_start = len(lines)  # stays here if no \"text=\" line exists -> empty text\n",
    "    for line_number, line in enumerate(lines):\n",
    "        # partition() splits on the first \"=\" only, so values may contain \"=\".\n",
    "        line_type, separator, line_content = line.partition(\"=\")\n",
    "        if separator and line_type == \"text\":\n",
    "            text_start = line_number\n",
    "            break\n",
    "        if not separator:\n",
    "            if line.strip():\n",
    "                raise ValueError(\n",
    "                    \"{}: line {} is neither 'name=value' nor 'text=...': {!r}\".format(\n",
    "                        filename, line_number + 1, line\n",
    "                    )\n",
    "                )\n",
    "            continue  # tolerate blank lines between attributes\n",
    "        try:\n",
    "            line_content = float(line_content)\n",
    "        except ValueError:\n",
    "            pass  # non-numeric attribute values stay strings\n",
    "        record[line_type] = line_content\n",
    "        attribute_count += 1\n",
    "    # The text line and all following lines are joined with spaces, then the\n",
    "    # leading \"text=\" marker is cut off.\n",
    "    record[\"text\"] = \" \".join(lines[text_start:])[len(\"text=\"):]\n",
    "    return record, attribute_count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"searching files for attributes and text\")\n",
    "if not texts:\n",
    "    # Without this check the DataFrame would have no \"filename\" column and\n",
    "    # set_index would fail with an unrelated-looking KeyError.\n",
    "    raise FileNotFoundError(\n",
    "        \"no .txt files were loaded from input_dir {!r}\".format(input_dir)\n",
    "    )\n",
    "\n",
    "prepared_texts = []\n",
    "num_attributes = 0\n",
    "for filename, text in texts:\n",
    "    prepared_text, attribute_count = parse_text_file(filename, text)\n",
    "    prepared_texts.append(prepared_text)\n",
    "    num_attributes = max(num_attributes, attribute_count)\n",
    "\n",
    "print(\"found {} additional attributes in .txt files\".format(num_attributes))\n",
    "\n",
    "texts_df = pd.DataFrame(prepared_texts).set_index(\"filename\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Save DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "texts_df.to_pickle(dataframe_filename)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}