## Imports

In [None]:
import pandas as pd
import os
import glob
from sklearn.feature_extraction.text import CountVectorizer

## Configuration

*input_dir:* The path to the directory that contains your text files. Please make sure the path ends with a '/' (slash). For example: `path/to/texts/`.

*output_dir:* The path to the directory where you want to save the extracted seed words. Please make sure the path ends with a '/' (slash). For example: `path/to/output/`.

*seed_words_filename:* The filename for the resulting list of seed words. This must use the **.txt** extension.

*max_df & min_df*: Please refer to the [CountVectorizer documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) for these parameters.

*num_words:* The number of words to extract.

In [None]:
# Notebook configuration: every value below is read by the cells that follow.

# Directory that is searched for "*.txt" input files.
input_dir = "../data/texts/"
# Directory the seed word list is written to.
output_dir = "results/raw/"
# File name of the seed word list inside output_dir.
seed_words_filename = "seed_words.txt"
# Passed unchanged to CountVectorizer as max_df / min_df (see its documentation).
max_df = 0.8
min_df = 1
# Upper bound on how many of the highest-count words are kept.
num_words = 3000

## Directory Setup (Optional)
Creates the input and output directories from the configuration if they do not already exist.

In [None]:
# Make sure the configured input and output directories exist.
# exist_ok=True turns the call into a no-op for directories that are already
# there and avoids the race between a separate existence check and the
# creation. A path that exists but is not a directory raises FileExistsError
# instead of being silently accepted.
for directory in (input_dir, output_dir):
    os.makedirs(directory, exist_ok=True)

## Seed Word Extraction

### Load texts

In [None]:
# Collect every ".txt" file directly inside input_dir.
# os.path.join only inserts a separator when one is missing, so input_dir
# works with or without a trailing slash. glob returns matches in arbitrary
# order, so the list is sorted to make the order of `texts` reproducible.
text_file_names = sorted(glob.glob(os.path.join(input_dir, "*.txt")))
print("found {} texts".format(len(text_file_names)))

# Read each file completely as UTF-8; one list entry per file.
texts = []
for text_file_name in text_file_names:
    with open(text_file_name, "r", encoding="utf-8") as input_file:
        texts.append(input_file.read())
print("loaded {} texts".format(len(texts)))

### Extract seed words

In [None]:
# Count token occurrences per text. The token pattern only matches runs of at
# least three word characters that contain no digits.
cv = CountVectorizer(max_df=max_df, min_df=min_df, token_pattern=r"\b[^\d\W]{3,}\b")
tf_raw = cv.fit_transform(texts)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back for older releases.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# One row per text, one column per vocabulary word.
# NOTE: todense() materialises the full matrix and can need a lot of memory
# for large corpora.
tf_df = pd.DataFrame(tf_raw.todense(), columns=feature_names)

# Sum the counts over all texts and keep the num_words most frequent words.
sorted_words = tf_df.sum().sort_values(ascending=False).head(num_words)

### Save seed words

In [None]:
# Write the selected words to the output file, one word per line, in the
# order of sorted_words. os.path.join only inserts a separator when one is
# missing, so output_dir works with or without a trailing slash.
with open(os.path.join(output_dir, seed_words_filename), "w", encoding="utf-8") as textfile:
    textfile.writelines("{}\n".format(sw) for sw in sorted_words.index)