## Imports

In [None]:
import pandas as pd
import os
import glob
from sklearn.metrics import balanced_accuracy_score

## Configuration

*extended_words_dir:* The path to the directory where you saved the resulting dictionaries. Please make sure to end the path with a '/' (slash). For example: path/to/input/.

*annotated_positive_words_filename:* The complete path to the **.txt** file containing annotated positive evaluation words.

*annotated_negative_words_filename:* The complete path to the **.txt** file containing annotated negative evaluation words.

*annotated_neutral_words_filename:* The complete path to the **.txt** file containing annotated neutral evaluation words.

*num_words_to_annotate:* The number of words to randomly sample from each class (positive, negative, neutral) for annotation. This is only needed if you do not use the ready-to-use evaluation words.

*annotated_words_dir:* The path to the directory where you want to save the randomly extracted evaluation words as well as the selected evaluation words. Please make sure to end the path with a '/' (slash). For example: path/to/output/.

In [None]:
# Directory holding the extended dictionaries (positive.txt, negative.txt,
# neutral.txt). It is prepended verbatim to file names, so keep the trailing '/'.
extended_words_dir = "result/"
# Text files with the annotated evaluation words, one file per sentiment class.
annotated_positive_words_filename = "ready_to_use/French_positive.txt"
annotated_negative_words_filename = "ready_to_use/French_negative.txt"
annotated_neutral_words_filename = "ready_to_use/French_neutral.txt"
# Number of words sampled from each class when creating a new annotation file.
num_words_to_annotate = 10
# Output directory for annotation files and selected evaluation words.
# It is prepended verbatim to file names, so keep the trailing '/'.
annotated_words_dir = "evaluation_words/"

## Directory Setup (Optional)
Creates directories according to the configuration if not already created manually.

In [None]:
# Create the output directory (including missing parents) if it does not exist.
# exist_ok=True replaces the separate existence check, so there is no window
# between checking for the directory and creating it.
os.makedirs(annotated_words_dir, exist_ok=True)

## Create annotation lists
The following cells are only necessary if you want to create your own evaluation word lists. You can also use the ready-to-use lists and skip to the *Evaluate extended dictionaries* section.

### Randomly sample evaluation words

In [None]:
def _read_word_list(path):
    """Return the lines of the UTF-8 text file at ``path`` without line endings."""
    with open(path, "r", encoding="utf-8") as fr:
        return fr.read().splitlines()


# Load the three extended dictionaries (one word per line).
neg = _read_word_list("{}negative.txt".format(extended_words_dir))
pos = _read_word_list("{}positive.txt".format(extended_words_dir))
neu = _read_word_list("{}neutral.txt".format(extended_words_dir))

# Value-less Series indexed by word: the empty values become the blank
# "sentiment" column that annotators fill in later.
neg_s = pd.Series(index=neg, dtype="object")
pos_s = pd.Series(index=pos, dtype="object")
neu_s = pd.Series(index=neu, dtype="object")

# Series.sample raises ValueError when asked for more rows than exist (sampling
# is without replacement), so cap the request at the size of each dictionary.
neg_samples = neg_s.sample(min(num_words_to_annotate, len(neg_s)))
pos_samples = pos_s.sample(min(num_words_to_annotate, len(pos_s)))
neu_samples = neu_s.sample(min(num_words_to_annotate, len(neu_s)))

### Save evaluation words to .csv

In [None]:
# Stack the sampled words of all three classes into one annotation sheet.
sampled_parts = [neg_samples, pos_samples, neu_samples]
evaluation_words_s = pd.concat(sampled_parts)

# Ask who will annotate; the lower-cased name becomes part of the file name.
print("enter name of annotator: ")
annotator = input()

annotation_csv_path = "{}{}_evaluation_words.csv".format(
    annotated_words_dir, annotator.lower()
)
# Columns: "word" (the index) and "sentiment" (left empty for the annotator).
evaluation_words_s.to_csv(
    annotation_csv_path,
    index_label="word",
    header=["sentiment"],
)

print("set up annotation file for: {}".format(annotator))

### Annotate evaluation words
Please open the created annotation files (.csv files) with a spreadsheet program of your choice (e.g., Excel or LibreOffice Calc) and annotate the evaluation words.
Make sure you use either of the following sentiment classes:

* positive
* negative
* neutral

Example:

| word | sentiment |
| --- | --- |
| good | positive |
| bad | negative |
| house | neutral |

Once you are finished, make sure to save the file using the **.csv** extension.

### Load annotations

In [None]:
# Collect every annotation sheet (.csv) stored in the annotation directory.
csv_pattern = "{}*.csv".format(annotated_words_dir)
annotation_file_names = glob.glob(csv_pattern)
print("found {} annotations".format(len(annotation_file_names)))

# One DataFrame per annotator, indexed by the annotated word.
annotations = [
    pd.read_csv(csv_path, index_col="word") for csv_path in annotation_file_names
]
print("loaded {} annotations".format(len(annotations)))

### Select evaluation words
This is similar to the procedure for seed words and based on a majority vote.

In [None]:
# Put all annotators side by side (one column each); a word that an annotator
# did not label is counted as a "neutral" vote.
annotations_df = pd.concat(annotations, axis=1).fillna("neutral")

pos_words = []
neg_words = []
neu_words = []
# Route a winning label to the list that collects its words.
words_by_sentiment = {
    "positive": pos_words,
    "negative": neg_words,
    "neutral": neu_words,
}

# mode(axis=1) yields the most frequent label(s) per word; more than one
# non-missing entry means the vote is tied.
majority_votes = annotations_df.mode(axis=1)
for word, votes in majority_votes.iterrows():
    votes = votes.dropna()
    if len(votes) > 1:
        # Tie between labels: the word is not selected.
        continue
    target_list = words_by_sentiment.get(votes[0])
    if target_list is not None:
        target_list.append(word)

print("number of positive:", len(pos_words))
print("number of negative:", len(neg_words))
print("number of neutral:", len(neu_words))

### Save selected evaluation words

In [None]:
# Write each selected word list to its own UTF-8 text file, one word per line
# (no trailing newline after the last word).
selected_outputs = (
    ("{}positive.txt", pos_words),
    ("{}negative.txt", neg_words),
    ("{}neutral.txt", neu_words),
)
for path_template, selected_words in selected_outputs:
    output_path = path_template.format(annotated_words_dir)
    with open(output_path, mode="wt", encoding="utf-8") as out_file:
        out_file.write("\n".join(selected_words))

## Evaluate extended dictionaries

### Load extended dictionaries

In [None]:
def _read_dictionary(path):
    """Return the lines of the UTF-8 text file at ``path`` without line endings."""
    with open(path, "r", encoding="utf-8") as fr:
        return fr.read().splitlines()


# Words of each extended dictionary, read in the order negative, positive, neutral.
pred_neg = _read_dictionary("{}negative.txt".format(extended_words_dir))
pred_pos = _read_dictionary("{}positive.txt".format(extended_words_dir))
pred_neu = _read_dictionary("{}neutral.txt".format(extended_words_dir))

# Label every word with the class of the dictionary it was read from.
pred_y_neg_s = pd.Series("negative", index=pred_neg)
pred_y_pos_s = pd.Series("positive", index=pred_pos)
pred_y_neu_s = pd.Series("neutral", index=pred_neu)

# Predicted labels for all words, indexed by word.
pred_y_s = pd.concat([pred_y_neg_s, pred_y_pos_s, pred_y_neu_s])

### Load annotated evaluation words

In [None]:
# Annotated word files per sentiment class, read in this order.
annotated_sources = (
    ("negative", annotated_negative_words_filename),
    ("positive", annotated_positive_words_filename),
    ("neutral", annotated_neutral_words_filename),
)
annotated_words = {}
for sentiment, words_filename in annotated_sources:
    with open("{}".format(words_filename), "r", encoding="utf-8") as fr:
        annotated_words[sentiment] = fr.read().splitlines()

true_neg = annotated_words["negative"]
true_pos = annotated_words["positive"]
true_neu = annotated_words["neutral"]

# Label every word with the class of the file it was read from.
true_y_neg_s = pd.Series("negative", index=true_neg)
true_y_pos_s = pd.Series("positive", index=true_pos)
true_y_neu_s = pd.Series("neutral", index=true_neu)

# Annotated (reference) labels for all evaluation words, indexed by word.
true_y_s = pd.concat([true_y_neg_s, true_y_pos_s, true_y_neu_s])

### Evaluate
Compute the balanced accuracy score. For three classes (i.e., positive, negative, neutral), the random baseline is 0.33. If the score is higher than that, you are better than random guessing.

In [None]:
# balanced_accuracy_score compares the two label sequences position by position,
# so both series must contain exactly the same words in the same order.
# Filtering only the predictions and sorting each series independently does not
# guarantee that: an annotated word missing from the dictionaries, or a word
# listed more than once, would cause a length mismatch or shifted labels.

# Keep the first label of any word that is listed more than once.
pred_y_s = pred_y_s[~pred_y_s.index.duplicated(keep="first")]
true_y_s = true_y_s[~true_y_s.index.duplicated(keep="first")]

# Evaluate only words that have both an annotated and a predicted label, and
# select them from both series in the same (sorted) order.
common_words = true_y_s.index.intersection(pred_y_s.index).sort_values()
pred_y_s = pred_y_s.loc[common_words]
true_y_s = true_y_s.loc[common_words]

print("balanced accuracy score:", balanced_accuracy_score(true_y_s, pred_y_s))