Commit bbe2eb64 authored by Niels-Oliver Walkowski

update(vdhd): Implement recommendations from review

parent 5f668ffe
.ipynb_checkpoints
*.pyc
.DS_Store
/data/raw_data/vectorian_cache/embeddings
/data/raw_data/sentence_transformers/sbert.net_models_paraphrase-distilroberta-base-v1.zip
/data/raw_data/sentence_transformers/sbert.net_models_paraphrase-distilroberta-base-v1
/data/raw_data/vectorian_cache
/data/raw_data/sentence_transformers/sentence-transformers*
/data/raw_data/numberbatch-19.08-en-pca-50.zip
/data/processed_data/temp_corpus/temp_corpus_*
/data/processed_data/sbert_cache_tok_*
/.idea
pyproject.toml
poetry.lock
manifest.yml
melusina.css
publication.css
# “Embed, embed! There’s knocking at the gate." - Detecting Intertextuality with the Vectorian Notebook of Embeddings
# “Embed, embed! There’s knocking at the gate. - Detecting Intertextuality with Embeddings and the Vectorian
# Abstract
*Bernhard Liebl & Manuel Burghardt, Computational Humanities Group, Leipzig University*
The detection of intertextual references in text corpora is a digital humanities topic that has gained a lot of attention in recent years. While intertextuality – from a literary studies perspective – describes the phenomenon of one text being present in another text, the computational problem at hand is the task of text similarity detection, and more concretely, semantic similarity detection. In this notebook, we introduce the Vectorian as a framework to build queries through word embeddings such as fastText and GloVe. We evaluate the influence of computing document similarity through alignments such as Waterman-Smith-Beyer and two variants of Word Mover’s Distance. We also investigate the performance of state-of-the-art sentence embeddings like Siamese BERT networks for the task – both as document embeddings and as contextual token embeddings. Overall, we find that Waterman-Smith-Beyer with fastText and token similarities weighted by part-of-speech tags offers highly competitive performance. The notebook can also be used to upload new data for performing custom search queries.
The detection of intertextual references in text corpora is a digital humanities topic that has gained a lot of attention in recent years. While intertextuality – from a literary studies perspective – describes the phenomenon of one text being present in another text, the computational problem at hand is the task of text similarity detection, and more concretely, semantic similarity detection. In this notebook, we introduce the Vectorian as a framework to build queries through word embeddings such as fastText and GloVe. We evaluate the influence of computing document similarity through alignments such as Waterman-Smith-Beyer and two variants of Word Mover’s Distance. We also investigate the performance of state-of-the-art sentence embeddings like Siamese BERT networks for the task – both as document embeddings and as contextual token embeddings. Overall, we find that Waterman-Smith-Beyer with fastText offers highly competitive performance. The notebook can also be used to upload new data for performing custom search queries.
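To make the underlying task concrete, here is a minimal, self-contained sketch of semantic similarity scoring with static word embeddings: each phrase is represented by the average of its token vectors and two phrases are compared by cosine similarity. The tiny hand-made vectors stand in for real fastText or GloVe vectors and are purely illustrative; this is not the Vectorian API, just the basic idea the notebook builds on.

``` python
import numpy as np

# toy "embeddings": in the notebook these would come from fastText or GloVe;
# the vectors below are made up purely for illustration
toy_vectors = {
    "knocking": np.array([0.9, 0.1, 0.0]),
    "at":       np.array([0.1, 0.1, 0.1]),
    "the":      np.array([0.1, 0.2, 0.1]),
    "gate":     np.array([0.8, 0.2, 0.1]),
    "a":        np.array([0.1, 0.1, 0.2]),
    "door":     np.array([0.7, 0.3, 0.1]),
}

def phrase_vector(tokens):
    # average the token vectors to obtain a crude phrase embedding
    return np.mean([toy_vectors[t] for t in tokens], axis=0)

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

query = ["knocking", "at", "the", "gate"]
candidate = ["knocking", "at", "a", "door"]
print(cosine(phrase_vector(query), phrase_vector(candidate)))
```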
# Components
......
graphviz
import json
from collections import namedtuple

import networkx as nx

Source = namedtuple("Source", ["work", "author"])


# one side of this diff: load the gold standard as a directed graph
def load_data(path):
    graph = nx.DiGraph()

    with open(path, "r") as f:
        data = json.loads(f.read())

    for k, v in data["nodes"].items():
        v["id"] = k
        graph.add_node(k, **v)

    for edge in data["edges"]:
        graph.add_edge(edge[0], edge[1])

    return graph


# the other side of this diff: a class-based model of patterns and their occurrences
class Pattern:
    def __init__(self, phrase, source):
        self._phrase = phrase
        self._source = source
        self._occurrences = []

    @property
    def phrase(self):
        return self._phrase

    @property
    def occurrences(self):
        return self._occurrences

    def add_occurrence(self, occ):
        self._occurrences.append(occ)
        occ.attach(self)


Evidence = namedtuple("Text", ["context", "phrase"])


class Occurrence:
    def __init__(self, unique_id, evidence, source):
        self._unique_id = unique_id
        self._evidence = evidence
        self._source = source
        self._pattern = None

    @property
    def unique_id(self):
        return self._unique_id

    @property
    def evidence(self):
        return self._evidence

    @property
    def source(self):
        return self._source

    @property
    def pattern(self):
        return self._pattern

    def attach(self, pattern):
        assert self._pattern is None
        self._pattern = pattern


class Data:
    def __init__(self, path):
        self._patterns = []
        self._occurrences = []

        with open(path, "r") as f:
            data = json.loads(f.read())

        for entry in data:
            pattern = Pattern(entry["phrase"], entry["source"])
            self._patterns.append(pattern)

            for m in entry["matches"]:
                occ = Occurrence(
                    m["id"],
                    Evidence(
                        m["context"],
                        m["quote"]),
                    Source(
                        m["work"],
                        m["author"]
                    ))
                pattern.add_occurrence(occ)
                self._occurrences.append(occ)

    @property
    def patterns(self):
        return self._patterns

    @property
    def occurrences(self):
        return self._occurrences
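For orientation, a hedged usage sketch of the graph-based loader above. The path is the one used in the notebook cell below; the `{"nodes": ..., "edges": ...}` layout of gold.json is assumed from what `load_data` reads, not verified against the repository.

``` python
# illustrative only: path taken from the notebook cell below
graph = load_data("../data/raw_data/gold.json")
print(graph.number_of_nodes(), "nodes /", graph.number_of_edges(), "edges")

# the notebook cell below only imports nodes with a positive in-degree
imported = [n for n, d in graph.in_degree(graph.nodes) if d >= 1]
```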
%% Cell type:code id:967dfa28-fed4-4da8-ad85-1a925ef05137 tags:
``` python
import sys
import os
from pathlib import Path

data_path = Path("../data")
os.environ["VECTORIAN_DEMO_DATA_PATH"] = str(data_path)
sys.path.append("code")

import nbutils
import gold
import json

from vectorian.importers import StringImporter
from vectorian.embeddings import SentenceBertEmbedding
from vectorian.importers import TextImporter
from vectorian.corpus import Corpus
from tqdm import tqdm

nlp = nbutils.make_nlp()

# one side of this diff reads the gold standard directly and sets up a single
# SentenceBertEmbedding ...
with open("../data/raw_data/gold.json", "r") as f:
    queries = json.loads(f.read())

bert_embedding = SentenceBertEmbedding(nlp)

# ... the other side loads the gold standard via gold.load_data and collects
# all contextual embeddings declared in embeddings.yml
gold_data = gold.load_data(data_path / "raw_data" / "gold.json")

contextual_embeddings = dict(
    (k, v) for k, v in nbutils.load_embeddings(data_path / "raw_data" / "embeddings.yml").items()
    if v.is_contextual)


# variant of prepare_docs from one side of this diff: StringImporter plus the
# single SBERT embedding, saving all imported documents at once
def prepare_docs():
    if bert_embedding is not None:
        im = StringImporter(nlp, embeddings=[bert_embedding])
    else:
        im = StringImporter(nlp)

    docs = []
    for query in tqdm(queries, desc="Importing"):
        for m in query["matches"]:
            docs.append(im(
                m["context"],
                title=m["work"],
                author=m["author"],
                unique_id=m["id"]))

    Corpus(docs).save("../data/processed_data/corpus")


# variant of prepare_docs from the other side: TextImporter plus all contextual
# embeddings, adding documents to the corpus one by one
def prepare_docs():
    if contextual_embeddings:
        im = TextImporter(nlp, embeddings=list(contextual_embeddings.values()))
    else:
        im = TextImporter(nlp)

    corpus = Corpus(data_path / "processed_data" / "corpus")

    for x, d in tqdm(gold_data.in_degree(gold_data.nodes), desc="Importing"):
        if d < 1:
            continue
        node = gold_data.nodes[x]
        doc = im(
            node["context"],
            title=node["source"]["book"],
            author=node["source"]["author"],
            extra_metadata={
                'gold_id': node["id"]
            },
            show_progress=False)
        corpus.add_doc(doc)


prepare_docs()
```
%% Output
Importing: 100%|█████████████████████████████████████████████████████████████████████████████████████| 120/120 [04:17<00:00, 2.14s/it]
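Once `prepare_docs` has written the corpus, it can be reopened for querying. A minimal sketch, mirroring the read-only `Corpus(..., mutable=False)` call used in the regression script further down:

``` python
from vectorian.corpus import Corpus

# reopen the corpus written by prepare_docs; mutable=False mirrors the
# read-only usage in the regression script later in this diff
corpus = Corpus(data_path / "processed_data" / "corpus", mutable=False)
```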
%% Cell type:code id:4426e21d-156c-4e18-97c7-1eaff03afd2b tags:
``` python
```
......
# Copied from the notebook, but executable as a single script;
# a simple way to test for regressions in the computations.
import sys
sys.path.append("code") # make "nbutils" and "code" importable
import nbutils, gold, vectorian
import ipywidgets as widgets
from ipywidgets import interact
nbutils.initialize("auto")
gold_data = gold.Data("data/raw_data/gold.json")
from vectorian.embeddings import Zoo
the_embeddings = {}
the_embeddings["glove"] = Zoo.load("glove-6B-50")
the_embeddings["fasttext"] = Zoo.load("fasttext-en-mini")
if nbutils.running_inside_binder(): # use precomputed version of Numberbatch?
the_embeddings["numberbatch"] = nbutils.download_word2vec_embedding(
"data/raw_data/numberbatch-19.08-en-pca-50",
"https://zenodo.org/record/4916056/files/numberbatch-19.08-en-pca-50.zip",
)
else:
# The following reduction of full Numberbatch to n=50 only works in envs
# with enough memory. For Binder etc. use the Zenodo version above.
the_embeddings["numberbatch"] = Zoo.load("numberbatch-19.08-en").pca(50)
from vectorian.embeddings import StackedEmbedding
the_embeddings["fasttext_numberbatch"] = StackedEmbedding(
[the_embeddings["fasttext"], the_embeddings["numberbatch"]]
)
nlp = nbutils.make_nlp("en_paraphrase_distilroberta_base_v1")
from vectorian.embeddings import SpacyVectorEmbedding, VectorCache
the_embeddings["sbert"] = SpacyVectorEmbedding(
nlp, 768, cache=VectorCache("data/processed_data/sbert_contextual", readonly=True)
)
from vectorian.session import LabSession
from vectorian.corpus import Corpus
corpus = Corpus("data/processed_data/corpus", mutable=False)
session = LabSession(
corpus,
embeddings=the_embeddings.values())
# the following command loads all contextual embedding vectors into RAM.
# while not necessary, this speeds up some of the ensuing computations.
session.cache_contextual_embeddings()
from vectorian.embeddings import CachedPartitionEncoder, SpanEncoder
# create an encoder that basically calls nlp(t).vector
sbert_encoder = CachedPartitionEncoder(
SpanEncoder(lambda texts: [nlp(t).vector for t in texts])
)
# compute encodings and/or save cached data
sbert_encoder.try_load("data/processed_data/doc_embeddings")
sbert_encoder.cache(session.documents, session.partition("document"))
sbert_encoder.save("data/processed_data/doc_embeddings")
# extract name of encoder for later use
sbert_encoder_name = nlp.meta["name"]
def make_index_builder(**kwargs):
return nbutils.InteractiveIndexBuilder(
session, nlp, partition_encoders={sbert_encoder_name: sbert_encoder}, **kwargs
)
import collections
import ipywidgets as widgets
# define 4 different search strategies via make_index_builder
index_builders = collections.OrderedDict(
{
"wsb": make_index_builder(
strategy="Alignment",
strategy_options={
"alignment": vectorian.alignment.LocalAlignment(
gap={
"s": vectorian.alignment.smooth_gap_cost(5),
"t": vectorian.alignment.smooth_gap_cost(5)
}
)
},
),
"wmd nbow": make_index_builder(
strategy="Alignment",
strategy_options={
"alignment": vectorian.alignment.WordMoversDistance.wmd("nbow")
},
),
"wmd bow": make_index_builder(
strategy="Alignment",
strategy_options={
"alignment": vectorian.alignment.WordMoversDistance.wmd("bow")
},
),
"doc embedding": make_index_builder(strategy="Partition Embedding"),
}
)
# present a UI of the various options that allows for editing
accordion = widgets.Accordion(children=[x.displayable for x in index_builders.values()])
for i, k in enumerate(index_builders.keys()):
accordion.set_title(i, k)
nbutils.plot_ndcgs(
gold_data, dict((k, v.build_index()) for k, v in index_builders.items()),
save_to="ndcg_results.png"
);
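nbutils.plot_ndcgs compares the four strategies by normalized discounted cumulative gain (nDCG). As a reference for the metric itself, here is a generic sketch (not the nbutils implementation): a ranking's discounted gain is divided by the gain of the ideal ordering of the same relevance judgments.

``` python
import numpy as np

def ndcg(relevances):
    # relevances: gold relevance of each result, in the order the system returned them
    relevances = np.asarray(relevances, dtype=float)
    discounts = 1.0 / np.log2(np.arange(2, len(relevances) + 2))
    dcg = float(np.sum(relevances * discounts))
    ideal = float(np.sum(np.sort(relevances)[::-1] * discounts))
    return dcg / ideal if ideal > 0 else 0.0

# e.g. a ranking that puts the single relevant hit second out of three:
print(ndcg([0, 1, 0]))  # ~0.63
```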
the .h5 data was produced with h5py 2.10.0
{"https://explosion.ai/en/paraphrase-distilroberta-base-v1_with_en_core_web_sm/3.1.0": "176182c625fd11ecac4c1e294004a039", "https://explosion.ai/en/msmarco-distilbert-base-v4_with_en_core_web_sm/3.1.0": "1761a31425fd11ecac4c1e294004a039"}
\ No newline at end of file
{"https://explosion.ai/en/paraphrase-distilroberta-base-v1_with_en_core_web_sm/3.1.0": "25e3569425fd11ecac4c1e294004a039", "https://explosion.ai/en/msmarco-distilbert-base-v4_with_en_core_web_sm/3.1.0": "25e3761025fd11ecac4c1e294004a039"}
\ No newline at end of file
{"https://explosion.ai/en/paraphrase-distilroberta-base-v1_with_en_core_web_sm/3.1.0": "c049520225fc11ecac4c1e294004a039", "https://explosion.ai/en/msmarco-distilbert-base-v4_with_en_core_web_sm/3.1.0": "c049980225fc11ecac4c1e294004a039"}
\ No newline at end of file