Commit 157f1d3e authored by Niels-Oliver Walkowski

upd(vdhd.2): Update to rc.2

parent 1a57d4a4
......@@ -17,9 +17,8 @@ The detection of intertextual references in text corpora is a digital humanities
+-- data
| +-- processed_data
| | +-- corpus: preprocessed Vectorian document data for parts of gold.json (e.g. tokenization)
| | +-- doc_embeddings.*: precomputed Sentence-BERT document embeddings for parts of gold.json
| | +-- sbert_contextual: precomputed Sentence-BERT contextual token embeddings for pattern phrases
| +-- raw_data
| | +-- embeddings.yml: specifies which embeddings are used and from where they are loaded
| | +-- gold.json: gold standard data for Shakespeare text reuse as JSON
| | +-- sentence_transformers: will contain S-BERT model (downloaded in the notebook)
| | +-- vectorian_cache: will contain word embedding data (downloaded in the notebook)
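As a quick orientation, the raw data files listed above can be inspected directly from Python. A minimal sketch follows; the `data_path` location and the use of `json`/`yaml` here are illustrative assumptions, not part of the notebook:

``` python
import json
from pathlib import Path

import yaml  # PyYAML, assumed to be installed

data_path = Path("data")  # adjust to wherever the data directory lives

# gold.json: gold standard data for Shakespeare text reuse
with open(data_path / "raw_data" / "gold.json") as f:
    gold = json.load(f)

# embeddings.yml: which embeddings are used and from where they are loaded
with open(data_path / "raw_data" / "embeddings.yml") as f:
    embeddings_spec = yaml.safe_load(f)
```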
......
......@@ -29,6 +29,8 @@ import vectorian
import yaml
import shutil
import urllib
import re
import logging
import bokeh.plotting
import bokeh.models
......@@ -63,17 +65,17 @@ if os.environ.get("VECTORIAN_DEV"):
sys.path.append(str(vectorian_path))
import vectorian
from vectorian.embeddings import Word2VecVectors, TokenEmbeddingAggregator, prepare_docs
from vectorian.embeddings import CachedPartitionEncoder
from vectorian.embeddings import StackedEmbedding
from vectorian.embeddings import Zoo
from vectorian.embedding import Word2VecVectors
from vectorian.embedding import ContextualEmbedding, SentenceEmbedding
from vectorian.embedding import StackedEmbedding
from vectorian.embedding.zoo import Zoo
from vectorian.embedding.span import prepare_docs
from vectorian.index import DummyIndex
from vectorian.metrics import TokenSimilarity, CosineSimilarity
from vectorian.sim.token import EmbeddingTokenSim
from vectorian.sim.vector import CosineSim
from vectorian.interact import PartitionMetricWidget
from vectorian.importers import TextImporter
from vectorian.session import LabSession
from vectorian.embeddings import SpacyVectorEmbedding, VectorCache
from vectorian.embeddings import CachedPartitionEncoder, SpanEncoder
import warnings
......@@ -95,6 +97,7 @@ class DisplayMode(enum.Enum):
_display_mode = DisplayMode.EXPORT
_export_cells = False
def default_plot_width():
if _display_mode.static:
......@@ -107,8 +110,8 @@ def running_inside_binder():
return os.environ.get("BINDER_SERVICE_HOST") is not None
def initialize(display_mode="auto"):
global _display_mode
def initialize(display_mode="auto", export=False):
global _display_mode, _export_cells
if display_mode == "auto":
_display_mode = DisplayMode.BINDER if running_inside_binder() else DisplayMode.SERVER
else:
......@@ -121,10 +124,26 @@ def initialize(display_mode="auto"):
[f"localhost:{port}" for port in range(8888, 8898)])
bokeh.io.output_notebook()
_export_cells = export
def _export_path():
plot_path = data_path / "plots"
plot_path.mkdir(exist_ok=True)
return plot_path
def _bokeh_show(plot):
if _export_cells:
from bokeh.io import export_svg, export_png
plot_path = _export_path()
try:
export_png(plot, filename=plot_path / "plot.png", width=1200, height=800)
export_svg(plot, filename=plot_path / "plot.svg")
except Exception:
# static export can fail (e.g. when no selenium webdriver is available for bokeh's export backends); ignore and just show the plot below
pass
def _bokeh_show(root):
bokeh.io.show(root)
bokeh.io.show(plot)
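# Illustrative usage note: calling nbutils.initialize(display_mode="auto", export=True)
# sets _export_cells, so every plot routed through _bokeh_show is additionally written
# to data/plots/plot.png and data/plots/plot.svg before being shown in the notebook.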
def make_limited_function_warning_widget(action_text):
......@@ -382,9 +401,11 @@ def format_embedding_name(name):
def find_index_by_filter(terms, s):
pattern = "(.*)".join([""] + [re.escape(x) for x in s.split()] + [""])
candidates = []
for i, x in enumerate(terms):
if s in x:
if re.search(pattern, x):
candidates.append((i, x))
if not candidates:
raise ValueError(f"did not find '{s}' in {terms}")
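# Example (with a hypothetical terms list): for terms = ["a midsummer night's dream",
# "the winter's tale"], the filter string "midsummer dream" is compiled into the
# pattern "(.*)midsummer(.*)dream(.*)" and matches index 0, whereas the previous
# substring check `if s in x` would have found no candidate.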
......@@ -528,20 +549,13 @@ class DocEncoder:
def __init__(self, embedding, session):
assert embedding.is_contextual
nlp = embedding._nlp
nlp = embedding.nlp
self._nlp = nlp
k = self.name
# create an encoder that basically calls nlp(t).vector
self._encoder = CachedPartitionEncoder(
SpanEncoder(lambda texts: [nlp(t).vector for t in texts])
)
# compute encodings and/or save cached data
self._encoder.try_load("data/processed_data/sbert_cache_doc_" + k)
self._encoder.cache(session.documents, session.partition("document"))
self._encoder.save("data/processed_data/sbert_cache_doc_" + k)
self._encoder = SentenceEmbedding(nlp).create_encoder(
session.partition("document"))
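# rc.2 note: SentenceEmbedding(nlp).create_encoder(...) replaces the explicit
# CachedPartitionEncoder / try_load / cache / save plumbing used previously.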
@property
def name(self):
......@@ -587,10 +601,10 @@ class DocEmbedder:
self._partition = session.partition("document")
Option = collections.namedtuple("Option", ["name", "token_embedding", "doc_encoder"])
Option = collections.namedtuple("Option", ["name", "token_encoder", "doc_encoder"])
options = []
for k, v in self._session.embeddings.items():
for k, v in self._session.encoders.items():
options.append(Option(format_embedding_name(k) + " [token]", v, None))
for k, v in doc_encoders.items():
options.append(Option(format_embedding_name(k) + " [doc]", None, v))
......@@ -659,7 +673,7 @@ class DocEmbedder:
cb()
def embedding_changed(self):
visible = self.option.token_embedding is not None
visible = self.option.token_encoder is not None
if _display_mode.bokeh:
self._aggregator.visible = visible
else:
......@@ -684,13 +698,13 @@ class DocEmbedder:
@property
def widget(self):
if _display_mode.bokeh:
return bokeh.layouts.row(self._embedding_select, self._aggregator)
return bokeh.layouts.column(self._embedding_select, self._aggregator)
else:
return widgets.HBox([self._embedding_select, self._aggregator])
return widgets.VBox([self._embedding_select, self._aggregator])
def display(self):
if _display_mode.bokeh:
bokeh.io.show(lambda doc: doc.add_root(self.widget))
_bokeh_show(lambda doc: doc.add_root(self.widget))
else:
display(self.widget)
......@@ -701,8 +715,7 @@ class DocEmbedder:
return option.doc_encoder.encoder
else:
agg = getattr(np, self._aggregator.value)
return CachedPartitionEncoder(
TokenEmbeddingAggregator(option.token_embedding.factory, agg))
return option.token_encoder.embedding.to_sentence_embedding(agg).create_encoder(self._partition)
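# Illustrative note: self._aggregator.value names a numpy reduction such as "mean",
# so the document vector is that aggregation applied across the token embedding
# vectors, roughly np.mean(token_vectors, axis=0) in the mean-pooling case.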
@property
def partition(self):
......@@ -719,7 +732,7 @@ class DocEmbedder:
nlp = self._nlp
return self.encoder.encode(
prepare_docs(docs, nlp), self.partition).unmodified
prepare_docs(docs, nlp))
class EmbeddingPlotter:
......@@ -884,7 +897,7 @@ class EmbeddingPlotter:
self._pw.js_init("\n".join(script_code))
def mk_plot(self, bokeh_doc, selection=[], locator=None, plot_width=1200):
has_tok_emb = self._embedder.option.token_embedding is not None
has_tok_emb = self._embedder.option.token_encoder is not None
if _display_mode.bokeh:
intruder_select = bokeh.models.Select(
......@@ -1015,7 +1028,7 @@ class EmbeddingPlotter:
if tok_emb_p is None:
return
embedding = self._embedder.encoder.embedding
embedding = self._embedder.encoder.token_embedding
if embedding is None:
clear_token_plot()
set_tok_emb_status("No token embedding.")
......@@ -1320,12 +1333,12 @@ def plot_doc_embeddings(embedder_factory, gold, plot_args):
return bokeh.layouts.row(widgets)
if _display_mode.static:
bokeh.io.show(mk_root(None))
_bokeh_show(mk_root(None))
else:
def add_root(bokeh_doc):
bokeh_doc.add_root(mk_root(bokeh_doc))
bokeh.io.show(add_root)
_bokeh_show(add_root)
else:
plots = [
plotter.mk_plot(None, plot_width=plot_width, **clean_kwargs(kwargs))
......@@ -1352,9 +1365,9 @@ class DocEmbeddingExplorer:
class TokenSimilarityPlotter:
def _create_data(self, doc, ref_token, embedding):
token_sim = TokenSimilarity(
self._session.embeddings[embedding].factory,
CosineSimilarity())
token_sim = EmbeddingTokenSim(
self._session.encoders[embedding].embedding,
CosineSim())
sim = partial(self._session.similarity, token_sim)
is_ctx = any(e.is_contextual for e in token_sim.embeddings)
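# Illustrative note: CosineSim scores a token pair by the cosine of the angle between
# their embedding vectors, i.e. np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)),
# so parallel vectors score 1.0 and orthogonal vectors score 0.0.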
......@@ -1432,7 +1445,7 @@ class TokenSimilarityPlotter:
self._gold = gold
self._doc_id_to_doc = dict((get_gold_id(x), x) for x in self._session.documents)
self._embedding_names = sorted(session.embeddings.keys(), key=lambda x: len(x))
self._embedding_names = sorted(session.encoders.keys(), key=lambda x: len(x))
if initial_occ is None:
initial_occ_id = get_gold_id(gold.occurrences[0])
......@@ -1440,7 +1453,7 @@ class TokenSimilarityPlotter:
initial_occ_id = get_gold_id(initial_occ)
self._figures = None
self._n_figures = min(n_figures, len(session.embeddings))
self._n_figures = min(n_figures, len(session.encoders))
self._height_per_token = 20
self._top_n = top_n
......@@ -1549,7 +1562,7 @@ class TokenSimilarityPlotter:
bokeh.layouts.column(x['embedding_select'], x['figure']) for x in self._figures]))
if _display_mode.static:
bokeh.io.show(root)
_bokeh_show(root)
else:
bokeh_doc.add_root(root)
......@@ -1570,7 +1583,7 @@ def plot_token_similarity(session, nlp, gold, token="high", occ=None, n_figures=
gold = to_legacy_gold(gold)
plotter = TokenSimilarityPlotter(session, nlp, gold, token, occ, n_figures=n_figures, top_n=top_n)
if _display_mode.fully_interactive:
bokeh.io.show(plotter.create)
_bokeh_show(plotter.create)
else:
plotter.create(None)
......@@ -1825,8 +1838,8 @@ class InteractiveQuery:
return self._session.partition("document").index(self._ui.make(), self._nlp)
@property
def ordered_embedding(self):
return sorted(list(self._session.embeddings.items()), key=lambda x: x[0])
def ordered_encoders(self):
return sorted(list(self._session.encoders.items()), key=lambda x: x[0])
class InteractiveIndexBuilder:
......@@ -2024,9 +2037,9 @@ def plot_results(gold, index, query=None, rank=None, plot_height=200):
bk_root = bokeh.layouts.row(*bks)
if _display_mode.fully_interactive:
bokeh.io.show(lambda doc: doc.add_root(bk_root))
_bokeh_show(lambda doc: doc.add_root(bk_root))
else:
bokeh.io.show(bk_root)
_bokeh_show(bk_root)
result_widgets = [widgets.HBox(jps, layout=widgets.Layout(width=f'{plot_width}px'))] if len(jps) > 1 else jps[:1]
if not _display_mode.fully_interactive:
......@@ -2142,7 +2155,7 @@ def plot_gold(gold, title=""):
x_start=u[0], y_start=u[1],
x_end=v[0], y_end=v[1], line_width=1.5))
bokeh.io.show(plot)
_bokeh_show(plot)
def get_token_scores_s(match):
......@@ -2303,7 +2316,7 @@ def token_scores_stacked_bar_chart(matches, ranks=None, highlight=None, show_gap
y.append(scores[i - 1] + 0.05)
p.triangle(color="black", x=x, y=y, size=10, angle=np.pi)
bokeh.io.show(p)
_bokeh_show(p)
def token_scores_pie_chart(match, plot_size=350):
......@@ -2412,7 +2425,7 @@ def vis_token_scores(matches, kind="bar", ranks=None, highlight=None, plot_width
n_rows = int(np.ceil(len(picked) / n_cols))
plot_size = plot_width // n_cols
figures = [token_scores_pie_chart(m, plot_size=plot_size) for m in picked]
bokeh.io.show(bokeh.layouts.gridplot(
_bokeh_show(bokeh.layouts.gridplot(
figures, ncols=n_cols, width=plot_size, height=plot_size * n_rows))
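# Illustrative note: e.g. 6 picked matches with n_cols=3 and plot_width=1200 give
# n_rows = ceil(6 / 3) = 2 and a plot_size of 1200 // 3 = 400 px per pie chart.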
else:
raise ValueError(kind)
......@@ -2467,7 +2480,7 @@ def plot_embedding_vectors(labels, vectors, palette, bg, extra_height=0, w_forma
color_mapper=color_mapper, label_standoff=3, margin=20, height=5, padding=5)
p.add_layout(color_bar, 'below')
bokeh.io.show(p)
_bokeh_show(p)
def _embedding_vectors(words, get_vec, normalize):
......@@ -2551,9 +2564,12 @@ def plot_dot(dot_path):
with open(dot_path, "r") as f:
graphs = pydot.graph_from_dot_data(f.read())
graph = graphs[0]
if _export_cells:
with open(_export_path() / "plot.svg", "wb") as f:
f.write(graph.create_svg())
from IPython.display import SVG
return SVG(graph.create_svg())
......@@ -2587,15 +2603,20 @@ class CustomSearch:
temp_corpus_path = data_path / "processed_data" / "temp_corpus"
temp_corpus_path.mkdir(exist_ok=True)
from vectorian.corpus import Corpus
import tempfile
corpus = Corpus(tempfile.mkdtemp(prefix="temp_corpus_", dir=temp_corpus_path))
from vectorian.corpus import TemporaryCorpus
corpus = TemporaryCorpus()
# for each uploaded file, import it via importer "im" and add to corpus
for k, data in upload.value.items():
corpus.add_doc(
im(codecs.decode(data["content"], encoding="utf-8"), title=k)
)
doc = im(codecs.decode(data["content"], encoding="utf-8"), title=k)
if doc is not None:
corpus.add_doc(doc)
else:
logging.warning(f"document '{k}' is empty and was ignored.")
if len(corpus) < 1:
display(HTML("<strong>all provided text files were empty.</strong>"))
return None
self._session = LabSession(
corpus,
......@@ -2648,7 +2669,7 @@ def eval_strategies(data, gold_data, strategies=["wsb_weighted", "wsb_unweighted
])
class SentenceTransformersEmbedding(vectorian.embeddings.SpacyVectorEmbedding):
class SentenceTransformersEmbedding(ContextualEmbedding):
def __init__(self, name, path=None, readonly=False):
if path is not None:
......@@ -2661,16 +2682,13 @@ class SentenceTransformersEmbedding(vectorian.embeddings.SpacyVectorEmbedding):
with monkey_patch_sentence_transformers_tqdm("Downloading sentence-transformers model " + name):
nlp.add_pipe('sentence_bert', config={'model_name': name})
nlp.meta["name"] = name + "_with_" + NLP_BASE_MODEL
# we provide an additional cache for token embeddings, which helps
# speed up notebook execution in Binder environments.
cache_dir = data_path / "processed_data" / ("sbert_cache_tok_" + name)
cache_dir.mkdir(exist_ok=True)
super().__init__(
nlp, 768,
cache=VectorCache(cache_dir, readonly=readonly))
super().__init__(nlp)
self._nlp = nlp
@property
def nlp(self):
return self._nlp
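# Hypothetical usage sketch (model name chosen for illustration only):
#   emb = SentenceTransformersEmbedding("paraphrase-distilroberta-base-v1")
#   doc = emb.nlp("Though this be madness, yet there is method in it.")
#   vec = doc.vector  # 768-dimensional S-BERT vector via the sentence_bert pipe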
def load_embeddings(yml_path):
......
%% Cell type:code id:967dfa28-fed4-4da8-ad85-1a925ef05137 tags:
``` python
import sys
import os
from pathlib import Path
data_path = Path("../data")
os.environ["VECTORIAN_DEMO_DATA_PATH"] = str(data_path)
sys.path.append("code")
import nbutils
import gold
import json
from vectorian.importers import TextImporter
from vectorian.corpus import Corpus
from tqdm import tqdm
nlp = nbutils.make_nlp()
gold_data = gold.load_data(data_path / "raw_data" / "gold.json")
contextual_embeddings = dict(
(k, v) for k, v in nbutils.load_embeddings(data_path / "raw_data" / "embeddings.yml").items()
if v.is_contextual)
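# Build the preprocessed corpus under data/processed_data/corpus: import every
# gold-standard passage that is referenced at least once (in-degree >= 1) and
# attach its gold id as extra metadata.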
def prepare_docs():
if contextual_embeddings:
im = TextImporter(nlp, embeddings=list(contextual_embeddings.values()))
else:
im = TextImporter(nlp)
corpus = Corpus(data_path / "processed_data"/ "corpus")
for x, d in tqdm(gold_data.in_degree(gold_data.nodes), desc="Importing"):
if d < 1:
continue
node = gold_data.nodes[x]
doc = im(
node["context"],
title=node["source"]["book"],
author=node["source"]["author"],
extra_metadata={
'gold_id': node["id"]
},
show_progress=False)
corpus.add_doc(doc)
prepare_docs()
```
%% Output
Importing: 100%|█████████████████████████████████████████████████████████████████████████████████████| 120/120 [04:17<00:00, 2.14s/it]
Importing: 100%|███████████████████████████████████████████████████████████████████████████| 120/120 [04:37<00:00, 2.31s/it]
%% Cell type:code id:4426e21d-156c-4e18-97c7-1eaff03afd2b tags:
``` python
```
......
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="571pt" height="242pt" viewBox="21.60 21.60 549.60 220.60">
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<!-- Generated by graphviz version 2.50.0 (20211209.0339)
-->
<!-- Title: my_graph Pages: 1 -->
<svg width="571pt" height="242pt"
viewBox="21.60 21.60 549.60 220.60" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(25.6 216.6)">
<title>my_graph</title>
<polygon fill="Transparent" stroke="transparent" points="-4,4 -4,-195 524,-195 524,4 -4,4"/>
......@@ -16,7 +23,7 @@
</g>
<!-- token_embeddings&#45;&gt;contextual -->
<g id="edge1" class="edge">
<title>token_embeddings-&gt;contextual</title>
<title>token_embeddings&#45;&gt;contextual</title>
<path fill="none" stroke="black" d="M107.07,-123.36C116.77,-126.2 127.06,-129.2 137.21,-132.17"/>
<polygon fill="black" stroke="black" points="136.3,-135.55 146.88,-134.99 138.26,-128.83 136.3,-135.55"/>
</g>
......@@ -28,7 +35,7 @@
</g>
<!-- token_embeddings&#45;&gt;static -->
<g id="edge2" class="edge">
<title>token_embeddings-&gt;static</title>
<title>token_embeddings&#45;&gt;static</title>
<path fill="none" stroke="black" d="M107.07,-99.47C119.51,-97.45 132.91,-95.27 145.75,-93.19"/>
<polygon fill="black" stroke="black" points="146.6,-96.59 155.91,-91.54 145.48,-89.69 146.6,-96.59"/>
</g>
......@@ -39,12 +46,12 @@
<polyline fill="none" stroke="black" points="397,-187 312,-187 "/>
<polyline fill="none" stroke="black" points="397,-187 397,-155 "/>
<polyline fill="none" stroke="black" points="397,-187 401,-191 "/>
<text text-anchor="start" x="320" y="-176" font-family="Arial" font-size="10.00">Sentence-BERT</text>
<text text-anchor="start" x="320" y="-176" font-family="Arial" font-size="10.00">Sentence&#45;BERT</text>
<text text-anchor="start" x="321.5" y="-166" font-family="Arial" font-style="italic" font-size="10.00">on token level</text>
</g>
<!-- contextual&#45;&gt;sbert -->
<g id="edge9" class="edge">
<title>contextual-&gt;sbert</title>
<title>contextual&#45;&gt;sbert</title>
<path fill="none" stroke="orange" d="M276.21,-162.06C284.74,-163.24 293.39,-164.43 301.68,-165.58"/>
<polygon fill="orange" stroke="orange" points="301.22,-169.05 311.6,-166.95 302.17,-162.11 301.22,-169.05"/>
</g>
......@@ -59,7 +66,7 @@
</g>
<!-- static&#45;&gt;glove -->
<g id="edge3" class="edge">
<title>static-&gt;glove</title>
<title>static&#45;&gt;glove</title>
<path fill="none" stroke="orange" d="M263.26,-97.55C281.9,-102.7 302.43,-108.36 319.38,-113.03"/>
<polygon fill="orange" stroke="orange" points="318.85,-116.52 329.42,-115.8 320.71,-109.77 318.85,-116.52"/>
</g>
......@@ -74,7 +81,7 @@
</g>
<!-- static&#45;&gt;fasttext -->
<g id="edge4" class="edge">
<title>static-&gt;fasttext</title>
<title>static&#45;&gt;fasttext</title>
<path fill="none" stroke="orange" d="M263.25,-81.42C315.42,-79.86 393.94,-77.5 440.03,-76.12"/>
<polygon fill="orange" stroke="orange" points="440.29,-79.62 450.18,-75.82 440.08,-72.62 440.29,-79.62"/>
</g>
......@@ -89,7 +96,7 @@
</g>
<!-- static&#45;&gt;numberbatch -->
<g id="edge5" class="edge">
<title>static-&gt;numberbatch</title>
<title>static&#45;&gt;numberbatch</title>
<path fill="none" stroke="orange" d="M230.9,-64.89C250.23,-49.02 280.87,-27.11 312,-18 349.36,-7.06 393.53,-7.7 426.84,-10.73"/>
<polygon fill="orange" stroke="orange" points="426.51,-14.21 436.81,-11.73 427.21,-7.25 426.51,-14.21"/>
</g>
......@@ -101,19 +108,19 @@
</g>
<!-- static&#45;&gt;stacked -->
<g id="edge6" class="edge">
<title>static-&gt;stacked</title>
<title>static&#45;&gt;stacked</title>
<path fill="none" stroke="orange" d="M263.26,-68.45C281.9,-63.3 302.43,-57.64 319.38,-52.97"/>
<polygon fill="orange" stroke="orange" points="320.71,-56.23 329.42,-50.2 318.85,-49.48 320.71,-56.23"/>
</g>
<!-- stacked&#45;&gt;fasttext -->
<g id="edge7" class="edge">
<title>stacked-&gt;fasttext</title>
<title>stacked&#45;&gt;fasttext</title>
<path fill="none" stroke="lightblue" d="M383.54,-49.94C400.17,-54.38 422,-60.2 440.43,-65.11"/>
<polygon fill="lightblue" stroke="lightblue" points="439.7,-68.54 450.26,-67.74 441.5,-61.78 439.7,-68.54"/>
</g>
<!-- stacked&#45;&gt;numberbatch -->
<g id="edge8" class="edge">
<title>stacked-&gt;numberbatch</title>
<title>stacked&#45;&gt;numberbatch</title>
<path fill="none" stroke="lightblue" d="M383.54,-37.58C396.2,-34.94 411.86,-31.67 426.75,-28.57"/>
<polygon fill="lightblue" stroke="lightblue" points="427.81,-31.93 436.88,-26.46 426.38,-25.08 427.81,-31.93"/>
</g>
......@@ -123,4 +130,4 @@
<ellipse fill="none" stroke="black" cx="53.5" cy="-158" rx="27" ry="18"/>
</g>
</g>
</svg>
\ No newline at end of file
</svg>
{"https://explosion.ai/en/paraphrase-distilroberta-base-v1_with_en_core_web_sm/3.1.0": "176182c625fd11ecac4c1e294004a039", "https://explosion.ai/en/msmarco-distilbert-base-v4_with_en_core_web_sm/3.1.0": "1761a31425fd11ecac4c1e294004a039"}
\ No newline at end of file
{"https://explosion.ai/en/paraphrase-distilroberta-base-v1_with_en_core_web_sm/3.1.0": "25e3569425fd11ecac4c1e294004a039", "https://explosion.ai/en/msmarco-distilbert-base-v4_with_en_core_web_sm/3.1.0": "25e3761025fd11ecac4c1e294004a039"}
\ No newline at end of file