Commit b5002e0b authored by Bernhard Liebl's avatar Bernhard Liebl
Browse files

more about word embeddings

parent 28ec112f
...@@ -90,49 +90,6 @@ def make_nlp(): ...@@ -90,49 +90,6 @@ def make_nlp():
nlp.meta["name"] = "core_web_sm_AND_en_paraphrase_distilroberta_base_v1" nlp.meta["name"] = "core_web_sm_AND_en_paraphrase_distilroberta_base_v1"
return nlp return nlp
'''
class Gold:
def __init__(self, data):
self._data = data
@cached_property
def num_contexts(self):
return sum(len(q["matches"]) for q in self._data)
@property
def phrases(self):
return [q["phrase"] for q in self._data]
def matches(self, phrase):
for q in self._data:
if q["phrase"] == phrase:
return q["matches"]
return []
@property
def items(self):
return self._data
@cached_property
def by_id(self):
doc_details = {}
for query in self._data:
for m in query["matches"]:
doc_details[m["id"]] = {
'query': query,
'match': m
}
return doc_details
def doc_digests(self, n=80):
for query in self._data:
for m in query["matches"]:
yield (f"{m['work']}: {m['context']}"[:n] + "..."), m['id']
'''
def occ_digest(occ, n=80): def occ_digest(occ, n=80):
return f"{occ.source.work}: {occ.evidence.context}"[:n] + "..." return f"{occ.source.work}: {occ.evidence.context}"[:n] + "..."
...@@ -1605,9 +1562,22 @@ def plot_gold(gold): ...@@ -1605,9 +1562,22 @@ def plot_gold(gold):
nx.set_node_attributes(G, phrase, "phrase") nx.set_node_attributes(G, phrase, "phrase")
nx.set_node_attributes(G, context, "context") nx.set_node_attributes(G, context, "context")
fixed = []
pos = {}
for i, pattern in enumerate(gold.patterns):
fixed.append(pattern.phrase)
y = i // 8
x = i % 8
s = 0.75
pos[pattern.phrase] = (x * s, y * s)
pos_arr = np.array(list(pos.values()))
pad = 1
plot = bokeh.models.Plot( plot = bokeh.models.Plot(
plot_width=1000, plot_height=400, plot_width=1000, plot_height=400,
x_range=bokeh.models.Range1d(-1.1, 1.1), y_range=bokeh.models.Range1d(-1.1, 1.1), x_range=bokeh.models.Range1d(np.min(pos_arr[:, 0]) - pad, np.max(pos_arr[:, 0]) + pad),
y_range=bokeh.models.Range1d(np.min(pos_arr[:, 1]) - pad, np.max(pos_arr[:, 1]) + pad),
output_backend="svg") output_backend="svg")
node_hover_tool = bokeh.models.HoverTool( node_hover_tool = bokeh.models.HoverTool(
...@@ -1616,9 +1586,10 @@ def plot_gold(gold): ...@@ -1616,9 +1586,10 @@ def plot_gold(gold):
@context @context
""") """)
plot.add_tools(node_hover_tool) plot.add_tools(node_hover_tool)
graph_renderer = bokeh.plotting.from_networkx(G, nx.spring_layout, scale=0.9, k=0.095, center=(0, 0)) graph_renderer = bokeh.plotting.from_networkx(
graph_renderer.node_renderer.glyph = bokeh.models.Circle(size=7.5, fill_color="node_color") G, nx.spring_layout, fixed=fixed, pos=pos, scale=0.5, k=0.15, center=(0, 0), iterations=100)
graph_renderer.node_renderer.glyph = bokeh.models.Circle(size=10, fill_color="node_color")
graph_renderer.edge_renderer.glyph = bokeh.models.MultiLine(line_color="black", line_alpha=1, line_width=1.5) graph_renderer.edge_renderer.glyph = bokeh.models.MultiLine(line_color="black", line_alpha=1, line_width=1.5)
plot.renderers.append(graph_renderer) plot.renderers.append(graph_renderer)
...@@ -1901,3 +1872,78 @@ def vis_token_scores(matches, kind="bar", ranks=None, highlight=None, plot_width ...@@ -1901,3 +1872,78 @@ def vis_token_scores(matches, kind="bar", ranks=None, highlight=None, plot_width
figures, ncols=n_cols, plot_width=plot_size, plot_height=plot_size * n_rows)) figures, ncols=n_cols, plot_width=plot_size, plot_height=plot_size * n_rows))
else: else:
raise ValueError(kind) raise ValueError(kind)
def plot_embedding_vectors(labels, vectors, palette, bg, extra_height=0, w_format="0.00"):
    """Show a heatmap-style bokeh plot of embedding vector components, one row per label.

    Args:
        labels: sequence of word labels; rendered bottom-up (the list is reversed
            so the first label appears in the top row).
        vectors: sequence of embedding vectors, one per label.
        palette: bokeh palette name used to map component values to colors.
        bg: background fill alpha (over a black background).
        extra_height: additional plot height in pixels.
        w_format: bokeh number format for the component value in the hover tooltip.
    """
    # reverse so the first label ends up at the top of the y axis
    rows = labels[::-1]
    mat = np.array(vectors[::-1])
    n_dims = mat.shape[-1]
    flat = mat.flatten()

    # one square per (dimension, word) cell; dimensions are 1-based on the x axis
    source = bokeh.models.ColumnDataSource({
        'x': np.array(list(range(n_dims)) * len(rows)) + 1,
        'y': list(itertools.chain(*[[row] * n_dims for row in rows])),
        'w': flat
    })

    color_mapper = bokeh.models.LinearColorMapper(
        palette=palette, low=np.amin(flat), high=np.amax(flat))

    p = bokeh.plotting.figure(
        y_range=rows,
        x_axis_type=None,
        x_range=(1 - 0.5, n_dims + 0.5),
        plot_width=900,
        plot_height=30 * len(rows) + 20 + extra_height,
        title="",
        toolbar_location="below",
        tools="pan, wheel_zoom, box_zoom, reset",
        active_drag="box_zoom",
        tooltips=[("dim", "@x"), ("w", "@w{%s}" % w_format)])

    p.background_fill_color = "black"
    p.background_fill_alpha = bg

    p.square(source=source, size=10, color={'field': 'w', 'transform': color_mapper})
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None

    # coarser tick spacing for higher-dimensional embeddings
    if n_dims <= 50:
        ticker = bokeh.models.SingleIntervalTicker(interval=5, num_minor_ticks=5)
    else:
        ticker = bokeh.models.SingleIntervalTicker(interval=10, num_minor_ticks=2)
    p.add_layout(bokeh.models.LinearAxis(ticker=ticker), 'below')

    p.add_layout(bokeh.models.ColorBar(
        color_mapper=color_mapper, label_standoff=3, margin=20, height=5, padding=5), 'above')

    bokeh.io.show(p)
def _embedding_vectors(words, get_vec):
def get_norm_vec(word):
v = get_vec(word)
return v / np.linalg.norm(v)
return np.array([get_norm_vec(word) for word in words])
def plot_embedding_vectors_val(words, get_vec):
    """Plot each word's normalized embedding vector as a row in a component heatmap."""
    unit_vecs = _embedding_vectors(words, get_vec)
    plot_embedding_vectors(words, unit_vecs, "Viridis256", 0.7)
def plot_embedding_vectors_mul(pairs, get_vec):
    """Plot, for each word pair, the element-wise product of the two normalized
    embedding vectors, i.e. the per-dimension terms whose sum is the pair's
    cosine similarity.

    Args:
        pairs: iterable of (u, v) word pairs.
        get_vec: callable mapping a word to its raw embedding vector.
    """
    words = []
    vecs = []
    for u, v in pairs:
        words.append(u + "-" + v)
        u_vec, v_vec = _embedding_vectors([u, v], get_vec)
        # element-wise product of the two unit vectors; fixed: append the
        # product directly instead of wrapping it in a one-element list,
        # which produced a spurious (n, 1, dims) array shape downstream
        vecs.append(u_vec * v_vec)
    vecs = np.array(vecs)
    plot_embedding_vectors(words, vecs, "Inferno256", 1, 20, w_format="0.0000")
...@@ -10,7 +10,7 @@ dependencies: ...@@ -10,7 +10,7 @@ dependencies:
- faiss - faiss
- pip - pip
- pip: - pip:
- vectorian>=0.8.1.dev2 - vectorian>=0.8.1.dev3
- sentence-transformers>=0.4 - sentence-transformers>=0.4
- jupyter_bokeh>=3.0.0 - jupyter_bokeh>=3.0.0
- opentsne>=0.6.0 - opentsne>=0.6.0
......
...@@ -9,7 +9,6 @@ ...@@ -9,7 +9,6 @@
"source": [ "source": [
"import vectorian\n", "import vectorian\n",
"import sys\n", "import sys\n",
"import os\n",
"\n", "\n",
"sys.path.append(\"code\")\n", "sys.path.append(\"code\")\n",
"import nbutils\n", "import nbutils\n",
...@@ -31,6 +30,12 @@ ...@@ -31,6 +30,12 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import os\n",
"\n",
"import importlib\n",
"importlib.reload(nbutils)\n",
"importlib.reload(gold)\n",
"\n",
"nbutils.initialize(\"auto\")" "nbutils.initialize(\"auto\")"
] ]
}, },
...@@ -189,7 +194,7 @@ ...@@ -189,7 +194,7 @@
"id": "4b8c4c2b-7768-41e9-a20a-63e49c32700e", "id": "4b8c4c2b-7768-41e9-a20a-63e49c32700e",
"metadata": {}, "metadata": {},
"source": [ "source": [
"For performing our actual investigations we rely on a framework called The Vectorian, which we first introduced in 2020 in a less versatile state (Liebl and Burghardt, 2020). By employing highly optimized algorithms and data structures, the Vectorian allows us to perform rapid searches over the gold standard texts using a variety of approaches and strategies. " "For performing our actual investigations we rely on a framework called The Vectorian, which we first introduced in 2020 (Liebl and Burghardt, 2020). By employing highly optimized algorithms and data structures, the Vectorian allows us to perform rapid searches over the gold standard texts using a variety of approaches and strategies. "
] ]
}, },
{ {
...@@ -262,13 +267,33 @@ ...@@ -262,13 +267,33 @@
"id": "43a6edee-c756-4ef2-9801-00e0145ba65d", "id": "43a6edee-c756-4ef2-9801-00e0145ba65d",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [
"the_embeddings = {}\n",
"\n",
"the_embeddings['glove'] = Zoo.load('glove-6B-50')\n",
"the_embeddings['numberbatch'] = Zoo.load('numberbatch-19.08-en-50')\n",
"the_embeddings['fasttext'] = Zoo.load('fasttext-en-mini')"
]
},
{
"cell_type": "markdown",
"id": "64f9355b-c300-4690-a409-964065b4761e",
"metadata": {},
"source": [
"We also use one stacked embedding, in which we combine fasttext and numberbatch."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a11df296-c928-4a3a-8698-f00937f832e3",
"metadata": {},
"outputs": [],
"source": [ "source": [
"from vectorian.embeddings import StackedEmbedding\n", "from vectorian.embeddings import StackedEmbedding\n",
"\n", "\n",
"emb_glove = Zoo.load('glove-6B-50')\n", "the_embeddings['fasttext_numberbatch'] = StackedEmbedding([\n",
"emb_numberbatch = Zoo.load('numberbatch-19.08-en-50')\n", " the_embeddings['fasttext'], the_embeddings['numberbatch']])"
"emb_fasttext = Zoo.load('fasttext-en-mini')\n",
"emb_fasttext_numberbatch = StackedEmbedding([emb_fasttext, emb_numberbatch])"
] ]
}, },
{ {
...@@ -276,7 +301,7 @@ ...@@ -276,7 +301,7 @@
"id": "e6658905-b3e5-496d-9e62-828082dc354a", "id": "e6658905-b3e5-496d-9e62-828082dc354a",
"metadata": {}, "metadata": {},
"source": [ "source": [
"Finally we instantiate an NLP parser based on Sentence-BERT (Reimers and Gurevych, 2019) and a shim that allows us to use this model's contextual token embeddings in the Vectorian." "We instantiate an NLP parser that is able to provide embeddings based on Sentence-BERT (Reimers and Gurevych, 2019)."
] ]
}, },
{ {
...@@ -286,10 +311,27 @@ ...@@ -286,10 +311,27 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"nlp = nbutils.make_nlp()\n", "nlp = nbutils.make_nlp()"
"\n", ]
},
{
"cell_type": "markdown",
"id": "8c772ad8-af05-4074-9392-b263a5d6b358",
"metadata": {},
"source": [
"Finally, we add a shim that allows us to use Sentence-BERT's contextual token embeddings in the Vectorian."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6e565e6f-cd21-4b38-9421-78a097608af1",
"metadata": {},
"outputs": [],
"source": [
"from vectorian.embeddings import SentenceBertEmbedding\n", "from vectorian.embeddings import SentenceBertEmbedding\n",
"emb_sbert = SentenceBertEmbedding(nlp)" "\n",
"the_embeddings['sbert'] = SentenceBertEmbedding(nlp)"
] ]
}, },
{ {
...@@ -320,12 +362,7 @@ ...@@ -320,12 +362,7 @@
"\n", "\n",
"session = LabSession(\n", "session = LabSession(\n",
" Corpus.load(\"data/processed_data/corpus\"),\n", " Corpus.load(\"data/processed_data/corpus\"),\n",
" embeddings=[\n", " embeddings=the_embeddings.values(),\n",
" emb_sbert,\n",
" emb_glove,\n",
" emb_numberbatch,\n",
" emb_fasttext,\n",
" emb_fasttext_numberbatch],\n",
" normalizers=\"default\")" " normalizers=\"default\")"
] ]
}, },
...@@ -350,9 +387,7 @@ ...@@ -350,9 +387,7 @@
"id": "76d7b8be-f3b6-4604-b924-7a453ddc89d0", "id": "76d7b8be-f3b6-4604-b924-7a453ddc89d0",
"metadata": {}, "metadata": {},
"source": [ "source": [
"We now turn to single word embeddings.\n", "We now turn to single word embeddings. A word embedding is a vector, i.e. a series of numbers."
"\n",
"reproduce Mikolov-example man-woman king-queen?"
] ]
}, },
{ {
...@@ -362,7 +397,101 @@ ...@@ -362,7 +397,101 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"session.word_vec(emb_glove, \"hot\")" "session.word_vec(the_embeddings[\"glove\"], \"coffee\")"
]
},
{
"cell_type": "markdown",
"id": "94f35262-219c-4764-9568-e2e25f042c45",
"metadata": {},
"source": [
"Since the above representation is hard to grasp, we can visualize various words under one embedding - using colors to show the strength of activation in different vector components."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f344867c-e095-4d6d-baf5-1fa559b37f6b",
"metadata": {},
"outputs": [],
"source": [
"import ipywidgets as widgets\n",
"from ipywidgets import interact\n",
"\n",
" \n",
"@interact(embedding=widgets.Dropdown(\n",
" options=[(k, v) for k, v in the_embeddings.items() if not v.is_contextual],\n",
" value=the_embeddings[\"numberbatch\"]))\n",
"def plot(embedding):\n",
" nbutils.plot_embedding_vectors_val(\n",
" [\"sail\", \"boat\", \"coffee\", \"tea\", \"guitar\", \"piano\"],\n",
" get_vec=lambda w: session.word_vec(embedding, w))"
]
},
{
"cell_type": "markdown",
"id": "42aa064d-65d8-49e1-9787-5176d36542d1",
"metadata": {},
"source": [
    "Looking at these color patterns, we can gain some intuitive understanding of why and how word embeddings are suitable for word similarity computations. For example, *sail* and *boat* both show a strong activation on dimension 27. Similarly, *guitar* and *piano* share similar values around dimension 24. The words *coffee* and *tea* also share some similar patterns around dimension 2 and dimension 49, which sets them apart from the other four words." "Looking at these color patterns, we can gain some intuitive understanding of why and how word embeddings are suitable for word similarity computations. For example, *sail* and *boat* both show a strong activation on dimension 27. Similarly, *guitar* and *piano* share similar values around dimension 24. The words *coffee* and *tea* also share some similar patterns around dimension 2 and dimension 49, which sets them apart from the other four words."
]
},
{
"cell_type": "markdown",
"id": "7a591ebe-dc7d-4109-a2f5-0480593057ca",
"metadata": {},
"source": [
"By multiplying the normalized vectors component by component, we can derive the terms that make up the computation of the so-called cosine similarity, which is commonly used to compute word similarity using word embeddings. The visualization below makes it clear from which dimensions a cosine similarity between two words derives large positive components."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "da7436d6-bf5a-4e2d-a4b0-0174c0606cf0",
"metadata": {},
"outputs": [],
"source": [
"@interact(embedding=widgets.Dropdown(\n",
" options=[(k, v) for k, v in the_embeddings.items() if not v.is_contextual],\n",
" value=the_embeddings[\"numberbatch\"]))\n",
"def plot(embedding):\n",
" nbutils.plot_embedding_vectors_mul([\n",
" (\"sail\", \"boat\"),\n",
" (\"coffee\", \"tea\"),\n",
" (\"guitar\", \"piano\")], get_vec=lambda w: session.word_vec(embedding, w))"
]
},
{
"cell_type": "markdown",
"id": "eb908081-598a-491c-b070-439989691705",
"metadata": {},
"source": [
"A similar investigation into fastText shows comparable spots of positive contribution. The situation is more complex due to the higher number of dimensions."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1b603839-2495-411d-b9e6-e1d133277f0c",
"metadata": {},
"outputs": [],
"source": [
"@interact(embedding=widgets.Dropdown(\n",
" options=[(k, v) for k, v in the_embeddings.items() if not v.is_contextual],\n",
" value=the_embeddings[\"fasttext\"]))\n",
"def plot(embedding):\n",
" nbutils.plot_embedding_vectors_mul([\n",
" (\"sail\", \"boat\"),\n",
" (\"coffee\", \"tea\"),\n",
" (\"guitar\", \"piano\")], get_vec=lambda w: session.word_vec(embedding, w))"
]
},
{
"cell_type": "markdown",
"id": "17bb4eb6-33c7-4efe-9ea5-5ddf20316bd1",
"metadata": {},
"source": [
"Computing the cosine similarity is mathematically equivalent to summing up the terms in the diagram above. The overall similarity between *guitar* and *piano* is measured at about 68% with the fastText embedding we use."
] ]
}, },
{ {
...@@ -375,25 +504,53 @@ ...@@ -375,25 +504,53 @@
"from vectorian.metrics import TokenSimilarity, CosineSimilarity\n", "from vectorian.metrics import TokenSimilarity, CosineSimilarity\n",
"\n", "\n",
"token_sim = TokenSimilarity(\n", "token_sim = TokenSimilarity(\n",
" emb_numberbatch,\n", " the_embeddings[\"fasttext\"],\n",
" CosineSimilarity()\n", " CosineSimilarity()\n",
")\n", ")\n",
"\n", "\n",
"session.similarity(token_sim, \"hot\", \"cold\")" "session.similarity(token_sim, \"guitar\", \"piano\")"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"id": "82787688-a005-4bab-bcff-5954b333ac3c", "id": "69fe41d8-ef30-41aa-8513-b7fe88affa09",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"token_sim = TokenSimilarity(\n", "def plot_mik(words):\n",
" emb_glove,\n", " from sklearn.decomposition import PCA\n",
" CosineSimilarity())\n", " import bokeh.models\n",
" import bokeh.plotting\n",
" import bokeh.io\n",
" \n",
" vectors = [session.word_vec(emb_fasttext, word) for word in words]\n",
"\n", "\n",
"session.similarity(token_sim, \"hot\", \"cold\")" " pca = PCA(n_components=2, whiten=True)\n",
" v2d = pca.fit(vectors).transform(vectors)\n",
" \n",
" source = bokeh.models.ColumnDataSource({\n",
" 'x': v2d[:, 0],\n",
" 'y': v2d[:, 1]\n",
" })\n",
" \n",
"\n",
" p = bokeh.plotting.figure(\n",
" plot_width=800,\n",
" plot_height=400,\n",
" title=\"\",\n",
" toolbar_location=None, tools=\"\")\n",
" \n",
" p.circle(source=source, size=10)\n",
" \n",
" for i in range(0, len(words), 2):\n",
" p.add_layout(bokeh.models.Arrow(end=bokeh.models.NormalHead(line_color=\"black\", line_width=1),\n",
" x_start=v2d[i, 0], y_start=v2d[i, 1], x_end=v2d[i + 1, 0], y_end=v2d[i + 1, 1]))\n",
" \n",
" bokeh.io.show(p)\n",
" \n",
" \n",
"plot_mik([\"man\", \"woman\", \"king\", \"queen\", \"prince\", \"princess\"])"
] ]
}, },
{ {
...@@ -404,7 +561,7 @@ ...@@ -404,7 +561,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"token_sim = TokenSimilarity(\n", "token_sim = TokenSimilarity(\n",
" emb_sbert,\n", " the_embeddings[\"sbert\"],\n",
" CosineSimilarity())\n", " CosineSimilarity())\n",
"\n", "\n",
"a = list(session.documents[0].spans(session.partition(\"document\")))[0][3]\n", "a = list(session.documents[0].spans(session.partition(\"document\")))[0][3]\n",
...@@ -769,10 +926,10 @@ ...@@ -769,10 +926,10 @@
" strategy_options={\"alignment\": vectorian.alignment.WatermanSmithBeyer()}),\n", " strategy_options={\"alignment\": vectorian.alignment.WatermanSmithBeyer()}),\n",
" \"wmd nbow\": make_index_builder(\n", " \"wmd nbow\": make_index_builder(\n",
" strategy=\"Alignment\",\n", " strategy=\"Alignment\",\n",
" strategy_options={\"alignment\": vectorian.alignment.WordMoversDistance.wmd(\"kusner\")}),\n", " strategy_options={\"alignment\": vectorian.alignment.WordMoversDistance.wmd(\"nbow\")}),\n",
" \"wmd bow\": make_index_builder(\n", " \"wmd bow\": make_index_builder(\n",
" strategy=\"Alignment\",\n", " strategy=\"Alignment\",\n",
" strategy_options={\"alignment\": vectorian.alignment.WordMoversDistance.wmd(\"vectorian\")}),\n", " strategy_options={\"alignment\": vectorian.alignment.WordMoversDistance.wmd(\"bow\")}),\n",
" \"doc embedding\": make_index_builder(\n", " \"doc embedding\": make_index_builder(\n",
" strategy=\"Partition Embedding\")\n", " strategy=\"Partition Embedding\")\n",
"})\n", "})\n",
...@@ -790,7 +947,7 @@ ...@@ -790,7 +947,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"vectorian.alignment.WordMoversDistance.wmd(\"kusner\").to_args(None)" "vectorian.alignment.WordMoversDistance.wmd(\"nbow\").to_args(None)"
] ]
}, },
{ {
...@@ -800,7 +957,7 @@ ...@@ -800,7 +957,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"vectorian.alignment.WordMoversDistance.wmd(\"vectorian\").to_args(None)" "vectorian.alignment.WordMoversDistance.wmd(\"bow\").to_args(None)"
] ]
}, },
{ {
...@@ -1019,7 +1176,7 @@ ...@@ -1019,7 +1176,7 @@
"id": "be0ea111-9c4a-4ec1-8dee-6a5b1a5ef619", "id": "be0ea111-9c4a-4ec1-8dee-6a5b1a5ef619",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Changing the Embedding" "# The Influence of Embeddings"
] ]
}, },
{ {
...@@ -1031,7 +1188,7 @@ ...@@ -1031,7 +1188,7 @@
"source": [ "source": [
"index_builders = {}\n", "index_builders = {}\n",
"\n", "\n",
"for e in [emb_fasttext, emb_numberbatch, emb_glove, emb_fasttext_numberbatch, emb_sbert]:\n", "for e in the_embeddings.values():\n",
" index_builders[e.name] = make_index_builder(\n", " index_builders[e.name] = make_index_builder(\n",
" strategy=\"Tag-Weighted Alignment\",\n", " strategy=\"Tag-Weighted Alignment\",\n",
" strategy_options={\n", " strategy_options={\n",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment