Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Melusina
vDHd Volume
burghardt_embed
Commits
20695e44
Commit
20695e44
authored
Jun 10, 2021
by
Bernhard Liebl
Browse files
Improve reliability and transparency of data sources used
parent
7e0c6fcf
Changes
7
Hide whitespace changes
Inline
Side-by-side
.gitignore
View file @
20695e44
.ipynb_checkpoints
.ipynb_checkpoints
*.pyc
*.pyc
.DS_Store
.DS_Store
/data/raw_data/vectorian_cache/embeddings
/data/raw_data/sentence_transformers/sbert.net_models_paraphrase-distilroberta-base-v1.zip
/data/raw_data/sentence_transformers/sbert.net_models_paraphrase-distilroberta-base-v1
/data/raw_data/numberbatch-19.08-en-pca-50.zip
README.md
View file @
20695e44
...
@@ -25,6 +25,8 @@ We introduce the Vectorian as a framework that allows researchers to try out dif
...
@@ -25,6 +25,8 @@ We introduce the Vectorian as a framework that allows researchers to try out dif
| | +-- sbert_contextual: precomputed Sentence-BERT contextual token embeddings for pattern phrases
| | +-- sbert_contextual: precomputed Sentence-BERT contextual token embeddings for pattern phrases
| +-- raw_data
| +-- raw_data
| | +-- gold.json: gold standard data for Shakespeare text reuse as JSON
| | +-- gold.json: gold standard data for Shakespeare text reuse as JSON
| | +-- sentence_transformers: will contain S-BERT model (downloaded in the notebook)
| | +-- vectorian_cache: will contain word embedding data (downloaded in the notebook)
+-- environment.yml: Python dependencies needed to run the notebook in a conda environment
+-- environment.yml: Python dependencies needed to run the notebook in a conda environment
+-- installation.md: additional documentation how to run this notebook locally or via Binder
+-- installation.md: additional documentation how to run this notebook locally or via Binder
+-- miscellaneous: various images used in the notebook
+-- miscellaneous: various images used in the notebook
...
...
code/nbutils.py
View file @
20695e44
...
@@ -38,6 +38,8 @@ from tqdm.autonotebook import tqdm
...
@@ -38,6 +38,8 @@ from tqdm.autonotebook import tqdm
from pathlib import Path
from contextlib import contextmanager

# Keep all Vectorian embedding downloads inside the repository's data tree
# so notebook runs are self-contained and reproducible.
# NOTE(review): this must run before the vectorian package reads its cache
# location — presumably at import time; confirm against the vectorian docs.
os.environ["VECTORIAN_CACHE_HOME"] = "data/raw_data/vectorian_cache"

# Developer-only escape hatch: when VECTORIAN_DEV is set in the environment,
# import the Vectorian C++ core from a local source checkout instead of the
# installed package.
if os.environ.get("VECTORIAN_DEV"):
    os.environ["VECTORIAN_CPP_IMPORT"] = "1"
    # NOTE(review): hard-coded developer machine path; only reachable when
    # VECTORIAN_DEV is set, but verify it is intended to stay committed.
    vectorian_path = Path("/Users/arbeit/Projects/vectorian-2021")
...
@@ -125,25 +127,13 @@ def monkey_patch_sentence_transformers_tqdm(desc):
...
@@ -125,25 +127,13 @@ def monkey_patch_sentence_transformers_tqdm(desc):
sentence_transformers
.
util
.
tqdm
=
old_tqdm
sentence_transformers
.
util
.
tqdm
=
old_tqdm
def make_nlp():
    """Build the spaCy pipeline used by the notebook.

    Loads ``en_core_web_sm`` with NER excluded and attaches a fixed
    Sentence-BERT component (``en_paraphrase_distilroberta_base_v1``).

    Returns:
        The configured spaCy ``Language`` object, with ``meta["name"]``
        set so derived caches can be keyed on the pipeline identity.
    """
    # uses 'tagger' from en_core_web_sm
    # we include 'parser' so that Vectorian can detect sentence boundaries
    with monkey_patch_sentence_transformers_tqdm("downloading Sentence BERT model"):
        # add_pipe may trigger the S-BERT model download, hence the patched
        # tqdm context around both calls.
        nlp = spacy.load('en_core_web_sm', exclude=['ner'])
        nlp.add_pipe('sentence_bert', config={'model_name': 'en_paraphrase_distilroberta_base_v1'})
    nlp.meta["name"] = "core_web_sm_AND_en_paraphrase_distilroberta_base_v1"
    return nlp
# the following function is adapted from:
# the following function is adapted from:
# https://gist.github.com/yanqd0/c13ed29e29432e3cf3e7c38467f42f51
# https://gist.github.com/yanqd0/c13ed29e29432e3cf3e7c38467f42f51
def
download
(
url
:
str
,
fname
:
str
):
def
download
(
url
:
str
,
fname
:
str
):
resp
=
requests
.
get
(
url
,
stream
=
True
)
resp
=
requests
.
get
(
url
,
stream
=
True
)
total
=
int
(
resp
.
headers
.
get
(
'content-length'
,
0
))
total
=
int
(
resp
.
headers
.
get
(
'content-length'
,
0
))
with
open
(
fname
,
'wb'
)
as
file
,
tqdm
(
with
open
(
fname
,
'wb'
)
as
file
,
tqdm
(
desc
=
url
,
desc
=
f
"Downloading
{
url
}
"
,
total
=
total
,
total
=
total
,
unit
=
'iB'
,
unit
=
'iB'
,
unit_scale
=
True
,
unit_scale
=
True
,
...
@@ -174,6 +164,30 @@ def download_word2vec_embedding(name, url):
...
@@ -174,6 +164,30 @@ def download_word2vec_embedding(name, url):
raise
ValueError
(
"zip file is empty"
)
raise
ValueError
(
"zip file is empty"
)
def make_nlp(sbert_model_name):
    """Assemble the spaCy pipeline used by the notebook.

    Loads ``en_core_web_sm`` (tagger + parser, NER excluded) and attaches
    a Sentence-BERT component named by *sbert_model_name*. The S-BERT
    model archive is fetched from Zenodo and unpacked on first use, then
    reused from the local cache on later runs.

    Args:
        sbert_model_name: model name passed to the ``sentence_bert``
            pipeline component.

    Returns:
        The configured spaCy ``Language`` object, with ``meta["name"]``
        set so derived caches can be keyed on the pipeline identity.
    """
    # uses 'tagger' from en_core_web_sm
    # we include 'parser' so that Vectorian can detect sentence boundaries
    cache_dir = Path("data/raw_data/sentence_transformers")
    # sentence-transformers must see its cache home before the model loads
    os.environ["SENTENCE_TRANSFORMERS_HOME"] = str(cache_dir)

    model_dir = cache_dir / "sbert.net_models_paraphrase-distilroberta-base-v1"
    zip_path = model_dir.parent / (model_dir.name + ".zip")

    # fetch the archive only once; later runs reuse the cached copy
    if not zip_path.exists():
        download(
            "https://zenodo.org/record/4923260/files/sbert.net_models_paraphrase-distilroberta-base-v1.zip",
            zip_path)

    # unpack only once; the extracted directory marks completion
    if not model_dir.is_dir():
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extractall(model_dir.parent)

    with monkey_patch_sentence_transformers_tqdm("Downloading Sentence BERT model"):
        # add_pipe may still trigger a download, hence the patched tqdm scope
        nlp = spacy.load('en_core_web_sm', exclude=['ner'])
        nlp.add_pipe('sentence_bert', config={'model_name': sbert_model_name})

    nlp.meta["name"] = "core_web_sm_AND_" + sbert_model_name
    return nlp
def occ_digest(occ, n=80):
    """Return a short one-line preview of an occurrence.

    Builds ``"<work>: <context>"`` from the occurrence's source work and
    evidence context, truncates it to at most *n* characters, and appends
    a trailing ellipsis.

    Args:
        occ: occurrence object exposing ``source.work`` and
            ``evidence.context``.
        n: maximum number of characters kept before the ellipsis.

    Returns:
        The truncated digest string ending in ``"..."``.
    """
    head = f"{occ.source.work}: {occ.evidence.context}"
    return head[:n] + "..."
...
...
data/raw_data/sentence_transformers/README.md
0 → 100644
View file @
20695e44
Sentence-BERT models from https://github.com/UKPLab/sentence-transformers are
stored here. They will be downloaded from Zenodo during the first run of the
notebook.
data/raw_data/vectorian_cache/README.md
0 → 100644
View file @
20695e44
Data downloaded and cached from the Vectorian API ends up here. It will
be downloaded from Zenodo during the first run of the notebook.
environment.yml
View file @
20695e44
...
@@ -11,7 +11,7 @@ dependencies:
...
@@ -11,7 +11,7 @@ dependencies:
-
beautifulsoup4==4.9
-
beautifulsoup4==4.9
-
pip
-
pip
-
pip
:
-
pip
:
-
vectorian==0.8.3dev
3
-
vectorian==0.8.3dev
4
-
sentence-transformers==0.4
-
sentence-transformers==0.4
-
jupyter_bokeh==3.0.0
-
jupyter_bokeh==3.0.0
-
opentsne==0.6.0
-
opentsne==0.6.0
...
...
publication.ipynb
View file @
20695e44
This source diff could not be displayed because it is too large. You can
view the blob
instead.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment