Commit 3470c3e3 authored by Bernhard Liebl's avatar Bernhard Liebl
Browse files

better progress bars, fix upload demo

parent b1c8a1af
...@@ -34,8 +34,9 @@ from functools import partial ...@@ -34,8 +34,9 @@ from functools import partial
from cached_property import cached_property from cached_property import cached_property
from IPython.core.display import HTML, display from IPython.core.display import HTML, display
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from tqdm import tqdm from tqdm.autonotebook import tqdm
from pathlib import Path from pathlib import Path
from contextlib import contextmanager
if os.environ.get("VECTORIAN_DEV"): if os.environ.get("VECTORIAN_DEV"):
os.environ["VECTORIAN_CPP_IMPORT"] = "1" os.environ["VECTORIAN_CPP_IMPORT"] = "1"
...@@ -109,22 +110,54 @@ def make_limited_function_warning_widget(action_text): ...@@ -109,22 +110,54 @@ def make_limited_function_warning_widget(action_text):
layout=widgets.Layout(border='solid 1px #F9E79F')) layout=widgets.Layout(border='solid 1px #F9E79F'))
@contextmanager
def monkey_patch_sentence_transformers_tqdm(desc):
# patch progress so we know that it's actually sentence_transformers we are seeing,
# since sentence_transformers does not set "desc" on its tqdm instance :-/
from functools import partial
import sentence_transformers.util
old_tqdm = sentence_transformers.util.tqdm
sentence_transformers.util.tqdm = partial(tqdm, desc=desc)
try:
yield
finally:
sentence_transformers.util.tqdm = old_tqdm
def make_nlp(): def make_nlp():
# uses 'tagger' from en_core_web_sm # uses 'tagger' from en_core_web_sm
# we include 'parser' so that Vectorian can detect sentence boundaries # we include 'parser' so that Vectorian can detect sentence boundaries
nlp = spacy.load('en_core_web_sm', exclude=['ner']) with monkey_patch_sentence_transformers_tqdm("downloading Sentence BERT model"):
nlp.add_pipe('sentence_bert', config={'model_name': 'en_paraphrase_distilroberta_base_v1'}) nlp = spacy.load('en_core_web_sm', exclude=['ner'])
nlp.meta["name"] = "core_web_sm_AND_en_paraphrase_distilroberta_base_v1" nlp.add_pipe('sentence_bert', config={'model_name': 'en_paraphrase_distilroberta_base_v1'})
return nlp nlp.meta["name"] = "core_web_sm_AND_en_paraphrase_distilroberta_base_v1"
return nlp
# the following function is adapted from:
# https://gist.github.com/yanqd0/c13ed29e29432e3cf3e7c38467f42f51
def download(url: str, fname: str):
resp = requests.get(url, stream=True)
total = int(resp.headers.get('content-length', 0))
with open(fname, 'wb') as file, tqdm(
desc=url,
total=total,
unit='iB',
unit_scale=True,
unit_divisor=1024,
) as bar:
for data in resp.iter_content(chunk_size=1024):
size = file.write(data)
bar.update(size)
def download_word2vec_embedding(name, url): def download_word2vec_embedding(name, url):
data_path = Path(f"{name}.zip") data_path = Path(f"{name}.zip")
if not data_path.exists(): if not data_path.exists():
resp = requests.get(url) download(url, data_path)
with open(data_path, "wb") as f:
f.write(resp.content)
with zipfile.ZipFile(data_path, 'r') as zf: with zipfile.ZipFile(data_path, 'r') as zf:
for zi in zf.infolist(): for zi in zf.infolist():
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment