From baa75c2e241cdcf22ebbb81e010fea7a9a304d4a Mon Sep 17 00:00:00 2001
From: Andrey Kutuzov
Date: Mon, 19 Oct 2020 20:39:53 +0200
Subject: [PATCH] Default warmup for all methods (improves performance)

---
 setup.py                    |  2 +-
 simple_elmo/elmo_helpers.py | 30 +++++++++++++++++++++++++-----
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/setup.py b/setup.py
index b468ae2..03478be 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="simple_elmo",
-    version="0.4.1",
+    version="0.5.0",
     author="Andrey Kutuzov",
     author_email="andreku@ifi.uio.no",
     description="Handy library to work with pre-trained ELMo embeddings in TensorFlow",
diff --git a/simple_elmo/elmo_helpers.py b/simple_elmo/elmo_helpers.py
index 2582972..c707449 100644
--- a/simple_elmo/elmo_helpers.py
+++ b/simple_elmo/elmo_helpers.py
@@ -115,9 +115,10 @@ def load(self, directory, top=False, max_batch_size=32, limit=100):
 
         return self.batcher, self.sentence_character_ids, self.elmo_sentence_input, self.batch_size
 
-    def get_elmo_vectors(self, texts):
+    def get_elmo_vectors(self, texts, warmup=True):
         """
         :param texts: list of sentences (lists of words)
+        :param warmup: warm up the model before actual inference (by running it over the 1st batch)
         :return: embedding matrix for all sentences (max word count by vector size)
         """
 
@@ -130,6 +131,9 @@ def get_elmo_vectors(self, texts):
         # It is necessary to initialize variables once before running inference.
         sess.run(tf.compat.v1.global_variables_initializer())
 
+        if warmup:
+            self.warmup(sess, texts)
+
         # Running batches:
         chunk_counter = 0
         for chunk in divide_chunks(texts, self.batch_size):
@@ -148,9 +152,10 @@ def get_elmo_vectors(self, texts):
 
         return final_vectors
 
-    def get_elmo_vector_average(self, texts):
+    def get_elmo_vector_average(self, texts, warmup=True):
         """
         :param texts: list of sentences (lists of words)
+        :param warmup: warm up the model before actual inference (by running it over the 1st batch)
         :return: matrix of averaged embeddings for all sentences
         """
         average_vectors = np.zeros((len(texts), self.vector_size))
@@ -161,6 +166,9 @@ def get_elmo_vector_average(self, texts):
         # It is necessary to initialize variables once before running inference.
         sess.run(tf.compat.v1.global_variables_initializer())
 
+        if warmup:
+            self.warmup(sess, texts)
+
         # Running batches:
         for chunk in divide_chunks(texts, self.batch_size):
             # Converting sentences to character ids:
@@ -185,6 +193,20 @@ def get_elmo_vector_average(self, texts):
 
         return average_vectors
 
+    def warmup(self, sess, texts):
+        for chunk0 in divide_chunks(texts, self.batch_size):
+            self.logger.info(f"Warming up ELMo on {len(chunk0)} sentences...")
+            sentence_ids = self.batcher.batch_sentences(chunk0)
+            _ = sess.run(self.elmo_sentence_input['weighted_op'],
+                         feed_dict={self.sentence_character_ids: sentence_ids})
+            break
+        self.logger.info("Warming up finished.")
+
+
+def divide_chunks(data, n):
+    for i in range(0, len(data), n):
+        yield data[i:i + n]
+
 
 def tokenize(string, limit=None):
     """
@@ -198,6 +220,4 @@ def tokenize(string, limit=None):
     return tokens
 
 
-def divide_chunks(data, n):
-    for i in range(0, len(data), n):
-        yield data[i:i + n]
+
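
A minimal usage sketch for the new warmup argument, assuming the ElmoModel wrapper class exposed by simple_elmo; the model path and example sentences below are placeholders:

    from simple_elmo import ElmoModel

    model = ElmoModel()
    model.load("path/to/elmo_model")  # placeholder: directory with a pre-trained ELMo model

    sentences = [["This", "is", "a", "sentence"], ["Another", "one", "here"]]

    # Default after this patch: the first batch is run once as a warmup pass
    # before the actual inference loop.
    token_vectors = model.get_elmo_vectors(sentences)

    # Pass warmup=False to keep the previous single-pass behaviour.
    averaged = model.get_elmo_vector_average(sentences, warmup=False)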