Additional imports
akutuzov committed Sep 14, 2020
1 parent 08374ac commit b83ac45
Showing 7 changed files with 57 additions and 34 deletions.
53 changes: 38 additions & 15 deletions README.md
@@ -1,38 +1,61 @@
 # Simple ELMo
-Minimal Python code to get vectors from pre-trained ELMo models in TensorFlow.
+_simple_elmo_ is a Python library to work with pre-trained [ELMo embeddings](https://allennlp.org/elmo) in TensorFlow.
 
-Heavily based on https://github.com/allenai/bilm-tf.
-Requires Python >= 3.6
+This is a significantly updated wrapper to the [original ELMo implementation](https://github.com/allenai/bilm-tf).
+The main changes are:
+- more convenient and transparent data loading (including from compressed files)
+- code adapted to modern TensorFlow versions (including TensorFlow 2).
 
-The main changes:
-- more convenient data loading (including from compressed files)
-- code adapted to recent TensorFlow versions (including TF 2.0).
+# Usage
 
-# Usage example
+`pip install simple_elmo`
 
-`python3 get_elmo_vectors.py -i test.txt -e ~/PATH_TO_ELMO/`
+`model = ElmoModel()`
 
+`model.load(PATH_TO_ELMO)`
+
+`elmo_vectors = model.get_elmo_vectors(SENTENCES)`
+
+`averaged_vectors = model.get_elmo_vector_average(SENTENCES)`
+
 `PATH_TO_ELMO` is a ZIP archive downloaded from the [NLPL vector repository](http://vectors.nlpl.eu/repository/),
-or a directory containing 3 files extracted from such an archive:
+OR a directory containing 3 files extracted from such an archive:
 - `model.hdf5`, pre-trained ELMo weights in HDF5 format;
 - `options.json`, description of the model architecture in JSON;
 - `vocab.txt`/`vocab.txt.gz`, one-word-per-line vocabulary of the most frequent words you would like to cache during inference
 (not really necessary, the model will infer embeddings for OOV words from their characters).
 
-Use the `elmo_vectors` tensor for your downstream tasks.
-Its dimensions are: (number of sentences, the length of the longest sentence, ELMo dimensionality).
+`SENTENCES` is a list of sentences (lists of words).
 
+Use the `elmo_vectors` and `averaged_vectors` tensors for your downstream tasks.
+
-# Text classification
+`elmo_vectors` contains contextualized word embeddings. Its shape is: (number of sentences, the length of the longest sentence, ELMo dimensionality).
 
-Use this code to perform document pair classification (like in text entailment or paraphrase detection).
+`averaged_vectors` contains one vector per each input sentence,
+constructed by averaging individual contextualized word embeddings.
+It is a list of vectors (the shape is (ELMo dimensionality)).
+
+
+# Example scripts
+
+We provide two example scripts to make it easier to start using _simple_elmo_ right away:
+- [Token embeddings](https://github.com/ltgoslo/simple_elmo/blob/master/simple_elmo/get_elmo_vectors.py)
+
+`python3 get_elmo_vectors.py -i test.txt -e ~/PATH_TO_ELMO/`
+
+- [Text classification](https://github.com/ltgoslo/simple_elmo/blob/master/simple_elmo/text_classification.py)
+
+`python3 text_classification.py -i paraphrases_lemm.tsv.gz -e ~/PATH_TO_ELMO/`
+
+The second script can be used to perform document pair classification (like in text entailment or paraphrase detection).
 
 Simple average of ELMo embeddings for all words in a document is used;
 then, the cosine similarity between two documents is calculated and used as a classifier feature.
 
-Example datasets for Russian (adapted from http://paraphraser.ru/):
+Example paraphrase datasets for Russian (adapted from http://paraphraser.ru/):
 - https://rusvectores.org/static/testsets/paraphrases.tsv.gz
 - https://rusvectores.org/static/testsets/paraphrases_lemm.tsv.gz (lemmatized)
 
-`python3 text_classification.py -i paraphrases_lemm.tsv.gz -e ~/PATH_TO_ELMO/`
 
+The library requires Python >= 3.6.

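To see how the usage lines documented above fit together, here is a minimal end-to-end sketch; the model path and the example sentences are placeholders, while the class and method names are the ones the README documents:

```python
from simple_elmo import ElmoModel

PATH_TO_ELMO = "/path/to/elmo"  # placeholder: a ZIP from the NLPL repository or a directory with the 3 files

# SENTENCES is a list of sentences, each sentence being a list of words:
sentences = [["This", "is", "a", "test", "."],
             ["ELMo", "produces", "contextualized", "word", "embeddings", "."]]

model = ElmoModel()
model.load(PATH_TO_ELMO)

# shape: (number of sentences, length of the longest sentence, ELMo dimensionality)
elmo_vectors = model.get_elmo_vectors(sentences)

# one averaged vector per input sentence, each of shape (ELMo dimensionality)
averaged_vectors = model.get_elmo_vector_average(sentences)
```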
9 changes: 5 additions & 4 deletions setup.py
@@ -5,19 +5,20 @@
 
 setuptools.setup(
     name="simple_elmo",
-    version="0.1.0",
+    version="0.2.0",
     author="Andrey Kutuzov",
     author_email="[email protected]",
-    description="Useful library to work with pre-trained ELMo embeddings in TensorFlow ",
+    description="Handy library to work with pre-trained ELMo embeddings in TensorFlow",
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/ltgoslo/simple_elmo",
     packages=setuptools.find_packages(),
     python_requires='>=3.6',
-    install_requires=["tensorflow>1.15", "h5py", "numpy", "smart_open>1.8.1", "pandas", "scikit-learn"],
+    install_requires=["tensorflow>1.15", "h5py", "numpy", "smart_open>1.8.1", "pandas",
+                      "scikit-learn"],
     classifiers=[
         "Programming Language :: Python :: 3",
-        "License :: OSI Approved :: MIT License",
+        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
         "Operating System :: OS Independent",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
         "Topic :: Utilities"
6 changes: 3 additions & 3 deletions simple_elmo/__init__.py
@@ -1,5 +1,5 @@
 name = "simple_elmo"
 from simple_elmo.elmo_helpers import ElmoModel, divide_chunks
-from data import Batcher
-from model import BidirectionalLanguageModel
-from elmo import weight_layers
+from simple_elmo.data import Batcher
+from simple_elmo.model import BidirectionalLanguageModel
+from simple_elmo.elmo import weight_layers
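These package-qualified imports are what let the public names resolve once _simple_elmo_ is installed as a package rather than run from its source checkout. A minimal sanity check, assuming the package has been installed with pip:

```python
# All of these names are re-exported in simple_elmo/__init__.py, as shown in the diff above.
from simple_elmo import ElmoModel, Batcher, BidirectionalLanguageModel, weight_layers

model = ElmoModel()   # nothing is loaded yet; model.load() still needs a path to the ELMo files
print(type(model).__name__)
```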
8 changes: 4 additions & 4 deletions simple_elmo/elmo_helpers.py
@@ -6,13 +6,13 @@
 import os
 import numpy as np
 import tensorflow as tf
-from data import Batcher
-from model import BidirectionalLanguageModel
-from elmo import weight_layers
 from sklearn import preprocessing
 import json
 import zipfile
 import logging
+from simple_elmo.data import Batcher
+from simple_elmo.model import BidirectionalLanguageModel
+from simple_elmo.elmo import weight_layers
 
 
 class ElmoModel:
@@ -34,7 +34,7 @@ def __init__(self):
         logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
         self.logger = logging.getLogger(__name__)
 
-    def load(self, directory, top=False, max_batch_size=128):
+    def load(self, directory, top=False, max_batch_size=96):
         # Loading a pre-trained ELMo model:
         # You can call load with top=True to use only the top ELMo layer
         """
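For reference, a small sketch of how the two optional arguments in the new `load()` signature might be used; the path is a placeholder, and reading `max_batch_size` as the largest number of sentences processed at once is an assumption based on its name:

```python
from simple_elmo import ElmoModel

model = ElmoModel()

# top=True: use only the top ELMo layer, as the comment in the diff above says;
# max_batch_size: presumably the largest batch processed at once (the default is now 96).
model.load("/path/to/elmo", top=True, max_batch_size=32)
```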
10 changes: 5 additions & 5 deletions simple_elmo/get_elmo_vectors.py
File mode changed: 100644 → 100755
@@ -2,9 +2,9 @@
 # coding: utf-8
 
 import argparse
-from elmo_helpers import ElmoModel, tokenize
-from smart_open import open
+from simple_elmo import ElmoModel
 import numpy as np
+from smart_open import open
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
@@ -27,15 +27,15 @@
             raw_sentences.append(res)
             if len(raw_sentences) > max_sentences:
                 break
-    sentences = [tokenize(s, limit=100) for s in raw_sentences]
+    sentences = [s.split()[:100] for s in raw_sentences]
 
     print('=====')
     print(f'{len(sentences)} sentences total')
     print('=====')
 
     model = ElmoModel()
 
-    model.load(args.elmo, top=True)
+    model.load(args.elmo, top=False)
 
     # Actually producing ELMo embeddings for our data:
 
@@ -57,7 +57,7 @@
     # A quick test:
     # in each sentence, we find the tokens most similar to a given token of a given sentence
     query_sentence_nr = -2
-    query_word_nr = 0
+    query_word_nr = 1
     query_word = sentences[query_sentence_nr][query_word_nr]
     print(f"Query sentence: {sentences[query_sentence_nr]}")
     print(f"Query: {query_word}")
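The quick test described in the comments above (finding, in each sentence, the token most similar to a query token) boils down to a cosine similarity search over the contextualized vectors. Below is a standalone sketch with toy data; the actual script may differ in its details:

```python
import numpy as np


def most_similar_token(query_vec, token_vectors, tokens):
    """Return the token whose contextualized vector is closest to query_vec by cosine similarity."""
    norms = np.linalg.norm(token_vectors, axis=1) * np.linalg.norm(query_vec) + 1e-9
    sims = token_vectors @ query_vec / norms
    best = int(np.argmax(sims))
    return tokens[best], float(sims[best])


# With real data, query_vec would be elmo_vectors[-2][1] (sentence -2, token 1, as in the script),
# and token_vectors would be elmo_vectors[i][:len(sentences[i])] for each sentence i.
rng = np.random.default_rng(0)
toy_vectors = rng.normal(size=(4, 1024))   # four fake token vectors of ELMo-like dimensionality
toy_tokens = ["this", "is", "a", "test"]
print(most_similar_token(toy_vectors[1], toy_vectors, toy_tokens))
```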
3 changes: 1 addition & 2 deletions simple_elmo/model.py
@@ -6,8 +6,7 @@
 import numpy as np
 import tensorflow as tf
 from zipfile import ZipExtFile
-
-from data import UnicodeCharsVocabulary, Batcher, InvalidNumberOfCharacters
+from simple_elmo.data import UnicodeCharsVocabulary, Batcher, InvalidNumberOfCharacters
 
 DTYPE = 'float32'
 DTYPE_INT = 'int64'
2 changes: 1 addition & 1 deletion simple_elmo/text_classification.py
@@ -7,9 +7,9 @@
 from sklearn.neural_network import MLPClassifier
 from sklearn.model_selection import cross_validate
 from sklearn.dummy import DummyClassifier
-from elmo_helpers import ElmoModel
 import pandas as pd
 import numpy as np
+from simple_elmo import ElmoModel
 
 # You can use this code to perform document pair classification
 # (like in text entailment or paraphrase detection).
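As the README explains, this script averages ELMo embeddings over all words in a document and then feeds the cosine similarity of each document pair to a classifier as a single feature. A self-contained sketch of that pipeline, with toy vectors standing in for real model output (the variable names are illustrative, not the script's own):

```python
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate


def cosine(u, v):
    """Cosine similarity between two 1-D vectors."""
    return float(u @ v / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-9))


# With real data, these would come from model.get_elmo_vector_average(...)
# for the two documents of each pair.
rng = np.random.default_rng(42)
doc_vectors_a = rng.normal(size=(100, 1024))   # toy stand-ins for averaged ELMo vectors
doc_vectors_b = rng.normal(size=(100, 1024))
labels = rng.integers(0, 2, size=100)          # toy paraphrase / non-paraphrase labels

# One feature per document pair: the cosine similarity of the two averaged vectors.
features = np.array([[cosine(a, b)] for a, b in zip(doc_vectors_a, doc_vectors_b)])

scores = cross_validate(MLPClassifier(max_iter=500), features, labels, cv=5, scoring="f1_macro")
print(scores["test_score"].mean())
```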
