diff --git a/README.md b/README.md index 3fe5d47..1793b9c 100644 --- a/README.md +++ b/README.md @@ -1,38 +1,61 @@ # Simple ELMo -Minimal Python code to get vectors from pre-trained ELMo models in TensorFlow. +_simple_elmo_ is a Python library to work with pre-trained [ELMo embeddings](https://allennlp.org/elmo) in TensorFlow. -Heavily based on https://github.com/allenai/bilm-tf. -Requires Python >= 3.6 +This is a significantly updated wrapper to the [original ELMo implementation](https://github.com/allenai/bilm-tf). +The main changes are: +- more convenient and transparent data loading (including from compressed files) +- code adapted to modern TensorFlow versions (including TensorFlow 2). -The main changes: -- more convenient data loading (including from compressed files) -- code adapted to recent TensorFlow versions (including TF 2.0). +# Usage -# Usage example +`pip install simple_elmo` -`python3 get_elmo_vectors.py -i test.txt -e ~/PATH_TO_ELMO/` + `model = ElmoModel()` + + `model.load(PATH_TO_ELMO)` + + `elmo_vectors = model.get_elmo_vectors(SENTENCES)` + + `averaged_vectors = model.get_elmo_vector_average(SENTENCES)` `PATH_TO_ELMO` is a ZIP archive downloaded from the [NLPL vector repository](http://vectors.nlpl.eu/repository/), -or a directory containing 3 files extracted from such an archive: +OR a directory containing 3 files extracted from such an archive: - `model.hdf5`, pre-trained ELMo weights in HDF5 format; - `options.json`, description of the model architecture in JSON; - `vocab.txt`/`vocab.txt.gz`, one-word-per-line vocabulary of the most frequent words you would like to cache during inference (not really necessary, the model will infer embeddings for OOV words from their characters). -Use the `elmo_vectors` tensor for your downstream tasks. -Its dimensions are: (number of sentences, the length of the longest sentence, ELMo dimensionality). +`SENTENCES` is a list of sentences (lists of words). 
+ +Use the `elmo_vectors` and `averaged_vectors` tensors for your downstream tasks. -# Text classification +`elmo_vectors` contains contextualized word embeddings. Its shape is: (number of sentences, the length of the longest sentence, ELMo dimensionality). -Use this code to perform document pair classification (like in text entailment or paraphrase detection). +`averaged_vectors` contains one vector per input sentence, +constructed by averaging individual contextualized word embeddings. +It is a list of vectors (each vector has the shape (ELMo dimensionality)). + + +# Example scripts + +We provide two example scripts to make it easier to start using _simple_elmo_ right away: +- [Token embeddings](https://github.com/ltgoslo/simple_elmo/blob/master/simple_elmo/get_elmo_vectors.py) + +`python3 get_elmo_vectors.py -i test.txt -e ~/PATH_TO_ELMO/` + +- [Text classification](https://github.com/ltgoslo/simple_elmo/blob/master/simple_elmo/text_classification.py) + +`python3 text_classification.py -i paraphrases_lemm.tsv.gz -e ~/PATH_TO_ELMO/` + +The second script can be used to perform document pair classification (like in text entailment or paraphrase detection). Simple average of ELMo embeddings for all words in a document is used; then, the cosine similarity between two documents is calculated and used as a classifier feature. -Example datasets for Russian (adapted from http://paraphraser.ru/): +Example paraphrase datasets for Russian (adapted from http://paraphraser.ru/): - https://rusvectores.org/static/testsets/paraphrases.tsv.gz - https://rusvectores.org/static/testsets/paraphrases_lemm.tsv.gz (lemmatized) -`python3 text_classification.py -i paraphrases_lemm.tsv.gz -e ~/PATH_TO_ELMO/` +The library requires Python >= 3.6. 
diff --git a/setup.py b/setup.py index 681df45..44216df 100644 --- a/setup.py +++ b/setup.py @@ -5,19 +5,20 @@ setuptools.setup( name="simple_elmo", - version="0.1.0", + version="0.2.0", author="Andrey Kutuzov", author_email="andreku@ifi.uio.no", - description="Useful library to work with pre-trained ELMo embeddings in TensorFlow ", + description="Handy library to work with pre-trained ELMo embeddings in TensorFlow", long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/ltgoslo/simple_elmo", packages=setuptools.find_packages(), python_requires='>=3.6', - install_requires=["tensorflow>1.15", "h5py", "numpy", "smart_open>1.8.1", "pandas", "scikit-learn"], + install_requires=["tensorflow>1.15", "h5py", "numpy", "smart_open>1.8.1", "pandas", + "scikit-learn"], classifiers=[ "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", + "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Utilities" diff --git a/simple_elmo/__init__.py b/simple_elmo/__init__.py index 0e5d667..4c7d53e 100644 --- a/simple_elmo/__init__.py +++ b/simple_elmo/__init__.py @@ -1,5 +1,5 @@ name = "simple_elmo" from simple_elmo.elmo_helpers import ElmoModel, divide_chunks -from data import Batcher -from model import BidirectionalLanguageModel -from elmo import weight_layers +from simple_elmo.data import Batcher +from simple_elmo.model import BidirectionalLanguageModel +from simple_elmo.elmo import weight_layers diff --git a/simple_elmo/elmo_helpers.py b/simple_elmo/elmo_helpers.py index 8880b04..0a0b712 100644 --- a/simple_elmo/elmo_helpers.py +++ b/simple_elmo/elmo_helpers.py @@ -6,13 +6,13 @@ import os import numpy as np import tensorflow as tf -from data import Batcher -from model import BidirectionalLanguageModel -from elmo import weight_layers from sklearn import preprocessing import json import 
zipfile import logging +from simple_elmo.data import Batcher +from simple_elmo.model import BidirectionalLanguageModel +from simple_elmo.elmo import weight_layers class ElmoModel: @@ -34,7 +34,7 @@ def __init__(self): logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) self.logger = logging.getLogger(__name__) - def load(self, directory, top=False, max_batch_size=128): + def load(self, directory, top=False, max_batch_size=96): # Loading a pre-trained ELMo model: # You can call load with top=True to use only the top ELMo layer """ diff --git a/simple_elmo/get_elmo_vectors.py b/simple_elmo/get_elmo_vectors.py old mode 100644 new mode 100755 index a999098..eacf6ba --- a/simple_elmo/get_elmo_vectors.py +++ b/simple_elmo/get_elmo_vectors.py @@ -2,9 +2,9 @@ # coding: utf-8 import argparse -from elmo_helpers import ElmoModel, tokenize -from smart_open import open +from simple_elmo import ElmoModel import numpy as np +from smart_open import open if __name__ == '__main__': parser = argparse.ArgumentParser() @@ -27,7 +27,7 @@ raw_sentences.append(res) if len(raw_sentences) > max_sentences: break - sentences = [tokenize(s, limit=100) for s in raw_sentences] + sentences = [s.split()[:100] for s in raw_sentences] print('=====') print(f'{len(sentences)} sentences total') @@ -35,7 +35,7 @@ model = ElmoModel() - model.load(args.elmo, top=True) + model.load(args.elmo, top=False) # Actually producing ELMo embeddings for our data: @@ -57,7 +57,7 @@ # A quick test: # in each sentence, we find the tokens most similar to a given token of a given sentence query_sentence_nr = -2 - query_word_nr = 0 + query_word_nr = 1 query_word = sentences[query_sentence_nr][query_word_nr] print(f"Query sentence: {sentences[query_sentence_nr]}") print(f"Query: {query_word}") diff --git a/simple_elmo/model.py b/simple_elmo/model.py index 0acb7e9..faa3d1e 100644 --- a/simple_elmo/model.py +++ b/simple_elmo/model.py @@ -6,8 +6,7 @@ import numpy as np import tensorflow 
as tf from zipfile import ZipExtFile - -from data import UnicodeCharsVocabulary, Batcher, InvalidNumberOfCharacters +from simple_elmo.data import UnicodeCharsVocabulary, Batcher, InvalidNumberOfCharacters DTYPE = 'float32' DTYPE_INT = 'int64' diff --git a/simple_elmo/text_classification.py b/simple_elmo/text_classification.py index 35a40ad..61fa979 100755 --- a/simple_elmo/text_classification.py +++ b/simple_elmo/text_classification.py @@ -7,9 +7,9 @@ from sklearn.neural_network import MLPClassifier from sklearn.model_selection import cross_validate from sklearn.dummy import DummyClassifier -from elmo_helpers import ElmoModel import pandas as pd import numpy as np +from simple_elmo import ElmoModel # You can use this code to perform document pair classification # (like in text entailment or paraphrase detection).