From 14b305f7620396ede6e735a86e09b351ac707a25 Mon Sep 17 00:00:00 2001 From: Zach Hafen-Saavedra Date: Sat, 23 Mar 2024 11:07:32 -0500 Subject: [PATCH 01/18] Updated quickstart. --- src/examples/quickstart.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/examples/quickstart.ipynb b/src/examples/quickstart.ipynb index 838d488..49a6aca 100644 --- a/src/examples/quickstart.ipynb +++ b/src/examples/quickstart.ipynb @@ -71,7 +71,7 @@ " atl=atl,\n", " crt=crt,\n", " atlas_dir=\"atlas\",\n", - " target_size=1000,\n", + " target_size=10000,\n", " center=atl.center,\n", ")" ] From efd4948da2f299f5a5a7f3f3d0ed90fa34186df1 Mon Sep 17 00:00:00 2001 From: Zach Hafen-Saavedra Date: Sat, 23 Mar 2024 11:10:24 -0500 Subject: [PATCH 02/18] More-restrictive gitignore. --- .gitignore | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitignore b/.gitignore index b5b13f8..ffbc9d0 100644 --- a/.gitignore +++ b/.gitignore @@ -162,3 +162,9 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ + +# Only include models if forced +*.model + +# Only include vscode settings if forced. +.vscode/settings.json From b0268896fc19113eef794e22d6c0f94505a3cde0 Mon Sep 17 00:00:00 2001 From: Zach Hafen-Saavedra Date: Sat, 23 Mar 2024 11:18:12 -0500 Subject: [PATCH 03/18] New huggingface exploration notebook. 
--- src/examples/scratch/huggingface.ipynb | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 src/examples/scratch/huggingface.ipynb diff --git a/src/examples/scratch/huggingface.ipynb b/src/examples/scratch/huggingface.ipynb new file mode 100644 index 0000000..e19ebaa --- /dev/null +++ b/src/examples/scratch/huggingface.ipynb @@ -0,0 +1,18 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Example Notebook for Integrating with Hugging Face" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 361d941f3179af76d059bb94c69287916cbfa771 Mon Sep 17 00:00:00 2001 From: Zach Hafen-Saavedra Date: Mon, 25 Mar 2024 11:20:03 -0500 Subject: [PATCH 04/18] Hugging face example notebook. --- src/examples/scratch/huggingface.ipynb | 99 +++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 1 deletion(-) diff --git a/src/examples/scratch/huggingface.ipynb b/src/examples/scratch/huggingface.ipynb index e19ebaa..42d95a8 100644 --- a/src/examples/scratch/huggingface.ipynb +++ b/src/examples/scratch/huggingface.ipynb @@ -6,11 +6,108 @@ "source": [ "# Example Notebook for Integrating with Hugging Face" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import bitsandbytes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoModelForCausalLM\n", + "\n", + "model = AutoModelForCausalLM.from_pretrained(\n", + " '/Users/zhafensaavedra/repos/llama2/llama.cpp/models/7B',\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import BitsAndBytesConfig\n", + "\n", + "quantization_config = BitsAndBytesConfig(load_in_4bit=True)\n", + "\n", + "model_str = \"meta-llama/Llama-2-7b-hf\"\n", + "model = 
AutoModelForCausalLM.from_pretrained(\n", + " model_str,\n", + " device_map=\"cpu\",\n", + " quantization_config=quantization_config,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(model_str, padding_side=\"left\")\n", + "model_inputs = tokenizer([\"A list of colors: red, blue\"], return_tensors=\"pt\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generated_ids = model.generate(**model_inputs)\n", + "tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default\n", + "model_inputs = tokenizer(\n", + " [\"A list of colors: red, blue\", \"Portugal is\"], return_tensors=\"pt\", padding=True\n", + ").to(\"cuda\")\n", + "generated_ids = model.generate(**model_inputs)\n", + "tokenizer.batch_decode(generated_ids, skip_special_tokens=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { + "kernelspec": { + "display_name": "llm", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" } }, "nbformat": 4, From 2dfd3b0c4dfd216b14d2501c05b099f8a485c191 Mon Sep 17 00:00:00 2001 From: Zach Hafen-Saavedra Date: Tue, 26 Mar 2024 15:04:37 -0500 Subject: [PATCH 05/18] huggingface.ipynb now focuses on Inference Endpoints. 
--- src/examples/scratch/huggingface.ipynb | 66 ++++++++++++++------------ 1 file changed, 36 insertions(+), 30 deletions(-) diff --git a/src/examples/scratch/huggingface.ipynb b/src/examples/scratch/huggingface.ipynb index 42d95a8..a26c2d3 100644 --- a/src/examples/scratch/huggingface.ipynb +++ b/src/examples/scratch/huggingface.ipynb @@ -8,12 +8,10 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "import bitsandbytes" + "# Setup" ] }, { @@ -22,11 +20,7 @@ "metadata": {}, "outputs": [], "source": [ - "from transformers import AutoModelForCausalLM\n", - "\n", - "model = AutoModelForCausalLM.from_pretrained(\n", - " '/Users/zhafensaavedra/repos/llama2/llama.cpp/models/7B',\n", - ")" + "import huggingface_hub as hfhub" ] }, { @@ -35,15 +29,18 @@ "metadata": {}, "outputs": [], "source": [ - "from transformers import BitsAndBytesConfig\n", - "\n", - "quantization_config = BitsAndBytesConfig(load_in_4bit=True)\n", - "\n", - "model_str = \"meta-llama/Llama-2-7b-hf\"\n", - "model = AutoModelForCausalLM.from_pretrained(\n", - " model_str,\n", - " device_map=\"cpu\",\n", - " quantization_config=quantization_config,\n", + "# Settings\n", + "endpoint_name=\"my-endpoint-name\"\n", + "endpoint_config = dict(\n", + " repository=\"gpt2\",\n", + " framework=\"pytorch\",\n", + " task=\"text-generation\",\n", + " accelerator=\"cpu\",\n", + " vendor=\"aws\",\n", + " region=\"us-east-1\",\n", + " type=\"protected\",\n", + " instance_size=\"medium\",\n", + " instance_type=\"c6i\"\n", ")" ] }, @@ -53,10 +50,11 @@ "metadata": {}, "outputs": [], "source": [ - "from transformers import AutoTokenizer\n", - "\n", - "tokenizer = AutoTokenizer.from_pretrained(model_str, padding_side=\"left\")\n", - "model_inputs = tokenizer([\"A list of colors: red, blue\"], return_tensors=\"pt\")" + "# Login\n", + "token = hfhub.get_token()\n", + "if token is None:\n", + " hfhub.login()\n", + " token = hfhub.get_token()" ] }, 
{ @@ -65,8 +63,21 @@ "metadata": {}, "outputs": [], "source": [ - "generated_ids = model.generate(**model_inputs)\n", - "tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]" + "# get or create endpoint\n", + "endpoint_names = [\n", + " _.name for _ in hfhub.list_inference_endpoints(namespace=\"*\")\n", + "]\n", + "if endpoint_name in endpoint_names:\n", + " endpoint = hfhub.get_inference_endpoint(endpoint_name)\n", + "else:\n", + " endpoint = hfhub.create_inference_endpoint(endpoint_name, **endpoint_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Scratch" ] }, { @@ -75,12 +86,7 @@ "metadata": {}, "outputs": [], "source": [ - "tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default\n", - "model_inputs = tokenizer(\n", - " [\"A list of colors: red, blue\", \"Portugal is\"], return_tensors=\"pt\", padding=True\n", - ").to(\"cuda\")\n", - "generated_ids = model.generate(**model_inputs)\n", - "tokenizer.batch_decode(generated_ids, skip_special_tokens=True)" + "endpoint" ] }, { From dc9d28c3803e621c245f66d75752dbe680b65ee0 Mon Sep 17 00:00:00 2001 From: Zach Hafen-Saavedra Date: Tue, 26 Mar 2024 15:23:57 -0500 Subject: [PATCH 06/18] I have a full custom endpoint example. 
--- src/examples/scratch/hf_custom_endpoint.ipynb | 134 ++++++++++++++++++ src/examples/scratch/huggingface.ipynb | 23 ++- 2 files changed, 152 insertions(+), 5 deletions(-) create mode 100644 src/examples/scratch/hf_custom_endpoint.ipynb diff --git a/src/examples/scratch/hf_custom_endpoint.ipynb b/src/examples/scratch/hf_custom_endpoint.ipynb new file mode 100644 index 0000000..061e343 --- /dev/null +++ b/src/examples/scratch/hf_custom_endpoint.ipynb @@ -0,0 +1,134 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Example Notebook for Integrating with Hugging Face" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import huggingface_hub as hfhub" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Settings\n", + "endpoint_name=\"my-endpoint-name\"\n", + "endpoint_config = dict(\n", + " repository=\"gpt2\",\n", + " framework=\"pytorch\",\n", + " task=\"text-generation\",\n", + " accelerator=\"gpu\",\n", + " vendor=\"aws\",\n", + " region=\"us-east-1\",\n", + " type=\"protected\",\n", + " instance_size=\"small\",\n", + " instance_type=\"c6i\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Login\n", + "token = hfhub.get_token()\n", + "if token is None:\n", + " hfhub.login()\n", + " token = hfhub.get_token()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get or create endpoint\n", + "endpoint_names = [\n", + " _.name for _ in hfhub.list_inference_endpoints(namespace=\"*\")\n", + "]\n", + "if endpoint_name in endpoint_names:\n", + " endpoint = hfhub.get_inference_endpoint(endpoint_name)\n", + "else:\n", + " endpoint = hfhub.create_inference_endpoint(endpoint_name, 
**endpoint_config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Start the endpoint up\n", + "endpoint.wait()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "endpoint.client.text_generation(\"I am\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Shut down\n", + "endpoint.pause()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "llm", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/examples/scratch/huggingface.ipynb b/src/examples/scratch/huggingface.ipynb index a26c2d3..061e343 100644 --- a/src/examples/scratch/huggingface.ipynb +++ b/src/examples/scratch/huggingface.ipynb @@ -35,11 +35,11 @@ " repository=\"gpt2\",\n", " framework=\"pytorch\",\n", " task=\"text-generation\",\n", - " accelerator=\"cpu\",\n", + " accelerator=\"gpu\",\n", " vendor=\"aws\",\n", " region=\"us-east-1\",\n", " type=\"protected\",\n", - " instance_size=\"medium\",\n", + " instance_size=\"small\",\n", " instance_type=\"c6i\"\n", ")" ] @@ -74,10 +74,22 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Start the endpoint up\n", + "endpoint.wait()" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "# Scratch" + "endpoint.client.text_generation(\"I am\")" ] }, { @@ -86,7 +98,8 @@ "metadata": {}, 
"outputs": [], "source": [ - "endpoint" + "# Shut down\n", + "endpoint.pause()" ] }, { From 66540529a78ba03f730959bc0bb452178030dc57 Mon Sep 17 00:00:00 2001 From: Zach Hafen-Saavedra Date: Tue, 26 Mar 2024 15:31:17 -0500 Subject: [PATCH 07/18] Got the inference API nb set up. --- src/examples/scratch/huggingface.ipynb | 52 ++++++++++++-------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/src/examples/scratch/huggingface.ipynb b/src/examples/scratch/huggingface.ipynb index 061e343..792d9c5 100644 --- a/src/examples/scratch/huggingface.ipynb +++ b/src/examples/scratch/huggingface.ipynb @@ -14,6 +14,15 @@ "# Setup" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests" + ] + }, { "cell_type": "code", "execution_count": null, @@ -30,18 +39,7 @@ "outputs": [], "source": [ "# Settings\n", - "endpoint_name=\"my-endpoint-name\"\n", - "endpoint_config = dict(\n", - " repository=\"gpt2\",\n", - " framework=\"pytorch\",\n", - " task=\"text-generation\",\n", - " accelerator=\"gpu\",\n", - " vendor=\"aws\",\n", - " region=\"us-east-1\",\n", - " type=\"protected\",\n", - " instance_size=\"small\",\n", - " instance_type=\"c6i\"\n", - ")" + "API_URL = \"https://api-inference.huggingface.co/models/facebook/bart-large-cnn\"" ] }, { @@ -54,7 +52,10 @@ "token = hfhub.get_token()\n", "if token is None:\n", " hfhub.login()\n", - " token = hfhub.get_token()" + " token = hfhub.get_token()\n", + "\n", + "# Format for Inference API\n", + "headers = {\"Authorization\": f\"Bearer {token}\"}" ] }, { @@ -63,24 +64,16 @@ "metadata": {}, "outputs": [], "source": [ - "# get or create endpoint\n", - "endpoint_names = [\n", - " _.name for _ in hfhub.list_inference_endpoints(namespace=\"*\")\n", - "]\n", - "if endpoint_name in endpoint_names:\n", - " endpoint = hfhub.get_inference_endpoint(endpoint_name)\n", - "else:\n", - " endpoint = hfhub.create_inference_endpoint(endpoint_name, **endpoint_config)" + 
"def query(payload):\n", + "\tresponse = requests.post(API_URL, headers=headers, json=payload)\n", + "\treturn response.json()" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# Start the endpoint up\n", - "endpoint.wait()" + "# Exploration" ] }, { @@ -89,7 +82,9 @@ "metadata": {}, "outputs": [], "source": [ - "endpoint.client.text_generation(\"I am\")" + "output = query({\n", + "\t\"inputs\": \"The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.\",\n", + "})" ] }, { @@ -98,8 +93,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Shut down\n", - "endpoint.pause()" + "output" ] }, { From c920bcce1d898d4a4b9a2727df950cec124d33a5 Mon Sep 17 00:00:00 2001 From: Zach Hafen-Saavedra Date: Tue, 26 Mar 2024 15:59:24 -0500 Subject: [PATCH 08/18] Refactored argsort out of expand so it can be used independently. 
--- src/sciterra/mapping/cartography.py | 72 +++++++++++++++++++++-------- 1 file changed, 52 insertions(+), 20 deletions(-) diff --git a/src/sciterra/mapping/cartography.py b/src/sciterra/mapping/cartography.py index 517f982..bb36618 100644 --- a/src/sciterra/mapping/cartography.py +++ b/src/sciterra/mapping/cartography.py @@ -250,6 +250,50 @@ def project(self, atl: Atlas, **kwargs) -> Atlas: atl_filtered.projection = merged_projection return atl_filtered + ###################################################################### + # Sort Atlas + ###################################################################### + + def argsort( + self, + atl: Atlas, + center: str, + ) -> list[str]: + """Sort an atlas according to cosine similarity to a center publication. + Like numpy argsort, this returns identifiers that can be used to + index the original atlas. + + Args: + atl: the atlas to sort + + center: center the search on this publication + + Returns: + expand_keys: keys in descending order of similarity to the center publication + """ + + # If atlas is initial + if atl.projection is None: + atl = self.project(atl) + if atl.projection is None: + raise Exception( + f"Initial projection of atlas failed; make sure the initial publication has all the required attributes." 
+ ) + + if len(atl.projection): + # build cosine similarity matrix, of shape (1, num_pubs) + cospsi_matrix = cosine_similarity( + atl.projection.identifiers_to_embeddings([center]), + atl.projection.embeddings, + ) + # get most similar keys from center, including center itself + sort_inds = np.argsort(cospsi_matrix)[::-1][ + 0 + ] # argsort orders from least to greatest similarity, so reverse + expand_keys = atl.projection.indices_to_identifiers(sort_inds) + + return expand_keys + ###################################################################### # Expand Atlas ###################################################################### @@ -280,28 +324,16 @@ def expand( Returns: atl_expanded: the expanded atlas """ - existing_keys = set(atl.ids) - expand_keys = existing_keys + + # Get the keys to expand + expand_keys = None if center is not None: - # If atlas is initial - if atl.projection is None: - atl = self.project(atl) - if atl.projection is None: - raise Exception( - f"Initial projection of atlas failed; make sure the initial publication has all the required attributes." - ) + expand_keys = self.argsort(atl, center, n_sources_max) - if len(atl.projection): - # build cosine similarity matrix, of shape (1, num_pubs) - cospsi_matrix = cosine_similarity( - atl.projection.identifiers_to_embeddings([center]), - atl.projection.embeddings, - ) - # get most similar keys from center, including center itself - sort_inds = np.argsort(cospsi_matrix)[::-1][ - 0 - ] # argsort orders from least to greatest similarity, so reverse - expand_keys = atl.projection.indices_to_identifiers(sort_inds) + # If that didn't work, just use all the keys + if expand_keys is None: + existing_keys = set(atl.ids) + expand_keys = existing_keys if n_sources_max is not None: expand_keys = expand_keys[:n_sources_max] From 12855df4c2902d2cd2ac565bd25eb0d25e75a76a Mon Sep 17 00:00:00 2001 From: Zach Hafen-Saavedra Date: Tue, 26 Mar 2024 16:01:06 -0500 Subject: [PATCH 09/18] Quick rename. 
--- src/sciterra/mapping/cartography.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sciterra/mapping/cartography.py b/src/sciterra/mapping/cartography.py index bb36618..5699afb 100644 --- a/src/sciterra/mapping/cartography.py +++ b/src/sciterra/mapping/cartography.py @@ -269,7 +269,7 @@ def argsort( center: center the search on this publication Returns: - expand_keys: keys in descending order of similarity to the center publication + sort_keys: keys in descending order of similarity to the center publication """ # If atlas is initial @@ -290,9 +290,9 @@ def argsort( sort_inds = np.argsort(cospsi_matrix)[::-1][ 0 ] # argsort orders from least to greatest similarity, so reverse - expand_keys = atl.projection.indices_to_identifiers(sort_inds) + sort_keys = atl.projection.indices_to_identifiers(sort_inds) - return expand_keys + return sort_keys ###################################################################### # Expand Atlas From 1fc8df6a0ab7cfeb701e6e02b9dd91826c1fa544 Mon Sep 17 00:00:00 2001 From: Zach Hafen-Saavedra Date: Tue, 26 Mar 2024 16:14:18 -0500 Subject: [PATCH 10/18] Adding a test to ensure correct sort order. 
--- src/tests/test_cartography.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/tests/test_cartography.py b/src/tests/test_cartography.py index 75138c2..7b958e8 100644 --- a/src/tests/test_cartography.py +++ b/src/tests/test_cartography.py @@ -274,6 +274,32 @@ def test_project_correct_number(self, tmp_path): assert len(embed_ids) <= after - before +class TestS2SBSort: + librarian = SemanticScholarLibrarian() + vectorizer = SciBERTVectorizer() + crt = Cartographer(librarian, vectorizer) + + def test_argsort(self, tmp_path): + # Load single file from bibtex + # Load expected values + bibtex_fp = single_pub_bibtex_fp + with open(bibtex_fp, "r") as f: + bib_database = bibtexparser.load(f) + + path = tmp_path / atlas_dir + path.mkdir() + # Construct Atlas + atl = TestS2SBExpand.crt.bibtex_to_atlas(bibtex_fp) + + pub = list(atl.publications.values())[0] + ids = pub.citations + pub.references + center = pub.identifier + + sorted_keys = TestS2SBSort.crt.argsort(atl, center=center) + assert len(sorted_keys) == len(ids) + assert sorted_keys[0] == center + + class TestS2SBExpand: librarian = SemanticScholarLibrarian() vectorizer = SciBERTVectorizer() From b1846ce9dbef009134df361c8d22e319d9060457 Mon Sep 17 00:00:00 2001 From: Zach Hafen-Saavedra Date: Tue, 26 Mar 2024 16:17:16 -0500 Subject: [PATCH 11/18] argsort is now sort and returns a tuple --- src/sciterra/mapping/cartography.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/sciterra/mapping/cartography.py b/src/sciterra/mapping/cartography.py index 5699afb..71dce81 100644 --- a/src/sciterra/mapping/cartography.py +++ b/src/sciterra/mapping/cartography.py @@ -14,7 +14,7 @@ from ..vectorization.projection import Projection, merge, get_empty_projection from ..misc.utils import get_verbose, custom_formatwarning -from typing import Callable +from typing import Callable, Tuple from tqdm import tqdm from sklearn.metrics.pairwise import cosine_similarity 
@@ -254,11 +254,11 @@ def project(self, atl: Atlas, **kwargs) -> Atlas: # Sort Atlas ###################################################################### - def argsort( + def sort( self, atl: Atlas, center: str, - ) -> list[str]: + ) -> Tuple[list[str], list[str]]: """Sort an atlas according to cosine similarity to a center publication. Like numpy argsort, this returns identifiers that can be used to index the original atlas. @@ -269,7 +269,8 @@ def argsort( center: center the search on this publication Returns: - sort_keys: keys in descending order of similarity to the center publication + sorted_keys: keys in descending order of similarity to the center publication + sorted_values: values in descending order of similarity to the center publication """ # If atlas is initial @@ -290,9 +291,10 @@ def argsort( sort_inds = np.argsort(cospsi_matrix)[::-1][ 0 ] # argsort orders from least to greatest similarity, so reverse - sort_keys = atl.projection.indices_to_identifiers(sort_inds) + sorted_keys = atl.projection.indices_to_identifiers(sort_inds) + sorted_values = cospsi_matrix[0][sort_inds] - return sort_keys + return sorted_keys, sorted_values ###################################################################### # Expand Atlas From 45d7adcb7a8afc613355c1dbc9041ba699841ebb Mon Sep 17 00:00:00 2001 From: Zach Hafen-Saavedra Date: Tue, 26 Mar 2024 16:40:00 -0500 Subject: [PATCH 12/18] Revising test for sort inds to piggyback off of single expansion test. 
--- src/tests/test_cartography.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/tests/test_cartography.py b/src/tests/test_cartography.py index 7b958e8..ceff3c8 100644 --- a/src/tests/test_cartography.py +++ b/src/tests/test_cartography.py @@ -280,6 +280,8 @@ class TestS2SBSort: crt = Cartographer(librarian, vectorizer) def test_argsort(self, tmp_path): + # TODO: This takes a while, and we can probably reduce the time + # Load single file from bibtex # Load expected values bibtex_fp = single_pub_bibtex_fp @@ -295,9 +297,11 @@ def test_argsort(self, tmp_path): ids = pub.citations + pub.references center = pub.identifier - sorted_keys = TestS2SBSort.crt.argsort(atl, center=center) + sorted_keys, sorted_values = TestS2SBSort.crt.sort( + atl, center=center) assert len(sorted_keys) == len(ids) assert sorted_keys[0] == center + assert sorted_values[0] > sorted_values[1] class TestS2SBExpand: @@ -365,10 +369,17 @@ def test_expand_center_single(self, tmp_path): pub = list(atl.publications.values())[0] ids = pub.citations + pub.references center = pub.identifier - + atl_exp_single = TestS2SBExpand.crt.expand(atl, center=center) assert len(atl_exp_single) == len(ids) + # Also check that our sorting works okay + sorted_keys, sorted_values = TestS2SBExpand.crt.sort( + atl_exp_single, center=center) + assert len(sorted_keys) == len(ids) + assert sorted_keys[0] == center + assert sorted_values[0] > sorted_values[1] + # Save atlas atl_exp_single.save(path) From 29909ebab5cd47a3560e056411ae094f3a7dda37 Mon Sep 17 00:00:00 2001 From: Zach Hafen-Saavedra Date: Tue, 26 Mar 2024 16:49:00 -0500 Subject: [PATCH 13/18] Fixed up argsort test --- src/tests/test_cartography.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/tests/test_cartography.py b/src/tests/test_cartography.py index ceff3c8..2fee870 100644 --- a/src/tests/test_cartography.py +++ b/src/tests/test_cartography.py @@ -284,7 +284,7 @@ def test_argsort(self, 
tmp_path): # Load single file from bibtex # Load expected values - bibtex_fp = single_pub_bibtex_fp + bibtex_fp = ten_pub_bibtex_fp with open(bibtex_fp, "r") as f: bib_database = bibtexparser.load(f) @@ -294,12 +294,11 @@ def test_argsort(self, tmp_path): atl = TestS2SBExpand.crt.bibtex_to_atlas(bibtex_fp) pub = list(atl.publications.values())[0] - ids = pub.citations + pub.references center = pub.identifier sorted_keys, sorted_values = TestS2SBSort.crt.sort( atl, center=center) - assert len(sorted_keys) == len(ids) + assert len(sorted_keys) == 10 assert sorted_keys[0] == center assert sorted_values[0] > sorted_values[1] From 0a31aedcacc9dfc122c888e488d1e8d177e08ab4 Mon Sep 17 00:00:00 2001 From: Zach Hafen-Saavedra Date: Tue, 26 Mar 2024 16:58:29 -0500 Subject: [PATCH 14/18] Sort passes now. --- src/sciterra/mapping/cartography.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sciterra/mapping/cartography.py b/src/sciterra/mapping/cartography.py index 71dce81..4c73843 100644 --- a/src/sciterra/mapping/cartography.py +++ b/src/sciterra/mapping/cartography.py @@ -290,7 +290,7 @@ def sort( # get most similar keys from center, including center itself sort_inds = np.argsort(cospsi_matrix)[::-1][ 0 - ] # argsort orders from least to greatest similarity, so reverse + ][::-1] # argsort orders from least to greatest similarity, so reverse sorted_keys = atl.projection.indices_to_identifiers(sort_inds) sorted_values = cospsi_matrix[0][sort_inds] From 25ab6124817ddb0b35d4c74b8d91b3753d506763 Mon Sep 17 00:00:00 2001 From: Zach Hafen-Saavedra Date: Tue, 26 Mar 2024 17:13:26 -0500 Subject: [PATCH 15/18] Cleaned up the sorting logic. 
--- src/sciterra/mapping/cartography.py | 5 ++--- src/tests/test_cartography.py | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sciterra/mapping/cartography.py b/src/sciterra/mapping/cartography.py index 4c73843..1df58c5 100644 --- a/src/sciterra/mapping/cartography.py +++ b/src/sciterra/mapping/cartography.py @@ -288,9 +288,8 @@ def sort( atl.projection.embeddings, ) # get most similar keys from center, including center itself - sort_inds = np.argsort(cospsi_matrix)[::-1][ - 0 - ][::-1] # argsort orders from least to greatest similarity, so reverse + sort_inds = np.argsort(cospsi_matrix)[-1][::-1] + # argsort orders from least to greatest similarity, so reverse sorted_keys = atl.projection.indices_to_identifiers(sort_inds) sorted_values = cospsi_matrix[0][sort_inds] diff --git a/src/tests/test_cartography.py b/src/tests/test_cartography.py index 2fee870..7a87492 100644 --- a/src/tests/test_cartography.py +++ b/src/tests/test_cartography.py @@ -301,6 +301,7 @@ def test_argsort(self, tmp_path): assert len(sorted_keys) == 10 assert sorted_keys[0] == center assert sorted_values[0] > sorted_values[1] + assert sorted_values[1] > sorted_values[-1] class TestS2SBExpand: From ebf5079bfaa49bc3a709fa2ec84d73022d199034 Mon Sep 17 00:00:00 2001 From: Zach Hafen-Saavedra Date: Tue, 26 Mar 2024 17:22:40 -0500 Subject: [PATCH 16/18] Fixed integration of the refactored sort function. 
--- src/sciterra/mapping/cartography.py | 4 ++-- src/tests/test_cartography.py | 7 ------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/src/sciterra/mapping/cartography.py b/src/sciterra/mapping/cartography.py index 1df58c5..9691328 100644 --- a/src/sciterra/mapping/cartography.py +++ b/src/sciterra/mapping/cartography.py @@ -329,11 +329,11 @@ def expand( # Get the keys to expand expand_keys = None if center is not None: - expand_keys = self.argsort(atl, center, n_sources_max) + expand_keys = self.sort(atl, center)[0] # If that didn't work, just use all the keys + existing_keys = set(atl.ids) if expand_keys is None: - existing_keys = set(atl.ids) expand_keys = existing_keys if n_sources_max is not None: diff --git a/src/tests/test_cartography.py b/src/tests/test_cartography.py index 7a87492..750e71c 100644 --- a/src/tests/test_cartography.py +++ b/src/tests/test_cartography.py @@ -373,13 +373,6 @@ def test_expand_center_single(self, tmp_path): atl_exp_single = TestS2SBExpand.crt.expand(atl, center=center) assert len(atl_exp_single) == len(ids) - # Also check that our sorting works okay - sorted_keys, sorted_values = TestS2SBExpand.crt.sort( - atl_exp_single, center=center) - assert len(sorted_keys) == len(ids) - assert sorted_keys[0] == center - assert sorted_values[0] > sorted_values[1] - # Save atlas atl_exp_single.save(path) From 396b5f60371ebb04f98e6dff8de09092e49559c2 Mon Sep 17 00:00:00 2001 From: Zach Hafen-Saavedra Date: Tue, 26 Mar 2024 17:50:44 -0500 Subject: [PATCH 17/18] All set up to start testing summarization modules. 
--- src/examples/scratch/huggingface.ipynb | 121 +++++++++++++++++++++++-- 1 file changed, 112 insertions(+), 9 deletions(-) diff --git a/src/examples/scratch/huggingface.ipynb b/src/examples/scratch/huggingface.ipynb index 792d9c5..45d22cb 100644 --- a/src/examples/scratch/huggingface.ipynb +++ b/src/examples/scratch/huggingface.ipynb @@ -20,7 +20,8 @@ "metadata": {}, "outputs": [], "source": [ - "import requests" + "import requests\n", + "import huggingface_hub as hfhub" ] }, { @@ -29,7 +30,17 @@ "metadata": {}, "outputs": [], "source": [ - "import huggingface_hub as hfhub" + "from sciterra import Atlas\n", + "from sciterra import Cartographer\n", + "from sciterra.librarians import SemanticScholarLibrarian # or ADSLibrarian\n", + "from sciterra.vectorization import SciBERTVectorizer # among others" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Settings" ] }, { @@ -39,7 +50,46 @@ "outputs": [], "source": [ "# Settings\n", - "API_URL = \"https://api-inference.huggingface.co/models/facebook/bart-large-cnn\"" + "atlas_dirpath = \"../atlas\"\n", + "# model = \"Falconsai/text_summarization\"\n", + "model = \"liminerity/Phigments12\"\n", + "n_summarized = 10" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sciterra" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "atl = Atlas.load(atlas_dirpath)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a cartographer with a Semantic Scholar librarian and a SciBERT vectorizer\n", + "crt = Cartographer(\n", + " librarian=SemanticScholarLibrarian(),\n", + " vectorizer=SciBERTVectorizer(),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### HFHub" ] }, { @@ -65,8 +115,10 @@ "outputs": [], "source": [ "def query(payload):\n", - "\tresponse = requests.post(API_URL, headers=headers, 
json=payload)\n", - "\treturn response.json()" + "\n", + "\tapi_url = f\"https://api-inference.huggingface.co/models/{model}\"\n", + "\tresponse = requests.post(api_url, headers=headers, json=payload)\n", + "\treturn response" ] }, { @@ -82,9 +134,60 @@ "metadata": {}, "outputs": [], "source": [ - "output = query({\n", - "\t\"inputs\": \"The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.\",\n", - "})" + "# Find the publications most-similar to the original\n", + "sorted_keys, sorted_values = crt.sort(atl, center=atl.center)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the abstracts for the most-similar publications\n", + "combined_abstracts = \"Please summarize the following abstracts:\"\n", + "for i, identifier in enumerate(sorted_keys[:n_summarized]):\n", + "\n", + " combined_abstracts += f\"This is the {i}th abstract:\\n\"\n", + " combined_abstracts += atl.publications[identifier].abstract\n", + " combined_abstracts += \"\\n\"\n", + "\n", + "print(combined_abstracts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the abstracts for the most-similar publications\n", + "combined_abstracts = \"\"\n", + "for i, identifier in 
enumerate(sorted_keys[:n_summarized]):\n", + "\n", + " combined_abstracts += '\\n\\n' + atl.publications[identifier].abstract\n", + "\n", + "print(combined_abstracts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = query({\"input\": combined_abstracts})\n", + "response" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from evaluate import load\n", + "eval_module = load(\"rouge\")" ] }, { @@ -93,7 +196,7 @@ "metadata": {}, "outputs": [], "source": [ - "output" + "eval_module.compute(predictions=[prediction,], references=[combined_abstracts,])" ] }, { From be6f89a8ee49d797464b8972ccbc2fb1c6175d0a Mon Sep 17 00:00:00 2001 From: Zach Hafen-Saavedra Date: Wed, 27 Mar 2024 13:17:29 -0500 Subject: [PATCH 18/18] Returned to summarization by passing in abstracts one at a time. --- src/examples/scratch/huggingface.ipynb | 68 ++++++++++++++++---------- 1 file changed, 42 insertions(+), 26 deletions(-) diff --git a/src/examples/scratch/huggingface.ipynb b/src/examples/scratch/huggingface.ipynb index 45d22cb..93aee74 100644 --- a/src/examples/scratch/huggingface.ipynb +++ b/src/examples/scratch/huggingface.ipynb @@ -7,6 +7,13 @@ "# Example Notebook for Integrating with Hugging Face" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "TODO: Future work may include using `Arthur bench` to compare LLMs."
+ ] + }, { "cell_type": "markdown", "metadata": {}, @@ -51,9 +58,10 @@ "source": [ "# Settings\n", "atlas_dirpath = \"../atlas\"\n", - "# model = \"Falconsai/text_summarization\"\n", - "model = \"liminerity/Phigments12\"\n", - "n_summarized = 10" + "model = \"Falconsai/text_summarization\"\n", + "# model = \"microsoft/phi-1_5\"\n", + "# model = \"amu/spin-phi2\"\n", + "n_summarized = 2" ] }, { @@ -145,29 +153,27 @@ "outputs": [], "source": [ "# Get the abstracts for the most-similar publications\n", - "combined_abstracts = \"Please summarize the following abstracts:\"\n", + "prompt = (\n", + "'''The following text are abstracts from several publications that are most\n", + "similar to the original publication. We will share each one, and then we will\n", + "summarize them.\n", + "'''\n", + ")\n", + "prompts = []\n", "for i, identifier in enumerate(sorted_keys[:n_summarized]):\n", "\n", - " combined_abstracts += f\"This is the {i}th abstract:\\n\"\n", - " combined_abstracts += atl.publications[identifier].abstract\n", - " combined_abstracts += \"\\n\"\n", + " abstract = \"\\n\".join(atl.publications[identifier].abstract.split(\".\"))\n", "\n", - "print(combined_abstracts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get the abstracts for the most-similar publications\n", - "combined_abstracts = \"\"\n", - "for i, identifier in enumerate(sorted_keys[:n_summarized]):\n", + " # Combined prompt; used for LLMs\n", + " prompt += f\"This is the abstract for paper {i+1}:\\n\"\n", + " prompt += abstract + \"\\n\"\n", + "\n", + " # Individual prompts; used for summarization models\n", + " prompts.append(abstract)\n", "\n", - " combined_abstracts += '\\n\\n' + atl.publications[identifier].abstract\n", + "prompt += \"The summary of the papers, with one sentence per paper, is as follows:\"\n", "\n", - "print(combined_abstracts)" + "print(prompt)" ] }, { @@ -176,8 +182,13 @@ "metadata": {}, "outputs": [], 
"source": [ - "response = query({\"input\": combined_abstracts})\n", - "response" + "response = query({\n", + " 'inputs': prompts,\n", + " 'parameters': {\n", + " 'max_new_tokens': 250\n", + " },\n", + "})\n", + "response, response.json()" ] }, { @@ -186,8 +197,11 @@ "metadata": {}, "outputs": [], "source": [ - "from evaluate import load\n", - "eval_module = load(\"rouge\")" + "for possible_key in ['summary_text', 'generated_text']:\n", + " if possible_key in response.json()[0]:\n", + " output_key = possible_key \n", + "predictions = [ _[output_key] for _ in response.json()]\n", + "print(predictions[0])" ] }, { @@ -196,7 +210,9 @@ "metadata": {}, "outputs": [], "source": [ - "eval_module.compute(predictions=[prediction,], references=[combined_abstracts,])" + "from evaluate import load\n", + "eval_module = load(\"rouge\")\n", + "eval_module.compute(predictions=predictions, references=prompts)" ] }, {