
Commit

add TestConvergence
Nathaniel Imel authored and Nathaniel Imel committed Nov 9, 2023
1 parent a646bf9 commit 362b43c
Showing 3 changed files with 264 additions and 20 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -2,6 +2,6 @@

[![build](https://github.com/nathimel/sciterra/actions/workflows/test.yml/badge.svg)](https://github.com/nathimel/sciterra/actions/workflows/test.yml)

Software library to support data-driven analyses of scientific literature
Software library to support data-driven analyses of scientific literature.

Inspired heavily by Zach Hafen's [cc](https://github.com/zhafen/cc) library.
This library is a reimplementation of Zach Hafen's [cc](https://github.com/zhafen/cc) library.
154 changes: 138 additions & 16 deletions src/sciterra/mapping/cartography.py
@@ -20,8 +20,40 @@
warnings.formatwarning = custom_formatwarning


def batch_cospsi_matrix(embeddings: np.ndarray) -> np.ndarray:
    """Compute the pairwise cosine similarity matrix between embeddings in batches.

    To avoid memory errors (e.g. bus errors, segfaults) resulting from overly large arrays, we batch the construction of the cospsi_matrix.

    Args:
        embeddings: a 2D numpy array of embeddings, of shape `(num_pubs, embedding_dim)`

    Returns:
        cosine_similarities: a 2D numpy array of shape `(num_pubs, num_pubs)` representing the pairwise cosine similarity between each pair of embeddings
    """
    batch_size = min(1000, len(embeddings))  # Define a batch size

    cosine_similarities = None
    print(f"computing cosine similarity for {len(embeddings)} embeddings with batch size {batch_size}.")
    for i in tqdm(range(0, len(embeddings), batch_size)):
        # Process batches to compute cosine similarity
        batch = embeddings[i : i + batch_size]
        if cosine_similarities is None:
            cosine_similarities = cosine_similarity(batch, embeddings)
        else:
            cosine_similarities = np.vstack(
                (cosine_similarities, cosine_similarity(batch, embeddings))
            )

    return cosine_similarities
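For reference, a minimal standalone sketch of the batching logic above (assuming `cosine_similarity` here is scikit-learn's `sklearn.metrics.pairwise.cosine_similarity`; the toy array sizes are made up):

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Toy embeddings: 5 documents, 4-dimensional vectors.
embeddings = np.random.rand(5, 4)

# Unbatched reference: row i holds similarities between document i and all documents.
full = cosine_similarity(embeddings, embeddings)

# Batched construction, stacking a few rows at a time as batch_cospsi_matrix does.
batch_size = 2
rows = [
    cosine_similarity(embeddings[i : i + batch_size], embeddings)
    for i in range(0, len(embeddings), batch_size)
]
batched = np.vstack(rows)

assert batched.shape == (5, 5)      # (num_pubs, num_pubs)
assert np.allclose(full, batched)   # same matrix, built in memory-friendly pieces
```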


class Cartographer:
"""A basic wrapper for obtaining and updating atlas projections."""
"""A basic wrapper for obtaining and updating atlas projections.
`self.librarian`: the Librarian object used to query a bibliographic database API.
`self.vectorizer`: the Vectorizer object used to get a document embedding for each abstract
`self.pubs_per_update`: a list of lists of publication str ids, representing the publications that exist at each time step / expansion update.
`self.update_history`: an np.array of ints representing when publications were added. A value of -2 indicates no record of being added.
"""

def __init__(
self,
@@ -31,6 +63,9 @@ def __init__(
self.librarian = librarian
self.vectorizer = vectorizer

self.pubs_per_update: list[list[str]] = []
self.update_history: np.ndarray = None

######################################################################
# Get an Atlas from bibtex
######################################################################
@@ -169,6 +204,7 @@ def expand(
center: str = None,
n_pubs_max: int = 4000,
n_sources_max: int = None,
record_pubs_per_update: bool = False,
**kwargs,
) -> Atlas:
"""Expand an atlas by retrieving a list of publications resulting from traversal of the citation network.
@@ -182,6 +218,8 @@
n_sources_max: maximum number of publications (already in the atlas) to draw references and citations from.
record_pubs_per_update: whether to track all the publications that exist in the resulting atlas to `self.pubs_per_update`. This should only be set to `True` when you need to later filter by degree of convergence of the atlas.
Returns:
atl_expanded: the expanded atlas
"""
@@ -241,6 +279,10 @@ def expand(
atl.projection
) # new projection will be updated in `project`

# Record the new list of publications
if record_pubs_per_update:
self.pubs_per_update.append(list(atl_exp.publications.keys()))

return atl_exp
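As a rough sketch, the intended workflow around `record_pubs_per_update` might look like the following (the seed bibliography, loop count, and `n_pubs_max` are hypothetical; the calls mirror the convergence tests added below):

```python
# Sketch: iteratively expand an atlas while tracking which publications
# existed at each step, then measure convergence.
crt = Cartographer(librarian, vectorizer)   # assumes a configured Librarian and Vectorizer
atl = crt.bibtex_to_atlas("seed.bib")       # hypothetical seed bibliography
center = next(iter(atl.publications))       # expand around one publication id

for _ in range(5):
    atl = crt.expand(atl, center=center, n_pubs_max=100, record_pubs_per_update=True)

crt.record_update_history()   # consumes crt.pubs_per_update recorded during expansion
atl = crt.project(atl)        # embeddings are needed before kernel calculations
kernel_sizes = crt.converged_kernel_size(atl)
```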

######################################################################
@@ -319,6 +361,100 @@ def filter(
atl_filtered = Atlas(new_publications, new_projection)
atl_filtered.bad_ids = new_bad_ids
return atl_filtered

########################################################################
# Record Atlas history
########################################################################

    def record_update_history(
        self,
        pubs: list[str] = None,
        pubs_per_update: list[list[str]] = None,
    ) -> None:
        """Record when publications were added, by updating `self.update_history`.

        `self.update_history` is a np.array of ints representing when publications were added. A value of -2 indicates no record of being added.

        Args:
            pubs: a list of str ids corresponding to publications at the final update in the update history. By default `None`, and `self.pubs_per_update[-1]` will be used.

            pubs_per_update: a list of which publications existed at which iteration, with the index of the overall list corresponding to the iteration the publication was added. By default `None`, and `self.pubs_per_update` will be used.

        Updates:
            `self.update_history`: an np.array of ints representing when publications were added. A value of -2 indicates no record of being added.

        Returns:
            `None`
        """
        if pubs is None:
            pubs = self.pubs_per_update[-1]
        # Ensure a numpy array, so boolean-mask assignment below works even if a plain list was passed.
        pubs = np.array(pubs)

        if pubs_per_update is None:
            pubs_per_update = self.pubs_per_update

        # Loop backwards, so that the earliest iteration containing a publication wins.
        i_max = len(pubs_per_update) - 1
        update_history = np.full(pubs.shape, -2)
        for i, pubs_i in enumerate(pubs_per_update[::-1]):
            is_in = np.isin(pubs, pubs_i)
            update_history[is_in] = i_max - i

        self.update_history = update_history
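To make the bookkeeping concrete, here is a small hand-worked sketch of the backwards loop above (standalone, with made-up publication ids):

```python
import numpy as np

# Which publications existed at each expansion step (toy ids).
pubs_per_update = [
    ["a"],                 # iteration 0
    ["a", "b", "c"],       # iteration 1
    ["a", "b", "c", "d"],  # iteration 2
]
# Publications to record a history for; "e" was never recorded in any update.
pubs = np.array(["a", "b", "c", "d", "e"])

update_history = np.full(pubs.shape, -2)
i_max = len(pubs_per_update) - 1
# Looping backwards means the earliest iteration containing a publication wins.
for i, pubs_i in enumerate(pubs_per_update[::-1]):
    update_history[np.isin(pubs, pubs_i)] = i_max - i

print(update_history)  # [ 0  1  1  2 -2]: iteration each id first appears in; -2 if never
```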

########################################################################
# Calculate Atlas convergence
########################################################################

def converged_kernel_size(self, atl: Atlas) -> np.ndarray:
"""Calculate the largest size of the kernel that's converged (at differing levels of convergence) for each publication in a sample at each update.
Args:
atl: Atlas containing publications; for each publication we compute the largest converged kernel size at each update
Returns:
kernel_size: an array of ints of shape `(num_pubs, max_update)` representing the kernel size for converged kernels.
- The first column indicates the largest kernel size that hasn't changed since the beginning,
- The second column indicates the largest kernel size that hasn't changed since the first update,
- etc. for the nth column.
"""

if self.update_history is None:
raise ValueError('update_history is None; make sure you have called record_update_history()!')

if -2 in self.update_history:
raise ValueError('Incomplete update history as indicated by entries with values of -2.')

publications = np.array(list(atl.publications.keys()))

# 1. Loop over each publication
cospsi_kernel = []
for pub in tqdm(publications):

# 2. Identify the similarity with the other publications relative to this publication, and sort accordingly.
cospsi = cosine_similarity(
atl.projection.identifiers_to_embeddings([pub]),
atl.projection.embeddings,
).flatten() # shape `(num_pubs,)`
sort_inds = np.argsort(cospsi)[::-1] # shape `(num_pubs,)`

# 3. Identify the expansion iteration at which those publications were added to the atlas (`sorted_history`).
sorted_history = self.update_history[sort_inds] # shape `(num_pubs,)`

# 4. Identify the latest iteration at which any publication was added to the atlas; this can be less than the total iterations.
last_update = self.update_history.max()

# 5. Loop through each iteration until `last_update`, and identify which publications were added at or before that iteration.
result_2 = [
# 6. Compute how many publications out we can go and still only contain publications added at or before that iteration.
# Use `argmin` to get the first instance of False
# Finally, subtract 1: we want the first index before False.
np.argmin(sorted_history <= update) - 1
for update in range(last_update)
] # shape `(num_pubs, last_update)`

cospsi_kernel.append(result_2)

return np.array(cospsi_kernel)
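A standalone sketch of the per-publication computation in steps 3–6, on a toy `sorted_history` array (made-up values, same `argmin`-over-booleans trick as above):

```python
import numpy as np

# Update history of one publication's neighbours, already sorted by decreasing
# cosine similarity to that publication (step 3 above).
sorted_history = np.array([0, 0, 1, 0, 2, 1, 2])
last_update = sorted_history.max()  # 2

kernel_sizes = []
for update in range(last_update):
    # Boolean mask: neighbours added at or before this iteration.
    mask = sorted_history <= update
    # argmin finds the first False, i.e. the nearest neighbour added later;
    # subtracting 1 gives the last index of the prefix that was already in place.
    kernel_sizes.append(np.argmin(mask) - 1)

print(kernel_sizes)  # [1, 3]: a 2-neighbour kernel is stable since it=0, a 4-neighbour kernel since it=1
```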

########################################################################
# Measure Atlas topography
@@ -380,21 +516,7 @@ def measure_topography(
]
)

# To avoid memory errors (e.g. bus error, segfaults) resulting from too large arrays, we batch process the construction of the cospsi_matrix.
embeddings = atl.projection.embeddings
batch_size = min(1000, len(embeddings)) # Define a batch size

cosine_similarities = None
print(f"computing cosine similarity for {len(embeddings)} embeddings with batch size {batch_size}.")
for i in tqdm(range(0, len(embeddings), batch_size)):
# Process batches to compute cosine similarity
batch = embeddings[i:i+batch_size]
if cosine_similarities is None:
cosine_similarities = cosine_similarity(batch, embeddings)
else:
cosine_similarities = np.vstack((cosine_similarities, cosine_similarity(batch, embeddings)))

cospsi_matrix = cosine_similarities
cospsi_matrix = batch_cospsi_matrix(atl.projection.embeddings)

print(f"Computing {metrics} for {len(publication_indices)} publications.")
estimates = []
126 changes: 124 additions & 2 deletions src/tests/test_cartography.py
@@ -22,6 +22,7 @@

atlas_dir = "atlas_tmpdir"

# NOTE: Any time you are querying an API for papers, it is not a good idea to have strict tests on the resulting output size, since much is out of our control; for example, the online database may have added new papers.

class TestS2BibtexToAtlas:
"""Test functionality that maps a bibtex file to a list of identifiers, and then populates an atlas. The purpose of this functionality is to map a human-readable / very popular dataformat to the Atlas data structure."""
@@ -101,8 +102,10 @@ def test_bibtex_to_atlas_realistic(self, tmp_path):

# I find that I get 28 out of 86 total refs, i.e. less than a third of papers targeted.
# or 32 lol
# or 31
# assert len(atl) == 28
assert len(atl) == 32
# assert len(atl) == 32
# assert len(atl) == 31


class TestS2SBProjection:
@@ -320,7 +323,8 @@ def test_expand_center_double(self, tmp_path):
)
# empirically found this
# do no assert len(atl_exp_double) == 4000 + len(ids), because we want 4000 + len(valid_ids), i.e. 148 not 154
assert len(atl_exp_double) == 348
# assert len(atl_exp_double) == 348 # why off by a few?
# assert len(atl_exp_double) == 345

atl_exp_double.save(path)

@@ -361,3 +365,121 @@ def test_measure_topography(self):
metrics=metrics,
)
assert measurements.shape == tuple((len(atl_exp_single), len(metrics)))

class TestConvergence:
librarian = SemanticScholarLibrarian()
vectorizer = SciBERTVectorizer()
crt = Cartographer(librarian, vectorizer)

def test_record_update_history(self):
# Construct Atlas
bibtex_fp = ten_pub_bibtex_fp

atl = TestConvergence.crt.bibtex_to_atlas(bibtex_fp)

# Mock expansion/update history data
input = [
['f2c251056dee4c6f9130b31e5e3e4b3296051c49'], # it=0

['4364af31229f7e9a3d83a289a928b2f2a43d30cb', # it=1
'f2c251056dee4c6f9130b31e5e3e4b3296051c49',
'287fa946f30eaa78ea86f9c5bd61d67238202356',],

['50dea78a96f03ba7fc3398c5deea5174630ef186', # it=2
'54a83cd1d94814b0f37ee48084260a2d1882648d',
'4364af31229f7e9a3d83a289a928b2f2a43d30cb', 'f2c251056dee4c6f9130b31e5e3e4b3296051c49',
'287fa946f30eaa78ea86f9c5bd61d67238202356',
'2e6438be4901cb9b42ff23dcc3d433789b37d032',
'04da6471743468b6bb1d26dd9a6eac4c03ca73ee',],

]

        # Record history for all ten atlas publications; three of them never appear in the mock data above.
        TestConvergence.crt.record_update_history(
            list(atl.publications.keys()), pubs_per_update=input
        )

expected = np.array([-2, -2, 2, 2, 1, 0, 1, 2, 2, -2,])
actual = TestConvergence.crt.update_history

assert np.array_equal(expected, actual)

def test_converged_kernel_size(self):

# Construct Atlas
bibtex_fp = ten_pub_bibtex_fp
atl = TestConvergence.crt.bibtex_to_atlas(bibtex_fp)
atl = TestConvergence.crt.project(atl)

# Mock expansion/update history data
input = [
['f2c251056dee4c6f9130b31e5e3e4b3296051c49'], # it=0

['4364af31229f7e9a3d83a289a928b2f2a43d30cb', # it=1
'f2c251056dee4c6f9130b31e5e3e4b3296051c49',
'287fa946f30eaa78ea86f9c5bd61d67238202356',],

['50dea78a96f03ba7fc3398c5deea5174630ef186', # it=2
'54a83cd1d94814b0f37ee48084260a2d1882648d',
'4364af31229f7e9a3d83a289a928b2f2a43d30cb', 'f2c251056dee4c6f9130b31e5e3e4b3296051c49',
'287fa946f30eaa78ea86f9c5bd61d67238202356',
'2e6438be4901cb9b42ff23dcc3d433789b37d032',
'04da6471743468b6bb1d26dd9a6eac4c03ca73ee',],

list(atl.publications.keys()), # it=3
]

        TestConvergence.crt.record_update_history(
            input[-1], pubs_per_update=input
        )

        expected = np.array([3, 3, 2, 2, 1, 0, 1, 2, 2, 3,])
        actual = TestConvergence.crt.update_history

assert np.array_equal(expected, actual)

# mock center
actual = TestConvergence.crt.converged_kernel_size( atl )

expected = np.array(
[[-1, -1, -1],
[-1, -1, -1],
[-1, -1, 0],
[-1, -1, 1],
[-1, 0, 3],
[ 0, 0, 2],
[-1, 0, 0],
[-1, -1, 1],
[-1, -1, 3],
[-1, -1, -1]]
)

assert np.array_equal(actual, expected)

def test_pubs_per_update_expand_consistent(self, tmp_path):
# Load single file from bibtex
# Load expected values
bibtex_fp = single_pub_bibtex_fp
with open(bibtex_fp, "r") as f:
bib_database = bibtexparser.load(f)

path = tmp_path / atlas_dir
path.mkdir()
# Construct Atlas
atl = TestConvergence.crt.bibtex_to_atlas(bibtex_fp)

pub = list(atl.publications.values())[0]
center = pub.identifier

# Expand repeatedly
num_expansions = 10
for _ in range(num_expansions):
atl = TestConvergence.crt.expand(
atl, center=center, n_pubs_max=10, record_pubs_per_update=True,
)

assert len(TestConvergence.crt.pubs_per_update) == num_expansions
TestConvergence.crt.record_update_history()

# need to project all pubs before kernel calculations!
atl = TestConvergence.crt.project(atl)

# test convergence calculations
result = TestConvergence.crt.converged_kernel_size(atl)
