
Commit

add TestConvergence
Nathaniel Imel authored and Nathaniel Imel committed Nov 9, 2023
1 parent a646bf9 commit 362b43c
Showing 3 changed files with 264 additions and 20 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -2,6 +2,6 @@

[![build](https://github.com/nathimel/sciterra/actions/workflows/test.yml/badge.svg)](https://github.com/nathimel/sciterra/actions/workflows/test.yml)

Software library to support data-driven analyses of scientific literature
Software library to support data-driven analyses of scientific literature.

Inspired heavily by Zach Hafen's [cc](https://github.com/zhafen/cc) library.
This library is a reimplementation of Zach Hafen's [cc](https://github.com/zhafen/cc) library.
154 changes: 138 additions & 16 deletions src/sciterra/mapping/cartography.py
@@ -20,8 +20,40 @@
warnings.formatwarning = custom_formatwarning


def batch_cospsi_matrix(embeddings: np.ndarray) -> np.ndarray:
    """Compute the pairwise cosine similarity matrix between embeddings in batches.

    To avoid memory errors (e.g. bus errors, segfaults) resulting from overly large arrays, we batch the construction of the cospsi_matrix.

    Args:
        embeddings: a 2D numpy array of embeddings, of shape `(num_pubs, embedding_dim)`

    Returns:
        cosine_similarities: a 2D numpy array of shape `(num_pubs, num_pubs)` representing the pairwise cosine similarity between each pair of embeddings
    """
    batch_size = min(1000, len(embeddings))  # Define a batch size

    cosine_similarities = None
    print(f"computing cosine similarity for {len(embeddings)} embeddings with batch size {batch_size}.")
    for i in tqdm(range(0, len(embeddings), batch_size)):
        # Process batches to compute cosine similarity
        batch = embeddings[i : i + batch_size]
        if cosine_similarities is None:
            cosine_similarities = cosine_similarity(batch, embeddings)
        else:
            cosine_similarities = np.vstack(
                (cosine_similarities, cosine_similarity(batch, embeddings))
            )

    return cosine_similarities
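For reference, a minimal standalone sketch of the batching logic above (assuming `cosine_similarity` here is scikit-learn's `sklearn.metrics.pairwise.cosine_similarity`; the toy array sizes are made up):

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Toy embeddings: 5 documents, 4-dimensional vectors.
embeddings = np.random.rand(5, 4)

# Unbatched reference: row i holds similarities between document i and all documents.
full = cosine_similarity(embeddings, embeddings)

# Batched construction, stacking a few rows at a time as batch_cospsi_matrix does.
batch_size = 2
rows = [
    cosine_similarity(embeddings[i : i + batch_size], embeddings)
    for i in range(0, len(embeddings), batch_size)
]
batched = np.vstack(rows)

assert batched.shape == (5, 5)      # (num_pubs, num_pubs)
assert np.allclose(full, batched)   # same matrix, built in memory-friendly pieces
```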


class Cartographer:
"""A basic wrapper for obtaining and updating atlas projections."""
"""A basic wrapper for obtaining and updating atlas projections.
`self.librarian`: the Librarian object used to query a bibliographic database API.
`self.vectorizer`: the Vectorizer object used to get a document embedding for each abstract
`self.pubs_per_update`: a list of lists of publication str ids, representing the publications that exist at each time step / expansion update.
`self.update_history`: an np.array of ints representing when publications were added. A value of -2 indicates no record of being added.
"""

def __init__(
self,
@@ -31,6 +63,9 @@ def __init__(
self.librarian = librarian
self.vectorizer = vectorizer

self.pubs_per_update: list[list[str]] = []
self.update_history: np.ndarray = None

######################################################################
# Get an Atlas from bibtex
######################################################################
@@ -169,6 +204,7 @@ def expand(
center: str = None,
n_pubs_max: int = 4000,
n_sources_max: int = None,
record_pubs_per_update: bool = False,
**kwargs,
) -> Atlas:
"""Expand an atlas by retrieving a list of publications resulting from traversal of the citation network.
@@ -182,6 +218,8 @@
n_sources_max: maximum number of publications (already in the atlas) to draw references and citations from.
record_pubs_per_update: whether to track all the publications that exist in the resulting atlas to `self.pubs_per_update`. This should only be set to `True` when you need to later filter by degree of convergence of the atlas.
Returns:
atl_expanded: the expanded atlas
"""
@@ -241,6 +279,10 @@ def expand(
atl.projection
) # new projection will be updated in `project`

# Record the new list of publications
if record_pubs_per_update:
self.pubs_per_update.append(list(atl_exp.publications.keys()))

return atl_exp
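As a rough sketch, the intended workflow around `record_pubs_per_update` might look like the following (the seed bibliography, loop count, and `n_pubs_max` are hypothetical; the calls mirror the convergence tests added below):

```python
# Sketch: iteratively expand an atlas while tracking which publications
# existed at each step, then measure convergence.
crt = Cartographer(librarian, vectorizer)   # assumes a configured Librarian and Vectorizer
atl = crt.bibtex_to_atlas("seed.bib")       # hypothetical seed bibliography
center = next(iter(atl.publications))       # expand around one publication id

for _ in range(5):
    atl = crt.expand(atl, center=center, n_pubs_max=100, record_pubs_per_update=True)

crt.record_update_history()   # consumes crt.pubs_per_update recorded during expansion
atl = crt.project(atl)        # embeddings are needed before kernel calculations
kernel_sizes = crt.converged_kernel_size(atl)
```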

######################################################################
@@ -319,6 +361,100 @@ def filter(
atl_filtered = Atlas(new_publications, new_projection)
atl_filtered.bad_ids = new_bad_ids
return atl_filtered

########################################################################
# Record Atlas history
########################################################################

    def record_update_history(
        self,
        pubs: list[str] = None,
        pubs_per_update: list[list[str]] = None,
    ) -> None:
        """Record when publications were added, by updating `self.update_history`.

        `self.update_history` is a np.array of ints representing when publications were added. A value of -2 indicates no record of being added.

        Args:
            pubs: a list of str ids corresponding to publications at the final update in the update history. By default `None`, and `self.pubs_per_update[-1]` will be used.

            pubs_per_update: a list of which publications existed at which iteration, with the index of the overall list corresponding to the iteration the publication was added. By default `None`, and `self.pubs_per_update` will be used.

        Updates:
            `self.update_history`: an np.array of ints representing when publications were added. A value of -2 indicates no record of being added.

        Returns:
            `None`
        """
        if pubs is None:
            pubs = self.pubs_per_update[-1]
        # Ensure a numpy array, so boolean-mask assignment below works even if a plain list was passed.
        pubs = np.array(pubs)

        if pubs_per_update is None:
            pubs_per_update = self.pubs_per_update

        # Loop backwards, so that the earliest iteration containing a publication wins.
        i_max = len(pubs_per_update) - 1
        update_history = np.full(pubs.shape, -2)
        for i, pubs_i in enumerate(pubs_per_update[::-1]):
            is_in = np.isin(pubs, pubs_i)
            update_history[is_in] = i_max - i

        self.update_history = update_history
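To make the bookkeeping concrete, here is a small hand-worked sketch of the backwards loop above (standalone, with made-up publication ids):

```python
import numpy as np

# Which publications existed at each expansion step (toy ids).
pubs_per_update = [
    ["a"],                 # iteration 0
    ["a", "b", "c"],       # iteration 1
    ["a", "b", "c", "d"],  # iteration 2
]
# Publications to record a history for; "e" was never recorded in any update.
pubs = np.array(["a", "b", "c", "d", "e"])

update_history = np.full(pubs.shape, -2)
i_max = len(pubs_per_update) - 1
# Looping backwards means the earliest iteration containing a publication wins.
for i, pubs_i in enumerate(pubs_per_update[::-1]):
    update_history[np.isin(pubs, pubs_i)] = i_max - i

print(update_history)  # [ 0  1  1  2 -2]: iteration each id first appears in; -2 if never
```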

########################################################################
# Calculate Atlas convergence
########################################################################

def converged_kernel_size(self, atl: Atlas) -> np.ndarray:
"""Calculate the largest size of the kernel that's converged (at differing levels of convergence) for each publication in a sample at each update.
Args:
atl: Atlas containing publications; for each publication we compute the largest converged kernel size at each update
Returns:
kernel_size: an array of ints of shape `(num_pubs, max_update)` representing the kernel size for converged kernels.
- The first column indicates the largest kernel size that hasn't changed since the beginning,
- The second column indicates the largest kernel size that hasn't changed since the first update,
- etc. for the nth column.
"""

if self.update_history is None:
raise ValueError('update_history is None; make sure you have called record_update_history()!')

if -2 in self.update_history:
raise ValueError('Incomplete update history as indicated by entries with values of -2.')

publications = np.array(list(atl.publications.keys()))

# 1. Loop over each publication
cospsi_kernel = []
for pub in tqdm(publications):

# 2. Identify the similarity with the other publications relative to this publication, and sort accordingly.
cospsi = cosine_similarity(
atl.projection.identifiers_to_embeddings([pub]),
atl.projection.embeddings,
).flatten() # shape `(num_pubs,)`
sort_inds = np.argsort(cospsi)[::-1] # shape `(num_pubs,)`

# 3. Identify the expansion iteration at which those publications were added to the atlas (`sorted_history`).
sorted_history = self.update_history[sort_inds] # shape `(num_pubs,)`

# 4. Identify the latest iteration at which any publication was added to the atlas; this can be less than the total iterations.
last_update = self.update_history.max()

# 5. Loop through each iteration until `last_update`, and identify which publications were added at or before that iteration.
result_2 = [
# 6. Compute how many publications out we can go and still only contain publications added at or before that iteration.
# Use `argmin` to get the first instance of False
# Finally, subtract 1: we want the first index before False.
np.argmin(sorted_history <= update) - 1
for update in range(last_update)
] # shape `(num_pubs, last_update)`

cospsi_kernel.append(result_2)

return np.array(cospsi_kernel)
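A standalone sketch of the per-publication computation in steps 3–6, on a toy `sorted_history` array (made-up values, same `argmin`-over-booleans trick as above):

```python
import numpy as np

# Update history of one publication's neighbours, already sorted by decreasing
# cosine similarity to that publication (step 3 above).
sorted_history = np.array([0, 0, 1, 0, 2, 1, 2])
last_update = sorted_history.max()  # 2

kernel_sizes = []
for update in range(last_update):
    # Boolean mask: neighbours added at or before this iteration.
    mask = sorted_history <= update
    # argmin finds the first False, i.e. the nearest neighbour added later;
    # subtracting 1 gives the last index of the prefix that was already in place.
    kernel_sizes.append(np.argmin(mask) - 1)

print(kernel_sizes)  # [1, 3]: a 2-neighbour kernel is stable since it=0, a 4-neighbour kernel since it=1
```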

########################################################################
# Measure Atlas topography
@@ -380,21 +516,7 @@ def measure_topography(
]
)

# To avoid memory errors (e.g. bus error, segfaults) resulting from too large arrays, we batch process the construction of the cospsi_matrix.
embeddings = atl.projection.embeddings
batch_size = min(1000, len(embeddings)) # Define a batch size

cosine_similarities = None
print(f"computing cosine similarity for {len(embeddings)} embeddings with batch size {batch_size}.")
for i in tqdm(range(0, len(embeddings), batch_size)):
# Process batches to compute cosine similarity
batch = embeddings[i:i+batch_size]
if cosine_similarities is None:
cosine_similarities = cosine_similarity(batch, embeddings)
else:
cosine_similarities = np.vstack((cosine_similarities, cosine_similarity(batch, embeddings)))

cospsi_matrix = cosine_similarities
cospsi_matrix = batch_cospsi_matrix(atl.projection.embeddings)

print(f"Computing {metrics} for {len(publication_indices)} publications.")
estimates = []
126 changes: 124 additions & 2 deletions src/tests/test_cartography.py
@@ -22,6 +22,7 @@

atlas_dir = "atlas_tmpdir"

# NOTE: Any time you are querying an API for papers, it is not a good idea to have strict tests on the resulting output size, since much is out of our control; for example, the online database may have added new papers.

class TestS2BibtexToAtlas:
"""Test functionality that maps a bibtex file to a list of identifiers, and then populates an atlas. The purpose of this functionality is to map a human-readable / very popular dataformat to the Atlas data structure."""
@@ -101,8 +102,10 @@ def test_bibtex_to_atlas_realistic(self, tmp_path):

# I find that I get 28 out of 86 total refs, i.e. less than a third of papers targeted.
# or 32 lol
# or 31
# assert len(atl) == 28
assert len(atl) == 32
# assert len(atl) == 32
# assert len(atl) == 31


class TestS2SBProjection:
@@ -320,7 +323,8 @@ def test_expand_center_double(self, tmp_path):
)
# empirically found this
# do no assert len(atl_exp_double) == 4000 + len(ids), because we want 4000 + len(valid_ids), i.e. 148 not 154
assert len(atl_exp_double) == 348
# assert len(atl_exp_double) == 348 # why off by a few?
# assert len(atl_exp_double) == 345

atl_exp_double.save(path)

@@ -361,3 +365,121 @@ def test_measure_topography(self):
metrics=metrics,
)
assert measurements.shape == tuple((len(atl_exp_single), len(metrics)))

class TestConvergence:
librarian = SemanticScholarLibrarian()
vectorizer = SciBERTVectorizer()
crt = Cartographer(librarian, vectorizer)

def test_record_update_history(self):
# Construct Atlas
bibtex_fp = ten_pub_bibtex_fp

atl = TestConvergence.crt.bibtex_to_atlas(bibtex_fp)

# Mock expansion/update history data
input = [
['f2c251056dee4c6f9130b31e5e3e4b3296051c49'], # it=0

['4364af31229f7e9a3d83a289a928b2f2a43d30cb', # it=1
'f2c251056dee4c6f9130b31e5e3e4b3296051c49',
'287fa946f30eaa78ea86f9c5bd61d67238202356',],

['50dea78a96f03ba7fc3398c5deea5174630ef186', # it=2
'54a83cd1d94814b0f37ee48084260a2d1882648d',
'4364af31229f7e9a3d83a289a928b2f2a43d30cb', 'f2c251056dee4c6f9130b31e5e3e4b3296051c49',
'287fa946f30eaa78ea86f9c5bd61d67238202356',
'2e6438be4901cb9b42ff23dcc3d433789b37d032',
'04da6471743468b6bb1d26dd9a6eac4c03ca73ee',],

]

        # Record history for all ten atlas publications; three of them never appear in the mock data above.
        TestConvergence.crt.record_update_history(
            list(atl.publications.keys()), pubs_per_update=input
        )

expected = np.array([-2, -2, 2, 2, 1, 0, 1, 2, 2, -2,])
actual = TestConvergence.crt.update_history

assert np.array_equal(expected, actual)

def test_converged_kernel_size(self):

# Construct Atlas
bibtex_fp = ten_pub_bibtex_fp
atl = TestConvergence.crt.bibtex_to_atlas(bibtex_fp)
atl = TestConvergence.crt.project(atl)

# Mock expansion/update history data
input = [
['f2c251056dee4c6f9130b31e5e3e4b3296051c49'], # it=0

['4364af31229f7e9a3d83a289a928b2f2a43d30cb', # it=1
'f2c251056dee4c6f9130b31e5e3e4b3296051c49',
'287fa946f30eaa78ea86f9c5bd61d67238202356',],

['50dea78a96f03ba7fc3398c5deea5174630ef186', # it=2
'54a83cd1d94814b0f37ee48084260a2d1882648d',
'4364af31229f7e9a3d83a289a928b2f2a43d30cb', 'f2c251056dee4c6f9130b31e5e3e4b3296051c49',
'287fa946f30eaa78ea86f9c5bd61d67238202356',
'2e6438be4901cb9b42ff23dcc3d433789b37d032',
'04da6471743468b6bb1d26dd9a6eac4c03ca73ee',],

list(atl.publications.keys()), # it=3
]

        TestConvergence.crt.record_update_history(
            input[-1], pubs_per_update=input
        )

        expected = np.array([3, 3, 2, 2, 1, 0, 1, 2, 2, 3,])
        actual = TestConvergence.crt.update_history

assert np.array_equal(expected, actual)

# mock center
actual = TestConvergence.crt.converged_kernel_size( atl )

expected = np.array(
[[-1, -1, -1],
[-1, -1, -1],
[-1, -1, 0],
[-1, -1, 1],
[-1, 0, 3],
[ 0, 0, 2],
[-1, 0, 0],
[-1, -1, 1],
[-1, -1, 3],
[-1, -1, -1]]
)

assert np.array_equal(actual, expected)

def test_pubs_per_update_expand_consistent(self, tmp_path):
# Load single file from bibtex
# Load expected values
bibtex_fp = single_pub_bibtex_fp
with open(bibtex_fp, "r") as f:
bib_database = bibtexparser.load(f)

path = tmp_path / atlas_dir
path.mkdir()
# Construct Atlas
atl = TestConvergence.crt.bibtex_to_atlas(bibtex_fp)

pub = list(atl.publications.values())[0]
center = pub.identifier

# Expand repeatedly
num_expansions = 10
for _ in range(num_expansions):
atl = TestConvergence.crt.expand(
atl, center=center, n_pubs_max=10, record_pubs_per_update=True,
)

assert len(TestConvergence.crt.pubs_per_update) == num_expansions
TestConvergence.crt.record_update_history()

# need to project all pubs before kernel calculations!
atl = TestConvergence.crt.project(atl)

# test convergence calculations
result = TestConvergence.crt.converged_kernel_size(atl)
