+sciterra.mapping.topography
+
+ Functions for measuring topographic properties of (the semantic feature space of publications inside) an Atlas.
"""Functions for measuring topographic properties of (the semantic feature space of publications inside) an Atlas."""

import inspect
import numpy as np


########################################################################
# Density metrics
########################################################################


def smoothing_length_metric(
    idx: int,
    cospsi_matrix: np.ndarray,
    valid_indices: np.ndarray,
    kernel_size: int = 16,
):
    """Proxy for the density of a publication, defined as the minimum
    arc length that encloses `kernel_size` other publications.

    Args:
        idx: the index of the vector to calculate the measurement for.

        cospsi_matrix: a 2D matrix of pairwise cosine similarity scores for publication embeddings.

        valid_indices: indices of the other publications used when calculating the measurements.

        kernel_size: number of K nearest neighbors to calculate the measurement on.

    Returns:
        h: float representing arc length containing `kernel_size` other publications (assumes normalized to a radius of 1), or `np.nan` when `kernel_size` exceeds the number of valid publications.

    Raises:
        IndexError: if `idx` or an entry of `valid_indices` is out of bounds for `cospsi_matrix`.
    """

    # We can't have the kernel larger than the number of valid publications
    if kernel_size > len(valid_indices):
        return np.nan

    # 1D array of similarity scores between the idx vector and each valid
    # publication. Out-of-range indices raise IndexError to the caller.
    cospsi = cospsi_matrix[idx][valid_indices]

    # np.sort orders from least to greatest similarity, so reverse it and
    # take the similarity of the kernel_size-th most similar publication.
    cospsi_kth = np.sort(cospsi)[::-1][kernel_size - 1]

    # Round-off can push a cosine marginally outside [-1, 1], where arccos
    # would return NaN; clip before converting to an arc length.
    return np.arccos(np.clip(cospsi_kth, -1.0, 1.0))


def density_metric(
    idx: int,
    cospsi_matrix: np.ndarray,
    valid_indices: np.ndarray,
    kernel_size: int = 16,
):
    """Estimate the density of a publication by calculating the
    smoothing length that encloses `kernel_size` other publications.

    Args:
        idx: the index of the vector to calculate the measurement for.

        cospsi_matrix: a 2D matrix of pairwise cosine similarity scores for publication embeddings.

        valid_indices: indices of the other publications used when calculating the measurements.

        kernel_size: number of K nearest neighbors to calculate the measurement on.

    Returns:
        density: a float representing `kernel_size` divided by arc length containing `kernel_size` other publications.
    """

    h = smoothing_length_metric(idx, cospsi_matrix, valid_indices, kernel_size)
    density = kernel_size / h

    return density


########################################################################
# Asymmetry metrics
########################################################################


def edginess_metric(
    idx: int,
    cospsi_matrix: np.ndarray,
    valid_indices: np.ndarray,
    publication_indices: np.ndarray,
    embeddings: np.ndarray,
    kernel_size: int = 16,
) -> float:
    """Estimate the asymmetry of a publication by calculating the difference between that publication's projection and the other publications within the kernel. Normalized to between 0 and 1.

    Args:
        idx: the index of the vector to calculate the measurement for.

        cospsi_matrix: an np.ndarray of shape `(num_pubs, num_pubs)` representing pairwise cosine similarity scores for publication embeddings.

        valid_indices: an np.ndarray of shape `(num_valid_pubs)` representing indices of the other publications used when calculating the measurements, i.e. num_valid_pubs <= num_pubs.

        publication_indices: an np.ndarray of shape `(num_pubs,)` representing indices of all publications in the atlas projection

        embeddings: an np.ndarray of shape `(num_pubs, embedding_dim)` vectors for all publications in the atlas projection

        kernel_size: number of K nearest neighbors to calculate the measurement on.

    Returns:
        a float representing the normalized magnitude of the asymmetry metric.

    """
    return (
        kernel_constant_asymmetry_metric(
            idx,
            cospsi_matrix,
            valid_indices,
            publication_indices,
            embeddings,
            kernel_size=kernel_size,
        )
        / kernel_size
    )


def kernel_constant_asymmetry_metric(
    idx: int,
    cospsi_matrix: np.ndarray,
    valid_indices: np.ndarray,
    publication_indices: np.ndarray,
    embeddings: np.ndarray,
    kernel_size: int = 16,
) -> float:
    """Estimate the asymmetry of a publication by calculating the difference
    between that publication's projection and the other publications within
    the kernel.

    Args:
        idx: an int representing the index of the vector to calculate the measurement for.

        cospsi_matrix: an np.ndarray of shape `(num_pubs, num_pubs)` representing pairwise cosine similarity scores for publication embeddings.

        valid_indices: an np.ndarray of shape `(num_valid_pubs)` representing indices of the other publications used when calculating the measurements, i.e. num_valid_pubs <= num_pubs.

        publication_indices: an np.ndarray of shape `(num_pubs,)` representing indices of all publications in the atlas projection

        embeddings: an np.ndarray of shape `(num_pubs, embedding_dim)` vectors for all publications in the atlas projection

        kernel_size: an int representing the number of K nearest neighbors to calculate the measurement on.

    Returns:
        mag: a float representing the magnitude of the asymmetry metric, or `np.nan` when `kernel_size` exceeds the number of valid publications.
    """

    # We can't have the kernel larger than the number of valid publications
    if kernel_size > len(valid_indices):
        return np.nan

    # Positions (within valid_indices) of the kernel_size most similar
    # publications, most similar first.
    cospsi = cospsi_matrix[idx][valid_indices]
    sorted_inds = np.argsort(cospsi)[::-1][:kernel_size]
    # NOTE(review): valid_indices index cospsi_matrix directly but are mapped
    # through publication_indices for the embeddings; the two agree when
    # publication_indices is arange(num_pubs) -- confirm against the caller.
    other_inds = publication_indices[valid_indices][sorted_inds]
    embedding = embeddings[idx]
    other_embeddings = embeddings[other_inds]

    # Differences
    diff = embedding - other_embeddings
    diff_mag = np.linalg.norm(diff, axis=1)

    # A neighbour whose embedding coincides with the target has no direction;
    # dividing by its zero length would make the whole result NaN, so skip it.
    # (`!= 0` keeps NaN magnitudes, so NaN inputs still propagate.)
    has_direction = diff_mag != 0
    unit_diff = diff[has_direction] / diff_mag[has_direction, np.newaxis]
    result = unit_diff.sum(axis=0)
    mag = np.linalg.norm(result)

    return mag
def smoothing_length_metric(
    idx: int,
    cospsi_matrix: np.ndarray,
    valid_indices: np.ndarray,
    kernel_size: int = 16,
):
    """Proxy for the density of a publication, defined as the minimum
    arc length that encloses `kernel_size` other publications.

    Args:
        idx: the index of the vector to calculate the measurement for.

        cospsi_matrix: a 2D matrix of pairwise cosine similarity scores for publication embeddings.

        valid_indices: indices of the other publications used when calculating the measurements.

        kernel_size: number of K nearest neighbors to calculate the measurement on.

    Returns:
        h: float representing arc length containing `kernel_size` other publications (assumes normalized to a radius of 1), or `np.nan` when `kernel_size` exceeds the number of valid publications.

    Raises:
        IndexError: if `idx` or an entry of `valid_indices` is out of bounds for `cospsi_matrix`.
    """

    # We can't have the kernel larger than the number of valid publications
    if kernel_size > len(valid_indices):
        return np.nan

    # 1D array of similarity scores between the idx vector and each valid
    # publication. Out-of-range indices raise IndexError to the caller.
    cospsi = cospsi_matrix[idx][valid_indices]

    # np.sort orders from least to greatest similarity, so reverse it and
    # take the similarity of the kernel_size-th most similar publication.
    cospsi_kth = np.sort(cospsi)[::-1][kernel_size - 1]

    # Round-off can push a cosine marginally outside [-1, 1], where arccos
    # would return NaN; clip before converting to an arc length.
    return np.arccos(np.clip(cospsi_kth, -1.0, 1.0))
Proxy for the density of a publication defined as the minimum +arc length that encloses kernel_size other publications.
+ +Arguments:
+ +-
+
- idx: the index of the vector to calculate the measurement for. +
- cospsi_matrix: a 2D matrix of pairwise cosine similarity scores for publication embeddings. +
- valid_indices: Indices of the other publications used when calculating the measurements. +
- kernel_size: number of K nearest neighbors to calculate the measurement on. +
Returns:
+ +++h: float representing arc length containing
+kernel_size
other publications. (Assumes normalized to a radius of 1.)
def density_metric(
    idx: int,
    cospsi_matrix: np.ndarray,
    valid_indices: np.ndarray,
    kernel_size: int = 16,
):
    """Density estimate for one publication: the kernel size divided by the
    smoothing length that encloses that many other publications.

    Args:
        idx: the index of the vector to calculate the measurement for.

        cospsi_matrix: a 2D matrix of pairwise cosine similarity scores for publication embeddings.

        valid_indices: indices of the other publications used when calculating the measurements.

        kernel_size: number of K nearest neighbors to calculate the measurement on.

    Returns:
        density: `kernel_size` divided by the value `smoothing_length_metric` returns for the same arguments.
    """
    # The smoothing length is the arc length reaching the kernel_size-th
    # nearest neighbour; a shorter arc therefore means a denser neighbourhood.
    smoothing_length = smoothing_length_metric(
        idx,
        cospsi_matrix,
        valid_indices,
        kernel_size,
    )
    return kernel_size / smoothing_length
Estimate the density of a publication by calculating the +smoothing length that encloses kernel_size other publications.
+ +Arguments:
+ +-
+
- idx: the index of the vector to calculate the measurement for. +
- cospsi_matrix: a 2D matrix of pairwise cosine similarity scores for publication embeddings. +
- valid_indices: Indices of the other publications used when calculating the measurements. +
- kernel_size: number of K nearest neighbors to calculate the measurement on. +
Returns:
+ +++density: a float representing
+kernel_size
divided by arc length containing kernel_size
other publications.
def edginess_metric(
    idx: int,
    cospsi_matrix: np.ndarray,
    valid_indices: np.ndarray,
    publication_indices: np.ndarray,
    embeddings: np.ndarray,
    kernel_size: int = 16,
) -> float:
    """Asymmetry of a publication relative to its kernel neighbours, scaled by the kernel size.

    This is `kernel_constant_asymmetry_metric` divided by `kernel_size`. That
    metric is the norm of a sum of unit vectors, one per kernel neighbour, so
    the quotient lies between 0 and 1.

    Args:
        idx: the index of the vector to calculate the measurement for.

        cospsi_matrix: an np.ndarray of shape `(num_pubs, num_pubs)` representing pairwise cosine similarity scores for publication embeddings.

        valid_indices: an np.ndarray of shape `(num_valid_pubs)` representing indices of the other publications used when calculating the measurements, i.e. num_valid_pubs <= num_pubs.

        publication_indices: an np.ndarray of shape `(num_pubs,)` representing indices of all publications in the atlas projection

        embeddings: an np.ndarray of shape `(num_pubs, embedding_dim)` vectors for all publications in the atlas projection

        kernel_size: number of K nearest neighbors to calculate the measurement on.

    Returns:
        a float representing the normalized magnitude of the asymmetry metric.
    """
    asymmetry = kernel_constant_asymmetry_metric(
        idx,
        cospsi_matrix,
        valid_indices,
        publication_indices,
        embeddings,
        kernel_size=kernel_size,
    )
    return asymmetry / kernel_size
Estimate the asymmetry of a publication by calculating the difference between that publication's projection and the other publications within the kernel. Normalized to between 0 and 1.
+ +Arguments:
+ +-
+
- idx: the index of the vector to calculate the measurement for. +
- cospsi_matrix: an np.ndarray of shape
(num_pubs, num_pubs)
representing pairwise cosine similarity scores for publication embeddings.
+ - valid_indices: an np.ndarray of shape
(num_valid_pubs)
representing indices of the other publications used when calculating the measurements, i.e. num_valid_pubs <= num_pubs.
+ - publication_indices: an np.ndarray of shape
(num_pubs,)
representing indices of all publications in the atlas projection
+ - embeddings: an np.ndarray of shape
(num_pubs, embedding_dim)
vectors for all publications in the atlas projection
+ - kernel_size: number of K nearest neighbors to calculate the measurement on. +
Returns:
+ +++a float representing the normalized magnitude of the asymmetry metric.
+
def kernel_constant_asymmetry_metric(
    idx: int,
    cospsi_matrix: np.ndarray,
    valid_indices: np.ndarray,
    publication_indices: np.ndarray,
    embeddings: np.ndarray,
    kernel_size: int = 16,
) -> float:
    """Estimate the asymmetry of a publication by calculating the difference
    between that publication's projection and the other publications within
    the kernel.

    The result is the norm of the sum of unit vectors pointing from each of
    the `kernel_size` most similar valid publications to the target one.

    Args:
        idx: an int representing the index of the vector to calculate the measurement for.

        cospsi_matrix: an np.ndarray of shape `(num_pubs, num_pubs)` representing pairwise cosine similarity scores for publication embeddings.

        valid_indices: an np.ndarray of shape `(num_valid_pubs)` representing indices of the other publications used when calculating the measurements, i.e. num_valid_pubs <= num_pubs.

        publication_indices: an np.ndarray of shape `(num_pubs,)` representing indices of all publications in the atlas projection

        embeddings: an np.ndarray of shape `(num_pubs, embedding_dim)` vectors for all publications in the atlas projection

        kernel_size: an int representing the number of K nearest neighbors to calculate the measurement on.

    Returns:
        mag: a float representing the magnitude of the asymmetry metric, or `np.nan` when `kernel_size` exceeds the number of valid publications.
    """

    # We can't have the kernel larger than the number of valid publications
    if kernel_size > len(valid_indices):
        return np.nan

    # Positions (within valid_indices) of the kernel_size most similar
    # publications, most similar first.
    cospsi = cospsi_matrix[idx][valid_indices]
    sorted_inds = np.argsort(cospsi)[::-1][:kernel_size]
    # NOTE(review): valid_indices index cospsi_matrix directly but are mapped
    # through publication_indices for the embeddings; the two agree when
    # publication_indices is arange(num_pubs) -- confirm against the caller.
    other_inds = publication_indices[valid_indices][sorted_inds]
    embedding = embeddings[idx]
    other_embeddings = embeddings[other_inds]

    # Differences
    diff = embedding - other_embeddings
    diff_mag = np.linalg.norm(diff, axis=1)

    # A neighbour whose embedding coincides with the target has no direction;
    # dividing by its zero length would make the whole result NaN, so skip it.
    # (`!= 0` keeps NaN magnitudes, so NaN inputs still propagate.)
    has_direction = diff_mag != 0
    unit_diff = diff[has_direction] / diff_mag[has_direction, np.newaxis]
    result = unit_diff.sum(axis=0)
    mag = np.linalg.norm(result)

    return mag
Estimate the asymmetry of a publication by calculating the difference +between that publication's projection and the other publications within +the kernel.
+ +Arguments:
+ +-
+
- idx: an int representing the index of the vector to calculate the measurement for. +
- cospsi_matrix: an np.ndarray of shape
(num_pubs, num_pubs)
representing pairwise cosine similarity scores for publication embeddings.
+ - valid_indices: an np.ndarray of shape
(num_valid_pubs)
representing indices of the other publications used when calculating the measurements, i.e. num_valid_pubs <= num_pubs.
+ - publication_indices: an np.ndarray of shape
(num_pubs,)
representing indices of all publications in the atlas projection
+ - embeddings: an np.ndarray of shape
(num_pubs, embedding_dim)
vectors for all publications in the atlas projection
+ - kernel_size: an int representing the number of K nearest neighbors to calculate the measurement on. +
Returns:
+ +++mag: a float representing the magnitude of the asymmetry metric.
+