From fb483e28b8c98ebbd8fdd966b44939654841e391 Mon Sep 17 00:00:00 2001 From: Savita Karthikeyan Date: Fri, 25 Aug 2023 11:23:08 +0100 Subject: [PATCH 1/4] Added child_left, child_right columns to nodes_df and tests for these. --- model.py | 49 ++++++++++++++++++++++++++++++++++++++++ pages/nodes.py | 11 ++++++++- tests/test_data_model.py | 4 ++++ 3 files changed, 63 insertions(+), 1 deletion(-) diff --git a/model.py b/model.py index b5b5a5f..0f59d78 100644 --- a/model.py +++ b/model.py @@ -449,6 +449,8 @@ def nodes_df(self): "time": ts.nodes_time, "num_mutations": self.nodes_num_mutations, "ancestors_span": child_right - child_left, + "child_left": child_left, # FIXME add test for this + "child_right": child_right, # FIXME add test for this "is_sample": is_sample, } ) @@ -458,6 +460,8 @@ def nodes_df(self): "time": "float64", "num_mutations": "int", "ancestors_span": "float64", + "child_left": "float64", + "child_right": "float64", "is_sample": "bool", } ) @@ -584,3 +588,48 @@ def calc_mutations_per_tree(self): mutations_per_tree = np.zeros(self.ts.num_trees, dtype=np.int64) mutations_per_tree[unique_values] = counts return mutations_per_tree + + def compute_ancestor_spans_heatmap_data(self, win_x_size=1_000_000, win_y_size=500): + """ + Calculates the average ancestor span in a genomic-time window + """ + nodes_df = self.nodes_df[self.nodes_df.ancestors_span != -np.inf] + nodes_df = nodes_df.reset_index(drop=True) + nodes_left = nodes_df.child_left + nodes_right = nodes_df.child_right + nodes_time = nodes_df.time + ancestors_span = nodes_df.ancestors_span + + num_x_wins = int(np.ceil(nodes_right.max() - nodes_left.min()) / win_x_size) + num_y_wins = int(np.ceil(nodes_time.max() / win_y_size)) + heatmap_sums = np.zeros((num_x_wins, num_y_wins)) + heatmap_counts = np.zeros((num_x_wins, num_y_wins)) + + for u in range(len(nodes_left)): + x_start = int( + np.floor(nodes_left[u] / win_x_size) + ) # map the node span to the x-axis bins it overlaps + x_end = int(np.floor(nodes_right[u] / win_x_size)) + y = max(0, int(np.floor(nodes_time[u] / win_y_size)) - 1) + heatmap_sums[x_start:x_end, y] += min(ancestors_span[u], win_x_size) + heatmap_counts[x_start:x_end, y] += 1 + + avg_spans = heatmap_sums / heatmap_counts + indices = np.indices((num_x_wins, num_y_wins)) + x_coords = indices[0] * win_x_size + y_coords = indices[1] * win_y_size + + df = pd.DataFrame( + { + "genomic_position": x_coords.flatten(), + "time": y_coords.flatten(), + "average_ancestor_span": avg_spans.flatten(), + } + ) + return df.astype( + { + "genomic_position": "int", + "time": "int", + "average_ancestor_span": "float64", + } + ) diff --git a/pages/nodes.py b/pages/nodes.py index 81954cd..e66fc6c 100644 --- a/pages/nodes.py +++ b/pages/nodes.py @@ -57,4 +57,13 @@ def make_node_hist_panel(tsm, log_y): pn.pane.Markdown("# Plot Options"), log_y_checkbox, ) - return pn.Column(main, hist_panel, plot_options) + + anc_span_data = tsm.compute_ancestor_spans_heatmap_data() + heatmap = hv.HeatMap(anc_span_data).opts( + width=config.PLOT_WIDTH, + height=config.PLOT_HEIGHT, + tools=["hover"], + colorbar=True, + ) + + return pn.Column(main, hist_panel, heatmap, plot_options) diff --git a/tests/test_data_model.py b/tests/test_data_model.py index a88a211..61d6fce 100644 --- a/tests/test_data_model.py +++ b/tests/test_data_model.py @@ -168,6 +168,8 @@ def test_single_tree_example(self): nt.assert_array_equal(df.time, [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 2.0]) nt.assert_array_equal(df.num_mutations, [1, 1, 1, 1, 1, 1, 1]) nt.assert_array_equal(df.ancestors_span, [10, 10, 10, 10, 10, 10, -np.inf]) + nt.assert_array_equal(df.child_left, [0, 0, 0, 0, 0, 0, np.inf]) + nt.assert_array_equal(df.child_right, [10, 10, 10, 10, 10, 10, 0]) nt.assert_array_equal(df.is_sample, [1, 1, 1, 1, 0, 0, 0]) def test_multiple_tree_example(self): @@ -178,6 +180,8 @@ def test_multiple_tree_example(self): nt.assert_array_equal(df.time, [0.0, 0.0, 0.0, 1.0, 2.0]) nt.assert_array_equal(df.num_mutations, [0, 0, 0, 0, 0]) nt.assert_array_equal(df.ancestors_span, [10, 10, 10, 10, -np.inf]) + nt.assert_array_equal(df.child_left, [0, 0, 0, 0, np.inf]) + nt.assert_array_equal(df.child_right, [10, 10, 10, 10, 0]) nt.assert_array_equal(df.is_sample, [1, 1, 1, 0, 0]) From e86158c004d35b357c83201f5abfa9388f36493f Mon Sep 17 00:00:00 2001 From: Savita Karthikeyan Date: Sat, 2 Sep 2023 09:51:29 +0100 Subject: [PATCH 2/4] int slider removed, edge case fixed, bin count logged, actual node count on hover and plot titles added --- model.py | 48 ++++++++++++++++++++-------------------- pages/nodes.py | 59 +++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 74 insertions(+), 33 deletions(-) diff --git a/model.py b/model.py index 0f59d78..3741038 100644 --- a/model.py +++ b/model.py @@ -451,6 +451,8 @@ def nodes_df(self): "ancestors_span": child_right - child_left, "child_left": child_left, # FIXME add test for this "child_right": child_right, # FIXME add test for this + "child_left": child_left, # FIXME add test for this + "child_right": child_right, # FIXME add test for this "is_sample": is_sample, } ) @@ -589,7 +591,7 @@ def calc_mutations_per_tree(self): mutations_per_tree[unique_values] = counts return mutations_per_tree - def compute_ancestor_spans_heatmap_data(self, win_x_size=1_000_000, win_y_size=500): + def compute_ancestor_spans_heatmap_data(self, num_x_bins, num_y_bins): """ Calculates the average ancestor span in a genomic-time window """ @@ -598,38 +600,38 @@ def compute_ancestor_spans_heatmap_data(self, win_x_size=1_000_000, win_y_size=5 nodes_left = nodes_df.child_left nodes_right = nodes_df.child_right nodes_time = nodes_df.time - ancestors_span = nodes_df.ancestors_span - num_x_wins = int(np.ceil(nodes_right.max() - nodes_left.min()) / win_x_size) - num_y_wins = int(np.ceil(nodes_time.max() / win_y_size)) - heatmap_sums = np.zeros((num_x_wins, num_y_wins)) - heatmap_counts = np.zeros((num_x_wins, num_y_wins)) + x_bins = np.linspace(nodes_left.min(), nodes_right.max(), num_x_bins + 1) + y_bins = np.linspace(0, nodes_time.max(), num_y_bins + 1) + heatmap_counts = np.zeros((num_x_bins, num_y_bins)) - for u in range(len(nodes_left)): - x_start = int( - np.floor(nodes_left[u] / win_x_size) - ) # map the node span to the x-axis bins it overlaps - x_end = int(np.floor(nodes_right[u] / win_x_size)) - y = max(0, int(np.floor(nodes_time[u] / win_y_size)) - 1) - heatmap_sums[x_start:x_end, y] += min(ancestors_span[u], win_x_size) - heatmap_counts[x_start:x_end, y] += 1 - - avg_spans = heatmap_sums / heatmap_counts - indices = np.indices((num_x_wins, num_y_wins)) - x_coords = indices[0] * win_x_size - y_coords = indices[1] * win_y_size + x_starts = np.digitize(nodes_left, x_bins, right=True) + x_ends = np.digitize(nodes_right, x_bins, right=True) + y_starts = np.digitize(nodes_time, y_bins, right=True) + for u in range(len(nodes_left)): + x_start = max(0, x_starts[u] - 1) + x_end = max(0, x_ends[u] - 1) + y_bin = max(0, y_starts[u] - 1) + heatmap_counts[x_start : x_end + 1, y_bin] += 1 + + x_coords = np.repeat(x_bins[:-1], num_y_bins) + y_coords = np.tile(y_bins[:-1], num_x_bins) + overlapping_node_count = heatmap_counts.flatten() + overlapping_node_count[overlapping_node_count == 0] = 1 + # FIXME - better way to avoid log 0 above? df = pd.DataFrame( { - "genomic_position": x_coords.flatten(), + "position": x_coords.flatten(), "time": y_coords.flatten(), - "average_ancestor_span": avg_spans.flatten(), + "overlapping_node_count_log10": np.log10(overlapping_node_count), + "overlapping_node_count": overlapping_node_count, } ) return df.astype( { - "genomic_position": "int", + "position": "int", "time": "int", - "average_ancestor_span": "float64", + "overlapping_node_count": "int", } ) diff --git a/pages/nodes.py b/pages/nodes.py index e66fc6c..de714ac 100644 --- a/pages/nodes.py +++ b/pages/nodes.py @@ -3,6 +3,7 @@ import hvplot.pandas # noqa import numpy as np import panel as pn +from bokeh.models import HoverTool import config from plot_helpers import filter_points @@ -40,8 +41,15 @@ def make_node_hist_panel(tsm, log_y): points = df_nodes.hvplot.scatter( x="ancestors_span", y="time", - hover_cols=["ancestors_span", "time"], - ).opts(width=config.PLOT_WIDTH, height=config.PLOT_HEIGHT) + hover_cols=["ancestors_span", "time"], # add node ID + ).opts( + width=config.PLOT_WIDTH, + height=config.PLOT_HEIGHT, + title="Node span by time", + xlabel="width of genome spanned by node ancestors", + ylabel="node time", + axiswise=True, + ) range_stream = hv.streams.RangeXY(source=points) streams = [range_stream] @@ -54,16 +62,47 @@ def make_node_hist_panel(tsm, log_y): ) plot_options = pn.Column( - pn.pane.Markdown("# Plot Options"), log_y_checkbox, ) - anc_span_data = tsm.compute_ancestor_spans_heatmap_data() - heatmap = hv.HeatMap(anc_span_data).opts( - width=config.PLOT_WIDTH, - height=config.PLOT_HEIGHT, - tools=["hover"], - colorbar=True, + def make_heatmap(num_x_bins, num_y_bins): + anc_span_data = tsm.compute_ancestor_spans_heatmap_data(num_x_bins, num_y_bins) + tooltips = [ + ("position", "@position"), + ("time", "@time"), + ("overlapping_nodes", "@overlapping_node_count"), + ] + hover = HoverTool(tooltips=tooltips) + heatmap = hv.HeatMap(anc_span_data).opts( + width=config.PLOT_WIDTH, + height=config.PLOT_HEIGHT, + tools=[hover], + colorbar=True, + title="Average ancestor length in time and genome bins", + axiswise=True, + ) + return heatmap + + max_x_bins = int(np.sqrt(df_nodes.child_right.max())) + x_bin_input = pn.widgets.IntInput( + name="genome bins", + value=min(50, max_x_bins), + start=1, + end=max_x_bins, + ) + max_y_bins = int(np.sqrt(df_nodes.time.max())) + y_bin_input = pn.widgets.IntInput( + name="time bins", value=min(50, int(max_y_bins)), start=1, end=max_y_bins ) + hm_options = pn.Column(x_bin_input, y_bin_input) - return pn.Column(main, hist_panel, heatmap, plot_options) + hm_panel = pn.bind( + make_heatmap, + num_x_bins=x_bin_input, + num_y_bins=y_bin_input, + ) + + return pn.Column( + pn.Row(main, pn.Column(hist_panel, plot_options)), + pn.Column(hm_panel, hm_options), + ) From 47536db69a362b61ff4bfff9c43813b28e5a98d2 Mon Sep 17 00:00:00 2001 From: Savita Karthikeyan Date: Mon, 2 Oct 2023 11:04:01 +0100 Subject: [PATCH 3/4] naively handling case for heatmap where time units are uncalibrated. --- model.py | 94 +++++++++++++++++++++++++++++--------------------- pages/nodes.py | 3 +- 2 files changed, 56 insertions(+), 41 deletions(-) diff --git a/model.py b/model.py index 3741038..1a6daf2 100644 --- a/model.py +++ b/model.py @@ -595,43 +595,57 @@ def compute_ancestor_spans_heatmap_data(self, num_x_bins, num_y_bins): """ Calculates the average ancestor span in a genomic-time window """ - nodes_df = self.nodes_df[self.nodes_df.ancestors_span != -np.inf] - nodes_df = nodes_df.reset_index(drop=True) - nodes_left = nodes_df.child_left - nodes_right = nodes_df.child_right - nodes_time = nodes_df.time - - x_bins = np.linspace(nodes_left.min(), nodes_right.max(), num_x_bins + 1) - y_bins = np.linspace(0, nodes_time.max(), num_y_bins + 1) - heatmap_counts = np.zeros((num_x_bins, num_y_bins)) - - x_starts = np.digitize(nodes_left, x_bins, right=True) - x_ends = np.digitize(nodes_right, x_bins, right=True) - y_starts = np.digitize(nodes_time, y_bins, right=True) - - for u in range(len(nodes_left)): - x_start = max(0, x_starts[u] - 1) - x_end = max(0, x_ends[u] - 1) - y_bin = max(0, y_starts[u] - 1) - heatmap_counts[x_start : x_end + 1, y_bin] += 1 - - x_coords = np.repeat(x_bins[:-1], num_y_bins) - y_coords = np.tile(y_bins[:-1], num_x_bins) - overlapping_node_count = heatmap_counts.flatten() - overlapping_node_count[overlapping_node_count == 0] = 1 - # FIXME - better way to avoid log 0 above? - df = pd.DataFrame( - { - "position": x_coords.flatten(), - "time": y_coords.flatten(), - "overlapping_node_count_log10": np.log10(overlapping_node_count), - "overlapping_node_count": overlapping_node_count, - } - ) - return df.astype( - { - "position": "int", - "time": "int", - "overlapping_node_count": "int", - } - ) + if self.ts.time_units == tskit.TIME_UNITS_UNCALIBRATED: + logger.warning( + "Cannot compute ancestor spans for uncalibrated tree sequence" + ) + return pd.DataFrame( + { + "position": [], + "time": [], + "overlapping_node_count_log10": [], + "overlapping_node_count": [], + } + ) + else: + nodes_df = self.nodes_df[self.nodes_df.ancestors_span != -np.inf] + nodes_df = nodes_df.reset_index(drop=True) + nodes_left = nodes_df.child_left + nodes_right = nodes_df.child_right + nodes_time = nodes_df.time + + x_bins = np.linspace(nodes_left.min(), nodes_right.max(), num_x_bins + 1) + y_bins = np.linspace(0, nodes_time.max(), num_y_bins + 1) + heatmap_counts = np.zeros((num_x_bins, num_y_bins)) + + x_starts = np.digitize(nodes_left, x_bins, right=True) + x_ends = np.digitize(nodes_right, x_bins, right=True) + y_starts = np.digitize(nodes_time, y_bins, right=True) + + for u in range(len(nodes_left)): + x_start = max(0, x_starts[u] - 1) + x_end = max(0, x_ends[u] - 1) + y_bin = max(0, y_starts[u] - 1) + heatmap_counts[x_start : x_end + 1, y_bin] += 1 + + x_coords = np.repeat(x_bins[:-1], num_y_bins) + y_coords = np.tile(y_bins[:-1], num_x_bins) + overlapping_node_count = heatmap_counts.flatten() + overlapping_node_count[overlapping_node_count == 0] = 1 + # FIXME - better way to avoid log 0 above? + df = pd.DataFrame( + { + "position": x_coords.flatten(), + "time": y_coords.flatten(), + "overlapping_node_count_log10": np.log10(overlapping_node_count), + "overlapping_node_count": overlapping_node_count, + } + ) + return df.astype( + { + "position": "int", + "time": "int", + "overlapping_node_count_log10": "int", + "overlapping_node_count": "int", + } + ) diff --git a/pages/nodes.py b/pages/nodes.py index de714ac..8e37e63 100644 --- a/pages/nodes.py +++ b/pages/nodes.py @@ -103,6 +103,7 @@ def make_heatmap(num_x_bins, num_y_bins): ) return pn.Column( - pn.Row(main, pn.Column(hist_panel, plot_options)), + pn.Column(main), + pn.Column(hist_panel, plot_options), pn.Column(hm_panel, hm_options), ) From cb74ebc5dcedab93af6b57c59500d1496a8716f8 Mon Sep 17 00:00:00 2001 From: Savita Karthikeyan Date: Mon, 2 Oct 2023 11:25:10 +0100 Subject: [PATCH 4/4] incremented nodes cache version --- model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model.py b/model.py index 1a6daf2..08f1505 100644 --- a/model.py +++ b/model.py @@ -436,7 +436,7 @@ def edges_df(self): ) @cached_property - @disk_cache("v1") + @disk_cache("v2") def nodes_df(self): ts = self.ts child_left, child_right = self.child_bounds(