From fb483e28b8c98ebbd8fdd966b44939654841e391 Mon Sep 17 00:00:00 2001
From: Savita Karthikeyan <savitakartik@gmail.com>
Date: Fri, 25 Aug 2023 11:23:08 +0100
Subject: [PATCH 1/4] Added child_left, child_right columns to nodes_df and
 tests for these.

---
 model.py                 | 49 ++++++++++++++++++++++++++++++++++++++++
 pages/nodes.py           | 11 ++++++++-
 tests/test_data_model.py |  4 ++++
 3 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/model.py b/model.py
index b5b5a5f..0f59d78 100644
--- a/model.py
+++ b/model.py
@@ -449,6 +449,8 @@ def nodes_df(self):
                 "time": ts.nodes_time,
                 "num_mutations": self.nodes_num_mutations,
                 "ancestors_span": child_right - child_left,
+                "child_left": child_left,  # FIXME add test for this
+                "child_right": child_right,  # FIXME add test for this
                 "is_sample": is_sample,
             }
         )
@@ -458,6 +460,8 @@ def nodes_df(self):
                 "time": "float64",
                 "num_mutations": "int",
                 "ancestors_span": "float64",
+                "child_left": "float64",
+                "child_right": "float64",
                 "is_sample": "bool",
             }
         )
@@ -584,3 +588,48 @@ def calc_mutations_per_tree(self):
         mutations_per_tree = np.zeros(self.ts.num_trees, dtype=np.int64)
         mutations_per_tree[unique_values] = counts
         return mutations_per_tree
+
+    def compute_ancestor_spans_heatmap_data(self, win_x_size=1_000_000, win_y_size=500):
+        """
+        Calculates the average ancestor span in a genomic-time window
+        """
+        nodes_df = self.nodes_df[self.nodes_df.ancestors_span != -np.inf]
+        nodes_df = nodes_df.reset_index(drop=True)
+        nodes_left = nodes_df.child_left
+        nodes_right = nodes_df.child_right
+        nodes_time = nodes_df.time
+        ancestors_span = nodes_df.ancestors_span
+
+        num_x_wins = int(np.ceil(nodes_right.max() - nodes_left.min()) / win_x_size)
+        num_y_wins = int(np.ceil(nodes_time.max() / win_y_size))
+        heatmap_sums = np.zeros((num_x_wins, num_y_wins))
+        heatmap_counts = np.zeros((num_x_wins, num_y_wins))
+
+        for u in range(len(nodes_left)):
+            x_start = int(
+                np.floor(nodes_left[u] / win_x_size)
+            )  # map the node span to the x-axis bins it overlaps
+            x_end = int(np.floor(nodes_right[u] / win_x_size))
+            y = max(0, int(np.floor(nodes_time[u] / win_y_size)) - 1)
+            heatmap_sums[x_start:x_end, y] += min(ancestors_span[u], win_x_size)
+            heatmap_counts[x_start:x_end, y] += 1
+
+        avg_spans = heatmap_sums / heatmap_counts
+        indices = np.indices((num_x_wins, num_y_wins))
+        x_coords = indices[0] * win_x_size
+        y_coords = indices[1] * win_y_size
+
+        df = pd.DataFrame(
+            {
+                "genomic_position": x_coords.flatten(),
+                "time": y_coords.flatten(),
+                "average_ancestor_span": avg_spans.flatten(),
+            }
+        )
+        return df.astype(
+            {
+                "genomic_position": "int",
+                "time": "int",
+                "average_ancestor_span": "float64",
+            }
+        )
diff --git a/pages/nodes.py b/pages/nodes.py
index 81954cd..e66fc6c 100644
--- a/pages/nodes.py
+++ b/pages/nodes.py
@@ -57,4 +57,13 @@ def make_node_hist_panel(tsm, log_y):
         pn.pane.Markdown("# Plot Options"),
         log_y_checkbox,
     )
-    return pn.Column(main, hist_panel, plot_options)
+
+    anc_span_data = tsm.compute_ancestor_spans_heatmap_data()
+    heatmap = hv.HeatMap(anc_span_data).opts(
+        width=config.PLOT_WIDTH,
+        height=config.PLOT_HEIGHT,
+        tools=["hover"],
+        colorbar=True,
+    )
+
+    return pn.Column(main, hist_panel, heatmap, plot_options)
diff --git a/tests/test_data_model.py b/tests/test_data_model.py
index a88a211..61d6fce 100644
--- a/tests/test_data_model.py
+++ b/tests/test_data_model.py
@@ -168,6 +168,8 @@ def test_single_tree_example(self):
         nt.assert_array_equal(df.time, [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 2.0])
         nt.assert_array_equal(df.num_mutations, [1, 1, 1, 1, 1, 1, 1])
         nt.assert_array_equal(df.ancestors_span, [10, 10, 10, 10, 10, 10, -np.inf])
+        nt.assert_array_equal(df.child_left, [0, 0, 0, 0, 0, 0, np.inf])
+        nt.assert_array_equal(df.child_right, [10, 10, 10, 10, 10, 10, 0])
         nt.assert_array_equal(df.is_sample, [1, 1, 1, 1, 0, 0, 0])
 
     def test_multiple_tree_example(self):
@@ -178,6 +180,8 @@ def test_multiple_tree_example(self):
         nt.assert_array_equal(df.time, [0.0, 0.0, 0.0, 1.0, 2.0])
         nt.assert_array_equal(df.num_mutations, [0, 0, 0, 0, 0])
         nt.assert_array_equal(df.ancestors_span, [10, 10, 10, 10, -np.inf])
+        nt.assert_array_equal(df.child_left, [0, 0, 0, 0, np.inf])
+        nt.assert_array_equal(df.child_right, [10, 10, 10, 10, 0])
         nt.assert_array_equal(df.is_sample, [1, 1, 1, 0, 0])
 
 

From e86158c004d35b357c83201f5abfa9388f36493f Mon Sep 17 00:00:00 2001
From: Savita Karthikeyan <savitakartik@gmail.com>
Date: Sat, 2 Sep 2023 09:51:29 +0100
Subject: [PATCH 2/4] int slider removed, edge case fixed, bin count logged,
 actual node count on hover and plot titles added

---
 model.py       | 48 ++++++++++++++++++++--------------------
 pages/nodes.py | 59 +++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 74 insertions(+), 33 deletions(-)

diff --git a/model.py b/model.py
index 0f59d78..3741038 100644
--- a/model.py
+++ b/model.py
@@ -451,6 +451,8 @@ def nodes_df(self):
                 "ancestors_span": child_right - child_left,
                 "child_left": child_left,  # FIXME add test for this
                 "child_right": child_right,  # FIXME add test for this
+                "child_left": child_left,  # FIXME add test for this
+                "child_right": child_right,  # FIXME add test for this
                 "is_sample": is_sample,
             }
         )
@@ -589,7 +591,7 @@ def calc_mutations_per_tree(self):
         mutations_per_tree[unique_values] = counts
         return mutations_per_tree
 
-    def compute_ancestor_spans_heatmap_data(self, win_x_size=1_000_000, win_y_size=500):
+    def compute_ancestor_spans_heatmap_data(self, num_x_bins, num_y_bins):
         """
         Calculates the average ancestor span in a genomic-time window
         """
@@ -598,38 +600,38 @@ def compute_ancestor_spans_heatmap_data(self, win_x_size=1_000_000, win_y_size=5
         nodes_left = nodes_df.child_left
         nodes_right = nodes_df.child_right
         nodes_time = nodes_df.time
-        ancestors_span = nodes_df.ancestors_span
 
-        num_x_wins = int(np.ceil(nodes_right.max() - nodes_left.min()) / win_x_size)
-        num_y_wins = int(np.ceil(nodes_time.max() / win_y_size))
-        heatmap_sums = np.zeros((num_x_wins, num_y_wins))
-        heatmap_counts = np.zeros((num_x_wins, num_y_wins))
+        x_bins = np.linspace(nodes_left.min(), nodes_right.max(), num_x_bins + 1)
+        y_bins = np.linspace(0, nodes_time.max(), num_y_bins + 1)
+        heatmap_counts = np.zeros((num_x_bins, num_y_bins))
 
-        for u in range(len(nodes_left)):
-            x_start = int(
-                np.floor(nodes_left[u] / win_x_size)
-            )  # map the node span to the x-axis bins it overlaps
-            x_end = int(np.floor(nodes_right[u] / win_x_size))
-            y = max(0, int(np.floor(nodes_time[u] / win_y_size)) - 1)
-            heatmap_sums[x_start:x_end, y] += min(ancestors_span[u], win_x_size)
-            heatmap_counts[x_start:x_end, y] += 1
-
-        avg_spans = heatmap_sums / heatmap_counts
-        indices = np.indices((num_x_wins, num_y_wins))
-        x_coords = indices[0] * win_x_size
-        y_coords = indices[1] * win_y_size
+        x_starts = np.digitize(nodes_left, x_bins, right=True)
+        x_ends = np.digitize(nodes_right, x_bins, right=True)
+        y_starts = np.digitize(nodes_time, y_bins, right=True)
 
+        for u in range(len(nodes_left)):
+            x_start = max(0, x_starts[u] - 1)
+            x_end = max(0, x_ends[u] - 1)
+            y_bin = max(0, y_starts[u] - 1)
+            heatmap_counts[x_start : x_end + 1, y_bin] += 1
+
+        x_coords = np.repeat(x_bins[:-1], num_y_bins)
+        y_coords = np.tile(y_bins[:-1], num_x_bins)
+        overlapping_node_count = heatmap_counts.flatten()
+        overlapping_node_count[overlapping_node_count == 0] = 1
+        # FIXME - better way to avoid log 0 above?
         df = pd.DataFrame(
             {
-                "genomic_position": x_coords.flatten(),
+                "position": x_coords.flatten(),
                 "time": y_coords.flatten(),
-                "average_ancestor_span": avg_spans.flatten(),
+                "overlapping_node_count_log10": np.log10(overlapping_node_count),
+                "overlapping_node_count": overlapping_node_count,
             }
         )
         return df.astype(
             {
-                "genomic_position": "int",
+                "position": "int",
                 "time": "int",
-                "average_ancestor_span": "float64",
+                "overlapping_node_count": "int",
             }
         )
diff --git a/pages/nodes.py b/pages/nodes.py
index e66fc6c..de714ac 100644
--- a/pages/nodes.py
+++ b/pages/nodes.py
@@ -3,6 +3,7 @@
 import hvplot.pandas  # noqa
 import numpy as np
 import panel as pn
+from bokeh.models import HoverTool
 
 import config
 from plot_helpers import filter_points
@@ -40,8 +41,15 @@ def make_node_hist_panel(tsm, log_y):
     points = df_nodes.hvplot.scatter(
         x="ancestors_span",
         y="time",
-        hover_cols=["ancestors_span", "time"],
-    ).opts(width=config.PLOT_WIDTH, height=config.PLOT_HEIGHT)
+        hover_cols=["ancestors_span", "time"],  # add node ID
+    ).opts(
+        width=config.PLOT_WIDTH,
+        height=config.PLOT_HEIGHT,
+        title="Node span by time",
+        xlabel="width of genome spanned by node ancestors",
+        ylabel="node time",
+        axiswise=True,
+    )
 
     range_stream = hv.streams.RangeXY(source=points)
     streams = [range_stream]
@@ -54,16 +62,47 @@ def make_node_hist_panel(tsm, log_y):
     )
 
     plot_options = pn.Column(
-        pn.pane.Markdown("# Plot Options"),
         log_y_checkbox,
     )
 
-    anc_span_data = tsm.compute_ancestor_spans_heatmap_data()
-    heatmap = hv.HeatMap(anc_span_data).opts(
-        width=config.PLOT_WIDTH,
-        height=config.PLOT_HEIGHT,
-        tools=["hover"],
-        colorbar=True,
+    def make_heatmap(num_x_bins, num_y_bins):
+        anc_span_data = tsm.compute_ancestor_spans_heatmap_data(num_x_bins, num_y_bins)
+        tooltips = [
+            ("position", "@position"),
+            ("time", "@time"),
+            ("overlapping_nodes", "@overlapping_node_count"),
+        ]
+        hover = HoverTool(tooltips=tooltips)
+        heatmap = hv.HeatMap(anc_span_data).opts(
+            width=config.PLOT_WIDTH,
+            height=config.PLOT_HEIGHT,
+            tools=[hover],
+            colorbar=True,
+            title="Average ancestor length in time and genome bins",
+            axiswise=True,
+        )
+        return heatmap
+
+    max_x_bins = int(np.sqrt(df_nodes.child_right.max()))
+    x_bin_input = pn.widgets.IntInput(
+        name="genome bins",
+        value=min(50, max_x_bins),
+        start=1,
+        end=max_x_bins,
+    )
+    max_y_bins = int(np.sqrt(df_nodes.time.max()))
+    y_bin_input = pn.widgets.IntInput(
+        name="time bins", value=min(50, int(max_y_bins)), start=1, end=max_y_bins
     )
+    hm_options = pn.Column(x_bin_input, y_bin_input)
 
-    return pn.Column(main, hist_panel, heatmap, plot_options)
+    hm_panel = pn.bind(
+        make_heatmap,
+        num_x_bins=x_bin_input,
+        num_y_bins=y_bin_input,
+    )
+
+    return pn.Column(
+        pn.Row(main, pn.Column(hist_panel, plot_options)),
+        pn.Column(hm_panel, hm_options),
+    )

From 47536db69a362b61ff4bfff9c43813b28e5a98d2 Mon Sep 17 00:00:00 2001
From: Savita Karthikeyan <savitakartik@gmail.com>
Date: Mon, 2 Oct 2023 11:04:01 +0100
Subject: [PATCH 3/4] naively handling case for heatmap where time units are
 uncalibrated.

---
 model.py       | 94 +++++++++++++++++++++++++++++---------------------
 pages/nodes.py |  3 +-
 2 files changed, 56 insertions(+), 41 deletions(-)

diff --git a/model.py b/model.py
index 3741038..1a6daf2 100644
--- a/model.py
+++ b/model.py
@@ -595,43 +595,57 @@ def compute_ancestor_spans_heatmap_data(self, num_x_bins, num_y_bins):
         """
         Calculates the average ancestor span in a genomic-time window
         """
-        nodes_df = self.nodes_df[self.nodes_df.ancestors_span != -np.inf]
-        nodes_df = nodes_df.reset_index(drop=True)
-        nodes_left = nodes_df.child_left
-        nodes_right = nodes_df.child_right
-        nodes_time = nodes_df.time
-
-        x_bins = np.linspace(nodes_left.min(), nodes_right.max(), num_x_bins + 1)
-        y_bins = np.linspace(0, nodes_time.max(), num_y_bins + 1)
-        heatmap_counts = np.zeros((num_x_bins, num_y_bins))
-
-        x_starts = np.digitize(nodes_left, x_bins, right=True)
-        x_ends = np.digitize(nodes_right, x_bins, right=True)
-        y_starts = np.digitize(nodes_time, y_bins, right=True)
-
-        for u in range(len(nodes_left)):
-            x_start = max(0, x_starts[u] - 1)
-            x_end = max(0, x_ends[u] - 1)
-            y_bin = max(0, y_starts[u] - 1)
-            heatmap_counts[x_start : x_end + 1, y_bin] += 1
-
-        x_coords = np.repeat(x_bins[:-1], num_y_bins)
-        y_coords = np.tile(y_bins[:-1], num_x_bins)
-        overlapping_node_count = heatmap_counts.flatten()
-        overlapping_node_count[overlapping_node_count == 0] = 1
-        # FIXME - better way to avoid log 0 above?
-        df = pd.DataFrame(
-            {
-                "position": x_coords.flatten(),
-                "time": y_coords.flatten(),
-                "overlapping_node_count_log10": np.log10(overlapping_node_count),
-                "overlapping_node_count": overlapping_node_count,
-            }
-        )
-        return df.astype(
-            {
-                "position": "int",
-                "time": "int",
-                "overlapping_node_count": "int",
-            }
-        )
+        if self.ts.time_units == tskit.TIME_UNITS_UNCALIBRATED:
+            logger.warning(
+                "Cannot compute ancestor spans for uncalibrated tree sequence"
+            )
+            return pd.DataFrame(
+                {
+                    "position": [],
+                    "time": [],
+                    "overlapping_node_count_log10": [],
+                    "overlapping_node_count": [],
+                }
+            )
+        else:
+            nodes_df = self.nodes_df[self.nodes_df.ancestors_span != -np.inf]
+            nodes_df = nodes_df.reset_index(drop=True)
+            nodes_left = nodes_df.child_left
+            nodes_right = nodes_df.child_right
+            nodes_time = nodes_df.time
+
+            x_bins = np.linspace(nodes_left.min(), nodes_right.max(), num_x_bins + 1)
+            y_bins = np.linspace(0, nodes_time.max(), num_y_bins + 1)
+            heatmap_counts = np.zeros((num_x_bins, num_y_bins))
+
+            x_starts = np.digitize(nodes_left, x_bins, right=True)
+            x_ends = np.digitize(nodes_right, x_bins, right=True)
+            y_starts = np.digitize(nodes_time, y_bins, right=True)
+
+            for u in range(len(nodes_left)):
+                x_start = max(0, x_starts[u] - 1)
+                x_end = max(0, x_ends[u] - 1)
+                y_bin = max(0, y_starts[u] - 1)
+                heatmap_counts[x_start : x_end + 1, y_bin] += 1
+
+            x_coords = np.repeat(x_bins[:-1], num_y_bins)
+            y_coords = np.tile(y_bins[:-1], num_x_bins)
+            overlapping_node_count = heatmap_counts.flatten()
+            overlapping_node_count[overlapping_node_count == 0] = 1
+            # FIXME - better way to avoid log 0 above?
+            df = pd.DataFrame(
+                {
+                    "position": x_coords.flatten(),
+                    "time": y_coords.flatten(),
+                    "overlapping_node_count_log10": np.log10(overlapping_node_count),
+                    "overlapping_node_count": overlapping_node_count,
+                }
+            )
+            return df.astype(
+                {
+                    "position": "int",
+                    "time": "int",
+                    "overlapping_node_count_log10": "int",
+                    "overlapping_node_count": "int",
+                }
+            )
diff --git a/pages/nodes.py b/pages/nodes.py
index de714ac..8e37e63 100644
--- a/pages/nodes.py
+++ b/pages/nodes.py
@@ -103,6 +103,7 @@ def make_heatmap(num_x_bins, num_y_bins):
     )
 
     return pn.Column(
-        pn.Row(main, pn.Column(hist_panel, plot_options)),
+        pn.Column(main),
+        pn.Column(hist_panel, plot_options),
         pn.Column(hm_panel, hm_options),
     )

From cb74ebc5dcedab93af6b57c59500d1496a8716f8 Mon Sep 17 00:00:00 2001
From: Savita Karthikeyan <savitakartik@gmail.com>
Date: Mon, 2 Oct 2023 11:25:10 +0100
Subject: [PATCH 4/4] incremented nodes cache version

---
 model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/model.py b/model.py
index 1a6daf2..08f1505 100644
--- a/model.py
+++ b/model.py
@@ -436,7 +436,7 @@ def edges_df(self):
         )
 
     @cached_property
-    @disk_cache("v1")
+    @disk_cache("v2")
     def nodes_df(self):
         ts = self.ts
         child_left, child_right = self.child_bounds(