From d5c3f905b68c68ef56d3ca28c1c533f7f40624c8 Mon Sep 17 00:00:00 2001
From: Felipe <87530733+FelipePCarcanholo@users.noreply.github.com>
Date: Thu, 12 Dec 2024 09:37:27 -0600
Subject: [PATCH 1/4] Vis-work.py

Add options to the run_and_plot function: select how many genes are labeled in the counts-per-cell plot (ntop_genes); choose whether or not to filter the genes shown in the ratio box plot (filtred=True); and choose the size of the dots in the spatial plot (dot_size), whose default is a rough guess at the optimal dot size.
---
 spatial_compare/spatial_compare.py | 195 ++++++++++++++---------------
 1 file changed, 92 insertions(+), 103 deletions(-)

diff --git a/spatial_compare/spatial_compare.py b/spatial_compare/spatial_compare.py
index da1ad34..526eb0e 100644
--- a/spatial_compare/spatial_compare.py
+++ b/spatial_compare/spatial_compare.py
@@ -58,14 +58,17 @@ class SpatialCompare:
     -------
     set_category(category)
         Set the category to compare.
-    spatial_plot(plot_legend=True, min_cells_to_plot=10, decimate_for_spatial_plot=1, figsize=[20,10], category_values=[])
+    spatial_plot(plot_legend=True, min_cells_to_plot=10, decimate_for_spatial_plot=1, figsize=[20,10], category_values=[], dot_size=3)
         Plot the spatial data for the two datasets.
     de_novo_cluster(plot_stuff=False, correspondence_level="leiden_1",rerun_preprocessing=False)
         Perform de novo clustering on the two datasets.
     find_matched_groups(n_top_groups=100, n_shared_groups=30, min_n_cells=100, category_values=[], exclude_group_string="zzzzzzzzzzzzzzz", plot_stuff=False, figsize=[10,10])
         Find matched groups between the two datasets.
-    compare_expression(category_values=[], plot_stuff=False, min_mean_expression=.2, min_genes_to_compare=5, min_cells=10)
+    compare_expression(category_values=[], plot_stuff=False, min_mean_expression=.2, min_genes_to_compare=5, min_cells=10, ntop_genes=10)
         Compare gene expression between the two datasets.
+    
+    run_and_plot(category_values = d1d2_cells, min_mean_expression=.2, ntop_genes=5, filtred=True, dot_size=) 
+        Run all the plots, can select the genes to appear the label (ntop_genes), choose to filter 25 bottom, middle and top genes in the boxplot (filtred=True). Can choose the size of dots of spatial plot (dot_size=(3*18231)/(self.ad_0.n_obs)).    
 
     """
 
@@ -178,8 +181,9 @@ def spatial_plot(
         decimate_for_spatial_plot=1,
         figsize=[20, 10],
         category_values=[],
+        dot_size=None,  # Add a parameter for dot size
     ):
-
+        
         plt.figure(figsize=figsize)
         all_category_values = set(self.ad_0.obs[self.category].unique()) | set(
             self.ad_1.obs[self.category].unique()
@@ -204,7 +208,7 @@ def spatial_plot(
                 ],
                 ".",
                 label=label,
-                markersize=0.5,
+                markersize=dot_size,  # Use the dot_size parameter
             )
             plt.axis("equal")
             if plot_legend:
@@ -224,7 +228,7 @@ def spatial_plot(
                 ],
                 ".",
                 label=label,
-                markersize=0.5,
+                markersize=dot_size,  # Use the dot_size parameter
             )
             plt.axis("equal")
             if plot_legend:
@@ -369,7 +373,7 @@ def find_matched_groups(
             "n0": in_top_N_0,
             "n1": in_top_N_1,
         }
-
+    
     def compare_expression(
         self,
         category_values=[],
@@ -377,89 +381,61 @@ def compare_expression(
         min_mean_expression=0.2,
         min_genes_to_compare=5,
         min_cells=10,
+        ntop_genes=10,
     ):
-        # group cells
+        # Group cells
         if len(category_values) == 0:
-            raise ValueError(
-                "please supply a list of values for the category " + self.category
-            )
+            raise ValueError("please supply a list of values for the category " + self.category)
 
         category_records = []
         gene_ratio_dfs = {}
+    
         for category_value in category_values:
             group_mask_0 = self.ad_0.obs[self.category] == category_value
             group_mask_1 = self.ad_1.obs[self.category] == category_value
 
             if np.sum(group_mask_0) < min_cells or np.sum(group_mask_1) < min_cells:
-                print(
-                    "at least 1 input has less than "
-                    + str(min_cells)
-                    + " cells in "
-                    + self.category
-                    + " == "
-                    + category_value
-                )
+                print("at least 1 input has less than " + str(min_cells) + " cells in " + self.category + " == " + category_value)
                 continue
 
-            means_0 = np.array(
-                np.mean(
-                    self.ad_0[
-                        group_mask_0, self.ad_0.var.index.isin(self.shared_genes)
-                    ].X,
-                    axis=0,
-                )
-            ).flatten()
-            means_1 = np.array(
-                np.mean(
-                    self.ad_1[
-                        group_mask_1, self.ad_1.var.index.isin(self.shared_genes)
-                    ].X,
-                    axis=0,
-                )
-            ).flatten()
+            means_0 = np.array(np.mean(self.ad_0[group_mask_0, self.ad_0.var.index.isin(self.shared_genes)].X, axis=0)).flatten()
+            means_1 = np.array(np.mean(self.ad_1[group_mask_1, self.ad_1.var.index.isin(self.shared_genes)].X, axis=0)).flatten()
+
+            # Filter genes above minimum mean expression
             means_0_gt_min = np.nonzero(means_0 > min_mean_expression)[0]
             means_1_gt_min = np.nonzero(means_1 > min_mean_expression)[0]
-            above_means0 = self.ad_0.var[
-                self.ad_0.var.index.isin(self.shared_genes)
-            ].iloc[means_0_gt_min]
-            above_means1 = self.ad_1.var[
-                self.ad_1.var.index.isin(self.shared_genes)
-            ].iloc[means_1_gt_min]
-            shared_above_mean = [
-                g for g in above_means1.index if g in above_means0.index
-            ]
+        
+            above_means0 = self.ad_0.var[self.ad_0.var.index.isin(self.shared_genes)].iloc[means_0_gt_min]
+            above_means1 = self.ad_1.var[self.ad_1.var.index.isin(self.shared_genes)].iloc[means_1_gt_min]
+        
+            shared_above_mean = [g for g in above_means1.index if g in above_means0.index]
+        
             if len(shared_above_mean) < min_genes_to_compare:
-                print(
-                    self.category
-                    + " "
-                    + category_value
-                    + " has less than "
-                    + str(min_genes_to_compare)
-                    + "\n shared genes above minimum mean = "
-                    + str(min_mean_expression)
-                )
+                print(self.category + " " + category_value + " has less than " + str(min_genes_to_compare) + "\n shared genes above minimum mean = " + str(min_mean_expression))
                 continue
 
-            means_0 = np.array(
-                np.mean(self.ad_0[group_mask_0, shared_above_mean].X, axis=0)
-            ).flatten()
-            means_1 = np.array(
-                np.mean(self.ad_1[group_mask_1, shared_above_mean].X, axis=0)
-            ).flatten()
+            # Calculate means again after filtering
+            means_0 = np.array(np.mean(self.ad_0[group_mask_0, shared_above_mean].X, axis=0)).flatten()
+            means_1 = np.array(np.mean(self.ad_1[group_mask_1, shared_above_mean].X, axis=0)).flatten()
+        
+            # Calculate average counts for selecting top genes
+            average_counts = (means_0 + means_1) / 2
+        
+            # Get indices of the top 20 genes based on average counts for this subclass
+            top_indices = np.argsort(average_counts)[-ntop_genes:]  # Get indices of top 10 genes
+
             shared_genes = shared_above_mean
+        
             p_coef = np.polynomial.Polynomial.fit(means_0, means_1, 1).convert().coef
-            category_records.append(
-                {
-                    self.category: category_value,
-                    "slope": p_coef[1],
-                    "mean_ratio": np.mean(means_1 / means_0),
-                    "correlation": np.corrcoef(means_0, means_1)[0][1],
-                    "n_cells_0": np.sum(group_mask_0),
-                    "n_cells_1": np.sum(group_mask_1),
-                    "total_count_ratio": np.sum(self.ad_1[group_mask_1, shared_genes].X)
-                    / np.sum(self.ad_0[group_mask_0, shared_genes].X),
-                }
-            )
+            category_records.append({
+                self.category: category_value,
+                "slope": p_coef[1],
+                "mean_ratio": np.mean(means_1 / means_0),
+                "correlation": np.corrcoef(means_0, means_1)[0][1],
+                "n_cells_0": np.sum(group_mask_0),
+                "n_cells_1": np.sum(group_mask_1),
+                "total_count_ratio": np.sum(self.ad_1[group_mask_1, shared_genes].X) / np.sum(self.ad_0[group_mask_0, shared_genes].X),
+            })
 
             gene_ratio_dfs[category_value] = pd.DataFrame(
                 means_1 / means_0,
@@ -470,63 +446,56 @@ def compare_expression(
             if plot_stuff:
                 plt.figure(figsize=[10, 10])
                 plt.title(
-                    self.category
-                    + ": "
-                    + category_value
-                    + "\nmean counts per cell\ncorrelation: "
-                    + str(category_records[-1]["correlation"])[:4]
-                    + " mean ratio: "
-                    + str(category_records[-1]["mean_ratio"])[:4]
+                    self.category + ": " + category_value +
+                    "\nmean counts per cell\ncorrelation: " +
+                    str(category_records[-1]["correlation"])[:4] +
+                    " mean ratio: " + str(category_records[-1]["mean_ratio"])[:4]
                 )
+            
                 low_expression = np.logical_and(means_0 < 1.0, means_1 < 1.0)
-                plt.loglog(
-                    means_0[low_expression],
-                    means_1[low_expression],
-                    ".",
-                    color=[0.5, 0.5, 0.5],
-                )
-                plt.loglog(
-                    means_0[np.logical_not(low_expression)],
-                    means_1[np.logical_not(low_expression)],
-                    ".",
-                )
+                plt.loglog(means_0[low_expression], means_1[low_expression], ".", color=[0.5, 0.5, 0.5])
+                plt.loglog(means_0[np.logical_not(low_expression)], means_1[np.logical_not(low_expression)], ".")
 
                 plt.xlabel(self.data_names[0] + ", N = " + str(np.sum(group_mask_0)))
                 plt.ylabel(self.data_names[1] + ", N = " + str(np.sum(group_mask_1)))
 
-                for g in shared_genes:
-                    if (
-                        means_0[np.nonzero(np.array(shared_genes) == g)] == 0
-                        or means_1[np.nonzero(np.array(shared_genes) == g)] == 0
-                    ) or low_expression[np.array(shared_genes) == g]:
+                # Add labels only for the top 20 genes based on average counts for this subclass
+                for idx in top_indices:
+                    g = shared_genes[idx] if idx < len(shared_genes) else None
+                
+                    if g is None or (means_0[idx] == 0 or means_1[idx] == 0 or low_expression[idx]):
                         continue
+                
                     plt.text(
-                        means_0[np.nonzero(np.array(shared_genes) == g)],
-                        means_1[np.nonzero(np.array(shared_genes) == g)],
+                        means_0[idx],
+                        means_1[idx],
                         g,
                         fontsize=10,
                     )
+                
                 plt.plot(
                     [np.min(means_0), np.max(means_0)],
                     [np.min(means_0), np.max(means_0)],
                     "--",
                 )
+
         print(gene_ratio_dfs.keys())
         if len(gene_ratio_dfs.keys()) > 0:
-
             gene_ratio_df = pd.concat(gene_ratio_dfs, axis=1)
         else:
             gene_ratio_df = None
+        
         return {
             "data_names": self.data_names,
             "category_results": pd.DataFrame.from_records(category_records),
             "gene_ratio_dataframe": gene_ratio_df,
         }
 
-    def plot_detection_ratio(self, gene_ratio_dataframe, figsize=[15, 15]):
+
+    def plot_detection_ratio(self, gene_ratio_dataframe, figsize=[15, 15], filtred=True):
 
         detection_ratio_plots(
-            gene_ratio_dataframe, data_names=self.data_names, figsize=figsize
+            gene_ratio_dataframe, data_names=self.data_names, figsize=figsize, filtred=filtred,
         )
 
     def spatial_compare(self, **kwargs):
@@ -565,12 +534,16 @@ def spatial_compare(self, **kwargs):
     def run_and_plot(self, **kwargs):
         if "category" in kwargs.keys():
             self.set_category(kwargs["category"])
+        dot_size = kwargs.get('dot_size', (3*18231)/(self.ad_0.n_obs))
+        ntop_genes = kwargs.get('ntop_genes', 10)
+        filtred = kwargs.get('filtred', True)
+
 
-        self.spatial_plot()
+        self.spatial_plot(dot_size=dot_size)
         self.spatial_compare_results = self.spatial_compare(plot_stuff=True, **kwargs)
         self.plot_detection_ratio(
             self.spatial_compare_results["expression_results"]["gene_ratio_dataframe"],
-            figsize=[30, 20],
+            figsize=[30, 20], filtred=filtred,
         )
         return True
 
@@ -926,17 +899,29 @@ def filter_and_cluster_twice(
 
 
 def detection_ratio_plots(
-    gene_ratio_df, data_names=DEFAULT_DATA_NAMES, figsize=[15, 15]
+    gene_ratio_df, data_names=DEFAULT_DATA_NAMES, figsize=[15, 15], filtred=True,
 ):
 
     sorted_genes = [
         str(s) for s in gene_ratio_df.mean(axis=1).sort_values().index.values
     ]
+    # Select top 25, bottom 25 and middle 25
+    top_25 = sorted_genes[-25:]  # Top 25 highest
+    bottom_25 = sorted_genes[:25]  # Bottom 25 lowest
+    middle_index = len(sorted_genes) // 2
+    middle_25 = sorted_genes[middle_index - 12:middle_index + 13]  # Middle 25
+
+    # Combine selected ratios for plotting
+    selected_ratios = bottom_25 + middle_25 + top_25
+    if filtred:
+        genes_boxplot = selected_ratios
+    else:
+        genes_boxplot = sorted_genes
 
     plt.figure(figsize=figsize)
     plt.subplot(3, 1, 1)
     p = sns.boxplot(
-        gene_ratio_df.loc[sorted_genes, :].T,
+        gene_ratio_df.loc[genes_boxplot, :].T,
     )
     p.set_yscale("log")
     p.set_xlabel("gene", fontsize=20)
@@ -944,7 +929,11 @@ def detection_ratio_plots(
         "detection ratio\n" + data_names[1] + " / " + data_names[0], fontsize=20
     )
     ax = plt.gca()
-    ax.tick_params(axis="x", labelrotation=45, labelsize=10)
+    if filtred:
+        ax.tick_params(axis="x", labelrotation=45, labelsize=18)
+    else:
+        ax.tick_params(axis="x", labelrotation=45, labelsize=10)
+
     ax.tick_params(axis="y", labelsize=20, which="major")
     ax.tick_params(axis="y", labelsize=10, which="minor")
 

From 2b8595143110d83b4fe98b944a44de744c10ca4a Mon Sep 17 00:00:00 2001
From: Brian Long <berl@users.noreply.github.com>
Date: Thu, 12 Dec 2024 13:44:49 -0800
Subject: [PATCH 2/4] run black

---
 spatial_compare/spatial_compare.py | 175 ++++++++++++++++++++---------
 1 file changed, 121 insertions(+), 54 deletions(-)

diff --git a/spatial_compare/spatial_compare.py b/spatial_compare/spatial_compare.py
index 526eb0e..6598ba7 100644
--- a/spatial_compare/spatial_compare.py
+++ b/spatial_compare/spatial_compare.py
@@ -66,9 +66,9 @@ class SpatialCompare:
         Find matched groups between the two datasets.
     compare_expression(category_values=[], plot_stuff=False, min_mean_expression=.2, min_genes_to_compare=5, min_cells=10, ntop_genes=10)
         Compare gene expression between the two datasets.
-    
-    run_and_plot(category_values = d1d2_cells, min_mean_expression=.2, ntop_genes=5, filtred=True, dot_size=) 
-        Run all the plots, can select the genes to appear the label (ntop_genes), choose to filter 25 bottom, middle and top genes in the boxplot (filtred=True). Can choose the size of dots of spatial plot (dot_size=(3*18231)/(self.ad_0.n_obs)).    
+
+    run_and_plot(category_values = d1d2_cells, min_mean_expression=.2, ntop_genes=5, filtred=True, dot_size=)
+        Run all the plots, can select the genes to appear the label (ntop_genes), choose to filter 25 bottom, middle and top genes in the boxplot (filtred=True). Can choose the size of dots of spatial plot (dot_size=(3*18231)/(self.ad_0.n_obs)).
 
     """
 
@@ -183,7 +183,7 @@ def spatial_plot(
         category_values=[],
         dot_size=None,  # Add a parameter for dot size
     ):
-        
+
         plt.figure(figsize=figsize)
         all_category_values = set(self.ad_0.obs[self.category].unique()) | set(
             self.ad_1.obs[self.category].unique()
@@ -373,7 +373,7 @@ def find_matched_groups(
             "n0": in_top_N_0,
             "n1": in_top_N_1,
         }
-    
+
     def compare_expression(
         self,
         category_values=[],
@@ -385,57 +385,103 @@ def compare_expression(
     ):
         # Group cells
         if len(category_values) == 0:
-            raise ValueError("please supply a list of values for the category " + self.category)
+            raise ValueError(
+                "please supply a list of values for the category " + self.category
+            )
 
         category_records = []
         gene_ratio_dfs = {}
-    
+
         for category_value in category_values:
             group_mask_0 = self.ad_0.obs[self.category] == category_value
             group_mask_1 = self.ad_1.obs[self.category] == category_value
 
             if np.sum(group_mask_0) < min_cells or np.sum(group_mask_1) < min_cells:
-                print("at least 1 input has less than " + str(min_cells) + " cells in " + self.category + " == " + category_value)
+                print(
+                    "at least 1 input has less than "
+                    + str(min_cells)
+                    + " cells in "
+                    + self.category
+                    + " == "
+                    + category_value
+                )
                 continue
 
-            means_0 = np.array(np.mean(self.ad_0[group_mask_0, self.ad_0.var.index.isin(self.shared_genes)].X, axis=0)).flatten()
-            means_1 = np.array(np.mean(self.ad_1[group_mask_1, self.ad_1.var.index.isin(self.shared_genes)].X, axis=0)).flatten()
+            means_0 = np.array(
+                np.mean(
+                    self.ad_0[
+                        group_mask_0, self.ad_0.var.index.isin(self.shared_genes)
+                    ].X,
+                    axis=0,
+                )
+            ).flatten()
+            means_1 = np.array(
+                np.mean(
+                    self.ad_1[
+                        group_mask_1, self.ad_1.var.index.isin(self.shared_genes)
+                    ].X,
+                    axis=0,
+                )
+            ).flatten()
 
             # Filter genes above minimum mean expression
             means_0_gt_min = np.nonzero(means_0 > min_mean_expression)[0]
             means_1_gt_min = np.nonzero(means_1 > min_mean_expression)[0]
-        
-            above_means0 = self.ad_0.var[self.ad_0.var.index.isin(self.shared_genes)].iloc[means_0_gt_min]
-            above_means1 = self.ad_1.var[self.ad_1.var.index.isin(self.shared_genes)].iloc[means_1_gt_min]
-        
-            shared_above_mean = [g for g in above_means1.index if g in above_means0.index]
-        
+
+            above_means0 = self.ad_0.var[
+                self.ad_0.var.index.isin(self.shared_genes)
+            ].iloc[means_0_gt_min]
+            above_means1 = self.ad_1.var[
+                self.ad_1.var.index.isin(self.shared_genes)
+            ].iloc[means_1_gt_min]
+
+            shared_above_mean = [
+                g for g in above_means1.index if g in above_means0.index
+            ]
+
             if len(shared_above_mean) < min_genes_to_compare:
-                print(self.category + " " + category_value + " has less than " + str(min_genes_to_compare) + "\n shared genes above minimum mean = " + str(min_mean_expression))
+                print(
+                    self.category
+                    + " "
+                    + category_value
+                    + " has less than "
+                    + str(min_genes_to_compare)
+                    + "\n shared genes above minimum mean = "
+                    + str(min_mean_expression)
+                )
                 continue
 
             # Calculate means again after filtering
-            means_0 = np.array(np.mean(self.ad_0[group_mask_0, shared_above_mean].X, axis=0)).flatten()
-            means_1 = np.array(np.mean(self.ad_1[group_mask_1, shared_above_mean].X, axis=0)).flatten()
-        
+            means_0 = np.array(
+                np.mean(self.ad_0[group_mask_0, shared_above_mean].X, axis=0)
+            ).flatten()
+            means_1 = np.array(
+                np.mean(self.ad_1[group_mask_1, shared_above_mean].X, axis=0)
+            ).flatten()
+
             # Calculate average counts for selecting top genes
             average_counts = (means_0 + means_1) / 2
-        
+
             # Get indices of the top 20 genes based on average counts for this subclass
-            top_indices = np.argsort(average_counts)[-ntop_genes:]  # Get indices of top 10 genes
+            top_indices = np.argsort(average_counts)[
+                -ntop_genes:
+            ]  # Get indices of top 10 genes
 
             shared_genes = shared_above_mean
-        
+
             p_coef = np.polynomial.Polynomial.fit(means_0, means_1, 1).convert().coef
-            category_records.append({
-                self.category: category_value,
-                "slope": p_coef[1],
-                "mean_ratio": np.mean(means_1 / means_0),
-                "correlation": np.corrcoef(means_0, means_1)[0][1],
-                "n_cells_0": np.sum(group_mask_0),
-                "n_cells_1": np.sum(group_mask_1),
-                "total_count_ratio": np.sum(self.ad_1[group_mask_1, shared_genes].X) / np.sum(self.ad_0[group_mask_0, shared_genes].X),
-            })
+            category_records.append(
+                {
+                    self.category: category_value,
+                    "slope": p_coef[1],
+                    "mean_ratio": np.mean(means_1 / means_0),
+                    "correlation": np.corrcoef(means_0, means_1)[0][1],
+                    "n_cells_0": np.sum(group_mask_0),
+                    "n_cells_1": np.sum(group_mask_1),
+                    "total_count_ratio": np.sum(self.ad_1[group_mask_1, shared_genes].X)
+                    / np.sum(self.ad_0[group_mask_0, shared_genes].X),
+                }
+            )
 
             gene_ratio_dfs[category_value] = pd.DataFrame(
                 means_1 / means_0,
@@ -446,15 +492,27 @@ def compare_expression(
             if plot_stuff:
                 plt.figure(figsize=[10, 10])
                 plt.title(
-                    self.category + ": " + category_value +
-                    "\nmean counts per cell\ncorrelation: " +
-                    str(category_records[-1]["correlation"])[:4] +
-                    " mean ratio: " + str(category_records[-1]["mean_ratio"])[:4]
+                    self.category
+                    + ": "
+                    + category_value
+                    + "\nmean counts per cell\ncorrelation: "
+                    + str(category_records[-1]["correlation"])[:4]
+                    + " mean ratio: "
+                    + str(category_records[-1]["mean_ratio"])[:4]
                 )
-            
+
                 low_expression = np.logical_and(means_0 < 1.0, means_1 < 1.0)
-                plt.loglog(means_0[low_expression], means_1[low_expression], ".", color=[0.5, 0.5, 0.5])
-                plt.loglog(means_0[np.logical_not(low_expression)], means_1[np.logical_not(low_expression)], ".")
+                plt.loglog(
+                    means_0[low_expression],
+                    means_1[low_expression],
+                    ".",
+                    color=[0.5, 0.5, 0.5],
+                )
+                plt.loglog(
+                    means_0[np.logical_not(low_expression)],
+                    means_1[np.logical_not(low_expression)],
+                    ".",
+                )
 
                 plt.xlabel(self.data_names[0] + ", N = " + str(np.sum(group_mask_0)))
                 plt.ylabel(self.data_names[1] + ", N = " + str(np.sum(group_mask_1)))
@@ -462,17 +520,19 @@ def compare_expression(
                 # Add labels only for the top 20 genes based on average counts for this subclass
                 for idx in top_indices:
                     g = shared_genes[idx] if idx < len(shared_genes) else None
-                
-                    if g is None or (means_0[idx] == 0 or means_1[idx] == 0 or low_expression[idx]):
+
+                    if g is None or (
+                        means_0[idx] == 0 or means_1[idx] == 0 or low_expression[idx]
+                    ):
                         continue
-                
+
                     plt.text(
                         means_0[idx],
                         means_1[idx],
                         g,
                         fontsize=10,
                     )
-                
+
                 plt.plot(
                     [np.min(means_0), np.max(means_0)],
                     [np.min(means_0), np.max(means_0)],
@@ -484,18 +544,22 @@ def compare_expression(
             gene_ratio_df = pd.concat(gene_ratio_dfs, axis=1)
         else:
             gene_ratio_df = None
-        
+
         return {
             "data_names": self.data_names,
             "category_results": pd.DataFrame.from_records(category_records),
             "gene_ratio_dataframe": gene_ratio_df,
         }
 
-
-    def plot_detection_ratio(self, gene_ratio_dataframe, figsize=[15, 15], filtred=True):
+    def plot_detection_ratio(
+        self, gene_ratio_dataframe, figsize=[15, 15], filtred=True
+    ):
 
         detection_ratio_plots(
-            gene_ratio_dataframe, data_names=self.data_names, figsize=figsize, filtred=filtred,
+            gene_ratio_dataframe,
+            data_names=self.data_names,
+            figsize=figsize,
+            filtred=filtred,
         )
 
     def spatial_compare(self, **kwargs):
@@ -534,16 +598,16 @@ def spatial_compare(self, **kwargs):
     def run_and_plot(self, **kwargs):
         if "category" in kwargs.keys():
             self.set_category(kwargs["category"])
-        dot_size = kwargs.get('dot_size', (3*18231)/(self.ad_0.n_obs))
-        ntop_genes = kwargs.get('ntop_genes', 10)
-        filtred = kwargs.get('filtred', True)
-
+        dot_size = kwargs.get("dot_size", (3 * 18231) / (self.ad_0.n_obs))
+        ntop_genes = kwargs.get("ntop_genes", 10)
+        filtred = kwargs.get("filtred", True)
 
         self.spatial_plot(dot_size=dot_size)
         self.spatial_compare_results = self.spatial_compare(plot_stuff=True, **kwargs)
         self.plot_detection_ratio(
             self.spatial_compare_results["expression_results"]["gene_ratio_dataframe"],
-            figsize=[30, 20], filtred=filtred,
+            figsize=[30, 20],
+            filtred=filtred,
         )
         return True
 
@@ -899,7 +963,10 @@ def filter_and_cluster_twice(
 
 
 def detection_ratio_plots(
-    gene_ratio_df, data_names=DEFAULT_DATA_NAMES, figsize=[15, 15], filtred=True,
+    gene_ratio_df,
+    data_names=DEFAULT_DATA_NAMES,
+    figsize=[15, 15],
+    filtred=True,
 ):
 
     sorted_genes = [
@@ -909,7 +976,7 @@ def detection_ratio_plots(
     top_25 = sorted_genes[-25:]  # Top 25 highest
     bottom_25 = sorted_genes[:25]  # Bottom 25 lowest
     middle_index = len(sorted_genes) // 2
-    middle_25 = sorted_genes[middle_index - 12:middle_index + 13]  # Middle 25
+    middle_25 = sorted_genes[middle_index - 12 : middle_index + 13]  # Middle 25
 
     # Combine selected ratios for plotting
     selected_ratios = bottom_25 + middle_25 + top_25

From 6fdbfc783dd9ced6effcca60e67d8d3b20411a0a Mon Sep 17 00:00:00 2001
From: Brian Long <berl@users.noreply.github.com>
Date: Thu, 12 Dec 2024 14:38:58 -0800
Subject: [PATCH 3/4] automatic marker size for each dataset, fixed legend spot
 size. increase default genes to 20

---
 spatial_compare/spatial_compare.py | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/spatial_compare/spatial_compare.py b/spatial_compare/spatial_compare.py
index 6598ba7..884ca64 100644
--- a/spatial_compare/spatial_compare.py
+++ b/spatial_compare/spatial_compare.py
@@ -16,7 +16,7 @@
 
 
 DEFAULT_DATA_NAMES = ["Data 0", "Data 1"]
-
+TARGET_LEGEND_MARKER_SIZE = 20
 
 class SpatialCompare:
     """
@@ -64,10 +64,10 @@ class SpatialCompare:
         Perform de novo clustering on the two datasets.
     find_matched_groups(n_top_groups=100, n_shared_groups=30, min_n_cells=100, category_values=[], exclude_group_string="zzzzzzzzzzzzzzz", plot_stuff=False, figsize=[10,10])
         Find matched groups between the two datasets.
-    compare_expression(category_values=[], plot_stuff=False, min_mean_expression=.2, min_genes_to_compare=5, min_cells=10, ntop_genes=10)
+    compare_expression(category_values=[], plot_stuff=False, min_mean_expression=.2, min_genes_to_compare=5, min_cells=10, ntop_genes=20)
         Compare gene expression between the two datasets.
 
-    run_and_plot(category_values = d1d2_cells, min_mean_expression=.2, ntop_genes=5, filtred=True, dot_size=)
+    run_and_plot(category_values = d1d2_cells, min_mean_expression=.2, ntop_genes=20, filtred=True, dot_size=)
         Run all the plots, can select the genes to appear the label (ntop_genes), choose to filter 25 bottom, middle and top genes in the boxplot (filtred=True). Can choose the size of dots of spatial plot (dot_size=(3*18231)/(self.ad_0.n_obs)).
 
     """
@@ -191,8 +191,16 @@ def spatial_plot(
         if len(category_values) == 0:
             category_values = all_category_values
 
+        if dot_size is None:
+            ad0_dot_size = (3 * 18231) / (self.ad_0.n_obs)
+            ad1_dot_size = (3 * 18231) / (self.ad_1.n_obs)
+        else:
+            ad0_dot_size = dot_size
+            ad1_dot_size = dot_size
+
         for c in category_values:
             plt.subplot(1, 2, 1)
+
             plt.title(self.data_names[0])
             if np.sum(self.ad_0.obs[self.category] == c) > min_cells_to_plot:
                 label = c + ": " + str(np.sum(self.ad_0.obs[self.category] == c))
@@ -208,11 +216,12 @@ def spatial_plot(
                 ],
                 ".",
                 label=label,
-                markersize=dot_size,  # Use the dot_size parameter
+                markersize=ad0_dot_size,  # Use the dot_size parameter
             )
             plt.axis("equal")
             if plot_legend:
-                plt.legend(markerscale=5)
+                markerscale = TARGET_LEGEND_MARKER_SIZE/ad0_dot_size
+                plt.legend(markerscale=markerscale)
             plt.subplot(1, 2, 2)
             plt.title(self.data_names[1])
             if np.sum(self.ad_1.obs[self.category] == c) > min_cells_to_plot:
@@ -228,11 +237,12 @@ def spatial_plot(
                 ],
                 ".",
                 label=label,
-                markersize=dot_size,  # Use the dot_size parameter
+                markersize=ad1_dot_size,  # Use the dot_size parameter
             )
             plt.axis("equal")
             if plot_legend:
-                plt.legend(markerscale=5)
+                markerscale = TARGET_LEGEND_MARKER_SIZE/ad1_dot_size
+                plt.legend(markerscale=markerscale)
 
     def de_novo_cluster(
         self, plot_stuff=False, correspondence_level="leiden_1", run_preprocessing=False
@@ -381,7 +391,7 @@ def compare_expression(
         min_mean_expression=0.2,
         min_genes_to_compare=5,
         min_cells=10,
-        ntop_genes=10,
+        ntop_genes=20,
     ):
         # Group cells
         if len(category_values) == 0:
@@ -598,8 +608,8 @@ def spatial_compare(self, **kwargs):
     def run_and_plot(self, **kwargs):
         if "category" in kwargs.keys():
             self.set_category(kwargs["category"])
-        dot_size = kwargs.get("dot_size", (3 * 18231) / (self.ad_0.n_obs))
-        ntop_genes = kwargs.get("ntop_genes", 10)
+        dot_size = kwargs.get("dot_size", None)
+        ntop_genes = kwargs.get("ntop_genes", 20)
         filtred = kwargs.get("filtred", True)
 
         self.spatial_plot(dot_size=dot_size)

From cb47609d2417fd90aef606e9f4abdfbc8f327d80 Mon Sep 17 00:00:00 2001
From: Brian Long <berl@users.noreply.github.com>
Date: Thu, 12 Dec 2024 14:40:59 -0800
Subject: [PATCH 4/4] black

---
 spatial_compare/spatial_compare.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/spatial_compare/spatial_compare.py b/spatial_compare/spatial_compare.py
index 884ca64..2c533a1 100644
--- a/spatial_compare/spatial_compare.py
+++ b/spatial_compare/spatial_compare.py
@@ -18,6 +18,7 @@
 DEFAULT_DATA_NAMES = ["Data 0", "Data 1"]
 TARGET_LEGEND_MARKER_SIZE = 20
 
+
 class SpatialCompare:
     """
     A class for comparing spatial data between two AnnData objects.
@@ -220,7 +221,7 @@ def spatial_plot(
             )
             plt.axis("equal")
             if plot_legend:
-                markerscale = TARGET_LEGEND_MARKER_SIZE/ad0_dot_size
+                markerscale = TARGET_LEGEND_MARKER_SIZE / ad0_dot_size
                 plt.legend(markerscale=markerscale)
             plt.subplot(1, 2, 2)
             plt.title(self.data_names[1])
@@ -241,7 +242,7 @@ def spatial_plot(
             )
             plt.axis("equal")
             if plot_legend:
-                markerscale = TARGET_LEGEND_MARKER_SIZE/ad1_dot_size
+                markerscale = TARGET_LEGEND_MARKER_SIZE / ad1_dot_size
                 plt.legend(markerscale=markerscale)
 
     def de_novo_cluster(