From 3ee5880c42812c2c9ed58eade791a07308266cdb Mon Sep 17 00:00:00 2001
From: Malwina Prater <mprater@altoslabs.com>
Date: Tue, 12 May 2026 15:55:41 +0100
Subject: [PATCH 1/2] feat: add nuclei-per-cell plot and fix
 transcripts-in-nucleus derivation

---
 CHANGELOG.md                         |  21 ++++
 multiqc_xenium_extra/xenium_extra.py | 182 +++++++++++++++++++++------
 pyproject.toml                       |   2 +-
 3 files changed, 166 insertions(+), 39 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e6d6d4c..546c17b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,26 @@
 # multiqc-xenium-extra changelog
 
+## v1.1.0 [2026-05-12]
+
+- Add new "Nuclei per Cell" stacked bar plot showing the per-sample
+  distribution of cells with 0, 1, or 2+ segmented nuclei.
+- Add hidden general-stats columns "% 0-Nuclei Cells" and
+  "% Multi-Nuclei Cells".
+- **Fix "Fraction of Transcripts in Nucleus" plot.** Previous releases
+  computed this incorrectly as `nucleus_count / total_counts` from
+  `cells.parquet`, where `nucleus_count` is the number of segmented
+  nuclei per cell (typically 0/1/2+), not a transcript count. The plot
+  is now correctly derived from `transcripts.parquet` via the
+  per-transcript `overlaps_nucleus` flag.
+
+  **Note for users tracking these values historically**: post-fix
+  values will be substantially different from pre-1.1.0 values
+  (approximately 6× larger on representative samples). The previous
+  numbers were not biologically meaningful; the new values are.
+  Downstream analyses that consumed `nucleus_rna_fraction_mean` /
+  `nucleus_rna_fraction_median` from `multiqc_data.json` should be
+  re-baselined.
+
 ## v1.0.2 [2025-12-10]
 
 Increase file size limit from 5GB to 50GB to handle larger Xenium files.
diff --git a/multiqc_xenium_extra/xenium_extra.py b/multiqc_xenium_extra/xenium_extra.py
index 2a72ed9..c49cc5e 100644
--- a/multiqc_xenium_extra/xenium_extra.py
+++ b/multiqc_xenium_extra/xenium_extra.py
@@ -165,6 +165,10 @@ def extend_xenium_module(xenium_module):
                 xenium_module.data_by_sample[sample_name]["nucleus_to_cell_area_ratio_median"] = cell_data[
                     "nucleus_to_cell_area_ratio_median"
                 ]
+            if "nuclei_0_frac" in cell_data:
+                xenium_module.data_by_sample[sample_name]["nuclei_0_frac"] = cell_data["nuclei_0_frac"]
+            if "nuclei_multi_frac" in cell_data:
+                xenium_module.data_by_sample[sample_name]["nuclei_multi_frac"] = cell_data["nuclei_multi_frac"]
 
     # Use transcript count from parquet file if missing from JSON
     for sample_name, transcript_data in transcript_data_by_sample.items():
@@ -232,6 +236,34 @@ def extend_xenium_module(xenium_module):
                 }
             )
 
+            if any("nuclei_0_frac" in data for data in xenium_module.data_by_sample.values()):
+                xenium_module.genstat_headers["nuclei_0_frac"] = ColumnDict(
+                    {
+                        "title": "% 0-Nuclei Cells",
+                        "description": "Fraction of cells with no detected nucleus (segmentation QC)",
+                        "suffix": "%",
+                        "scale": "Reds",
+                        "modify": lambda x: x * 100.0,
+                        "max": 100.0,
+                        "format": "{:,.2f}",
+                        "hidden": True,
+                    }
+                )
+
+            if any("nuclei_multi_frac" in data for data in xenium_module.data_by_sample.values()):
+                xenium_module.genstat_headers["nuclei_multi_frac"] = ColumnDict(
+                    {
+                        "title": "% Multi-Nuclei Cells",
+                        "description": "Fraction of cells with 2+ detected nuclei",
+                        "suffix": "%",
+                        "scale": "Purples",
+                        "modify": lambda x: x * 100.0,
+                        "max": 100.0,
+                        "format": "{:,.2f}",
+                        "hidden": True,
+                    }
+                )
+
     # Create plots
 
     # 1. Segmentation Method
@@ -380,7 +412,7 @@ def extend_xenium_module(xenium_module):
             )
 
         # 5. Fraction of Transcripts in Nucleus
-        nucleus_rna_plot = xenium_nucleus_rna_fraction_plot(cells_data_by_sample)
+        nucleus_rna_plot = xenium_nucleus_rna_fraction_plot(transcript_data_by_sample)
         if nucleus_rna_plot:
             xenium_module.add_section(
                 name="Fraction of Transcripts in Nucleus",
@@ -558,6 +590,39 @@ def extend_xenium_module(xenium_module):
                 plot=fov_plot,
             )
 
+    if cells_data_by_sample:
+        # 9. Nuclei per Cell
+        nuclei_plot = xenium_nuclei_per_cell_plot(cells_data_by_sample)
+        if nuclei_plot:
+            xenium_module.add_section(
+                name="Nuclei per Cell",
+                anchor="xenium-nuclei-per-cell",
+                description="Distribution of cells by number of segmented nuclei per cell (0 / 1 / 2+)",
+                helptext="""
+                This stacked bar shows the per-sample distribution of cells grouped by the number of
+                segmented nuclei per cell.
+
+                **Categories:**
+
+                * **0 nuclei**: Cells with no detected nucleus. Often indicates under-segmentation,
+                  boundary-stain failure, or genuinely anucleate fragments.
+                * **1 nucleus**: The expected majority for most tissues.
+                * **2+ nuclei**: Multi-nucleated cells. May reflect merge errors during segmentation,
+                  or genuine biology in tissues like skeletal/cardiac muscle.
+
+                **What to look for:**
+
+                * **High 0-nucleus fraction**: Segmentation problem — check DAPI channel and boundary
+                  stain inputs.
+                * **Elevated multi-nucleus fraction**: Expected for muscle/heart tissue;
+                  unexpected elsewhere may suggest cell-merge artifacts.
+
+                Note: 0-nucleus rates can be inflated in samples dominated by very-low-transcript-count
+                "cells" (debris, fragments); interpret alongside the transcripts-per-cell distribution.
+                """,
+                plot=nuclei_plot,
+            )
+
 
 def xenium_segmentation_plot(data_by_sample):
     """Create stacked bar chart showing cell segmentation methods"""
@@ -579,6 +644,31 @@ def xenium_segmentation_plot(data_by_sample):
     return bargraph.plot(data_by_sample, keys, plot_config)
 
 
+def xenium_nuclei_per_cell_plot(cells_data_by_sample):
+    """Create stacked bar chart showing distribution of nuclei per cell (0 / 1 / 2+)"""
+    keys = {
+        "nuclei_0_frac": {"name": "0 nuclei", "color": "#d62728"},
+        "nuclei_1_frac": {"name": "1 nucleus", "color": "#2ca02c"},
+        "nuclei_multi_frac": {"name": "2+ nuclei", "color": "#1f77b4"},
+    }
+
+    relevant = {s_name: {k: data[k] for k in keys if k in data} for s_name, data in cells_data_by_sample.items()}
+    relevant = {s_name: d for s_name, d in relevant.items() if d}
+    if not relevant:
+        return None
+
+    plot_config = {
+        "id": "xenium_nuclei_per_cell",
+        "title": "Xenium: Nuclei per Cell",
+        "ylab": "Fraction of cells",
+        "stacking": "normal",
+        "ymax": 1.0,
+        "cpswitch": False,
+    }
+
+    return bargraph.plot(relevant, keys, plot_config)
+
+
 def parse_transcripts_parquet(f) -> Optional[Dict]:
     """Parse Xenium transcripts.parquet file with optimized lazy dataframe processing
 
@@ -715,6 +805,47 @@ def parse_transcripts_parquet(f) -> Optional[Dict]:
         result["fov_quality_stats"] = fov_quality_stats
         result["fov_median_qualities"] = fov_medians  # For heatmap generation
 
+    # Per-cell fraction of transcripts inside the nucleus, derived from the
+    # per-transcript `overlaps_nucleus` flag. Both numerator and denominator
+    # come from the same scan so the fraction is bounded in [0, 1].
+    if "overlaps_nucleus" in schema and "cell_id" in schema:
+        # `overlaps_nucleus` is uint8 on current Xenium output and bool on
+        # older ones; cast to Int64 to handle both.
+        # Drop the "UNASSIGNED" sentinel — transcripts not attributable to
+        # any cell, otherwise they aggregate into a fake "cell" row.
+        per_cell_nuc = (
+            df_lazy.filter(pl.col("cell_id") != "UNASSIGNED")
+            .group_by("cell_id")
+            .agg(
+                [
+                    pl.col("overlaps_nucleus").sum().cast(pl.Int64).alias("n_nuc"),
+                    pl.len().alias("n_tx"),
+                ]
+            )
+            .with_columns((pl.col("n_nuc") / pl.col("n_tx")).alias("fraction"))
+            .collect()
+        )
+
+        if not per_cell_nuc.is_empty():
+            frac = per_cell_nuc["fraction"]
+            result["nucleus_rna_fraction_values"] = frac.to_list()
+            result["nucleus_rna_fraction_mean"] = float(frac.mean())
+            result["nucleus_rna_fraction_median"] = float(frac.median())
+            result["nucleus_rna_fraction_box_stats"] = {
+                "min": float(frac.min()),
+                "q1": float(frac.quantile(0.25)),
+                "median": float(frac.median()),
+                "q3": float(frac.quantile(0.75)),
+                "max": float(frac.max()),
+                "mean": float(frac.mean()),
+                "count": int(len(frac)),
+            }
+    elif "cell_id" in schema:
+        log.info(
+            f"{f['fn']} missing 'overlaps_nucleus' column - "
+            "'Fraction of Transcripts in Nucleus' section will be skipped for this sample."
+        )
+
     return result
 
 
@@ -930,56 +1061,31 @@ def parse_cells_parquet(f) -> Optional[Dict]:
             "count": gene_counts_stats["count"].item(),
         }
 
-    # Add nucleus RNA fraction if nucleus_count is available
+    # Distribution of segmented nuclei per cell (0 / 1 / 2+)
     if "nucleus_count" in schema:
-        nucleus_fraction_stats = (
-            lazy_df.filter(pl.col("total_counts") >= 10)
-            .with_columns((pl.col("nucleus_count") / pl.col("total_counts")).alias("fraction"))
+        nuclei_bins = (
+            lazy_df.filter(pl.col("nucleus_count").is_not_null())
             .select(
                 [
-                    pl.col("fraction").mean().alias("mean"),
-                    pl.col("fraction").median().alias("median"),
-                    pl.col("fraction").count().alias("count"),
+                    (pl.col("nucleus_count") == 0).sum().alias("n_0"),
+                    (pl.col("nucleus_count") == 1).sum().alias("n_1"),
+                    (pl.col("nucleus_count") >= 2).sum().alias("n_multi"),
+                    pl.col("nucleus_count").count().alias("n_total"),
                 ]
             )
             .collect()
         )
 
-        if nucleus_fraction_stats["count"].item() > 0:
+        n_total = nuclei_bins["n_total"].item()
+        if n_total > 0:
             cell_stats.update(
                 {
-                    "nucleus_rna_fraction_mean": nucleus_fraction_stats["mean"].item(),
-                    "nucleus_rna_fraction_median": nucleus_fraction_stats["median"].item(),
+                    "nuclei_0_frac": nuclei_bins["n_0"].item() / n_total,
+                    "nuclei_1_frac": nuclei_bins["n_1"].item() / n_total,
+                    "nuclei_multi_frac": nuclei_bins["n_multi"].item() / n_total,
                 }
             )
 
-            # Calculate nucleus RNA fraction distribution statistics for box plots
-            nucleus_fraction_dist_stats = (
-                lazy_df.filter(pl.col("total_counts") > 0)
-                .with_columns((pl.col("nucleus_count") / pl.col("total_counts")).alias("fraction"))
-                .select(
-                    [
-                        pl.col("fraction").min().alias("min"),
-                        pl.col("fraction").quantile(0.25).alias("q1"),
-                        pl.col("fraction").median().alias("median"),
-                        pl.col("fraction").quantile(0.75).alias("q3"),
-                        pl.col("fraction").max().alias("max"),
-                        pl.col("fraction").mean().alias("mean"),
-                        pl.col("fraction").count().alias("count"),
-                    ]
-                )
-                .collect()
-            )
-            cell_stats["nucleus_rna_fraction_box_stats"] = {
-                "min": nucleus_fraction_dist_stats["min"].item(),
-                "q1": nucleus_fraction_dist_stats["q1"].item(),
-                "median": nucleus_fraction_dist_stats["median"].item(),
-                "q3": nucleus_fraction_dist_stats["q3"].item(),
-                "max": nucleus_fraction_dist_stats["max"].item(),
-                "mean": nucleus_fraction_dist_stats["mean"].item(),
-                "count": nucleus_fraction_dist_stats["count"].item(),
-            }
-
     return cell_stats
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 9c63320..2e63b7b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "multiqc_xenium_extra"
-version = "1.0.2"
+version = "1.1.0"
 authors = [
     {name = "Phil Ewels", email = "phil.ewels@seqera.io"},
 ]

From b210e7e2efd35048d2559ce5268b70e67ae5e4ad Mon Sep 17 00:00:00 2001
From: Malwina Prater <mprater@altoslabs.com>
Date: Wed, 13 May 2026 13:28:31 +0100
Subject: [PATCH 2/2] fix: emit raw per-cell values to render single-sample
 density plots

---
 CHANGELOG.md                         | 13 ++++++++++
 multiqc_xenium_extra/xenium_extra.py | 39 +++++++++++++++++++++++-----
 2 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 546c17b..d7e01cf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,19 @@
   distribution of cells with 0, 1, or 2+ segmented nuclei.
 - Add hidden general-stats columns "% 0-Nuclei Cells" and
   "% Multi-Nuclei Cells".
+- **Fix single-sample density plots.** On single-sample reports, three
+  cell-related density sections — "Cell Area Distribution",
+  "Nucleus to Cell Area", and the transcripts-per-cell side of
+  "Distribution of Transcripts/Genes per Cell" — silently skipped
+  because the parser stored only summary statistics (`*_box_stats`)
+  and the single-sample density helpers require raw per-cell value
+  lists. `parse_cells_parquet` now emits `cell_area_values`,
+  `nucleus_to_cell_area_ratio_values`, and `total_counts_values`
+  alongside the existing box-stats summaries, so all three sections
+  render as KDE density plots on single-sample reports (matching the
+  helptext's existing description). Multi-sample reports are
+  unaffected. Note: `multiqc_data.json` is now larger by `O(n_cells)`
+  values per added metric per sample (~12 MB for a 500k-cell sample).
 - **Fix "Fraction of Transcripts in Nucleus" plot.** Previous releases
   computed this incorrectly as `nucleus_count / total_counts` from
   `cells.parquet`, where `nucleus_count` is the number of segmented
diff --git a/multiqc_xenium_extra/xenium_extra.py b/multiqc_xenium_extra/xenium_extra.py
index c49cc5e..09850a5 100644
--- a/multiqc_xenium_extra/xenium_extra.py
+++ b/multiqc_xenium_extra/xenium_extra.py
@@ -912,6 +912,10 @@ def parse_cells_parquet(f) -> Optional[Dict]:
             "count": cell_area_stats["count"].item(),
         }
 
+        # Raw per-cell values for the single-sample density helper.
+        cell_area_values = lazy_df.filter(pl.col("cell_area").is_not_null()).select(pl.col("cell_area")).collect()
+        cell_stats["cell_area_values"] = cell_area_values["cell_area"].to_list()
+
     # Nucleus area distribution stats using lazy operations
     nucleus_area_stats = (
         lazy_df.filter(pl.col("nucleus_area").is_not_null())
@@ -992,6 +996,19 @@ def parse_cells_parquet(f) -> Optional[Dict]:
                     "count": ratio_dist_stats["count"].item(),
                 }
 
+                # Raw per-cell values for the single-sample density helper.
+                ratio_values = (
+                    lazy_df.filter(
+                        (pl.col("cell_area").is_not_null())
+                        & (pl.col("nucleus_area").is_not_null())
+                        & (pl.col("cell_area") > 0)
+                    )
+                    .with_columns((pl.col("nucleus_area") / pl.col("cell_area")).alias("ratio"))
+                    .select(pl.col("ratio"))
+                    .collect()
+                )
+                cell_stats["nucleus_to_cell_area_ratio_values"] = ratio_values["ratio"].to_list()
+
     # Store total transcript counts per cell (total_counts) for distribution plots
     total_count_check = (
         lazy_df.filter(pl.col("total_counts").is_not_null())
@@ -1026,6 +1043,12 @@ def parse_cells_parquet(f) -> Optional[Dict]:
             "count": total_counts_stats["count"].item(),
         }
 
+        # Raw per-cell values for the single-sample density helper.
+        total_counts_values = (
+            lazy_df.filter(pl.col("total_counts").is_not_null()).select(pl.col("total_counts")).collect()
+        )
+        cell_stats["total_counts_values"] = total_counts_values["total_counts"].to_list()
+
     # Store detected genes per cell (transcript_counts) for distribution plots
     # NOTE: This will be overridden by H5-based calculation if cell_feature_matrix.h5 is available
     detected_count_check = (
@@ -1771,16 +1794,18 @@ def xenium_cell_distributions_combined_plot(cells_data_by_sample):
     samples_with_gene_counts = {}
 
     for s_name, data in cells_data_by_sample.items():
-        # Check for pre-calculated statistics first, fall back to raw values
-        if data and "total_counts_box_stats" in data:
-            samples_with_transcript_counts[s_name] = data["total_counts_box_stats"]
-        elif data and "total_counts_values" in data and data["total_counts_values"]:
+        # Prefer raw values when available so the single-sample density path
+        # can run; fall back to pre-computed box stats otherwise. Multi-sample
+        # `box.plot()` handles either shape.
+        if data and "total_counts_values" in data and data["total_counts_values"]:
             samples_with_transcript_counts[s_name] = data["total_counts_values"]
+        elif data and "total_counts_box_stats" in data:
+            samples_with_transcript_counts[s_name] = data["total_counts_box_stats"]
 
-        if data and "detected_genes_stats" in data:
-            samples_with_gene_counts[s_name] = data["detected_genes_stats"]
-        elif data and "detected_genes_values" in data and data["detected_genes_values"]:
+        if data and "detected_genes_values" in data and data["detected_genes_values"]:
             samples_with_gene_counts[s_name] = data["detected_genes_values"]
+        elif data and "detected_genes_stats" in data:
+            samples_with_gene_counts[s_name] = data["detected_genes_stats"]
 
     # If neither dataset is available, return None
     if not samples_with_transcript_counts and not samples_with_gene_counts: