From 3ee5880c42812c2c9ed58eade791a07308266cdb Mon Sep 17 00:00:00 2001 From: Malwina Prater Date: Tue, 12 May 2026 15:55:41 +0100 Subject: [PATCH 1/2] feat: add nuclei-per-cell plot and fix transcripts-in-nucleus derivation --- CHANGELOG.md | 21 ++++ multiqc_xenium_extra/xenium_extra.py | 182 +++++++++++++++++++++------ pyproject.toml | 2 +- 3 files changed, 166 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e6d6d4c..546c17b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,26 @@ # multiqc-xenium-extra changelog +## v1.1.0 [2026-05-12] + +- Add new "Nuclei per Cell" stacked bar plot showing the per-sample + distribution of cells with 0, 1, or 2+ segmented nuclei. +- Add hidden general-stats columns "% 0-Nuclei Cells" and + "% Multi-Nuclei Cells". +- **Fix "Fraction of Transcripts in Nucleus" plot.** Previous releases + computed this incorrectly as `nucleus_count / total_counts` from + `cells.parquet`, where `nucleus_count` is the number of segmented + nuclei per cell (typically 0/1/2+), not a transcript count. The plot + is now correctly derived from `transcripts.parquet` via the + per-transcript `overlaps_nucleus` flag. + + **Note for users tracking these values historically**: post-fix + values will be substantially different from pre-1.1.0 values + (approximately 6× larger on representative samples). The previous + numbers were not biologically meaningful; the new values are. + Downstream analyses that consumed `nucleus_rna_fraction_mean` / + `nucleus_rna_fraction_median` from `multiqc_data.json` should be + re-baselined. + ## v1.0.2 [2025-12-10] Increase file size limit from 5GB to 50GB to handle larger Xenium files. diff --git a/multiqc_xenium_extra/xenium_extra.py b/multiqc_xenium_extra/xenium_extra.py index 2a72ed9..c49cc5e 100644 --- a/multiqc_xenium_extra/xenium_extra.py +++ b/multiqc_xenium_extra/xenium_extra.py @@ -165,6 +165,10 @@ def extend_xenium_module(xenium_module): xenium_module.data_by_sample[sample_name]["nucleus_to_cell_area_ratio_median"] = cell_data[ "nucleus_to_cell_area_ratio_median" ] + if "nuclei_0_frac" in cell_data: + xenium_module.data_by_sample[sample_name]["nuclei_0_frac"] = cell_data["nuclei_0_frac"] + if "nuclei_multi_frac" in cell_data: + xenium_module.data_by_sample[sample_name]["nuclei_multi_frac"] = cell_data["nuclei_multi_frac"] # Use transcript count from parquet file if missing from JSON for sample_name, transcript_data in transcript_data_by_sample.items(): @@ -232,6 +236,34 @@ def extend_xenium_module(xenium_module): } ) + if any("nuclei_0_frac" in data for data in xenium_module.data_by_sample.values()): + xenium_module.genstat_headers["nuclei_0_frac"] = ColumnDict( + { + "title": "% 0-Nuclei Cells", + "description": "Fraction of cells with no detected nucleus (segmentation QC)", + "suffix": "%", + "scale": "Reds", + "modify": lambda x: x * 100.0, + "max": 100.0, + "format": "{:,.2f}", + "hidden": True, + } + ) + + if any("nuclei_multi_frac" in data for data in xenium_module.data_by_sample.values()): + xenium_module.genstat_headers["nuclei_multi_frac"] = ColumnDict( + { + "title": "% Multi-Nuclei Cells", + "description": "Fraction of cells with 2+ detected nuclei", + "suffix": "%", + "scale": "Purples", + "modify": lambda x: x * 100.0, + "max": 100.0, + "format": "{:,.2f}", + "hidden": True, + } + ) + # Create plots # 1. Segmentation Method @@ -380,7 +412,7 @@ def extend_xenium_module(xenium_module): ) # 5. Fraction of Transcripts in Nucleus - nucleus_rna_plot = xenium_nucleus_rna_fraction_plot(cells_data_by_sample) + nucleus_rna_plot = xenium_nucleus_rna_fraction_plot(transcript_data_by_sample) if nucleus_rna_plot: xenium_module.add_section( name="Fraction of Transcripts in Nucleus", @@ -558,6 +590,39 @@ def extend_xenium_module(xenium_module): plot=fov_plot, ) + if cells_data_by_sample: + # 9. Nuclei per Cell + nuclei_plot = xenium_nuclei_per_cell_plot(cells_data_by_sample) + if nuclei_plot: + xenium_module.add_section( + name="Nuclei per Cell", + anchor="xenium-nuclei-per-cell", + description="Distribution of cells by number of segmented nuclei per cell (0 / 1 / 2+)", + helptext=""" + This stacked bar shows the per-sample distribution of cells grouped by the number of + segmented nuclei per cell. + + **Categories:** + + * **0 nuclei**: Cells with no detected nucleus. Often indicates under-segmentation, + boundary-stain failure, or genuinely anucleate fragments. + * **1 nucleus**: The expected majority for most tissues. + * **2+ nuclei**: Multi-nucleated cells. May reflect merge errors during segmentation, + or genuine biology in tissues like skeletal/cardiac muscle. + + **What to look for:** + + * **High 0-nucleus fraction**: Segmentation problem — check DAPI channel and boundary + stain inputs. + * **Elevated multi-nucleus fraction**: Expected for muscle/heart tissue; + unexpected elsewhere may suggest cell-merge artifacts. + + Note: 0-nucleus rates can be inflated in samples dominated by very-low-transcript-count + "cells" (debris, fragments); interpret alongside the transcripts-per-cell distribution. + """, + plot=nuclei_plot, + ) + def xenium_segmentation_plot(data_by_sample): """Create stacked bar chart showing cell segmentation methods""" @@ -579,6 +644,31 @@ def xenium_segmentation_plot(data_by_sample): return bargraph.plot(data_by_sample, keys, plot_config) +def xenium_nuclei_per_cell_plot(cells_data_by_sample): + """Create stacked bar chart showing distribution of nuclei per cell (0 / 1 / 2+)""" + keys = { + "nuclei_0_frac": {"name": "0 nuclei", "color": "#d62728"}, + "nuclei_1_frac": {"name": "1 nucleus", "color": "#2ca02c"}, + "nuclei_multi_frac": {"name": "2+ nuclei", "color": "#1f77b4"}, + } + + relevant = {s_name: {k: data[k] for k in keys if k in data} for s_name, data in cells_data_by_sample.items()} + relevant = {s_name: d for s_name, d in relevant.items() if d} + if not relevant: + return None + + plot_config = { + "id": "xenium_nuclei_per_cell", + "title": "Xenium: Nuclei per Cell", + "ylab": "Fraction of cells", + "stacking": "normal", + "ymax": 1.0, + "cpswitch": False, + } + + return bargraph.plot(relevant, keys, plot_config) + + def parse_transcripts_parquet(f) -> Optional[Dict]: """Parse Xenium transcripts.parquet file with optimized lazy dataframe processing @@ -715,6 +805,47 @@ def parse_transcripts_parquet(f) -> Optional[Dict]: result["fov_quality_stats"] = fov_quality_stats result["fov_median_qualities"] = fov_medians # For heatmap generation + # Per-cell fraction of transcripts inside the nucleus, derived from the + # per-transcript `overlaps_nucleus` flag. Both numerator and denominator + # come from the same scan so the fraction is bounded in [0, 1]. + if "overlaps_nucleus" in schema and "cell_id" in schema: + # `overlaps_nucleus` is uint8 on current Xenium output and bool on + # older ones; cast to Int64 to handle both. + # Drop the "UNASSIGNED" sentinel — transcripts not attributable to + # any cell, otherwise they aggregate into a fake "cell" row. + per_cell_nuc = ( + df_lazy.filter(pl.col("cell_id") != "UNASSIGNED") + .group_by("cell_id") + .agg( + [ + pl.col("overlaps_nucleus").sum().cast(pl.Int64).alias("n_nuc"), + pl.len().alias("n_tx"), + ] + ) + .with_columns((pl.col("n_nuc") / pl.col("n_tx")).alias("fraction")) + .collect() + ) + + if not per_cell_nuc.is_empty(): + frac = per_cell_nuc["fraction"] + result["nucleus_rna_fraction_values"] = frac.to_list() + result["nucleus_rna_fraction_mean"] = float(frac.mean()) + result["nucleus_rna_fraction_median"] = float(frac.median()) + result["nucleus_rna_fraction_box_stats"] = { + "min": float(frac.min()), + "q1": float(frac.quantile(0.25)), + "median": float(frac.median()), + "q3": float(frac.quantile(0.75)), + "max": float(frac.max()), + "mean": float(frac.mean()), + "count": int(len(frac)), + } + elif "cell_id" in schema: + log.info( + f"{f['fn']} missing 'overlaps_nucleus' column - " + "'Fraction of Transcripts in Nucleus' section will be skipped for this sample." + ) + return result @@ -930,56 +1061,31 @@ def parse_cells_parquet(f) -> Optional[Dict]: "count": gene_counts_stats["count"].item(), } - # Add nucleus RNA fraction if nucleus_count is available + # Distribution of segmented nuclei per cell (0 / 1 / 2+) if "nucleus_count" in schema: - nucleus_fraction_stats = ( - lazy_df.filter(pl.col("total_counts") >= 10) - .with_columns((pl.col("nucleus_count") / pl.col("total_counts")).alias("fraction")) + nuclei_bins = ( + lazy_df.filter(pl.col("nucleus_count").is_not_null()) .select( [ - pl.col("fraction").mean().alias("mean"), - pl.col("fraction").median().alias("median"), - pl.col("fraction").count().alias("count"), + (pl.col("nucleus_count") == 0).sum().alias("n_0"), + (pl.col("nucleus_count") == 1).sum().alias("n_1"), + (pl.col("nucleus_count") >= 2).sum().alias("n_multi"), + pl.col("nucleus_count").count().alias("n_total"), ] ) .collect() ) - if nucleus_fraction_stats["count"].item() > 0: + n_total = nuclei_bins["n_total"].item() + if n_total > 0: cell_stats.update( { - "nucleus_rna_fraction_mean": nucleus_fraction_stats["mean"].item(), - "nucleus_rna_fraction_median": nucleus_fraction_stats["median"].item(), + "nuclei_0_frac": nuclei_bins["n_0"].item() / n_total, + "nuclei_1_frac": nuclei_bins["n_1"].item() / n_total, + "nuclei_multi_frac": nuclei_bins["n_multi"].item() / n_total, } ) - # Calculate nucleus RNA fraction distribution statistics for box plots - nucleus_fraction_dist_stats = ( - lazy_df.filter(pl.col("total_counts") > 0) - .with_columns((pl.col("nucleus_count") / pl.col("total_counts")).alias("fraction")) - .select( - [ - pl.col("fraction").min().alias("min"), - pl.col("fraction").quantile(0.25).alias("q1"), - pl.col("fraction").median().alias("median"), - pl.col("fraction").quantile(0.75).alias("q3"), - pl.col("fraction").max().alias("max"), - pl.col("fraction").mean().alias("mean"), - pl.col("fraction").count().alias("count"), - ] - ) - .collect() - ) - cell_stats["nucleus_rna_fraction_box_stats"] = { - "min": nucleus_fraction_dist_stats["min"].item(), - "q1": nucleus_fraction_dist_stats["q1"].item(), - "median": nucleus_fraction_dist_stats["median"].item(), - "q3": nucleus_fraction_dist_stats["q3"].item(), - "max": nucleus_fraction_dist_stats["max"].item(), - "mean": nucleus_fraction_dist_stats["mean"].item(), - "count": nucleus_fraction_dist_stats["count"].item(), - } - return cell_stats diff --git a/pyproject.toml b/pyproject.toml index 9c63320..2e63b7b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "multiqc_xenium_extra" -version = "1.0.2" +version = "1.1.0" authors = [ {name = "Phil Ewels", email = "phil.ewels@seqera.io"}, ] From b210e7e2efd35048d2559ce5268b70e67ae5e4ad Mon Sep 17 00:00:00 2001 From: Malwina Prater Date: Wed, 13 May 2026 13:28:31 +0100 Subject: [PATCH 2/2] fix: emit raw per-cell values to render single-sample density plots --- CHANGELOG.md | 13 ++++++++++ multiqc_xenium_extra/xenium_extra.py | 39 +++++++++++++++++++++++----- 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 546c17b..d7e01cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,19 @@ distribution of cells with 0, 1, or 2+ segmented nuclei. - Add hidden general-stats columns "% 0-Nuclei Cells" and "% Multi-Nuclei Cells". +- **Fix single-sample density plots.** On single-sample reports, three + cell-related density sections — "Cell Area Distribution", + "Nucleus to Cell Area", and the transcripts-per-cell side of + "Distribution of Transcripts/Genes per Cell" — silently skipped + because the parser stored only summary statistics (`*_box_stats`) + and the single-sample density helpers require raw per-cell value + lists. `parse_cells_parquet` now emits `cell_area_values`, + `nucleus_to_cell_area_ratio_values`, and `total_counts_values` + alongside the existing box-stats summaries, so all three sections + render as KDE density plots on single-sample reports (matching the + helptext's existing description). Multi-sample reports are + unaffected. Note: `multiqc_data.json` is now larger by `O(n_cells)` + values per added metric per sample (~12 MB for a 500k-cell sample). - **Fix "Fraction of Transcripts in Nucleus" plot.** Previous releases computed this incorrectly as `nucleus_count / total_counts` from `cells.parquet`, where `nucleus_count` is the number of segmented diff --git a/multiqc_xenium_extra/xenium_extra.py b/multiqc_xenium_extra/xenium_extra.py index c49cc5e..09850a5 100644 --- a/multiqc_xenium_extra/xenium_extra.py +++ b/multiqc_xenium_extra/xenium_extra.py @@ -912,6 +912,10 @@ def parse_cells_parquet(f) -> Optional[Dict]: "count": cell_area_stats["count"].item(), } + # Raw per-cell values for the single-sample density helper. + cell_area_values = lazy_df.filter(pl.col("cell_area").is_not_null()).select(pl.col("cell_area")).collect() + cell_stats["cell_area_values"] = cell_area_values["cell_area"].to_list() + # Nucleus area distribution stats using lazy operations nucleus_area_stats = ( lazy_df.filter(pl.col("nucleus_area").is_not_null()) @@ -992,6 +996,19 @@ def parse_cells_parquet(f) -> Optional[Dict]: "count": ratio_dist_stats["count"].item(), } + # Raw per-cell values for the single-sample density helper. + ratio_values = ( + lazy_df.filter( + (pl.col("cell_area").is_not_null()) + & (pl.col("nucleus_area").is_not_null()) + & (pl.col("cell_area") > 0) + ) + .with_columns((pl.col("nucleus_area") / pl.col("cell_area")).alias("ratio")) + .select(pl.col("ratio")) + .collect() + ) + cell_stats["nucleus_to_cell_area_ratio_values"] = ratio_values["ratio"].to_list() + # Store total transcript counts per cell (total_counts) for distribution plots total_count_check = ( lazy_df.filter(pl.col("total_counts").is_not_null()) @@ -1026,6 +1043,12 @@ def parse_cells_parquet(f) -> Optional[Dict]: "count": total_counts_stats["count"].item(), } + # Raw per-cell values for the single-sample density helper. + total_counts_values = ( + lazy_df.filter(pl.col("total_counts").is_not_null()).select(pl.col("total_counts")).collect() + ) + cell_stats["total_counts_values"] = total_counts_values["total_counts"].to_list() + # Store detected genes per cell (transcript_counts) for distribution plots # NOTE: This will be overridden by H5-based calculation if cell_feature_matrix.h5 is available detected_count_check = ( @@ -1771,16 +1794,18 @@ def xenium_cell_distributions_combined_plot(cells_data_by_sample): samples_with_gene_counts = {} for s_name, data in cells_data_by_sample.items(): - # Check for pre-calculated statistics first, fall back to raw values - if data and "total_counts_box_stats" in data: - samples_with_transcript_counts[s_name] = data["total_counts_box_stats"] - elif data and "total_counts_values" in data and data["total_counts_values"]: + # Prefer raw values when available so the single-sample density path + # can run; fall back to pre-computed box stats otherwise. Multi-sample + # `box.plot()` handles either shape. + if data and "total_counts_values" in data and data["total_counts_values"]: samples_with_transcript_counts[s_name] = data["total_counts_values"] + elif data and "total_counts_box_stats" in data: + samples_with_transcript_counts[s_name] = data["total_counts_box_stats"] - if data and "detected_genes_stats" in data: - samples_with_gene_counts[s_name] = data["detected_genes_stats"] - elif data and "detected_genes_values" in data and data["detected_genes_values"]: + if data and "detected_genes_values" in data and data["detected_genes_values"]: samples_with_gene_counts[s_name] = data["detected_genes_values"] + elif data and "detected_genes_stats" in data: + samples_with_gene_counts[s_name] = data["detected_genes_stats"] # If neither dataset is available, return None if not samples_with_transcript_counts and not samples_with_gene_counts: