Boxplots & Strip Plots

Doubly-grouped distribution plots for one continuous variable against two binary outcomes. Two chart variants are provided:

Boxplot (plot_bivariate_boxes) — side-by-side boxplots for each (label_x, label_y) combination.
Strip plot (plot_bivariate_strip) — jittered strip plot showing every individual sample, coloured by label_y.

Example data

import polars as pl

# Twelve samples: one continuous score, two binary (0/1) outcome columns,
# and a per-sample identifier ("P000" … "P011").
df = pl.DataFrame({
    "score": [0.8, 0.6, 0.9, 0.7, 0.5, 0.3, 0.4, 0.2, 0.85, 0.65, 0.35, 0.25],
    "outcome_a": [1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0],
    "outcome_b": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
    "patient_id": [f"P{i:03d}" for i in range(12)],
})

Boxplot

from plotutils.boxplot import plot_bivariate_boxes

# outcome_a forms the outer x-axis groups; outcome_b splits each group into
# side-by-side, colour-coded boxes.
chart = plot_bivariate_boxes(
    df,
    score_col="score",
    label_x_col="outcome_a",
    label_y_col="outcome_b",
)

Strip plot

from plotutils.boxplot import plot_bivariate_strip

# Passing id_col puts the patient ID at the top of each point's tooltip.
chart = plot_bivariate_strip(
    df,
    score_col="score",
    label_x_col="outcome_a",
    label_y_col="outcome_b",
    id_col="patient_id",
)

Missing scores

When some patients have missing scores, pass missing_score_df to show them as cross marks below the main chart area:

# Patients without a score: only the two label columns (and, optionally, the
# ID column) are needed — there is no score column here.
missing_df = pl.DataFrame({
    "outcome_a": [1, 0, 1],
    "outcome_b": [0, 1, 0],
    "patient_id": ["M001", "M002", "M003"],
})

# The rows of missing_df are drawn as cross marks below the boxes, separated
# from the main chart area by a dashed rule.
chart = plot_bivariate_boxes(
    df,
    score_col="score",
    label_x_col="outcome_a",
    label_y_col="outcome_b",
    id_col="patient_id",
    missing_score_df=missing_df,
)

Reference

`plotutils.boxplot.plot_bivariate_boxes(df, score_col, label_x_col, label_y_col, title='', width=340, height=300, y_title=None, x_title=None, color_title=None, id_col=None, missing_score_df=None)`

Doubly-grouped boxplot: score vs two binary outcomes.

The outer grouping (x-axis) is label_x_col; the inner grouping (side-by-side boxes within each outer group) is label_y_col. This produces four boxes — one per (label_x, label_y) combination — making it easy to read off the marginal and joint effects of both outcomes on the variable's distribution.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	Raw data with score and binary label columns.	required
`score_col`	`str`	Column with the continuous variable.	required
`label_x_col`	`str`	First binary outcome column (outer x-axis grouping).	required
`label_y_col`	`str`	Second binary outcome column (inner grouping via color / xOffset).	required
`title`	`str`	Chart title.	`''`
`width`	`int`	Chart width in pixels.	`340`
`height`	`int`	Chart height in pixels.	`300`
`y_title`	`str or None`	Y-axis title. Defaults to `score_col`.	`None`
`x_title`	`str or None`	X-axis title. Defaults to `label_x_col`.	`None`
`color_title`	`str or None`	Legend title for the color encoding. Defaults to `label_y_col`.	`None`
`id_col`	`str or None`	Optional column name containing patient / sample identifiers. When provided, a transparent point layer is added on top of the boxes so that hovering over an individual data point reveals its ID.	`None`
`missing_score_df`	`DataFrame or None`	Optional DataFrame of patients with missing scores (containing `label_x_col`, `label_y_col`, and optionally `id_col`). When provided, these patients are shown as cross marks at a fixed position below the main chart area with a separator rule.	`None`

Returns:

Type	Description
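`Chart or LayerChart`	A single chart when only the box layer is drawn; a layered chart when the ID hover layer or the missing-score marks are added.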

Source code in src/plotutils/boxplot.py

def plot_bivariate_boxes(
    df: pl.DataFrame,
    score_col: str,
    label_x_col: str,
    label_y_col: str,
    title: str = "",
    width: int = 340,
    height: int = 300,
    y_title: str | None = None,
    x_title: str | None = None,
    color_title: str | None = None,
    id_col: str | None = None,
    missing_score_df: pl.DataFrame | None = None,
) -> alt.Chart | alt.LayerChart:
    """Doubly-grouped boxplot: score vs two binary outcomes.

    The outer grouping (x-axis) is ``label_x_col``; the inner grouping
    (side-by-side boxes within each outer group) is ``label_y_col``.
    This produces one box per (label_x, label_y) combination — four for
    two binary outcomes — making it easy to read off the marginal and
    joint effects of both outcomes on the variable's distribution.

    Parameters
    ----------
    df : pl.DataFrame
        Raw data with score and binary label columns.
    score_col : str
        Column with the continuous variable.
    label_x_col : str
        First binary outcome column (outer x-axis grouping).
    label_y_col : str
        Second binary outcome column (inner grouping via color / xOffset).
    title : str
        Chart title. An empty string leaves the chart untitled.
    width, height : int
        Chart dimensions in pixels.
    y_title : str or None
        Y-axis title. Defaults to ``score_col``.
    x_title : str or None
        X-axis title. Defaults to ``label_x_col``.
    color_title : str or None
        Legend title for the color encoding. Defaults to ``label_y_col``.
    id_col : str or None
        Optional column name containing patient / sample identifiers.  When
        provided, a transparent point layer is added on top of the boxes so
        that hovering over an individual data point reveals its ID.
    missing_score_df : pl.DataFrame or None
        Optional DataFrame of patients with missing scores (containing
        ``label_x_col``, ``label_y_col``, and optionally ``id_col``).  When
        provided and non-empty, these patients are shown as cross marks at a
        fixed position below the main chart area with a separator rule.  The
        ID is only added to their tooltip if ``id_col`` is actually a column
        of this frame.

    Returns
    -------
    alt.Chart or alt.LayerChart
        A plain chart when only the box layer is drawn, otherwise a layered
        chart.

    Notes
    -----
    Calls ``alt.data_transformers.disable_max_rows()``, which changes
    Altair's global data-transformer settings for the running session.
    """
    alt.data_transformers.disable_max_rows()

    # Labels are cast to strings so they can be encoded as nominal fields.
    df_plot = df.with_columns(
        pl.col(label_x_col).cast(pl.Utf8),
        pl.col(label_y_col).cast(pl.Utf8),
    )

    # Treat an empty missing-score frame exactly like "not provided".
    miss_df = (
        missing_score_df
        if missing_score_df is not None and len(missing_score_df) > 0
        else None
    )

    # Build all layers flat to avoid nested-layer issues in Vega-Lite.
    layers: list = []

    sentinel_y = 0.0
    if miss_df is not None:
        raw_min = df_plot[score_col].min()
        raw_max = df_plot[score_col].max()
        # An empty / all-null score column yields None; fall back to 0 so the
        # missing-score marks can still be drawn instead of raising.
        score_min = 0.0 if raw_min is None else float(raw_min)  # type: ignore[arg-type]
        score_max = score_min if raw_max is None else float(raw_max)  # type: ignore[arg-type]
        score_range = score_max - score_min
        if score_range <= 0:
            # All scores identical (or no scores): without a positive span the
            # marks and the separator would sit on top of the data.
            score_range = abs(score_min) or 1.0
        sentinel_y = score_min - 0.15 * score_range
        sep_y = score_min - 0.05 * score_range

        # The separator goes first so it is drawn underneath everything else.
        layers.append(
            alt.Chart(pl.DataFrame({"y_sep": [sep_y]}))
            .mark_rule(color="#aaa", strokeDash=[3, 3], strokeWidth=0.8)
            .encode(y=alt.Y("y_sep:Q"))
        )

    box_layer = (
        alt.Chart(df_plot)
        .mark_boxplot()
        .encode(
            x=alt.X(
                f"{label_x_col}:N",
                title=x_title or label_x_col,
                axis=alt.Axis(labelAngle=0),
            ),
            xOffset=alt.XOffset(f"{label_y_col}:N"),
            y=alt.Y(f"{score_col}:Q", title=y_title or score_col),
            color=alt.Color(f"{label_y_col}:N", title=color_title or label_y_col),
        )
    )
    layers.append(box_layer)

    if id_col is not None:
        # Invisible but hoverable points: they exist only to carry tooltips.
        layers.append(
            alt.Chart(df_plot)
            .mark_point(opacity=0, size=300, filled=True)
            .encode(
                x=alt.X(f"{label_x_col}:N"),
                xOffset=alt.XOffset(f"{label_y_col}:N"),
                y=alt.Y(f"{score_col}:Q"),
                tooltip=[
                    alt.Tooltip(f"{id_col}:N", title="ID"),
                    alt.Tooltip(f"{label_x_col}:N"),
                    alt.Tooltip(f"{label_y_col}:N"),
                    alt.Tooltip(f"{score_col}:Q", format=".3f"),
                ],
            )
        )

    if miss_df is not None:
        df_miss = miss_df.with_columns(
            pl.col(label_x_col).cast(pl.Utf8),
            pl.col(label_y_col).cast(pl.Utf8),
            pl.lit(sentinel_y).alias("_miss_y"),
        )
        miss_tooltip = [
            alt.Tooltip(f"{label_x_col}:N"),
            alt.Tooltip(f"{label_y_col}:N"),
        ]
        # The ID column is optional in the missing-score frame; only reference
        # it when it is really there.
        if id_col is not None and id_col in miss_df.columns:
            miss_tooltip.insert(0, alt.Tooltip(f"{id_col}:N", title="ID"))

        layers.append(
            alt.Chart(df_miss)
            .mark_point(shape="cross", size=30, opacity=0.7, strokeWidth=2)
            .encode(
                x=alt.X(
                    f"{label_x_col}:N",
                    title=x_title or label_x_col,
                    axis=alt.Axis(labelAngle=0),
                ),
                xOffset=alt.XOffset(f"{label_y_col}:N"),
                y=alt.Y("_miss_y:Q", title=y_title or score_col),
                color=alt.Color(f"{label_y_col}:N", title=color_title or label_y_col),
                tooltip=miss_tooltip,
            )
        )

    chart: alt.Chart | alt.LayerChart
    if len(layers) == 1:
        chart = layers[0].properties(width=width, height=height)
    else:
        chart = alt.layer(*layers).properties(width=width, height=height)

    if title:
        chart = chart.properties(title=title)

    return chart.configure_axis(
        gridColor="gray", gridDash=[3, 3], gridOpacity=0.5
    ).configure_view(strokeWidth=0)

`plotutils.boxplot.plot_bivariate_strip(df, score_col, label_x_col, label_y_col, title='', width=340, height=300, y_title=None, x_title=None, color_title=None, jitter_seed=0, id_col=None, missing_score_df=None)`

Jittered strip plot: score vs two binary outcomes.

Each sample is drawn as a semi-transparent circle. Within each label_x_col band the points are spread horizontally: the two label_y_col sub-groups are offset to opposite sides of the band centre (±15 px by default), and an additional small random jitter (±7 px) reduces overplotting within each sub-group.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	Raw data with score and binary label columns.	required
`score_col`	`str`	Column with the continuous variable.	required
`label_x_col`	`str`	First binary outcome column (outer x-axis grouping).	required
`label_y_col`	`str`	Second binary outcome column (inner color grouping).	required
`title`	`str`	Chart title.	`''`
`width`	`int`	Chart width in pixels.	`340`
`height`	`int`	Chart height in pixels.	`300`
`y_title`	`str or None`	Y-axis title. Defaults to `score_col`.	`None`
`x_title`	`str or None`	X-axis title. Defaults to `label_x_col`.	`None`
`color_title`	`str or None`	Legend title for the color encoding. Defaults to `label_y_col`.	`None`
`jitter_seed`	`int`	Seed for the random horizontal jitter (reproducible renders).	`0`
`id_col`	`str or None`	Optional column name containing patient / sample identifiers. When provided, the ID appears in the tooltip on mouseover.	`None`
`missing_score_df`	`DataFrame or None`	Optional DataFrame of patients with missing scores (containing `label_x_col`, `label_y_col`, and optionally `id_col`). When provided, these patients are shown as cross marks at a fixed position below the main chart area with a separator rule.	`None`

Returns:

Type	Description
`Chart or LayerChart`	A single chart by default; a layered chart when `missing_score_df` contains rows.

Source code in src/plotutils/boxplot.py

def plot_bivariate_strip(
    df: pl.DataFrame,
    score_col: str,
    label_x_col: str,
    label_y_col: str,
    title: str = "",
    width: int = 340,
    height: int = 300,
    y_title: str | None = None,
    x_title: str | None = None,
    color_title: str | None = None,
    jitter_seed: int = 0,
    id_col: str | None = None,
    missing_score_df: pl.DataFrame | None = None,
) -> alt.Chart | alt.LayerChart:
    """Jittered strip plot: score vs two binary outcomes.

    Each sample is drawn as a semi-transparent circle. Within each
    ``label_x_col`` band the points are spread horizontally: the
    ``label_y_col`` sub-groups are offset symmetrically around the band
    centre (±15 px for two sub-groups), and an additional small random
    jitter (±7 px) reduces overplotting within each sub-group.

    Parameters
    ----------
    df : pl.DataFrame
        Raw data with score and binary label columns.
    score_col : str
        Column with the continuous variable.
    label_x_col : str
        First binary outcome column (outer x-axis grouping).
    label_y_col : str
        Second binary outcome column (inner color grouping).
    title : str
        Chart title. An empty string leaves the chart untitled.
    width, height : int
        Chart dimensions in pixels.
    y_title : str or None
        Y-axis title. Defaults to ``score_col``.
    x_title : str or None
        X-axis title. Defaults to ``label_x_col``.
    color_title : str or None
        Legend title for the color encoding. Defaults to ``label_y_col``.
    jitter_seed : int
        Seed for the random horizontal jitter (reproducible renders).
    id_col : str or None
        Optional column name containing patient / sample identifiers.  When
        provided, the ID appears in the tooltip on mouseover.
    missing_score_df : pl.DataFrame or None
        Optional DataFrame of patients with missing scores (containing
        ``label_x_col``, ``label_y_col``, and optionally ``id_col``).  When
        provided and non-empty, these patients are shown as cross marks at a
        fixed position below the main chart area with a separator rule.
        Each mark is placed in the horizontal sub-group of its
        ``label_y_col`` value; a value that does not occur in ``df`` falls
        back to the first sub-group.

    Returns
    -------
    alt.Chart or alt.LayerChart
        A plain chart when no missing-score rows are drawn, otherwise a
        layered chart (separator rule, strip, missing-score marks).

    Notes
    -----
    Calls ``alt.data_transformers.disable_max_rows()``, which changes
    Altair's global data-transformer settings for the running session.
    """
    alt.data_transformers.disable_max_rows()

    # Map each unique label_y value to a symmetric pixel offset so that the
    # sub-groups are spread to opposite sides of the x-band centre.
    ly_list = df[label_y_col].to_list()
    unique_ly = sorted(df[label_y_col].unique().to_list())
    n_ly = len(unique_ly)
    ly_to_idx = {v: i for i, v in enumerate(unique_ly)}

    # The same mapping keyed by the Utf8 rendering of each label.  The chart
    # encodes labels as strings, and the missing-score frame is looked up by
    # string, so raw (e.g. integer) keys would never match there.
    ly_str_list = df[label_y_col].cast(pl.Utf8).to_list()
    ly_str_to_idx = {s: ly_to_idx[raw] for raw, s in zip(ly_list, ly_str_list)}

    half_span_px = 15  # half-distance between outermost sub-group centres (pixels)
    jitter_px = 7      # max random jitter per point (pixels)

    rng = random.Random(jitter_seed)

    if n_ly <= 1:
        # Zero or one sub-group: jitter around the band centre only.
        offsets = [rng.uniform(-jitter_px, jitter_px) for _ in ly_list]
    else:
        # Sub-group index i in [0, n_ly - 1] is mapped linearly onto
        # [-half_span_px, +half_span_px]; the jitter is added on top.
        offsets = [
            (2 * ly_to_idx[ly] / (n_ly - 1) - 1) * half_span_px
            + rng.uniform(-jitter_px, jitter_px)
            for ly in ly_list
        ]

    df_plot = df.with_columns(
        pl.col(label_x_col).cast(pl.Utf8),
        pl.col(label_y_col).cast(pl.Utf8),
        pl.Series("_x_offset", offsets),
    )

    tooltip = [
        alt.Tooltip(f"{label_x_col}:N"),
        alt.Tooltip(f"{label_y_col}:N"),
        alt.Tooltip(f"{score_col}:Q", format=".3f"),
    ]
    if id_col is not None:
        tooltip.insert(0, alt.Tooltip(f"{id_col}:N", title="ID"))

    main_strip = (
        alt.Chart(df_plot)
        .mark_circle(opacity=0.45, size=18)
        .encode(
            x=alt.X(
                f"{label_x_col}:N",
                title=x_title or label_x_col,
                axis=alt.Axis(labelAngle=0),
            ),
            xOffset=alt.XOffset("_x_offset:Q"),
            y=alt.Y(f"{score_col}:Q", title=y_title or score_col),
            color=alt.Color(f"{label_y_col}:N", title=color_title or label_y_col),
            tooltip=tooltip,
        )
    )

    layers: list = [main_strip]

    if missing_score_df is not None and len(missing_score_df) > 0:
        raw_min = df_plot[score_col].min()
        raw_max = df_plot[score_col].max()
        # An empty / all-null score column yields None; fall back to 0 so the
        # missing-score marks can still be drawn instead of raising.
        score_min = 0.0 if raw_min is None else float(raw_min)  # type: ignore[arg-type]
        score_max = score_min if raw_max is None else float(raw_max)  # type: ignore[arg-type]
        score_range = score_max - score_min
        if score_range <= 0:
            # All scores identical (or no scores): without a positive span the
            # marks and the separator would sit on top of the data.
            score_range = abs(score_min) or 1.0
        sentinel_y = score_min - 0.15 * score_range
        sep_y = score_min - 0.05 * score_range

        miss_ly_list = missing_score_df[label_y_col].cast(pl.Utf8).to_list()
        # Separate generator so the main strip's jitter does not depend on
        # whether missing rows are drawn.
        miss_rng = random.Random(jitter_seed + 1)
        if n_ly <= 1:
            miss_offsets = [miss_rng.uniform(-jitter_px, jitter_px) for _ in miss_ly_list]
        else:
            miss_offsets = [
                (2 * ly_str_to_idx.get(ly, 0) / (n_ly - 1) - 1) * half_span_px
                + miss_rng.uniform(-jitter_px, jitter_px)
                for ly in miss_ly_list
            ]

        df_miss = missing_score_df.with_columns(
            pl.col(label_x_col).cast(pl.Utf8),
            pl.col(label_y_col).cast(pl.Utf8),
            pl.lit(sentinel_y).alias("_miss_y"),
            pl.Series("_x_offset", miss_offsets),
        )

        miss_tooltip = [
            alt.Tooltip(f"{label_x_col}:N"),
            alt.Tooltip(f"{label_y_col}:N"),
        ]
        # The ID column is optional in the missing-score frame; only reference
        # it when it is really there.
        if id_col is not None and id_col in missing_score_df.columns:
            miss_tooltip.insert(0, alt.Tooltip(f"{id_col}:N", title="ID"))

        miss_layer = (
            alt.Chart(df_miss)
            .mark_point(shape="cross", size=30, opacity=0.7, strokeWidth=2)
            .encode(
                x=alt.X(
                    f"{label_x_col}:N",
                    title=x_title or label_x_col,
                    axis=alt.Axis(labelAngle=0),
                ),
                xOffset=alt.XOffset("_x_offset:Q"),
                y=alt.Y("_miss_y:Q", title=y_title or score_col),
                color=alt.Color(f"{label_y_col}:N", title=color_title or label_y_col),
                tooltip=miss_tooltip,
            )
        )
        sep_rule = (
            alt.Chart(pl.DataFrame({"y_sep": [sep_y]}))
            .mark_rule(color="#aaa", strokeDash=[3, 3], strokeWidth=0.8)
            .encode(y=alt.Y("y_sep:Q"))
        )
        # Separator first so it is drawn underneath the points.
        layers = [sep_rule, main_strip, miss_layer]

    chart: alt.Chart | alt.LayerChart
    if len(layers) == 1:
        chart = layers[0].properties(width=width, height=height)
    else:
        chart = alt.layer(*layers).properties(width=width, height=height)

    if title:
        chart = chart.properties(title=title)

    return chart.configure_axis(
        gridColor="gray", gridDash=[3, 3], gridOpacity=0.5
    ).configure_view(strokeWidth=0)