Source code for kdiagram.plot.errors

# License: Apache 2.0
# Author: LKouadio <etanoyau@gmail.com>

import warnings
from typing import Any, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.colors import Normalize
from scipy.stats import gaussian_kde

from ..compat.matplotlib import get_cmap
from ..decorators import check_non_emptiness, isdf
from ..utils.plot import set_axis_grid
from ..utils.validator import exist_features

# Names exported by ``from kdiagram.plot.errors import *``.
__all__ = ["plot_error_ellipses", "plot_error_bands", "plot_error_violins"]



[docs]
@check_non_emptiness
@isdf
def plot_error_violins(
    df: pd.DataFrame,
    *error_cols: str,
    names: Optional[list[str]] = None,
    title: Optional[str] = None,
    figsize: tuple[float, float] = (9, 9),
    cmap: str = "viridis",
    show_grid: bool = True,
    grid_props: Optional[dict[str, Any]] = None,
    savefig: Optional[str] = None,
    dpi: int = 300,
    **violin_kws,
):
    # The user-facing docstring is attached after the definition through
    # ``plot_error_violins.__doc__``; the comments below only cover the
    # implementation.
    if not error_cols:
        raise ValueError("At least one error column must be provided.")
    exist_features(df, features=list(error_cols))

    if names and len(names) != len(error_cols):
        warnings.warn(
            f"Number of names ({len(names)}) does not match number of "
            f"error columns ({len(error_cols)}). Using default names.",
            UserWarning,
            stacklevel=2,
        )
        names = None
    if not names:
        names = [f"Model {i+1}" for i in range(len(error_cols))]

    # Drop NaNs once per column; the cleaned arrays feed both the shared
    # radial grid and the per-model density estimates.
    cleaned = [df[col].dropna().to_numpy() for col in error_cols]
    all_errors = np.concatenate(cleaned)
    if all_errors.size == 0:
        # min()/max() of an empty array would raise; bail out the same way
        # the sibling plotting functions do when nothing is left to draw.
        warnings.warn(
            "No valid (non-NaN) error values found in the selected "
            "columns. Cannot plot.",
            UserWarning,
            stacklevel=2,
        )
        return None

    # Every violin is evaluated on one common grid so that the radial
    # positions are comparable between models.
    grid = np.linspace(all_errors.min(), all_errors.max(), 200)

    violin_data = []
    for errors in cleaned:
        if len(errors) < 2:
            violin_data.append(None)  # Cannot compute KDE
            continue

        density = gaussian_kde(errors)(grid)
        # Scale to a peak of 1 so each violin fills its sector equally.
        violin_data.append(density / density.max())

    # Plot setup
    fig, ax = plt.subplots(
        figsize=figsize, subplot_kw={"projection": "polar"}
    )
    num_violins = len(error_cols)
    angles = np.linspace(0, 2 * np.pi, num_violins, endpoint=False)
    cmap_obj = get_cmap(cmap, default="viridis")
    colors = cmap_obj(np.linspace(0, 1, num_violins))

    # Resolve ``alpha`` once. Popping it inside the loop would consume the
    # caller's value on the first violin and silently fall back to the
    # default for all the others.
    alpha = violin_kws.pop("alpha", 0.6)

    # Angular width of one violin: 80% of its sector, leaving a gap
    # between neighbours.
    width = (2 * np.pi / num_violins) * 0.8

    # Draw violins
    for angle, density, color, name in zip(
        angles, violin_data, colors, names
    ):
        if density is None:
            continue

        # Closed outline: out along one side of the central angle and
        # back along the other.
        half_span = density * width / 2
        theta = np.concatenate([-half_span, np.flip(half_span)]) + angle
        r = np.concatenate([grid, np.flip(grid)])

        ax.fill(
            theta,
            r,
            color=color,
            label=name,
            alpha=alpha,
            **violin_kws,
        )

    # Add zero-error reference line
    ax.plot(
        np.linspace(0, 2 * np.pi, 100),
        np.zeros(100),
        color="black",
        linestyle="--",
        lw=1.5,
        label="Zero Error",
    )

    ax.set_title(title or "Comparison of Error Distributions")
    ax.set_yticklabels([])  # Hide radial ticks for clarity
    ax.set_xticks(angles)
    ax.set_xticklabels(names)
    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))
    set_axis_grid(ax, show_grid=show_grid, grid_props=grid_props)

    plt.tight_layout()
    if savefig:
        plt.savefig(savefig, dpi=dpi, bbox_inches="tight")
        plt.close(fig)
    else:
        plt.show()

    return ax



plot_error_violins.__doc__ = r"""
Plot polar violin plots to compare multiple error distributions.

This function creates a polar plot where each angular sector
contains a violin plot representing the error distribution of a
different model or dataset. It is a powerful tool for visually
comparing bias, variance, and the overall shape of error
distributions [1]_.

Parameters
----------
df : pd.DataFrame
    The input DataFrame containing the error data.

*error_cols : str
    One or more column names from ``df``, each containing the error
    values (e.g., ``actual - predicted``) for a model to be plotted.

names : list of str, optional
    Display names for each of the models corresponding to
    ``error_cols``. If not provided, generic names like
    ``'Model 1'`` will be generated. The list length must match
    the number of error columns.

title : str, optional
    The title for the plot. If ``None``, a default is generated.

figsize : tuple of (float, float), default=(9, 9)
    Figure size in inches.

cmap : str, default='viridis'
    Matplotlib colormap used to assign a unique color to each
    violin plot.

show_grid : bool, default=True
    Toggle gridlines via the package helper ``set_axis_grid``.

grid_props : dict, optional
    Keyword arguments passed to ``set_axis_grid`` for grid
    customization.

savefig : str, optional
    If provided, save the figure to this path; otherwise the
    plot is shown interactively.

dpi : int, default=300
    Resolution for the saved figure.

**violin_kws : dict, optional
    Additional keyword arguments passed to the ``ax.fill`` call
    for each violin (e.g., ``alpha``, ``edgecolor``).

Returns
-------
ax : matplotlib.axes.Axes or None
    The Matplotlib Axes object containing the plot, or ``None``
    if the plot could not be generated.

Notes
-----
The plot visualizes and compares several one-dimensional error
distributions. It adapts the standard violin plot [1]_ to a polar
coordinate system for multi-model comparison.


1.  **Kernel Density Estimation (KDE)**: For each model's error
    data :math:`\mathbf{x} = \{x_1, x_2, ..., x_n\}`, the
    probability density function (PDF), :math:`\hat{f}_h(x)`, is
    estimated using a Gaussian kernel. This creates a smooth curve
    representing the distribution's shape.

    .. math::

       \hat{f}_h(x) = \frac{1}{nh} \sum_{i=1}^{n} K\left(\frac{x - x_i}{h}\right)

    where :math:`K` is the Gaussian kernel and :math:`h` is the
    bandwidth, a smoothing parameter.

2.  **Violin Construction**: The violin shape is created by plotting
    the density curve :math:`\hat{f}_h(x)` symmetrically around a
    central axis. The width of the violin at any given error value
    :math:`x` is proportional to its estimated density.

3.  **Polar Arrangement**: Each model's violin is assigned a unique
    angular sector on the polar plot. The radial axis represents
    the error value, with a reference circle at :math:`r=0`
    indicating a perfect forecast. The violin is drawn radially
    within its assigned sector.

Examples
--------
>>> import numpy as np
>>> import pandas as pd
>>> from kdiagram.plot.errors import plot_error_violins
>>>
>>> # Simulate errors from three different models
>>> np.random.seed(0)
>>> n_points = 1000
>>> df_errors = pd.DataFrame({
...     'Model A (Good)': np.random.normal(
...           loc=0.5, scale=1.5, size=n_points),
...     'Model B (Biased)': np.random.normal(
...           loc=-4.0, scale=1.5, size=n_points),
...     'Model C (Inconsistent)': np.random.normal(
...           loc=0, scale=4.0, size=n_points),
... })
>>>
>>> # Generate the polar violin plot
>>> ax = plot_error_violins(
...     df_errors,
...     'Model A (Good)',
...     'Model B (Biased)',
...     'Model C (Inconsistent)',
...     title='Comparison of Model Error Distributions',
...     cmap='plasma',
...     alpha=0.7
... )

References
----------
.. [1] Hintze, J. L., & Nelson, R. D. (1998). Violin Plots: A Box
   Plot-Density Trace Synergism. The American Statistician, 52(2),
   181-184.

"""



[docs]
@check_non_emptiness
@isdf
def plot_error_bands(
    df: pd.DataFrame,
    error_col: str,
    theta_col: str,
    *,
    theta_period: Optional[float] = None,
    theta_bins: int = 24,
    n_std: float = 1.0,
    title: Optional[str] = None,
    figsize: tuple[float, float] = (8, 8),
    cmap: str = "viridis",
    show_grid: bool = True,
    grid_props: Optional[dict[str, Any]] = None,
    mask_angle: bool = False,
    savefig: Optional[str] = None,
    dpi: int = 300,
    **fill_kws,
):
    # The user-facing docstring is attached after the definition through
    # ``plot_error_bands.__doc__``. ``cmap`` is accepted but not read
    # anywhere in this body.
    exist_features(df, features=[error_col, theta_col])

    data = df[[error_col, theta_col]].dropna()
    if data.empty:
        warnings.warn(
            "DataFrame is empty after dropping NaNs in required columns.",
            UserWarning,
            stacklevel=2,
        )
        return None

    full_turn = 2 * np.pi
    raw_theta = data[theta_col]

    # Map the angular feature onto radians: wrap by the period when one
    # is given, otherwise stretch the observed range over the circle.
    if theta_period:
        data["theta_rad"] = ((raw_theta % theta_period) / theta_period) * full_turn
    else:
        lowest, highest = raw_theta.min(), raw_theta.max()
        span = highest - lowest
        if span > 1e-9:
            data["theta_rad"] = ((raw_theta - lowest) / span) * full_turn
        else:
            # Degenerate range: every sample lands at angle zero.
            data["theta_rad"] = 0

    # Equal-width angular bins, each labelled by its midpoint.
    edges = np.linspace(0, full_turn, theta_bins + 1)
    midpoints = (edges[:-1] + edges[1:]) / 2
    data["theta_bin"] = pd.cut(
        data["theta_rad"],
        bins=edges,
        labels=midpoints,
        include_lowest=True,
    )

    # Per-bin mean and standard deviation of the error.
    grouped = data.groupby("theta_bin", observed=False)[error_col]
    stats = grouped.agg(["mean", "std"]).reset_index()
    # A bin holding a single sample has an undefined std; draw it as zero.
    stats["std"] = stats["std"].fillna(0)

    bin_angle = stats["theta_bin"]
    bin_mean = stats["mean"]
    half_band = n_std * stats["std"]

    fig, ax = plt.subplots(
        figsize=figsize, subplot_kw={"projection": "polar"}
    )

    # Mean error per bin.
    ax.plot(bin_angle, bin_mean, color="black", lw=2, label="Mean Error")

    # Shaded band of +/- n_std standard deviations around the mean.
    band_alpha = fill_kws.pop("alpha", 0.3)
    ax.fill_between(
        bin_angle,
        bin_mean - half_band,
        bin_mean + half_band,
        alpha=band_alpha,
        label=f"{n_std} Std. Dev. Band",
        **fill_kws,
    )

    # Zero-error reference.
    ax.axhline(0, color="red", linestyle="--", lw=1.5, label="Zero Error")

    ax.set_title(title or f"Error Distribution vs. {theta_col}")
    ax.set_ylabel(f"Forecast Error ({error_col})")
    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))
    set_axis_grid(ax, show_grid=show_grid, grid_props=grid_props)

    if mask_angle:
        ax.set_xticklabels([])

    plt.tight_layout()
    if not savefig:
        plt.show()
    else:
        plt.savefig(savefig, dpi=dpi, bbox_inches="tight")
        plt.close(fig)
    return ax



plot_error_bands.__doc__ = r"""
Plot polar error bands to visualize systemic vs. random error.

This function aggregates forecast errors across bins of a cyclical
or ordered feature (like month or hour) and plots the mean error
and its standard deviation. It is a powerful diagnostic tool for
identifying systemic biases and variations in model performance.

Parameters
----------
df : pd.DataFrame
    The input DataFrame containing the error and feature data.

error_col : str
    Name of the column containing the forecast error values,
    typically calculated as ``actual - predicted``.

theta_col : str
    Name of the column representing the feature to bin against,
    which will be mapped to the angular axis.

theta_period : float, optional
    The period of the cyclical data in ``theta_col``. For example,
    if ``theta_col`` is the month of the year, the period is 12.
    This ensures the data wraps around the circle correctly.

theta_bins : int, default=24
    The number of angular bins to group the data into for
    calculating statistics.

n_std : float, default=1.0
    The number of standard deviations to display in the shaded
    error band around the mean error line.

title : str, optional
    The title for the plot. If ``None``, a default is generated.

figsize : tuple of (float, float), default=(8, 8)
    Figure size in inches.

cmap : str, default='viridis'
    *Note: This parameter is currently not used in this function
    as colors are fixed for clarity (black, red, and a fill color).*

show_grid : bool, default=True
    Toggle gridlines via the package helper ``set_axis_grid``.

grid_props : dict, optional
    Keyword arguments passed to ``set_axis_grid`` for grid
    customization.

mask_angle : bool, default=False
    If ``True``, hide the angular tick labels.

savefig : str, optional
    If provided, save the figure to this path; otherwise the
    plot is shown interactively.

dpi : int, default=300
    Resolution for the saved figure.

**fill_kws : dict, optional
    Additional keyword arguments passed to the ``ax.fill_between``
    call for the shaded error band (e.g., ``color``, ``alpha``).

Returns
-------
ax : matplotlib.axes.Axes or None
    The Matplotlib Axes object containing the plot, or ``None``
    if the plot could not be generated.

Notes
-----
The plot visualizes the first two moments (mean and standard
deviation) of the error distribution conditioned on the angular
variable :math:`\theta`.

1.  **Binning**: The data is first partitioned into :math:`K` bins
    based on the values in ``theta_col``. Let :math:`B_k` be the set
    of indices of data points belonging to the :math:`k`-th bin.

2.  **Mean Error Calculation**: For each bin :math:`B_k`, the mean
    error :math:`\mu_{e,k}` is calculated. This value is plotted as a
    point on the central black line.

    .. math::

       \mu_{e,k} = \frac{1}{|B_k|} \sum_{i \in B_k} e_i

    where :math:`e_i` is the error for data point :math:`i`. A
    consistent deviation of this line from the zero-error circle
    indicates a **systemic bias**.

3.  **Error Variance Calculation**: For each bin, the standard
    deviation of the error, :math:`\sigma_{e,k}`, is also calculated.

    .. math::

       \sigma_{e,k} = \sqrt{\frac{1}{|B_k|-1}
                            \sum_{i \in B_k} (e_i - \mu_{e,k})^2}

4.  **Band Construction**: A shaded band is drawn between the lower
    and upper bounds, defined by the mean plus or minus a multiple
    of the standard deviation.

    .. math::

       \text{Upper Bound}_k &= \mu_{e,k} + n_{std} \cdot \sigma_{e,k} \\
       \text{Lower Bound}_k &= \mu_{e,k} - n_{std} \cdot \sigma_{e,k}

    The width of this band indicates the **random error** or
    inconsistency of the model within that bin.

Examples
--------
>>> import numpy as np
>>> import pandas as pd
>>> from kdiagram.plot.errors import plot_error_bands
>>>
>>> # Simulate a model with seasonal error patterns
>>> np.random.seed(42)
>>> n_points = 2000
>>> day_of_year = np.arange(n_points) % 365
>>> month = (day_of_year // 30) + 1
>>>
>>> # Create a bias (positive error) in summer and more noise in winter
>>> seasonal_bias = np.sin((day_of_year - 90) * np.pi / 180) * 5
>>> seasonal_noise = 2 + 2 * np.cos(day_of_year * np.pi / 180)**2
>>> errors = seasonal_bias + np.random.normal(0, seasonal_noise, n_points)
>>>
>>> df_seasonal = pd.DataFrame({'month': month, 'forecast_error': errors})
>>>
>>> # Generate the plot
>>> ax = plot_error_bands(
...     df=df_seasonal,
...     error_col='forecast_error',
...     theta_col='month',
...     theta_period=12,
...     theta_bins=12,
...     n_std=1.5,
...     title='Seasonal Forecast Error Analysis',
...     color='#2980B9',
...     alpha=0.3
... )
"""



[docs]
@check_non_emptiness
@isdf
def plot_error_ellipses(
    df: pd.DataFrame,
    r_col: str,
    theta_col: str,
    r_std_col: str,
    theta_std_col: str,
    *,
    color_col: Optional[str] = None,
    n_std: float = 2.0,
    title: Optional[str] = None,
    figsize: tuple[float, float] = (8, 8),
    cmap: str = "viridis",
    mask_angle: bool = False,
    mask_radius: bool = False,
    show_grid: bool = True,
    grid_props: Optional[dict[str, Any]] = None,
    savefig: Optional[str] = None,
    dpi: int = 300,
    **ellipse_kws,
):
    # The user-facing docstring is attached after the definition through
    # ``plot_error_ellipses.__doc__``; the comments below only cover the
    # implementation.
    required = [r_col, theta_col, r_std_col, theta_std_col]
    if color_col:
        required.append(color_col)
    exist_features(df, features=required)

    # De-duplicate (order preserved) so that reusing one column in two
    # roles, e.g. ``color_col == r_std_col``, does not select the same
    # label twice.
    data = df[list(dict.fromkeys(required))].dropna()
    if data.empty:
        warnings.warn(
            "DataFrame is empty after dropping NaNs in "
            "required columns. Cannot plot.",
            UserWarning,
            stacklevel=2,
        )
        return None

    if color_col:
        color_data = data[color_col].to_numpy()
        cbar_label = color_col
    else:
        # Default color to radial uncertainty
        color_data = data[r_std_col].to_numpy()
        cbar_label = f"Uncertainty ({r_std_col})"

    norm = Normalize(vmin=np.min(color_data), vmax=np.max(color_data))
    cmap_obj = get_cmap(cmap, default="viridis")
    colors = cmap_obj(norm(color_data))

    fig, ax = plt.subplots(
        figsize=figsize, subplot_kw={"projection": "polar"}
    )

    # Both angular inputs are documented as degrees, while the ellipse
    # helper works with trigonometric functions in radians. Convert the
    # mean angle AND the angular standard deviation before handing over.
    r_means = data[r_col].to_numpy()
    theta_means = np.deg2rad(data[theta_col].to_numpy())
    r_stds = data[r_std_col].to_numpy()
    theta_stds = np.deg2rad(data[theta_std_col].to_numpy())

    # Plot each ellipse as a filled path
    for r_mean, theta_mean, r_std, theta_std, color in zip(
        r_means, theta_means, r_stds, theta_stds, colors
    ):
        theta_path, r_path = _get_ellipse_path(
            r_mean=r_mean,
            theta_mean=theta_mean,
            r_std=r_std,
            theta_std=theta_std,
            n_std=n_std,
        )
        ax.fill(theta_path, r_path, color=color, **ellipse_kws)

    cbar = plt.colorbar(
        plt.cm.ScalarMappable(norm=norm, cmap=cmap_obj),
        ax=ax,
        pad=0.1,
        shrink=0.75,
    )
    cbar.set_label(cbar_label, fontsize=10)

    ax.set_title(title or f"Error Ellipses ({n_std:.1f} std. dev.)")
    set_axis_grid(ax, show_grid=show_grid, grid_props=grid_props)
    if mask_angle:
        ax.set_xticklabels([])

    if mask_radius:
        ax.set_yticklabels([])

    plt.tight_layout()
    if savefig:
        plt.savefig(savefig, dpi=dpi, bbox_inches="tight")
        plt.close(fig)
    else:
        plt.show()

    return ax



def _get_ellipse_path(r_mean, theta_mean, r_std, theta_std, n_std=2.0):
    """
    Helper to calculate the path of an ellipse in polar coordinates.
    The ellipse is defined in a local Cartesian frame and then
    transformed.
    """
    # Width (radial) and height (tangential) of the ellipse
    width = n_std * r_std
    height = n_std * (r_mean * np.sin(theta_std))

    # Center of the ellipse in Cartesian coordinates
    x_c = r_mean * np.cos(theta_mean)
    y_c = r_mean * np.sin(theta_mean)

    # Generate points on a standard ellipse
    t = np.linspace(0, 2 * np.pi, 100)
    x_local = (width / 2) * np.cos(t)
    y_local = (height / 2) * np.sin(t)

    # Rotation matrix to align ellipse with the radial direction
    R = np.array(
        [
            [np.cos(theta_mean), -np.sin(theta_mean)],
            [np.sin(theta_mean), np.cos(theta_mean)],
        ]
    )

    # Rotate and translate local points
    x_rotated, y_rotated = np.dot(R, [x_local, y_local])
    x_final = x_rotated + x_c
    y_final = y_rotated + y_c

    # Convert final Cartesian points back to polar
    r_path = np.sqrt(x_final**2 + y_final**2)
    theta_path = np.arctan2(y_final, x_final)

    return theta_path, r_path


plot_error_ellipses.__doc__ = r"""
Plot polar error ellipses to visualize two-dimensional uncertainty.

This function draws ellipses on a polar plot to represent the
uncertainty of data points where both the radial and angular
components have associated errors (standard deviations).

Parameters
----------
df : pd.DataFrame
    Input DataFrame containing the data for the plot.

r_col : str
    Name of the column for the mean radial position (e.g., distance).

theta_col : str
    Name of the column for the mean angular position. **Must be in
    degrees.**

r_std_col : str
    Name of the column for the standard deviation of the radial
    position.

theta_std_col : str
    Name of the column for the standard deviation of the angular
    position. **Must be in degrees.**

color_col : str, optional
    Name of a column to use for coloring the ellipses. If ``None``,
    ellipses are colored by their radial uncertainty (``r_std_col``).

n_std : float, default=2.0
    The number of standard deviations to use for the ellipse size.
    For example, ``n_std=2.0`` corresponds to approximately a 95%
    confidence region for a normal distribution.

title : str, optional
    The title for the plot. If ``None``, a default is generated.

figsize : tuple of (float, float), default=(8, 8)
    Figure size in inches.

cmap : str, default='viridis'
    Matplotlib colormap for coloring the ellipses.

show_grid : bool, default=True
    Toggle gridlines via the package helper ``set_axis_grid``.

grid_props : dict, optional
    Keyword arguments passed to ``set_axis_grid`` for grid
    customization.

mask_angle : bool, default=False
    If ``True``, hide the angular tick labels (degrees).

mask_radius : bool, default=False
    If ``True``, hide the radial tick labels.

savefig : str, optional
    If provided, save the figure to this path; otherwise the
    plot is shown interactively.

dpi : int, default=300
    Resolution for the saved figure.

**ellipse_kws : dict, optional
    Additional keyword arguments passed to the ``ax.fill`` call
    for each ellipse (e.g., ``alpha``, ``edgecolor``).

Returns
-------
ax : matplotlib.axes.Axes or None
    The Matplotlib Axes object containing the plot, or ``None``
    if the plot could not be generated.

Notes
-----
The visualization for each data point :math:`i` is constructed
from its mean radial position :math:`\mu_{r,i}`, mean angular
position :math:`\mu_{\theta,i}`, and their respective standard
deviations :math:`\sigma_{r,i}` and :math:`\sigma_{\theta,i}`.

1.  **Ellipse Dimensions**: The ellipse is first defined in a local
    Cartesian coordinate system at the origin. Its full width (along
    the radial direction) and full height (along the tangential
    direction) are determined by the standard deviations, so each
    semi-axis is half of the corresponding value:

    .. math::

        \text{width} &= n_{std} \cdot \sigma_{r,i} \\
        \text{height} &= n_{std} \cdot (\mu_{r,i} \cdot \sin(\sigma_{\theta,i}))

    Note that the tangential height depends on the radial distance
    :math:`\mu_{r,i}`.

2.  **Transformation**: This local ellipse is then transformed to the
    correct position on the polar plot. This involves two steps:
    
    a. **Rotation**: The ellipse is rotated by the mean angle
       :math:`\mu_{\theta,i}` to align its primary axis with the
       radial direction from the origin.
    b. **Translation**: The rotated ellipse is translated to the
       mean position, which in Cartesian coordinates is
       :math:`(x_c, y_c) = (\mu_{r,i} \cos(\mu_{\theta,i}), \mu_{r,i} \sin(\mu_{\theta,i}))`.

3.  **Plotting**: The final transformed ellipse is drawn as a filled
    path on the polar axes.

Examples
--------
>>> import numpy as np
>>> import pandas as pd
>>> from kdiagram.plot.errors import plot_error_ellipses
>>>
>>> # Simulate tracking data for 15 objects
>>> np.random.seed(1)
>>> n_points = 15
>>> df_tracking = pd.DataFrame({
...     'angle_deg': np.linspace(0, 360, n_points, endpoint=False),
...     'distance_km': np.random.uniform(20, 80, n_points),
...     'distance_std': np.random.uniform(2, 7, n_points),
...     'angle_std_deg': np.random.uniform(3, 10, n_points),
...     'object_priority': np.random.randint(1, 5, n_points)
... })
>>>
>>> # Generate the plot
>>> ax = plot_error_ellipses(
...     df=df_tracking,
...     r_col='distance_km',
...     theta_col='angle_deg',
...     r_std_col='distance_std',
...     theta_std_col='angle_std_deg',
...     color_col='object_priority',
...     n_std=1.5,
...     title='1.5-Sigma Positional Uncertainty',
...     cmap='cividis',
...     alpha=0.7,
...     edgecolor='black',
...     linewidth=0.5
... )
"""