Source code for kdiagram.plot.errors

# License: Apache 2.0
# Author: LKouadio <etanoyau@gmail.com>

import warnings
from typing import Any, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.colors import Normalize
from scipy.stats import gaussian_kde

from ..compat.matplotlib import get_cmap
from ..decorators import check_non_emptiness, isdf
from ..utils.plot import set_axis_grid
from ..utils.validator import exist_features

# Public API of this module: the three polar error-diagnostic plotting
# functions defined below.
__all__ = ["plot_error_ellipses", "plot_error_bands", "plot_error_violins"]


@check_non_emptiness
@isdf
def plot_error_violins(
    df: pd.DataFrame,
    *error_cols: str,
    names: Optional[list[str]] = None,
    title: Optional[str] = None,
    figsize: tuple[float, float] = (9, 9),
    cmap: str = "viridis",
    show_grid: bool = True,
    grid_props: Optional[dict[str, Any]] = None,
    savefig: Optional[str] = None,
    dpi: int = 300,
    **violin_kws,
):
    # NOTE: the user-facing docstring is attached through
    # ``plot_error_violins.__doc__`` right after this definition.
    if not error_cols:
        raise ValueError("At least one error column must be provided.")

    exist_features(df, features=list(error_cols))

    if names and len(names) != len(error_cols):
        warnings.warn(
            f"Number of names ({len(names)}) does not match number of "
            f"error columns ({len(error_cols)}). Using default names.",
            UserWarning,
            stacklevel=2,
        )
        names = None
    if not names:
        names = [f"Model {i+1}" for i in range(len(error_cols))]

    # Drop NaNs once per column; the cleaned samples feed both the shared
    # radial grid and the per-model KDEs.
    samples = [df[col].dropna().to_numpy() for col in error_cols]
    all_errors = np.concatenate(samples)
    if all_errors.size == 0:
        # min()/max() of an empty array would raise; nothing can be drawn.
        warnings.warn(
            "No valid (non-NaN) error values found in the selected "
            "columns. Cannot plot.",
            UserWarning,
            stacklevel=2,
        )
        return None

    # All violins share one radial grid spanning the pooled error range.
    r_min, r_max = all_errors.min(), all_errors.max()
    grid = np.linspace(r_min, r_max, 200)

    violin_data = []
    for errors in samples:
        # A KDE needs at least two samples and a non-zero spread
        # (gaussian_kde fails on a singular covariance otherwise).
        if len(errors) < 2 or np.ptp(errors) == 0:
            violin_data.append(None)
            continue
        density = gaussian_kde(errors)(grid)
        # Scale to a peak of 1 so every violin has the same maximum width.
        violin_data.append(density / density.max())

    # Plot setup
    fig, ax = plt.subplots(
        figsize=figsize, subplot_kw={"projection": "polar"}
    )

    num_violins = len(error_cols)
    angles = np.linspace(0, 2 * np.pi, num_violins, endpoint=False)

    cmap_obj = get_cmap(cmap, default="viridis")
    colors = cmap_obj(np.linspace(0, 1, num_violins))

    # Resolve ``alpha`` once. Popping it inside the loop would apply a
    # user-supplied value to the first violin only.
    alpha = violin_kws.pop("alpha", 0.6)
    # Each violin may occupy up to 80% of its angular sector.
    width = (2 * np.pi / num_violins) * 0.8

    for i, (angle, density) in enumerate(zip(angles, violin_data)):
        if density is None:
            continue

        # Closed polygon: one side going out along the grid, the mirrored
        # side coming back, centred on the sector angle.
        half_width = density * width / 2
        theta = np.concatenate([-half_width, np.flip(half_width)]) + angle
        r = np.concatenate([grid, np.flip(grid)])

        ax.fill(
            theta,
            r,
            color=colors[i],
            label=names[i],
            alpha=alpha,
            **violin_kws,
        )

    # Add zero-error reference line
    ax.plot(
        np.linspace(0, 2 * np.pi, 100),
        np.zeros(100),
        color="black",
        linestyle="--",
        lw=1.5,
        label="Zero Error",
    )

    ax.set_title(title or "Comparison of Error Distributions")
    ax.set_yticklabels([])  # Hide radial ticks for clarity
    ax.set_xticks(angles)
    ax.set_xticklabels(names)
    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))
    set_axis_grid(ax, show_grid=show_grid, grid_props=grid_props)

    plt.tight_layout()
    if savefig:
        plt.savefig(savefig, dpi=dpi, bbox_inches="tight")
        plt.close(fig)
    else:
        plt.show()

    return ax
# The docstring is attached after the definition so the function body stays
# compact. The example uses the name this module actually exports.
plot_error_violins.__doc__ = r"""
Plot polar violin plots to compare multiple error distributions.

This function creates a polar plot where each angular sector
contains a violin plot representing the error distribution of a
different model or dataset. It is a powerful tool for visually
comparing bias, variance, and the overall shape of error
distributions [1]_.

Parameters
----------
df : pd.DataFrame
    The input DataFrame containing the error data.
*error_cols : str
    One or more column names from ``df``, each containing the
    error values (e.g., ``actual - predicted``) for a model to be
    plotted.
names : list of str, optional
    Display names for each of the models corresponding to
    ``error_cols``. If not provided, generic names like
    ``'Model 1'`` will be generated. The list length must match
    the number of error columns.
title : str, optional
    The title for the plot. If ``None``, a default is generated.
figsize : tuple of (float, float), default=(9, 9)
    Figure size in inches.
cmap : str, default='viridis'
    Matplotlib colormap used to assign a unique color to each
    violin plot.
show_grid : bool, default=True
    Toggle gridlines via the package helper ``set_axis_grid``.
grid_props : dict, optional
    Keyword arguments passed to ``set_axis_grid`` for grid
    customization.
savefig : str, optional
    If provided, save the figure to this path; otherwise the
    plot is shown interactively.
dpi : int, default=300
    Resolution for the saved figure.
**violin_kws : dict, optional
    Additional keyword arguments passed to the ``ax.fill`` call
    for each violin (e.g., ``alpha``, ``edgecolor``).

Returns
-------
ax : matplotlib.axes.Axes or None
    The Matplotlib Axes object containing the plot, or ``None``
    if the plot could not be generated.

Notes
-----
The plot visualizes and compares several one-dimensional error
distributions. It adapts the standard violin plot [1]_ to a polar
coordinate system for multi-model comparison.

1.  **Kernel Density Estimation (KDE)**: For each model's error
    data :math:`\mathbf{x} = \{x_1, x_2, ..., x_n\}`, the
    probability density function (PDF), :math:`\hat{f}_h(x)`, is
    estimated using a Gaussian kernel. This creates a smooth curve
    representing the distribution's shape.

    .. math::

        \hat{f}_h(x) = \frac{1}{nh} \sum_{i=1}^{n} K\left(\frac{x - x_i}{h}\right)

    where :math:`K` is the Gaussian kernel and :math:`h` is the
    bandwidth, a smoothing parameter.

2.  **Violin Construction**: The violin shape is created by
    plotting the density curve :math:`\hat{f}_h(x)` symmetrically
    around a central axis. The width of the violin at any given
    error value :math:`x` is proportional to its estimated density.

3.  **Polar Arrangement**: Each model's violin is assigned a
    unique angular sector on the polar plot. The radial axis
    represents the error value, with a reference circle at
    :math:`r=0` indicating a perfect forecast. The violin is
    drawn radially within its assigned sector.

Examples
--------
>>> import numpy as np
>>> import pandas as pd
>>> from kdiagram.plot.errors import plot_error_violins
>>>
>>> # Simulate errors from three different models
>>> np.random.seed(0)
>>> n_points = 1000
>>> df_errors = pd.DataFrame({
...     'Model A (Good)': np.random.normal(
...         loc=0.5, scale=1.5, size=n_points),
...     'Model B (Biased)': np.random.normal(
...         loc=-4.0, scale=1.5, size=n_points),
...     'Model C (Inconsistent)': np.random.normal(
...         loc=0, scale=4.0, size=n_points),
... })
>>>
>>> # Generate the polar violin plot
>>> ax = plot_error_violins(
...     df_errors,
...     'Model A (Good)',
...     'Model B (Biased)',
...     'Model C (Inconsistent)',
...     title='Comparison of Model Error Distributions',
...     cmap='plasma',
...     alpha=0.7
... )

References
----------
.. [1] Hintze, J. L., & Nelson, R. D. (1998). Violin Plots: A Box
       Plot-Density Trace Synergism. The American Statistician,
       52(2), 181-184.
"""
@check_non_emptiness
@isdf
def plot_error_bands(
    df: pd.DataFrame,
    error_col: str,
    theta_col: str,
    *,
    theta_period: Optional[float] = None,
    theta_bins: int = 24,
    n_std: float = 1.0,
    title: Optional[str] = None,
    figsize: tuple[float, float] = (8, 8),
    cmap: str = "viridis",
    show_grid: bool = True,
    grid_props: Optional[dict[str, Any]] = None,
    mask_angle: bool = False,
    savefig: Optional[str] = None,
    dpi: int = 300,
    **fill_kws,
):
    # NOTE: the user-facing docstring is attached through
    # ``plot_error_bands.__doc__`` right after this definition.
    exist_features(df, features=[error_col, theta_col])

    data = df[[error_col, theta_col]].dropna()
    if data.empty:
        warnings.warn(
            "DataFrame is empty after dropping NaNs in required columns.",
            UserWarning,
            stacklevel=2,
        )
        return None

    # --- Map the feature onto the angular axis -------------------------
    if theta_period:
        # Cyclical feature: wrap by the period, then scale to radians.
        wrapped = data[theta_col] % theta_period
        data["theta_rad"] = (wrapped / theta_period) * 2 * np.pi
    else:
        # Ordered feature: min-max scale the observed range onto the circle.
        min_theta, max_theta = data[theta_col].min(), data[theta_col].max()
        if (max_theta - min_theta) > 1e-9:
            shifted = data[theta_col] - min_theta
            data["theta_rad"] = (
                (shifted / (max_theta - min_theta)) * 2 * np.pi
            )
        else:
            # Degenerate range: every sample sits at angle zero.
            data["theta_rad"] = 0

    # --- Bin by angle; each bin is labelled with its centre angle -------
    theta_edges = np.linspace(0, 2 * np.pi, theta_bins + 1)
    bin_centers = (theta_edges[:-1] + theta_edges[1:]) / 2
    data["theta_bin"] = pd.cut(
        data["theta_rad"],
        bins=theta_edges,
        labels=bin_centers,
        include_lowest=True,
    )

    # --- Per-bin mean and standard deviation ----------------------------
    grouped = data.groupby("theta_bin", observed=False)[error_col]
    stats = grouped.agg(["mean", "std"]).reset_index()
    # A bin holding a single sample has an undefined std; treat it as 0.
    stats["std"] = stats["std"].fillna(0)

    bin_angle = stats["theta_bin"]
    mean_err = stats["mean"]
    spread = n_std * stats["std"]
    band_alpha = fill_kws.pop("alpha", 0.3)

    # --- Draw -------------------------------------------------------------
    fig, ax = plt.subplots(
        figsize=figsize, subplot_kw={"projection": "polar"}
    )

    # Mean error per bin.
    ax.plot(bin_angle, mean_err, color="black", lw=2, label="Mean Error")

    # Shaded band of +/- n_std standard deviations around the mean.
    ax.fill_between(
        bin_angle,
        mean_err - spread,
        mean_err + spread,
        alpha=band_alpha,
        label=f"{n_std} Std. Dev. Band",
        **fill_kws,
    )

    # Zero-error reference circle.
    ax.axhline(0, color="red", linestyle="--", lw=1.5, label="Zero Error")

    ax.set_title(title or f"Error Distribution vs. {theta_col}")
    ax.set_ylabel(f"Forecast Error ({error_col})")
    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))
    set_axis_grid(ax, show_grid=show_grid, grid_props=grid_props)

    if mask_angle:
        ax.set_xticklabels([])

    plt.tight_layout()
    if not savefig:
        plt.show()
    else:
        plt.savefig(savefig, dpi=dpi, bbox_inches="tight")
        plt.close(fig)

    return ax
# The docstring is attached after the definition so the function body stays
# compact. The standard-deviation formula is written without a row break
# inside ``\sqrt{...}`` so it renders as a single valid expression.
plot_error_bands.__doc__ = r"""
Plot polar error bands to visualize systemic vs. random error.

This function aggregates forecast errors across bins of a cyclical
or ordered feature (like month or hour) and plots the mean error
and its standard deviation. It is a powerful diagnostic tool for
identifying systemic biases and variations in model performance.

Parameters
----------
df : pd.DataFrame
    The input DataFrame containing the error and feature data.
error_col : str
    Name of the column containing the forecast error values,
    typically calculated as ``actual - predicted``.
theta_col : str
    Name of the column representing the feature to bin against,
    which will be mapped to the angular axis.
theta_period : float, optional
    The period of the cyclical data in ``theta_col``. For example,
    if ``theta_col`` is the month of the year, the period is 12.
    This ensures the data wraps around the circle correctly.
theta_bins : int, default=24
    The number of angular bins to group the data into for
    calculating statistics.
n_std : float, default=1.0
    The number of standard deviations to display in the shaded
    error band around the mean error line.
title : str, optional
    The title for the plot. If ``None``, a default is generated.
figsize : tuple of (float, float), default=(8, 8)
    Figure size in inches.
cmap : str, default='viridis'
    *Note: This parameter is currently not used in this function
    as colors are fixed for clarity (black, red, and a fill color).*
show_grid : bool, default=True
    Toggle gridlines via the package helper ``set_axis_grid``.
grid_props : dict, optional
    Keyword arguments passed to ``set_axis_grid`` for grid
    customization.
mask_angle : bool, default=False
    If ``True``, hide the angular tick labels.
savefig : str, optional
    If provided, save the figure to this path; otherwise the
    plot is shown interactively.
dpi : int, default=300
    Resolution for the saved figure.
**fill_kws : dict, optional
    Additional keyword arguments passed to the ``ax.fill_between``
    call for the shaded error band (e.g., ``color``, ``alpha``).

Returns
-------
ax : matplotlib.axes.Axes or None
    The Matplotlib Axes object containing the plot, or ``None``
    if the plot could not be generated.

Notes
-----
The plot visualizes the first two moments (mean and standard
deviation) of the error distribution conditioned on the angular
variable :math:`\theta`.

1.  **Binning**: The data is first partitioned into :math:`K` bins
    based on the values in ``theta_col``. Let :math:`B_k` be the
    set of indices of data points belonging to the :math:`k`-th bin.

2.  **Mean Error Calculation**: For each bin :math:`B_k`, the mean
    error :math:`\mu_{e,k}` is calculated. This value is plotted as
    a point on the central black line.

    .. math::

        \mu_{e,k} = \frac{1}{|B_k|} \sum_{i \in B_k} e_i

    where :math:`e_i` is the error for data point :math:`i`. A
    consistent deviation of this line from the zero-error circle
    indicates a **systemic bias**.

3.  **Error Variance Calculation**: For each bin, the standard
    deviation of the error, :math:`\sigma_{e,k}`, is also calculated.

    .. math::

        \sigma_{e,k} = \sqrt{\frac{1}{|B_k|-1}
        \sum_{i \in B_k} (e_i - \mu_{e,k})^2}

4.  **Band Construction**: A shaded band is drawn between the lower
    and upper bounds, defined by the mean plus or minus a multiple
    of the standard deviation.

    .. math::

        \text{Upper Bound}_k &= \mu_{e,k} + n_{std} \cdot \sigma_{e,k} \\
        \text{Lower Bound}_k &= \mu_{e,k} - n_{std} \cdot \sigma_{e,k}

    The width of this band indicates the **random error** or
    inconsistency of the model within that bin.

Examples
--------
>>> import numpy as np
>>> import pandas as pd
>>> from kdiagram.plot.errors import plot_error_bands
>>>
>>> # Simulate a model with seasonal error patterns
>>> np.random.seed(42)
>>> n_points = 2000
>>> day_of_year = np.arange(n_points) % 365
>>> month = (day_of_year // 30) + 1
>>>
>>> # Create a bias (positive error) in summer and more noise in winter
>>> seasonal_bias = np.sin((day_of_year - 90) * np.pi / 180) * 5
>>> seasonal_noise = 2 + 2 * np.cos(day_of_year * np.pi / 180)**2
>>> errors = seasonal_bias + np.random.normal(0, seasonal_noise, n_points)
>>>
>>> df_seasonal = pd.DataFrame({'month': month, 'forecast_error': errors})
>>>
>>> # Generate the plot
>>> ax = plot_error_bands(
...     df=df_seasonal,
...     error_col='forecast_error',
...     theta_col='month',
...     theta_period=12,
...     theta_bins=12,
...     n_std=1.5,
...     title='Seasonal Forecast Error Analysis',
...     color='#2980B9',
...     alpha=0.3
... )
"""
@check_non_emptiness
@isdf
def plot_error_ellipses(
    df: pd.DataFrame,
    r_col: str,
    theta_col: str,
    r_std_col: str,
    theta_std_col: str,
    *,
    color_col: Optional[str] = None,
    n_std: float = 2.0,
    title: Optional[str] = None,
    figsize: tuple[float, float] = (8, 8),
    cmap: str = "viridis",
    mask_angle: bool = False,
    mask_radius: bool = False,
    show_grid: bool = True,
    grid_props: Optional[dict[str, Any]] = None,
    savefig: Optional[str] = None,
    dpi: int = 300,
    **ellipse_kws,
):
    # NOTE: the user-facing docstring is attached through
    # ``plot_error_ellipses.__doc__`` further down in this module.
    required = [r_col, theta_col, r_std_col, theta_std_col]
    if color_col:
        required.append(color_col)
    # De-duplicate while keeping order: if ``color_col`` repeats one of the
    # other columns, ``df[required]`` would carry the column twice and each
    # per-column lookup below would return a 2-D frame instead of a vector.
    required = list(dict.fromkeys(required))

    exist_features(df, features=required)

    data = df[required].dropna()
    if data.empty:
        warnings.warn(
            "DataFrame is empty after dropping NaNs in "
            "required columns. Cannot plot.",
            UserWarning,
            stacklevel=2,
        )
        return None

    if color_col:
        color_data = data[color_col].to_numpy()
        cbar_label = color_col
    else:
        # Default color to radial uncertainty
        color_data = data[r_std_col].to_numpy()
        cbar_label = f"Uncertainty ({r_std_col})"

    norm = Normalize(vmin=np.min(color_data), vmax=np.max(color_data))
    cmap_obj = get_cmap(cmap, default="viridis")
    colors = cmap_obj(norm(color_data))

    fig, ax = plt.subplots(
        figsize=figsize, subplot_kw={"projection": "polar"}
    )

    # Both angular columns are given in degrees. The helper works in
    # radians (it feeds them to np.cos / np.sin), so the mean angle AND
    # the angular standard deviation are both converted here.
    r_means = data[r_col].to_numpy()
    theta_means = np.deg2rad(data[theta_col].to_numpy(dtype=float))
    r_stds = data[r_std_col].to_numpy()
    theta_stds = np.deg2rad(data[theta_std_col].to_numpy(dtype=float))

    # Plot each ellipse as a filled path
    for r_mean, theta_mean, r_std, theta_std, color in zip(
        r_means, theta_means, r_stds, theta_stds, colors
    ):
        theta_path, r_path = _get_ellipse_path(
            r_mean=r_mean,
            theta_mean=theta_mean,
            r_std=r_std,
            theta_std=theta_std,
            n_std=n_std,
        )
        ax.fill(theta_path, r_path, color=color, **ellipse_kws)

    cbar = plt.colorbar(
        plt.cm.ScalarMappable(norm=norm, cmap=cmap_obj),
        ax=ax,
        pad=0.1,
        shrink=0.75,
    )
    cbar.set_label(cbar_label, fontsize=10)

    ax.set_title(title or f"Error Ellipses ({n_std:.1f} std. dev.)")
    set_axis_grid(ax, show_grid=show_grid, grid_props=grid_props)

    if mask_angle:
        ax.set_xticklabels([])
    if mask_radius:
        ax.set_yticklabels([])

    plt.tight_layout()
    if savefig:
        plt.savefig(savefig, dpi=dpi, bbox_inches="tight")
        plt.close(fig)
    else:
        plt.show()

    return ax
def _get_ellipse_path(r_mean, theta_mean, r_std, theta_std, n_std=2.0):
    """
    Compute the outline of one uncertainty ellipse in polar coordinates.

    The ellipse is built in a local Cartesian frame centred on the
    origin, rotated by ``theta_mean``, translated to the mean position
    and finally converted back to polar coordinates.

    Parameters
    ----------
    r_mean : float
        Mean radial position (ellipse centre).
    theta_mean : float
        Mean angular position, passed to ``np.cos`` / ``np.sin`` and
        therefore interpreted as radians.
    r_std : float
        Standard deviation of the radial position.
    theta_std : float
        Standard deviation of the angular position. It is passed to
        ``np.sin`` as-is, so it is interpreted as radians; callers
        holding degrees must convert first.
    n_std : float, default=2.0
        Multiplier applied to both standard deviations.

    Returns
    -------
    theta_path, r_path : np.ndarray
        Angles (from ``np.arctan2``) and radii of 100 points tracing
        the ellipse outline.
    """
    # Extents of the ellipse along the radial and tangential directions.
    width = n_std * r_std
    height = n_std * (r_mean * np.sin(theta_std))

    # Centre of the ellipse in Cartesian coordinates.
    cos_mean = np.cos(theta_mean)
    sin_mean = np.sin(theta_mean)
    x_c = r_mean * cos_mean
    y_c = r_mean * sin_mean

    # Points on an axis-aligned ellipse around the origin.
    # NOTE(review): the semi-axes used here are width / 2 and height / 2,
    # i.e. n_std * sigma / 2, whereas the public docstring describes
    # n_std * sigma as the *half*-width / *half*-height. Confirm which is
    # intended before changing; the numerics are left untouched here.
    t = np.linspace(0, 2 * np.pi, 100)
    x_local = (width / 2) * np.cos(t)
    y_local = (height / 2) * np.sin(t)

    # Rotation that aligns the local x-axis with the radial direction.
    rotation = np.array(
        [
            [cos_mean, -sin_mean],
            [sin_mean, cos_mean],
        ]
    )

    # Rotate, then translate to the mean position.
    x_rotated, y_rotated = np.dot(rotation, [x_local, y_local])
    x_final = x_rotated + x_c
    y_final = y_rotated + y_c

    # Back to polar coordinates.
    r_path = np.sqrt(x_final**2 + y_final**2)
    theta_path = np.arctan2(y_final, x_final)

    return theta_path, r_path


# The docstring is attached after the definition so the function body stays
# compact. The example uses the name this module actually exports.
plot_error_ellipses.__doc__ = r"""
Plot polar error ellipses to visualize two-dimensional uncertainty.

This function draws ellipses on a polar plot to represent the
uncertainty of data points where both the radial and angular
components have associated errors (standard deviations).

Parameters
----------
df : pd.DataFrame
    Input DataFrame containing the data for the plot.
r_col : str
    Name of the column for the mean radial position (e.g., distance).
theta_col : str
    Name of the column for the mean angular position.
    **Must be in degrees.**
r_std_col : str
    Name of the column for the standard deviation of the radial
    position.
theta_std_col : str
    Name of the column for the standard deviation of the angular
    position. **Must be in degrees.**
color_col : str, optional
    Name of a column to use for coloring the ellipses. If ``None``,
    ellipses are colored by their radial uncertainty (``r_std_col``).
n_std : float, default=2.0
    The number of standard deviations to use for the ellipse size.
    For example, ``n_std=2.0`` corresponds to approximately a 95%
    confidence region for a normal distribution.
title : str, optional
    The title for the plot. If ``None``, a default is generated.
figsize : tuple of (float, float), default=(8, 8)
    Figure size in inches.
cmap : str, default='viridis'
    Matplotlib colormap for coloring the ellipses.
show_grid : bool, default=True
    Toggle gridlines via the package helper ``set_axis_grid``.
grid_props : dict, optional
    Keyword arguments passed to ``set_axis_grid`` for grid
    customization.
mask_angle : bool, default=False
    If ``True``, hide the angular tick labels (degrees).
mask_radius : bool, default=False
    If ``True``, hide the radial tick labels.
savefig : str, optional
    If provided, save the figure to this path; otherwise the
    plot is shown interactively.
dpi : int, default=300
    Resolution for the saved figure.
**ellipse_kws : dict, optional
    Additional keyword arguments passed to the ``ax.fill`` call
    for each ellipse (e.g., ``alpha``, ``edgecolor``).

Returns
-------
ax : matplotlib.axes.Axes or None
    The Matplotlib Axes object containing the plot, or ``None``
    if the plot could not be generated.

Notes
-----
The visualization for each data point :math:`i` is constructed from
its mean radial position :math:`\mu_{r,i}`, mean angular position
:math:`\mu_{\theta,i}`, and their respective standard deviations
:math:`\sigma_{r,i}` and :math:`\sigma_{\theta,i}`.

1.  **Ellipse Dimensions**: The ellipse is first defined in a local
    Cartesian coordinate system at the origin. Its half-width (along
    the radial direction) and half-height (along the tangential
    direction) are determined by the standard deviations:

    .. math::

        \text{width} &= n_{std} \cdot \sigma_{r,i} \\
        \text{height} &= n_{std} \cdot (\mu_{r,i} \cdot \sin(\sigma_{\theta,i}))

    Note that the tangential height depends on the radial distance
    :math:`\mu_{r,i}`.

2.  **Transformation**: This local ellipse is then transformed to
    the correct position on the polar plot. This involves two steps:

    a. **Rotation**: The ellipse is rotated by the mean angle
       :math:`\mu_{\theta,i}` to align its primary axis with the
       radial direction from the origin.
    b. **Translation**: The rotated ellipse is translated to the
       mean position, which in Cartesian coordinates is
       :math:`(x_c, y_c) = (\mu_{r,i} \cos(\mu_{\theta,i}), \mu_{r,i} \sin(\mu_{\theta,i}))`.

3.  **Plotting**: The final transformed ellipse is drawn as a
    filled path on the polar axes.

Examples
--------
>>> import numpy as np
>>> import pandas as pd
>>> from kdiagram.plot.errors import plot_error_ellipses
>>>
>>> # Simulate tracking data for 15 objects
>>> np.random.seed(1)
>>> n_points = 15
>>> df_tracking = pd.DataFrame({
...     'angle_deg': np.linspace(0, 360, n_points, endpoint=False),
...     'distance_km': np.random.uniform(20, 80, n_points),
...     'distance_std': np.random.uniform(2, 7, n_points),
...     'angle_std_deg': np.random.uniform(3, 10, n_points),
...     'object_priority': np.random.randint(1, 5, n_points)
... })
>>>
>>> # Generate the plot
>>> ax = plot_error_ellipses(
...     df=df_tracking,
...     r_col='distance_km',
...     theta_col='angle_deg',
...     r_std_col='distance_std',
...     theta_std_col='angle_std_deg',
...     color_col='object_priority',
...     n_std=1.5,
...     title='1.5-Sigma Positional Uncertainty',
...     cmap='cividis',
...     alpha=0.7,
...     edgecolor='black',
...     linewidth=0.5
... )
"""