Source code for kdiagram.plot.uncertainty

#   License: Apache-2.0
#   Author: LKouadio <etanoyau@gmail.com>

"""
Specialized diagnostic polar plots ('kdiagrams', named after author
Kouadio) designed for comprehensive model evaluation and forecast
analysis. Provides functions to visualize
prediction uncertainty, model drift, interval coverage, anomaly
magnitude, actual vs. predicted performance, feature influence, and
related diagnostics using polar coordinates.
"""

from __future__ import annotations

import textwrap
import warnings
from typing import Any, Literal

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.axes import Axes
from matplotlib.colors import Normalize
from scipy.stats import gaussian_kde

from ..api.summary import ResultSummary
from ..api.typing import Acov
from ..compat.matplotlib import get_cmap
from ..compat.sklearn import validate_params
from ..decorators import check_non_emptiness, isdf
from ..utils.diagnose_q import (
    build_qcols_multiple,
    detect_quantiles_in,
    validate_qcols,
)
from ..utils.fs import savefig as safe_savefig
from ..utils.handlers import columns_manager
from ..utils.plot import (
    _sample_colors,
    map_theta_to_span,
    set_axis_grid,
    setup_polar_axes,
)
from ..utils.validator import (
    _assert_all_types,
    exist_features,
)

__all__ = [
    "plot_actual_vs_predicted",
    "plot_anomaly_magnitude",
    "plot_coverage",
    "plot_coverage_diagnostic",
    "plot_interval_consistency",
    "plot_interval_width",
    "plot_model_drift",
    "plot_temporal_uncertainty",
    "plot_uncertainty_drift",
    "plot_velocity",
    "plot_radial_density_ring",
    "plot_polar_heatmap",
    "plot_polar_quiver",
]


[docs] @validate_params({"y_true": ["array-like"]}) def plot_coverage( y_true, *y_preds, names: str | list[str] | None = None, q: str | list[str] | None = None, kind: str = "line", cmap: str = "viridis", pie_startangle: int | float = 140, pie_autopct: str = "%1.1f%%", radar_color: str = "tab:blue", radar_fill_alpha: float = 0.25, radar_line_style: str = "o-", cov_fill: bool = False, figsize: tuple[str, str] = None, title: str | None = None, dpi: int = 300, savefig: str | None = None, verbose: int = 1, ax: Axes | None = None, ): # Convert the true values to a numpy array for consistency y_true = np.array(y_true) # Count how many model predictions were passed via *y_preds. num_models = len(y_preds) # Handle model names: create or extend to match the number of models. names = columns_manager(names, to_string=True) if names is None: names = [f"Model_{i + 1}" for i in range(num_models)] else: if len(names) < num_models: extra = num_models - len(names) for _i in range(extra): names.append(f"Model_{len(names) + 1}") coverage_scores = [] q = columns_manager(q) # Handle quantiles if q is not None: q = np.array(q) if q.ndim != 1: raise ValueError( "Parameter 'q' must be a 1D list or array of quantile levels." ) if not np.all((0 < q) & (q < 1)): raise ValueError("Quantile levels must be between 0 and 1.") # Sort q and get the sorted indices sorted_indices = np.argsort(q) q_sorted = q[sorted_indices] else: q_sorted = None # Compute coverage for each model in *y_preds. for _i, pred in enumerate(y_preds): pred = np.array(pred) if pred.ndim == 2: if q_sorted is not None: pred_sorted = pred[:, sorted_indices] else: pred_sorted = np.sort(pred, axis=1) lower_q = pred_sorted[:, 0] upper_q = pred_sorted[:, -1] in_interval = ((y_true >= lower_q) & (y_true <= upper_q)).astype( int ) coverage = np.mean(in_interval) elif pred.ndim == 1: # Point forecast coverage as fraction of exact matches matches = (y_true == pred).astype(int) coverage = np.mean(matches) else: coverage = None coverage_scores.append(coverage) # Prepare data for plotting. Replace None with 0 for convenience. valid_cov = [c if c is not None else 0 for c in coverage_scores] x_idx = np.arange(num_models) # ---- Axes / Figure handling (use ax if provided) ---------------------- if kind in {"bar", "line", "pipe", "pie"}: if ax is None: if figsize is not None: fig, ax = plt.subplots(figsize=figsize) else: fig, ax = plt.subplots() else: fig = ax.figure elif kind == "radar": if ax is None: # create a polar axes if none is provided if figsize is not None: fig, ax = plt.subplots( figsize=figsize, subplot_kw={"projection": "polar"} ) else: fig, ax = plt.subplots(subplot_kw={"projection": "polar"}) else: fig = ax.figure if getattr(ax, "name", "") != "polar": raise ValueError( "For kind='radar', the provided 'ax' must be a polar Axes." ) else: # No plotting; console output only fig = None # ---- Draw according to 'kind' ----------------------------------------- if kind == "bar": ax.bar(x_idx, valid_cov, color="blue", alpha=0.7) for idx, val in enumerate(coverage_scores): if val is not None: ax.text( x=idx, y=val + 0.01, s=f"{val:.2f}", ha="center", va="bottom", ) ax.set_xticks(x_idx) ax.set_xticklabels(names) ax.set_ylim([0, 1]) ax.set_ylabel("Coverage") ax.set_xlabel("Models") elif kind == "line": ax.plot(x_idx, valid_cov, marker="o") for idx, val in enumerate(coverage_scores): if val is not None: ax.text( x=idx, y=val + 0.01, s=f"{val:.2f}", ha="center", va="bottom", ) ax.set_xticks(x_idx) ax.set_xticklabels(names) ax.set_ylim([0, 1]) ax.set_ylabel("Coverage") ax.set_xlabel("Models") elif kind in {"pie", "pipe"}: # keep your original alias total_cov = sum(valid_cov) if total_cov == 0: ax.text(0.5, 0.5, "No coverage to plot", ha="center", va="center") else: ax.pie( valid_cov, labels=names, autopct=pie_autopct, startangle=pie_startangle, colors=get_cmap(cmap)(np.linspace(0, 1, num_models)), ) ax.axis("equal") # Make the pie chart a perfect circle. elif kind == "radar": N = num_models angles = np.linspace(0, 2 * np.pi, N, endpoint=False) angles = np.concatenate((angles, [angles[0]])) coverage_radar = np.concatenate((valid_cov, [valid_cov[0]])) # Plot main coverage line ax.plot( angles, coverage_radar, radar_line_style, color=radar_color, label="Coverage", ) if cov_fill: if num_models == 1: coverage_value = valid_cov[0] theta = np.linspace(0, 2 * np.pi, 100) r = np.linspace(0, coverage_value, 100) R, Theta = np.meshgrid(r, theta) ax.grid(False) ax.pcolormesh( Theta, R, R, cmap=cmap, shading="auto", alpha=radar_fill_alpha, zorder=0, ) ax.grid(True, which="both") ax.plot( theta, [coverage_value] * len(theta), color="red", linewidth=2, linestyle="-", ) ax.set_ylim(0, 1) ax.set_yticks([0.2, 0.4, 0.6, 0.8]) ax.yaxis.grid( True, color="gray", linestyle="--", linewidth=0.5, alpha=0.7, ) else: ax.fill( angles, coverage_radar, color=radar_color, alpha=radar_fill_alpha, zorder=0, ) # Wrap each label to multiple lines (keeps parentheticals readable) wrapped_names = [ textwrap.fill( n, width=18, break_long_words=False, break_on_hyphens=False ) for n in names ] ax.set_thetagrids( angles[:-1] * 180 / np.pi, labels=wrapped_names, ) # Push labels outward from the circle a bit (points) pad = 8 if len(names) <= 4 else (10 if len(names) <= 6 else 12) ax.tick_params(axis="x", pad=pad) # add radial padding for labels # ax.set_thetagrids(angles[:-1] * 180 / np.pi, labels=names) # Optional: if there are many models (crowded), gently reduce fontsize if len(names) >= 6: for lbl in ax.get_xticklabels(): lbl.set_fontsize(max(lbl.get_fontsize() - 1, 8)) ax.set_ylim(0, 1) ax.legend(loc="upper right") else: # Fallback: print coverage scores to the console for each model. for idx, val in enumerate(coverage_scores): print(f"{names[idx]} coverage: {val}") # ---- Summary / titles / saving ---------------------------------------- if verbose: cov_dict = { names[idx]: cov for idx, cov in enumerate(coverage_scores) } summary = ResultSummary("CoverageScores").add_results(cov_dict) print(summary) if title is not None and fig is not None: # Title for this axes ax.set_title(title) final = safe_savefig( savefig, fig, dpi=dpi, bbox_inches="tight", ) if final is not None: plt.close(fig) else: plt.tight_layout() plt.show() return ax if fig is not None else None
plot_coverage.__doc__ = r""" Plot overall coverage scores for forecast intervals or points. Computes and visualizes the empirical coverage rate, which is the fraction of times the true values fall within predicted quantile intervals (or match point forecasts). Supports comparing multiple models or prediction sets using various chart types. This plot provides a high-level summary of whether prediction intervals are well-calibrated on average across the entire dataset. It's useful for quickly comparing the overall reliability of uncertainty estimates from different models or parameter settings before diving into finer diagnostics :footcite:p:`kouadiob2025`. Parameters ---------- y_true : array-like of shape (n_samples,) The ground truth target values. Must be a 1D array-like object (list, NumPy array, pandas Series). *y_preds : array-like Variable number of prediction arrays. Each positional argument should be an array-like object corresponding to a model or prediction set. The shape determines interpretation: - shape (n_samples, n_quantiles): Represents quantile forecasts (e.g., lower, median, upper bounds). Coverage is calculated based on the interval defined by the minimum and maximum quantile columns provided (after sorting based on `q` if given, or by value otherwise). Requires `q` parameter for proper interval definition if quantiles are not ordered. - shape (n_samples,): Represents point forecasts. Coverage is calculated as the proportion of exact matches to `y_true` (typically near zero for continuous data). names : list of str, optional Labels for each prediction set provided in `*y_preds`. Used in legends and axis labels. If `None` or shorter than the number of prediction sets, default names like "Model_1", "Model_2", etc., are generated for the missing ones. Default is ``None``. q : list of float, optional Quantile levels corresponding to the columns in 2D `y_preds` arrays. Values must be between 0 and 1. Used to identify and sort quantile columns correctly before determining the interval bounds (minimum and maximum specified quantile) for coverage calculation. Required if `y_preds` contains 2D arrays representing unordered quantiles. Default is ``None``. kind : {'line', 'bar', 'pie', 'radar'}, optional The type of plot to generate for visualizing coverage scores. Default is ``'line'``. - ``'line'``: Line chart connecting coverage scores for each prediction set. - ``'bar'``: Bar chart showing the coverage score for each prediction set. - ``'pie'``: Pie chart where each slice represents a model's coverage score as a fraction of the sum of all scores. - ``'radar'``: Radar (spider) chart where each axis represents a prediction set, and the distance along the axis indicates its coverage score. cmap : str, optional Matplotlib colormap name used for coloring slices in the `'pie'` chart or the gradient fill in single-model radar plots when ``cov_fill=True``. Default is ``'viridis'``. pie_startangle : float, optional Starting angle in degrees for the first slice in the `'pie'` chart. Rotates the chart orientation. Default is 140. pie_autopct : str or None, optional String formatting pattern (e.g., ``'%1.1f%%'``) used to label pie slices with their numeric value (percentage). If `None`, no numeric labels are shown. Default is ``'%1.1f%%'``. radar_color : str, optional Color specification for the main line and optional fill in the `'radar'` chart. Accepts any valid Matplotlib color. Default is ``'tab:blue'``. radar_fill_alpha : float, optional Transparency level (alpha) for the filled area in the `'radar'` chart when ``cov_fill=True``. Value between 0 (transparent) and 1 (opaque). Default is 0.25. radar_line_style : str, optional Matplotlib line and marker style string for the radar chart outline (e.g., ``'o-'``, ``'--'``, ``':'``). Default is ``'o-'``. cov_fill : bool, optional If ``True`` and ``kind='radar'``, fills the area under the radar plot lines. For a single model, creates a radial gradient; for multiple models, fills polygons. Default is ``False``. figsize : tuple of (float, float), optional Figure size ``(width, height)`` in inches. If ``None``, uses Matplotlib's default figure size. title : str, optional Optional title for the plot. Default is ``None``. savefig : str, optional If not ``None``, specifies the full path (including filename and extension, e.g., 'coverage.png') to save the figure. If ``None``, the plot is displayed interactively. Default is ``None``. verbose : {0, 1}, optional Controls printing of coverage scores to the console. - ``0``: Silent. - ``1``: Prints a formatted summary of coverage scores. Default is ``1``. Returns ------- None This function generates and displays or saves a Matplotlib plot but does not return any object. Raises ------ ValueError If `q` contains values outside (0, 1), or if `kind` is not one of the allowed options, or if dimensions of `y_preds` and `q` are incompatible. TypeError If inputs `y_true` or `y_preds` contain non-numeric data. See Also -------- plot_coverage_diagnostic : Plot point-wise coverage status. numpy.mean : Used to calculate the average coverage rate. Notes ----- The empirical coverage rate is a key metric for assessing the calibration of probabilistic forecasts, particularly prediction intervals :footcite:p:`kouadiob2025`. **Calculation:** - For quantile intervals (2D `y_preds`), the interval for each sample :math:`i` is defined by the minimum and maximum predicted values across the specified quantiles for that sample, :math:`[\hat{y}_{i}^{(\ell)}, \hat{y}_{i}^{(u)}]`. Coverage is: .. math:: \text{Coverage} = \frac{1}{N}\sum_{i=1}^{N} \mathbf{1}\{\hat{y}_{i}^{(\ell)} \leq y_i \leq \hat{y}_{i}^{(u)}\} where :math:`\mathbf{1}\{\cdot\}` is 1 if true, 0 otherwise. - For point forecasts (1D `y_preds`), coverage is the proportion of exact matches: .. math:: \text{Coverage} = \frac{1}{N}\sum_{i=1}^{N} \mathbf{1}\{ \hat{y}_i = y_i \} This is typically very low for continuous :math:`y_true`. **Interpretation:** Compare the calculated coverage score(s) to the nominal coverage rate implied by the quantiles (e.g., 80% for a Q10-Q90 interval). Scores far from the nominal rate suggest miscalibration (intervals too wide or too narrow on average) :footcite:p:`Gneiting2007b`. **Radar Plot Fill (`cov_fill=True`):** - Single model: Shows a circular gradient fill from the center out to the calculated coverage score radius, with a solid reference line at the coverage score. - Multiple models: Fills the polygon defined by the coverage scores for all models with a semi-transparent color. Examples -------- >>> import numpy as np >>> from kdiagram.plot.uncertainty import plot_coverage >>> # True values >>> y_true = np.random.rand(100) * 10 >>> # 3-quantile predictions (Q10, Q50, Q90) for two models >>> y_pred_A = np.sort(np.random.normal( ... loc=9, scale=2, size=(100, 3)), axis=1) >>> y_pred_B = np.sort(np.random.normal( ... loc=11, scale=3, size=(100, 3)), axis=1) >>> q_levels = [0.1, 0.5, 0.9] >>> # Bar chart comparison >>> plot_coverage(y_true, y_pred_A, y_pred_B, q=q_levels, ... names=['Model A', 'Model B'], kind='bar', ... title='Coverage Comparison (Bar)') >>> # Radar chart comparison with fill >>> plot_coverage(y_true, y_pred_A, y_pred_B, q=q_levels, ... names=['Model A', 'Model B'], kind='radar', ... title='Coverage Comparison (Radar)', ... cov_fill=True, cmap='Set2') References ---------- .. footbibliography:: """
[docs] @check_non_emptiness @isdf def plot_model_drift( df: pd.DataFrame, q_cols: list | None = None, q10_cols: list[str] | None = None, q90_cols: list[str] | None = None, horizons: list[str | int] | None = None, color_metric_cols: list[str] | None = None, acov: str = "quarter_circle", value_label: str = "Uncertainty Width (Q90 - Q10)", cmap: str = "coolwarm", figsize: tuple[int, int] = (8, 8), title: str = "Model Forecast Drift Over Time", show_grid: bool = True, annotate: bool = True, grid_props: dict | None = None, savefig: str | None = None, dpi: int = 300, ax: Axes | None = None, ): # # 1. Pair quantile columns q_cols = build_qcols_multiple( q_cols, qlow_cols=q10_cols, qup_cols=q90_cols, ) n_horizons = len(q_cols) # Default angular labels if horizons is None: horizons = [f"Horizon {idx + 1}" for idx in range(n_horizons)] # # 2. Compute average inter-quantile width per horizon widths = np.array([(df[q90] - df[q10]).mean() for q10, q90 in q_cols]) # Secondary colouring metric if color_metric_cols is not None: colour_vals = np.array([df[col].mean() for col in color_metric_cols]) else: colour_vals = widths # # 3. Angular span selection # span = { "default": 2 * np.pi, "half_circle": np.pi, "quarter_circle": np.pi / 2, "eighth_circle": np.pi / 4, }.get(acov, 2 * np.pi) theta = np.linspace(0.0, span, n_horizons, endpoint=False) # Scale radii when angular coverage < full circle radii = widths / widths.max() if span < 2 * np.pi else widths # # 4. Figure setup if ax is None: fig, ax = plt.subplots( figsize=figsize, subplot_kw={"projection": "polar"}, ) else: fig = ax.figure # Orient polar chart: 0° at the top, clockwise direction ax.set_theta_offset(np.pi / 2) ax.set_theta_direction(-1) ax.set_thetamin(0) ax.set_thetamax(np.degrees(span)) # Colormap normalisation norm = Normalize(vmin=colour_vals.min(), vmax=colour_vals.max()) colours = get_cmap(cmap)(norm(colour_vals)) # # 5. Draw bars bar_width = (span / n_horizons) * 0.9 # slight gap between bars ax.bar( theta, radii, width=bar_width, color=colours, edgecolor="k", alpha=0.85, linewidth=0.8, ) # Annotation if annotate: for ang, rad, raw in zip(theta, radii, widths): label = f"{raw:.2f}" ax.text( ang, rad + 0.03 * radii.max(), label, ha="center", va="bottom", fontsize=9, ) # Ticks & labels # Wrap long labels over 1–2 lines so they don’t collide with the rim wrap_width = 32 if n_horizons <= 5 else (28 if n_horizons <= 8 else 24) labels = [ textwrap.fill( str(h), width=wrap_width, break_long_words=False, break_on_hyphens=False, ) for h in horizons ] ax.set_xticks(theta) ax.set_xticklabels(labels) # Push labels outward from the circle (points) # Use a bit more pad when there are many labels pad_pts = 10 if n_horizons <= 5 else (12 if n_horizons <= 8 else 14) ax.tick_params(axis="x", pad=pad_pts) # Keep radial ticks hidden and place the radial ylabel inside the axes ax.set_yticklabels([]) ax.set_ylabel(value_label, labelpad=52) # adjust as you like # Optional grid set_axis_grid(ax, show_grid, grid_props=grid_props) # 6. Output handling final = safe_savefig( savefig, fig, dpi=dpi, bbox_inches="tight", ) if final is not None: plt.close(fig) else: fig.tight_layout() plt.show() return ax
plot_model_drift.__doc__ = r""" Visualize forecast drift across prediction horizons. Renders a polar bar chart depicting how average model uncertainty (or another metric) evolves as the forecast horizon increases. Each bar corresponds to a specific forecast horizon, arranged angularly. This plot is crucial for diagnosing model degradation over longer lead times, often termed *concept drift* or *model aging* [1]_. A distinct increase in bar height (radius) for later horizons signals inflating uncertainty or error, potentially indicating a need for model retraining or adjustments to account for changing dynamics. Use this visualization to assess if your model's reliability holds as you forecast further into the future. Parameters ---------- df : pandas.DataFrame Input DataFrame containing the necessary quantile columns (or columns specified in ``color_metric_cols``) for each forecast horizon. q_cols : list[tuple[str, str]], optional A list where each element is a tuple containing the column names for the lower and upper quantiles for a specific horizon, e.g., ``[('q10_h1', 'q90_h1'), ('q10_h2', 'q90_h2')]``. If ``None``, ``q10_cols`` and ``q90_cols`` must be provided instead. Default is ``None``. q10_cols : list[str], optional List of column names representing the lower quantile (e.g., 10th percentile) for each successive horizon. Must be provided if ``q_cols`` is ``None``. Must have the same length as ``q90_cols`` and ``horizons`` (if provided). Default is ``None``. q90_cols : list[str], optional List of column names representing the upper quantile (e.g., 90th percentile) for each successive horizon. Must be provided if ``q_cols`` is ``None``. Must have the same length as ``q10_cols`` and ``horizons`` (if provided). Default is ``None``. horizons : list of str or int, optional Labels corresponding to each forecast horizon (plotted on the angular axis). The order must match the order in ``q_cols`` or ``q10_cols``/``q90_cols``. If ``None``, generic labels like "Horizon 1", "Horizon 2", ... are generated. Default is ``None``. color_metric_cols : list of str, optional If provided, the bars are colored based on the mean value of these columns for each horizon (e.g., provide RMSE columns like ``['rmse_h1', 'rmse_h2', ...]``). If ``None``, bars are colored based on the calculated mean interval width (Q90-Q10). Default is ``None``. acov : {'default', 'half_circle', 'quarter_circle', \ 'eighth_circle'}, optional Specifies the angular coverage (span) of the plot. Use narrower sectors for fewer horizons. Default is ``'quarter_circle'``. - ``'default'``: Full circle (:math:`2\pi`, 360°). - ``'half_circle'``: Half circle (:math:`\pi`, 180°). - ``'quarter_circle'``: Quarter circle (:math:`\pi/2`, 90°). - ``'eighth_circle'``: Eighth circle (:math:`\pi/4`, 45°). value_label : str, optional Label displayed for the radial axis, describing the metric represented by the bar height. Default is ``'Uncertainty Width (Q90 - Q10)'``. cmap : str, optional Name of the Matplotlib colormap used to color the bars based on their radial value (or the `color_metric_cols` value). Default is ``'coolwarm'``. figsize : tuple of (float, float), optional Figure dimensions ``(width, height)`` in inches. If ``None``, uses the default ``(8, 8)``. title : str, optional Headline text displayed above the plot. Default is ``'Model Forecast Drift Over Time'``. show_grid : bool, optional If ``True``, displays radial and angular grid lines to aid interpretation. Default is ``True``. annotate : bool, optional If ``True``, displays the numeric value (mean width or mean color metric) on top of each bar. Default is ``True``. grid_props : dict, optional Dictionary of keyword arguments to customize the appearance of the grid lines (passed to ``ax.grid()``). E.g., ``{'linestyle': ':', 'linewidth': 0.5}``. Default is ``None``. savefig : str, optional Full path and filename to save the plot (e.g., 'drift.pdf'). If ``None``, the plot is displayed interactively. Default is ``None``. Returns ------- matplotlib.axes.Axes The polar axes object containing the bar chart, allowing for further customization if desired. Raises ------ ValueError If required quantile columns (`q_cols` or both `q10_cols` and `q90_cols`) are missing from `df`, or if the lengths of provided column lists/horizons mismatch. TypeError If data in the specified columns cannot be processed numerically. See Also -------- kdiagram.plot.uncertainty.plot_uncertainty_drift : Visualize drift of the uncertainty pattern using rings. kdiagram.utils.plot.set_axis_grid : Helper for grid styling (if used). Notes ----- The primary radial value plotted for each horizon :math:`h` is the mean interval width calculated across all :math:`N` samples: .. math:: \bar{w}_h = \frac{1}{N}\sum_{j=1}^{N} \left( Q_{up, j, h} - Q_{low, j, h} \right) If ``color_metric_cols`` is provided, a similar average is calculated for those columns to determine bar color. Radii may be scaled relative to the maximum radius if a restricted angular coverage (``acov`` is not 'default') is used, to better fit the visual sector [2]_. References ---------- .. [1] Kouadio, K. L., Liu, R., Loukou, K. G. H., Liu, J., & Liu, W. (2025). Analytics Framework for Interpreting Spatiotemporal Probabilistic Forecasts. *International Journal of Forecasting*. Manuscript submitted. .. [2] Gama, J., Žliobaitė, I., Bifet, A., Pechenizkiy, M., & Bouchachia, A. (2014). A survey on concept drift adaptation. *ACM Computing Surveys (CSUR)*, 46(4), 1-37. Examples -------- >>> import pandas as pd >>> import numpy as np >>> from kdiagram.plot.uncertainty import plot_model_drift >>> # Example with synthetic data >>> years = [2023, 2024, 2025, 2026] >>> n_samples=50 >>> df_synth = pd.DataFrame() >>> q10_cols, q90_cols = [], [] >>> for i, year in enumerate(years): ... ql, qu = f'val_{year}_q10', f'val_{year}_q90' ... q10_cols.append(ql); q90_cols.append(qu) ... q10 = np.random.rand(n_samples)*5 + i*0.5 ... q90 = q10 + np.random.rand(n_samples)*2 + 1 + i*0.8 ... df_synth[ql]=q10; df_synth[qu]=q90 >>> ax = plot_model_drift( ... df=df_synth, ... q10_cols=q10_cols, ... q90_cols=q90_cols, ... horizons=years, ... acov='quarter_circle', # Use 90 degree span ... title='Synthetic Model Drift Example' ... ) """
[docs] @check_non_emptiness @isdf def plot_velocity( df: pd.DataFrame, q50_cols: list[str], theta_col: str | None = None, cmap: str = "viridis", acov: str = "default", normalize: bool = True, use_abs_color: bool = True, figsize: tuple[float, float] = (9, 9), title: str | None = None, s: float | int = 30, alpha: float = 0.85, show_grid: bool = True, savefig: str | None = None, cbar: bool = True, mask_angle: bool = False, dpi: int = 300, ax: Axes | None = None, ): # --- Input Validation --- # Check if required q50_cols exist in the DataFrame missing_cols = [col for col in q50_cols if col not in df.columns] if missing_cols: raise ValueError( f"The following Q50 columns are missing from the " f"DataFrame: {', '.join(missing_cols)}" ) if len(q50_cols) < 2: raise ValueError( "At least two Q50 columns (representing two time points)" " are required to compute velocity." ) # Check theta_col status and warn if provided but unused if theta_col is not None: if theta_col not in df.columns: warnings.warn( f"Specified `theta_col` ('{theta_col}') not found in " f"DataFrame columns. Using index for angular position.", UserWarning, stacklevel=2, ) else: warnings.warn( f"`theta_col` ('{theta_col}') is provided but the current" f" implementation uses the DataFrame index for angular " f"positioning ('theta'). The column '{theta_col}' is " f"currently ignored for positioning.", UserWarning, stacklevel=2, ) # --- Data Processing --- # Extract Q50 data into a NumPy array (locations x time) q50_array = df[q50_cols].values # Shape (N, M) # Compute yearly differences along the time axis (axis=1) # Result shape (N, M-1) yearly_diff = np.diff(q50_array, axis=1) # Compute average velocity per location (mean across time diffs) # Result shape (N,) r = np.mean(yearly_diff, axis=1) # Normalize radial values (velocity) if requested r_normalized = r.copy() # Use a copy for potential normalization if normalize: r_range = np.ptp(r) # Peak-to-peak (max - min) if r_range > 1e-9: # Avoid division by zero or near-zero r_min = r.min() r_normalized = (r - r_min) / r_range else: # Handle case where all velocities are the same r_normalized = np.zeros_like(r) # Set all to 0 if range is zero warnings.warn( "Velocity range is zero or near-zero. Normalized radial " "values ('r') are set to 0.", UserWarning, stacklevel=2, ) # Determine values used for coloring the points if use_abs_color: # Use average absolute Q50 magnitude across all years color_vals = np.mean(np.abs(q50_array), axis=1) cbar_label = "Average Abs Q50 Magnitude" else: # Use the calculated average velocity for color color_vals = r # Use original velocity for color scale cbar_label = "Average Velocity" # --- Angular Coordinate Calculation --- N = len(df) # Number of locations/points # Generate linear space from 0 to 1 for N points theta_normalized = np.linspace( 0, 1, N, endpoint=True ) # Includes endpoint 1 # Map normalized theta to the desired angular coverage angular_range_map = { "default": 2 * np.pi, "half_circle": np.pi, "quarter_circle": np.pi / 2, "eighth_circle": np.pi / 4, } # Get the angular span in radians, default to full circle if invalid angle_span = angular_range_map.get(acov.lower(), 2 * np.pi) if acov.lower() not in angular_range_map: warnings.warn( f"Invalid `acov` value '{acov}'. Using 'default' (2*pi).", UserWarning, stacklevel=2, ) # Calculate final theta values theta = theta_normalized * angle_span # --- Color Normalization for Plotting --- # try: cmap_used = get_cmap(cmap, default="viridis") # except (TypeError, ValueError, KeyError): # warnings.warn( # f"Invalid `cmap` name '{cmap}'. Falling back to 'viridis'.", # UserWarning, # stacklevel=2, # ) # cmap = "viridis" # cmap_used = get_cmap(cmap) # Normalize color values to the range [0, 1] for the colormap color_norm = Normalize(vmin=np.min(color_vals), vmax=np.max(color_vals)) # Map normalized color values to actual colors using the colormap colors = cmap_used(color_norm(color_vals)) # --- Plotting --- if ax is None: fig, ax = plt.subplots( figsize=figsize, subplot_kw={"projection": "polar"} ) else: fig = ax.figure # Set the angular limits based on angular coverage ax.set_thetamin(0) ax.set_thetamax(np.degrees(angle_span)) # set_thetamax expects degrees # Create the polar scatter plot ax.scatter( theta, r_normalized if normalize else r, # Use normalized or raw r c=colors, # Point colors s=s, # Point size edgecolor="k", # Point edge color (optional, for visibility) linewidth=0.5, # Point edge width (optional) alpha=alpha, # Point transparency ) # Set plot title ax.set_title( title or "Average Velocity Polar Plot", fontsize=14, y=1.08 ) # Adjust title position # Add color bar if requested if cbar: # Create a ScalarMappable for the colorbar sm = cm.ScalarMappable(norm=color_norm, cmap=cmap_used) sm.set_array([]) # Necessary for ScalarMappable # Add the colorbar to the figure cbar_obj = plt.colorbar(sm, ax=ax, pad=0.1, shrink=0.7) cbar_obj.set_label(cbar_label, fontsize=10) # Customize grid and labels if show_grid: ax.grid(True, linestyle="--", linewidth=0.5, alpha=0.7) else: ax.grid(False) # Optionally mask angular tick labels if mask_angle: ax.set_xticklabels([]) # Set radial label based on normalization if normalize: ax.set_ylabel("Normalized Average Velocity", labelpad=32, fontsize=10) # Ensure radial limits are appropriate for normalized data # ax.set_ylim(bottom=0, top=1.05) # Give slight padding # ax.set_yticks(np.linspace(0, 1, 5)) # Example radial ticks else: ax.set_ylabel("Average Velocity", labelpad=32, fontsize=10) # Radial limits might need auto-scaling or manual setting # --- Save or Show --- final = safe_savefig( savefig, fig, dpi=dpi, bbox_inches="tight", ) if final is not None: plt.close(fig) else: fig.tight_layout() plt.show() return ax
plot_velocity.__doc__ = r""" Polar plot visualizing average velocity across locations. Generates a polar scatter plot where each point represents a unique location or observation from the input DataFrame. The radial distance (`r`) of each point corresponds to the average rate of change (velocity) of the median prediction (Q50) over consecutive time periods (e.g., years), optionally normalized to [0, 1]. The angular position (`theta`) represents the location, currently determined by its index in the DataFrame, mapped onto a specified angular coverage. The color of each point provides an additional dimension, representing either the calculated velocity itself or the average absolute magnitude of the Q50 predictions over the considered time periods [1]_. This visualization is useful for identifying spatial patterns in the dynamics of a phenomenon, such as locating areas of rapid or slow change (high/low velocity) in land subsidence predictions. Coloring by magnitude helps to contextualize the velocity (e.g., is high velocity occurring in areas of already high subsidence?). Parameters ---------- df : pd.DataFrame Input DataFrame containing the data. Must include the columns specified in `q50_cols`. Decorator `@isdf` ensures this is a pandas DataFrame. Decorator `@check_non_emptiness` ensures it's not empty. q50_cols : list of str An ordered list of column names representing the Q50 (median) predictions for consecutive time steps (e.g., years). The list must contain at least two column names to compute velocity. Example: ``['subsidence_2022_q50', 'subsidence_2023_q50', 'subsidence_2024_q50']``. theta_col : str, optional *Intended* column name to determine the angular position (`theta`) for each location (e.g., 'latitude', 'longitude', or a spatial index). If ``None``, the DataFrame index is conceptually used. *Note: The current implementation maps the DataFrame row index to the angular range specified by `acov`, regardless of whether `theta_col` is provided. Providing `theta_col` will currently trigger a warning but will not affect the plot's angular axis.* Default is ``None``. cmap : str, default='viridis' The name of the Matplotlib colormap used to color the scatter points based on `color_vals` (determined by `use_abs_color`). acov : str, default='default' Angular coverage defining the span of the polar plot's theta axis. Options are: - ``'default'``: Full circle (2p radians or 360 degrees). - ``'half_circle'``: Half circle (p radians or 180 degrees). - ``'quarter_circle'``: Quarter circle (p/2 radians or 90 degrees). - ``'eighth_circle'``: Eighth circle (p/4 radians or 45 degrees). Invalid options default to ``'default'``. normalize : bool, default=True If ``True``, the calculated average velocity values (`r`) are min-max normalized to the range [0, 1] before plotting radially. This emphasizes relative velocity patterns. If ``False``, the raw average velocity values are used for the radial coordinate. use_abs_color : bool, default=True Determines the variable used for coloring the points: - If ``True``, points are colored based on the average absolute magnitude of the Q50 values across the specified `q50_cols`. This highlights areas with high overall prediction values. - If ``False``, points are colored based on the calculated average velocity (`r`) itself. This highlights areas of high or low rate of change. figsize : tuple of (float, float), default=(9, 9) The width and height of the figure in inches. title : str, optional The title displayed above the polar plot. If ``None``, a default title "Normalized Subsidence Velocity" (or similar, depending on context, though not dynamically changed here) is used. Default is ``None``. s : float or int, default=30 The marker size for the scatter points. alpha : float, default=0.85 The transparency level of the scatter points (0=transparent, 1=opaque). Useful for visualizing dense data. show_grid : bool, default=True If ``True``, display the polar grid lines (radial and angular) on the plot. savefig : str, optional The file path (including extension, e.g., 'velocity_plot.pdf') where the plot image should be saved. If ``None``, the plot is displayed interactively using `plt.show()`. Default is ``None``. cbar : bool, default=True If ``True``, display a color bar alongside the plot indicating the mapping between colors and the values defined by `use_abs_color`. mask_angle : bool, default=False If ``True``, hide the angular tick labels (the degrees/radians around the circumference). This can be useful if the angular position based on index is not inherently meaningful. Returns ------- ax : matplotlib.axes.Axes The Matplotlib Axes object containing the polar scatter plot. Can be used for further customization. Raises ------ ValueError If `q50_cols` contains fewer than two column names. See Also -------- numpy.diff : Computes the difference between consecutive elements. numpy.mean : Computes the arithmetic mean. matplotlib.pyplot.scatter : Creates scatter plots. matplotlib.pyplot.polar : Creates polar plots. kdiagram.plot.uncertainty.plot_uncertainty_drift : Visualizes uncertainty width changes over time. Notes ----- - The function assumes the columns in `q50_cols` represent equally spaced time steps for the velocity calculation to be meaningful as an average *yearly* (or per-step) velocity. - The average velocity (`r`) is calculated as the mean of the first-order differences between consecutive columns in `q50_cols`. - Normalization of `r` uses min-max scaling: :math:`r' = (r - \min(r)) / (\max(r) - \min(r))`. - The angular coordinate `theta` is currently derived from the DataFrame index, mapped linearly onto the angular range defined by `acov`. The `theta_col` parameter is not used for positioning in the current implementation, which might be revised in future versions. A warning is issued if `theta_col` is provided [2]_. Let :math:`\mathbf{Q}` be the data matrix extracted from `df` using columns `q50_cols`, with shape :math:`(N, M)`, where :math:`N` is the number of locations (rows) and :math:`M` is the number of time points (columns). Note the transpose compared to the description in `plot_feature_fingerprint`. 1. **Velocity Calculation**: The differences between consecutive time points for each location :math:`j` are computed: :math:`\Delta Q_{j,i} = Q_{j, i+1} - Q_{j, i}` for :math:`i = 0, \dots, M-2`. The average velocity for location :math:`j` is: .. math:: r_j = \frac{1}{M-1} \sum_{i=0}^{M-2} \Delta Q_{j,i} 2. **Radial Normalization** (if `normalize=True`): Let :math:`\mathbf{r} = (r_0, \dots, r_{N-1})`. .. math:: r'_j = \frac{r_j - \min(\mathbf{r})}{\max(\mathbf{r}) - \min(\mathbf{r})} If :math:`\max(\mathbf{r}) = \min(\mathbf{r})`, :math:`r'_j = 0`. 3. **Color Value Calculation**: - If `use_abs_color=True`: Average absolute magnitude. .. math:: c_j = \frac{1}{M} \sum_{i=0}^{M-1} |Q_{j,i}| - If `use_abs_color=False`: Use average velocity. :math:`c_j = r_j` 4. **Angular Coordinate Calculation**: Let :math:`S` be the angular span in radians determined by `acov` (e.g., :math:`2\pi` for ``'default'``). The angle for location :math:`j` (where :math:`j` is the row index from :math:`0` to :math:`N-1`) is: .. math:: \theta_j = \frac{j}{N} \times S The code uses `np.linspace(0, 1, N)` which generates N points from 0 to 1 inclusive, so the formula might be slightly different depending on endpoint handling, effectively :math:`\theta_j = \frac{j}{N-1} \times S` for the N points if endpoint=True, or spacing relates to N intervals if ``endpoint=False``. The code uses `np.linspace(0, 1, N)` and multiplies by `angle_span`, suggesting the angles might span from 0 up to `angle_span`. References ---------- .. [1] Kouadio, K. L., Liu, R., Loukou, K. G. H., Liu, J., & Liu, W. (2025). Analytics Framework for Interpreting Spatiotemporal Probabilistic Forecasts. *International Journal of Forecasting*. Manuscript submitted. .. [2] Hunter, J. D. (2007). Matplotlib: A 2D graphics environment. Computing in Science & Engineering, 9(3), 90-95. Examples -------- >>> import pandas as pd >>> import numpy as np >>> from kdiagram.plot.uncertainty import plot_velocity **1. Random Example:** >>> np.random.seed(0) >>> N_points = 100 >>> df_random = pd.DataFrame({ ... 'location_id': range(N_points), ... 'value_2020_q50': np.random.rand(N_points) * 10, ... 'value_2021_q50': (np.random.rand(N_points) * 10 + ... np.linspace(0, 5, N_points)), ... 'value_2022_q50': (np.random.rand(N_points) * 10 + ... np.linspace(0, 10, N_points)), ... 'latitude': np.linspace(22, 23, N_points) ... }) >>> q50_cols_random = ['value_2020_q50', 'value_2021_q50', ... 'value_2022_q50'] >>> ax_random = plot_velocity( ... df=df_random, ... q50_cols=q50_cols_random, ... theta_col='latitude', # Note: currently ignored for pos ... acov='default', ... normalize=True, ... use_abs_color=False, # Color by velocity ... title='Random Data Velocity Profile', ... cmap='coolwarm', ... s=40, ... cbar=True ... ) >>> # plt.show() is called internally if savefig is None **2. Concrete Example (Subsidence Data - adapted from docstring):** >>> # Assume zhongshan_pred_2023_2026 is a loaded DataFrame like: >>> # zhongshan_pred_2023_2026 = pd.DataFrame({ >>> # 'subsidence_2022_q50': np.random.rand(50)*5 + 5, >>> # 'subsidence_2023_q50': np.random.rand(50)*6 + 6, >>> # 'subsidence_2024_q50': np.random.rand(50)*7 + 7, >>> # 'subsidence_2025_q50': np.random.rand(50)*8 + 8, >>> # 'subsidence_2026_q50': np.random.rand(50)*9 + 9, >>> # 'latitude': np.linspace(22.2, 22.8, 50) >>> # }) # Dummy data for example execution >>> # Create dummy data if zhongshan_pred_2023_2026 doesn't exist >>> try: ... zhongshan_pred_2023_2026 ... except NameError: ... print("Creating dummy subsidence data for example...") ... zhongshan_pred_2023_2026 = pd.DataFrame({ ... 'subsidence_2022_q50': np.random.rand(150)*5 + 5, ... 'subsidence_2023_q50': np.random.rand(150)*6 + 6 + np.linspace(0, 2, 150), ... 'subsidence_2024_q50': np.random.rand(150)*7 + 7 + np.linspace(0, 4, 150), ... 'subsidence_2025_q50': np.random.rand(150)*8 + 8 + np.linspace(0, 6, 150), ... 'subsidence_2026_q50': np.random.rand(150)*9 + 9 + np.linspace(0, 8, 150), ... 'latitude': np.linspace(22.2, 22.8, 150) ... }) >>> subsidence_q50_cols = [ ... 'subsidence_2022_q50', 'subsidence_2023_q50', ... 'subsidence_2024_q50', 'subsidence_2025_q50', ... 'subsidence_2026_q50', ... ] >>> ax_subsidence = plot_velocity( ... df=zhongshan_pred_2023_2026, ... q50_cols=subsidence_q50_cols, ... theta_col='latitude', # Ignored for pos, triggers warning ... acov='quarter_circle', # Focus angular range ... normalize=True, ... use_abs_color=True, # Color by Q50 magnitude ... title='Subsidence Velocity Across Zhongshan (2022–2026)', ... cmap='plasma', ... s=25, ... cbar=True, ... mask_angle=True # Hide angle labels ... ) >>> # plt.show() called internally """
[docs] @check_non_emptiness @isdf def plot_interval_consistency( df: pd.DataFrame, qlow_cols: list[str], qup_cols: list[str], q50_cols: list[str] | None = None, theta_col: str | None = None, use_cv: bool = True, cmap: str = "coolwarm", acov: str = "default", title: str | None = None, figsize: tuple[float, float] = (9, 9), s: float | int = 30, alpha: float = 0.85, show_grid: bool = True, mask_angle: bool = False, savefig: str | None = None, dpi: int = 300, ax: Axes | None = None, ): # --- Input Validation --- # Basic DataFrame checks handled by decorators @isdf @check_non_emptiness if len(qlow_cols) != len(qup_cols): raise ValueError( "Mismatch in length between `qlow_cols` " f"({len(qlow_cols)}) and `qup_cols` ({len(qup_cols)})." ) if q50_cols is not None and len(qlow_cols) != len(q50_cols): raise ValueError( "Mismatch in length between quantile columns: " f"qlow/qup ({len(qlow_cols)}) and q50 ({len(q50_cols)})." ) # Check if all specified columns exist in the DataFrame all_cols = qlow_cols + qup_cols + (q50_cols if q50_cols else []) missing_cols = [col for col in all_cols if col not in df.columns] if missing_cols: raise ValueError( f"The following columns are missing from the DataFrame: " f"{', '.join(missing_cols)}" ) # Check theta_col status and warn if provided but unused if theta_col is not None: if theta_col not in df.columns: warnings.warn( f"Specified `theta_col` ('{theta_col}') not found in " f"DataFrame columns. Using index for angular position.", UserWarning, stacklevel=2, ) else: # Issue warning as current implementation uses index warnings.warn( f"`theta_col` ('{theta_col}') is provided but the current" f" implementation uses the DataFrame index for angular " f"positioning ('theta'). The column '{theta_col}' is " f"currently ignored for positioning.", UserWarning, stacklevel=2, ) # --- Data Calculation --- # Calculate interval widths for each year/timepoint for all locations # widths shape: (M, N) where M=num_time_steps, N=num_locations try: widths = np.array( [ df[qup].values - df[qlo].values for qlo, qup in zip(qlow_cols, qup_cols) ] ) except Exception as e: raise TypeError( "Could not compute widths. Ensure quantile columns contain " "numeric data." ) from e # Calculate radial value 'r' (std dev or CV of widths over time) # Result shape: (N,) mean_widths = np.mean(widths, axis=0) std_widths = np.std(widths, axis=0) if use_cv: # Calculate Coefficient of Variation (CV) # Handle division by zero or near-zero mean width # Use np.divide for safe division, setting result to 0 where mean is ~0 r = np.divide( std_widths, mean_widths, out=np.zeros_like(mean_widths, dtype=float), # Output array where=np.abs(mean_widths) > 1e-9, ) # Condition for division # Optionally issue warning if division by zero occurred if np.any(np.abs(mean_widths) <= 1e-9): num_zeros = np.sum(np.abs(mean_widths) <= 1e-9) warnings.warn( f"Mean interval width was zero or near-zero for {num_zeros}" f" locations. CV is set to 0 for these locations.", RuntimeWarning, stacklevel=2, ) radial_label = "CV of Interval Width (Q90-Q10)" else: # Use Standard Deviation r = std_widths radial_label = "Std Dev of Interval Width (Q90-Q10)" # Calculate color values if q50_cols: try: # Average Q50 across time for each location q50_values = np.array([df[q].values for q in q50_cols]) color_vals_source = np.mean(q50_values, axis=0) cbar_label = "Average Q50 Prediction" except Exception as e: warnings.warn( f"Could not compute average Q50. Ensure Q50 columns contain" f" numeric data. Falling back to coloring by 'r'. Error: {e}", UserWarning, stacklevel=2, ) # Fallback: color by the radial value itself color_vals_source = r cbar_label = radial_label # Label reflects 'r' else: # Fallback: color by the radial value itself if no q50_cols given color_vals_source = r cbar_label = radial_label # Label reflects 'r' # --- Angular Coordinate Calculation --- N = len(df) # Number of locations # Generate linear space [0, 1] for N points theta_normalized = np.linspace(0, 1, N, endpoint=True) # Map normalized theta to the desired angular coverage angular_range_map = { "default": 2 * np.pi, "half_circle": np.pi, "quarter_circle": np.pi / 2, "eighth_circle": np.pi / 4, } angle_span = angular_range_map.get(acov.lower(), 2 * np.pi) if acov.lower() not in angular_range_map: warnings.warn( f"Invalid `acov` value '{acov}'. Using 'default' (2*pi).", UserWarning, stacklevel=2, ) # Calculate final theta values theta = theta_normalized * angle_span # --- Color Normalization --- # Ensure cmap is valid for fallback cmap_used = get_cmap(cmap, default="coolwarm") # Normalize color values for the colormap color_norm = Normalize( vmin=np.min(color_vals_source), vmax=np.max(color_vals_source) ) # Get actual colors plot_colors = cmap_used(color_norm(color_vals_source)) # --- Plotting --- if ax is None: fig, ax = plt.subplots( figsize=figsize, subplot_kw={"projection": "polar"} ) else: fig = ax.figure # Set angular limits ax.set_thetamin(0) ax.set_thetamax(np.degrees(angle_span)) # Create the polar scatter plot ax.scatter( theta, r, # Radial value (CV or Std Dev) c=plot_colors, # Point colors s=s, # Point size edgecolor="k", # Point edge color linewidth=0.5, # Point edge width alpha=alpha, # Point transparency ) # Set plot title ax.set_title( title or "Prediction Interval Consistency", fontsize=14, y=1.08 ) # Add color bar sm = cm.ScalarMappable(norm=color_norm, cmap=cmap_used) sm.set_array([]) # Necessary for ScalarMappable cbar_obj = plt.colorbar(sm, ax=ax, pad=0.1, shrink=0.7) cbar_obj.set_label(cbar_label, fontsize=10) # Customize grid and labels if show_grid: ax.grid(True, linestyle="--", linewidth=0.5, alpha=0.7) else: ax.grid(False) # Optionally mask angular tick labels if mask_angle: ax.set_xticklabels([]) # Set radial axis label ax.set_ylabel(radial_label, labelpad=32, fontsize=10) # Optional: adjust radial limits if needed, e.g., start at 0 ax.set_ylim(bottom=0) # --- Save or Show --- final = safe_savefig( savefig, fig, dpi=dpi, bbox_inches="tight", ) if final is not None: plt.close(fig) else: fig.tight_layout() plt.show() return ax
plot_interval_consistency.__doc__ = r""" Polar plot showing consistency of prediction interval widths. This function generates a polar scatter plot to visualize the temporal consistency (or variability) of prediction interval widths (e.g., Q90 - Q10) across different locations over multiple time steps or forecast horizons: - The **angular position (`theta`)** represents each location, currently derived from the DataFrame index and mapped onto the specified angular coverage (`acov`). - The **radial distance (`r`)** quantifies the inconsistency or variability of the interval width over time for each location. It is calculated as either the standard deviation (absolute variability) or the coefficient of variation (CV, relative variability) of the interval widths (Upper Quantile - Lower Quantile) across the specified time steps. Higher `r` values indicate locations where the predicted uncertainty range fluctuates more significantly over time. - The **color** of each point typically represents the average median prediction (Q50) across the time steps (if `q50_cols` are provided). This adds context, helping to identify if interval inconsistency occurs in regions of high or low average predictions. If `q50_cols` are not provided, color defaults to representing the inconsistency measure `r`. This plot is useful for diagnosing model reliability, identifying locations or conditions where the model's uncertainty estimates are unstable or vary considerably across different forecast horizons. Parameters ---------- df : pd.DataFrame Input DataFrame containing the data. Must include columns specified in `qlow_cols` and `qup_cols`. Decorator `@isdf` ensures this is a pandas DataFrame. Decorator `@check_non_emptiness` ensures it's not empty. qlow_cols : list of str List of column names representing the lower quantile (e.g., Q10) predictions for consecutive time steps (e.g., years). Order should correspond to the time steps. Example: ``['subsidence_2023_q10', 'subsidence_2024_q10', ...]``. qup_cols : list of str List of column names representing the upper quantile (e.g., Q90) predictions for the same consecutive time steps as `qlow_cols`. Must be the same length as `qlow_cols`. Example: ``['subsidence_2023_q90', 'subsidence_2024_q90', ...]``. q50_cols : list of str, optional List of column names representing the median quantile (Q50) predictions for the same time steps. If provided, the average Q50 value across these columns will be used to color the points. Must be the same length as `qlow_cols` if provided. If ``None``, the color will represent the radial value `r` (the inconsistency measure). Default is ``None``. theta_col : str, optional *Intended* column name to determine the angular position (`theta`) for each location (e.g., 'latitude', 'longitude', or a spatial index). If ``None``, the DataFrame index is conceptually used. *Note: The current implementation maps the DataFrame row index to the angular range specified by `acov`, regardless of whether `theta_col` is provided. Providing `theta_col` will currently trigger a warning but will not affect the plot's angular axis.* Default is ``None``. use_cv : bool, default=True Determines the measure of interval width variability used for the radial coordinate `r`: - If ``True``, `r` is the Coefficient of Variation (CV) of the interval widths (Std Dev / Mean). CV measures relative variability, useful when mean widths differ substantially. - If ``False``, `r` is the Standard Deviation (Std Dev) of the interval widths. Std Dev measures absolute variability. cmap : str, default='coolwarm' The name of the Matplotlib colormap used to color the scatter points based on the average Q50 value (or `r` if `q50_cols` is ``None``). acov : str, default='default' Angular coverage defining the span of the polar plot's theta axis. Options: ``'default'`` (360°), ``'half_circle'`` (180°), ``'quarter_circle'`` (90°), ``'eighth_circle'`` (45°). Invalid options default to ``'default'``. title : str, optional The title displayed above the polar plot. If ``None``, a default title like "Prediction Interval Consistency (Q90–Q10)" is used. Default is ``None``. figsize : tuple of (float, float), default=(9, 9) The width and height of the figure in inches. s : float or int, default=30 The marker size for the scatter points. alpha : float, default=0.85 The transparency level of the scatter points (0=transparent, 1=opaque). show_grid : bool, default=True If ``True``, display the polar grid lines. mask_angle : bool, default=False If ``True``, hide the angular tick labels. Useful if the index- based angle is not directly interpretable. savefig : str, optional File path to save the plot image. If ``None``, displays the plot interactively. Default is ``None``. Returns ------- ax : matplotlib.axes.Axes The Matplotlib Axes object containing the polar scatter plot. Raises ------ AssertionError If `qlow_cols` and `qup_cols` have different lengths. ValueError If specified columns in `qlow_cols`, `qup_cols`, or `q50_cols` are not found in the DataFrame. See Also -------- plot_velocity : Plot average velocity in polar coordinates. numpy.std : Compute the standard deviation. numpy.mean : Compute the arithmetic mean. matplotlib.pyplot.scatter : Create scatter plots. Notes ----- Interval-width consistency is assessed from paired lower/upper quantiles for multiple time steps. For each location and time step, the width is computed as upper minus lower. The radial value encodes either the standard deviation of these widths (absolute variability) or their coefficient of variation (relative variability), with safe handling of zero means by setting the CV to zero when the average width is numerically indistinguishable from zero. Angles are derived from the row index and mapped linearly across the angular span determined by :py:data:`acov`; the current implementation does not use :py:data:`theta_col` for positioning. Rows containing missing values in any required column are dropped prior to computation. These diagnostics relate to standard notions of predictive-interval calibration and stability; see :footcite:t:`Gneiting2007b, Jolliffe2012`. **Interval widths.** Let :math:`\mathbf L` and :math:`\mathbf U` be matrices extracted from :py:data:`df` using :py:data:`qlow_cols` and :py:data:`qup_cols`, respectively, both of shape :math:`(N,M)`, with :math:`N` locations and :math:`M` time steps. Define the width matrix :math:`\mathbf W` by .. math:: W_{j,i} = U_{j,i} - L_{j,i} where :math:`j` indexes locations (:math:`0` to :math:`N-1`) and :math:`i` indexes time steps (:math:`0` to :math:`M-1`). **Radial Coordinate Calculation (`r`)**: Let :math:`\mathbf{w}_j = (W_{j,0}, \dots, W_{j,M-1})` be the vector of widths over time for location :math:`j`. Let :math:`\bar{w}_j = \text{mean}(\mathbf{w}_j)` and :math:`\sigma_{w_j} = \text{std}(\mathbf{w}_j)`. - If `use_cv=False` (Standard Deviation): .. math:: r_j = \sigma_{w_j} - If `use_cv=True` (Coefficient of Variation): .. math:: r_j = \begin{cases} \frac{\sigma_{w_j}}{\bar{w}_j} &\\ \text{if } |\bar{w}_j| > \epsilon \\ 0 & \text{if }\\ |\bar{w}_j| \le \epsilon \end{cases} where :math:`\epsilon` is a small threshold to prevent division by zero. **Color Value Calculation (`c`)**: Let :math:`\mathbf{Q50}` be the data matrix (shape :math:`(N, M)`) from `q50_cols`. - If `q50_cols` is provided: Let :math:`\mathbf{q50}_j = (Q50_{j,0}, \dots, Q50_{j,M-1})`. .. math:: c_j = \text{mean}(\mathbf{q50}_j) = \frac{1}{M} \sum_{i=0}^{M-1} Q50_{j,i} - If `q50_cols` is ``None``: :math:`c_j = r_j` **Angular Coordinate Calculation (`theta`)**: Same index-based calculation as `plot_velocity`. Let :math:`S` be the angular span from `acov`. .. math:: \theta_j = \frac{j}{N} \times S Examples -------- >>> import pandas as pd >>> import numpy as np >>> from kdiagram.plot.uncertainty import plot_interval_consistency **1. Random Example:** >>> np.random.seed(1) >>> N_points = 120 >>> df_rand_interval = pd.DataFrame({ ... 'id': range(N_points), ... 'lat': np.linspace(30, 31, N_points), ... 'val_2021_q10': np.random.rand(N_points) * 5, ... 'val_2021_q50': np.random.rand(N_points) * 5 + 5, ... 'val_2021_q90': np.random.rand(N_points) * 5 + 10, ... 'val_2022_q10': np.random.rand(N_points) * 6, # Slightly wider ... 'val_2022_q50': np.random.rand(N_points) * 6 + 6, ... 'val_2022_q90': np.random.rand(N_points) * 6 + 12, ... 'val_2023_q10': np.random.rand(N_points) * 4, # Narrower ... 'val_2023_q50': np.random.rand(N_points) * 4 + 7, ... 'val_2023_q90': np.random.rand(N_points) * 4 + 11, ... }) >>> q10_cols_rand = ['val_2021_q10', 'val_2022_q10', 'val_2023_q10'] >>> q90_cols_rand = ['val_2021_q90', 'val_2022_q90', 'val_2023_q90'] >>> q50_cols_rand = ['val_2021_q50', 'val_2022_q50', 'val_2023_q50'] >>> ax_rand_ic = plot_interval_consistency( ... df=df_rand_interval, ... qlow_cols=q10_cols_rand, ... qup_cols=q90_cols_rand, ... q50_cols=q50_cols_rand, ... theta_col='lat', # Note: Ignored for positioning ... use_cv=True, # Use CV for radial axis ... cmap='viridis', ... acov='half_circle', ... title='Random Interval Width Consistency (CV)', ... s=35 ... ) >>> # plt.show() called internally **2. Concrete Example (Subsidence Data - adapted from docstring):** >>> # Assume zhongshan_pred_2023_2026 is loaded DataFrame like: >>> # Create dummy data if it doesn't exist >>> try: ... zhongshan_pred_2023_2026 ... except NameError: ... print("Creating dummy subsidence data for example...") ... N_sub = 150 ... zhongshan_pred_2023_2026 = pd.DataFrame({ ... 'latitude': np.linspace(22.2, 22.8, N_sub), ... **{f'subsidence_{yr}_q10': np.random.rand(N_sub)*(yr-2020)+1 ... for yr in range(2023, 2027)}, ... **{f'subsidence_{yr}_q50': np.random.rand(N_sub)*(yr-2019)+5 ... + np.linspace(0, (yr-2022)*2, N_sub) ... for yr in range(2023, 2027)}, ... **{f'subsidence_{yr}_q90': np.random.rand(N_sub)*(yr-2018)+10 ... + np.linspace(0, (yr-2022)*4, N_sub) ... for yr in range(2023, 2027)}, ... }) >>> qlow_sub = [f'subsidence_{yr}_q10' for yr in range(2023, 2027)] >>> qup_sub = [f'subsidence_{yr}_q90' for yr in range(2023, 2027)] >>> q50_sub = [f'subsidence_{yr}_q50' for yr in range(2023, 2027)] >>> ax_sub_ic = plot_interval_consistency( ... df=zhongshan_pred_2023_2026, ... qlow_cols=qlow_sub, ... qup_cols=qup_sub, ... q50_cols=q50_sub, ... theta_col='latitude', # Ignored for pos, triggers warning ... acov='default', ... title='Subsidence Uncertainty Consistency (2023–2026)', ... use_cv=False, # Use Std Dev for radius ... cmap='coolwarm', ... s=28, ... alpha=0.8, ... mask_angle=True ... ) >>> References ---------- .. footbibliography:: """
[docs] @check_non_emptiness @isdf def plot_anomaly_magnitude( df: pd.DataFrame, actual_col: str, q_cols: list[str] | tuple[str, str], theta_col: str | None = None, acov: str = "default", title: str = "Anomaly Magnitude Polar Plot", figsize: tuple[float, float] = (8.0, 8.0), cmap_under: str = "Blues", cmap_over: str = "Reds", s: int = 30, alpha: float = 0.8, show_grid: bool = True, verbose: int = 1, cbar: bool = False, dpi: int = 300, savefig: str | None = None, mask_angle: bool = False, ax: Axes | None = None, ): try: qlow_col, qup_col = validate_qcols( q_cols=q_cols, ncols_exp="==2", # Expect exactly two columns err_msg=( "Expected `q_cols` to contain exactly two column names " f"[lower_bound, upper_bound], but got: {q_cols}" ), ) except Exception as e: # Catch potential errors from validate_qcols if it raises them raise ValueError(f"Validation of `q_cols` failed: {e}") from e # Consolidate list of essential columns cols_needed = [actual_col, qlow_col, qup_col] if theta_col: # Only add if specified, check existence later if needed cols_needed.append(theta_col) # Check existence of essential columns # missing_cols = [col for col in cols_needed if col not in df.columns] # Allow theta_col to be missing if specified, handled later missing_essential = [ col for col in [actual_col, qlow_col, qup_col] if col not in df.columns ] if missing_essential: raise ValueError( "The following essential columns are missing from the " f"DataFrame: {', '.join(missing_essential)}" ) # Drop rows with NaN in essential columns before proceeding # Use only the definitely required columns for dropna essential_cols_for_na = [actual_col, qlow_col, qup_col] data = df[cols_needed].dropna(subset=essential_cols_for_na).copy() if len(data) == 0: warnings.warn( "DataFrame is empty after dropping NaN values" " in essential columns. Cannot generate plot.", UserWarning, stacklevel=2, ) return None # Cannot proceed # --- Anomaly Calculation --- # Extract data as numpy arrays for efficiency try: y = data[actual_col].to_numpy(dtype=float) y_lo = data[qlow_col].to_numpy(dtype=float) y_hi = data[qup_col].to_numpy(dtype=float) except (ValueError, TypeError) as e: raise TypeError( f"Failed to convert essential columns to numeric arrays." f" Check data types. Original error: {e}" ) from e # Identify under- and over-predictions under_mask = y < y_lo over_mask = y > y_hi # Calculate anomaly magnitude (distance from the violated bound) anomaly_mag = np.zeros_like(y, dtype=float) # Magnitude is positive: bound - actual for under, actual - bound for over anomaly_mag[under_mask] = y_lo[under_mask] - y[under_mask] anomaly_mag[over_mask] = y[over_mask] - y_hi[over_mask] # Filter out non-anomalies for plotting (only plot r > 0) is_anomaly = under_mask | over_mask if not np.any(is_anomaly): warnings.warn( "No anomalies detected (all actual values are within " "the specified quantile bounds). Plot will be empty.", UserWarning, stacklevel=2, ) # Still create plot structure, but it will be empty # Filter data to only include anomalies anomaly_mag = anomaly_mag[is_anomaly] under_mask_filtered = under_mask[is_anomaly] over_mask_filtered = over_mask[is_anomaly] data_filtered = data[is_anomaly] # Filter DataFrame rows too N_anomalies = len(data_filtered) # Number of anomalies # --- Theta Coordinate and Ordering --- # Determine ordering index if theta_col and theta_col in data_filtered.columns: try: # Sort based on the theta_col values of the anomalies ordered_idx = np.argsort( data_filtered[theta_col].to_numpy(dtype=float) ) except (ValueError, TypeError): warnings.warn( f"Could not sort by `theta_col` ('{theta_col}') as it " f"contains non-numeric data after NaN removal. " f"Using default DataFrame order.", UserWarning, stacklevel=2, ) ordered_idx = np.arange(N_anomalies) # Fallback to original order else: # Use default order if theta_col not specified or not found if theta_col and theta_col not in data_filtered.columns: warnings.warn( f"Specified `theta_col` ('{theta_col}') not found in the" f" (filtered) DataFrame. Using default DataFrame order.", UserWarning, stacklevel=2, ) ordered_idx = np.arange(N_anomalies) # Generate base theta values (linear spacing) theta_norm = np.linspace(0.0, 1.0, N_anomalies, endpoint=True) # Define angular coverage range coverage_map = { "default": 2 * np.pi, "half_circle": np.pi, "quarter_circle": np.pi / 2, "eighth_circle": np.pi / 4, } coverage = coverage_map.get(acov.lower(), 2 * np.pi) if acov.lower() not in coverage_map: warnings.warn( f"Invalid `acov` value '{acov}'. Using 'default' (2*pi).", UserWarning, stacklevel=2, ) # Calculate final theta values based on coverage theta = theta_norm * coverage # Apply ordering to all relevant arrays theta = theta[ordered_idx] anomaly_mag_ordered = anomaly_mag[ordered_idx] under_mask_ordered = under_mask_filtered[ordered_idx] over_mask_ordered = over_mask_filtered[ordered_idx] # --- Plotting --- if ax is None: # Constrained layout avoids title / r-label overlap on polar axes fig = plt.figure(figsize=figsize, constrained_layout=True) ax = fig.add_subplot(111, projection="polar") else: fig = ax.figure ax.set_thetamin(0) ax.set_thetamax(np.degrees(coverage)) # Expects degrees # ax.set_title(title, fontsize=14, y=1.08) # Adjust position # Setup grid if show_grid: ax.grid(True, linestyle="--", linewidth=0.5, alpha=0.7) else: ax.grid(False) # Optionally mask angle labels if mask_angle: ax.set_xticklabels([]) # --- Color Normalization (shared scale based on max magnitude) --- # Ensure vmax is at least a small positive number for normalization vmax = max( float(anomaly_mag_ordered.max()) if N_anomalies > 0 else 0.0, 1e-5 ) norm = Normalize(vmin=0.0, vmax=vmax) # Retrieve colormaps safely try: cmap_under_obj = get_cmap(cmap_under) except ValueError: warnings.warn( f"Invalid `cmap_under` ('{cmap_under}'). Using default 'Blues'.", UserWarning, stacklevel=2, ) cmap_under_obj = get_cmap("Blues") try: cmap_over_obj = get_cmap(cmap_over) except ValueError: warnings.warn( f"Invalid `cmap_over` ('{cmap_over}'). Using default 'Reds'.", UserWarning, stacklevel=2, ) cmap_over_obj = get_cmap("Reds") # --- Scatter Plot Anomalies (separate calls for color/label) --- if np.any(under_mask_ordered): ax.scatter( theta[under_mask_ordered], # Angles for under-preds anomaly_mag_ordered[ under_mask_ordered ], # Magnitudes for under-preds c=anomaly_mag_ordered[ under_mask_ordered ], # Color value is magnitude cmap=cmap_under_obj, # Blues colormap norm=norm, # Shared normalization s=s, # Marker size alpha=alpha, # Transparency edgecolor="grey", # Edge color linewidth=0.5, label="Under-prediction", # Legend label ) if np.any(over_mask_ordered): ax.scatter( theta[over_mask_ordered], # Angles for over-preds anomaly_mag_ordered[ over_mask_ordered ], # Magnitudes for over-preds c=anomaly_mag_ordered[ over_mask_ordered ], # Color value is magnitude cmap=cmap_over_obj, # Reds colormap norm=norm, # Shared normalization s=s, # Marker size alpha=alpha, # Transparency edgecolor="grey", # Edge color linewidth=0.5, label="Over-prediction", # Legend label ) # Add legend if any anomalies were plotted if np.any(under_mask_ordered) or np.any(over_mask_ordered): ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1), fontsize=9) else: # Add text if plot is empty ax.text( 0, 0, "No anomalies detected", horizontalalignment="center", verticalalignment="center", ) # --- Add Colorbar (optional) --- # Note: Visually uses cmap_over, but scale (norm) is correct if cbar and N_anomalies > 0: # Create a mappable object linked to the normalization and cmap_over sm = cm.ScalarMappable(norm=norm, cmap=cmap_over_obj) sm.set_array([]) # Needed for ScalarMappable # cbar_obj = plt.colorbar(sm, ax=ax, pad=0.1, shrink=0.7) cbar_obj = plt.colorbar(sm, ax=ax, pad=0.08, shrink=0.80) cbar_obj.set_label("Anomaly magnitude |Actual - Bound|", fontsize=10) # Set radial axis label ax.set_title(title, fontsize=14, y=1.12) ax.set_ylabel("Anomaly Magnitude", labelpad=24, fontsize=10) # Move the radial tick labels away from the y-label side ax.set_rlabel_position(315) # places radial tick labels near SE quadrant ax.set_ylim(bottom=0) # Ensure radius starts at 0 # --- Logging --- if verbose > 0: # Use original masks on NaN-dropped data before filtering for plot n_total_checked = len(data) # Total valid points checked n_under_total = np.sum(y < y_lo) n_over_total = np.sum(y > y_hi) n_anomalies_total = n_under_total + n_over_total print("-" * 50) print("Anomaly Detection Summary:") print(f" Total valid points checked: {n_total_checked}") print( f" Anomalies detected (outside {qlow_col}-{qup_col}): " f"{n_anomalies_total}" ) print( f" ? Under-predictions ({actual_col} < {qlow_col}): {n_under_total}" ) print( f" ? Over-predictions ({actual_col} > {qup_col}): {n_over_total}" ) print("-" * 50) final = safe_savefig( savefig, fig, dpi=dpi, bbox_inches="tight", ) if final is not None: plt.close(fig) else: plt.show() return ax
plot_anomaly_magnitude.__doc__ = r""" Visualize magnitude and type of prediction-interval anomalies. This function draws a polar scatter plot to highlight prediction-interval failures—cases where the observed value lies outside a user-specified interval (e.g., :math:`[Q_{0.10}, Q_{0.90}]`). It encodes the anomaly’s **order/location** (angle), **magnitude** (radius), and **type** (color). See :footcite:t:`kouadiob2025`; background on coverage and calibration in :footcite:p:`Gneiting2007b, Jolliffe2012`. - **Angular position (:math:`\theta`)**: maps each row to an angle over the chosen span (``acov``). If ``theta_col`` is provided, rows are *ordered* by that column before angles are assigned; otherwise the original row order is used. Angles are spaced linearly across the selected coverage. - **Radial distance (:math:`r`)**: the anomaly magnitude for interval violations, computed as :math:`r=\lvert y - B\rvert`, where :math:`B\in\{L,U\}` is the violated bound (lower :math:`L` or upper :math:`U`). Points satisfying :math:`L\le y\le U` are omitted. - **Color**: indicates anomaly type and scales with magnitude on a shared normalization. ``cmap_under`` colors under-predictions (:math:`y<L`); ``cmap_over`` colors over-predictions (:math:`y>U`). This diagnostic helps to (i) localize clusters of failures along the ordering induced by ``theta_col`` (or row index), (ii) assess how severe misses are via larger radii, and (iii) support calibration checks and targeted model refinement, alongside coverage and reliability analyses, see :footcite:p:`Gneiting2007b, Jolliffe2012`. Parameters ---------- df : pandas.DataFrame Input table containing the actual values and the two quantile (interval) columns. Must be non-empty after NaN removal in the required columns (see below). actual_col : str Name of the column with the observed/ground-truth values used to check interval coverage. q_cols : list[str] or tuple[str, str] Two-element sequence with the **lower** and **upper** quantile column names, in that order: ``[q_low, q_up]``. Each referenced column must exist in ``df`` and be numeric. Semantically, rows are expected to satisfy :math:`q_\text{low} \le q_\text{up}`. theta_col : str, optional Column used **only to order points** angularly before mapping them into the selected coverage span. Useful for spatial or categorical ordering (e.g., ``'latitude'``, ``'station_id'``). If omitted or non-numeric after NaN filtering, the original row order is used. acov : {'default', 'half_circle', 'quarter_circle', 'eighth_circle'}, \ default='default' Angular **coverage** of the plot: - ``'default'``: full circle, :math:`2\pi` - ``'half_circle'``: :math:`\pi` - ``'quarter_circle'``: :math:`\pi/2` - ``'eighth_circle'``: :math:`\pi/4` (Value is case-insensitive; invalid values fall back to full circle with a warning.) title : str, default='Anomaly Magnitude Polar Plot' Figure title. figsize : tuple[float, float], default=(8.0, 8.0) Figure size in inches; each dimension must be positive. cmap_under : str, default='Blues' Matplotlib colormap **name** used for *under-predictions* (:math:`y<q_\text{low}`). If invalid, a warning is issued and a default is used. cmap_over : str, default='Reds' Matplotlib colormap **name** used for *over-predictions* (:math:`y>q_\text{up}`). If invalid, a warning is issued and a default is used. s : int, default=30 Marker size for anomaly points (points with interval failure); must be positive. alpha : float, default=0.8 Point transparency in :math:`[0,1]`. show_grid : bool, default=True Whether to draw polar grid lines. verbose : int, default=1 Verbosity level. If :math:`>0`, prints a short anomaly summary (counts of under-/over-predictions). cbar : bool, default=False If ``True``, draw a colorbar for the **shared** magnitude normalization. The bar uses the ``cmap_over`` colormap for display, but its scale matches both under/over magnitudes. savefig : str, optional Path to save the figure (e.g., ``'anomaly_plot.png'``). If omitted, the figure is shown interactively. Errors during saving are reported via a printed message. mask_angle : bool, default=False If ``True``, hide angular tick labels (useful when the angle order is arbitrary or index-based). Returns ------- ax : matplotlib.axes.Axes or None The polar ``Axes`` containing the scatter plot. Returns ``None`` if the DataFrame becomes empty **after** dropping NaNs in the required columns and no plot can be produced. If no anomalies are found, an empty polar frame is returned with a notice text, not ``None``. Raises ------ ValueError - ``q_cols`` is not a 2-item sequence. - Any of ``actual_col``, the two ``q_cols``, or the provided ``theta_col`` (when used) is missing from ``df``. TypeError Required columns exist but cannot be coerced to numeric dtype after NaN handling. See Also -------- kdiagram.plot.uncertainty.plot_coverage : Aggregate empirical coverage comparison. kdiagram.plot.uncertainty.plot_coverage_diagnostic : Point-wise coverage successes/failures. kdiagram.plot.uncertainty.plot_interval_width : Magnitude of prediction-interval widths. kdiagram.plot.uncertainty.plot_interval_consistency : Stability of interval widths over time/steps. validate_qcols : Helper for validating the lower/upper quantile columns. Notes ----- - Anomalies are detected by comparing ``actual_col`` to the bounds in ``q_cols`` for each row. - Rows with NaNs in any required column (``actual_col``, the two quantile columns, and ``theta_col`` if provided) are dropped before analysis. - The anomaly magnitude :math:`r` is non-negative and measures the distance to the **exceeded** bound. - ``theta_col`` controls **ordering only**, not spacing. Angles are always linearly spaced within the selected coverage (:py:data:`acov`) and then assigned according to the sort order of ``theta_col``. - Color intensity for both under- and over-predictions reflects the shared normalization of magnitudes, based on the maximum observed :math:`r`. - The helper ``validate_qcols`` (from your utilities, e.g. *gofast*) performs initial structure checks on ``q_cols``. Let :math:`y_j` be the actual value, :math:`L_j` the lower bound, and :math:`U_j` the upper bound for location :math:`j`. 1. **Anomaly masks** - Under-prediction: :math:`M_{\text{under},j} = (y_j < L_j)` - Over-prediction: :math:`M_{\text{over},j} = (y_j > U_j)` 2. **Anomaly magnitude (radial coordinate :math:`r`)** .. math:: r_j = \begin{cases} L_j - y_j, & \text{if } y_j < L_j,\\ y_j - U_j, & \text{if } y_j > U_j,\\ 0, & \text{otherwise.} \end{cases} Only points with :math:`r_j > 0` are plotted. 3. **Angular coordinate (:math:`\theta`)** - For :math:`N` retained rows, define base angles .. math:: \theta'_k \;=\; \frac{k}{N}\,S, \qquad k=0,\ldots,N-1, where :math:`S \in \{2\pi, \pi, \pi/2, \pi/4\}` is the span implied by :py:data:`acov`. - If ``theta_col`` exists, sort by that column to obtain a permutation :math:`\pi`. The plotted angle for original row :math:`j` is :math:`\theta_{\pi^{-1}(j)}=\theta'_{\pi^{-1}(j)}`. If ``theta_col`` is absent, use the original row order: :math:`\theta_j=\theta'_j`. 4. **Color mapping** - Normalize magnitudes .. math:: \text{norm}(r_j) \;=\; \frac{r_j}{\max(r_1,\ldots,r_N) + \varepsilon}, with a small :math:`\varepsilon>0` for numerical safety. - Apply colormaps: ``cmap_under(norm(r_j))`` for under-predictions and ``cmap_over(norm(r_j))`` for over-predictions. Examples -------- >>> import pandas as pd >>> import numpy as np >>> from kdiagram.plot.uncertainty import plot_anomaly_magnitude **1. Random Example:** >>> np.random.seed(42) >>> N_points = 150 >>> df_anomaly_rand = pd.DataFrame({ ... 'id': range(N_points), ... 'actual': np.random.randn(N_points) * 5 + 10, ... 'pred_q10': np.random.randn(N_points) * 1 + 7, # Interval around 10 ... 'pred_q90': np.random.randn(N_points) * 1 + 13, ... 'feature_order': np.random.rand(N_points) * 100 # For ordering ... }) >>> # Introduce some anomalies >>> df_anomaly_rand.loc[5:15, 'actual'] = 0 # Under-predictions >>> df_anomaly_rand.loc[100:110, 'actual'] = 25 # Over-predictions >>> >>> ax_rand_anomaly = plot_anomaly_magnitude( ... df=df_anomaly_rand, ... actual_col='actual', ... q_cols=['pred_q10', 'pred_q90'], ... theta_col='feature_order', # Order by this feature ... acov='default', ... title='Random Anomaly Distribution', ... cmap_under='GnBu', ... cmap_over='OrRd', ... s=40, ... cbar=True, ... verbose=1 ... ) >>> # Output will show anomaly counts... >>> # plt.show() called internally **2. Concrete Example (Subsidence Data - adapted from docstring):** >>> # Assume small_sample_pred is a loaded DataFrame like: >>> # Create dummy data if it doesn't exist >>> try: ... small_sample_pred ... except NameError: ... print("Creating dummy small sample prediction data...") ... N_small = 200 ... small_sample_pred = pd.DataFrame({ ... 'subsidence_2023': np.random.rand(N_small)*15 + np.linspace(0, 5, N_small), ... 'subsidence_2023_q10': np.random.rand(N_small)*10, ... 'subsidence_2023_q90': np.random.rand(N_small)*10 + 10, ... 'latitude': np.linspace(22.3, 22.7, N_small) + np.random.randn(N_small)*0.01 ... }) ... # Ensure some anomalies exist in dummy data ... anom_indices_under = np.random.choice(N_small, 15, replace=False) ... anom_indices_over = np.random.choice( ... list(set(range(N_small)) - set(anom_indices_under)), 20, replace=False ... ) ... small_sample_pred.loc[anom_indices_under, 'subsidence_2023'] = ( ... small_sample_pred.loc[anom_indices_under, 'subsidence_2023_q10'] ... - np.random.rand(15)*5 - 1 ... ) ... small_sample_pred.loc[anom_indices_over, 'subsidence_2023'] = ( ... small_sample_pred.loc[anom_indices_over, 'subsidence_2023_q90'] ... + np.random.rand(20)*5 + 1 ... ) >>> ax_sub_anomaly = plot_anomaly_magnitude( ... df=small_sample_pred, ... actual_col='subsidence_2023', ... q_cols=['subsidence_2023_q10', 'subsidence_2023_q90'], ... theta_col='latitude', # Order points by latitude ... acov='quarter_circle', # Use only 90 degrees ... title='Anomaly Magnitude (2023) – Zhongshan', ... figsize=(9, 9), ... s=35, ... cbar=True, # Show colorbar ... mask_angle=True, # Hide angle labels ... verbose=1 # Print anomaly counts ... ) >>> # Output will show anomaly counts... >>> # plt.show() called internally References ---------- .. footbibliography:: """
[docs] @check_non_emptiness @isdf def plot_uncertainty_drift( df: pd.DataFrame, qlow_cols: list[str], qup_cols: list[str], dt_labels: list[str] | None = None, theta_col: str | None = None, acov: str = "default", base_radius: float = 0.15, band_height: float = 0.15, cmap: str = "tab10", label: str = "Year", alpha: float = 0.85, figsize: tuple[float, float] = (9, 9), title: str | None = None, show_grid: bool = True, show_legend: bool = True, mask_angle: bool = True, savefig: str | None = None, dpi: int = 300, ax: Axes | None = None, ): # --- Input Validation --- if len(qlow_cols) != len(qup_cols): raise ValueError( "Mismatched lengths for `qlow_cols` " f"({len(qlow_cols)}) and `qup_cols` ({len(qup_cols)})." ) num_time_steps = len(qlow_cols) if num_time_steps == 0: raise ValueError("Quantile column lists cannot be empty.") # Generate default labels if none provided if dt_labels is None: # Use the 'label' param as base for default labels time_labels = [f"{label}_{i + 1}" for i in range(num_time_steps)] else: if len(dt_labels) != num_time_steps: raise ValueError( f"Length of `dt_labels` ({len(dt_labels)}) must match " f"the number of time steps ({num_time_steps})." ) time_labels = list(dt_labels) # Ensure list type # Consolidate required columns and check existence all_cols = qlow_cols + qup_cols # Do not check theta_col here, handle warning later missing_cols = [col for col in all_cols if col not in df.columns] if missing_cols: raise ValueError( f"The following quantile columns are missing from the " f"DataFrame: {', '.join(missing_cols)}" ) # Handle theta_col warning if theta_col: if theta_col not in df.columns: warnings.warn( f"Specified `theta_col` ('{theta_col}') not found. " f"Using index for angular position.", UserWarning, stacklevel=2, ) else: warnings.warn( f"`theta_col` ('{theta_col}') is provided but currently " f"ignored for positioning. Using index for angular position.", UserWarning, stacklevel=2, ) # Prepare data: Drop rows with NaNs in relevant columns data = df[all_cols].dropna() if len(data) == 0: warnings.warn( "DataFrame is empty after dropping NaN values in quantile " "columns. Cannot generate plot.", UserWarning, stacklevel=2, ) return None N = len(data) # Number of valid data points (locations) # --- Calculate Interval Widths and Normalize Globally --- widths = [] try: for ql, qu in zip(qlow_cols, qup_cols): width_values = (data[qu] - data[ql]).to_numpy(dtype=float) if np.any(width_values < 0): warnings.warn( f"Negative interval widths detected for columns " f"'{qu}' and '{ql}'. Check if upper < lower bound.", UserWarning, stacklevel=2, ) # Ensure non-negative widths, clamp if necessary? Or just proceed. # width_values[width_values < 0] = 0 # Option to clamp widths.append(width_values) except Exception as e: raise TypeError( f"Could not compute widths. Ensure quantile columns " f"({ql}, {qu}) contain numeric data. Original error: {e}" ) from e # Find global maximum width across all years and locations if not widths: # Should not happen due to earlier checks, but safe return None # Calculate max, handle case where all widths might be zero or negative all_width_values = np.concatenate(widths) if widths else np.array([0]) max_width = np.max(all_width_values) if len(all_width_values) > 0 else 0.0 # Normalize widths using the global maximum normalized_widths = [] if max_width > 1e-9: # Avoid division by zero/near-zero normalized_widths = [w / max_width for w in widths] else: # If max width is zero, all normalized widths are zero normalized_widths = [np.zeros_like(w) for w in widths] warnings.warn( "Maximum interval width across all data is zero or near-zero. " "Normalized widths are all set to 0.", UserWarning, stacklevel=2, ) # --- Angular Coordinate Calculation --- acov_map = { # Map coverage name to (min_angle, max_angle) in radians "default": (0, 2 * np.pi), "half_circle": (0, np.pi), "quarter_circle": (0, np.pi / 2), "eighth_circle": (0, np.pi / 4), } theta_min_rad, theta_max_rad = acov_map.get(acov.lower(), (0, 2 * np.pi)) if acov.lower() not in acov_map: warnings.warn( f"Invalid `acov` value '{acov}'. Using 'default' (0 to 2*pi).", UserWarning, stacklevel=2, ) angular_range_rad = theta_max_rad - theta_min_rad # Generate theta values based on index, mapped to the specified range theta = ( np.linspace(0.0, 1.0, N, endpoint=True) # Linear space [0, 1] * angular_range_rad # Scale to range width + theta_min_rad ) # --- Plotting Setup --- if ax is None: fig, ax = plt.subplots( figsize=figsize, subplot_kw={"projection": "polar"} ) else: fig = ax.figure ax.set_thetamin(np.degrees(theta_min_rad)) # Expects degrees ax.set_thetamax(np.degrees(theta_max_rad)) # Expects degrees # Hide angular tick labels if requested if mask_angle: ax.set_xticklabels([]) # Configure grid if show_grid: ax.grid(True, linestyle="--", linewidth=0.5, alpha=0.7) else: ax.grid(False) # Hide radial ticks as they primarily separate years visually ax.set_yticks([]) # Get color palette color_palette = _sample_colors( cmap, num_time_steps, trim=0.08, default="tab10" ) # --- Draw Rings for Each Time Step --- for i, (w_norm, step_label) in enumerate( zip(normalized_widths, time_labels) ): # XXX TODO # Calculate base radius for this ring (increases for later years) # base_r = base_radius + i * some_increment # Alternative logic base_r = base_radius * ( i + 1 ) # Base offset increases multiplicatively # Calculate final radius: base + scaled normalized width # Ensure w_norm has the same length as theta (N) if len(w_norm) != N: # This should not happen if dropna was done correctly, but safeguard raise InternalError( "Mismatch between width data and theta length." ) # Or handle gracefully r = base_r + band_height * w_norm # Determine color for this ring, cycling if needed color = color_palette[i % len(color_palette)] # Plot the line for this time step # Ensure data wraps around by appending first point? Not needed # for line plot if endpoint=True in linspace? Check. # For visual continuity if range isn't full 2*pi, might not need wrap. # Let's omit wrap for now. ax.plot( theta, r, label=step_label, # Label for the legend color=color, # Color for this ring linewidth=1.8, # Line thickness alpha=alpha, # Transparency ) # --- Final Touches --- # Set plot title ax.set_title( title or "Multi-time Uncertainty Drift (Interval Width)", fontsize=14, y=1.08, ) # Add legend if requested if show_legend: # Use the provided 'label' parameter as the intended legend title # The example hardcodes 'Year'. Let's compromise: # use 'label' if given, else 'Time Step'. legend_title = label if label else "Time Step" ax.legend( loc="upper right", bbox_to_anchor=(1.25, 1.1), # Position outside plot title=legend_title, # Use dynamic title fontsize=9, ) # --- Save or Show --- if savefig is not None: safe_savefig(savefig, fig, bbox_inches="tight", dpi=dpi) plt.close(fig) else: fig.tight_layout() plt.show() return ax
plot_uncertainty_drift.__doc__ = r""" Polar plot visualizing temporal drift of uncertainty width. This function creates a polar line plot showing how the width of the prediction interval (e.g., Q90 - Q10), representing model uncertainty, evolves over multiple time steps (e.g., years) across different locations. Each time step is depicted as a distinct concentric ring [1]_. - **Angular Position (`theta`)**: Represents each location or data point. Currently derived from the DataFrame index, mapped linearly onto the angular range specified by `acov`. The optional `theta_col` parameter is intended for future use in ordering but is currently ignored for positioning. - **Radial Rings (`r`)**: Each ring corresponds to a specific time step provided via `qlow_cols`/`qup_cols`. The position of the ring (distance from the center) indicates the time step (later times are further out). The radius of the line at a specific angle (location) on a given ring is determined by a base offset for that year plus a component proportional to the *globally normalized* interval width at that location and time. Thus, the 'thickness' or deviation of a ring from a perfect circle reflects the magnitude of uncertainty (interval width) relative to the maximum width observed across all locations and times. - **Color**: Each ring (time step) is assigned a unique color based on the specified `cmap`, aiding in distinguishing and tracking changes across time steps. This visualization is particularly useful for: - Identifying locations where prediction uncertainty grows or shrinks significantly over the forecast horizon. - Monitoring the overall trend (drift) of uncertainty as forecasts extend further into the future. - Highlighting areas with consistently high or low uncertainty across all time steps. - Comparing the spatial patterns of uncertainty at different forecast lead times. Parameters ---------- df : pd.DataFrame Input DataFrame containing the quantile prediction columns. Decorators ensure it's a valid, non-empty pandas DataFrame. qlow_cols : list of str List of column names for the lower quantile bound (e.g., Q10) for consecutive time steps. Example: ``['pred_2023_q10', 'pred_2024_q10', ...]``. qup_cols : list of str List of column names for the upper quantile bound (e.g., Q90) for the same time steps as `qlow_cols`. Must be the same length. Example: ``['pred_2023_q90', 'pred_2024_q90', ...]``. dt_labels : list of str, optional List of labels for each time step, used in the legend to identify the rings. Must match the length of `qlow_cols`. If ``None``, generic labels like 'Time_1', 'Time_2', ... are generated based on the `label` parameter. Default is ``None``. theta_col : str, optional *Intended* column name for ordering points angularly. *Note: This parameter is currently ignored in the positioning logic; angles are based on the DataFrame index.* A warning is issued if provided. Default is ``None``. acov : {'default', 'half_circle', 'quarter_circle', \ 'eighth_circle'}, default='default' Specifies the angular coverage (span) of the plot: ``'default'`` (360°), ``'half_circle'`` (180°), ``'quarter_circle'`` (90°), ``'eighth_circle'`` (45°). base_radius : float, default=0.15 Determines the spacing between the base circles of consecutive time step rings. The base radial offset for the ring representing time step `i` (0-indexed) is calculated as ``base_radius * (i+1)``. A larger value increases the separation between rings. band_height : float, default=0.15 Scaling factor applied to the *normalized* interval width. This controls how much the radius deviates from the base circle for each ring, visually representing the uncertainty magnitude. Effectively, it's the maximum radial 'height' allocated to represent uncertainty on each ring. cmap : str, default='tab10' Name of the Matplotlib colormap used to assign distinct colors to the rings representing different time steps. label : str, default='Year' Base name used for generating default `dt_labels` if they are not provided (e.g., 'Year_1', 'Year_2', ...). *Note: This parameter is currently *not* used for the legend title itself.* alpha : float, default=0.85 Transparency level for the plotted lines (rings). figsize : tuple of (float, float), default=(9, 9) Width and height of the figure in inches. title : str, optional Title displayed above the polar plot. If ``None``, a default title is used. Default is ``None``. show_grid : bool, default=True If ``True``, display polar grid lines. show_legend : bool, default=True If ``True``, display a legend identifying the time step for each colored ring, using `dt_labels`. mask_angle : bool, default=True If ``True``, hide the angular tick labels (degrees). Recommended if the angular position is based on index. savefig : str, optional File path to save the plot image. If ``None``, displays the plot interactively. Default is ``None``. Returns ------- ax : matplotlib.axes.Axes The Matplotlib Axes object containing the polar line plot. Raises ------ AssertionError If `qlow_cols` and `qup_cols` lists have different lengths. ValueError If specified quantile columns are not found in the DataFrame. If data in quantile columns is not numeric. See Also -------- plot_interval_consistency : Plot consistency of interval widths using scatter points. numpy.linspace : Generate evenly spaced numbers. matplotlib.pyplot.plot : Plot lines. Notes ----- - Interval widths (:math:`W = Q_{upper} - Q_{lower}`) are calculated for each location and time step. - These widths are then *globally normalized* by dividing by the maximum width observed across all locations and all time steps. This ensures that the radial deviations are comparable across rings. - The radius for a point on ring `i` (0-indexed time step) is :math:`r = (\text{base}_{radius} \times (i+1)) + (\text{band}_{height} \times w_{normalized})`. - The angular coordinate `theta` is currently based on the DataFrame index, not influenced by `theta_col`. - Radial axis ticks and labels are hidden by default (`set_yticks([])`) as the radial dimension primarily separates the time steps. Let :math:`\mathbf{L}_i` and :math:`\mathbf{U}_i` be the lower and upper quantile bound vectors (length :math:`N`, number of locations) for time step :math:`i` (:math:`i = 0, \dots, M-1`). 1. **Interval Width Calculation**: For each time step :math:`i`, calculate the width vector: :math:`\mathbf{W}_i = \mathbf{U}_i - \mathbf{L}_i`. 2. **Global Normalization**: Find the maximum width across all locations and time steps: :math:`W_{max} = \max_{i,j} (\mathbf{W}_i)_j`. Normalize each width vector :math:`\mathbf{W}_i`: .. math:: \mathbf{w}_i = \frac{\mathbf{W}_i}{W_{max}} \quad (\text{element-wise}) If :math:`W_{max} = 0`, :math:`\mathbf{w}_i` will be all zeros. 3. **Angular Coordinate (`theta`)**: Let :math:`S` be the angular span and :math:`\theta_{min}` the start angle from `acov`. For location index :math:`j` (:math:`j=0, ..., N-1`): .. math:: \theta_j = \left( \frac{j}{N} \times S \right) + \theta_{min} 4. **Radial Coordinate (`r`)**: For time step :math:`i` and location :math:`j`: .. math:: r_{i,j} = (\text{base}_{radius} \times (i+1)) + \\ (\text{band}_{height} \times (\mathbf{w}_i)_j) 5. **Plotting**: For each time step :math:`i`, plot a line connecting points :math:`(r_{i,j}, \theta_j)` for :math:`j = 0, \dots, N-1`. References ---------- .. [1] Kouadio, K. L., Liu, R., Loukou, K. G. H., Liu, J., & Liu, W. (2025). Analytics Framework for Interpreting Spatiotemporal Probabilistic Forecasts. *International Journal of Forecasting*. Manuscript submitted. Examples -------- >>> import pandas as pd >>> import numpy as np >>> from kdiagram.plot.uncertainty import plot_uncertainty_drift **1. Random Example:** >>> np.random.seed(2) >>> N_points = 100 >>> df_drift_rand = pd.DataFrame({'location_id': range(N_points)}) >>> years = range(2020, 2024) >>> q10_drift_cols = [] >>> q90_drift_cols = [] >>> for i, year in enumerate(years): ... q10_col = f'q10_{year}' ... q90_col = f'q90_{year}' ... base_val = np.random.rand(N_points) * 10 ... width = (np.random.rand(N_points) + 0.5) * (2 + i) # Increasing width ... df_drift_rand[q10_col] = base_val - width / 2 ... df_drift_rand[q90_col] = base_val + width / 2 ... q10_drift_cols.append(q10_col) ... q90_drift_cols.append(q90_col) >>> ax_drift_rand = plot_uncertainty_drift( ... df=df_drift_rand, ... qlow_cols=q10_drift_cols, ... qup_cols=q90_drift_cols, ... dt_labels=[str(y) for y in years], ... theta_col='location_id', # Ignored for positioning ... acov='default', ... base_radius=0.1, # Smaller spacing ... band_height=0.1, # Smaller uncertainty scaling ... cmap='viridis', ... title='Random Uncertainty Drift Example', ... mute_angle=False # Show angle labels ... ) >>> # plt.show() called internally **2. Concrete Example (Subsidence Data - adapted):** >>> # Assume zhongshan_pred_2023_2026 is a loaded DataFrame like: >>> # Create dummy data if it doesn't exist >>> try: ... zhongshan_pred_2023_2026 ... except NameError: ... print("Creating dummy subsidence data for example...") ... N_sub = 150 ... zhongshan_pred_2023_2026 = pd.DataFrame({ ... 'latitude': np.linspace(22.2, 22.8, N_sub), ... **{f'subsidence_{yr}_q10': np.random.rand(N_sub)*(yr-2022)*2 + 1 ... for yr in range(2023, 2027)}, ... **{f'subsidence_{yr}_q90': np.random.rand(N_sub)*(yr-2022)*2 + 5 ... + np.linspace(0, (yr-2022)*3, N_sub) # Increasing width trend ... for yr in range(2023, 2027)}, ... }) >>> qlow_sub_drift = [f'subsidence_{yr}_q10' for yr in range(2023, 2027)] >>> qup_sub_drift = [f'subsidence_{yr}_q90' for yr in range(2023, 2027)] >>> year_labels_sub = [str(yr) for yr in range(2023, 2027)] >>> ax_sub_drift = plot_uncertainty_drift( ... df=zhongshan_pred_2023_2026, ... qlow_cols=qlow_sub_drift, ... qup_cols=qup_sub_drift, ... dt_labels=year_labels_sub, ... theta_col='latitude', # Ignored for positioning ... acov='half_circle', # Use 180 degrees ... title='Uncertainty Drift Over Time (Zhongshan)', ... cmap='tab10', ... band_height=0.1, # Controls visual width effect ... base_radius=0.2, # Controls spacing between years ... show_legend=True, ... mute_degree=True ... ) >>> # plt.show() called internally """
[docs] @check_non_emptiness @isdf def plot_actual_vs_predicted( df: pd.DataFrame, actual_col: str, pred_col: str, theta_col: str | None = None, acov: str = "default", figsize: tuple[float, float] = (8.0, 8.0), title: str | None = None, line: bool = True, r_label: str | None = None, cmap: str | None = None, alpha: float = 0.3, actual_props: dict[str, Any] | None = None, pred_props: dict[str, Any] | None = None, show_grid: bool = True, grid_props: dict | None = None, show_legend: bool = True, mask_angle: bool = False, dpi: int = 300, savefig: str | None = None, ax: Axes | None = None, ): # If theta_col is provided but currently unused, warn: if theta_col is not None: warnings.warn( "`theta_col` is currently ignored by plot_actual_vs_predicted.", UserWarning, stacklevel=2, ) # --- Input Validation and Preparation --- # Basic checks handled by decorators # Check existence of primary columns exist_features( df, features=[actual_col, pred_col], error="raise", # Raise error if missing name="Actual and Predicted columns", ) # Consolidate columns needed, check theta_col existence only if specified cols_to_select = [actual_col, pred_col] if theta_col: if theta_col not in df.columns: warnings.warn( f"Specified `theta_col` ('{theta_col}') not found. " f"Using index for angular position.", UserWarning, stacklevel=2, ) # Proceed without theta_col for ordering else: warnings.warn( f"`theta_col` ('{theta_col}') is provided but currently " f"ignored for positioning/ordering. Using index.", UserWarning, stacklevel=2, ) # Although ignored, keep it for potential future use if needed? # For now, just select it if present, even if unused later. # cols_to_select.append(theta_col) # Decided against adding if unused # Drop rows with NaNs in essential columns data = df[cols_to_select].dropna().copy() if len(data) == 0: warnings.warn( "DataFrame is empty after dropping NaN values in actual" " and predicted columns. Cannot generate plot.", UserWarning, stacklevel=2, ) return None N = len(data) if N > 2_000: warnings.warn( "Large number of samples; consider " "saving the figure instead of showing.", UserWarning, stacklevel=2, ) # --- Angular Coordinate Calculation --- acov_map = { # Map name to angular range in radians "default": 2 * np.pi, "half_circle": np.pi, "quarter_circle": np.pi / 2, "eighth_circle": np.pi / 4, } angular_range = acov_map.get(acov.lower(), 2 * np.pi) if acov.lower() not in acov_map: warnings.warn( f"Invalid `acov` value '{acov}'. Using 'default' (2*pi).", UserWarning, stacklevel=2, ) # Calculate theta based on index, mapped to the angular range # Use endpoint=False if using lines to avoid overlap at 2pi? # If using dots, endpoint=True or False matters less visually. # Let's use endpoint=False for lines, True for dots for potentially # better spacing. use_endpoint = not line theta = np.linspace(0.0, angular_range, N, endpoint=use_endpoint) # --- Extract Data --- try: actual = data[actual_col].to_numpy(dtype=float) pred = data[pred_col].to_numpy(dtype=float) except (ValueError, TypeError) as e: raise TypeError( f"Failed to convert actual or predicted columns to numeric." f" Check data types. Original error: {e}" ) from e # --- Plotting Setup --- if ax is None: fig = plt.figure(figsize=figsize, constrained_layout=True) ax = fig.add_subplot(111, projection="polar") else: fig = ax.figure ax.set_thetamin(0) ax.set_thetamax(np.degrees(angular_range)) # Expects degrees # NEW: move radial tick labels away from the title/left edge ax.set_rlabel_position( 225 ) # SW quadrant works well for default orientation set_axis_grid(ax, show_grid=show_grid, grid_props=grid_props) if mask_angle: ax.set_xticklabels([]) # --- Plot Difference Lines --- # Warning: This loop can be very slow for large N # Consider alternatives like fill_between if performance is critical # and data can be meaningfully sorted by theta. if N > 5_000: # Add warning for potentially slow loop warnings.warn( f"Plotting difference lines for {N} points individually." f" This may be slow. Consider using `line=False` or sampling data.", PerformanceWarning, stacklevel=2, ) for t, a, p in zip(theta, actual, pred): # Plot a vertical line segment at angle t between actual and pred ax.plot( [t, t], # Start and end angle (same) [min(a, p), max(a, p)], # Start and end radius color="gray", # Hardcoded color for difference alpha=alpha, # Use specified transparency linewidth=1, # Fixed linewidth for diff lines ) # --- Plot Actual and Predicted Data (Lines or Dots) --- # Define default properties, merge with user-provided props default_actual_props_line = { "color": "black", "linewidth": 1.5, "label": "Actual", } default_pred_props_line = { "color": "red", "linewidth": 1.5, "label": "Predicted (Q50)", } default_actual_props_scatter = { "color": "black", "s": 20, "label": "Actual", } default_pred_props_scatter = { "color": "red", "s": 20, "alpha": alpha, "label": "Predicted (Q50)", } # Ensure user props are dictionaries if provided actual_props = actual_props or {} pred_props = pred_props or {} _assert_all_types(actual_props, dict, objname="'actual_props'") _assert_all_types(pred_props, dict, objname="'pred_props'") if line: # Merge user props with defaults for line plot current_actual_props = {**default_actual_props_line, **actual_props} current_pred_props = {**default_pred_props_line, **pred_props} # Ensure theta wraps around if using full circle for line plot theta_plot = ( np.append(theta, theta[0]) if np.isclose(angular_range, 2 * np.pi) else theta ) actual_plot = ( np.append(actual, actual[0]) if np.isclose(angular_range, 2 * np.pi) else actual ) pred_plot = ( np.append(pred, pred[0]) if np.isclose(angular_range, 2 * np.pi) else pred ) ax.plot(theta_plot, actual_plot, **current_actual_props) ax.plot(theta_plot, pred_plot, **current_pred_props) else: # Merge user props with defaults for scatter plot current_actual_props = { **default_actual_props_scatter, **actual_props, } current_pred_props = {**default_pred_props_scatter, **pred_props} ax.scatter(theta, actual, **current_actual_props) ax.scatter(theta, pred, **current_pred_props) # --- Final Touches --- ax.set_title( title or "Actual vs Predicted Polar Plot", fontsize=14, y=1.08 ) if r_label: # Use set_ylabel for radial axis label, adjust padding ax.set_ylabel(r_label, labelpad=32, fontsize=10) # _place_polar_ylabel(ax, r_label, angle=225, labelpad=32) # Add legend if requested and labels were provided in props if show_legend: # Check if labels exist before showing legend handles, labels = ax.get_legend_handles_labels() if labels: # Only show legend if there's something to label ax.legend( loc="upper right", bbox_to_anchor=(1.25, 1.1), fontsize=9 ) else: warnings.warn( "Legend requested but no labels found for plot elements.", UserWarning, stacklevel=2, ) # --- Save or Show --- final = safe_savefig( savefig, fig, # or 'ax' — both work dpi=dpi, bbox_inches="tight", pad_inches=0.2, ) if final is not None: plt.close(fig) else: plt.show() return ax
plot_actual_vs_predicted.__doc__ = r""" Polar plot comparing actual observed vs predicted values. This function generates a polar plot to visually compare actual ground truth values against model predictions (typically a central estimate like the median, Q50) for multiple data points or locations arranged circularly [1]_. - **Angular Position (`theta`)**: Represents each data point or location. Points are currently plotted in their DataFrame index order, mapped linearly onto the specified angular coverage (`acov`). The `theta_col` parameter is intended for future use in ordering points based on a specific feature (like latitude) but is currently ignored for positioning. - **Radial Distance (`r`)**: Represents the magnitude of the values. Both the actual value (`actual_col`) and the predicted value (`pred_col`) are plotted at the corresponding angle `theta`. - **Visual Comparison**: - Actual and predicted values are shown as either continuous lines or individual dots based on the `line` parameter [2]_. - Gray vertical lines connect the actual and predicted values at each angle, visually highlighting the magnitude and direction (over- or under-prediction) of the difference at each point. This plot facilitates: - Quick visual assessment of prediction accuracy and bias across samples. - Identification of regions or conditions (if angle relates to a feature) where the model performs well or poorly. - Communication of model performance to stakeholders. Parameters ---------- df : pd.DataFrame Input DataFrame containing actual and predicted value columns. Decorators ensure it's a valid, non-empty pandas DataFrame. actual_col : str Name of the column holding the actual observed (ground truth) values. pred_col : str Name of the column holding the corresponding predicted values (e.g., the Q50 median prediction). theta_col : str, optional *Intended* column name for ordering points angularly based on its values (e.g., 'latitude'). *Note: This parameter is currently ignored for positioning/ordering in this implementation; points use DataFrame index order.* A warning is issued if provided. Default is ``None``. acov : {'default', 'half_circle', 'quarter_circle', \ 'eighth_circle'}, default='default' Specifies the angular coverage (span) of the polar plot: ``'default'`` (360°), ``'half_circle'`` (180°), ``'quarter_circle'`` (90°), ``'eighth_circle'`` (45°). figsize : tuple of (float, float), default=(8.0, 8.0) Width and height of the figure in inches. title : str, optional Custom title for the plot. If ``None``, a default title is used. Default is ``None``. line : bool, default=True Determines the plotting style: - If ``True``, actual and predicted values are plotted as lines connecting consecutive points. - If ``False``, values are plotted as individual scatter dots. r_label : str, optional Custom label for the radial axis (representing value magnitude). If ``None``, no label is set. Default is ``None``. cmap : str, optional *Note: This parameter is currently unused in the function.* It might be intended for future use, perhaps coloring the difference lines. Default is ``None``. alpha : float, default=0.3 Transparency level applied to the gray difference lines drawn between actual and predicted values, and also to the predicted dots if ``line=False``. actual_props : dict, optional Dictionary of keyword arguments passed directly to the Matplotlib `plot` or `scatter` function for the 'Actual' data series. Allows customization (e.g., ``{'color': 'blue', 'linestyle': '--'}``). Defaults to basic black line/dots if ``None``. pred_props : dict, optional Dictionary of keyword arguments passed directly to the Matplotlib `plot` or `scatter` function for the 'Predicted' data series. Allows customization (e.g., ``{'color': 'orange', 'marker': 'x'}``). Defaults to basic red line/dots if ``None``. show_grid : bool, default=True If ``True``, display the polar grid lines. show_legend : bool, default=True If ``True``, display a legend labeling the 'Actual' and 'Predicted' series. mute_degree : bool, default=False If ``True``, hide the angular tick labels (degrees). savefig : str, optional File path to save the plot image. If ``None``, displays the plot interactively. Default is ``None``. Returns ------- ax : matplotlib.axes._axes.Axes The Matplotlib Axes object containing the polar plot. Note that due to subplot_kw, it's specifically a PolarAxesSubplot. Raises ------ ValueError If `actual_col` or `pred_col` are not found in `df`. TypeError If data in actual or predicted columns is not numeric. See Also -------- plot_anomaly_magnitude : Visualize only points outside prediction intervals. matplotlib.pyplot.plot : Function for line plots. matplotlib.pyplot.scatter : Function for scatter plots. Notes ----- - Rows with NaN values in `actual_col` or `pred_col` (or `theta_col` if specified, though currently unused for position) are dropped. - The gray lines indicating the difference are drawn individually for each point using a loop. *Warning: This approach can be very slow for large datasets (many thousands of points).* An alternative like `fill_between` might be more efficient for showing shaded areas but would require sorting by theta. - The `theta_col` parameter is currently ignored for positioning; angles are always based on the DataFrame index order after NaN removal. - The `cmap` parameter is currently unused. The difference lines are hardcoded to 'gray'. - Default plotting styles are black for actual and red for predicted, but can be overridden using `actual_props` and `pred_props` [1]_. Let :math:`y_j` be the actual value and :math:`\hat{y}_j` the predicted value for data point (location) :math:`j` (:math:`j=0, \dots, N-1` after NaN removal). 1. **Angular Coordinate (`theta`)**: Let :math:`S` be the angular span and :math:`\theta_{min}` the start angle from `acov`. .. math:: \theta_j = \left( \frac{j}{N} \times S \right) + \theta_{min} 2. **Radial Coordinates**: The radial coordinates are directly the values: :math:`r_{actual, j} = y_j` and :math:`r_{pred, j} = \hat{y}_j`. 3. **Plotting**: - Plot points/lines connecting :math:`(r_{actual, j}, \theta_j)` and :math:`(r_{pred, j}, \theta_j)`. - For each :math:`j`, draw a gray line segment connecting the points :math:`(\min(y_j, \hat{y}_j), \theta_j)` and :math:`(\max(y_j, \hat{y}_j), \theta_j)`. References ---------- .. [1] Kouadio, K. L., Liu, R., Loukou, K. G. H., Liu, J., & Liu, W. (2025). Analytics Framework for Interpreting Spatiotemporal Probabilistic Forecasts. *International Journal of Forecasting*. Manuscript submitted. .. [2] Matplotlib documentation: https://matplotlib.org/stable/ Examples -------- >>> import pandas as pd >>> import numpy as np >>> from kdiagram.plot.uncertainty import plot_actual_vs_predicted **1. Random Example:** >>> np.random.seed(0) >>> N = 100 >>> df_avp_rand = pd.DataFrame({ ... 'Time': pd.date_range('2023-01-01', periods=N, freq='D'), ... 'ActualTemp': 15 + 10 * np.sin(np.linspace(0, 4 * np.pi, N)) + np.random.randn(N) * 2, ... 'PredictedTemp': 16 + 9 * np.sin(np.linspace(0, 4 * np.pi, N) + 0.1) + np.random.randn(N) * 1.5 ... }) >>> ax_avp_rand = plot_actual_vs_predicted( ... df=df_avp_rand, ... actual_col='ActualTemp', ... pred_col='PredictedTemp', ... theta_col='Time', # Note: Ignored for positioning ... acov='default', ... title='Temperature: Actual vs. Predicted', ... line=True, # Use lines ... r_label='Temperature (°C)', ... actual_props={'color': 'navy', 'linestyle': '-'}, ... pred_props={'color': 'crimson', 'linestyle': '--'} ... ) >>> # plt.show() called internally **2. Concrete Example (Subsidence Data - using dots):** >>> # Assume zhongshan_pred_2023_2026 is a loaded DataFrame >>> # Create dummy data if it doesn't exist >>> try: ... zhongshan_pred_2023_2026 ... except NameError: ... print("Creating dummy subsidence data for example...") ... N_sub = 150 ... zhongshan_pred_2023_2026 = pd.DataFrame({ ... 'latitude': np.linspace(22.2, 22.8, N_sub), ... 'subsidence_2023': np.random.rand(N_sub)*15 + np.linspace(0, 5, N_sub), ... 'subsidence_2023_q50': np.random.rand(N_sub)*14 + np.linspace(0.5, 5.5, N_sub), ... # Add other columns if needed by other examples ... **{f'subsidence_{yr}_q10': np.random.rand(N_sub)*(yr-2022)*2 + 1 ... for yr in range(2023, 2027)}, ... **{f'subsidence_{yr}_q90': np.random.rand(N_sub)*(yr-2022)*2 + 5 ... + np.linspace(0, (yr-2022)*3, N_sub) ... for yr in range(2023, 2027)}, ... }) >>> ax_avp_sub = plot_actual_vs_predicted( ... df=zhongshan_pred_2023_2026.head(100), # Use subset for speed ... actual_col='subsidence_2023', ... pred_col='subsidence_2023_q50', ... theta_col='latitude', # Note: Ignored for positioning ... acov='half_circle', # Use 180 degrees ... title='Actual vs Predicted Subsidence (2023)', ... line=False, # Use dots instead of lines ... r_label="Subsidence (mm)", ... mute_degree=True, ... pred_props={'marker': 'x', 'color': 'purple'} # Customize predicted dots ... ) >>> # plt.show() called internally """
[docs] @check_non_emptiness @isdf def plot_interval_width( df: pd.DataFrame, q_cols: list[str] | tuple[str, str], theta_col: str | None = None, z_col: str | None = None, acov: str = "default", figsize: tuple[float, float] = (8.0, 8.0), title: str | None = None, cmap: str = "viridis", s: int = 30, alpha: float = 0.8, show_grid: bool = True, grid_props: dict | None = None, cbar: bool = True, mask_angle: bool = True, savefig: str | None = None, dpi: int = 300, ax: Axes | None = None, ): # --- Input Validation --- # Basic df checks handled by decorators q_cols_processed = columns_manager(q_cols, empty_as_none=False) if len(q_cols_processed) != 2: raise TypeError( "`q_cols` expects exactly two column names: " "[lower_quantile_column, upper_quantile_column]. " f"Received {len(q_cols_processed)} columns: {q_cols_processed}" ) # Assign validated lower and upper quantile columns qlow_col, qup_col = q_cols_processed[0], q_cols_processed[1] # Consolidate list of columns needed for processing and NaN checks cols_needed = [qlow_col, qup_col] if theta_col: # Although unused for positioning, check if exists if provided if theta_col not in df.columns: warnings.warn( f"Specified `theta_col` ('{theta_col}') not found. " f"It will be ignored.", UserWarning, stacklevel=2, ) else: # Add to list for potential future use or NaN check if desired # cols_needed.append(theta_col) # Currently ignored, don't add warnings.warn( f"`theta_col` ('{theta_col}') is provided but currently " f"ignored for positioning/ordering. Using index.", UserWarning, stacklevel=2, ) if z_col: if z_col not in df.columns: raise ValueError( f"Specified `z_col` ('{z_col}') not found in DataFrame." ) cols_needed.append(z_col) # Check existence of essential quantile columns (redundant if exist_features used) missing_essential = [ col for col in [qlow_col, qup_col] if col not in df.columns ] if missing_essential: raise ValueError( f"Essential quantile columns missing: {', '.join(missing_essential)}" ) # Also check z_col again if it was added if z_col and z_col not in df.columns: raise ValueError(f"`z_col` ('{z_col}') not found.") # Drop rows with NaNs in the essential columns used for plotting (q, z) data = df[cols_needed].dropna().copy() if len(data) == 0: warnings.warn( "DataFrame is empty after dropping NaN values." " Cannot generate plot.", UserWarning, stacklevel=2, ) return None N = len(data) # Number of valid data points # --- Calculate Radial Coordinate (Interval Width) --- try: r = data[qup_col].to_numpy(dtype=float) - data[qlow_col].to_numpy( dtype=float ) except Exception as e: raise TypeError( f"Could not compute interval width. Ensure columns '{qup_col}'" f" and '{qlow_col}' contain numeric data. Original error: {e}" ) from e # Check for negative widths if np.any(r < 0): num_negative = np.sum(r < 0) warnings.warn( f"{num_negative} out of {N} locations have negative interval " f"width ({qup_col} < {qlow_col}). These will be plotted with " f"negative radius, potentially causing visual artifacts.", UserWarning, stacklevel=2, ) # Option: clamp negative radius to 0? # r = np.maximum(r, 0) # --- Calculate Color Coordinate (Z-value) --- if z_col: try: z = data[z_col].to_numpy(dtype=float) cbar_label = z_col # Label colorbar with column name except Exception as e: raise TypeError( f"Could not use `z_col` ('{z_col}') for color. Ensure it " f"contains numeric data. Original error: {e}" ) from e else: # Default: color by interval width `r` z = r cbar_label = "Interval Width" # Label colorbar appropriately # --- Angular Coordinate Calculation --- acov_map = { # Map name to angular range in radians "default": 2 * np.pi, "half_circle": np.pi, "quarter_circle": np.pi / 2, "eighth_circle": np.pi / 4, } angular_range = acov_map.get(acov.lower(), 2 * np.pi) if acov.lower() not in acov_map: warnings.warn( f"Invalid `acov` value '{acov}'. Using 'default' (2*pi).", UserWarning, stacklevel=2, ) # Calculate theta based on index, mapped to the angular range # Using endpoint=False might give slightly better visual spacing for scatter theta = np.linspace(0.0, angular_range, N, endpoint=False) # --- Color Normalization --- # Check if z has variance for normalization if np.ptp(z) > 1e-9: # Check peak-to-peak range norm = Normalize(vmin=np.min(z), vmax=np.max(z)) else: # Handle constant z case: map all to middle color norm = Normalize(vmin=np.min(z) - 0.5, vmax=np.max(z) + 0.5) warnings.warn( f"Color values ('{cbar_label}') have zero range." f" All points will have the same color.", UserWarning, stacklevel=2, ) try: cmap_ref = get_cmap(cmap) colors = cmap_ref(norm(z)) except ValueError: warnings.warn( f"Invalid `cmap` name '{cmap}'. Falling back to 'viridis'.", stacklevel=2, ) cmap = "viridis" # Ensure cmap is valid for fallback cmap_ref = get_cmap(cmap) colors = cmap_ref(norm(z)) # --- Plotting --- if ax is None: fig, ax = plt.subplots( figsize=figsize, subplot_kw={"projection": "polar"} ) else: fig = ax.figure ax.set_thetamin(0) ax.set_thetamax(np.degrees(angular_range)) # Expects degrees # Apply grid and angle label settings set_axis_grid(ax, show_grid, grid_props=grid_props) if mask_angle: ax.set_xticklabels([]) # Create the scatter plot ax.scatter( theta, r, # Radius is interval width c=colors, # Point color based on z s=s, # Point size alpha=alpha, # Point transparency edgecolor="k", # Point edge color (optional) linewidth=0.5, # Point edge width (optional) # cmap=cmap_ref is implicitly used via 'colors', no need to pass here ) # Set plot title ax.set_title( title or f"Prediction Interval Width ({qup_col} - {qlow_col})", fontsize=14, y=1.08, ) # Add color bar if requested if cbar: # Create a ScalarMappable for the colorbar sm = cm.ScalarMappable(norm=norm, cmap=cmap_ref) sm.set_array([]) # Necessary for ScalarMappable # Add the colorbar to the figure cbar_obj = plt.colorbar(sm, ax=ax, pad=0.1, shrink=0.7) # Set label dynamically based on whether z_col was used cbar_obj.set_label(cbar_label, rotation=270, labelpad=32, fontsize=10) # Set radial axis label (useful to know radius means width) ax.set_ylabel("Interval Width", labelpad=32, fontsize=10) # Set radial limits, ensure 0 is included if widths are non-negative if np.all(r >= 0): ax.set_ylim(bottom=0) # else: let matplotlib auto-scale if negative widths exist plt.tight_layout() # Adjust layout # --- Save or Show --- final = safe_savefig( savefig, fig, dpi=dpi, bbox_inches="tight", ) if final is not None: plt.close(fig) else: fig.tight_layout() plt.show() return ax
plot_interval_width.__doc__ = r""" Polar scatter plot visualizing prediction interval width. This function generates a polar scatter plot to visualize the magnitude of prediction uncertainty, represented by the width of the prediction interval (Upper Quantile - Lower Quantile), across different locations or samples. - **Angular Position (`theta`)**: Represents each location or data point. Currently derived from the DataFrame index, mapped linearly onto the specified angular coverage (`acov`). The optional `theta_col` parameter is intended for future use in ordering but is currently ignored for positioning. - **Radial Distance (`r`)**: Directly represents the width of the prediction interval (:math:`Q_{upper} - Q_{lower}`). A larger radius indicates greater predicted uncertainty for that point. - **Color (`z`)**: Optionally represents a third variable, specified by `z_col` (e.g., the median prediction Q50, or the actual value). This allows for correlating the uncertainty width with another metric. If `z_col` is not provided, the color defaults to representing the interval width (`r`) itself [1]_. This plot helps to: - Identify locations with high or low prediction uncertainty. - Visualize the spatial distribution or sample distribution of uncertainty magnitude. - Explore potential correlations between uncertainty width and other variables (like the central prediction) when using `z_col`. Parameters ---------- df : pd.DataFrame Input DataFrame containing the quantile prediction columns and optionally columns for theta ordering and color (`z_col`). Decorators ensure it's a valid, non-empty pandas DataFrame. q_cols : list or tuple of str A sequence containing exactly two string elements: the column name for the lower quantile bound (e.g., 'prediction_q10') and the column name for the upper quantile bound (e.g., 'prediction_q90'). The order must be ``[lower_col, upper_col]``. theta_col : str, optional *Intended* column name for ordering points angularly based on its values (e.g., 'latitude'). *Note: This parameter is currently ignored for positioning/ordering in this implementation; points use DataFrame index order.* A warning is issued if provided. Default is ``None``. z_col : str, optional Name of the column whose values will be used to color the scatter points. Common choices include the median prediction column (e.g., 'prediction_q50') or an actual value column to see if high/low uncertainty correlates with high/low values. If ``None``, the color will represent the interval width (`r`) itself. Default is ``None``. acov : {'default', 'half_circle', 'quarter_circle', \ 'eighth_circle'}, default='default' Specifies the angular coverage (span) of the polar plot: ``'default'`` (360°), ``'half_circle'`` (180°), ``'quarter_circle'`` (90°), ``'eighth_circle'`` (45°). figsize : tuple of (float, float), default=(8.0, 8.0) Width and height of the figure in inches. title : str, optional Custom title for the plot. If ``None``, a default title like "Prediction Interval Width (Q_upper - Q_lower)" is used. Default is ``None``. cmap : str, default='viridis' Name of the Matplotlib colormap used to color the points based on the `z` value (from `z_col` or defaulting to interval width `r`). s : int, default=30 Marker size for the scatter points. alpha : float, default=0.8 Transparency level for the scatter points (0=transparent, 1=opaque). show_grid : bool, default=True If ``True``, display the polar grid lines. cbar : bool, default=True If ``True``, display a color bar indicating the mapping between colors and the `z` values. mask_angle : bool, default=True If ``True``, hide the angular tick labels (degrees). Recommended if the angle is based on index. savefig : str, optional File path to save the plot image. If ``None``, displays the plot interactively. Default is ``None``. Returns ------- ax : matplotlib.axes._axes.Axes The Matplotlib Axes object containing the polar scatter plot, specifically a PolarAxesSubplot. Raises ------ TypeError If `q_cols` does not contain exactly two elements. ValueError If specified columns in `q_cols`, `theta_col` (if provided), or `z_col` (if provided) are not found in the DataFrame `df`. If data in the required columns is not numeric. See Also -------- plot_interval_consistency : Visualize the temporal consistency of interval widths. plot_uncertainty_drift : Visualize the temporal drift of interval widths as rings. matplotlib.pyplot.scatter : Function used for plotting points. matplotlib.colors.Normalize : Used for scaling color values. Notes ----- - Rows with NaN values in any of the required columns (`q_cols`, `theta_col` if used, `z_col` if used) are dropped before plotting. - The interval width `r` is calculated as `Upper Quantile - Lower Quantile`. If the lower quantile is greater than the upper quantile for any point, this will result in a negative width, which might lead to unexpected plotting behavior as radius is typically non-negative. A warning is issued if negative widths are detected. - The angular coordinate `theta` is derived from the DataFrame index after NaN removal, not influenced by `theta_col` currently. - Color is determined by the `z_col` values if provided, otherwise it defaults to representing the interval width `r`. Let :math:`L_j` and :math:`U_j` be the lower and upper quantile bound values for data point (location) :math:`j` (:math:`j=0, \dots, N-1` after NaN removal). Let :math:`z'_j` be the value from `z_col` for point :math:`j`, if `z_col` is provided. 1. **Radial Coordinate (`r`)**: Interval Width. .. math:: r_j = U_j - L_j 2. **Color Value (`z`)**: .. math:: z_j = \begin{cases} z'_j & \text{if } z\_col \text{ is provided}\\ \\ r_j & \text{if } z\_col \text{ is None} \end{cases} 3. **Angular Coordinate (`theta`)**: Let :math:`S` be the angular span and :math:`\theta_{min}` the start angle from `acov`. .. math:: \theta_j = \left( \frac{j}{N} \times S \right) + \theta_{min} 4. **Plotting**: Plot points :math:`(r_j, \theta_j)` using `scatter`, where the color of each point is determined by applying `cmap` to the normalized value of :math:`z_j`. References ---------- .. [1] Kouadio, K. L., Liu, R., Loukou, K. G. H., Liu, J., & Liu, W. (2025). Analytics Framework for Interpreting Spatiotemporal Probabilistic Forecasts. *International Journal of Forecasting*. Manuscript submitted. Examples -------- >>> import pandas as pd >>> import numpy as np >>> from kdiagram.plot.uncertainty import plot_interval_width **1. Random Example:** >>> np.random.seed(1) >>> N = 120 >>> df_iw_rand = pd.DataFrame({ ... 'sample_id': range(N), ... 'latitude': np.linspace(40, 42, N) + np.random.randn(N)*0.05, ... 'q10_pred': np.random.rand(N) * 10, ... 'q50_pred': np.random.rand(N) * 10 + 5, ... 'q90_pred': np.random.rand(N) * 10 + 10, # Width varies ... }) >>> # Ensure Q90 > Q10 >>> df_iw_rand['q90_pred'] = df_iw_rand['q10_pred'] + np.abs( ... df_iw_rand['q90_pred'] - df_iw_rand['q10_pred']) >>> >>> ax_iw_rand = plot_interval_width( ... df=df_iw_rand, ... q_cols=['q10_pred', 'q90_pred'], # Pass as list [lower, upper] ... z_col='q50_pred', # Color by median prediction ... theta_col='latitude', # Ignored for positioning ... acov='default', ... title='Interval Width vs. Median Prediction', ... cmap='plasma', ... s=40, ... cbar=True ... ) >>> # plt.show() called internally **2. Concrete Example (Subsidence Data):** >>> # Assume zhongshan_pred_2023_2026 is a loaded DataFrame >>> # Create dummy data if it doesn't exist >>> try: ... zhongshan_pred_2023_2026 ... except NameError: ... print("Creating dummy subsidence data for example...") ... N_sub = 150 ... zhongshan_pred_2023_2026 = pd.DataFrame({ ... 'latitude': np.linspace(22.2, 22.8, N_sub), ... 'subsidence_2023_q10': np.random.rand(N_sub)*5 + 1, ... 'subsidence_2023_q50': np.random.rand(N_sub)*10 + 3, ... 'subsidence_2023_q90': np.random.rand(N_sub)*5 + 6 + np.linspace(0, 10, N_sub), ... # Ensure q90 > q10 ... **{f'subsidence_{yr}_q10': np.random.rand(N_sub)*(yr-2022)*2 + 1 ... for yr in range(2024, 2027)}, # Add other cols if needed ... **{f'subsidence_{yr}_q90': np.random.rand(N_sub)*(yr-2022)*2 + 5 ... + np.linspace(0, (yr-2022)*3, N_sub) ... for yr in range(2024, 2027)}, ... }) >>> # Ensure Q90 > Q10 for the primary year >>> zhongshan_pred_2023_2026['subsidence_2023_q90'] = ( ... zhongshan_pred_2023_2026['subsidence_2023_q10'] + ... np.abs(zhongshan_pred_2023_2026['subsidence_2023_q90'] - ... zhongshan_pred_2023_2026['subsidence_2023_q10']) + 0.1 ... ) >>> ax_iw_sub = plot_interval_width( ... df=zhongshan_pred_2023_2026.head(100), # Use subset ... q_cols=['subsidence_2023_q10', 'subsidence_2023_q90'], # Use list ... z_col='subsidence_2023_q50', # Color by Q50 ... theta_col='latitude', # Ignored for positioning ... acov='quarter_circle', # Use 90 degrees ... title='Spatial Spread of Uncertainty (2023)', ... cmap='YlGnBu', ... s=25, ... cbar=True, # Show colorbar for Q50 ... mask_angle=True ... ) >>> # plt.show() called internally """
[docs] @check_non_emptiness @isdf def plot_coverage_diagnostic( df: pd.DataFrame, actual_col: str, q_cols: list[str] | tuple[str, str], theta_col: str | None = None, acov: str = "default", figsize: tuple[float, float] = (8.0, 8.0), title: str | None = None, show_grid: bool = True, grid_props: dict | None = None, cmap: str = "RdYlGn", alpha: float = 0.85, s: int = 35, as_bars: bool = False, coverage_line_color: str = "r", buffer_pts: int = 500, fill_gradient: bool = True, gradient_size: int = 300, gradient_cmap: str = "Greens", gradient_levels: list[float] | None = None, gradient_props: dict[str, Any] | None = None, mask_angle: bool = True, savefig: str | None = None, dpi: int = 300, verbose: int = 0, ax: Axes | None = None, ): q_cols_processed = columns_manager(q_cols, empty_as_none=False) if len(q_cols_processed) != 2: raise TypeError( "`q_cols` expects exactly two column names: " "[lower_quantile_column, upper_quantile_column]. " f"Received {len(q_cols_processed)} columns: {q_cols_processed}" ) qlow_col, qup_col = q_cols_processed[0], q_cols_processed[1] # Consolidate needed columns and check existence cols_needed = [actual_col, qlow_col, qup_col] # Handle theta_col warning (existence checked implicitly by df[cols]) if theta_col: if theta_col not in df.columns: warnings.warn( f"Specified `theta_col` ('{theta_col}') not found. " f"Using index order.", UserWarning, stacklevel=2, ) # Don't add to cols_needed if missing else: warnings.warn( f"`theta_col` ('{theta_col}') is provided but currently " f"ignored for positioning/ordering. Using index order.", UserWarning, stacklevel=2, ) missing_essential = [ col for col in [actual_col, qlow_col, qup_col] if col not in df.columns ] if missing_essential: raise ValueError( f"Essential columns missing: {', '.join(missing_essential)}" ) # Drop rows with NaNs in essential columns data = df[cols_needed].dropna().copy() if len(data) == 0: warnings.warn( "DataFrame is empty after dropping NaN values." " Cannot generate plot.", UserWarning, stacklevel=2, ) return None N = len(data) # Number of valid points # --- Calculate Coverage --- try: y = data[actual_col].to_numpy(dtype=float) y_lo = data[qlow_col].to_numpy(dtype=float) y_hi = data[qup_col].to_numpy(dtype=float) except Exception as e: raise TypeError( f"Could not convert actual or quantile columns to numeric." f" Check data types. Original error: {e}" ) from e # Binary coverage indicator (1 if covered, 0 otherwise) covered = ((y >= y_lo) & (y <= y_hi)).astype(int) # Overall coverage rate total_coverage = covered.mean() if N > 0 else 0.0 # --- Angular Coordinate Calculation --- acov_map = { # Map name to (min_angle, max_angle) in radians "default": (0, 2 * np.pi), "half_circle": (0, np.pi), "quarter_circle": (0, np.pi / 2), "eighth_circle": (0, np.pi / 4), } if acov.lower() not in acov_map: # Use .get() for default or raise error explicitly? Let's raise. raise ValueError( f"Invalid `acov` value '{acov}'. Choose from: " f"{', '.join(acov_map.keys())}" ) theta_min_rad, theta_max_rad = acov_map[acov.lower()] angular_range_rad = theta_max_rad - theta_min_rad # Calculate theta based on index, mapped to the angular range theta = ( np.linspace( 0.0, 1.0, N, endpoint=False ) # Use endpoint=False for bars/scatter * angular_range_rad + theta_min_rad ) # --- Plotting Setup --- if ax is None: fig, ax = plt.subplots( figsize=figsize, subplot_kw={"projection": "polar"} ) else: fig = ax.figure ax.set_thetamin(np.degrees(theta_min_rad)) ax.set_thetamax(np.degrees(theta_max_rad)) # Set radial limits strictly to [0, 1] for coverage plot ax.set_ylim(0, 1.05) # Slight padding at top ax.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0]) # Set explicit radial ticks # Apply grid and angle label settings set_axis_grid(ax, show_grid, grid_props) if mask_angle: ax.set_xticklabels([]) # --- Optional Background Gradient --- if fill_gradient: try: grad_cmap_obj = get_cmap(gradient_cmap) # Create radial and angular meshgrid # R goes from 0 up to the total_coverage value R, T = np.meshgrid( np.linspace(0, total_coverage, gradient_size), np.linspace(theta_min_rad, theta_max_rad, gradient_size), ) # Z represents the value to map color to (e.g., radius itself) # Tile radius values across angles for pcolormesh Z = np.tile( np.linspace(0, total_coverage, gradient_size)[:, np.newaxis], (1, gradient_size), ) # Normalize Z based on [0, 1] range for colormap norm_gradient = Normalize(vmin=0, vmax=1.0) # Plot the gradient mesh ax.grid(False) ax.pcolormesh( T, R, Z, shading="auto", cmap=grad_cmap_obj, alpha=0.20, # Make gradient subtle norm=norm_gradient, # Ensure consistent color mapping ) ax.grid(True, which="both") except ValueError: warnings.warn( f"Invalid `gradient_cmap` ('{gradient_cmap}')." f" Skipping background gradient.", UserWarning, stacklevel=2, ) except Exception as e: warnings.warn( f"Failed to plot background gradient: {e}", UserWarning, stacklevel=2, ) # --- Optional Concentric Reference Lines --- # Define default levels if None gradient_levels = ( gradient_levels if gradient_levels is not None else [0.2, 0.4, 0.6, 0.8, 1.0] ) # Filter levels to be within [0, 1] valid_levels = [lv for lv in gradient_levels if 0 <= lv <= 1] # Define default properties, merge with user props default_gradient_props = { "linestyle": ":", "color": "gray", "linewidth": 0.8, "alpha": 0.8, } current_gradient_props = { **default_gradient_props, **(gradient_props or {}), } # Plot each reference line and its label theta_line = np.linspace(theta_min_rad, theta_max_rad, buffer_pts) for lv in valid_levels: ax.plot(theta_line, [lv] * buffer_pts, **current_gradient_props) # Add text label near the end of the line ax.text( theta_max_rad * 0.98, # Position slightly inside max angle lv, f"{lv:.1f}", # Format label color=current_gradient_props.get("color", "gray"), fontsize=8, ha="right", # Horizontal alignment va="center", # Vertical alignment ) # --- Average Coverage Line --- ax.plot( theta_line, [total_coverage] * buffer_pts, color=coverage_line_color, linewidth=2.0, # Make it prominent label=f"Avg Coverage ({total_coverage:.2f})", # Add rate to label ) # --- Plot Individual Coverage (Bars or Scatter) --- # Radius 'r' is the 'covered' array (0 or 1) r_plot = covered.astype(float) # Setup colormap and normalization for points/bars try: cmap_main_obj = get_cmap(cmap) except ValueError: warnings.warn( f"Invalid `cmap` ('{cmap}'). Using 'RdYlGn'.", UserWarning, stacklevel=2, ) cmap_main_obj = get_cmap("RdYlGn") # Normalize 0 to low end, 1 to high end of cmap norm_main = Normalize(vmin=0, vmax=1) point_colors = cmap_main_obj(norm_main(r_plot)) if as_bars: # Calculate bar width to fill space (approximate) # Use difference between angles, handle wrap around if full circle if N > 1: # Estimate width based on average angular spacing # Could also use angular_range_rad / N bar_widths = np.diff(theta, append=theta[0] + angular_range_rad) # Ensure positive widths if theta wraps around bar_widths = ( np.abs(bar_widths) * 0.9 ) # Make slightly narrower than gap elif N == 1: bar_widths = ( angular_range_rad * 0.8 ) # Default width for single bar else: # N=0 bar_widths = 0.1 # Placeholder ax.bar( theta, # Angle for each bar r_plot, # Height (0 or 1) width=bar_widths, # Width of bars bottom=0.0, # Bars start from center color=point_colors, alpha=alpha, edgecolor="gray", linewidth=0.5, ) else: # Plot as scatter points ax.scatter( theta, r_plot, # Radius (0 or 1) c=point_colors, # Color based on coverage s=s, alpha=alpha, edgecolor="gray", linewidth=0.5, # No label needed here, color indicates status ) # --- Final Touches --- ax.set_title( title or "Prediction Interval Coverage Diagnostic", fontsize=14, y=1.08, ) # Add legend only for the average coverage line handles, labels = ax.get_legend_handles_labels() if handles: # If the coverage line was plotted and labeled ax.legend( handles=handles, labels=labels, loc="upper right", bbox_to_anchor=(1.25, 1.1), fontsize=9, ) # Set radial axis label (Y-axis in polar) ax.set_ylabel("Coverage (1=In, 0=Out)", labelpad=32, fontsize=10) # --- Logging --- if verbose > 0: print("-" * 50) print("Coverage Rate Calculation:") print(f" Interval: [{qlow_col}, {qup_col}]") print(f" Points checked (after NaN removal): {N}") print(f" Overall Coverage Rate: {total_coverage * 100:.2f}%") print("-" * 50) # --- Output --- final = safe_savefig( savefig, fig, dpi=dpi, bbox_inches="tight", ) if final is not None: plt.close(fig) else: fig.tight_layout() plt.show() return ax
plot_coverage_diagnostic.__doc__ = r""" Diagnose prediction-interval coverage on a polar plot. This visualization checks whether observed values fall within their predicted intervals and summarizes the **empirical coverage rate** against the nominal level. It is a compact diagnostic for calibration of quantile/interval forecasts; foundational background on forecast verification and calibration appears in :footcite:p:`Gneiting2007b, Jolliffe2012`. See :footcite:t:`Gneiting2007b` for a discussion of calibration vs. sharpness. The plot maps samples around a circle (angular coordinate) and encodes **covered** vs **not covered** on the radial axis. A solid reference ring marks the overall coverage rate, and optional concentric guides and background gradients aid interpretation. Parameters ---------- df : pandas.DataFrame Input table containing the observed target and the two quantile bounds. Decorators ensure a valid, non-empty DataFrame. actual_col : str Column name of the observed (ground-truth) values. q_cols : list of str or tuple of str Exactly two names ``[lower_quantile_col, upper_quantile_col]`` that define the prediction interval. The order must be *[lower, upper]*. theta_col : str, optional Intended column for angular ordering. **Currently ignored**; the plot uses the DataFrame index order. A warning is issued if provided. Default is ``None``. acov : {'default', 'half_circle', 'quarter_circle', 'eighth_circle'}, \ default='default' Angular coverage (span) of the plot: - ``'default'``: full circle :math:`2\pi` - ``'half_circle'``: :math:`\pi` - ``'quarter_circle'``: :math:`\pi/2` - ``'eighth_circle'``: :math:`\pi/4` figsize : tuple of (float, float), default=(8.0, 8.0) Figure size in inches. title : str, optional Custom title. If ``None``, a default title is used. show_grid : bool, default=True If ``True``, show polar grid lines. grid_props : dict, optional Keyword arguments passed to your grid helper for customizing the grid (e.g., ``{'linestyle': '--', 'alpha': 0.6}``). cmap : str, default='RdYlGn' Colormap for per-point coverage (0 or 1). Diverging maps work well (e.g., red for uncovered, green for covered). alpha : float, default=0.85 Transparency for scatter points or bars. s : int, default=35 Marker size for scatter points (ignored when ``as_bars=True``). as_bars : bool, default=False If ``True``, draw radial bars (height 0/1) instead of points. coverage_line_color : str, default='r' Color of the solid ring at the average coverage rate. buffer_pts : int, default=500 Number of samples used to draw smooth circular lines (average rate and guide rings). fill_gradient : bool, default=True If ``True``, fill the background radially up to the average coverage with a subtle gradient. gradient_size : int, default=300 Resolution of the background gradient mesh. gradient_cmap : str, default='Greens' Colormap used for the optional background gradient. gradient_levels : list of float, optional Radii in :math:`[0,1]` for dashed concentric reference rings. Defaults to ``[0.2, 0.4, 0.6, 0.8, 1.0]``. gradient_props : dict, optional Style for the concentric guide rings (e.g., ``{'linestyle': ':', 'color': 'gray', 'linewidth': 0.8}``). mask_angle : bool, default=True If ``True``, hide angular tick labels. savefig : str, optional File path to save the figure. If ``None``, the figure is shown. verbose : int, default=0 If ``> 0``, print the computed overall coverage rate. Returns ------- ax : matplotlib.axes.Axes The polar Axes containing the coverage diagnostic. Raises ------ TypeError If ``q_cols`` does not contain exactly two names. ValueError If required columns are missing or cannot be coerced to numeric, or if ``acov`` is invalid. See Also -------- plot_anomaly_magnitude: Polar diagnostic for the *magnitude* and *type* of interval failures (under/over). plot_reliability_diagram: Calibration (reliability) curves for probabilistic classifiers. Notes ----- Coverage for row :math:`j` is defined by the closed interval test :math:`L_j \le y_j \le U_j`. Rows with missing values in essential columns are removed prior to computation, so all symbols below refer to the filtered data. The radial axis is fixed to :math:`[0,1]` and encodes a binary outcome per sample, while a solid reference ring at radius :math:`\bar{C}` summarizes the empirical coverage rate. Compare :math:`\bar{C}` to the nominal level implied by your interval (e.g., :math:`0.8` for Q10–Q90, :math:`0.9` for Q5–Q95) to assess calibration. Angular positions follow index order over the chosen span; ``theta_col`` is currently ignored for positioning. Let :math:`y_j` denote the actual value, :math:`L_j` the lower bound, and :math:`U_j` the upper bound for sample :math:`j`, with :math:`j=0,\dots,N-1` after NaN removal. The per-sample coverage indicator (radial coordinate :math:`r_j`) is .. math:: r_j \;=\; \begin{cases} 1, & L_j \le y_j \le U_j, \\\\ 0, & \text{otherwise.} \end{cases} The overall coverage rate drawn as a ring is .. math:: \bar{C} \;=\; \frac{1}{N} \sum_{j=0}^{N-1} r_j. Let :math:`S \in \{2\pi,\;\pi,\;\pi/2,\;\pi/4\}` be the angular span set by ``acov`` and let :math:`\theta_{\min}` be the start angle. The angular coordinate for sample :math:`j` is .. math:: \theta_j \;=\; \frac{j}{N}\,S \;+\; \theta_{\min}. Plotting semantics: each sample is placed at :math:`(\theta_j, r_j)` and colored via ``cmap`` according to :math:`r_j`; a solid ring at :math:`\bar{C}` is overlaid as a global summary; optional concentric guides at user-specified radii and an optional radial background gradient up to :math:`\bar{C}` provide additional visual context. Rows with NaNs in essential columns are dropped before computation. ``theta_col`` is currently ignored (index order is used). Examples -------- >>> import pandas as pd >>> import numpy as np >>> from kdiagram.plot.uncertainty import plot_coverage_diagnostic **1. Random Example (Well-calibrated 80% interval):** >>> np.random.seed(0) >>> N = 200 >>> df_cov_rand = pd.DataFrame({'id': range(N)}) >>> df_cov_rand['actual'] = np.random.normal(loc=10, scale=2, size=N) >>> # Simulate an ~80% interval (e.g., +/- 1.28 std devs for Normal) >>> std_dev_pred = 2.0 >>> df_cov_rand['q10_pred'] = 10 - 1.28 * std_dev_pred >>> df_cov_rand['q90_pred'] = 10 + 1.28 * std_dev_pred >>> # Add some noise to interval bounds >>> df_cov_rand['q10_pred'] += np.random.randn(N) * 0.2 >>> df_cov_rand['q90_pred'] += np.random.randn(N) * 0.2 >>> ax_cov_rand = plot_coverage_diagnostic( ... df=df_cov_rand, ... actual_col='actual', ... q_cols=['q10_pred', 'q90_pred'], # [lower, upper] ... theta_col='id', # Ignored for positioning ... acov='default', ... title='Coverage Diagnostic (Simulated 80% Interval)', ... as_bars=False, # Use scatter points ... coverage_line_color='blue', # Color for avg coverage line ... gradient_cmap='Blues', # Background gradient color ... verbose=1 # Print coverage rate ... ) >>> # Expected coverage rate near 80% >>> # plt.show() called internally **2. Concrete Example (Subsidence Data):** >>> # Assume small_sample_pred is a loaded DataFrame >>> # Create dummy data if it doesn't exist >>> try: ... small_sample_pred ... except NameError: ... print("Creating dummy small sample prediction data...") ... N_small = 200 ... small_sample_pred = pd.DataFrame({ ... 'subsidence_2023': np.random.rand(N_small)*15 + np.linspace(0, 5, N_small), ... 'subsidence_2023_q10': np.random.rand(N_small)*10, ... 'subsidence_2023_q90': np.random.rand(N_small)*10 + 10, ... 'latitude': np.linspace(22.3, 22.7, N_small) + np.random.randn(N_small)*0.01 ... }) >>> # Ensure Q90 > Q10 >>> small_sample_pred['subsidence_2023_q90'] = ( ... small_sample_pred['subsidence_2023_q10'] + ... np.abs(small_sample_pred['subsidence_2023_q90'] - ... small_sample_pred['subsidence_2023_q10']) + 0.1 ... ) >>> ax_cov_sub = plot_coverage_diagnostic( ... df=small_sample_pred, ... actual_col='subsidence_2023', ... q_cols=['subsidence_2023_q10', 'subsidence_2023_q90'], ... theta_col=None, # Use index order ... acov='half_circle', # Use 180 degrees ... as_bars=True, # Use bars instead of scatter ... coverage_line_color='darkgreen', ... title='Coverage Evaluation for 2023 (Q10–Q90)', ... mask_angle=False, # Show angle labels if meaningful ... fill_gradient=False, # Turn off background gradient ... gradient_levels=[0.5, 0.8, 0.9], # Custom reference lines ... verbose=1 ... ) >>> References ---------- .. footbibliography:: """
[docs] @check_non_emptiness @isdf def plot_temporal_uncertainty( df: pd.DataFrame, q_cols: str | list[str] = "auto", theta_col: str | None = None, names: list[str] | None = None, acov: str = "default", figsize: tuple[float, float] = (8.0, 8.0), title: str | None = None, cmap: str = "tab10", normalize: bool = True, show_grid: bool = True, grid_props: dict | None = None, alpha: float = 0.7, s: int = 25, dot_style: str = "o", legend_loc: str = "upper right", mask_label: bool = False, mask_angle: bool = True, savefig: str | None = None, dpi: int = 300, ax: Axes | None = None, ): # --- Input Validation and Column Setup --- # Handle 'auto' detection or process explicit list for q_cols if isinstance(q_cols, str) and q_cols.lower() == "auto": try: # Assume detect_quantiles_in returns list of column names detected_cols = detect_quantiles_in(df) if not detected_cols: raise ValueError("Auto-detection found no quantile columns.") # Check if detected columns actually exist (redundant?) exist_features( df, features=detected_cols, error="raise", name="Auto-detected quantile columns", ) q_cols_list = detected_cols except ( NameError ) as err: # If detect_quantiles_in is not defined/imported raise ImportError( "Helper function 'detect_quantiles_in' is needed for " "`q_cols='auto'` but seems unavailable." ) from err except Exception as e: # Catch errors from detect_quantiles_in or exist_features raise ValueError( f"Automatic detection or validation of `q_cols` failed: {e}. " "Please provide `q_cols` explicitly as a list of column names." ) from e else: # Process explicit list using columns_manager if available if "columns_manager" in globals(): q_cols_list = columns_manager(q_cols, empty_as_none=False) else: # Basic list validation if helper not present if not isinstance(q_cols, (list, tuple)): raise TypeError("`q_cols` must be 'auto' or a list/tuple.") q_cols_list = list(q_cols) # Check if list is empty if not q_cols_list: raise ValueError( "`q_cols` list cannot be empty. Please provide column names." ) # Check existence of explicitly provided columns exist_features( df, features=q_cols_list, error="raise", name="Specified plot columns (`q_cols`)", ) # Determine columns needed for NaN handling cols_for_na_check = list(q_cols_list) # Start with plot columns # theta_col_valid = False if theta_col: if theta_col in df.columns: cols_for_na_check.append(theta_col) # theta_col_valid = True warnings.warn( f"`theta_col` ('{theta_col}') is currently used for NaN " f"alignment but ignored for positioning/ordering. Using index order.", UserWarning, stacklevel=2, ) else: warnings.warn( f"Specified `theta_col` ('{theta_col}') not found. " f"It will be ignored for NaN alignment and positioning.", UserWarning, stacklevel=2, ) # Drop rows with NaNs in ANY of the selected columns data = df[cols_for_na_check].dropna().copy() if len(data) == 0: warnings.warn( "DataFrame is empty after dropping NaN values. Cannot generate plot.", UserWarning, stacklevel=2, ) return None N = len(data) # Number of valid data points # --- Angular Coordinate Calculation --- angular_range_map = { # Map name to (min_angle, max_angle) in radians "default": (0, 2 * np.pi), "half_circle": (0, np.pi), "quarter_circle": (0, np.pi / 2), "eighth_circle": (0, np.pi / 4), } if acov.lower() not in angular_range_map: raise ValueError( f"Invalid `acov` value '{acov}'. Choose from: " f"{', '.join(angular_range_map.keys())}" ) theta_min_rad, theta_max_rad = angular_range_map[acov.lower()] angular_range_rad = theta_max_rad - theta_min_rad # Calculate theta based on index of cleaned data # Use endpoint=False for scatter for potentially better spacing visual theta = ( np.linspace(0.0, 1.0, N, endpoint=False) * angular_range_rad + theta_min_rad ) # --- Prepare Radial Values (Normalization) --- # Internal helper for min-max scaling def _normalize(x): """Min-max scales array x to [0, 1], handling zero range.""" x = np.asarray(x, dtype=float) # Ensure float array range_x = np.ptp(x) # Peak-to-peak range if range_x > 1e-9: # Check if range is non-negligible min_x = np.min(x) return (x - min_x) / range_x # XXX TODO # If range is zero or near-zero, return array of 0.5 or 0? # Returning original might be safer if scale is important even if constant # Let's return 0.5 for constant values after normalization request. # Or return 0 to keep origin clear? Let's use 0.5 as neutral midpoint. return np.full_like(x, 0.5) # Extract values from the cleaned data, apply normalization if needed values_list = [] for col in q_cols_list: try: vals = data[col].to_numpy(dtype=float) if normalize: vals = _normalize(vals) values_list.append(vals) except Exception as e: raise TypeError( f"Could not process column '{col}'. Ensure it contains numeric" f" data. Original error: {e}" ) from e # --- Setup Labels and Colors --- num_series = len(q_cols_list) # Generate default names if needed if names is None: # Try to make default names slightly more informative if auto-detected if isinstance(q_cols, str) and q_cols.lower() == "auto": # Use the detected names if possible plot_labels = [c.replace("_", " ").title() for c in q_cols_list] else: # Generic default if explicit list or auto-detection failed context plot_labels = [f"Series {i + 1}" for i in range(num_series)] else: # Use provided names, check length if len(names) != num_series: raise ValueError( f"Length of `names` ({len(names)}) must match the number " f"of plotted columns ({num_series})." ) plot_labels = list(names) # Get color palette try: cmap_obj = get_cmap(cmap) # Sample colors - handle discrete vs continuous cmaps if hasattr(cmap_obj, "colors") and len(cmap_obj.colors) >= num_series: # Use colors directly from discrete map if enough available color_palette = cmap_obj.colors else: # Sample from continuous map or discrete map with fewer colors color_palette = cmap_obj(np.linspace(0, 1, num_series)) except ValueError: warnings.warn( f"Invalid `cmap` name '{cmap}'. Falling back to 'tab10'.", stacklevel=2, ) cmap_obj = get_cmap("tab10") # Fallback cmap color_palette = cmap_obj(np.linspace(0, 1, num_series)) # --- Plotting --- if ax is None: fig, ax = plt.subplots( figsize=figsize, subplot_kw={"projection": "polar"} ) else: fig = ax.figure # Set angular limits ax.set_thetamin(np.degrees(theta_min_rad)) ax.set_thetamax(np.degrees(theta_max_rad)) # Plot each data series for i, (vals, label) in enumerate(zip(values_list, plot_labels)): if len(vals) != N: # Should not happen if NaN handling is correct raise InternalError( "Mismatch between data length and theta length." ) color = color_palette[ i % len(color_palette) ] # Cycle colors if needed ax.scatter( theta, vals, label=label, alpha=alpha, s=s, marker=dot_style, color=color, edgecolor="k", # Add edge color for visibility linewidth=0.5, ) # --- Final Touches --- ax.set_title(title or "Polar Data Series Plot", fontsize=14, y=1.08) set_axis_grid(ax, show_grid, grid_props) if mask_angle: ax.set_xticklabels([]) # Hide angular tick labels if mask_label: ax.set_yticklabels([]) # Hide radial tick labels else: ax.set_ylabel(f"{'Normalized ' if normalize else ''}Q_pred values") # Add legend handles, labels = ax.get_legend_handles_labels() if labels: # Only show legend if labels were generated/provided ax.legend( handles=handles, labels=labels, loc=legend_loc, bbox_to_anchor=(1.25, 1.0) if "right" in legend_loc else None, fontsize=9, ) # Set radial axis label if normalization is off? Maybe not useful. if ( not normalize and len(q_cols_list) == 1 ): # Only label if one series, unnormalized ax.set_ylabel(q_cols_list[0]) # --- Save or Show --- final = safe_savefig( savefig, fig, dpi=dpi, bbox_inches="tight", ) if final is not None: plt.close(fig) else: fig.tight_layout() plt.show() return ax
plot_temporal_uncertainty.__doc__ = r""" Visualize multiple data series using polar scatter plots. This function creates a general-purpose polar scatter plot to visualize and compare one or more data series (columns) from a DataFrame in a circular layout. Each series is plotted with a distinct color [1]_. - **Angular Position (`theta`)**: Represents each data point or sample, ordered by the DataFrame index after removing rows with NaNs in the selected `q_cols` (and `theta_col` if used for NaN alignment). The points are mapped linearly onto the specified angular coverage (`acov`). The `theta_col` parameter is currently ignored for sorting/positioning but helps align data if it has NaNs. - **Radial Distance (`r`)**: Represents the magnitude of the values from each column specified in `q_cols`. Values can be optionally normalized independently for each series using min-max scaling (`normalize=True`). - **Color**: Each data series (column in `q_cols`) is assigned a unique color based on the specified `cmap` [2]_. This plot is flexible and can be used for various purposes, such as: - Comparing different model predictions for the same target. - Visualizing different quantile predictions (e.g., Q10, Q50, Q90) for a single time step to show uncertainty spread. - Plotting related variables against each other in a polar context. Parameters ---------- df : pd.DataFrame Input DataFrame containing the data columns to be plotted. Decorators ensure it's a valid, non-empty pandas DataFrame. q_cols : str or list of str, default='auto' Specifies the columns to plot. - If ``'auto'``, attempts to automatically detect quantile columns (e.g., names containing '_q' followed by numbers) using the helper function `detect_quantiles_in`. Raises error if detection fails or no such columns are found. - If a list of strings, these column names are used directly. Must contain at least one valid column name from `df`. theta_col : str, optional *Intended* for ordering points angularly based on this column's values. *Note: Currently ignored for sorting/positioning; uses index order. However, if provided and present in `df`, it is included when checking for and dropping NaN values to ensure data alignment between angles and plotted values.* A warning is issued regarding its limited use. Default is ``None``. names : list of str, optional Custom labels for each data series specified in `q_cols`, used in the plot legend. Must match the number of columns in `q_cols`. If ``None``, generic labels like 'Q1', 'Q2', ... (or based on detected quantile names) are generated. Default is ``None``. acov : {'default', 'half_circle', 'quarter_circle', \ 'eighth_circle'}, default='default' Specifies the angular coverage (span) of the polar plot: ``'default'`` (360°), ``'half_circle'`` (180°), ``'quarter_circle'`` (90°), ``'eighth_circle'`` (45°). figsize : tuple of (float, float), default=(8.0, 8.0) Width and height of the figure in inches. title : str, optional Custom title for the plot. If ``None``, a default title is used. cmap : str, default='tab10' Name of the Matplotlib colormap used to assign distinct colors to each data series specified in `q_cols`. normalize : bool, default=True If ``True``, normalize the values within *each column* specified in `q_cols` independently to the range [0, 1] using min-max scaling before plotting. Useful for comparing shapes of series with different scales. If ``False``, raw values are plotted. show_grid : bool, default=True If ``True``, display the polar grid lines. alpha : float, default=0.7 Transparency level for the scatter points (0=transparent, 1=opaque). s : int, default=25 Marker size for the scatter points. dot_style : str, default='o' Marker style used for the scatter points (e.g., 'o', '.', 'x', '+'). Passed to `matplotlib.pyplot.scatter`. legend_loc : str, default='upper right' Location of the legend identifying the data series. Common values include 'best', 'upper right', 'lower left', etc. mask_angle : bool, default=True If ``True``, hide the angular tick labels (degrees). Recommended if the angle is based on index. savefig : str, optional File path to save the plot image. If ``None``, displays interactively. Returns ------- ax : matplotlib.axes._axes.Axes The Matplotlib Axes object (PolarAxesSubplot) containing the plot. Raises ------ ValueError If `q_cols='auto'` fails to find columns or `detect_quantiles_in` fails. If `q_cols` (explicitly provided or auto-detected) is empty or contains columns not found in `df`. If `names` is provided but length doesn't match `q_cols`. If `acov` value is invalid. TypeError If data in `q_cols` is not numeric. See Also -------- plot_polar_uncertainty_spread : Previous function focused on plotting quantiles for uncertainty (potentially similar). detect_quantiles_in : Helper function for automatic column detection. matplotlib.pyplot.scatter : Underlying function for plotting points. Notes ----- - Normalization :math:`\aleph` (`normalize=True`) is applied independently to each column in `q_cols`, scaling each series to its own [0, 1] range. - Rows containing NaN values in *any* of the selected `q_cols` (and `theta_col` if present) are dropped *before* plotting to ensure alignment. `theta` angles are based on the index of this cleaned data. - The `theta_col` parameter currently does not affect the order or position of points but is used for consistent NaN handling. - Radial axis ticks and labels are hidden by default (`set_yticklabels([])`) as the radius represents potentially normalized values from multiple series, making a single scale less meaningful. Let :math:`\mathbf{V}_i` be the data vector for column `i` in `q_cols` (length :math:`N`, after NaN removal based on all selected columns). 1. **Normalization** (if `normalize=True`, applied per column `i`): .. math:: \mathbf{v}'_i = \aleph ( \mathbf{V}_i) where .. math:: \aleph(x)_j = \frac{x_j - \min(\mathbf{x})}{\max(\mathbf{x}) - \min(\mathbf{x})} (Handles zero range by returning the original vector). 2. **Angular Coordinate (`theta`)**: Let :math:`S` be the angular span and :math:`\theta_{min}` the start angle from `acov`. For index :math:`j` (:math:`j=0, \dots, N-1` of cleaned data): .. math:: \theta_j = \left( \frac{j}{N} \times S \right) + \theta_{min} 3. **Radial Coordinate (`r`)**: For series `i` and point `j`: :math:`r_{i,j} = v'_{i,j}` (if normalized) or :math:`r_{i,j} = v_{i,j}` (if not normalized). 4. **Plotting**: For each series `i`, plot points :math:`(r_{i,j}, \theta_j)` using `scatter` with a distinct color derived from `cmap`. References ---------- .. [1] Kouadio, K. L., Liu, R., Loukou, K. G. H., Liu, J., & Liu, W. (2025). Analytics Framework for Interpreting Spatiotemporal Probabilistic Forecasts. *International Journal of Forecasting*. Manuscript submitted. .. [2] Matplotlib documentation: https://matplotlib.org/stable/ Examples -------- >>> import pandas as pd >>> import numpy as np >>> from kdiagram.plot.uncertainty import plot_temporal_uncertainty **1. Random Example (Comparing two series):** >>> np.random.seed(42) >>> N = 100 >>> df_comp_rand = pd.DataFrame({ ... 'Index': range(N), ... 'ModelA_Pred': 50 + 10 * np.sin(np.linspace(0, 3 * np.pi, N)) + np.random.randn(N)*5, ... 'ModelB_Pred': 55 + 12 * np.sin(np.linspace(0, 3 * np.pi, N) - 0.5) + np.random.randn(N)*4, ... }) >>> ax_comp_rand = plot_temporal_uncertainty( ... df=df_comp_rand, ... q_cols=['ModelA_Pred', 'ModelB_Pred'], ... names=['Model A', 'Model B'], ... theta_col=None, # Use index order ... acov='default', ... title='Comparison of Model A vs Model B', ... normalize=True, # Normalize for shape comparison ... cmap='Set1', ... dot_style='x', # Use 'x' markers ... mask_angle=False # Show angle ticks ... ) >>> # plt.show() called internally **2. Concrete Example (Subsidence Quantiles for 2023):** >>> # Assume zhongshan_pred_2023_2026 is a loaded DataFrame >>> # Create dummy data if it doesn't exist >>> try: ... zhongshan_pred_2023_2026 ... except NameError: ... print("Creating dummy subsidence data for example...") ... N_sub = 150 ... zhongshan_pred_2023_2026 = pd.DataFrame({ ... 'latitude': np.linspace(22.2, 22.8, N_sub), ... 'subsidence_2023_q10': np.random.rand(N_sub)*5 + 1 + np.linspace(0,2, N_sub), ... 'subsidence_2023_q50': np.random.rand(N_sub)*5 + 3 + np.linspace(1,3, N_sub), ... 'subsidence_2023_q90': np.random.rand(N_sub)*5 + 5 + np.linspace(2,4, N_sub), ... }) >>> # Ensure Q90 > Q50 > Q10 roughly >>> zhongshan_pred_2023_2026['subsidence_2023_q50'] = np.maximum( ... zhongshan_pred_2023_2026['subsidence_2023_q50'], ... zhongshan_pred_2023_2026['subsidence_2023_q10'] + 0.1) >>> zhongshan_pred_2023_2026['subsidence_2023_q90'] = np.maximum( ... zhongshan_pred_2023_2026['subsidence_2023_q90'], ... zhongshan_pred_2023_2026['subsidence_2023_q50'] + 0.1) >>> ax_tu_sub = plot_temporal_uncertainty( ... df=zhongshan_pred_2023_2026.head(100), # Use subset ... # Explicitly list quantile columns for 2023 ... q_cols=['subsidence_2023_q10', 'subsidence_2023_q50', ... 'subsidence_2023_q90'], ... theta_col='latitude', # Used for NaN alignment, not order ... names=["Lower Bound (Q10)", "Median (Q50)", "Upper Bound (Q90)"], ... acov='eighth_circle', # Use smaller angle span ... title='Uncertainty Spread for 2023 (Zhongshan)', ... normalize=False, # Plot raw values ... cmap='coolwarm', # Use diverging map for bounds ... mask_angle=True, ... s=30 ... ) >>> # plt.show() called internally """
[docs] @check_non_emptiness @isdf def plot_radial_density_ring( df: pd.DataFrame, *, kind: Literal["width", "velocity", "direct"] = "direct", target_cols: str | list[str], title: str | None = None, r_label: str | None = None, figsize: tuple[float, float] = (8.0, 8.0), cmap: str = "viridis", alpha: float = 0.8, cbar: bool = True, acov: Acov = "default", show_grid: bool = True, grid_props: dict[str, Any] | None = None, mask_angle: bool = True, bandwidth: float | None = None, savefig: str | None = None, dpi: int = 300, show_yticklabels: bool = False, ax: Axes | None = None, ) -> Axes | None: # warn if non-default sector (ring is conventionally 360°) if acov != "default": warnings.warn( "Non-default 'acov' for ring. Convention prefers " "full 360°, proceeding as requested.", UserWarning, stacklevel=2, ) # deprecation shim for legacy param if show_yticklabels: warnings.warn( 'Param "show_yticklabels" is deprecated. Use ' '"mask_angle" (False to show angular labels).', DeprecationWarning, stacklevel=2, ) mask_angle = False # columns and existence checks cols = columns_manager(target_cols) exist_features(df, features=cols) # select 1D target depending on kind if kind == "direct": if len(cols) != 1: raise ValueError( "`kind='direct'` needs one column in `target_cols`." ) data_1d = df[cols[0]] r_label = r_label or cols[0] title = title or f"Distribution of {cols[0]}" elif kind in ("width", "velocity"): if len(cols) != 2: raise ValueError( f"`kind='{kind}'` needs two columns in `target_cols`." ) data_1d = df[cols[1]] - df[cols[0]] r_label = r_label or f"{cols[1]} - {cols[0]}" title = title or f"Distribution of {kind.title()}" else: raise ValueError( f"Invalid kind={kind!r}. Use 'width', 'velocity', or 'direct'." ) # drop NaNs and bail if empty data_1d = data_1d.dropna() if data_1d.empty: warnings.warn( "Derived data empty after NaN drop. Cannot plot.", UserWarning, stacklevel=2, ) return None # KDE along radius grid, pdf = _calculate_kde_for_ring( data_1d.to_numpy(), bandwidth=bandwidth, ) if grid.size == 0: warnings.warn( "KDE failed. Cannot plot ring.", UserWarning, stacklevel=2, ) return None # normalize density to [0,1] pdf_norm = pdf.copy() vmax = float(np.max(pdf_norm)) if vmax > 0.0: pdf_norm /= vmax # axes + polar span fig, ax, span = setup_polar_axes( ax, acov=acov, figsize=figsize, ) # theta grid over requested span thetas = np.linspace(0.0, span, 128) T, R = np.meshgrid(thetas, grid) # tile normalized density over theta C = np.tile(pdf_norm, (len(thetas), 1)).T # draw pcolormesh ring cmap_obj = get_cmap(cmap, default="viridis") ax.grid(False) pcm = ax.pcolormesh( T, R, C, shading="auto", cmap=cmap_obj, alpha=alpha, vmin=0.0, vmax=1.0, ) # titles, labels, grids ax.set_title(title or "Radial Density Ring", va="bottom") set_axis_grid( ax, show_grid=show_grid, grid_props=grid_props, ) ax.set_thetagrids([]) # no angular labels by default ax.set_rlabel_position(0) if r_label: ax.set_ylabel(r_label, labelpad=20) if mask_angle: ax.set_yticklabels([]) # colorbar if requested if cbar: cb = plt.colorbar( pcm, ax=ax, pad=0.1, shrink=0.7, ) cb.set_label( "Normalized Density", rotation=270, labelpad=32, ) # output handling final = safe_savefig( savefig, fig, dpi=dpi, bbox_inches="tight", ) if final is not None: plt.close(fig) else: fig.tight_layout() plt.show() return ax
plot_radial_density_ring.__doc__ = r""" Plot a radial density ring to visualize a 1D distribution. This function computes the probability density function (PDF) of a one-dimensional variable using Kernel Density Estimation (KDE) and visualizes it as a colored ring on a polar plot. It's a powerful way to inspect the shape, peaks, and spread of a distribution, such as forecast errors or interval widths. - **Radial Distance (`r`)**: Represents the domain of the variable being analyzed (e.g., interval width in mm). - **Color**: Represents the **probability density** at each radial position. More intense colors indicate more common values (peaks in the distribution). - **Angular Position (`theta`)**: Is purely for aesthetics and carries no information; the density is uniform around the circle to form the ring. Parameters ---------- df : pd.DataFrame The input DataFrame containing the data. kind : {'width', 'velocity', 'direct'}, default='direct' Specifies how to derive the 1D data for the distribution plot from the ``target_cols``. - ``'direct'``: Directly uses the data from a single column specified in ``target_cols``. - ``'width'``: Calculates the interval width by subtracting the first column from the second in ``target_cols`` (e.g., ``q90 - q10``). - ``'velocity'``: Calculates the difference between two columns, representing a change or velocity (e.g., ``value_t2 - value_t1``). target_cols : str or list of str The column(s) to use for deriving the data. - For ``kind='direct'``, this must be a single column name (``str``). - For ``kind='width'`` or ``kind='velocity'``, this must be a list of exactly two column names (``[col1, col2]``). title : str, optional The title for the plot. If ``None``, a default is generated. r_label : str, optional Custom label for the radial axis. If ``None``, a default is generated from the column names. figsize : tuple of (float, float), default=(8, 8) Figure size in inches. cmap : str, default='viridis' Matplotlib colormap name for the density ring. alpha : float, default=0.8 Transparency level for the density ring. cbar : bool, default=True If ``True``, display a color bar representing the normalized density. acov : {'default', 'half_circle', 'quarter_circle', 'eighth_circle'}, default='default' Angular coverage of the polar sector. - ``'default'`` : full circle, :math:`2\pi` (360°) - ``'half_circle'`` : :math:`\pi` (180°) - ``'quarter_circle'`` : :math:`\pi/2` (90°) - ``'eighth_circle'`` : :math:`\pi/4` (45°) By convention, the radial density *ring* uses a full 360° sector. If a non-default value is provided, a short warning is issued. show_grid : bool, default=True Toggle gridlines via the package helper ``set_axis_grid``. grid_props : dict, optional Keyword arguments passed to ``set_axis_grid`` for grid customization. mask_angle : bool, default=True If ``False``, show the tick labels on the radial axis. Defaults to ``True`` as the ring is primarily a qualitative visual. bandwidth : float, optional The bandwidth for the Kernel Density Estimation. If ``None``, it is estimated using Silverman's rule of thumb. savefig : str, optional If provided, save the figure to this path; otherwise the plot is shown interactively. dpi : int, default=300 Resolution for the saved figure. Returns ------- ax : matplotlib.axes.Axes or None The Matplotlib Axes object containing the plot, or ``None`` if the plot could not be generated (e.g., due to empty data). Notes ----- The plot is generated through the following steps: 1. **Data Derivation**: A one-dimensional data vector, :math:`\mathbf{x}`, is created from the ``target_cols`` based on the specified ``kind``. 2. **Kernel Density Estimation**: The probability density function (PDF), :math:`\hat{f}_h(x)`, is estimated from the data vector :math:`\mathbf{x}` using a Gaussian kernel. This step is handled by ``scipy.stats.gaussian_kde``. 3. **Normalization for Coloring**: The estimated PDF is normalized to the range ``[0, 1]`` to map its values to the color map. .. math:: \text{PDF}_{\text{norm}}(x) = \frac{\hat{f}_h(x)}{\max(\hat{f}_h)} 4. **Visualization**: The plot is rendered on polar axes using ``pcolormesh``. The radial axis corresponds to the value domain :math:`x`, and the color at each radius corresponds to :math:`\text{PDF}_{\text{norm}}(x)`. This color value is repeated across all angles to form a ring. Examples -------- >>> import numpy as np >>> import pandas as pd >>> from kdiagram.plot.uncertainty import plot_radial_density_ring >>> >>> # Create synthetic data >>> np.random.seed(42) >>> n_samples = 500 >>> df_test = pd.DataFrame({ ... 'q10_pred': np.random.normal(10, 2, n_samples), ... 'q90_pred': np.random.normal(30, 3, n_samples), ... }) >>> # Ensure upper bound is greater than lower bound >>> df_test['q90_pred'] = df_test[['q10_pred', 'q90_pred']].max(axis=1) \ ... + np.random.rand(n_samples) >>> >>> # Generate the plot for the distribution of interval widths >>> ax = plot_radial_density_ring( ... df=df_test, ... kind="width", ... target_cols=["q10_pred", "q90_pred"], ... title="Distribution of Prediction Interval Width", ... r_label="Interval Width (mm)", ... cmap="magma", ... show_yticklabels=True ... ) """
[docs] @check_non_emptiness @isdf def plot_polar_quiver( df: pd.DataFrame, r_col: str, theta_col: str, u_col: str, v_col: str, *, acov: Acov = "default", color_col: str | None = None, theta_period: float | None = None, title: str | None = None, figsize: tuple[float, float] = (8.0, 8.0), cmap: str = "viridis", mask_angle: bool = False, mask_radius: bool = False, show_grid: bool = True, grid_props: dict[str, Any] | None = None, savefig: str | None = None, dpi: int = 300, ax: Axes | None = None, **quiver_kws, ) -> Axes | None: # ---- required columns req = [r_col, theta_col, u_col, v_col] if color_col: req.append(color_col) exist_features(df, features=req) # ---- clean data data = df[req].dropna() if data.empty: warnings.warn( "DataFrame empty after NaN drop in required " "columns. Cannot plot.", UserWarning, stacklevel=2, ) return None # ---- extract arrays r_data = data[r_col].to_numpy() t_raw = data[theta_col].to_numpy() u_data = data[u_col].to_numpy() # radial change v_data = data[v_col].to_numpy() # angular change # ---- axes & span fig, ax, span = setup_polar_axes( ax, acov=acov, figsize=figsize, ) # ---- map theta to [0, span] if theta_period is not None: t_map = map_theta_to_span( t_raw, span=span, theta_period=theta_period, ) else: tmin = float(t_raw.min()) tmax = float(t_raw.max()) if tmax > tmin + 1e-12: t_map = map_theta_to_span( t_raw, span=span, data_min=tmin, data_max=tmax, ) else: t_map = np.zeros_like(t_raw) # ---- color values if color_col: c_vals = data[color_col].to_numpy() cbar_label = color_col else: # default color by vector magnitude c_vals = np.sqrt(u_data**2 + v_data**2) cbar_label = "Vector Magnitude" # robust normalize cmin = float(np.nanmin(c_vals)) cmax = float(np.nanmax(c_vals)) if not np.isfinite(cmin) or not np.isfinite(cmax): cmin, cmax = 0.0, 1.0 if np.isclose(cmax - cmin, 0.0): cmax = cmin + 1.0 norm = Normalize(vmin=cmin, vmax=cmax) cmap_obj = get_cmap(cmap, default="viridis") colors = cmap_obj(norm(c_vals)) # ---- draw quiver # Matplotlib polar quiver expects: # (theta, r, dtheta, dr) ax.quiver( t_map, r_data, v_data, u_data, color=colors, **quiver_kws, ) # ---- colorbar sm = plt.cm.ScalarMappable(norm=norm, cmap=cmap_obj) cbar = plt.colorbar( sm, ax=ax, pad=0.1, shrink=0.75, ) cbar.set_label(cbar_label, fontsize=10) # ---- labels, grid, masks ax.set_title(title or f"Quiver Plot of ({u_col}, {v_col})") set_axis_grid( ax, show_grid=show_grid, grid_props=grid_props, ) if mask_angle: ax.set_xticklabels([]) if mask_radius: ax.set_yticklabels([]) # ---- output final = safe_savefig( savefig, fig, dpi=dpi, bbox_inches="tight", ) if final is not None: plt.close(fig) else: fig.tight_layout() plt.show() return ax
plot_polar_quiver.__doc__ = r""" Plot a polar quiver plot to visualize vector data. This function draws arrows (vectors) on a polar grid. Each arrow's origin, direction, and magnitude are determined by the data. This is highly effective for visualizing dynamic phenomena like forecast revisions, error vectors, or flow fields :footcite:p:`2020SciPy-NMeth`. Parameters ---------- df : pd.DataFrame The input DataFrame containing the vector data. r_col : str Name of the column for the **radial position** of the vector's origin (tail). theta_col : str Name of the column for the **angular position** of the vector's origin. u_col : str Name of the column for the **radial component** of the vector (the change in radius). v_col : str Name of the column for the **tangential component** of the vector (the change in angle). acov : {'default', 'half_circle', 'quarter_circle', 'eighth_circle'}, \ default='default' Angular **coverage** of the plot: - ``'default'``: full circle, :math:`2\pi` - ``'half_circle'``: :math:`\pi` - ``'quarter_circle'``: :math:`\pi/2` - ``'eighth_circle'``: :math:`\pi/4` (Value is case-insensitive; invalid values fall back to full circle with a warning.) color_col : str, optional Name of a column to use for coloring the arrows. If ``None``, arrows are colored by their total vector magnitude. theta_period : float, optional The period of the cyclical data in ``theta_col``. For example, if ``theta_col`` represents degrees in a circle, the period is 360. If provided, the angle is mapped to ``[0, 2*pi]`` based on this cycle. title : str, optional The title for the plot. If ``None``, a default is generated. figsize : tuple of (float, float), default=(8, 8) Figure size in inches. cmap : str, default='viridis' Matplotlib colormap for coloring the arrows. show_grid : bool, default=True Toggle gridlines via the package helper ``set_axis_grid``. grid_props : dict, optional Keyword arguments passed to ``set_axis_grid`` for grid customization. mask_angle : bool, default=False If ``True``, hide the angular tick labels (e.g., degrees). mask_radius : bool, default=False If ``True``, hide the radial tick labels. savefig : str, optional If provided, save the figure to this path; otherwise the plot is shown interactively. dpi : int, default=300 Resolution for the saved figure. **quiver_kws : dict, optional Additional keyword arguments passed to the ``ax.quiver`` call (e.g., ``scale``, ``width``, ``headwidth``). Returns ------- ax : matplotlib.axes.Axes or None The Matplotlib Axes object containing the plot, or ``None`` if the plot could not be generated. Notes ----- The plot visualizes a vector field on a polar coordinate system. For each data point :math:`i`, a vector is drawn footcite:p:`Hunter2007`. 1. **Vector Origin**: The tail of the vector is positioned at the polar coordinates :math:`(r_i, \theta_i)` specified by ``r_col`` and ``theta_col``. 2. **Vector Components**: The vector itself is defined by its components in the radial and tangential directions: - :math:`u_i` (from ``u_col``) is the component in the radial direction (pointing away from the center). - :math:`v_i` (from ``v_col``) is the component in the tangential direction (perpendicular to the radial line). 3. **Visualization**: Matplotlib's polar ``quiver`` function places an arrow at each origin point :math:`(r_i, \theta_i)`. The arrow's length and orientation are determined by the vector :math:`(u_i, v_i)`. 4. **Coloring**: The color of each arrow can be determined by a separate metric from ``color_col``. If not specified, it defaults to the vector's Euclidean magnitude, :math:`M_i`. .. math:: M_i = \sqrt{u_i^2 + v_i^2} Examples -------- >>> import numpy as np >>> import pandas as pd >>> from kdiagram.plot.uncertainty import plot_polar_quiver >>> >>> # Simulate forecast revisions for 50 spatial locations >>> np.random.seed(0) >>> n_points = 50 >>> locations = np.linspace(0, 360, n_points, endpoint=False) >>> initial_forecast = 10 + 5 * np.sin(np.deg2rad(locations) * 3) >>> >>> # Simulate revisions (changes in value) >>> radial_change = np.random.normal(0, 1.5, n_points) >>> tangential_change = np.random.normal(0, 0.1, n_points) >>> >>> df_forecasts = pd.DataFrame({ ... 'location_angle': locations, ... 'initial_value': initial_forecast, ... 'update_radial': radial_change, ... 'update_tangential': tangential_change, ... }) >>> >>> # Generate the polar quiver plot >>> ax = plot_polar_quiver( ... df=df_forecasts, ... r_col='initial_value', ... theta_col='location_angle', ... u_col='update_radial', ... v_col='update_tangential', ... theta_period=360, ... title='Forecast Revisions for Spatial Locations', ... cmap='coolwarm', ... scale=25 # Adjusts arrow size for better visibility ... ) References ---------- .. footbibliography:: """
[docs] @check_non_emptiness @isdf def plot_polar_heatmap( df: pd.DataFrame, r_col: str, theta_col: str, *, acov: Acov = "default", theta_period: float | None = None, r_bins: int = 20, theta_bins: int = 36, statistic: str = "count", cbar_label: str | None = None, title: str | None = None, figsize: tuple[float, float] = (8, 8), cmap: str = "viridis", mask_angle: bool = False, mask_radius: bool = False, show_grid: bool = True, grid_props: dict[str, Any] | None = None, savefig: str | None = None, dpi: int = 300, ax: Axes | None = None, ) -> Axes | None: # ---- Validate required columns & drop NaNs exist_features(df, features=[r_col, theta_col]) data = df[[r_col, theta_col]].dropna() if data.empty: warnings.warn( "DataFrame is empty after dropping NaNs" " in '{r_col}'/'{theta_col}'. Cannot plot.", UserWarning, stacklevel=2, ) return None # ---- Extract arrays r_data = data[r_col].to_numpy() theta_raw = data[theta_col].to_numpy() # ---- Axes & angular span (radians) fig, ax, span = setup_polar_axes(ax, acov=acov, figsize=figsize) # ---- Map theta values into [0, span] if theta_period is not None: theta_mapped = map_theta_to_span( theta_raw, span=span, theta_period=theta_period ) else: tmin, tmax = float(theta_raw.min()), float(theta_raw.max()) if tmax > tmin + 1e-12: theta_mapped = map_theta_to_span( theta_raw, span=span, data_min=tmin, data_max=tmax ) else: theta_mapped = np.zeros_like(theta_raw) # ---- Binning if statistic.lower() != "count": warnings.warn( f"statistic='{statistic}' not implemented;" " falling back to 'count'.", UserWarning, stacklevel=2, ) r_edges = np.linspace(r_data.min(), r_data.max(), int(r_bins) + 1) theta_edges = np.linspace(0.0, float(span), int(theta_bins) + 1) counts, _, _ = np.histogram2d( x=r_data, y=theta_mapped, bins=[r_edges, theta_edges], ) # Note: np.histogram2d returns shape (len(x_bins)-1, len(y_bins)-1). # Our mesh expects R(len(r_edges)) x T(len(theta_edges)) edges. # ---- Draw cmap_obj = get_cmap(cmap, default="viridis") ax.grid(False) T, R = np.meshgrid(theta_edges, r_edges) pcm = ax.pcolormesh(T, R, counts, cmap=cmap_obj, shading="auto") # ---- Colorbar cbar = plt.colorbar(pcm, ax=ax, pad=0.1, shrink=0.75) cbar.set_label(cbar_label or statistic.capitalize(), fontsize=10) # ---- Labels, grid, masks ax.set_title(title or f"Density of {r_col} vs. {theta_col}") set_axis_grid(ax, show_grid=show_grid, grid_props=grid_props) if mask_angle: ax.set_xticklabels([]) if mask_radius: ax.set_yticklabels([]) # ---- Output final = safe_savefig( savefig, fig, dpi=dpi, bbox_inches="tight", ) if final is not None: plt.close(fig) else: fig.tight_layout() plt.show() return ax
def _calculate_kde_for_ring( data: np.ndarray, bandwidth: float | None = None ) -> tuple[np.ndarray, np.ndarray]: """ Internal helper to compute KDE. In a package, this should be a shared utility. """ data = data[np.isfinite(data)] if data.size == 0: return np.array([]), np.array([]) lo, hi = np.min(data), np.max(data) pad = 0.05 * (hi - lo) if (hi - lo) > 0 else 0.5 grid = np.linspace(lo - pad, hi + pad, 512) if bandwidth is None: std = np.std(data, ddof=1) if data.size > 1 else 1.0 # Silverman's rule of thumb bw_val = 1.06 * std * (data.size ** (-1 / 5)) if std > 0 else 1.0 else: bw_val = bandwidth std_dev = np.std(data, ddof=1) bw_method = bw_val / std_dev if std_dev > 0 else "silverman" kde = gaussian_kde(data, bw_method=bw_method) pdf = kde(grid) return grid, pdf plot_polar_heatmap.__doc__ = r""" Plot a polar heatmap to visualize a 2D density distribution. This function creates a heatmap on a polar grid to show the concentration of data points based on two variables: one mapped to the **radius** and another to the **angle**. It is highly effective for identifying patterns and correlations between a linear metric and a cyclical or ordered feature. Parameters ---------- df : pd.DataFrame The input DataFrame containing the data. r_col : str Name of the column to be used for the radial coordinate. theta_col : str Name of the column to be used for the angular coordinate. acov : {'default', 'half_circle', 'quarter_circle', 'eighth_circle'}, \ default='default' Angular **coverage** of the plot: - ``'default'``: full circle, :math:`2\pi` - ``'half_circle'``: :math:`\pi` - ``'quarter_circle'``: :math:`\pi/2` - ``'eighth_circle'``: :math:`\pi/4` (Value is case-insensitive; invalid values fall back to full circle with a warning.) theta_period : float, optional The period of the cyclical data in ``theta_col``. For example, if ``theta_col`` represents the hour of the day, ``theta_period`` should be 24. If provided, the angle is mapped to ``[0, 2*pi]`` based on this cycle. If ``None``, the angle is scaled from the data's min/max values. r_bins : int, default=20 The number of bins to create along the radial axis. theta_bins : int, default=36 The number of bins to create around the angular axis. statistic : str, default='count' The statistic to compute in each bin. Currently, only 'count' is supported, which corresponds to a 2D histogram. cbar_label : str, optional Custom label for the color bar. If ``None``, it defaults to the value of ``statistic``. title : str, optional The title for the plot. If ``None``, a default is generated. figsize : tuple of (float, float), default=(8, 8) Figure size in inches. cmap : str, default='viridis' Matplotlib colormap for the heatmap. mask_angle : bool, default=False If ``True``, hide the angular tick labels (e.g., degrees). mask_radius : bool, default=False If ``True``, hide the radial tick labels. show_grid : bool, default=True Toggle gridlines via the package helper ``set_axis_grid``. grid_props : dict, optional Keyword arguments passed to ``set_axis_grid`` for grid customization. savefig : str, optional If provided, save the figure to this path; otherwise the plot is shown interactively. dpi : int, default=300 Resolution for the saved figure. Returns ------- ax : matplotlib.axes.Axes or None The Matplotlib Axes object containing the plot, or ``None`` if the plot could not be generated. Notes ----- The heatmap is constructed by binning the 2D polar space and counting the number of data points that fall into each bin. 1. **Coordinate Mapping**: - The radial data from ``r_col`` is used directly. - The angular data from ``theta_col``, :math:`\theta_{data}`, is converted to radians in the range :math:`[0, 2\pi]`. If ``theta_period`` (:math:`P`) is given, the mapping is: .. math:: \theta_{rad} = \left( \frac{\theta_{data} \pmod P}{P} \right) \cdot 2\pi Otherwise, it is linearly scaled from the data's range. 2. **Binning**: The data space is divided into a grid of polar bins defined by ``r_bins`` and ``theta_bins``. A 2D histogram is then computed, where the value of each cell :math:`C_{ij}` is the number of data points :math:`(r, \theta)` that fall within the radial and angular bounds of that cell. 3. **Visualization**: The resulting count matrix :math:`\mathbf{C}` is visualized using ``pcolormesh``, where the color of each cell corresponds to its count, creating the heatmap effect. See :footcite:p:`Hunter:2007` for Matplotlib details. See Also -------- matplotlib.pyplot.pcolormesh : Draw a pseudocolor plot with a nonregular rectangular grid. numpy.histogram2d : Compute the bi-dimensional histogram of two data samples. Examples -------- >>> import numpy as np >>> import pandas as pd >>> from kdiagram.plot.uncertainty import plot_polar_heatmap >>> >>> # Simulate weather data with a daily cycle >>> np.random.seed(42) >>> n_points = 5000 >>> # Simulate hour of day with more events in the afternoon >>> hour = np.concatenate([ ... np.random.normal(15, 2, int(n_points * 0.7)), ... np.random.normal(5, 2, int(n_points * 0.3)) ... ]) % 24 >>> # Simulate rainfall, correlated with afternoon hours >>> rainfall = np.random.gamma(2, 5, n_points) + \ ... (hour > 12) * np.random.gamma(3, 5, n_points) >>> >>> df_weather = pd.DataFrame({'hour': hour, 'rainfall_mm': rainfall}) >>> >>> # Generate the polar heatmap >>> ax = plot_polar_heatmap( ... df=df_weather, ... r_col='rainfall_mm', ... theta_col='hour', ... theta_period=24, ... r_bins=25, ... theta_bins=24, ... cmap='plasma', ... title='Rainfall Intensity vs. Hour of Day', ... cbar_label='Event Count' ... ) References ---------- .. footbibliography:: """ class PerformanceWarning(Warning): pass # Define InternalError for consistency if needed class InternalError(Exception): pass