# License: Apache-2.0
# Author: LKouadio <etanoyau@gmail.com>
"""
Specialized diagnostic polar plots ('kdiagrams', named after author
Kouadio) designed for comprehensive model evaluation and forecast
analysis. Provides functions to visualize
prediction uncertainty, model drift, interval coverage, anomaly
magnitude, actual vs. predicted performance, feature influence, and
related diagnostics using polar coordinates.
"""
from __future__ import annotations
import textwrap
import warnings
from typing import Any, Literal
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.axes import Axes
from matplotlib.colors import Normalize
from scipy.stats import gaussian_kde
from ..api.summary import ResultSummary
from ..api.typing import Acov
from ..compat.matplotlib import get_cmap
from ..compat.sklearn import validate_params
from ..decorators import check_non_emptiness, isdf
from ..utils.diagnose_q import (
build_qcols_multiple,
detect_quantiles_in,
validate_qcols,
)
from ..utils.fs import savefig as safe_savefig
from ..utils.handlers import columns_manager
from ..utils.plot import (
_sample_colors,
map_theta_to_span,
set_axis_grid,
setup_polar_axes,
)
from ..utils.validator import (
_assert_all_types,
exist_features,
)
__all__ = [
"plot_actual_vs_predicted",
"plot_anomaly_magnitude",
"plot_coverage",
"plot_coverage_diagnostic",
"plot_interval_consistency",
"plot_interval_width",
"plot_model_drift",
"plot_temporal_uncertainty",
"plot_uncertainty_drift",
"plot_velocity",
"plot_radial_density_ring",
"plot_polar_heatmap",
"plot_polar_quiver",
]
[docs]
@validate_params({"y_true": ["array-like"]})
def plot_coverage(
y_true,
*y_preds,
names: str | list[str] | None = None,
q: str | list[str] | None = None,
kind: str = "line",
cmap: str = "viridis",
pie_startangle: int | float = 140,
pie_autopct: str = "%1.1f%%",
radar_color: str = "tab:blue",
radar_fill_alpha: float = 0.25,
radar_line_style: str = "o-",
cov_fill: bool = False,
figsize: tuple[str, str] = None,
title: str | None = None,
dpi: int = 300,
savefig: str | None = None,
verbose: int = 1,
ax: Axes | None = None,
):
# Convert the true values to a numpy array for consistency
y_true = np.array(y_true)
# Count how many model predictions were passed via *y_preds.
num_models = len(y_preds)
# Handle model names: create or extend to match the number of models.
names = columns_manager(names, to_string=True)
if names is None:
names = [f"Model_{i + 1}" for i in range(num_models)]
else:
if len(names) < num_models:
extra = num_models - len(names)
for _i in range(extra):
names.append(f"Model_{len(names) + 1}")
coverage_scores = []
q = columns_manager(q)
# Handle quantiles
if q is not None:
q = np.array(q)
if q.ndim != 1:
raise ValueError(
"Parameter 'q' must be a 1D list or array of quantile levels."
)
if not np.all((0 < q) & (q < 1)):
raise ValueError("Quantile levels must be between 0 and 1.")
# Sort q and get the sorted indices
sorted_indices = np.argsort(q)
q_sorted = q[sorted_indices]
else:
q_sorted = None
# Compute coverage for each model in *y_preds.
for _i, pred in enumerate(y_preds):
pred = np.array(pred)
if pred.ndim == 2:
if q_sorted is not None:
pred_sorted = pred[:, sorted_indices]
else:
pred_sorted = np.sort(pred, axis=1)
lower_q = pred_sorted[:, 0]
upper_q = pred_sorted[:, -1]
in_interval = ((y_true >= lower_q) & (y_true <= upper_q)).astype(
int
)
coverage = np.mean(in_interval)
elif pred.ndim == 1:
# Point forecast coverage as fraction of exact matches
matches = (y_true == pred).astype(int)
coverage = np.mean(matches)
else:
coverage = None
coverage_scores.append(coverage)
# Prepare data for plotting. Replace None with 0 for convenience.
valid_cov = [c if c is not None else 0 for c in coverage_scores]
x_idx = np.arange(num_models)
# ---- Axes / Figure handling (use ax if provided) ----------------------
if kind in {"bar", "line", "pipe", "pie"}:
if ax is None:
if figsize is not None:
fig, ax = plt.subplots(figsize=figsize)
else:
fig, ax = plt.subplots()
else:
fig = ax.figure
elif kind == "radar":
if ax is None:
# create a polar axes if none is provided
if figsize is not None:
fig, ax = plt.subplots(
figsize=figsize, subplot_kw={"projection": "polar"}
)
else:
fig, ax = plt.subplots(subplot_kw={"projection": "polar"})
else:
fig = ax.figure
if getattr(ax, "name", "") != "polar":
raise ValueError(
"For kind='radar', the provided 'ax' must be a polar Axes."
)
else:
# No plotting; console output only
fig = None
# ---- Draw according to 'kind' -----------------------------------------
if kind == "bar":
ax.bar(x_idx, valid_cov, color="blue", alpha=0.7)
for idx, val in enumerate(coverage_scores):
if val is not None:
ax.text(
x=idx,
y=val + 0.01,
s=f"{val:.2f}",
ha="center",
va="bottom",
)
ax.set_xticks(x_idx)
ax.set_xticklabels(names)
ax.set_ylim([0, 1])
ax.set_ylabel("Coverage")
ax.set_xlabel("Models")
elif kind == "line":
ax.plot(x_idx, valid_cov, marker="o")
for idx, val in enumerate(coverage_scores):
if val is not None:
ax.text(
x=idx,
y=val + 0.01,
s=f"{val:.2f}",
ha="center",
va="bottom",
)
ax.set_xticks(x_idx)
ax.set_xticklabels(names)
ax.set_ylim([0, 1])
ax.set_ylabel("Coverage")
ax.set_xlabel("Models")
elif kind in {"pie", "pipe"}: # keep your original alias
total_cov = sum(valid_cov)
if total_cov == 0:
ax.text(0.5, 0.5, "No coverage to plot", ha="center", va="center")
else:
ax.pie(
valid_cov,
labels=names,
autopct=pie_autopct,
startangle=pie_startangle,
colors=get_cmap(cmap)(np.linspace(0, 1, num_models)),
)
ax.axis("equal") # Make the pie chart a perfect circle.
elif kind == "radar":
N = num_models
angles = np.linspace(0, 2 * np.pi, N, endpoint=False)
angles = np.concatenate((angles, [angles[0]]))
coverage_radar = np.concatenate((valid_cov, [valid_cov[0]]))
# Plot main coverage line
ax.plot(
angles,
coverage_radar,
radar_line_style,
color=radar_color,
label="Coverage",
)
if cov_fill:
if num_models == 1:
coverage_value = valid_cov[0]
theta = np.linspace(0, 2 * np.pi, 100)
r = np.linspace(0, coverage_value, 100)
R, Theta = np.meshgrid(r, theta)
ax.grid(False)
ax.pcolormesh(
Theta,
R,
R,
cmap=cmap,
shading="auto",
alpha=radar_fill_alpha,
zorder=0,
)
ax.grid(True, which="both")
ax.plot(
theta,
[coverage_value] * len(theta),
color="red",
linewidth=2,
linestyle="-",
)
ax.set_ylim(0, 1)
ax.set_yticks([0.2, 0.4, 0.6, 0.8])
ax.yaxis.grid(
True,
color="gray",
linestyle="--",
linewidth=0.5,
alpha=0.7,
)
else:
ax.fill(
angles,
coverage_radar,
color=radar_color,
alpha=radar_fill_alpha,
zorder=0,
)
# Wrap each label to multiple lines (keeps parentheticals readable)
wrapped_names = [
textwrap.fill(
n, width=18, break_long_words=False, break_on_hyphens=False
)
for n in names
]
ax.set_thetagrids(
angles[:-1] * 180 / np.pi,
labels=wrapped_names,
)
# Push labels outward from the circle a bit (points)
pad = 8 if len(names) <= 4 else (10 if len(names) <= 6 else 12)
ax.tick_params(axis="x", pad=pad) # add radial padding for labels
# ax.set_thetagrids(angles[:-1] * 180 / np.pi, labels=names)
# Optional: if there are many models (crowded), gently reduce fontsize
if len(names) >= 6:
for lbl in ax.get_xticklabels():
lbl.set_fontsize(max(lbl.get_fontsize() - 1, 8))
ax.set_ylim(0, 1)
ax.legend(loc="upper right")
else:
# Fallback: print coverage scores to the console for each model.
for idx, val in enumerate(coverage_scores):
print(f"{names[idx]} coverage: {val}")
# ---- Summary / titles / saving ----------------------------------------
if verbose:
cov_dict = {
names[idx]: cov for idx, cov in enumerate(coverage_scores)
}
summary = ResultSummary("CoverageScores").add_results(cov_dict)
print(summary)
if title is not None and fig is not None:
# Title for this axes
ax.set_title(title)
final = safe_savefig(
savefig,
fig,
dpi=dpi,
bbox_inches="tight",
)
if final is not None:
plt.close(fig)
else:
plt.tight_layout()
plt.show()
return ax if fig is not None else None
plot_coverage.__doc__ = r"""
Plot overall coverage scores for forecast intervals or points.
Computes and visualizes the empirical coverage rate, which is
the fraction of times the true values fall within predicted
quantile intervals (or match point forecasts). Supports comparing
multiple models or prediction sets using various chart types.
This plot provides a high-level summary of whether prediction
intervals are well-calibrated on average across the entire
dataset. It's useful for quickly comparing the overall
reliability of uncertainty estimates from different models or
parameter settings before diving into finer diagnostics
:footcite:p:`kouadiob2025`.
Parameters
----------
y_true : array-like of shape (n_samples,)
The ground truth target values. Must be a 1D array-like
object (list, NumPy array, pandas Series).
*y_preds : array-like
Variable number of prediction arrays. Each positional
argument should be an array-like object corresponding to a
model or prediction set. The shape determines interpretation:
- shape (n_samples, n_quantiles): Represents quantile
forecasts (e.g., lower, median, upper bounds). Coverage
is calculated based on the interval defined by the minimum
and maximum quantile columns provided (after sorting based
on `q` if given, or by value otherwise). Requires `q`
parameter for proper interval definition if quantiles
are not ordered.
- shape (n_samples,): Represents point forecasts. Coverage
is calculated as the proportion of exact matches to
`y_true` (typically near zero for continuous data).
names : list of str, optional
Labels for each prediction set provided in `*y_preds`. Used
in legends and axis labels. If `None` or shorter than the
number of prediction sets, default names like "Model_1",
"Model_2", etc., are generated for the missing ones.
Default is ``None``.
q : list of float, optional
Quantile levels corresponding to the columns in 2D `y_preds`
arrays. Values must be between 0 and 1. Used to identify
and sort quantile columns correctly before determining the
interval bounds (minimum and maximum specified quantile) for
coverage calculation. Required if `y_preds` contains 2D
arrays representing unordered quantiles. Default is ``None``.
kind : {'line', 'bar', 'pie', 'radar'}, optional
The type of plot to generate for visualizing coverage scores.
Default is ``'line'``.
- ``'line'``: Line chart connecting coverage scores for
each prediction set.
- ``'bar'``: Bar chart showing the coverage score for each
prediction set.
- ``'pie'``: Pie chart where each slice represents a model's
coverage score as a fraction of the sum of all scores.
- ``'radar'``: Radar (spider) chart where each axis
represents a prediction set, and the distance along the
axis indicates its coverage score.
cmap : str, optional
Matplotlib colormap name used for coloring slices in the
`'pie'` chart or the gradient fill in single-model radar
plots when ``cov_fill=True``. Default is ``'viridis'``.
pie_startangle : float, optional
Starting angle in degrees for the first slice in the `'pie'`
chart. Rotates the chart orientation. Default is 140.
pie_autopct : str or None, optional
String formatting pattern (e.g., ``'%1.1f%%'``) used to label
pie slices with their numeric value (percentage). If `None`,
no numeric labels are shown. Default is ``'%1.1f%%'``.
radar_color : str, optional
Color specification for the main line and optional fill in
the `'radar'` chart. Accepts any valid Matplotlib color.
Default is ``'tab:blue'``.
radar_fill_alpha : float, optional
Transparency level (alpha) for the filled area in the
`'radar'` chart when ``cov_fill=True``. Value between 0
(transparent) and 1 (opaque). Default is 0.25.
radar_line_style : str, optional
Matplotlib line and marker style string for the radar chart
outline (e.g., ``'o-'``, ``'--'``, ``':'``).
Default is ``'o-'``.
cov_fill : bool, optional
If ``True`` and ``kind='radar'``, fills the area under the
radar plot lines. For a single model, creates a radial
gradient; for multiple models, fills polygons.
Default is ``False``.
figsize : tuple of (float, float), optional
Figure size ``(width, height)`` in inches. If ``None``, uses
Matplotlib's default figure size.
title : str, optional
Optional title for the plot. Default is ``None``.
savefig : str, optional
If not ``None``, specifies the full path (including filename
and extension, e.g., 'coverage.png') to save the figure.
If ``None``, the plot is displayed interactively.
Default is ``None``.
verbose : {0, 1}, optional
Controls printing of coverage scores to the console.
- ``0``: Silent.
- ``1``: Prints a formatted summary of coverage scores.
Default is ``1``.
Returns
-------
None
This function generates and displays or saves a Matplotlib plot
but does not return any object.
Raises
------
ValueError
If `q` contains values outside (0, 1), or if `kind` is not
one of the allowed options, or if dimensions of `y_preds`
and `q` are incompatible.
TypeError
If inputs `y_true` or `y_preds` contain non-numeric data.
See Also
--------
plot_coverage_diagnostic : Plot point-wise coverage status.
numpy.mean : Used to calculate the average coverage rate.
Notes
-----
The empirical coverage rate is a key metric for assessing the
calibration of probabilistic forecasts, particularly prediction
intervals :footcite:p:`kouadiob2025`.
**Calculation:**
- For quantile intervals (2D `y_preds`), the interval for each
sample :math:`i` is defined by the minimum and maximum
predicted values across the specified quantiles for that sample,
:math:`[\hat{y}_{i}^{(\ell)}, \hat{y}_{i}^{(u)}]`. Coverage is:
.. math::
\text{Coverage} = \frac{1}{N}\sum_{i=1}^{N}
\mathbf{1}\{\hat{y}_{i}^{(\ell)} \leq y_i
\leq \hat{y}_{i}^{(u)}\}
where :math:`\mathbf{1}\{\cdot\}` is 1 if true, 0 otherwise.
- For point forecasts (1D `y_preds`), coverage is the proportion
of exact matches:
.. math::
\text{Coverage} = \frac{1}{N}\sum_{i=1}^{N}
\mathbf{1}\{ \hat{y}_i = y_i \}
This is typically very low for continuous :math:`y_true`.
**Interpretation:** Compare the calculated coverage score(s) to
the nominal coverage rate implied by the quantiles (e.g., 80%
for a Q10-Q90 interval). Scores far from the nominal rate suggest
miscalibration (intervals too wide or too narrow on average)
:footcite:p:`Gneiting2007b`.
**Radar Plot Fill (`cov_fill=True`):**
- Single model: Shows a circular gradient fill from the center
out to the calculated coverage score radius, with a solid
reference line at the coverage score.
- Multiple models: Fills the polygon defined by the coverage
scores for all models with a semi-transparent color.
Examples
--------
>>> import numpy as np
>>> from kdiagram.plot.uncertainty import plot_coverage
>>> # True values
>>> y_true = np.random.rand(100) * 10
>>> # 3-quantile predictions (Q10, Q50, Q90) for two models
>>> y_pred_A = np.sort(np.random.normal(
... loc=9, scale=2, size=(100, 3)), axis=1)
>>> y_pred_B = np.sort(np.random.normal(
... loc=11, scale=3, size=(100, 3)), axis=1)
>>> q_levels = [0.1, 0.5, 0.9]
>>> # Bar chart comparison
>>> plot_coverage(y_true, y_pred_A, y_pred_B, q=q_levels,
... names=['Model A', 'Model B'], kind='bar',
... title='Coverage Comparison (Bar)')
>>> # Radar chart comparison with fill
>>> plot_coverage(y_true, y_pred_A, y_pred_B, q=q_levels,
... names=['Model A', 'Model B'], kind='radar',
... title='Coverage Comparison (Radar)',
... cov_fill=True, cmap='Set2')
References
----------
.. footbibliography::
"""
[docs]
@check_non_emptiness
@isdf
def plot_model_drift(
df: pd.DataFrame,
q_cols: list | None = None,
q10_cols: list[str] | None = None,
q90_cols: list[str] | None = None,
horizons: list[str | int] | None = None,
color_metric_cols: list[str] | None = None,
acov: str = "quarter_circle",
value_label: str = "Uncertainty Width (Q90 - Q10)",
cmap: str = "coolwarm",
figsize: tuple[int, int] = (8, 8),
title: str = "Model Forecast Drift Over Time",
show_grid: bool = True,
annotate: bool = True,
grid_props: dict | None = None,
savefig: str | None = None,
dpi: int = 300,
ax: Axes | None = None,
):
#
# 1. Pair quantile columns
q_cols = build_qcols_multiple(
q_cols,
qlow_cols=q10_cols,
qup_cols=q90_cols,
)
n_horizons = len(q_cols)
# Default angular labels
if horizons is None:
horizons = [f"Horizon {idx + 1}" for idx in range(n_horizons)]
#
# 2. Compute average inter-quantile width per horizon
widths = np.array([(df[q90] - df[q10]).mean() for q10, q90 in q_cols])
# Secondary colouring metric
if color_metric_cols is not None:
colour_vals = np.array([df[col].mean() for col in color_metric_cols])
else:
colour_vals = widths
#
# 3. Angular span selection
#
span = {
"default": 2 * np.pi,
"half_circle": np.pi,
"quarter_circle": np.pi / 2,
"eighth_circle": np.pi / 4,
}.get(acov, 2 * np.pi)
theta = np.linspace(0.0, span, n_horizons, endpoint=False)
# Scale radii when angular coverage < full circle
radii = widths / widths.max() if span < 2 * np.pi else widths
#
# 4. Figure setup
if ax is None:
fig, ax = plt.subplots(
figsize=figsize,
subplot_kw={"projection": "polar"},
)
else:
fig = ax.figure
# Orient polar chart: 0° at the top, clockwise direction
ax.set_theta_offset(np.pi / 2)
ax.set_theta_direction(-1)
ax.set_thetamin(0)
ax.set_thetamax(np.degrees(span))
# Colormap normalisation
norm = Normalize(vmin=colour_vals.min(), vmax=colour_vals.max())
colours = get_cmap(cmap)(norm(colour_vals))
#
# 5. Draw bars
bar_width = (span / n_horizons) * 0.9 # slight gap between bars
ax.bar(
theta,
radii,
width=bar_width,
color=colours,
edgecolor="k",
alpha=0.85,
linewidth=0.8,
)
# Annotation
if annotate:
for ang, rad, raw in zip(theta, radii, widths):
label = f"{raw:.2f}"
ax.text(
ang,
rad + 0.03 * radii.max(),
label,
ha="center",
va="bottom",
fontsize=9,
)
# Ticks & labels
# Wrap long labels over 1–2 lines so they don’t collide with the rim
wrap_width = 32 if n_horizons <= 5 else (28 if n_horizons <= 8 else 24)
labels = [
textwrap.fill(
str(h),
width=wrap_width,
break_long_words=False,
break_on_hyphens=False,
)
for h in horizons
]
ax.set_xticks(theta)
ax.set_xticklabels(labels)
# Push labels outward from the circle (points)
# Use a bit more pad when there are many labels
pad_pts = 10 if n_horizons <= 5 else (12 if n_horizons <= 8 else 14)
ax.tick_params(axis="x", pad=pad_pts)
# Keep radial ticks hidden and place the radial ylabel inside the axes
ax.set_yticklabels([])
ax.set_ylabel(value_label, labelpad=52) # adjust as you like
# Optional grid
set_axis_grid(ax, show_grid, grid_props=grid_props)
# 6. Output handling
final = safe_savefig(
savefig,
fig,
dpi=dpi,
bbox_inches="tight",
)
if final is not None:
plt.close(fig)
else:
fig.tight_layout()
plt.show()
return ax
plot_model_drift.__doc__ = r"""
Visualize forecast drift across prediction horizons.
Renders a polar bar chart depicting how average model
uncertainty (or another metric) evolves as the forecast horizon
increases. Each bar corresponds to a specific forecast horizon,
arranged angularly.
This plot is crucial for diagnosing model degradation over
longer lead times, often termed *concept drift* or *model
aging* [1]_. A distinct increase in bar height (radius) for
later horizons signals inflating uncertainty or error, potentially
indicating a need for model retraining or adjustments to account
for changing dynamics. Use this visualization to assess if your
model's reliability holds as you forecast further into the
future.
Parameters
----------
df : pandas.DataFrame
Input DataFrame containing the necessary quantile columns (or
columns specified in ``color_metric_cols``) for each forecast
horizon.
q_cols : list[tuple[str, str]], optional
A list where each element is a tuple containing the column
names for the lower and upper quantiles for a specific
horizon, e.g., ``[('q10_h1', 'q90_h1'), ('q10_h2', 'q90_h2')]``.
If ``None``, ``q10_cols`` and ``q90_cols`` must be provided
instead. Default is ``None``.
q10_cols : list[str], optional
List of column names representing the lower quantile (e.g.,
10th percentile) for each successive horizon. Must be provided
if ``q_cols`` is ``None``. Must have the same length as
``q90_cols`` and ``horizons`` (if provided). Default is ``None``.
q90_cols : list[str], optional
List of column names representing the upper quantile (e.g.,
90th percentile) for each successive horizon. Must be provided
if ``q_cols`` is ``None``. Must have the same length as
``q10_cols`` and ``horizons`` (if provided). Default is ``None``.
horizons : list of str or int, optional
Labels corresponding to each forecast horizon (plotted on the
angular axis). The order must match the order in ``q_cols`` or
``q10_cols``/``q90_cols``. If ``None``, generic labels like
"Horizon 1", "Horizon 2", ... are generated.
Default is ``None``.
color_metric_cols : list of str, optional
If provided, the bars are colored based on the mean value of
these columns for each horizon (e.g., provide RMSE columns
like ``['rmse_h1', 'rmse_h2', ...]``). If ``None``, bars are
colored based on the calculated mean interval width (Q90-Q10).
Default is ``None``.
acov : {'default', 'half_circle', 'quarter_circle', \
'eighth_circle'}, optional
Specifies the angular coverage (span) of the plot. Use narrower
sectors for fewer horizons. Default is ``'quarter_circle'``.
- ``'default'``: Full circle (:math:`2\pi`, 360°).
- ``'half_circle'``: Half circle (:math:`\pi`, 180°).
- ``'quarter_circle'``: Quarter circle (:math:`\pi/2`, 90°).
- ``'eighth_circle'``: Eighth circle (:math:`\pi/4`, 45°).
value_label : str, optional
Label displayed for the radial axis, describing the metric
represented by the bar height. Default is
``'Uncertainty Width (Q90 - Q10)'``.
cmap : str, optional
Name of the Matplotlib colormap used to color the bars based
on their radial value (or the `color_metric_cols` value).
Default is ``'coolwarm'``.
figsize : tuple of (float, float), optional
Figure dimensions ``(width, height)`` in inches. If ``None``,
uses the default ``(8, 8)``.
title : str, optional
Headline text displayed above the plot. Default is
``'Model Forecast Drift Over Time'``.
show_grid : bool, optional
If ``True``, displays radial and angular grid lines to aid
interpretation. Default is ``True``.
annotate : bool, optional
If ``True``, displays the numeric value (mean width or mean
color metric) on top of each bar. Default is ``True``.
grid_props : dict, optional
Dictionary of keyword arguments to customize the appearance
of the grid lines (passed to ``ax.grid()``). E.g.,
``{'linestyle': ':', 'linewidth': 0.5}``. Default is ``None``.
savefig : str, optional
Full path and filename to save the plot (e.g., 'drift.pdf').
If ``None``, the plot is displayed interactively.
Default is ``None``.
Returns
-------
matplotlib.axes.Axes
The polar axes object containing the bar chart, allowing
for further customization if desired.
Raises
------
ValueError
If required quantile columns (`q_cols` or both `q10_cols`
and `q90_cols`) are missing from `df`, or if the lengths of
provided column lists/horizons mismatch.
TypeError
If data in the specified columns cannot be processed
numerically.
See Also
--------
kdiagram.plot.uncertainty.plot_uncertainty_drift : Visualize drift
of the uncertainty pattern using rings.
kdiagram.utils.plot.set_axis_grid : Helper for grid styling (if used).
Notes
-----
The primary radial value plotted for each horizon :math:`h` is the
mean interval width calculated across all :math:`N` samples:
.. math::
\bar{w}_h = \frac{1}{N}\sum_{j=1}^{N} \left( Q_{up, j, h} -
Q_{low, j, h} \right)
If ``color_metric_cols`` is provided, a similar average is
calculated for those columns to determine bar color.
Radii may be scaled relative to the maximum radius if a
restricted angular coverage (``acov`` is not 'default') is used,
to better fit the visual sector [2]_.
References
----------
.. [1] Kouadio, K. L., Liu, R., Loukou, K. G. H., Liu, J., & Liu, W. (2025).
Analytics Framework for Interpreting Spatiotemporal Probabilistic
Forecasts. *International Journal of Forecasting*. Manuscript submitted.
.. [2] Gama, J., Žliobaitė, I., Bifet, A., Pechenizkiy, M., &
Bouchachia, A. (2014). A survey on concept drift
adaptation. *ACM Computing Surveys (CSUR)*, 46(4), 1-37.
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from kdiagram.plot.uncertainty import plot_model_drift
>>> # Example with synthetic data
>>> years = [2023, 2024, 2025, 2026]
>>> n_samples=50
>>> df_synth = pd.DataFrame()
>>> q10_cols, q90_cols = [], []
>>> for i, year in enumerate(years):
... ql, qu = f'val_{year}_q10', f'val_{year}_q90'
... q10_cols.append(ql); q90_cols.append(qu)
... q10 = np.random.rand(n_samples)*5 + i*0.5
... q90 = q10 + np.random.rand(n_samples)*2 + 1 + i*0.8
... df_synth[ql]=q10; df_synth[qu]=q90
>>> ax = plot_model_drift(
... df=df_synth,
... q10_cols=q10_cols,
... q90_cols=q90_cols,
... horizons=years,
... acov='quarter_circle', # Use 90 degree span
... title='Synthetic Model Drift Example'
... )
"""
[docs]
@check_non_emptiness
@isdf
def plot_velocity(
df: pd.DataFrame,
q50_cols: list[str],
theta_col: str | None = None,
cmap: str = "viridis",
acov: str = "default",
normalize: bool = True,
use_abs_color: bool = True,
figsize: tuple[float, float] = (9, 9),
title: str | None = None,
s: float | int = 30,
alpha: float = 0.85,
show_grid: bool = True,
savefig: str | None = None,
cbar: bool = True,
mask_angle: bool = False,
dpi: int = 300,
ax: Axes | None = None,
):
# --- Input Validation ---
# Check if required q50_cols exist in the DataFrame
missing_cols = [col for col in q50_cols if col not in df.columns]
if missing_cols:
raise ValueError(
f"The following Q50 columns are missing from the "
f"DataFrame: {', '.join(missing_cols)}"
)
if len(q50_cols) < 2:
raise ValueError(
"At least two Q50 columns (representing two time points)"
" are required to compute velocity."
)
# Check theta_col status and warn if provided but unused
if theta_col is not None:
if theta_col not in df.columns:
warnings.warn(
f"Specified `theta_col` ('{theta_col}') not found in "
f"DataFrame columns. Using index for angular position.",
UserWarning,
stacklevel=2,
)
else:
warnings.warn(
f"`theta_col` ('{theta_col}') is provided but the current"
f" implementation uses the DataFrame index for angular "
f"positioning ('theta'). The column '{theta_col}' is "
f"currently ignored for positioning.",
UserWarning,
stacklevel=2,
)
# --- Data Processing ---
# Extract Q50 data into a NumPy array (locations x time)
q50_array = df[q50_cols].values # Shape (N, M)
# Compute yearly differences along the time axis (axis=1)
# Result shape (N, M-1)
yearly_diff = np.diff(q50_array, axis=1)
# Compute average velocity per location (mean across time diffs)
# Result shape (N,)
r = np.mean(yearly_diff, axis=1)
# Normalize radial values (velocity) if requested
r_normalized = r.copy() # Use a copy for potential normalization
if normalize:
r_range = np.ptp(r) # Peak-to-peak (max - min)
if r_range > 1e-9: # Avoid division by zero or near-zero
r_min = r.min()
r_normalized = (r - r_min) / r_range
else:
# Handle case where all velocities are the same
r_normalized = np.zeros_like(r) # Set all to 0 if range is zero
warnings.warn(
"Velocity range is zero or near-zero. Normalized radial "
"values ('r') are set to 0.",
UserWarning,
stacklevel=2,
)
# Determine values used for coloring the points
if use_abs_color:
# Use average absolute Q50 magnitude across all years
color_vals = np.mean(np.abs(q50_array), axis=1)
cbar_label = "Average Abs Q50 Magnitude"
else:
# Use the calculated average velocity for color
color_vals = r # Use original velocity for color scale
cbar_label = "Average Velocity"
# --- Angular Coordinate Calculation ---
N = len(df) # Number of locations/points
# Generate linear space from 0 to 1 for N points
theta_normalized = np.linspace(
0, 1, N, endpoint=True
) # Includes endpoint 1
# Map normalized theta to the desired angular coverage
angular_range_map = {
"default": 2 * np.pi,
"half_circle": np.pi,
"quarter_circle": np.pi / 2,
"eighth_circle": np.pi / 4,
}
# Get the angular span in radians, default to full circle if invalid
angle_span = angular_range_map.get(acov.lower(), 2 * np.pi)
if acov.lower() not in angular_range_map:
warnings.warn(
f"Invalid `acov` value '{acov}'. Using 'default' (2*pi).",
UserWarning,
stacklevel=2,
)
# Calculate final theta values
theta = theta_normalized * angle_span
# --- Color Normalization for Plotting ---
# try:
cmap_used = get_cmap(cmap, default="viridis")
# except (TypeError, ValueError, KeyError):
# warnings.warn(
# f"Invalid `cmap` name '{cmap}'. Falling back to 'viridis'.",
# UserWarning,
# stacklevel=2,
# )
# cmap = "viridis"
# cmap_used = get_cmap(cmap)
# Normalize color values to the range [0, 1] for the colormap
color_norm = Normalize(vmin=np.min(color_vals), vmax=np.max(color_vals))
# Map normalized color values to actual colors using the colormap
colors = cmap_used(color_norm(color_vals))
# --- Plotting ---
if ax is None:
fig, ax = plt.subplots(
figsize=figsize, subplot_kw={"projection": "polar"}
)
else:
fig = ax.figure
# Set the angular limits based on angular coverage
ax.set_thetamin(0)
ax.set_thetamax(np.degrees(angle_span)) # set_thetamax expects degrees
# Create the polar scatter plot
ax.scatter(
theta,
r_normalized if normalize else r, # Use normalized or raw r
c=colors, # Point colors
s=s, # Point size
edgecolor="k", # Point edge color (optional, for visibility)
linewidth=0.5, # Point edge width (optional)
alpha=alpha, # Point transparency
)
# Set plot title
ax.set_title(
title or "Average Velocity Polar Plot", fontsize=14, y=1.08
) # Adjust title position
# Add color bar if requested
if cbar:
# Create a ScalarMappable for the colorbar
sm = cm.ScalarMappable(norm=color_norm, cmap=cmap_used)
sm.set_array([]) # Necessary for ScalarMappable
# Add the colorbar to the figure
cbar_obj = plt.colorbar(sm, ax=ax, pad=0.1, shrink=0.7)
cbar_obj.set_label(cbar_label, fontsize=10)
# Customize grid and labels
if show_grid:
ax.grid(True, linestyle="--", linewidth=0.5, alpha=0.7)
else:
ax.grid(False)
# Optionally mask angular tick labels
if mask_angle:
ax.set_xticklabels([])
# Set radial label based on normalization
if normalize:
ax.set_ylabel("Normalized Average Velocity", labelpad=32, fontsize=10)
# Ensure radial limits are appropriate for normalized data
# ax.set_ylim(bottom=0, top=1.05) # Give slight padding
# ax.set_yticks(np.linspace(0, 1, 5)) # Example radial ticks
else:
ax.set_ylabel("Average Velocity", labelpad=32, fontsize=10)
# Radial limits might need auto-scaling or manual setting
# --- Save or Show ---
final = safe_savefig(
savefig,
fig,
dpi=dpi,
bbox_inches="tight",
)
if final is not None:
plt.close(fig)
else:
fig.tight_layout()
plt.show()
return ax
plot_velocity.__doc__ = r"""
Polar plot visualizing average velocity across locations.
Generates a polar scatter plot where each point represents a
unique location or observation from the input DataFrame. The
radial distance (`r`) of each point corresponds to the average
rate of change (velocity) of the median prediction (Q50) over
consecutive time periods (e.g., years), optionally normalized
to [0, 1]. The angular position (`theta`) represents the location,
currently determined by its index in the DataFrame, mapped onto a
specified angular coverage. The color of each point provides an
additional dimension, representing either the calculated velocity
itself or the average absolute magnitude of the Q50 predictions
over the considered time periods [1]_.
This visualization is useful for identifying spatial patterns in
the dynamics of a phenomenon, such as locating areas of rapid or
slow change (high/low velocity) in land subsidence predictions.
Coloring by magnitude helps to contextualize the velocity (e.g.,
is high velocity occurring in areas of already high subsidence?).
Parameters
----------
df : pd.DataFrame
Input DataFrame containing the data. Must include the columns
specified in `q50_cols`. Decorator `@isdf` ensures this is a
pandas DataFrame. Decorator `@check_non_emptiness` ensures
it's not empty.
q50_cols : list of str
An ordered list of column names representing the Q50 (median)
predictions for consecutive time steps (e.g., years). The list
must contain at least two column names to compute velocity.
Example: ``['subsidence_2022_q50', 'subsidence_2023_q50',
'subsidence_2024_q50']``.
theta_col : str, optional
*Intended* column name to determine the angular position (`theta`)
for each location (e.g., 'latitude', 'longitude', or a spatial
index). If ``None``, the DataFrame index is conceptually used.
*Note: The current implementation maps the DataFrame row index
to the angular range specified by `acov`, regardless of whether
`theta_col` is provided. Providing `theta_col` will currently
trigger a warning but will not affect the plot's angular axis.*
Default is ``None``.
cmap : str, default='viridis'
The name of the Matplotlib colormap used to color the scatter
points based on `color_vals` (determined by `use_abs_color`).
acov : str, default='default'
Angular coverage defining the span of the polar plot's theta
axis. Options are:
- ``'default'``: Full circle (2p radians or 360 degrees).
- ``'half_circle'``: Half circle (p radians or 180 degrees).
- ``'quarter_circle'``: Quarter circle (p/2 radians or 90
degrees).
- ``'eighth_circle'``: Eighth circle (p/4 radians or 45
degrees).
Invalid options default to ``'default'``.
normalize : bool, default=True
If ``True``, the calculated average velocity values (`r`) are
min-max normalized to the range [0, 1] before plotting radially.
This emphasizes relative velocity patterns. If ``False``, the
raw average velocity values are used for the radial coordinate.
use_abs_color : bool, default=True
Determines the variable used for coloring the points:
- If ``True``, points are colored based on the average absolute
magnitude of the Q50 values across the specified `q50_cols`.
This highlights areas with high overall prediction values.
- If ``False``, points are colored based on the calculated
average velocity (`r`) itself. This highlights areas of high
or low rate of change.
figsize : tuple of (float, float), default=(9, 9)
The width and height of the figure in inches.
title : str, optional
The title displayed above the polar plot. If ``None``, a default
title "Normalized Subsidence Velocity" (or similar, depending
on context, though not dynamically changed here) is used.
Default is ``None``.
s : float or int, default=30
The marker size for the scatter points.
alpha : float, default=0.85
The transparency level of the scatter points (0=transparent,
1=opaque). Useful for visualizing dense data.
show_grid : bool, default=True
If ``True``, display the polar grid lines (radial and angular)
on the plot.
savefig : str, optional
The file path (including extension, e.g., 'velocity_plot.pdf')
where the plot image should be saved. If ``None``, the plot is
displayed interactively using `plt.show()`.
Default is ``None``.
cbar : bool, default=True
If ``True``, display a color bar alongside the plot indicating
the mapping between colors and the values defined by
`use_abs_color`.
mask_angle : bool, default=False
If ``True``, hide the angular tick labels (the degrees/radians
around the circumference). This can be useful if the angular
position based on index is not inherently meaningful.
Returns
-------
ax : matplotlib.axes.Axes
The Matplotlib Axes object containing the polar scatter plot.
Can be used for further customization.
Raises
------
ValueError
If `q50_cols` contains fewer than two column names.
See Also
--------
numpy.diff : Computes the difference between consecutive elements.
numpy.mean : Computes the arithmetic mean.
matplotlib.pyplot.scatter : Creates scatter plots.
matplotlib.pyplot.polar : Creates polar plots.
kdiagram.plot.uncertainty.plot_uncertainty_drift : Visualizes
uncertainty width changes over time.
Notes
-----
- The function assumes the columns in `q50_cols` represent equally
spaced time steps for the velocity calculation to be meaningful
as an average *yearly* (or per-step) velocity.
- The average velocity (`r`) is calculated as the mean of the
first-order differences between consecutive columns in `q50_cols`.
- Normalization of `r` uses min-max scaling:
:math:`r' = (r - \min(r)) / (\max(r) - \min(r))`.
- The angular coordinate `theta` is currently derived from the
DataFrame index, mapped linearly onto the angular range defined
by `acov`. The `theta_col` parameter is not used for positioning
in the current implementation, which might be revised in future
versions. A warning is issued if `theta_col` is provided [2]_.
Let :math:`\mathbf{Q}` be the data matrix extracted from `df` using
columns `q50_cols`, with shape :math:`(N, M)`, where :math:`N` is
the number of locations (rows) and :math:`M` is the number of time
points (columns). Note the transpose compared to the description
in `plot_feature_fingerprint`.
1. **Velocity Calculation**: The differences between consecutive time
points for each location :math:`j` are computed:
:math:`\Delta Q_{j,i} = Q_{j, i+1} - Q_{j, i}` for
:math:`i = 0, \dots, M-2`.
The average velocity for location :math:`j` is:
.. math::
r_j = \frac{1}{M-1} \sum_{i=0}^{M-2} \Delta Q_{j,i}
2. **Radial Normalization** (if `normalize=True`):
Let :math:`\mathbf{r} = (r_0, \dots, r_{N-1})`.
.. math::
r'_j = \frac{r_j - \min(\mathbf{r})}{\max(\mathbf{r}) - \min(\mathbf{r})}
If :math:`\max(\mathbf{r}) = \min(\mathbf{r})`, :math:`r'_j = 0`.
3. **Color Value Calculation**:
- If `use_abs_color=True`: Average absolute magnitude.
.. math::
c_j = \frac{1}{M} \sum_{i=0}^{M-1} |Q_{j,i}|
- If `use_abs_color=False`: Use average velocity.
:math:`c_j = r_j`
4. **Angular Coordinate Calculation**:
Let :math:`S` be the angular span in radians determined by `acov`
(e.g., :math:`2\pi` for ``'default'``). The angle for location
:math:`j` (where :math:`j` is the row index from :math:`0` to
:math:`N-1`) is:
.. math::
\theta_j = \frac{j}{N} \times S
The code uses `np.linspace(0, 1, N)` which generates N
points from 0 to 1 inclusive, so the formula might be slightly
different depending on endpoint handling, effectively
:math:`\theta_j = \frac{j}{N-1} \times S` for the N points if
endpoint=True, or spacing relates to N intervals if
``endpoint=False``. The code uses `np.linspace(0, 1, N)` and
multiplies by `angle_span`, suggesting the angles might span
from 0 up to `angle_span`.
References
----------
.. [1] Kouadio, K. L., Liu, R., Loukou, K. G. H., Liu, J., & Liu, W. (2025).
Analytics Framework for Interpreting Spatiotemporal Probabilistic Forecasts.
*International Journal of Forecasting*. Manuscript submitted.
.. [2] Hunter, J. D. (2007). Matplotlib: A 2D graphics environment.
Computing in Science & Engineering, 9(3), 90-95.
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from kdiagram.plot.uncertainty import plot_velocity
**1. Random Example:**
>>> np.random.seed(0)
>>> N_points = 100
>>> df_random = pd.DataFrame({
... 'location_id': range(N_points),
... 'value_2020_q50': np.random.rand(N_points) * 10,
... 'value_2021_q50': (np.random.rand(N_points) * 10 +
... np.linspace(0, 5, N_points)),
... 'value_2022_q50': (np.random.rand(N_points) * 10 +
... np.linspace(0, 10, N_points)),
... 'latitude': np.linspace(22, 23, N_points)
... })
>>> q50_cols_random = ['value_2020_q50', 'value_2021_q50',
... 'value_2022_q50']
>>> ax_random = plot_velocity(
... df=df_random,
... q50_cols=q50_cols_random,
... theta_col='latitude', # Note: currently ignored for pos
... acov='default',
... normalize=True,
... use_abs_color=False, # Color by velocity
... title='Random Data Velocity Profile',
... cmap='coolwarm',
... s=40,
... cbar=True
... )
>>> # plt.show() is called internally if savefig is None
**2. Concrete Example (Subsidence Data - adapted from docstring):**
>>> # Assume zhongshan_pred_2023_2026 is a loaded DataFrame like:
>>> # zhongshan_pred_2023_2026 = pd.DataFrame({
>>> # 'subsidence_2022_q50': np.random.rand(50)*5 + 5,
>>> # 'subsidence_2023_q50': np.random.rand(50)*6 + 6,
>>> # 'subsidence_2024_q50': np.random.rand(50)*7 + 7,
>>> # 'subsidence_2025_q50': np.random.rand(50)*8 + 8,
>>> # 'subsidence_2026_q50': np.random.rand(50)*9 + 9,
>>> # 'latitude': np.linspace(22.2, 22.8, 50)
>>> # }) # Dummy data for example execution
>>> # Create dummy data if zhongshan_pred_2023_2026 doesn't exist
>>> try:
... zhongshan_pred_2023_2026
... except NameError:
... print("Creating dummy subsidence data for example...")
... zhongshan_pred_2023_2026 = pd.DataFrame({
... 'subsidence_2022_q50': np.random.rand(150)*5 + 5,
... 'subsidence_2023_q50': np.random.rand(150)*6 + 6 + np.linspace(0, 2, 150),
... 'subsidence_2024_q50': np.random.rand(150)*7 + 7 + np.linspace(0, 4, 150),
... 'subsidence_2025_q50': np.random.rand(150)*8 + 8 + np.linspace(0, 6, 150),
... 'subsidence_2026_q50': np.random.rand(150)*9 + 9 + np.linspace(0, 8, 150),
... 'latitude': np.linspace(22.2, 22.8, 150)
... })
>>> subsidence_q50_cols = [
... 'subsidence_2022_q50', 'subsidence_2023_q50',
... 'subsidence_2024_q50', 'subsidence_2025_q50',
... 'subsidence_2026_q50',
... ]
>>> ax_subsidence = plot_velocity(
... df=zhongshan_pred_2023_2026,
... q50_cols=subsidence_q50_cols,
... theta_col='latitude', # Ignored for pos, triggers warning
... acov='quarter_circle', # Focus angular range
... normalize=True,
... use_abs_color=True, # Color by Q50 magnitude
... title='Subsidence Velocity Across Zhongshan (2022–2026)',
... cmap='plasma',
... s=25,
... cbar=True,
... mask_angle=True # Hide angle labels
... )
>>> # plt.show() called internally
"""
[docs]
@check_non_emptiness
@isdf
def plot_interval_consistency(
df: pd.DataFrame,
qlow_cols: list[str],
qup_cols: list[str],
q50_cols: list[str] | None = None,
theta_col: str | None = None,
use_cv: bool = True,
cmap: str = "coolwarm",
acov: str = "default",
title: str | None = None,
figsize: tuple[float, float] = (9, 9),
s: float | int = 30,
alpha: float = 0.85,
show_grid: bool = True,
mask_angle: bool = False,
savefig: str | None = None,
dpi: int = 300,
ax: Axes | None = None,
):
# --- Input Validation ---
# Basic DataFrame checks handled by decorators @isdf @check_non_emptiness
if len(qlow_cols) != len(qup_cols):
raise ValueError(
"Mismatch in length between `qlow_cols` "
f"({len(qlow_cols)}) and `qup_cols` ({len(qup_cols)})."
)
if q50_cols is not None and len(qlow_cols) != len(q50_cols):
raise ValueError(
"Mismatch in length between quantile columns: "
f"qlow/qup ({len(qlow_cols)}) and q50 ({len(q50_cols)})."
)
# Check if all specified columns exist in the DataFrame
all_cols = qlow_cols + qup_cols + (q50_cols if q50_cols else [])
missing_cols = [col for col in all_cols if col not in df.columns]
if missing_cols:
raise ValueError(
f"The following columns are missing from the DataFrame: "
f"{', '.join(missing_cols)}"
)
# Check theta_col status and warn if provided but unused
if theta_col is not None:
if theta_col not in df.columns:
warnings.warn(
f"Specified `theta_col` ('{theta_col}') not found in "
f"DataFrame columns. Using index for angular position.",
UserWarning,
stacklevel=2,
)
else:
# Issue warning as current implementation uses index
warnings.warn(
f"`theta_col` ('{theta_col}') is provided but the current"
f" implementation uses the DataFrame index for angular "
f"positioning ('theta'). The column '{theta_col}' is "
f"currently ignored for positioning.",
UserWarning,
stacklevel=2,
)
# --- Data Calculation ---
# Calculate interval widths for each year/timepoint for all locations
# widths shape: (M, N) where M=num_time_steps, N=num_locations
try:
widths = np.array(
[
df[qup].values - df[qlo].values
for qlo, qup in zip(qlow_cols, qup_cols)
]
)
except Exception as e:
raise TypeError(
"Could not compute widths. Ensure quantile columns contain "
"numeric data."
) from e
# Calculate radial value 'r' (std dev or CV of widths over time)
# Result shape: (N,)
mean_widths = np.mean(widths, axis=0)
std_widths = np.std(widths, axis=0)
if use_cv:
# Calculate Coefficient of Variation (CV)
# Handle division by zero or near-zero mean width
# Use np.divide for safe division, setting result to 0 where mean is ~0
r = np.divide(
std_widths,
mean_widths,
out=np.zeros_like(mean_widths, dtype=float), # Output array
where=np.abs(mean_widths) > 1e-9,
) # Condition for division
# Optionally issue warning if division by zero occurred
if np.any(np.abs(mean_widths) <= 1e-9):
num_zeros = np.sum(np.abs(mean_widths) <= 1e-9)
warnings.warn(
f"Mean interval width was zero or near-zero for {num_zeros}"
f" locations. CV is set to 0 for these locations.",
RuntimeWarning,
stacklevel=2,
)
radial_label = "CV of Interval Width (Q90-Q10)"
else:
# Use Standard Deviation
r = std_widths
radial_label = "Std Dev of Interval Width (Q90-Q10)"
# Calculate color values
if q50_cols:
try:
# Average Q50 across time for each location
q50_values = np.array([df[q].values for q in q50_cols])
color_vals_source = np.mean(q50_values, axis=0)
cbar_label = "Average Q50 Prediction"
except Exception as e:
warnings.warn(
f"Could not compute average Q50. Ensure Q50 columns contain"
f" numeric data. Falling back to coloring by 'r'. Error: {e}",
UserWarning,
stacklevel=2,
)
# Fallback: color by the radial value itself
color_vals_source = r
cbar_label = radial_label # Label reflects 'r'
else:
# Fallback: color by the radial value itself if no q50_cols given
color_vals_source = r
cbar_label = radial_label # Label reflects 'r'
# --- Angular Coordinate Calculation ---
N = len(df) # Number of locations
# Generate linear space [0, 1] for N points
theta_normalized = np.linspace(0, 1, N, endpoint=True)
# Map normalized theta to the desired angular coverage
angular_range_map = {
"default": 2 * np.pi,
"half_circle": np.pi,
"quarter_circle": np.pi / 2,
"eighth_circle": np.pi / 4,
}
angle_span = angular_range_map.get(acov.lower(), 2 * np.pi)
if acov.lower() not in angular_range_map:
warnings.warn(
f"Invalid `acov` value '{acov}'. Using 'default' (2*pi).",
UserWarning,
stacklevel=2,
)
# Calculate final theta values
theta = theta_normalized * angle_span
# --- Color Normalization ---
# Ensure cmap is valid for fallback
cmap_used = get_cmap(cmap, default="coolwarm")
# Normalize color values for the colormap
color_norm = Normalize(
vmin=np.min(color_vals_source), vmax=np.max(color_vals_source)
)
# Get actual colors
plot_colors = cmap_used(color_norm(color_vals_source))
# --- Plotting ---
if ax is None:
fig, ax = plt.subplots(
figsize=figsize, subplot_kw={"projection": "polar"}
)
else:
fig = ax.figure
# Set angular limits
ax.set_thetamin(0)
ax.set_thetamax(np.degrees(angle_span))
# Create the polar scatter plot
ax.scatter(
theta,
r, # Radial value (CV or Std Dev)
c=plot_colors, # Point colors
s=s, # Point size
edgecolor="k", # Point edge color
linewidth=0.5, # Point edge width
alpha=alpha, # Point transparency
)
# Set plot title
ax.set_title(
title or "Prediction Interval Consistency", fontsize=14, y=1.08
)
# Add color bar
sm = cm.ScalarMappable(norm=color_norm, cmap=cmap_used)
sm.set_array([]) # Necessary for ScalarMappable
cbar_obj = plt.colorbar(sm, ax=ax, pad=0.1, shrink=0.7)
cbar_obj.set_label(cbar_label, fontsize=10)
# Customize grid and labels
if show_grid:
ax.grid(True, linestyle="--", linewidth=0.5, alpha=0.7)
else:
ax.grid(False)
# Optionally mask angular tick labels
if mask_angle:
ax.set_xticklabels([])
# Set radial axis label
ax.set_ylabel(radial_label, labelpad=32, fontsize=10)
# Optional: adjust radial limits if needed, e.g., start at 0
ax.set_ylim(bottom=0)
# --- Save or Show ---
final = safe_savefig(
savefig,
fig,
dpi=dpi,
bbox_inches="tight",
)
if final is not None:
plt.close(fig)
else:
fig.tight_layout()
plt.show()
return ax
plot_interval_consistency.__doc__ = r"""
Polar plot showing consistency of prediction interval widths.
This function generates a polar scatter plot to visualize the
temporal consistency (or variability) of prediction interval
widths (e.g., Q90 - Q10) across different locations over multiple
time steps or forecast horizons:
- The **angular position (`theta`)** represents each location,
currently derived from the DataFrame index and mapped onto the
specified angular coverage (`acov`).
- The **radial distance (`r`)** quantifies the inconsistency or
variability of the interval width over time for each location. It
is calculated as either the standard deviation (absolute
variability) or the coefficient of variation (CV, relative
variability) of the interval widths (Upper Quantile - Lower
Quantile) across the specified time steps. Higher `r` values
indicate locations where the predicted uncertainty range
fluctuates more significantly over time.
- The **color** of each point typically represents the average
median prediction (Q50) across the time steps (if `q50_cols`
are provided). This adds context, helping to identify if interval
inconsistency occurs in regions of high or low average predictions.
If `q50_cols` are not provided, color defaults to representing the
inconsistency measure `r`.
This plot is useful for diagnosing model reliability, identifying
locations or conditions where the model's uncertainty estimates
are unstable or vary considerably across different forecast
horizons.
Parameters
----------
df : pd.DataFrame
Input DataFrame containing the data. Must include columns
specified in `qlow_cols` and `qup_cols`. Decorator `@isdf`
ensures this is a pandas DataFrame. Decorator
`@check_non_emptiness` ensures it's not empty.
qlow_cols : list of str
List of column names representing the lower quantile (e.g., Q10)
predictions for consecutive time steps (e.g., years). Order
should correspond to the time steps. Example:
``['subsidence_2023_q10', 'subsidence_2024_q10', ...]``.
qup_cols : list of str
List of column names representing the upper quantile (e.g., Q90)
predictions for the same consecutive time steps as `qlow_cols`.
Must be the same length as `qlow_cols`. Example:
``['subsidence_2023_q90', 'subsidence_2024_q90', ...]``.
q50_cols : list of str, optional
List of column names representing the median quantile (Q50)
predictions for the same time steps. If provided, the average
Q50 value across these columns will be used to color the points.
Must be the same length as `qlow_cols` if provided. If ``None``,
the color will represent the radial value `r` (the inconsistency
measure). Default is ``None``.
theta_col : str, optional
*Intended* column name to determine the angular position (`theta`)
for each location (e.g., 'latitude', 'longitude', or a spatial
index). If ``None``, the DataFrame index is conceptually used.
*Note: The current implementation maps the DataFrame row index
to the angular range specified by `acov`, regardless of whether
`theta_col` is provided. Providing `theta_col` will currently
trigger a warning but will not affect the plot's angular axis.*
Default is ``None``.
use_cv : bool, default=True
Determines the measure of interval width variability used for the
radial coordinate `r`:
- If ``True``, `r` is the Coefficient of Variation (CV) of the
interval widths (Std Dev / Mean). CV measures relative
variability, useful when mean widths differ substantially.
- If ``False``, `r` is the Standard Deviation (Std Dev) of the
interval widths. Std Dev measures absolute variability.
cmap : str, default='coolwarm'
The name of the Matplotlib colormap used to color the scatter
points based on the average Q50 value (or `r` if `q50_cols` is
``None``).
acov : str, default='default'
Angular coverage defining the span of the polar plot's theta
axis. Options: ``'default'`` (360°), ``'half_circle'`` (180°),
``'quarter_circle'`` (90°), ``'eighth_circle'`` (45°). Invalid
options default to ``'default'``.
title : str, optional
The title displayed above the polar plot. If ``None``, a default
title like "Prediction Interval Consistency (Q90–Q10)" is used.
Default is ``None``.
figsize : tuple of (float, float), default=(9, 9)
The width and height of the figure in inches.
s : float or int, default=30
The marker size for the scatter points.
alpha : float, default=0.85
The transparency level of the scatter points (0=transparent,
1=opaque).
show_grid : bool, default=True
If ``True``, display the polar grid lines.
mask_angle : bool, default=False
If ``True``, hide the angular tick labels. Useful if the index-
based angle is not directly interpretable.
savefig : str, optional
File path to save the plot image. If ``None``, displays the plot
interactively. Default is ``None``.
Returns
-------
ax : matplotlib.axes.Axes
The Matplotlib Axes object containing the polar scatter plot.
Raises
------
AssertionError
If `qlow_cols` and `qup_cols` have different lengths.
ValueError
If specified columns in `qlow_cols`, `qup_cols`, or `q50_cols`
are not found in the DataFrame.
See Also
--------
plot_velocity : Plot average velocity in polar coordinates.
numpy.std : Compute the standard deviation.
numpy.mean : Compute the arithmetic mean.
matplotlib.pyplot.scatter : Create scatter plots.
Notes
-----
Interval-width consistency is assessed from paired lower/upper quantiles
for multiple time steps. For each location and time step, the width is
computed as upper minus lower. The radial value encodes either the
standard deviation of these widths (absolute variability) or their
coefficient of variation (relative variability), with safe handling of
zero means by setting the CV to zero when the average width is
numerically indistinguishable from zero. Angles are derived from the row
index and mapped linearly across the angular span determined by
:py:data:`acov`; the current implementation does not use
:py:data:`theta_col` for positioning. Rows containing missing values in
any required column are dropped prior to computation. These diagnostics
relate to standard notions of predictive-interval calibration and
stability; see :footcite:t:`Gneiting2007b, Jolliffe2012`.
**Interval widths.** Let :math:`\mathbf L` and :math:`\mathbf U` be
matrices extracted from :py:data:`df` using :py:data:`qlow_cols` and
:py:data:`qup_cols`, respectively, both of shape :math:`(N,M)`, with
:math:`N` locations and :math:`M` time steps. Define the width matrix
:math:`\mathbf W` by
.. math::
W_{j,i} = U_{j,i} - L_{j,i}
where :math:`j` indexes locations (:math:`0` to :math:`N-1`) and
:math:`i` indexes time steps (:math:`0` to :math:`M-1`).
**Radial Coordinate Calculation (`r`)**: Let :math:`\mathbf{w}_j =
(W_{j,0}, \dots, W_{j,M-1})` be the vector of widths over time
for location :math:`j`. Let :math:`\bar{w}_j = \text{mean}(\mathbf{w}_j)`
and :math:`\sigma_{w_j} = \text{std}(\mathbf{w}_j)`.
- If `use_cv=False` (Standard Deviation):
.. math::
r_j = \sigma_{w_j}
- If `use_cv=True` (Coefficient of Variation):
.. math::
r_j = \begin{cases} \frac{\sigma_{w_j}}{\bar{w}_j} &\\
\text{if } |\bar{w}_j| > \epsilon \\ 0 & \text{if }\\
|\bar{w}_j| \le \epsilon \end{cases}
where :math:`\epsilon` is a small threshold to prevent division
by zero.
**Color Value Calculation (`c`)**: Let :math:`\mathbf{Q50}` be the
data matrix (shape :math:`(N, M)`) from `q50_cols`.
- If `q50_cols` is provided: Let :math:`\mathbf{q50}_j =
(Q50_{j,0}, \dots, Q50_{j,M-1})`.
.. math::
c_j = \text{mean}(\mathbf{q50}_j) = \frac{1}{M} \sum_{i=0}^{M-1} Q50_{j,i}
- If `q50_cols` is ``None``:
:math:`c_j = r_j`
**Angular Coordinate Calculation (`theta`)**:
Same index-based calculation as `plot_velocity`. Let :math:`S` be
the angular span from `acov`.
.. math::
\theta_j = \frac{j}{N} \times S
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from kdiagram.plot.uncertainty import plot_interval_consistency
**1. Random Example:**
>>> np.random.seed(1)
>>> N_points = 120
>>> df_rand_interval = pd.DataFrame({
... 'id': range(N_points),
... 'lat': np.linspace(30, 31, N_points),
... 'val_2021_q10': np.random.rand(N_points) * 5,
... 'val_2021_q50': np.random.rand(N_points) * 5 + 5,
... 'val_2021_q90': np.random.rand(N_points) * 5 + 10,
... 'val_2022_q10': np.random.rand(N_points) * 6, # Slightly wider
... 'val_2022_q50': np.random.rand(N_points) * 6 + 6,
... 'val_2022_q90': np.random.rand(N_points) * 6 + 12,
... 'val_2023_q10': np.random.rand(N_points) * 4, # Narrower
... 'val_2023_q50': np.random.rand(N_points) * 4 + 7,
... 'val_2023_q90': np.random.rand(N_points) * 4 + 11,
... })
>>> q10_cols_rand = ['val_2021_q10', 'val_2022_q10', 'val_2023_q10']
>>> q90_cols_rand = ['val_2021_q90', 'val_2022_q90', 'val_2023_q90']
>>> q50_cols_rand = ['val_2021_q50', 'val_2022_q50', 'val_2023_q50']
>>> ax_rand_ic = plot_interval_consistency(
... df=df_rand_interval,
... qlow_cols=q10_cols_rand,
... qup_cols=q90_cols_rand,
... q50_cols=q50_cols_rand,
... theta_col='lat', # Note: Ignored for positioning
... use_cv=True, # Use CV for radial axis
... cmap='viridis',
... acov='half_circle',
... title='Random Interval Width Consistency (CV)',
... s=35
... )
>>> # plt.show() called internally
**2. Concrete Example (Subsidence Data - adapted from docstring):**
>>> # Assume zhongshan_pred_2023_2026 is loaded DataFrame like:
>>> # Create dummy data if it doesn't exist
>>> try:
... zhongshan_pred_2023_2026
... except NameError:
... print("Creating dummy subsidence data for example...")
... N_sub = 150
... zhongshan_pred_2023_2026 = pd.DataFrame({
... 'latitude': np.linspace(22.2, 22.8, N_sub),
... **{f'subsidence_{yr}_q10': np.random.rand(N_sub)*(yr-2020)+1
... for yr in range(2023, 2027)},
... **{f'subsidence_{yr}_q50': np.random.rand(N_sub)*(yr-2019)+5
... + np.linspace(0, (yr-2022)*2, N_sub)
... for yr in range(2023, 2027)},
... **{f'subsidence_{yr}_q90': np.random.rand(N_sub)*(yr-2018)+10
... + np.linspace(0, (yr-2022)*4, N_sub)
... for yr in range(2023, 2027)},
... })
>>> qlow_sub = [f'subsidence_{yr}_q10' for yr in range(2023, 2027)]
>>> qup_sub = [f'subsidence_{yr}_q90' for yr in range(2023, 2027)]
>>> q50_sub = [f'subsidence_{yr}_q50' for yr in range(2023, 2027)]
>>> ax_sub_ic = plot_interval_consistency(
... df=zhongshan_pred_2023_2026,
... qlow_cols=qlow_sub,
... qup_cols=qup_sub,
... q50_cols=q50_sub,
... theta_col='latitude', # Ignored for pos, triggers warning
... acov='default',
... title='Subsidence Uncertainty Consistency (2023–2026)',
... use_cv=False, # Use Std Dev for radius
... cmap='coolwarm',
... s=28,
... alpha=0.8,
... mask_angle=True
... )
>>>
References
----------
.. footbibliography::
"""
[docs]
@check_non_emptiness
@isdf
def plot_anomaly_magnitude(
df: pd.DataFrame,
actual_col: str,
q_cols: list[str] | tuple[str, str],
theta_col: str | None = None,
acov: str = "default",
title: str = "Anomaly Magnitude Polar Plot",
figsize: tuple[float, float] = (8.0, 8.0),
cmap_under: str = "Blues",
cmap_over: str = "Reds",
s: int = 30,
alpha: float = 0.8,
show_grid: bool = True,
verbose: int = 1,
cbar: bool = False,
dpi: int = 300,
savefig: str | None = None,
mask_angle: bool = False,
ax: Axes | None = None,
):
try:
qlow_col, qup_col = validate_qcols(
q_cols=q_cols,
ncols_exp="==2", # Expect exactly two columns
err_msg=(
"Expected `q_cols` to contain exactly two column names "
f"[lower_bound, upper_bound], but got: {q_cols}"
),
)
except Exception as e:
# Catch potential errors from validate_qcols if it raises them
raise ValueError(f"Validation of `q_cols` failed: {e}") from e
# Consolidate list of essential columns
cols_needed = [actual_col, qlow_col, qup_col]
if theta_col:
# Only add if specified, check existence later if needed
cols_needed.append(theta_col)
# Check existence of essential columns
# missing_cols = [col for col in cols_needed if col not in df.columns]
# Allow theta_col to be missing if specified, handled later
missing_essential = [
col
for col in [actual_col, qlow_col, qup_col]
if col not in df.columns
]
if missing_essential:
raise ValueError(
"The following essential columns are missing from the "
f"DataFrame: {', '.join(missing_essential)}"
)
# Drop rows with NaN in essential columns before proceeding
# Use only the definitely required columns for dropna
essential_cols_for_na = [actual_col, qlow_col, qup_col]
data = df[cols_needed].dropna(subset=essential_cols_for_na).copy()
if len(data) == 0:
warnings.warn(
"DataFrame is empty after dropping NaN values"
" in essential columns. Cannot generate plot.",
UserWarning,
stacklevel=2,
)
return None # Cannot proceed
# --- Anomaly Calculation ---
# Extract data as numpy arrays for efficiency
try:
y = data[actual_col].to_numpy(dtype=float)
y_lo = data[qlow_col].to_numpy(dtype=float)
y_hi = data[qup_col].to_numpy(dtype=float)
except (ValueError, TypeError) as e:
raise TypeError(
f"Failed to convert essential columns to numeric arrays."
f" Check data types. Original error: {e}"
) from e
# Identify under- and over-predictions
under_mask = y < y_lo
over_mask = y > y_hi
# Calculate anomaly magnitude (distance from the violated bound)
anomaly_mag = np.zeros_like(y, dtype=float)
# Magnitude is positive: bound - actual for under, actual - bound for over
anomaly_mag[under_mask] = y_lo[under_mask] - y[under_mask]
anomaly_mag[over_mask] = y[over_mask] - y_hi[over_mask]
# Filter out non-anomalies for plotting (only plot r > 0)
is_anomaly = under_mask | over_mask
if not np.any(is_anomaly):
warnings.warn(
"No anomalies detected (all actual values are within "
"the specified quantile bounds). Plot will be empty.",
UserWarning,
stacklevel=2,
)
# Still create plot structure, but it will be empty
# Filter data to only include anomalies
anomaly_mag = anomaly_mag[is_anomaly]
under_mask_filtered = under_mask[is_anomaly]
over_mask_filtered = over_mask[is_anomaly]
data_filtered = data[is_anomaly] # Filter DataFrame rows too
N_anomalies = len(data_filtered) # Number of anomalies
# --- Theta Coordinate and Ordering ---
# Determine ordering index
if theta_col and theta_col in data_filtered.columns:
try:
# Sort based on the theta_col values of the anomalies
ordered_idx = np.argsort(
data_filtered[theta_col].to_numpy(dtype=float)
)
except (ValueError, TypeError):
warnings.warn(
f"Could not sort by `theta_col` ('{theta_col}') as it "
f"contains non-numeric data after NaN removal. "
f"Using default DataFrame order.",
UserWarning,
stacklevel=2,
)
ordered_idx = np.arange(N_anomalies) # Fallback to original order
else:
# Use default order if theta_col not specified or not found
if theta_col and theta_col not in data_filtered.columns:
warnings.warn(
f"Specified `theta_col` ('{theta_col}') not found in the"
f" (filtered) DataFrame. Using default DataFrame order.",
UserWarning,
stacklevel=2,
)
ordered_idx = np.arange(N_anomalies)
# Generate base theta values (linear spacing)
theta_norm = np.linspace(0.0, 1.0, N_anomalies, endpoint=True)
# Define angular coverage range
coverage_map = {
"default": 2 * np.pi,
"half_circle": np.pi,
"quarter_circle": np.pi / 2,
"eighth_circle": np.pi / 4,
}
coverage = coverage_map.get(acov.lower(), 2 * np.pi)
if acov.lower() not in coverage_map:
warnings.warn(
f"Invalid `acov` value '{acov}'. Using 'default' (2*pi).",
UserWarning,
stacklevel=2,
)
# Calculate final theta values based on coverage
theta = theta_norm * coverage
# Apply ordering to all relevant arrays
theta = theta[ordered_idx]
anomaly_mag_ordered = anomaly_mag[ordered_idx]
under_mask_ordered = under_mask_filtered[ordered_idx]
over_mask_ordered = over_mask_filtered[ordered_idx]
# --- Plotting ---
if ax is None:
# Constrained layout avoids title / r-label overlap on polar axes
fig = plt.figure(figsize=figsize, constrained_layout=True)
ax = fig.add_subplot(111, projection="polar")
else:
fig = ax.figure
ax.set_thetamin(0)
ax.set_thetamax(np.degrees(coverage)) # Expects degrees
# ax.set_title(title, fontsize=14, y=1.08) # Adjust position
# Setup grid
if show_grid:
ax.grid(True, linestyle="--", linewidth=0.5, alpha=0.7)
else:
ax.grid(False)
# Optionally mask angle labels
if mask_angle:
ax.set_xticklabels([])
# --- Color Normalization (shared scale based on max magnitude) ---
# Ensure vmax is at least a small positive number for normalization
vmax = max(
float(anomaly_mag_ordered.max()) if N_anomalies > 0 else 0.0, 1e-5
)
norm = Normalize(vmin=0.0, vmax=vmax)
# Retrieve colormaps safely
try:
cmap_under_obj = get_cmap(cmap_under)
except ValueError:
warnings.warn(
f"Invalid `cmap_under` ('{cmap_under}'). Using default 'Blues'.",
UserWarning,
stacklevel=2,
)
cmap_under_obj = get_cmap("Blues")
try:
cmap_over_obj = get_cmap(cmap_over)
except ValueError:
warnings.warn(
f"Invalid `cmap_over` ('{cmap_over}'). Using default 'Reds'.",
UserWarning,
stacklevel=2,
)
cmap_over_obj = get_cmap("Reds")
# --- Scatter Plot Anomalies (separate calls for color/label) ---
if np.any(under_mask_ordered):
ax.scatter(
theta[under_mask_ordered], # Angles for under-preds
anomaly_mag_ordered[
under_mask_ordered
], # Magnitudes for under-preds
c=anomaly_mag_ordered[
under_mask_ordered
], # Color value is magnitude
cmap=cmap_under_obj, # Blues colormap
norm=norm, # Shared normalization
s=s, # Marker size
alpha=alpha, # Transparency
edgecolor="grey", # Edge color
linewidth=0.5,
label="Under-prediction", # Legend label
)
if np.any(over_mask_ordered):
ax.scatter(
theta[over_mask_ordered], # Angles for over-preds
anomaly_mag_ordered[
over_mask_ordered
], # Magnitudes for over-preds
c=anomaly_mag_ordered[
over_mask_ordered
], # Color value is magnitude
cmap=cmap_over_obj, # Reds colormap
norm=norm, # Shared normalization
s=s, # Marker size
alpha=alpha, # Transparency
edgecolor="grey", # Edge color
linewidth=0.5,
label="Over-prediction", # Legend label
)
# Add legend if any anomalies were plotted
if np.any(under_mask_ordered) or np.any(over_mask_ordered):
ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1), fontsize=9)
else:
# Add text if plot is empty
ax.text(
0,
0,
"No anomalies detected",
horizontalalignment="center",
verticalalignment="center",
)
# --- Add Colorbar (optional) ---
# Note: Visually uses cmap_over, but scale (norm) is correct
if cbar and N_anomalies > 0:
# Create a mappable object linked to the normalization and cmap_over
sm = cm.ScalarMappable(norm=norm, cmap=cmap_over_obj)
sm.set_array([]) # Needed for ScalarMappable
# cbar_obj = plt.colorbar(sm, ax=ax, pad=0.1, shrink=0.7)
cbar_obj = plt.colorbar(sm, ax=ax, pad=0.08, shrink=0.80)
cbar_obj.set_label("Anomaly magnitude |Actual - Bound|", fontsize=10)
# Set radial axis label
ax.set_title(title, fontsize=14, y=1.12)
ax.set_ylabel("Anomaly Magnitude", labelpad=24, fontsize=10)
# Move the radial tick labels away from the y-label side
ax.set_rlabel_position(315) # places radial tick labels near SE quadrant
ax.set_ylim(bottom=0) # Ensure radius starts at 0
# --- Logging ---
if verbose > 0:
# Use original masks on NaN-dropped data before filtering for plot
n_total_checked = len(data) # Total valid points checked
n_under_total = np.sum(y < y_lo)
n_over_total = np.sum(y > y_hi)
n_anomalies_total = n_under_total + n_over_total
print("-" * 50)
print("Anomaly Detection Summary:")
print(f" Total valid points checked: {n_total_checked}")
print(
f" Anomalies detected (outside {qlow_col}-{qup_col}): "
f"{n_anomalies_total}"
)
print(
f" ? Under-predictions ({actual_col} < {qlow_col}): {n_under_total}"
)
print(
f" ? Over-predictions ({actual_col} > {qup_col}): {n_over_total}"
)
print("-" * 50)
final = safe_savefig(
savefig,
fig,
dpi=dpi,
bbox_inches="tight",
)
if final is not None:
plt.close(fig)
else:
plt.show()
return ax
plot_anomaly_magnitude.__doc__ = r"""
Visualize magnitude and type of prediction-interval anomalies.
This function draws a polar scatter plot to highlight prediction-interval
failures—cases where the observed value lies outside a user-specified
interval (e.g., :math:`[Q_{0.10}, Q_{0.90}]`). It encodes the anomaly’s
**order/location** (angle), **magnitude** (radius), and **type** (color).
See :footcite:t:`kouadiob2025`; background on coverage and calibration in
:footcite:p:`Gneiting2007b, Jolliffe2012`.
- **Angular position (:math:`\theta`)**: maps each row to an angle over the
chosen span (``acov``). If ``theta_col`` is provided, rows are *ordered*
by that column before angles are assigned; otherwise the original row
order is used. Angles are spaced linearly across the selected coverage.
- **Radial distance (:math:`r`)**: the anomaly magnitude for interval
violations, computed as :math:`r=\lvert y - B\rvert`, where
:math:`B\in\{L,U\}` is the violated bound (lower :math:`L` or upper
:math:`U`). Points satisfying :math:`L\le y\le U` are omitted.
- **Color**: indicates anomaly type and scales with magnitude on a shared
normalization. ``cmap_under`` colors under-predictions
(:math:`y<L`); ``cmap_over`` colors over-predictions (:math:`y>U`).
This diagnostic helps to (i) localize clusters of failures along the
ordering induced by ``theta_col`` (or row index), (ii) assess how severe
misses are via larger radii, and (iii) support calibration checks and
targeted model refinement, alongside coverage and reliability analyses,
see :footcite:p:`Gneiting2007b, Jolliffe2012`.
Parameters
----------
df : pandas.DataFrame
Input table containing the actual values and the two quantile
(interval) columns. Must be non-empty after NaN removal in the
required columns (see below).
actual_col : str
Name of the column with the observed/ground-truth values used to
check interval coverage.
q_cols : list[str] or tuple[str, str]
Two-element sequence with the **lower** and **upper** quantile
column names, in that order: ``[q_low, q_up]``. Each referenced
column must exist in ``df`` and be numeric. Semantically, rows are
expected to satisfy :math:`q_\text{low} \le q_\text{up}`.
theta_col : str, optional
Column used **only to order points** angularly before mapping them
into the selected coverage span. Useful for spatial or categorical
ordering (e.g., ``'latitude'``, ``'station_id'``). If omitted or
non-numeric after NaN filtering, the original row order is used.
acov : {'default', 'half_circle', 'quarter_circle', 'eighth_circle'}, \
default='default'
Angular **coverage** of the plot:
- ``'default'``: full circle, :math:`2\pi`
- ``'half_circle'``: :math:`\pi`
- ``'quarter_circle'``: :math:`\pi/2`
- ``'eighth_circle'``: :math:`\pi/4`
(Value is case-insensitive; invalid values fall back to full circle
with a warning.)
title : str, default='Anomaly Magnitude Polar Plot'
Figure title.
figsize : tuple[float, float], default=(8.0, 8.0)
Figure size in inches; each dimension must be positive.
cmap_under : str, default='Blues'
Matplotlib colormap **name** used for *under-predictions*
(:math:`y<q_\text{low}`). If invalid, a warning is issued and a
default is used.
cmap_over : str, default='Reds'
Matplotlib colormap **name** used for *over-predictions*
(:math:`y>q_\text{up}`). If invalid, a warning is issued and a
default is used.
s : int, default=30
Marker size for anomaly points (points with interval failure);
must be positive.
alpha : float, default=0.8
Point transparency in :math:`[0,1]`.
show_grid : bool, default=True
Whether to draw polar grid lines.
verbose : int, default=1
Verbosity level. If :math:`>0`, prints a short anomaly summary
(counts of under-/over-predictions).
cbar : bool, default=False
If ``True``, draw a colorbar for the **shared** magnitude
normalization. The bar uses the ``cmap_over`` colormap for display,
but its scale matches both under/over magnitudes.
savefig : str, optional
Path to save the figure (e.g., ``'anomaly_plot.png'``). If omitted,
the figure is shown interactively. Errors during saving are reported
via a printed message.
mask_angle : bool, default=False
If ``True``, hide angular tick labels (useful when the angle order
is arbitrary or index-based).
Returns
-------
ax : matplotlib.axes.Axes or None
The polar ``Axes`` containing the scatter plot. Returns ``None`` if
the DataFrame becomes empty **after** dropping NaNs in the required
columns and no plot can be produced. If no anomalies are found, an
empty polar frame is returned with a notice text, not ``None``.
Raises
------
ValueError
- ``q_cols`` is not a 2-item sequence.
- Any of ``actual_col``, the two ``q_cols``, or the provided
``theta_col`` (when used) is missing from ``df``.
TypeError
Required columns exist but cannot be coerced to numeric dtype after
NaN handling.
See Also
--------
kdiagram.plot.uncertainty.plot_coverage :
Aggregate empirical coverage comparison.
kdiagram.plot.uncertainty.plot_coverage_diagnostic :
Point-wise coverage successes/failures.
kdiagram.plot.uncertainty.plot_interval_width :
Magnitude of prediction-interval widths.
kdiagram.plot.uncertainty.plot_interval_consistency :
Stability of interval widths over time/steps.
validate_qcols :
Helper for validating the lower/upper quantile columns.
Notes
-----
- Anomalies are detected by comparing ``actual_col`` to the bounds
in ``q_cols`` for each row.
- Rows with NaNs in any required column (``actual_col``, the two
quantile columns, and ``theta_col`` if provided) are dropped
before analysis.
- The anomaly magnitude :math:`r` is non-negative and measures the
distance to the **exceeded** bound.
- ``theta_col`` controls **ordering only**, not spacing. Angles are
always linearly spaced within the selected coverage (:py:data:`acov`)
and then assigned according to the sort order of ``theta_col``.
- Color intensity for both under- and over-predictions reflects the
shared normalization of magnitudes, based on the maximum observed
:math:`r`.
- The helper ``validate_qcols`` (from your utilities, e.g. *gofast*)
performs initial structure checks on ``q_cols``.
Let :math:`y_j` be the actual value, :math:`L_j` the lower bound,
and :math:`U_j` the upper bound for location :math:`j`.
1. **Anomaly masks**
- Under-prediction:
:math:`M_{\text{under},j} = (y_j < L_j)`
- Over-prediction:
:math:`M_{\text{over},j} = (y_j > U_j)`
2. **Anomaly magnitude (radial coordinate :math:`r`)**
.. math::
r_j =
\begin{cases}
L_j - y_j, & \text{if } y_j < L_j,\\
y_j - U_j, & \text{if } y_j > U_j,\\
0, & \text{otherwise.}
\end{cases}
Only points with :math:`r_j > 0` are plotted.
3. **Angular coordinate (:math:`\theta`)**
- For :math:`N` retained rows, define base angles
.. math::
\theta'_k \;=\; \frac{k}{N}\,S, \qquad k=0,\ldots,N-1,
where :math:`S \in \{2\pi, \pi, \pi/2, \pi/4\}` is the span
implied by :py:data:`acov`.
- If ``theta_col`` exists, sort by that column to obtain a
permutation :math:`\pi`. The plotted angle for original row
:math:`j` is :math:`\theta_{\pi^{-1}(j)}=\theta'_{\pi^{-1}(j)}`.
If ``theta_col`` is absent, use the original row order:
:math:`\theta_j=\theta'_j`.
4. **Color mapping**
- Normalize magnitudes
.. math::
\text{norm}(r_j) \;=\;
\frac{r_j}{\max(r_1,\ldots,r_N) + \varepsilon},
with a small :math:`\varepsilon>0` for numerical safety.
- Apply colormaps: ``cmap_under(norm(r_j))`` for under-predictions
and ``cmap_over(norm(r_j))`` for over-predictions.
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from kdiagram.plot.uncertainty import plot_anomaly_magnitude
**1. Random Example:**
>>> np.random.seed(42)
>>> N_points = 150
>>> df_anomaly_rand = pd.DataFrame({
... 'id': range(N_points),
... 'actual': np.random.randn(N_points) * 5 + 10,
... 'pred_q10': np.random.randn(N_points) * 1 + 7, # Interval around 10
... 'pred_q90': np.random.randn(N_points) * 1 + 13,
... 'feature_order': np.random.rand(N_points) * 100 # For ordering
... })
>>> # Introduce some anomalies
>>> df_anomaly_rand.loc[5:15, 'actual'] = 0 # Under-predictions
>>> df_anomaly_rand.loc[100:110, 'actual'] = 25 # Over-predictions
>>>
>>> ax_rand_anomaly = plot_anomaly_magnitude(
... df=df_anomaly_rand,
... actual_col='actual',
... q_cols=['pred_q10', 'pred_q90'],
... theta_col='feature_order', # Order by this feature
... acov='default',
... title='Random Anomaly Distribution',
... cmap_under='GnBu',
... cmap_over='OrRd',
... s=40,
... cbar=True,
... verbose=1
... )
>>> # Output will show anomaly counts...
>>> # plt.show() called internally
**2. Concrete Example (Subsidence Data - adapted from docstring):**
>>> # Assume small_sample_pred is a loaded DataFrame like:
>>> # Create dummy data if it doesn't exist
>>> try:
... small_sample_pred
... except NameError:
... print("Creating dummy small sample prediction data...")
... N_small = 200
... small_sample_pred = pd.DataFrame({
... 'subsidence_2023': np.random.rand(N_small)*15 + np.linspace(0, 5, N_small),
... 'subsidence_2023_q10': np.random.rand(N_small)*10,
... 'subsidence_2023_q90': np.random.rand(N_small)*10 + 10,
... 'latitude': np.linspace(22.3, 22.7, N_small) + np.random.randn(N_small)*0.01
... })
... # Ensure some anomalies exist in dummy data
... anom_indices_under = np.random.choice(N_small, 15, replace=False)
... anom_indices_over = np.random.choice(
... list(set(range(N_small)) - set(anom_indices_under)), 20, replace=False
... )
... small_sample_pred.loc[anom_indices_under, 'subsidence_2023'] = (
... small_sample_pred.loc[anom_indices_under, 'subsidence_2023_q10']
... - np.random.rand(15)*5 - 1
... )
... small_sample_pred.loc[anom_indices_over, 'subsidence_2023'] = (
... small_sample_pred.loc[anom_indices_over, 'subsidence_2023_q90']
... + np.random.rand(20)*5 + 1
... )
>>> ax_sub_anomaly = plot_anomaly_magnitude(
... df=small_sample_pred,
... actual_col='subsidence_2023',
... q_cols=['subsidence_2023_q10', 'subsidence_2023_q90'],
... theta_col='latitude', # Order points by latitude
... acov='quarter_circle', # Use only 90 degrees
... title='Anomaly Magnitude (2023) – Zhongshan',
... figsize=(9, 9),
... s=35,
... cbar=True, # Show colorbar
... mask_angle=True, # Hide angle labels
... verbose=1 # Print anomaly counts
... )
>>> # Output will show anomaly counts...
>>> # plt.show() called internally
References
----------
.. footbibliography::
"""
[docs]
@check_non_emptiness
@isdf
def plot_uncertainty_drift(
df: pd.DataFrame,
qlow_cols: list[str],
qup_cols: list[str],
dt_labels: list[str] | None = None,
theta_col: str | None = None,
acov: str = "default",
base_radius: float = 0.15,
band_height: float = 0.15,
cmap: str = "tab10",
label: str = "Year",
alpha: float = 0.85,
figsize: tuple[float, float] = (9, 9),
title: str | None = None,
show_grid: bool = True,
show_legend: bool = True,
mask_angle: bool = True,
savefig: str | None = None,
dpi: int = 300,
ax: Axes | None = None,
):
# --- Input Validation ---
if len(qlow_cols) != len(qup_cols):
raise ValueError(
"Mismatched lengths for `qlow_cols` "
f"({len(qlow_cols)}) and `qup_cols` ({len(qup_cols)})."
)
num_time_steps = len(qlow_cols)
if num_time_steps == 0:
raise ValueError("Quantile column lists cannot be empty.")
# Generate default labels if none provided
if dt_labels is None:
# Use the 'label' param as base for default labels
time_labels = [f"{label}_{i + 1}" for i in range(num_time_steps)]
else:
if len(dt_labels) != num_time_steps:
raise ValueError(
f"Length of `dt_labels` ({len(dt_labels)}) must match "
f"the number of time steps ({num_time_steps})."
)
time_labels = list(dt_labels) # Ensure list type
# Consolidate required columns and check existence
all_cols = qlow_cols + qup_cols
# Do not check theta_col here, handle warning later
missing_cols = [col for col in all_cols if col not in df.columns]
if missing_cols:
raise ValueError(
f"The following quantile columns are missing from the "
f"DataFrame: {', '.join(missing_cols)}"
)
# Handle theta_col warning
if theta_col:
if theta_col not in df.columns:
warnings.warn(
f"Specified `theta_col` ('{theta_col}') not found. "
f"Using index for angular position.",
UserWarning,
stacklevel=2,
)
else:
warnings.warn(
f"`theta_col` ('{theta_col}') is provided but currently "
f"ignored for positioning. Using index for angular position.",
UserWarning,
stacklevel=2,
)
# Prepare data: Drop rows with NaNs in relevant columns
data = df[all_cols].dropna()
if len(data) == 0:
warnings.warn(
"DataFrame is empty after dropping NaN values in quantile "
"columns. Cannot generate plot.",
UserWarning,
stacklevel=2,
)
return None
N = len(data) # Number of valid data points (locations)
# --- Calculate Interval Widths and Normalize Globally ---
widths = []
try:
for ql, qu in zip(qlow_cols, qup_cols):
width_values = (data[qu] - data[ql]).to_numpy(dtype=float)
if np.any(width_values < 0):
warnings.warn(
f"Negative interval widths detected for columns "
f"'{qu}' and '{ql}'. Check if upper < lower bound.",
UserWarning,
stacklevel=2,
)
# Ensure non-negative widths, clamp if necessary? Or just proceed.
# width_values[width_values < 0] = 0 # Option to clamp
widths.append(width_values)
except Exception as e:
raise TypeError(
f"Could not compute widths. Ensure quantile columns "
f"({ql}, {qu}) contain numeric data. Original error: {e}"
) from e
# Find global maximum width across all years and locations
if not widths: # Should not happen due to earlier checks, but safe
return None
# Calculate max, handle case where all widths might be zero or negative
all_width_values = np.concatenate(widths) if widths else np.array([0])
max_width = np.max(all_width_values) if len(all_width_values) > 0 else 0.0
# Normalize widths using the global maximum
normalized_widths = []
if max_width > 1e-9: # Avoid division by zero/near-zero
normalized_widths = [w / max_width for w in widths]
else:
# If max width is zero, all normalized widths are zero
normalized_widths = [np.zeros_like(w) for w in widths]
warnings.warn(
"Maximum interval width across all data is zero or near-zero. "
"Normalized widths are all set to 0.",
UserWarning,
stacklevel=2,
)
# --- Angular Coordinate Calculation ---
acov_map = { # Map coverage name to (min_angle, max_angle) in radians
"default": (0, 2 * np.pi),
"half_circle": (0, np.pi),
"quarter_circle": (0, np.pi / 2),
"eighth_circle": (0, np.pi / 4),
}
theta_min_rad, theta_max_rad = acov_map.get(acov.lower(), (0, 2 * np.pi))
if acov.lower() not in acov_map:
warnings.warn(
f"Invalid `acov` value '{acov}'. Using 'default' (0 to 2*pi).",
UserWarning,
stacklevel=2,
)
angular_range_rad = theta_max_rad - theta_min_rad
# Generate theta values based on index, mapped to the specified range
theta = (
np.linspace(0.0, 1.0, N, endpoint=True) # Linear space [0, 1]
* angular_range_rad # Scale to range width
+ theta_min_rad
)
# --- Plotting Setup ---
if ax is None:
fig, ax = plt.subplots(
figsize=figsize, subplot_kw={"projection": "polar"}
)
else:
fig = ax.figure
ax.set_thetamin(np.degrees(theta_min_rad)) # Expects degrees
ax.set_thetamax(np.degrees(theta_max_rad)) # Expects degrees
# Hide angular tick labels if requested
if mask_angle:
ax.set_xticklabels([])
# Configure grid
if show_grid:
ax.grid(True, linestyle="--", linewidth=0.5, alpha=0.7)
else:
ax.grid(False)
# Hide radial ticks as they primarily separate years visually
ax.set_yticks([])
# Get color palette
color_palette = _sample_colors(
cmap, num_time_steps, trim=0.08, default="tab10"
)
# --- Draw Rings for Each Time Step ---
for i, (w_norm, step_label) in enumerate(
zip(normalized_widths, time_labels)
):
# XXX TODO
# Calculate base radius for this ring (increases for later years)
# base_r = base_radius + i * some_increment # Alternative logic
base_r = base_radius * (
i + 1
) # Base offset increases multiplicatively
# Calculate final radius: base + scaled normalized width
# Ensure w_norm has the same length as theta (N)
if len(w_norm) != N:
# This should not happen if dropna was done correctly, but safeguard
raise InternalError(
"Mismatch between width data and theta length."
) # Or handle gracefully
r = base_r + band_height * w_norm
# Determine color for this ring, cycling if needed
color = color_palette[i % len(color_palette)]
# Plot the line for this time step
# Ensure data wraps around by appending first point? Not needed
# for line plot if endpoint=True in linspace? Check.
# For visual continuity if range isn't full 2*pi, might not need wrap.
# Let's omit wrap for now.
ax.plot(
theta,
r,
label=step_label, # Label for the legend
color=color, # Color for this ring
linewidth=1.8, # Line thickness
alpha=alpha, # Transparency
)
# --- Final Touches ---
# Set plot title
ax.set_title(
title or "Multi-time Uncertainty Drift (Interval Width)",
fontsize=14,
y=1.08,
)
# Add legend if requested
if show_legend:
# Use the provided 'label' parameter as the intended legend title
# The example hardcodes 'Year'. Let's compromise:
# use 'label' if given, else 'Time Step'.
legend_title = label if label else "Time Step"
ax.legend(
loc="upper right",
bbox_to_anchor=(1.25, 1.1), # Position outside plot
title=legend_title, # Use dynamic title
fontsize=9,
)
# --- Save or Show ---
if savefig is not None:
safe_savefig(savefig, fig, bbox_inches="tight", dpi=dpi)
plt.close(fig)
else:
fig.tight_layout()
plt.show()
return ax
plot_uncertainty_drift.__doc__ = r"""
Polar plot visualizing temporal drift of uncertainty width.
This function creates a polar line plot showing how the width of
the prediction interval (e.g., Q90 - Q10), representing model
uncertainty, evolves over multiple time steps (e.g., years) across
different locations. Each time step is depicted as a distinct
concentric ring [1]_.
- **Angular Position (`theta`)**: Represents each location or data
point. Currently derived from the DataFrame index, mapped
linearly onto the angular range specified by `acov`. The optional
`theta_col` parameter is intended for future use in ordering but
is currently ignored for positioning.
- **Radial Rings (`r`)**: Each ring corresponds to a specific time
step provided via `qlow_cols`/`qup_cols`. The position of the
ring (distance from the center) indicates the time step (later
times are further out). The radius of the line at a specific angle
(location) on a given ring is determined by a base offset for that
year plus a component proportional to the *globally normalized*
interval width at that location and time. Thus, the 'thickness' or
deviation of a ring from a perfect circle reflects the magnitude
of uncertainty (interval width) relative to the maximum width
observed across all locations and times.
- **Color**: Each ring (time step) is assigned a unique color based
on the specified `cmap`, aiding in distinguishing and tracking
changes across time steps.
This visualization is particularly useful for:
- Identifying locations where prediction uncertainty grows or shrinks
significantly over the forecast horizon.
- Monitoring the overall trend (drift) of uncertainty as forecasts
extend further into the future.
- Highlighting areas with consistently high or low uncertainty across
all time steps.
- Comparing the spatial patterns of uncertainty at different forecast
lead times.
Parameters
----------
df : pd.DataFrame
Input DataFrame containing the quantile prediction columns.
Decorators ensure it's a valid, non-empty pandas DataFrame.
qlow_cols : list of str
List of column names for the lower quantile bound (e.g., Q10)
for consecutive time steps. Example: ``['pred_2023_q10',
'pred_2024_q10', ...]``.
qup_cols : list of str
List of column names for the upper quantile bound (e.g., Q90)
for the same time steps as `qlow_cols`. Must be the same length.
Example: ``['pred_2023_q90', 'pred_2024_q90', ...]``.
dt_labels : list of str, optional
List of labels for each time step, used in the legend to identify
the rings. Must match the length of `qlow_cols`. If ``None``,
generic labels like 'Time_1', 'Time_2', ... are generated based
on the `label` parameter. Default is ``None``.
theta_col : str, optional
*Intended* column name for ordering points angularly. *Note: This
parameter is currently ignored in the positioning logic; angles
are based on the DataFrame index.* A warning is issued if provided.
Default is ``None``.
acov : {'default', 'half_circle', 'quarter_circle', \
'eighth_circle'}, default='default'
Specifies the angular coverage (span) of the plot: ``'default'``
(360°), ``'half_circle'`` (180°), ``'quarter_circle'`` (90°),
``'eighth_circle'`` (45°).
base_radius : float, default=0.15
Determines the spacing between the base circles of consecutive
time step rings. The base radial offset for the ring representing
time step `i` (0-indexed) is calculated as ``base_radius * (i+1)``.
A larger value increases the separation between rings.
band_height : float, default=0.15
Scaling factor applied to the *normalized* interval width. This
controls how much the radius deviates from the base circle for
each ring, visually representing the uncertainty magnitude.
Effectively, it's the maximum radial 'height' allocated to
represent uncertainty on each ring.
cmap : str, default='tab10'
Name of the Matplotlib colormap used to assign distinct colors
to the rings representing different time steps.
label : str, default='Year'
Base name used for generating default `dt_labels` if they are
not provided (e.g., 'Year_1', 'Year_2', ...). *Note: This
parameter is currently *not* used for the legend title itself.*
alpha : float, default=0.85
Transparency level for the plotted lines (rings).
figsize : tuple of (float, float), default=(9, 9)
Width and height of the figure in inches.
title : str, optional
Title displayed above the polar plot. If ``None``, a default
title is used. Default is ``None``.
show_grid : bool, default=True
If ``True``, display polar grid lines.
show_legend : bool, default=True
If ``True``, display a legend identifying the time step for
each colored ring, using `dt_labels`.
mask_angle : bool, default=True
If ``True``, hide the angular tick labels (degrees). Recommended
if the angular position is based on index.
savefig : str, optional
File path to save the plot image. If ``None``, displays the plot
interactively. Default is ``None``.
Returns
-------
ax : matplotlib.axes.Axes
The Matplotlib Axes object containing the polar line plot.
Raises
------
AssertionError
If `qlow_cols` and `qup_cols` lists have different lengths.
ValueError
If specified quantile columns are not found in the DataFrame.
If data in quantile columns is not numeric.
See Also
--------
plot_interval_consistency : Plot consistency of interval widths using
scatter points.
numpy.linspace : Generate evenly spaced numbers.
matplotlib.pyplot.plot : Plot lines.
Notes
-----
- Interval widths (:math:`W = Q_{upper} - Q_{lower}`) are calculated
for each location and time step.
- These widths are then *globally normalized* by dividing by the
maximum width observed across all locations and all time steps.
This ensures that the radial deviations are comparable across rings.
- The radius for a point on ring `i` (0-indexed time step) is
:math:`r = (\text{base}_{radius} \times (i+1)) + (\text{band}_{height}
\times w_{normalized})`.
- The angular coordinate `theta` is currently based on the DataFrame
index, not influenced by `theta_col`.
- Radial axis ticks and labels are hidden by default (`set_yticks([])`)
as the radial dimension primarily separates the time steps.
Let :math:`\mathbf{L}_i` and :math:`\mathbf{U}_i` be the lower and
upper quantile bound vectors (length :math:`N`, number of locations)
for time step :math:`i` (:math:`i = 0, \dots, M-1`).
1. **Interval Width Calculation**: For each time step :math:`i`,
calculate the width vector:
:math:`\mathbf{W}_i = \mathbf{U}_i - \mathbf{L}_i`.
2. **Global Normalization**: Find the maximum width across all
locations and time steps:
:math:`W_{max} = \max_{i,j} (\mathbf{W}_i)_j`.
Normalize each width vector :math:`\mathbf{W}_i`:
.. math::
\mathbf{w}_i = \frac{\mathbf{W}_i}{W_{max}} \quad (\text{element-wise})
If :math:`W_{max} = 0`, :math:`\mathbf{w}_i` will be all zeros.
3. **Angular Coordinate (`theta`)**: Let :math:`S` be the angular span
and :math:`\theta_{min}` the start angle from `acov`. For location
index :math:`j` (:math:`j=0, ..., N-1`):
.. math::
\theta_j = \left( \frac{j}{N} \times S \right) + \theta_{min}
4. **Radial Coordinate (`r`)**: For time step :math:`i` and location
:math:`j`:
.. math::
r_{i,j} = (\text{base}_{radius} \times (i+1)) + \\
(\text{band}_{height} \times (\mathbf{w}_i)_j)
5. **Plotting**: For each time step :math:`i`, plot a line
connecting points :math:`(r_{i,j}, \theta_j)` for
:math:`j = 0, \dots, N-1`.
References
----------
.. [1] Kouadio, K. L., Liu, R., Loukou, K. G. H., Liu, J., & Liu, W. (2025).
Analytics Framework for Interpreting Spatiotemporal Probabilistic Forecasts.
*International Journal of Forecasting*. Manuscript submitted.
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from kdiagram.plot.uncertainty import plot_uncertainty_drift
**1. Random Example:**
>>> np.random.seed(2)
>>> N_points = 100
>>> df_drift_rand = pd.DataFrame({'location_id': range(N_points)})
>>> years = range(2020, 2024)
>>> q10_drift_cols = []
>>> q90_drift_cols = []
>>> for i, year in enumerate(years):
... q10_col = f'q10_{year}'
... q90_col = f'q90_{year}'
... base_val = np.random.rand(N_points) * 10
... width = (np.random.rand(N_points) + 0.5) * (2 + i) # Increasing width
... df_drift_rand[q10_col] = base_val - width / 2
... df_drift_rand[q90_col] = base_val + width / 2
... q10_drift_cols.append(q10_col)
... q90_drift_cols.append(q90_col)
>>> ax_drift_rand = plot_uncertainty_drift(
... df=df_drift_rand,
... qlow_cols=q10_drift_cols,
... qup_cols=q90_drift_cols,
... dt_labels=[str(y) for y in years],
... theta_col='location_id', # Ignored for positioning
... acov='default',
... base_radius=0.1, # Smaller spacing
... band_height=0.1, # Smaller uncertainty scaling
... cmap='viridis',
... title='Random Uncertainty Drift Example',
... mute_angle=False # Show angle labels
... )
>>> # plt.show() called internally
**2. Concrete Example (Subsidence Data - adapted):**
>>> # Assume zhongshan_pred_2023_2026 is a loaded DataFrame like:
>>> # Create dummy data if it doesn't exist
>>> try:
... zhongshan_pred_2023_2026
... except NameError:
... print("Creating dummy subsidence data for example...")
... N_sub = 150
... zhongshan_pred_2023_2026 = pd.DataFrame({
... 'latitude': np.linspace(22.2, 22.8, N_sub),
... **{f'subsidence_{yr}_q10': np.random.rand(N_sub)*(yr-2022)*2 + 1
... for yr in range(2023, 2027)},
... **{f'subsidence_{yr}_q90': np.random.rand(N_sub)*(yr-2022)*2 + 5
... + np.linspace(0, (yr-2022)*3, N_sub) # Increasing width trend
... for yr in range(2023, 2027)},
... })
>>> qlow_sub_drift = [f'subsidence_{yr}_q10' for yr in range(2023, 2027)]
>>> qup_sub_drift = [f'subsidence_{yr}_q90' for yr in range(2023, 2027)]
>>> year_labels_sub = [str(yr) for yr in range(2023, 2027)]
>>> ax_sub_drift = plot_uncertainty_drift(
... df=zhongshan_pred_2023_2026,
... qlow_cols=qlow_sub_drift,
... qup_cols=qup_sub_drift,
... dt_labels=year_labels_sub,
... theta_col='latitude', # Ignored for positioning
... acov='half_circle', # Use 180 degrees
... title='Uncertainty Drift Over Time (Zhongshan)',
... cmap='tab10',
... band_height=0.1, # Controls visual width effect
... base_radius=0.2, # Controls spacing between years
... show_legend=True,
... mute_degree=True
... )
>>> # plt.show() called internally
"""
[docs]
@check_non_emptiness
@isdf
def plot_actual_vs_predicted(
df: pd.DataFrame,
actual_col: str,
pred_col: str,
theta_col: str | None = None,
acov: str = "default",
figsize: tuple[float, float] = (8.0, 8.0),
title: str | None = None,
line: bool = True,
r_label: str | None = None,
cmap: str | None = None,
alpha: float = 0.3,
actual_props: dict[str, Any] | None = None,
pred_props: dict[str, Any] | None = None,
show_grid: bool = True,
grid_props: dict | None = None,
show_legend: bool = True,
mask_angle: bool = False,
dpi: int = 300,
savefig: str | None = None,
ax: Axes | None = None,
):
# If theta_col is provided but currently unused, warn:
if theta_col is not None:
warnings.warn(
"`theta_col` is currently ignored by plot_actual_vs_predicted.",
UserWarning,
stacklevel=2,
)
# --- Input Validation and Preparation ---
# Basic checks handled by decorators
# Check existence of primary columns
exist_features(
df,
features=[actual_col, pred_col],
error="raise", # Raise error if missing
name="Actual and Predicted columns",
)
# Consolidate columns needed, check theta_col existence only if specified
cols_to_select = [actual_col, pred_col]
if theta_col:
if theta_col not in df.columns:
warnings.warn(
f"Specified `theta_col` ('{theta_col}') not found. "
f"Using index for angular position.",
UserWarning,
stacklevel=2,
)
# Proceed without theta_col for ordering
else:
warnings.warn(
f"`theta_col` ('{theta_col}') is provided but currently "
f"ignored for positioning/ordering. Using index.",
UserWarning,
stacklevel=2,
)
# Although ignored, keep it for potential future use if needed?
# For now, just select it if present, even if unused later.
# cols_to_select.append(theta_col) # Decided against adding if unused
# Drop rows with NaNs in essential columns
data = df[cols_to_select].dropna().copy()
if len(data) == 0:
warnings.warn(
"DataFrame is empty after dropping NaN values in actual"
" and predicted columns. Cannot generate plot.",
UserWarning,
stacklevel=2,
)
return None
N = len(data)
if N > 2_000:
warnings.warn(
"Large number of samples; consider "
"saving the figure instead of showing.",
UserWarning,
stacklevel=2,
)
# --- Angular Coordinate Calculation ---
acov_map = { # Map name to angular range in radians
"default": 2 * np.pi,
"half_circle": np.pi,
"quarter_circle": np.pi / 2,
"eighth_circle": np.pi / 4,
}
angular_range = acov_map.get(acov.lower(), 2 * np.pi)
if acov.lower() not in acov_map:
warnings.warn(
f"Invalid `acov` value '{acov}'. Using 'default' (2*pi).",
UserWarning,
stacklevel=2,
)
# Calculate theta based on index, mapped to the angular range
# Use endpoint=False if using lines to avoid overlap at 2pi?
# If using dots, endpoint=True or False matters less visually.
# Let's use endpoint=False for lines, True for dots for potentially
# better spacing.
use_endpoint = not line
theta = np.linspace(0.0, angular_range, N, endpoint=use_endpoint)
# --- Extract Data ---
try:
actual = data[actual_col].to_numpy(dtype=float)
pred = data[pred_col].to_numpy(dtype=float)
except (ValueError, TypeError) as e:
raise TypeError(
f"Failed to convert actual or predicted columns to numeric."
f" Check data types. Original error: {e}"
) from e
# --- Plotting Setup ---
if ax is None:
fig = plt.figure(figsize=figsize, constrained_layout=True)
ax = fig.add_subplot(111, projection="polar")
else:
fig = ax.figure
ax.set_thetamin(0)
ax.set_thetamax(np.degrees(angular_range)) # Expects degrees
# NEW: move radial tick labels away from the title/left edge
ax.set_rlabel_position(
225
) # SW quadrant works well for default orientation
set_axis_grid(ax, show_grid=show_grid, grid_props=grid_props)
if mask_angle:
ax.set_xticklabels([])
# --- Plot Difference Lines ---
# Warning: This loop can be very slow for large N
# Consider alternatives like fill_between if performance is critical
# and data can be meaningfully sorted by theta.
if N > 5_000: # Add warning for potentially slow loop
warnings.warn(
f"Plotting difference lines for {N} points individually."
f" This may be slow. Consider using `line=False` or sampling data.",
PerformanceWarning,
stacklevel=2,
)
for t, a, p in zip(theta, actual, pred):
# Plot a vertical line segment at angle t between actual and pred
ax.plot(
[t, t], # Start and end angle (same)
[min(a, p), max(a, p)], # Start and end radius
color="gray", # Hardcoded color for difference
alpha=alpha, # Use specified transparency
linewidth=1, # Fixed linewidth for diff lines
)
# --- Plot Actual and Predicted Data (Lines or Dots) ---
# Define default properties, merge with user-provided props
default_actual_props_line = {
"color": "black",
"linewidth": 1.5,
"label": "Actual",
}
default_pred_props_line = {
"color": "red",
"linewidth": 1.5,
"label": "Predicted (Q50)",
}
default_actual_props_scatter = {
"color": "black",
"s": 20,
"label": "Actual",
}
default_pred_props_scatter = {
"color": "red",
"s": 20,
"alpha": alpha,
"label": "Predicted (Q50)",
}
# Ensure user props are dictionaries if provided
actual_props = actual_props or {}
pred_props = pred_props or {}
_assert_all_types(actual_props, dict, objname="'actual_props'")
_assert_all_types(pred_props, dict, objname="'pred_props'")
if line:
# Merge user props with defaults for line plot
current_actual_props = {**default_actual_props_line, **actual_props}
current_pred_props = {**default_pred_props_line, **pred_props}
# Ensure theta wraps around if using full circle for line plot
theta_plot = (
np.append(theta, theta[0])
if np.isclose(angular_range, 2 * np.pi)
else theta
)
actual_plot = (
np.append(actual, actual[0])
if np.isclose(angular_range, 2 * np.pi)
else actual
)
pred_plot = (
np.append(pred, pred[0])
if np.isclose(angular_range, 2 * np.pi)
else pred
)
ax.plot(theta_plot, actual_plot, **current_actual_props)
ax.plot(theta_plot, pred_plot, **current_pred_props)
else:
# Merge user props with defaults for scatter plot
current_actual_props = {
**default_actual_props_scatter,
**actual_props,
}
current_pred_props = {**default_pred_props_scatter, **pred_props}
ax.scatter(theta, actual, **current_actual_props)
ax.scatter(theta, pred, **current_pred_props)
# --- Final Touches ---
ax.set_title(
title or "Actual vs Predicted Polar Plot", fontsize=14, y=1.08
)
if r_label:
# Use set_ylabel for radial axis label, adjust padding
ax.set_ylabel(r_label, labelpad=32, fontsize=10)
# _place_polar_ylabel(ax, r_label, angle=225, labelpad=32)
# Add legend if requested and labels were provided in props
if show_legend:
# Check if labels exist before showing legend
handles, labels = ax.get_legend_handles_labels()
if labels: # Only show legend if there's something to label
ax.legend(
loc="upper right", bbox_to_anchor=(1.25, 1.1), fontsize=9
)
else:
warnings.warn(
"Legend requested but no labels found for plot elements.",
UserWarning,
stacklevel=2,
)
# --- Save or Show ---
final = safe_savefig(
savefig,
fig, # or 'ax' — both work
dpi=dpi,
bbox_inches="tight",
pad_inches=0.2,
)
if final is not None:
plt.close(fig)
else:
plt.show()
return ax
plot_actual_vs_predicted.__doc__ = r"""
Polar plot comparing actual observed vs predicted values.
This function generates a polar plot to visually compare actual
ground truth values against model predictions (typically a central
estimate like the median, Q50) for multiple data points or
locations arranged circularly [1]_.
- **Angular Position (`theta`)**: Represents each data point or
location. Points are currently plotted in their DataFrame index
order, mapped linearly onto the specified angular coverage
(`acov`). The `theta_col` parameter is intended for future use
in ordering points based on a specific feature (like latitude)
but is currently ignored for positioning.
- **Radial Distance (`r`)**: Represents the magnitude of the values.
Both the actual value (`actual_col`) and the predicted value
(`pred_col`) are plotted at the corresponding angle `theta`.
- **Visual Comparison**:
- Actual and predicted values are shown as either continuous
lines or individual dots based on the `line` parameter [2]_.
- Gray vertical lines connect the actual and predicted values
at each angle, visually highlighting the magnitude and
direction (over- or under-prediction) of the difference
at each point.
This plot facilitates:
- Quick visual assessment of prediction accuracy and bias across
samples.
- Identification of regions or conditions (if angle relates to a
feature) where the model performs well or poorly.
- Communication of model performance to stakeholders.
Parameters
----------
df : pd.DataFrame
Input DataFrame containing actual and predicted value columns.
Decorators ensure it's a valid, non-empty pandas DataFrame.
actual_col : str
Name of the column holding the actual observed (ground truth)
values.
pred_col : str
Name of the column holding the corresponding predicted values
(e.g., the Q50 median prediction).
theta_col : str, optional
*Intended* column name for ordering points angularly based on
its values (e.g., 'latitude'). *Note: This parameter is
currently ignored for positioning/ordering in this implementation;
points use DataFrame index order.* A warning is issued if
provided. Default is ``None``.
acov : {'default', 'half_circle', 'quarter_circle', \
'eighth_circle'}, default='default'
Specifies the angular coverage (span) of the polar plot:
``'default'`` (360°), ``'half_circle'`` (180°),
``'quarter_circle'`` (90°), ``'eighth_circle'`` (45°).
figsize : tuple of (float, float), default=(8.0, 8.0)
Width and height of the figure in inches.
title : str, optional
Custom title for the plot. If ``None``, a default title is used.
Default is ``None``.
line : bool, default=True
Determines the plotting style:
- If ``True``, actual and predicted values are plotted as lines
connecting consecutive points.
- If ``False``, values are plotted as individual scatter dots.
r_label : str, optional
Custom label for the radial axis (representing value magnitude).
If ``None``, no label is set. Default is ``None``.
cmap : str, optional
*Note: This parameter is currently unused in the function.*
It might be intended for future use, perhaps coloring the
difference lines. Default is ``None``.
alpha : float, default=0.3
Transparency level applied to the gray difference lines drawn
between actual and predicted values, and also to the predicted
dots if ``line=False``.
actual_props : dict, optional
Dictionary of keyword arguments passed directly to the Matplotlib
`plot` or `scatter` function for the 'Actual' data series. Allows
customization (e.g., ``{'color': 'blue', 'linestyle': '--'}``).
Defaults to basic black line/dots if ``None``.
pred_props : dict, optional
Dictionary of keyword arguments passed directly to the Matplotlib
`plot` or `scatter` function for the 'Predicted' data series.
Allows customization (e.g., ``{'color': 'orange', 'marker': 'x'}``).
Defaults to basic red line/dots if ``None``.
show_grid : bool, default=True
If ``True``, display the polar grid lines.
show_legend : bool, default=True
If ``True``, display a legend labeling the 'Actual' and
'Predicted' series.
mute_degree : bool, default=False
If ``True``, hide the angular tick labels (degrees).
savefig : str, optional
File path to save the plot image. If ``None``, displays the plot
interactively. Default is ``None``.
Returns
-------
ax : matplotlib.axes._axes.Axes
The Matplotlib Axes object containing the polar plot. Note that
due to subplot_kw, it's specifically a PolarAxesSubplot.
Raises
------
ValueError
If `actual_col` or `pred_col` are not found in `df`.
TypeError
If data in actual or predicted columns is not numeric.
See Also
--------
plot_anomaly_magnitude : Visualize only points outside prediction
intervals.
matplotlib.pyplot.plot : Function for line plots.
matplotlib.pyplot.scatter : Function for scatter plots.
Notes
-----
- Rows with NaN values in `actual_col` or `pred_col` (or `theta_col`
if specified, though currently unused for position) are dropped.
- The gray lines indicating the difference are drawn individually
for each point using a loop. *Warning: This approach can be very
slow for large datasets (many thousands of points).* An alternative
like `fill_between` might be more efficient for showing shaded areas
but would require sorting by theta.
- The `theta_col` parameter is currently ignored for positioning;
angles are always based on the DataFrame index order after NaN
removal.
- The `cmap` parameter is currently unused. The difference lines are
hardcoded to 'gray'.
- Default plotting styles are black for actual and red for predicted,
but can be overridden using `actual_props` and `pred_props` [1]_.
Let :math:`y_j` be the actual value and :math:`\hat{y}_j` the
predicted value for data point (location) :math:`j` (:math:`j=0,
\dots, N-1` after NaN removal).
1. **Angular Coordinate (`theta`)**: Let :math:`S` be the angular span
and :math:`\theta_{min}` the start angle from `acov`.
.. math::
\theta_j = \left( \frac{j}{N} \times S \right) + \theta_{min}
2. **Radial Coordinates**: The radial coordinates are directly the
values: :math:`r_{actual, j} = y_j` and :math:`r_{pred, j} =
\hat{y}_j`.
3. **Plotting**:
- Plot points/lines connecting :math:`(r_{actual, j}, \theta_j)`
and :math:`(r_{pred, j}, \theta_j)`.
- For each :math:`j`, draw a gray line segment connecting the points
:math:`(\min(y_j, \hat{y}_j), \theta_j)` and
:math:`(\max(y_j, \hat{y}_j), \theta_j)`.
References
----------
.. [1] Kouadio, K. L., Liu, R., Loukou, K. G. H., Liu, J., & Liu, W. (2025).
Analytics Framework for Interpreting Spatiotemporal Probabilistic Forecasts.
*International Journal of Forecasting*. Manuscript submitted.
.. [2] Matplotlib documentation: https://matplotlib.org/stable/
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from kdiagram.plot.uncertainty import plot_actual_vs_predicted
**1. Random Example:**
>>> np.random.seed(0)
>>> N = 100
>>> df_avp_rand = pd.DataFrame({
... 'Time': pd.date_range('2023-01-01', periods=N, freq='D'),
... 'ActualTemp': 15 + 10 * np.sin(np.linspace(0, 4 * np.pi, N)) + np.random.randn(N) * 2,
... 'PredictedTemp': 16 + 9 * np.sin(np.linspace(0, 4 * np.pi, N) + 0.1) + np.random.randn(N) * 1.5
... })
>>> ax_avp_rand = plot_actual_vs_predicted(
... df=df_avp_rand,
... actual_col='ActualTemp',
... pred_col='PredictedTemp',
... theta_col='Time', # Note: Ignored for positioning
... acov='default',
... title='Temperature: Actual vs. Predicted',
... line=True, # Use lines
... r_label='Temperature (°C)',
... actual_props={'color': 'navy', 'linestyle': '-'},
... pred_props={'color': 'crimson', 'linestyle': '--'}
... )
>>> # plt.show() called internally
**2. Concrete Example (Subsidence Data - using dots):**
>>> # Assume zhongshan_pred_2023_2026 is a loaded DataFrame
>>> # Create dummy data if it doesn't exist
>>> try:
... zhongshan_pred_2023_2026
... except NameError:
... print("Creating dummy subsidence data for example...")
... N_sub = 150
... zhongshan_pred_2023_2026 = pd.DataFrame({
... 'latitude': np.linspace(22.2, 22.8, N_sub),
... 'subsidence_2023': np.random.rand(N_sub)*15 + np.linspace(0, 5, N_sub),
... 'subsidence_2023_q50': np.random.rand(N_sub)*14 + np.linspace(0.5, 5.5, N_sub),
... # Add other columns if needed by other examples
... **{f'subsidence_{yr}_q10': np.random.rand(N_sub)*(yr-2022)*2 + 1
... for yr in range(2023, 2027)},
... **{f'subsidence_{yr}_q90': np.random.rand(N_sub)*(yr-2022)*2 + 5
... + np.linspace(0, (yr-2022)*3, N_sub)
... for yr in range(2023, 2027)},
... })
>>> ax_avp_sub = plot_actual_vs_predicted(
... df=zhongshan_pred_2023_2026.head(100), # Use subset for speed
... actual_col='subsidence_2023',
... pred_col='subsidence_2023_q50',
... theta_col='latitude', # Note: Ignored for positioning
... acov='half_circle', # Use 180 degrees
... title='Actual vs Predicted Subsidence (2023)',
... line=False, # Use dots instead of lines
... r_label="Subsidence (mm)",
... mute_degree=True,
... pred_props={'marker': 'x', 'color': 'purple'} # Customize predicted dots
... )
>>> # plt.show() called internally
"""
[docs]
@check_non_emptiness
@isdf
def plot_interval_width(
df: pd.DataFrame,
q_cols: list[str] | tuple[str, str],
theta_col: str | None = None,
z_col: str | None = None,
acov: str = "default",
figsize: tuple[float, float] = (8.0, 8.0),
title: str | None = None,
cmap: str = "viridis",
s: int = 30,
alpha: float = 0.8,
show_grid: bool = True,
grid_props: dict | None = None,
cbar: bool = True,
mask_angle: bool = True,
savefig: str | None = None,
dpi: int = 300,
ax: Axes | None = None,
):
# --- Input Validation ---
# Basic df checks handled by decorators
q_cols_processed = columns_manager(q_cols, empty_as_none=False)
if len(q_cols_processed) != 2:
raise TypeError(
"`q_cols` expects exactly two column names: "
"[lower_quantile_column, upper_quantile_column]. "
f"Received {len(q_cols_processed)} columns: {q_cols_processed}"
)
# Assign validated lower and upper quantile columns
qlow_col, qup_col = q_cols_processed[0], q_cols_processed[1]
# Consolidate list of columns needed for processing and NaN checks
cols_needed = [qlow_col, qup_col]
if theta_col:
# Although unused for positioning, check if exists if provided
if theta_col not in df.columns:
warnings.warn(
f"Specified `theta_col` ('{theta_col}') not found. "
f"It will be ignored.",
UserWarning,
stacklevel=2,
)
else:
# Add to list for potential future use or NaN check if desired
# cols_needed.append(theta_col) # Currently ignored, don't add
warnings.warn(
f"`theta_col` ('{theta_col}') is provided but currently "
f"ignored for positioning/ordering. Using index.",
UserWarning,
stacklevel=2,
)
if z_col:
if z_col not in df.columns:
raise ValueError(
f"Specified `z_col` ('{z_col}') not found in DataFrame."
)
cols_needed.append(z_col)
# Check existence of essential quantile columns (redundant if exist_features used)
missing_essential = [
col for col in [qlow_col, qup_col] if col not in df.columns
]
if missing_essential:
raise ValueError(
f"Essential quantile columns missing: {', '.join(missing_essential)}"
)
# Also check z_col again if it was added
if z_col and z_col not in df.columns:
raise ValueError(f"`z_col` ('{z_col}') not found.")
# Drop rows with NaNs in the essential columns used for plotting (q, z)
data = df[cols_needed].dropna().copy()
if len(data) == 0:
warnings.warn(
"DataFrame is empty after dropping NaN values."
" Cannot generate plot.",
UserWarning,
stacklevel=2,
)
return None
N = len(data) # Number of valid data points
# --- Calculate Radial Coordinate (Interval Width) ---
try:
r = data[qup_col].to_numpy(dtype=float) - data[qlow_col].to_numpy(
dtype=float
)
except Exception as e:
raise TypeError(
f"Could not compute interval width. Ensure columns '{qup_col}'"
f" and '{qlow_col}' contain numeric data. Original error: {e}"
) from e
# Check for negative widths
if np.any(r < 0):
num_negative = np.sum(r < 0)
warnings.warn(
f"{num_negative} out of {N} locations have negative interval "
f"width ({qup_col} < {qlow_col}). These will be plotted with "
f"negative radius, potentially causing visual artifacts.",
UserWarning,
stacklevel=2,
)
# Option: clamp negative radius to 0?
# r = np.maximum(r, 0)
# --- Calculate Color Coordinate (Z-value) ---
if z_col:
try:
z = data[z_col].to_numpy(dtype=float)
cbar_label = z_col # Label colorbar with column name
except Exception as e:
raise TypeError(
f"Could not use `z_col` ('{z_col}') for color. Ensure it "
f"contains numeric data. Original error: {e}"
) from e
else:
# Default: color by interval width `r`
z = r
cbar_label = "Interval Width" # Label colorbar appropriately
# --- Angular Coordinate Calculation ---
acov_map = { # Map name to angular range in radians
"default": 2 * np.pi,
"half_circle": np.pi,
"quarter_circle": np.pi / 2,
"eighth_circle": np.pi / 4,
}
angular_range = acov_map.get(acov.lower(), 2 * np.pi)
if acov.lower() not in acov_map:
warnings.warn(
f"Invalid `acov` value '{acov}'. Using 'default' (2*pi).",
UserWarning,
stacklevel=2,
)
# Calculate theta based on index, mapped to the angular range
# Using endpoint=False might give slightly better visual spacing for scatter
theta = np.linspace(0.0, angular_range, N, endpoint=False)
# --- Color Normalization ---
# Check if z has variance for normalization
if np.ptp(z) > 1e-9: # Check peak-to-peak range
norm = Normalize(vmin=np.min(z), vmax=np.max(z))
else:
# Handle constant z case: map all to middle color
norm = Normalize(vmin=np.min(z) - 0.5, vmax=np.max(z) + 0.5)
warnings.warn(
f"Color values ('{cbar_label}') have zero range."
f" All points will have the same color.",
UserWarning,
stacklevel=2,
)
try:
cmap_ref = get_cmap(cmap)
colors = cmap_ref(norm(z))
except ValueError:
warnings.warn(
f"Invalid `cmap` name '{cmap}'. Falling back to 'viridis'.",
stacklevel=2,
)
cmap = "viridis" # Ensure cmap is valid for fallback
cmap_ref = get_cmap(cmap)
colors = cmap_ref(norm(z))
# --- Plotting ---
if ax is None:
fig, ax = plt.subplots(
figsize=figsize, subplot_kw={"projection": "polar"}
)
else:
fig = ax.figure
ax.set_thetamin(0)
ax.set_thetamax(np.degrees(angular_range)) # Expects degrees
# Apply grid and angle label settings
set_axis_grid(ax, show_grid, grid_props=grid_props)
if mask_angle:
ax.set_xticklabels([])
# Create the scatter plot
ax.scatter(
theta,
r, # Radius is interval width
c=colors, # Point color based on z
s=s, # Point size
alpha=alpha, # Point transparency
edgecolor="k", # Point edge color (optional)
linewidth=0.5, # Point edge width (optional)
# cmap=cmap_ref is implicitly used via 'colors', no need to pass here
)
# Set plot title
ax.set_title(
title or f"Prediction Interval Width ({qup_col} - {qlow_col})",
fontsize=14,
y=1.08,
)
# Add color bar if requested
if cbar:
# Create a ScalarMappable for the colorbar
sm = cm.ScalarMappable(norm=norm, cmap=cmap_ref)
sm.set_array([]) # Necessary for ScalarMappable
# Add the colorbar to the figure
cbar_obj = plt.colorbar(sm, ax=ax, pad=0.1, shrink=0.7)
# Set label dynamically based on whether z_col was used
cbar_obj.set_label(cbar_label, rotation=270, labelpad=32, fontsize=10)
# Set radial axis label (useful to know radius means width)
ax.set_ylabel("Interval Width", labelpad=32, fontsize=10)
# Set radial limits, ensure 0 is included if widths are non-negative
if np.all(r >= 0):
ax.set_ylim(bottom=0)
# else: let matplotlib auto-scale if negative widths exist
plt.tight_layout() # Adjust layout
# --- Save or Show ---
final = safe_savefig(
savefig,
fig,
dpi=dpi,
bbox_inches="tight",
)
if final is not None:
plt.close(fig)
else:
fig.tight_layout()
plt.show()
return ax
plot_interval_width.__doc__ = r"""
Polar scatter plot visualizing prediction interval width.
This function generates a polar scatter plot to visualize the
magnitude of prediction uncertainty, represented by the width of the
prediction interval (Upper Quantile - Lower Quantile), across
different locations or samples.
- **Angular Position (`theta`)**: Represents each location or data
point. Currently derived from the DataFrame index, mapped
linearly onto the specified angular coverage (`acov`). The optional
`theta_col` parameter is intended for future use in ordering but
is currently ignored for positioning.
- **Radial Distance (`r`)**: Directly represents the width of the
prediction interval (:math:`Q_{upper} - Q_{lower}`). A larger
radius indicates greater predicted uncertainty for that point.
- **Color (`z`)**: Optionally represents a third variable, specified
by `z_col` (e.g., the median prediction Q50, or the actual value).
This allows for correlating the uncertainty width with another
metric. If `z_col` is not provided, the color defaults to
representing the interval width (`r`) itself [1]_.
This plot helps to:
- Identify locations with high or low prediction uncertainty.
- Visualize the spatial distribution or sample distribution of
uncertainty magnitude.
- Explore potential correlations between uncertainty width and other
variables (like the central prediction) when using `z_col`.
Parameters
----------
df : pd.DataFrame
Input DataFrame containing the quantile prediction columns and
optionally columns for theta ordering and color (`z_col`).
Decorators ensure it's a valid, non-empty pandas DataFrame.
q_cols : list or tuple of str
A sequence containing exactly two string elements: the column
name for the lower quantile bound (e.g., 'prediction_q10') and
the column name for the upper quantile bound (e.g.,
'prediction_q90'). The order must be ``[lower_col, upper_col]``.
theta_col : str, optional
*Intended* column name for ordering points angularly based on
its values (e.g., 'latitude'). *Note: This parameter is
currently ignored for positioning/ordering in this implementation;
points use DataFrame index order.* A warning is issued if
provided. Default is ``None``.
z_col : str, optional
Name of the column whose values will be used to color the scatter
points. Common choices include the median prediction column (e.g.,
'prediction_q50') or an actual value column to see if high/low
uncertainty correlates with high/low values. If ``None``, the
color will represent the interval width (`r`) itself.
Default is ``None``.
acov : {'default', 'half_circle', 'quarter_circle', \
'eighth_circle'}, default='default'
Specifies the angular coverage (span) of the polar plot:
``'default'`` (360°), ``'half_circle'`` (180°),
``'quarter_circle'`` (90°), ``'eighth_circle'`` (45°).
figsize : tuple of (float, float), default=(8.0, 8.0)
Width and height of the figure in inches.
title : str, optional
Custom title for the plot. If ``None``, a default title like
"Prediction Interval Width (Q_upper - Q_lower)" is used.
Default is ``None``.
cmap : str, default='viridis'
Name of the Matplotlib colormap used to color the points based
on the `z` value (from `z_col` or defaulting to interval width `r`).
s : int, default=30
Marker size for the scatter points.
alpha : float, default=0.8
Transparency level for the scatter points (0=transparent,
1=opaque).
show_grid : bool, default=True
If ``True``, display the polar grid lines.
cbar : bool, default=True
If ``True``, display a color bar indicating the mapping between
colors and the `z` values.
mask_angle : bool, default=True
If ``True``, hide the angular tick labels (degrees). Recommended
if the angle is based on index.
savefig : str, optional
File path to save the plot image. If ``None``, displays the plot
interactively. Default is ``None``.
Returns
-------
ax : matplotlib.axes._axes.Axes
The Matplotlib Axes object containing the polar scatter plot,
specifically a PolarAxesSubplot.
Raises
------
TypeError
If `q_cols` does not contain exactly two elements.
ValueError
If specified columns in `q_cols`, `theta_col` (if provided), or
`z_col` (if provided) are not found in the DataFrame `df`.
If data in the required columns is not numeric.
See Also
--------
plot_interval_consistency : Visualize the temporal consistency of
interval widths.
plot_uncertainty_drift : Visualize the temporal drift of interval
widths as rings.
matplotlib.pyplot.scatter : Function used for plotting points.
matplotlib.colors.Normalize : Used for scaling color values.
Notes
-----
- Rows with NaN values in any of the required columns (`q_cols`,
`theta_col` if used, `z_col` if used) are dropped before plotting.
- The interval width `r` is calculated as `Upper Quantile - Lower
Quantile`. If the lower quantile is greater than the upper
quantile for any point, this will result in a negative width,
which might lead to unexpected plotting behavior as radius is
typically non-negative. A warning is issued if negative widths
are detected.
- The angular coordinate `theta` is derived from the DataFrame index
after NaN removal, not influenced by `theta_col` currently.
- Color is determined by the `z_col` values if provided, otherwise
it defaults to representing the interval width `r`.
Let :math:`L_j` and :math:`U_j` be the lower and upper quantile bound
values for data point (location) :math:`j` (:math:`j=0, \dots, N-1`
after NaN removal). Let :math:`z'_j` be the value from `z_col` for
point :math:`j`, if `z_col` is provided.
1. **Radial Coordinate (`r`)**: Interval Width.
.. math::
r_j = U_j - L_j
2. **Color Value (`z`)**:
.. math::
z_j = \begin{cases} z'_j & \text{if } z\_col \text{ is provided}\\
\\ r_j & \text{if } z\_col \text{ is None} \end{cases}
3. **Angular Coordinate (`theta`)**: Let :math:`S` be the angular span
and :math:`\theta_{min}` the start angle from `acov`.
.. math::
\theta_j = \left( \frac{j}{N} \times S \right) + \theta_{min}
4. **Plotting**: Plot points :math:`(r_j, \theta_j)` using `scatter`,
where the color of each point is determined by applying `cmap` to
the normalized value of :math:`z_j`.
References
----------
.. [1] Kouadio, K. L., Liu, R., Loukou, K. G. H., Liu, J., & Liu, W. (2025).
Analytics Framework for Interpreting Spatiotemporal Probabilistic Forecasts.
*International Journal of Forecasting*. Manuscript submitted.
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from kdiagram.plot.uncertainty import plot_interval_width
**1. Random Example:**
>>> np.random.seed(1)
>>> N = 120
>>> df_iw_rand = pd.DataFrame({
... 'sample_id': range(N),
... 'latitude': np.linspace(40, 42, N) + np.random.randn(N)*0.05,
... 'q10_pred': np.random.rand(N) * 10,
... 'q50_pred': np.random.rand(N) * 10 + 5,
... 'q90_pred': np.random.rand(N) * 10 + 10, # Width varies
... })
>>> # Ensure Q90 > Q10
>>> df_iw_rand['q90_pred'] = df_iw_rand['q10_pred'] + np.abs(
... df_iw_rand['q90_pred'] - df_iw_rand['q10_pred'])
>>>
>>> ax_iw_rand = plot_interval_width(
... df=df_iw_rand,
... q_cols=['q10_pred', 'q90_pred'], # Pass as list [lower, upper]
... z_col='q50_pred', # Color by median prediction
... theta_col='latitude', # Ignored for positioning
... acov='default',
... title='Interval Width vs. Median Prediction',
... cmap='plasma',
... s=40,
... cbar=True
... )
>>> # plt.show() called internally
**2. Concrete Example (Subsidence Data):**
>>> # Assume zhongshan_pred_2023_2026 is a loaded DataFrame
>>> # Create dummy data if it doesn't exist
>>> try:
... zhongshan_pred_2023_2026
... except NameError:
... print("Creating dummy subsidence data for example...")
... N_sub = 150
... zhongshan_pred_2023_2026 = pd.DataFrame({
... 'latitude': np.linspace(22.2, 22.8, N_sub),
... 'subsidence_2023_q10': np.random.rand(N_sub)*5 + 1,
... 'subsidence_2023_q50': np.random.rand(N_sub)*10 + 3,
... 'subsidence_2023_q90': np.random.rand(N_sub)*5 + 6 + np.linspace(0, 10, N_sub),
... # Ensure q90 > q10
... **{f'subsidence_{yr}_q10': np.random.rand(N_sub)*(yr-2022)*2 + 1
... for yr in range(2024, 2027)}, # Add other cols if needed
... **{f'subsidence_{yr}_q90': np.random.rand(N_sub)*(yr-2022)*2 + 5
... + np.linspace(0, (yr-2022)*3, N_sub)
... for yr in range(2024, 2027)},
... })
>>> # Ensure Q90 > Q10 for the primary year
>>> zhongshan_pred_2023_2026['subsidence_2023_q90'] = (
... zhongshan_pred_2023_2026['subsidence_2023_q10'] +
... np.abs(zhongshan_pred_2023_2026['subsidence_2023_q90'] -
... zhongshan_pred_2023_2026['subsidence_2023_q10']) + 0.1
... )
>>> ax_iw_sub = plot_interval_width(
... df=zhongshan_pred_2023_2026.head(100), # Use subset
... q_cols=['subsidence_2023_q10', 'subsidence_2023_q90'], # Use list
... z_col='subsidence_2023_q50', # Color by Q50
... theta_col='latitude', # Ignored for positioning
... acov='quarter_circle', # Use 90 degrees
... title='Spatial Spread of Uncertainty (2023)',
... cmap='YlGnBu',
... s=25,
... cbar=True, # Show colorbar for Q50
... mask_angle=True
... )
>>> # plt.show() called internally
"""
[docs]
@check_non_emptiness
@isdf
def plot_coverage_diagnostic(
df: pd.DataFrame,
actual_col: str,
q_cols: list[str] | tuple[str, str],
theta_col: str | None = None,
acov: str = "default",
figsize: tuple[float, float] = (8.0, 8.0),
title: str | None = None,
show_grid: bool = True,
grid_props: dict | None = None,
cmap: str = "RdYlGn",
alpha: float = 0.85,
s: int = 35,
as_bars: bool = False,
coverage_line_color: str = "r",
buffer_pts: int = 500,
fill_gradient: bool = True,
gradient_size: int = 300,
gradient_cmap: str = "Greens",
gradient_levels: list[float] | None = None,
gradient_props: dict[str, Any] | None = None,
mask_angle: bool = True,
savefig: str | None = None,
dpi: int = 300,
verbose: int = 0,
ax: Axes | None = None,
):
q_cols_processed = columns_manager(q_cols, empty_as_none=False)
if len(q_cols_processed) != 2:
raise TypeError(
"`q_cols` expects exactly two column names: "
"[lower_quantile_column, upper_quantile_column]. "
f"Received {len(q_cols_processed)} columns: {q_cols_processed}"
)
qlow_col, qup_col = q_cols_processed[0], q_cols_processed[1]
# Consolidate needed columns and check existence
cols_needed = [actual_col, qlow_col, qup_col]
# Handle theta_col warning (existence checked implicitly by df[cols])
if theta_col:
if theta_col not in df.columns:
warnings.warn(
f"Specified `theta_col` ('{theta_col}') not found. "
f"Using index order.",
UserWarning,
stacklevel=2,
)
# Don't add to cols_needed if missing
else:
warnings.warn(
f"`theta_col` ('{theta_col}') is provided but currently "
f"ignored for positioning/ordering. Using index order.",
UserWarning,
stacklevel=2,
)
missing_essential = [
col
for col in [actual_col, qlow_col, qup_col]
if col not in df.columns
]
if missing_essential:
raise ValueError(
f"Essential columns missing: {', '.join(missing_essential)}"
)
# Drop rows with NaNs in essential columns
data = df[cols_needed].dropna().copy()
if len(data) == 0:
warnings.warn(
"DataFrame is empty after dropping NaN values."
" Cannot generate plot.",
UserWarning,
stacklevel=2,
)
return None
N = len(data) # Number of valid points
# --- Calculate Coverage ---
try:
y = data[actual_col].to_numpy(dtype=float)
y_lo = data[qlow_col].to_numpy(dtype=float)
y_hi = data[qup_col].to_numpy(dtype=float)
except Exception as e:
raise TypeError(
f"Could not convert actual or quantile columns to numeric."
f" Check data types. Original error: {e}"
) from e
# Binary coverage indicator (1 if covered, 0 otherwise)
covered = ((y >= y_lo) & (y <= y_hi)).astype(int)
# Overall coverage rate
total_coverage = covered.mean() if N > 0 else 0.0
# --- Angular Coordinate Calculation ---
acov_map = { # Map name to (min_angle, max_angle) in radians
"default": (0, 2 * np.pi),
"half_circle": (0, np.pi),
"quarter_circle": (0, np.pi / 2),
"eighth_circle": (0, np.pi / 4),
}
if acov.lower() not in acov_map:
# Use .get() for default or raise error explicitly? Let's raise.
raise ValueError(
f"Invalid `acov` value '{acov}'. Choose from: "
f"{', '.join(acov_map.keys())}"
)
theta_min_rad, theta_max_rad = acov_map[acov.lower()]
angular_range_rad = theta_max_rad - theta_min_rad
# Calculate theta based on index, mapped to the angular range
theta = (
np.linspace(
0.0, 1.0, N, endpoint=False
) # Use endpoint=False for bars/scatter
* angular_range_rad
+ theta_min_rad
)
# --- Plotting Setup ---
if ax is None:
fig, ax = plt.subplots(
figsize=figsize, subplot_kw={"projection": "polar"}
)
else:
fig = ax.figure
ax.set_thetamin(np.degrees(theta_min_rad))
ax.set_thetamax(np.degrees(theta_max_rad))
# Set radial limits strictly to [0, 1] for coverage plot
ax.set_ylim(0, 1.05) # Slight padding at top
ax.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0]) # Set explicit radial ticks
# Apply grid and angle label settings
set_axis_grid(ax, show_grid, grid_props)
if mask_angle:
ax.set_xticklabels([])
# --- Optional Background Gradient ---
if fill_gradient:
try:
grad_cmap_obj = get_cmap(gradient_cmap)
# Create radial and angular meshgrid
# R goes from 0 up to the total_coverage value
R, T = np.meshgrid(
np.linspace(0, total_coverage, gradient_size),
np.linspace(theta_min_rad, theta_max_rad, gradient_size),
)
# Z represents the value to map color to (e.g., radius itself)
# Tile radius values across angles for pcolormesh
Z = np.tile(
np.linspace(0, total_coverage, gradient_size)[:, np.newaxis],
(1, gradient_size),
)
# Normalize Z based on [0, 1] range for colormap
norm_gradient = Normalize(vmin=0, vmax=1.0)
# Plot the gradient mesh
ax.grid(False)
ax.pcolormesh(
T,
R,
Z,
shading="auto",
cmap=grad_cmap_obj,
alpha=0.20, # Make gradient subtle
norm=norm_gradient, # Ensure consistent color mapping
)
ax.grid(True, which="both")
except ValueError:
warnings.warn(
f"Invalid `gradient_cmap` ('{gradient_cmap}')."
f" Skipping background gradient.",
UserWarning,
stacklevel=2,
)
except Exception as e:
warnings.warn(
f"Failed to plot background gradient: {e}",
UserWarning,
stacklevel=2,
)
# --- Optional Concentric Reference Lines ---
# Define default levels if None
gradient_levels = (
gradient_levels
if gradient_levels is not None
else [0.2, 0.4, 0.6, 0.8, 1.0]
)
# Filter levels to be within [0, 1]
valid_levels = [lv for lv in gradient_levels if 0 <= lv <= 1]
# Define default properties, merge with user props
default_gradient_props = {
"linestyle": ":",
"color": "gray",
"linewidth": 0.8,
"alpha": 0.8,
}
current_gradient_props = {
**default_gradient_props,
**(gradient_props or {}),
}
# Plot each reference line and its label
theta_line = np.linspace(theta_min_rad, theta_max_rad, buffer_pts)
for lv in valid_levels:
ax.plot(theta_line, [lv] * buffer_pts, **current_gradient_props)
# Add text label near the end of the line
ax.text(
theta_max_rad * 0.98, # Position slightly inside max angle
lv,
f"{lv:.1f}", # Format label
color=current_gradient_props.get("color", "gray"),
fontsize=8,
ha="right", # Horizontal alignment
va="center", # Vertical alignment
)
# --- Average Coverage Line ---
ax.plot(
theta_line,
[total_coverage] * buffer_pts,
color=coverage_line_color,
linewidth=2.0, # Make it prominent
label=f"Avg Coverage ({total_coverage:.2f})", # Add rate to label
)
# --- Plot Individual Coverage (Bars or Scatter) ---
# Radius 'r' is the 'covered' array (0 or 1)
r_plot = covered.astype(float)
# Setup colormap and normalization for points/bars
try:
cmap_main_obj = get_cmap(cmap)
except ValueError:
warnings.warn(
f"Invalid `cmap` ('{cmap}'). Using 'RdYlGn'.",
UserWarning,
stacklevel=2,
)
cmap_main_obj = get_cmap("RdYlGn")
# Normalize 0 to low end, 1 to high end of cmap
norm_main = Normalize(vmin=0, vmax=1)
point_colors = cmap_main_obj(norm_main(r_plot))
if as_bars:
# Calculate bar width to fill space (approximate)
# Use difference between angles, handle wrap around if full circle
if N > 1:
# Estimate width based on average angular spacing
# Could also use angular_range_rad / N
bar_widths = np.diff(theta, append=theta[0] + angular_range_rad)
# Ensure positive widths if theta wraps around
bar_widths = (
np.abs(bar_widths) * 0.9
) # Make slightly narrower than gap
elif N == 1:
bar_widths = (
angular_range_rad * 0.8
) # Default width for single bar
else: # N=0
bar_widths = 0.1 # Placeholder
ax.bar(
theta, # Angle for each bar
r_plot, # Height (0 or 1)
width=bar_widths, # Width of bars
bottom=0.0, # Bars start from center
color=point_colors,
alpha=alpha,
edgecolor="gray",
linewidth=0.5,
)
else: # Plot as scatter points
ax.scatter(
theta,
r_plot, # Radius (0 or 1)
c=point_colors, # Color based on coverage
s=s,
alpha=alpha,
edgecolor="gray",
linewidth=0.5,
# No label needed here, color indicates status
)
# --- Final Touches ---
ax.set_title(
title or "Prediction Interval Coverage Diagnostic",
fontsize=14,
y=1.08,
)
# Add legend only for the average coverage line
handles, labels = ax.get_legend_handles_labels()
if handles: # If the coverage line was plotted and labeled
ax.legend(
handles=handles,
labels=labels,
loc="upper right",
bbox_to_anchor=(1.25, 1.1),
fontsize=9,
)
# Set radial axis label (Y-axis in polar)
ax.set_ylabel("Coverage (1=In, 0=Out)", labelpad=32, fontsize=10)
# --- Logging ---
if verbose > 0:
print("-" * 50)
print("Coverage Rate Calculation:")
print(f" Interval: [{qlow_col}, {qup_col}]")
print(f" Points checked (after NaN removal): {N}")
print(f" Overall Coverage Rate: {total_coverage * 100:.2f}%")
print("-" * 50)
# --- Output ---
final = safe_savefig(
savefig,
fig,
dpi=dpi,
bbox_inches="tight",
)
if final is not None:
plt.close(fig)
else:
fig.tight_layout()
plt.show()
return ax
plot_coverage_diagnostic.__doc__ = r"""
Diagnose prediction-interval coverage on a polar plot.
This visualization checks whether observed values fall within their
predicted intervals and summarizes the **empirical coverage rate**
against the nominal level. It is a compact diagnostic for calibration
of quantile/interval forecasts; foundational background on forecast
verification and calibration appears in :footcite:p:`Gneiting2007b,
Jolliffe2012`. See :footcite:t:`Gneiting2007b` for a discussion of
calibration vs. sharpness.
The plot maps samples around a circle (angular coordinate) and encodes
**covered** vs **not covered** on the radial axis. A solid reference
ring marks the overall coverage rate, and optional concentric guides
and background gradients aid interpretation.
Parameters
----------
df : pandas.DataFrame
Input table containing the observed target and the two quantile
bounds. Decorators ensure a valid, non-empty DataFrame.
actual_col : str
Column name of the observed (ground-truth) values.
q_cols : list of str or tuple of str
Exactly two names ``[lower_quantile_col, upper_quantile_col]``
that define the prediction interval. The order must be
*[lower, upper]*.
theta_col : str, optional
Intended column for angular ordering. **Currently ignored**; the
plot uses the DataFrame index order. A warning is issued if
provided. Default is ``None``.
acov : {'default', 'half_circle', 'quarter_circle', 'eighth_circle'}, \
default='default'
Angular coverage (span) of the plot:
- ``'default'``: full circle :math:`2\pi`
- ``'half_circle'``: :math:`\pi`
- ``'quarter_circle'``: :math:`\pi/2`
- ``'eighth_circle'``: :math:`\pi/4`
figsize : tuple of (float, float), default=(8.0, 8.0)
Figure size in inches.
title : str, optional
Custom title. If ``None``, a default title is used.
show_grid : bool, default=True
If ``True``, show polar grid lines.
grid_props : dict, optional
Keyword arguments passed to your grid helper for customizing the
grid (e.g., ``{'linestyle': '--', 'alpha': 0.6}``).
cmap : str, default='RdYlGn'
Colormap for per-point coverage (0 or 1). Diverging maps work well
(e.g., red for uncovered, green for covered).
alpha : float, default=0.85
Transparency for scatter points or bars.
s : int, default=35
Marker size for scatter points (ignored when ``as_bars=True``).
as_bars : bool, default=False
If ``True``, draw radial bars (height 0/1) instead of points.
coverage_line_color : str, default='r'
Color of the solid ring at the average coverage rate.
buffer_pts : int, default=500
Number of samples used to draw smooth circular lines (average rate
and guide rings).
fill_gradient : bool, default=True
If ``True``, fill the background radially up to the average
coverage with a subtle gradient.
gradient_size : int, default=300
Resolution of the background gradient mesh.
gradient_cmap : str, default='Greens'
Colormap used for the optional background gradient.
gradient_levels : list of float, optional
Radii in :math:`[0,1]` for dashed concentric reference rings.
Defaults to ``[0.2, 0.4, 0.6, 0.8, 1.0]``.
gradient_props : dict, optional
Style for the concentric guide rings (e.g., ``{'linestyle': ':',
'color': 'gray', 'linewidth': 0.8}``).
mask_angle : bool, default=True
If ``True``, hide angular tick labels.
savefig : str, optional
File path to save the figure. If ``None``, the figure is shown.
verbose : int, default=0
If ``> 0``, print the computed overall coverage rate.
Returns
-------
ax : matplotlib.axes.Axes
The polar Axes containing the coverage diagnostic.
Raises
------
TypeError
If ``q_cols`` does not contain exactly two names.
ValueError
If required columns are missing or cannot be coerced to numeric,
or if ``acov`` is invalid.
See Also
--------
plot_anomaly_magnitude:
Polar diagnostic for the *magnitude* and *type* of interval
failures (under/over).
plot_reliability_diagram:
Calibration (reliability) curves for probabilistic classifiers.
Notes
-----
Coverage for row :math:`j` is defined by the closed interval test
:math:`L_j \le y_j \le U_j`. Rows with missing values in essential
columns are removed prior to computation, so all symbols below refer
to the filtered data. The radial axis is fixed to :math:`[0,1]` and
encodes a binary outcome per sample, while a solid reference ring at
radius :math:`\bar{C}` summarizes the empirical coverage rate. Compare
:math:`\bar{C}` to the nominal level implied by your interval (e.g.,
:math:`0.8` for Q10–Q90, :math:`0.9` for Q5–Q95) to assess
calibration. Angular positions follow index order over the chosen span;
``theta_col`` is currently ignored for positioning.
Let :math:`y_j` denote the actual value, :math:`L_j` the lower bound,
and :math:`U_j` the upper bound for sample :math:`j`, with
:math:`j=0,\dots,N-1` after NaN removal. The per-sample coverage
indicator (radial coordinate :math:`r_j`) is
.. math::
r_j \;=\;
\begin{cases}
1, & L_j \le y_j \le U_j, \\\\
0, & \text{otherwise.}
\end{cases}
The overall coverage rate drawn as a ring is
.. math::
\bar{C} \;=\; \frac{1}{N} \sum_{j=0}^{N-1} r_j.
Let :math:`S \in \{2\pi,\;\pi,\;\pi/2,\;\pi/4\}` be the angular span
set by ``acov`` and let :math:`\theta_{\min}` be the start angle. The
angular coordinate for sample :math:`j` is
.. math::
\theta_j \;=\; \frac{j}{N}\,S \;+\; \theta_{\min}.
Plotting semantics: each sample is placed at
:math:`(\theta_j, r_j)` and colored via ``cmap`` according to
:math:`r_j`; a solid ring at :math:`\bar{C}` is overlaid as a global
summary; optional concentric guides at user-specified radii and an
optional radial background gradient up to :math:`\bar{C}` provide
additional visual context.
Rows with NaNs in essential columns are dropped before computation.
``theta_col`` is currently ignored (index order is used).
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from kdiagram.plot.uncertainty import plot_coverage_diagnostic
**1. Random Example (Well-calibrated 80% interval):**
>>> np.random.seed(0)
>>> N = 200
>>> df_cov_rand = pd.DataFrame({'id': range(N)})
>>> df_cov_rand['actual'] = np.random.normal(loc=10, scale=2, size=N)
>>> # Simulate an ~80% interval (e.g., +/- 1.28 std devs for Normal)
>>> std_dev_pred = 2.0
>>> df_cov_rand['q10_pred'] = 10 - 1.28 * std_dev_pred
>>> df_cov_rand['q90_pred'] = 10 + 1.28 * std_dev_pred
>>> # Add some noise to interval bounds
>>> df_cov_rand['q10_pred'] += np.random.randn(N) * 0.2
>>> df_cov_rand['q90_pred'] += np.random.randn(N) * 0.2
>>> ax_cov_rand = plot_coverage_diagnostic(
... df=df_cov_rand,
... actual_col='actual',
... q_cols=['q10_pred', 'q90_pred'], # [lower, upper]
... theta_col='id', # Ignored for positioning
... acov='default',
... title='Coverage Diagnostic (Simulated 80% Interval)',
... as_bars=False, # Use scatter points
... coverage_line_color='blue', # Color for avg coverage line
... gradient_cmap='Blues', # Background gradient color
... verbose=1 # Print coverage rate
... )
>>> # Expected coverage rate near 80%
>>> # plt.show() called internally
**2. Concrete Example (Subsidence Data):**
>>> # Assume small_sample_pred is a loaded DataFrame
>>> # Create dummy data if it doesn't exist
>>> try:
... small_sample_pred
... except NameError:
... print("Creating dummy small sample prediction data...")
... N_small = 200
... small_sample_pred = pd.DataFrame({
... 'subsidence_2023': np.random.rand(N_small)*15 + np.linspace(0, 5, N_small),
... 'subsidence_2023_q10': np.random.rand(N_small)*10,
... 'subsidence_2023_q90': np.random.rand(N_small)*10 + 10,
... 'latitude': np.linspace(22.3, 22.7, N_small) + np.random.randn(N_small)*0.01
... })
>>> # Ensure Q90 > Q10
>>> small_sample_pred['subsidence_2023_q90'] = (
... small_sample_pred['subsidence_2023_q10'] +
... np.abs(small_sample_pred['subsidence_2023_q90'] -
... small_sample_pred['subsidence_2023_q10']) + 0.1
... )
>>> ax_cov_sub = plot_coverage_diagnostic(
... df=small_sample_pred,
... actual_col='subsidence_2023',
... q_cols=['subsidence_2023_q10', 'subsidence_2023_q90'],
... theta_col=None, # Use index order
... acov='half_circle', # Use 180 degrees
... as_bars=True, # Use bars instead of scatter
... coverage_line_color='darkgreen',
... title='Coverage Evaluation for 2023 (Q10–Q90)',
... mask_angle=False, # Show angle labels if meaningful
... fill_gradient=False, # Turn off background gradient
... gradient_levels=[0.5, 0.8, 0.9], # Custom reference lines
... verbose=1
... )
>>>
References
----------
.. footbibliography::
"""
[docs]
@check_non_emptiness
@isdf
def plot_temporal_uncertainty(
df: pd.DataFrame,
q_cols: str | list[str] = "auto",
theta_col: str | None = None,
names: list[str] | None = None,
acov: str = "default",
figsize: tuple[float, float] = (8.0, 8.0),
title: str | None = None,
cmap: str = "tab10",
normalize: bool = True,
show_grid: bool = True,
grid_props: dict | None = None,
alpha: float = 0.7,
s: int = 25,
dot_style: str = "o",
legend_loc: str = "upper right",
mask_label: bool = False,
mask_angle: bool = True,
savefig: str | None = None,
dpi: int = 300,
ax: Axes | None = None,
):
# --- Input Validation and Column Setup ---
# Handle 'auto' detection or process explicit list for q_cols
if isinstance(q_cols, str) and q_cols.lower() == "auto":
try:
# Assume detect_quantiles_in returns list of column names
detected_cols = detect_quantiles_in(df)
if not detected_cols:
raise ValueError("Auto-detection found no quantile columns.")
# Check if detected columns actually exist (redundant?)
exist_features(
df,
features=detected_cols,
error="raise",
name="Auto-detected quantile columns",
)
q_cols_list = detected_cols
except (
NameError
) as err: # If detect_quantiles_in is not defined/imported
raise ImportError(
"Helper function 'detect_quantiles_in' is needed for "
"`q_cols='auto'` but seems unavailable."
) from err
except Exception as e:
# Catch errors from detect_quantiles_in or exist_features
raise ValueError(
f"Automatic detection or validation of `q_cols` failed: {e}. "
"Please provide `q_cols` explicitly as a list of column names."
) from e
else:
# Process explicit list using columns_manager if available
if "columns_manager" in globals():
q_cols_list = columns_manager(q_cols, empty_as_none=False)
else:
# Basic list validation if helper not present
if not isinstance(q_cols, (list, tuple)):
raise TypeError("`q_cols` must be 'auto' or a list/tuple.")
q_cols_list = list(q_cols)
# Check if list is empty
if not q_cols_list:
raise ValueError(
"`q_cols` list cannot be empty. Please provide column names."
)
# Check existence of explicitly provided columns
exist_features(
df,
features=q_cols_list,
error="raise",
name="Specified plot columns (`q_cols`)",
)
# Determine columns needed for NaN handling
cols_for_na_check = list(q_cols_list) # Start with plot columns
# theta_col_valid = False
if theta_col:
if theta_col in df.columns:
cols_for_na_check.append(theta_col)
# theta_col_valid = True
warnings.warn(
f"`theta_col` ('{theta_col}') is currently used for NaN "
f"alignment but ignored for positioning/ordering. Using index order.",
UserWarning,
stacklevel=2,
)
else:
warnings.warn(
f"Specified `theta_col` ('{theta_col}') not found. "
f"It will be ignored for NaN alignment and positioning.",
UserWarning,
stacklevel=2,
)
# Drop rows with NaNs in ANY of the selected columns
data = df[cols_for_na_check].dropna().copy()
if len(data) == 0:
warnings.warn(
"DataFrame is empty after dropping NaN values. Cannot generate plot.",
UserWarning,
stacklevel=2,
)
return None
N = len(data) # Number of valid data points
# --- Angular Coordinate Calculation ---
angular_range_map = { # Map name to (min_angle, max_angle) in radians
"default": (0, 2 * np.pi),
"half_circle": (0, np.pi),
"quarter_circle": (0, np.pi / 2),
"eighth_circle": (0, np.pi / 4),
}
if acov.lower() not in angular_range_map:
raise ValueError(
f"Invalid `acov` value '{acov}'. Choose from: "
f"{', '.join(angular_range_map.keys())}"
)
theta_min_rad, theta_max_rad = angular_range_map[acov.lower()]
angular_range_rad = theta_max_rad - theta_min_rad
# Calculate theta based on index of cleaned data
# Use endpoint=False for scatter for potentially better spacing visual
theta = (
np.linspace(0.0, 1.0, N, endpoint=False) * angular_range_rad
+ theta_min_rad
)
# --- Prepare Radial Values (Normalization) ---
# Internal helper for min-max scaling
def _normalize(x):
"""Min-max scales array x to [0, 1], handling zero range."""
x = np.asarray(x, dtype=float) # Ensure float array
range_x = np.ptp(x) # Peak-to-peak range
if range_x > 1e-9: # Check if range is non-negligible
min_x = np.min(x)
return (x - min_x) / range_x
# XXX TODO
# If range is zero or near-zero, return array of 0.5 or 0?
# Returning original might be safer if scale is important even if constant
# Let's return 0.5 for constant values after normalization request.
# Or return 0 to keep origin clear? Let's use 0.5 as neutral midpoint.
return np.full_like(x, 0.5)
# Extract values from the cleaned data, apply normalization if needed
values_list = []
for col in q_cols_list:
try:
vals = data[col].to_numpy(dtype=float)
if normalize:
vals = _normalize(vals)
values_list.append(vals)
except Exception as e:
raise TypeError(
f"Could not process column '{col}'. Ensure it contains numeric"
f" data. Original error: {e}"
) from e
# --- Setup Labels and Colors ---
num_series = len(q_cols_list)
# Generate default names if needed
if names is None:
# Try to make default names slightly more informative if auto-detected
if isinstance(q_cols, str) and q_cols.lower() == "auto":
# Use the detected names if possible
plot_labels = [c.replace("_", " ").title() for c in q_cols_list]
else:
# Generic default if explicit list or auto-detection failed context
plot_labels = [f"Series {i + 1}" for i in range(num_series)]
else:
# Use provided names, check length
if len(names) != num_series:
raise ValueError(
f"Length of `names` ({len(names)}) must match the number "
f"of plotted columns ({num_series})."
)
plot_labels = list(names)
# Get color palette
try:
cmap_obj = get_cmap(cmap)
# Sample colors - handle discrete vs continuous cmaps
if hasattr(cmap_obj, "colors") and len(cmap_obj.colors) >= num_series:
# Use colors directly from discrete map if enough available
color_palette = cmap_obj.colors
else: # Sample from continuous map or discrete map with fewer colors
color_palette = cmap_obj(np.linspace(0, 1, num_series))
except ValueError:
warnings.warn(
f"Invalid `cmap` name '{cmap}'. Falling back to 'tab10'.",
stacklevel=2,
)
cmap_obj = get_cmap("tab10") # Fallback cmap
color_palette = cmap_obj(np.linspace(0, 1, num_series))
# --- Plotting ---
if ax is None:
fig, ax = plt.subplots(
figsize=figsize, subplot_kw={"projection": "polar"}
)
else:
fig = ax.figure
# Set angular limits
ax.set_thetamin(np.degrees(theta_min_rad))
ax.set_thetamax(np.degrees(theta_max_rad))
# Plot each data series
for i, (vals, label) in enumerate(zip(values_list, plot_labels)):
if len(vals) != N: # Should not happen if NaN handling is correct
raise InternalError(
"Mismatch between data length and theta length."
)
color = color_palette[
i % len(color_palette)
] # Cycle colors if needed
ax.scatter(
theta,
vals,
label=label,
alpha=alpha,
s=s,
marker=dot_style,
color=color,
edgecolor="k", # Add edge color for visibility
linewidth=0.5,
)
# --- Final Touches ---
ax.set_title(title or "Polar Data Series Plot", fontsize=14, y=1.08)
set_axis_grid(ax, show_grid, grid_props)
if mask_angle:
ax.set_xticklabels([]) # Hide angular tick labels
if mask_label:
ax.set_yticklabels([]) # Hide radial tick labels
else:
ax.set_ylabel(f"{'Normalized ' if normalize else ''}Q_pred values")
# Add legend
handles, labels = ax.get_legend_handles_labels()
if labels: # Only show legend if labels were generated/provided
ax.legend(
handles=handles,
labels=labels,
loc=legend_loc,
bbox_to_anchor=(1.25, 1.0) if "right" in legend_loc else None,
fontsize=9,
)
# Set radial axis label if normalization is off? Maybe not useful.
if (
not normalize and len(q_cols_list) == 1
): # Only label if one series, unnormalized
ax.set_ylabel(q_cols_list[0])
# --- Save or Show ---
final = safe_savefig(
savefig,
fig,
dpi=dpi,
bbox_inches="tight",
)
if final is not None:
plt.close(fig)
else:
fig.tight_layout()
plt.show()
return ax
plot_temporal_uncertainty.__doc__ = r"""
Visualize multiple data series using polar scatter plots.
This function creates a general-purpose polar scatter plot to
visualize and compare one or more data series (columns) from a
DataFrame in a circular layout. Each series is plotted with a
distinct color [1]_.
- **Angular Position (`theta`)**: Represents each data point or
sample, ordered by the DataFrame index after removing rows with
NaNs in the selected `q_cols` (and `theta_col` if used for NaN
alignment). The points are mapped linearly onto the specified
angular coverage (`acov`). The `theta_col` parameter is currently
ignored for sorting/positioning but helps align data if it has NaNs.
- **Radial Distance (`r`)**: Represents the magnitude of the values
from each column specified in `q_cols`. Values can be optionally
normalized independently for each series using min-max scaling
(`normalize=True`).
- **Color**: Each data series (column in `q_cols`) is assigned a
unique color based on the specified `cmap` [2]_.
This plot is flexible and can be used for various purposes, such as:
- Comparing different model predictions for the same target.
- Visualizing different quantile predictions (e.g., Q10, Q50, Q90)
for a single time step to show uncertainty spread.
- Plotting related variables against each other in a polar context.
Parameters
----------
df : pd.DataFrame
Input DataFrame containing the data columns to be plotted.
Decorators ensure it's a valid, non-empty pandas DataFrame.
q_cols : str or list of str, default='auto'
Specifies the columns to plot.
- If ``'auto'``, attempts to automatically detect quantile columns
(e.g., names containing '_q' followed by numbers) using the
helper function `detect_quantiles_in`. Raises error if detection
fails or no such columns are found.
- If a list of strings, these column names are used directly.
Must contain at least one valid column name from `df`.
theta_col : str, optional
*Intended* for ordering points angularly based on this column's
values. *Note: Currently ignored for sorting/positioning; uses
index order. However, if provided and present in `df`, it is
included when checking for and dropping NaN values to ensure data
alignment between angles and plotted values.* A warning is issued
regarding its limited use. Default is ``None``.
names : list of str, optional
Custom labels for each data series specified in `q_cols`, used
in the plot legend. Must match the number of columns in `q_cols`.
If ``None``, generic labels like 'Q1', 'Q2', ... (or based on
detected quantile names) are generated. Default is ``None``.
acov : {'default', 'half_circle', 'quarter_circle', \
'eighth_circle'}, default='default'
Specifies the angular coverage (span) of the polar plot:
``'default'`` (360°), ``'half_circle'`` (180°),
``'quarter_circle'`` (90°), ``'eighth_circle'`` (45°).
figsize : tuple of (float, float), default=(8.0, 8.0)
Width and height of the figure in inches.
title : str, optional
Custom title for the plot. If ``None``, a default title is used.
cmap : str, default='tab10'
Name of the Matplotlib colormap used to assign distinct colors
to each data series specified in `q_cols`.
normalize : bool, default=True
If ``True``, normalize the values within *each column* specified
in `q_cols` independently to the range [0, 1] using min-max
scaling before plotting. Useful for comparing shapes of series
with different scales. If ``False``, raw values are plotted.
show_grid : bool, default=True
If ``True``, display the polar grid lines.
alpha : float, default=0.7
Transparency level for the scatter points (0=transparent, 1=opaque).
s : int, default=25
Marker size for the scatter points.
dot_style : str, default='o'
Marker style used for the scatter points (e.g., 'o', '.', 'x', '+').
Passed to `matplotlib.pyplot.scatter`.
legend_loc : str, default='upper right'
Location of the legend identifying the data series. Common values
include 'best', 'upper right', 'lower left', etc.
mask_angle : bool, default=True
If ``True``, hide the angular tick labels (degrees). Recommended
if the angle is based on index.
savefig : str, optional
File path to save the plot image. If ``None``, displays interactively.
Returns
-------
ax : matplotlib.axes._axes.Axes
The Matplotlib Axes object (PolarAxesSubplot) containing the plot.
Raises
------
ValueError
If `q_cols='auto'` fails to find columns or `detect_quantiles_in` fails.
If `q_cols` (explicitly provided or auto-detected) is empty or
contains columns not found in `df`.
If `names` is provided but length doesn't match `q_cols`.
If `acov` value is invalid.
TypeError
If data in `q_cols` is not numeric.
See Also
--------
plot_polar_uncertainty_spread : Previous function focused on plotting
quantiles for uncertainty (potentially similar).
detect_quantiles_in : Helper function for automatic column detection.
matplotlib.pyplot.scatter : Underlying function for plotting points.
Notes
-----
- Normalization :math:`\aleph` (`normalize=True`) is applied independently to each
column in `q_cols`, scaling each series to its own [0, 1] range.
- Rows containing NaN values in *any* of the selected `q_cols` (and
`theta_col` if present) are dropped *before* plotting to ensure
alignment. `theta` angles are based on the index of this cleaned data.
- The `theta_col` parameter currently does not affect the order or
position of points but is used for consistent NaN handling.
- Radial axis ticks and labels are hidden by default (`set_yticklabels([])`)
as the radius represents potentially normalized values from multiple
series, making a single scale less meaningful.
Let :math:`\mathbf{V}_i` be the data vector for column `i` in `q_cols`
(length :math:`N`, after NaN removal based on all selected columns).
1. **Normalization** (if `normalize=True`, applied per column `i`):
.. math::
\mathbf{v}'_i = \aleph ( \mathbf{V}_i)
where
.. math::
\aleph(x)_j = \frac{x_j - \min(\mathbf{x})}{\max(\mathbf{x}) - \min(\mathbf{x})}
(Handles zero range by returning the original vector).
2. **Angular Coordinate (`theta`)**: Let :math:`S` be the angular span
and :math:`\theta_{min}` the start angle from `acov`. For index
:math:`j` (:math:`j=0, \dots, N-1` of cleaned data):
.. math::
\theta_j = \left( \frac{j}{N} \times S \right) + \theta_{min}
3. **Radial Coordinate (`r`)**: For series `i` and point `j`:
:math:`r_{i,j} = v'_{i,j}` (if normalized) or
:math:`r_{i,j} = v_{i,j}` (if not normalized).
4. **Plotting**: For each series `i`, plot points :math:`(r_{i,j}, \theta_j)`
using `scatter` with a distinct color derived from `cmap`.
References
----------
.. [1] Kouadio, K. L., Liu, R., Loukou, K. G. H., Liu, J., & Liu, W. (2025).
Analytics Framework for Interpreting Spatiotemporal Probabilistic Forecasts.
*International Journal of Forecasting*. Manuscript submitted.
.. [2] Matplotlib documentation: https://matplotlib.org/stable/
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from kdiagram.plot.uncertainty import plot_temporal_uncertainty
**1. Random Example (Comparing two series):**
>>> np.random.seed(42)
>>> N = 100
>>> df_comp_rand = pd.DataFrame({
... 'Index': range(N),
... 'ModelA_Pred': 50 + 10 * np.sin(np.linspace(0, 3 * np.pi, N)) + np.random.randn(N)*5,
... 'ModelB_Pred': 55 + 12 * np.sin(np.linspace(0, 3 * np.pi, N) - 0.5) + np.random.randn(N)*4,
... })
>>> ax_comp_rand = plot_temporal_uncertainty(
... df=df_comp_rand,
... q_cols=['ModelA_Pred', 'ModelB_Pred'],
... names=['Model A', 'Model B'],
... theta_col=None, # Use index order
... acov='default',
... title='Comparison of Model A vs Model B',
... normalize=True, # Normalize for shape comparison
... cmap='Set1',
... dot_style='x', # Use 'x' markers
... mask_angle=False # Show angle ticks
... )
>>> # plt.show() called internally
**2. Concrete Example (Subsidence Quantiles for 2023):**
>>> # Assume zhongshan_pred_2023_2026 is a loaded DataFrame
>>> # Create dummy data if it doesn't exist
>>> try:
... zhongshan_pred_2023_2026
... except NameError:
... print("Creating dummy subsidence data for example...")
... N_sub = 150
... zhongshan_pred_2023_2026 = pd.DataFrame({
... 'latitude': np.linspace(22.2, 22.8, N_sub),
... 'subsidence_2023_q10': np.random.rand(N_sub)*5 + 1 + np.linspace(0,2, N_sub),
... 'subsidence_2023_q50': np.random.rand(N_sub)*5 + 3 + np.linspace(1,3, N_sub),
... 'subsidence_2023_q90': np.random.rand(N_sub)*5 + 5 + np.linspace(2,4, N_sub),
... })
>>> # Ensure Q90 > Q50 > Q10 roughly
>>> zhongshan_pred_2023_2026['subsidence_2023_q50'] = np.maximum(
... zhongshan_pred_2023_2026['subsidence_2023_q50'],
... zhongshan_pred_2023_2026['subsidence_2023_q10'] + 0.1)
>>> zhongshan_pred_2023_2026['subsidence_2023_q90'] = np.maximum(
... zhongshan_pred_2023_2026['subsidence_2023_q90'],
... zhongshan_pred_2023_2026['subsidence_2023_q50'] + 0.1)
>>> ax_tu_sub = plot_temporal_uncertainty(
... df=zhongshan_pred_2023_2026.head(100), # Use subset
... # Explicitly list quantile columns for 2023
... q_cols=['subsidence_2023_q10', 'subsidence_2023_q50',
... 'subsidence_2023_q90'],
... theta_col='latitude', # Used for NaN alignment, not order
... names=["Lower Bound (Q10)", "Median (Q50)", "Upper Bound (Q90)"],
... acov='eighth_circle', # Use smaller angle span
... title='Uncertainty Spread for 2023 (Zhongshan)',
... normalize=False, # Plot raw values
... cmap='coolwarm', # Use diverging map for bounds
... mask_angle=True,
... s=30
... )
>>> # plt.show() called internally
"""
[docs]
@check_non_emptiness
@isdf
def plot_radial_density_ring(
df: pd.DataFrame,
*,
kind: Literal["width", "velocity", "direct"] = "direct",
target_cols: str | list[str],
title: str | None = None,
r_label: str | None = None,
figsize: tuple[float, float] = (8.0, 8.0),
cmap: str = "viridis",
alpha: float = 0.8,
cbar: bool = True,
acov: Acov = "default",
show_grid: bool = True,
grid_props: dict[str, Any] | None = None,
mask_angle: bool = True,
bandwidth: float | None = None,
savefig: str | None = None,
dpi: int = 300,
show_yticklabels: bool = False,
ax: Axes | None = None,
) -> Axes | None:
# warn if non-default sector (ring is conventionally 360°)
if acov != "default":
warnings.warn(
"Non-default 'acov' for ring. Convention prefers "
"full 360°, proceeding as requested.",
UserWarning,
stacklevel=2,
)
# deprecation shim for legacy param
if show_yticklabels:
warnings.warn(
'Param "show_yticklabels" is deprecated. Use '
'"mask_angle" (False to show angular labels).',
DeprecationWarning,
stacklevel=2,
)
mask_angle = False
# columns and existence checks
cols = columns_manager(target_cols)
exist_features(df, features=cols)
# select 1D target depending on kind
if kind == "direct":
if len(cols) != 1:
raise ValueError(
"`kind='direct'` needs one column in `target_cols`."
)
data_1d = df[cols[0]]
r_label = r_label or cols[0]
title = title or f"Distribution of {cols[0]}"
elif kind in ("width", "velocity"):
if len(cols) != 2:
raise ValueError(
f"`kind='{kind}'` needs two columns in `target_cols`."
)
data_1d = df[cols[1]] - df[cols[0]]
r_label = r_label or f"{cols[1]} - {cols[0]}"
title = title or f"Distribution of {kind.title()}"
else:
raise ValueError(
f"Invalid kind={kind!r}. Use 'width', 'velocity', or 'direct'."
)
# drop NaNs and bail if empty
data_1d = data_1d.dropna()
if data_1d.empty:
warnings.warn(
"Derived data empty after NaN drop. Cannot plot.",
UserWarning,
stacklevel=2,
)
return None
# KDE along radius
grid, pdf = _calculate_kde_for_ring(
data_1d.to_numpy(),
bandwidth=bandwidth,
)
if grid.size == 0:
warnings.warn(
"KDE failed. Cannot plot ring.",
UserWarning,
stacklevel=2,
)
return None
# normalize density to [0,1]
pdf_norm = pdf.copy()
vmax = float(np.max(pdf_norm))
if vmax > 0.0:
pdf_norm /= vmax
# axes + polar span
fig, ax, span = setup_polar_axes(
ax,
acov=acov,
figsize=figsize,
)
# theta grid over requested span
thetas = np.linspace(0.0, span, 128)
T, R = np.meshgrid(thetas, grid)
# tile normalized density over theta
C = np.tile(pdf_norm, (len(thetas), 1)).T
# draw pcolormesh ring
cmap_obj = get_cmap(cmap, default="viridis")
ax.grid(False)
pcm = ax.pcolormesh(
T,
R,
C,
shading="auto",
cmap=cmap_obj,
alpha=alpha,
vmin=0.0,
vmax=1.0,
)
# titles, labels, grids
ax.set_title(title or "Radial Density Ring", va="bottom")
set_axis_grid(
ax,
show_grid=show_grid,
grid_props=grid_props,
)
ax.set_thetagrids([]) # no angular labels by default
ax.set_rlabel_position(0)
if r_label:
ax.set_ylabel(r_label, labelpad=20)
if mask_angle:
ax.set_yticklabels([])
# colorbar if requested
if cbar:
cb = plt.colorbar(
pcm,
ax=ax,
pad=0.1,
shrink=0.7,
)
cb.set_label(
"Normalized Density",
rotation=270,
labelpad=32,
)
# output handling
final = safe_savefig(
savefig,
fig,
dpi=dpi,
bbox_inches="tight",
)
if final is not None:
plt.close(fig)
else:
fig.tight_layout()
plt.show()
return ax
plot_radial_density_ring.__doc__ = r"""
Plot a radial density ring to visualize a 1D distribution.
This function computes the probability density function (PDF) of a
one-dimensional variable using Kernel Density Estimation (KDE) and
visualizes it as a colored ring on a polar plot. It's a
powerful way to inspect the shape, peaks, and spread of a
distribution, such as forecast errors or interval widths.
- **Radial Distance (`r`)**: Represents the domain of the
variable being analyzed (e.g., interval width in mm).
- **Color**: Represents the **probability density** at each
radial position. More intense colors indicate more common
values (peaks in the distribution).
- **Angular Position (`theta`)**: Is purely for aesthetics and
carries no information; the density is uniform around the
circle to form the ring.
Parameters
----------
df : pd.DataFrame
The input DataFrame containing the data.
kind : {'width', 'velocity', 'direct'}, default='direct'
Specifies how to derive the 1D data for the distribution
plot from the ``target_cols``.
- ``'direct'``: Directly uses the data from a single
column specified in ``target_cols``.
- ``'width'``: Calculates the interval width by
subtracting the first column from the second in
``target_cols`` (e.g., ``q90 - q10``).
- ``'velocity'``: Calculates the difference between two
columns, representing a change or velocity (e.g.,
``value_t2 - value_t1``).
target_cols : str or list of str
The column(s) to use for deriving the data.
- For ``kind='direct'``, this must be a single column name
(``str``).
- For ``kind='width'`` or ``kind='velocity'``, this must be a
list of exactly two column names (``[col1, col2]``).
title : str, optional
The title for the plot. If ``None``, a default is generated.
r_label : str, optional
Custom label for the radial axis. If ``None``, a default
is generated from the column names.
figsize : tuple of (float, float), default=(8, 8)
Figure size in inches.
cmap : str, default='viridis'
Matplotlib colormap name for the density ring.
alpha : float, default=0.8
Transparency level for the density ring.
cbar : bool, default=True
If ``True``, display a color bar representing the
normalized density.
acov : {'default', 'half_circle', 'quarter_circle',
'eighth_circle'}, default='default'
Angular coverage of the polar sector.
- ``'default'`` : full circle, :math:`2\pi` (360°)
- ``'half_circle'`` : :math:`\pi` (180°)
- ``'quarter_circle'`` : :math:`\pi/2` (90°)
- ``'eighth_circle'`` : :math:`\pi/4` (45°)
By convention, the radial density *ring* uses a full
360° sector. If a non-default value is provided, a short
warning is issued.
show_grid : bool, default=True
Toggle gridlines via the package helper ``set_axis_grid``.
grid_props : dict, optional
Keyword arguments passed to ``set_axis_grid`` for grid
customization.
mask_angle : bool, default=True
If ``False``, show the tick labels on the radial axis.
Defaults to ``True`` as the ring is primarily a
qualitative visual.
bandwidth : float, optional
The bandwidth for the Kernel Density Estimation. If ``None``,
it is estimated using Silverman's rule of thumb.
savefig : str, optional
If provided, save the figure to this path; otherwise the
plot is shown interactively.
dpi : int, default=300
Resolution for the saved figure.
Returns
-------
ax : matplotlib.axes.Axes or None
The Matplotlib Axes object containing the plot, or ``None``
if the plot could not be generated (e.g., due to empty data).
Notes
-----
The plot is generated through the following steps:
1. **Data Derivation**: A one-dimensional data vector,
:math:`\mathbf{x}`, is created from the ``target_cols``
based on the specified ``kind``.
2. **Kernel Density Estimation**: The probability density
function (PDF), :math:`\hat{f}_h(x)`, is estimated from
the data vector :math:`\mathbf{x}` using a Gaussian kernel.
This step is handled by ``scipy.stats.gaussian_kde``.
3. **Normalization for Coloring**: The estimated PDF is
normalized to the range ``[0, 1]`` to map its values to the
color map.
.. math::
\text{PDF}_{\text{norm}}(x) = \frac{\hat{f}_h(x)}{\max(\hat{f}_h)}
4. **Visualization**: The plot is rendered on polar axes using
``pcolormesh``. The radial axis corresponds to the value
domain :math:`x`, and the color at each radius corresponds
to :math:`\text{PDF}_{\text{norm}}(x)`. This color value is
repeated across all angles to form a ring.
Examples
--------
>>> import numpy as np
>>> import pandas as pd
>>> from kdiagram.plot.uncertainty import plot_radial_density_ring
>>>
>>> # Create synthetic data
>>> np.random.seed(42)
>>> n_samples = 500
>>> df_test = pd.DataFrame({
... 'q10_pred': np.random.normal(10, 2, n_samples),
... 'q90_pred': np.random.normal(30, 3, n_samples),
... })
>>> # Ensure upper bound is greater than lower bound
>>> df_test['q90_pred'] = df_test[['q10_pred', 'q90_pred']].max(axis=1) \
... + np.random.rand(n_samples)
>>>
>>> # Generate the plot for the distribution of interval widths
>>> ax = plot_radial_density_ring(
... df=df_test,
... kind="width",
... target_cols=["q10_pred", "q90_pred"],
... title="Distribution of Prediction Interval Width",
... r_label="Interval Width (mm)",
... cmap="magma",
... show_yticklabels=True
... )
"""
[docs]
@check_non_emptiness
@isdf
def plot_polar_quiver(
df: pd.DataFrame,
r_col: str,
theta_col: str,
u_col: str,
v_col: str,
*,
acov: Acov = "default",
color_col: str | None = None,
theta_period: float | None = None,
title: str | None = None,
figsize: tuple[float, float] = (8.0, 8.0),
cmap: str = "viridis",
mask_angle: bool = False,
mask_radius: bool = False,
show_grid: bool = True,
grid_props: dict[str, Any] | None = None,
savefig: str | None = None,
dpi: int = 300,
ax: Axes | None = None,
**quiver_kws,
) -> Axes | None:
# ---- required columns
req = [r_col, theta_col, u_col, v_col]
if color_col:
req.append(color_col)
exist_features(df, features=req)
# ---- clean data
data = df[req].dropna()
if data.empty:
warnings.warn(
"DataFrame empty after NaN drop in required "
"columns. Cannot plot.",
UserWarning,
stacklevel=2,
)
return None
# ---- extract arrays
r_data = data[r_col].to_numpy()
t_raw = data[theta_col].to_numpy()
u_data = data[u_col].to_numpy() # radial change
v_data = data[v_col].to_numpy() # angular change
# ---- axes & span
fig, ax, span = setup_polar_axes(
ax,
acov=acov,
figsize=figsize,
)
# ---- map theta to [0, span]
if theta_period is not None:
t_map = map_theta_to_span(
t_raw,
span=span,
theta_period=theta_period,
)
else:
tmin = float(t_raw.min())
tmax = float(t_raw.max())
if tmax > tmin + 1e-12:
t_map = map_theta_to_span(
t_raw,
span=span,
data_min=tmin,
data_max=tmax,
)
else:
t_map = np.zeros_like(t_raw)
# ---- color values
if color_col:
c_vals = data[color_col].to_numpy()
cbar_label = color_col
else:
# default color by vector magnitude
c_vals = np.sqrt(u_data**2 + v_data**2)
cbar_label = "Vector Magnitude"
# robust normalize
cmin = float(np.nanmin(c_vals))
cmax = float(np.nanmax(c_vals))
if not np.isfinite(cmin) or not np.isfinite(cmax):
cmin, cmax = 0.0, 1.0
if np.isclose(cmax - cmin, 0.0):
cmax = cmin + 1.0
norm = Normalize(vmin=cmin, vmax=cmax)
cmap_obj = get_cmap(cmap, default="viridis")
colors = cmap_obj(norm(c_vals))
# ---- draw quiver
# Matplotlib polar quiver expects:
# (theta, r, dtheta, dr)
ax.quiver(
t_map,
r_data,
v_data,
u_data,
color=colors,
**quiver_kws,
)
# ---- colorbar
sm = plt.cm.ScalarMappable(norm=norm, cmap=cmap_obj)
cbar = plt.colorbar(
sm,
ax=ax,
pad=0.1,
shrink=0.75,
)
cbar.set_label(cbar_label, fontsize=10)
# ---- labels, grid, masks
ax.set_title(title or f"Quiver Plot of ({u_col}, {v_col})")
set_axis_grid(
ax,
show_grid=show_grid,
grid_props=grid_props,
)
if mask_angle:
ax.set_xticklabels([])
if mask_radius:
ax.set_yticklabels([])
# ---- output
final = safe_savefig(
savefig,
fig,
dpi=dpi,
bbox_inches="tight",
)
if final is not None:
plt.close(fig)
else:
fig.tight_layout()
plt.show()
return ax
plot_polar_quiver.__doc__ = r"""
Plot a polar quiver plot to visualize vector data.
This function draws arrows (vectors) on a polar grid. Each arrow's
origin, direction, and magnitude are determined by the data. This is
highly effective for visualizing dynamic phenomena like forecast
revisions, error vectors, or flow fields :footcite:p:`2020SciPy-NMeth`.
Parameters
----------
df : pd.DataFrame
The input DataFrame containing the vector data.
r_col : str
Name of the column for the **radial position** of the vector's
origin (tail).
theta_col : str
Name of the column for the **angular position** of the vector's
origin.
u_col : str
Name of the column for the **radial component** of the vector
(the change in radius).
v_col : str
Name of the column for the **tangential component** of the vector
(the change in angle).
acov : {'default', 'half_circle', 'quarter_circle', 'eighth_circle'}, \
default='default'
Angular **coverage** of the plot:
- ``'default'``: full circle, :math:`2\pi`
- ``'half_circle'``: :math:`\pi`
- ``'quarter_circle'``: :math:`\pi/2`
- ``'eighth_circle'``: :math:`\pi/4`
(Value is case-insensitive; invalid values fall back to full circle
with a warning.)
color_col : str, optional
Name of a column to use for coloring the arrows. If ``None``,
arrows are colored by their total vector magnitude.
theta_period : float, optional
The period of the cyclical data in ``theta_col``. For example,
if ``theta_col`` represents degrees in a circle, the period is
360. If provided, the angle is mapped to ``[0, 2*pi]`` based
on this cycle.
title : str, optional
The title for the plot. If ``None``, a default is generated.
figsize : tuple of (float, float), default=(8, 8)
Figure size in inches.
cmap : str, default='viridis'
Matplotlib colormap for coloring the arrows.
show_grid : bool, default=True
Toggle gridlines via the package helper ``set_axis_grid``.
grid_props : dict, optional
Keyword arguments passed to ``set_axis_grid`` for grid
customization.
mask_angle : bool, default=False
If ``True``, hide the angular tick labels (e.g., degrees).
mask_radius : bool, default=False
If ``True``, hide the radial tick labels.
savefig : str, optional
If provided, save the figure to this path; otherwise the
plot is shown interactively.
dpi : int, default=300
Resolution for the saved figure.
**quiver_kws : dict, optional
Additional keyword arguments passed to the ``ax.quiver`` call
(e.g., ``scale``, ``width``, ``headwidth``).
Returns
-------
ax : matplotlib.axes.Axes or None
The Matplotlib Axes object containing the plot, or ``None``
if the plot could not be generated.
Notes
-----
The plot visualizes a vector field on a polar coordinate system. For
each data point :math:`i`, a vector is drawn footcite:p:`Hunter2007`.
1. **Vector Origin**: The tail of the vector is positioned at the
polar coordinates :math:`(r_i, \theta_i)` specified by ``r_col``
and ``theta_col``.
2. **Vector Components**: The vector itself is defined by its
components in the radial and tangential directions:
- :math:`u_i` (from ``u_col``) is the component in the radial
direction (pointing away from the center).
- :math:`v_i` (from ``v_col``) is the component in the tangential
direction (perpendicular to the radial line).
3. **Visualization**: Matplotlib's polar ``quiver`` function places an
arrow at each origin point :math:`(r_i, \theta_i)`. The arrow's
length and orientation are determined by the vector
:math:`(u_i, v_i)`.
4. **Coloring**: The color of each arrow can be determined by a
separate metric from ``color_col``. If not specified, it defaults
to the vector's Euclidean magnitude, :math:`M_i`.
.. math::
M_i = \sqrt{u_i^2 + v_i^2}
Examples
--------
>>> import numpy as np
>>> import pandas as pd
>>> from kdiagram.plot.uncertainty import plot_polar_quiver
>>>
>>> # Simulate forecast revisions for 50 spatial locations
>>> np.random.seed(0)
>>> n_points = 50
>>> locations = np.linspace(0, 360, n_points, endpoint=False)
>>> initial_forecast = 10 + 5 * np.sin(np.deg2rad(locations) * 3)
>>>
>>> # Simulate revisions (changes in value)
>>> radial_change = np.random.normal(0, 1.5, n_points)
>>> tangential_change = np.random.normal(0, 0.1, n_points)
>>>
>>> df_forecasts = pd.DataFrame({
... 'location_angle': locations,
... 'initial_value': initial_forecast,
... 'update_radial': radial_change,
... 'update_tangential': tangential_change,
... })
>>>
>>> # Generate the polar quiver plot
>>> ax = plot_polar_quiver(
... df=df_forecasts,
... r_col='initial_value',
... theta_col='location_angle',
... u_col='update_radial',
... v_col='update_tangential',
... theta_period=360,
... title='Forecast Revisions for Spatial Locations',
... cmap='coolwarm',
... scale=25 # Adjusts arrow size for better visibility
... )
References
----------
.. footbibliography::
"""
[docs]
@check_non_emptiness
@isdf
def plot_polar_heatmap(
df: pd.DataFrame,
r_col: str,
theta_col: str,
*,
acov: Acov = "default",
theta_period: float | None = None,
r_bins: int = 20,
theta_bins: int = 36,
statistic: str = "count",
cbar_label: str | None = None,
title: str | None = None,
figsize: tuple[float, float] = (8, 8),
cmap: str = "viridis",
mask_angle: bool = False,
mask_radius: bool = False,
show_grid: bool = True,
grid_props: dict[str, Any] | None = None,
savefig: str | None = None,
dpi: int = 300,
ax: Axes | None = None,
) -> Axes | None:
# ---- Validate required columns & drop NaNs
exist_features(df, features=[r_col, theta_col])
data = df[[r_col, theta_col]].dropna()
if data.empty:
warnings.warn(
"DataFrame is empty after dropping NaNs"
" in '{r_col}'/'{theta_col}'. Cannot plot.",
UserWarning,
stacklevel=2,
)
return None
# ---- Extract arrays
r_data = data[r_col].to_numpy()
theta_raw = data[theta_col].to_numpy()
# ---- Axes & angular span (radians)
fig, ax, span = setup_polar_axes(ax, acov=acov, figsize=figsize)
# ---- Map theta values into [0, span]
if theta_period is not None:
theta_mapped = map_theta_to_span(
theta_raw, span=span, theta_period=theta_period
)
else:
tmin, tmax = float(theta_raw.min()), float(theta_raw.max())
if tmax > tmin + 1e-12:
theta_mapped = map_theta_to_span(
theta_raw, span=span, data_min=tmin, data_max=tmax
)
else:
theta_mapped = np.zeros_like(theta_raw)
# ---- Binning
if statistic.lower() != "count":
warnings.warn(
f"statistic='{statistic}' not implemented;"
" falling back to 'count'.",
UserWarning,
stacklevel=2,
)
r_edges = np.linspace(r_data.min(), r_data.max(), int(r_bins) + 1)
theta_edges = np.linspace(0.0, float(span), int(theta_bins) + 1)
counts, _, _ = np.histogram2d(
x=r_data,
y=theta_mapped,
bins=[r_edges, theta_edges],
)
# Note: np.histogram2d returns shape (len(x_bins)-1, len(y_bins)-1).
# Our mesh expects R(len(r_edges)) x T(len(theta_edges)) edges.
# ---- Draw
cmap_obj = get_cmap(cmap, default="viridis")
ax.grid(False)
T, R = np.meshgrid(theta_edges, r_edges)
pcm = ax.pcolormesh(T, R, counts, cmap=cmap_obj, shading="auto")
# ---- Colorbar
cbar = plt.colorbar(pcm, ax=ax, pad=0.1, shrink=0.75)
cbar.set_label(cbar_label or statistic.capitalize(), fontsize=10)
# ---- Labels, grid, masks
ax.set_title(title or f"Density of {r_col} vs. {theta_col}")
set_axis_grid(ax, show_grid=show_grid, grid_props=grid_props)
if mask_angle:
ax.set_xticklabels([])
if mask_radius:
ax.set_yticklabels([])
# ---- Output
final = safe_savefig(
savefig,
fig,
dpi=dpi,
bbox_inches="tight",
)
if final is not None:
plt.close(fig)
else:
fig.tight_layout()
plt.show()
return ax
def _calculate_kde_for_ring(
data: np.ndarray, bandwidth: float | None = None
) -> tuple[np.ndarray, np.ndarray]:
"""
Internal helper to compute KDE. In a package, this
should be a shared utility.
"""
data = data[np.isfinite(data)]
if data.size == 0:
return np.array([]), np.array([])
lo, hi = np.min(data), np.max(data)
pad = 0.05 * (hi - lo) if (hi - lo) > 0 else 0.5
grid = np.linspace(lo - pad, hi + pad, 512)
if bandwidth is None:
std = np.std(data, ddof=1) if data.size > 1 else 1.0
# Silverman's rule of thumb
bw_val = 1.06 * std * (data.size ** (-1 / 5)) if std > 0 else 1.0
else:
bw_val = bandwidth
std_dev = np.std(data, ddof=1)
bw_method = bw_val / std_dev if std_dev > 0 else "silverman"
kde = gaussian_kde(data, bw_method=bw_method)
pdf = kde(grid)
return grid, pdf
plot_polar_heatmap.__doc__ = r"""
Plot a polar heatmap to visualize a 2D density distribution.
This function creates a heatmap on a polar grid to show the
concentration of data points based on two variables: one mapped to
the **radius** and another to the **angle**. It is highly effective
for identifying patterns and correlations between a linear metric and
a cyclical or ordered feature.
Parameters
----------
df : pd.DataFrame
The input DataFrame containing the data.
r_col : str
Name of the column to be used for the radial coordinate.
theta_col : str
Name of the column to be used for the angular coordinate.
acov : {'default', 'half_circle', 'quarter_circle', 'eighth_circle'}, \
default='default'
Angular **coverage** of the plot:
- ``'default'``: full circle, :math:`2\pi`
- ``'half_circle'``: :math:`\pi`
- ``'quarter_circle'``: :math:`\pi/2`
- ``'eighth_circle'``: :math:`\pi/4`
(Value is case-insensitive; invalid values fall back to full circle
with a warning.)
theta_period : float, optional
The period of the cyclical data in ``theta_col``. For example,
if ``theta_col`` represents the hour of the day, ``theta_period``
should be 24. If provided, the angle is mapped to ``[0, 2*pi]``
based on this cycle. If ``None``, the angle is scaled from the
data's min/max values.
r_bins : int, default=20
The number of bins to create along the radial axis.
theta_bins : int, default=36
The number of bins to create around the angular axis.
statistic : str, default='count'
The statistic to compute in each bin. Currently, only 'count'
is supported, which corresponds to a 2D histogram.
cbar_label : str, optional
Custom label for the color bar. If ``None``, it defaults to
the value of ``statistic``.
title : str, optional
The title for the plot. If ``None``, a default is generated.
figsize : tuple of (float, float), default=(8, 8)
Figure size in inches.
cmap : str, default='viridis'
Matplotlib colormap for the heatmap.
mask_angle : bool, default=False
If ``True``, hide the angular tick labels (e.g., degrees).
mask_radius : bool, default=False
If ``True``, hide the radial tick labels.
show_grid : bool, default=True
Toggle gridlines via the package helper ``set_axis_grid``.
grid_props : dict, optional
Keyword arguments passed to ``set_axis_grid`` for grid
customization.
savefig : str, optional
If provided, save the figure to this path; otherwise the
plot is shown interactively.
dpi : int, default=300
Resolution for the saved figure.
Returns
-------
ax : matplotlib.axes.Axes or None
The Matplotlib Axes object containing the plot, or ``None``
if the plot could not be generated.
Notes
-----
The heatmap is constructed by binning the 2D polar space and
counting the number of data points that fall into each bin.
1. **Coordinate Mapping**:
- The radial data from ``r_col`` is used directly.
- The angular data from ``theta_col``, :math:`\theta_{data}`, is
converted to radians in the range :math:`[0, 2\pi]`. If
``theta_period`` (:math:`P`) is given, the mapping is:
.. math::
\theta_{rad} = \left( \frac{\theta_{data} \pmod P}{P} \right) \cdot 2\pi
Otherwise, it is linearly scaled from the data's range.
2. **Binning**: The data space is divided into a grid of polar
bins defined by ``r_bins`` and ``theta_bins``. A 2D histogram
is then computed, where the value of each cell :math:`C_{ij}`
is the number of data points :math:`(r, \theta)` that fall
within the radial and angular bounds of that cell.
3. **Visualization**: The resulting count matrix :math:`\mathbf{C}`
is visualized using ``pcolormesh``, where the color of each
cell corresponds to its count, creating the heatmap effect.
See :footcite:p:`Hunter:2007` for Matplotlib details.
See Also
--------
matplotlib.pyplot.pcolormesh :
Draw a pseudocolor plot with a nonregular rectangular grid.
numpy.histogram2d :
Compute the bi-dimensional histogram of two data samples.
Examples
--------
>>> import numpy as np
>>> import pandas as pd
>>> from kdiagram.plot.uncertainty import plot_polar_heatmap
>>>
>>> # Simulate weather data with a daily cycle
>>> np.random.seed(42)
>>> n_points = 5000
>>> # Simulate hour of day with more events in the afternoon
>>> hour = np.concatenate([
... np.random.normal(15, 2, int(n_points * 0.7)),
... np.random.normal(5, 2, int(n_points * 0.3))
... ]) % 24
>>> # Simulate rainfall, correlated with afternoon hours
>>> rainfall = np.random.gamma(2, 5, n_points) + \
... (hour > 12) * np.random.gamma(3, 5, n_points)
>>>
>>> df_weather = pd.DataFrame({'hour': hour, 'rainfall_mm': rainfall})
>>>
>>> # Generate the polar heatmap
>>> ax = plot_polar_heatmap(
... df=df_weather,
... r_col='rainfall_mm',
... theta_col='hour',
... theta_period=24,
... r_bins=25,
... theta_bins=24,
... cmap='plasma',
... title='Rainfall Intensity vs. Hour of Day',
... cbar_label='Event Count'
... )
References
----------
.. footbibliography::
"""
class PerformanceWarning(Warning):
pass
# Define InternalError for consistency if needed
class InternalError(Exception):
pass