# Source code for kdiagram.datasets.make

# Author: LKouadio <etanoyau@gmail.com>
# License: Apache License 2.0 (see LICENSE file)

"""
Dataset Generation Utilities (:mod:`kdiagram.datasets.make`)
============================================================

This module provides functions to create synthetic datasets tailored
for demonstrating and testing the various plotting functions within
the `k-diagram` package, particularly those focused on uncertainty.
"""
from __future__ import annotations

import textwrap
import warnings

import numpy as np
import pandas as pd

from ..api.bunch import Bunch

# Names exported by ``from kdiagram.datasets.make import *``.
# NOTE(review): ``make_fingerprint_data`` is defined in this module but is
# not listed here -- confirm whether the omission is intentional.
__all__ = [
    "make_uncertainty_data",
    "make_taylor_data",
    "make_multi_model_quantile_data",
    "make_cyclical_data",
]


def make_cyclical_data(
    n_samples: int = 365,
    n_series: int = 2,
    cycle_period: float = 365,
    noise_level: float = 0.5,
    amplitude_true: float = 10.0,
    offset_true: float = 20.0,
    pred_bias: float | list[float] | None = None,
    pred_noise_factor: float | list[float] | None = None,
    pred_amplitude_factor: float | list[float] | None = None,
    pred_phase_shift: float | list[float] | None = None,
    prefix: str = "model",
    series_names: list[str] | None = None,
    seed: int | None = 404,
    as_frame: bool = False,
) -> Bunch | pd.DataFrame:
    # Builds a noisy sine-wave "truth" plus ``n_series`` prediction series
    # that differ from it in bias, amplitude, phase and noise. Returns a
    # DataFrame when ``as_frame`` is true, otherwise a Bunch with metadata.
    # Raises ValueError on list-length mismatches and TypeError when a
    # prediction parameter is neither a number nor a list.

    # --- Input Validation & Setup ---
    # default_rng(None) draws fresh entropy, so no branching on seed needed.
    rng = np.random.default_rng(seed)

    # Reference patterns used when a parameter is left as None. They are
    # cycled out to ``n_series`` entries so the defaults work for any number
    # of series; for the default n_series=2 they are used exactly as written.
    default_patterns = {
        "pred_bias": [0, 1.5],
        "pred_noise_factor": [1.0, 1.5],
        "pred_amplitude_factor": [1.0, 0.8],
        "pred_phase_shift": [0, np.pi / 6],
    }
    supplied = {
        "pred_bias": pred_bias,
        "pred_noise_factor": pred_noise_factor,
        "pred_amplitude_factor": pred_amplitude_factor,
        "pred_phase_shift": pred_phase_shift,
    }

    # Normalise every prediction parameter to a list of length n_series.
    processed_params = {}
    for name, value in supplied.items():
        if value is None:
            pattern = default_patterns[name]
            processed_params[name] = [
                pattern[i % len(pattern)] for i in range(n_series)
            ]
        elif isinstance(value, (int, float)):
            processed_params[name] = [value] * n_series
        elif isinstance(value, list):
            if len(value) != n_series:
                raise ValueError(
                    f"Length of '{name}' ({len(value)}) must match "
                    f"n_series ({n_series})."
                )
            processed_params[name] = value
        else:
            raise TypeError(f"'{name}' must be float or list of floats.")

    # --- Generate Time Step and True Signal ---
    time_step = np.arange(n_samples)
    omega = 2 * np.pi / cycle_period  # angular frequency per sample
    theta = omega * time_step

    y_true = (
        offset_true
        + amplitude_true * np.sin(theta)
        + rng.normal(0, noise_level, n_samples)
    )
    data_dict = {"time_step": time_step, "y_true": y_true}

    # --- Generate Model Names & Prediction Columns ---
    if series_names is None:
        series_names_list = [f"{prefix}_{chr(65+i)}" for i in range(n_series)]
    elif len(series_names) != n_series:
        raise ValueError(
            f"Length of series_names ({len(series_names)}) must "
            f"match n_series ({n_series})."
        )
    else:
        series_names_list = list(series_names)

    prediction_cols_list = []
    for i, col_name in enumerate(series_names_list):
        prediction_cols_list.append(col_name)

        amp = amplitude_true * processed_params["pred_amplitude_factor"][i]
        bias = processed_params["pred_bias"][i]
        noise = noise_level * processed_params["pred_noise_factor"][i]
        phase = processed_params["pred_phase_shift"][i]

        # One noise draw per series, in series order, keeps seeded output
        # reproducible.
        data_dict[col_name] = (
            offset_true
            + bias
            + amp * np.sin(theta + phase)
            + rng.normal(0, noise, n_samples)
        )

    # --- Create DataFrame ---
    df = pd.DataFrame(data_dict)
    feature_names = ["time_step"]
    target_name = ["y_true"]
    ordered_cols = target_name + feature_names + prediction_cols_list

    # --- Return based on as_frame ---
    if as_frame:
        return df[ordered_cols]

    descr = textwrap.dedent(
        f"""\
        Synthetic Cyclical Pattern Data for k-diagram

        **Description:**
        Simulates a dataset with a primary 'true' cyclical signal and
        {n_series} related prediction series over {n_samples} time steps.
        The true signal is a sine wave with added noise. Prediction series
        are generated based on the true signal but may include systematic
        bias, different amplitude scaling, phase shifts (lag/lead), and
        varying noise levels, according to the specified parameters.

        **Generation Parameters:**
        - n_samples             : {n_samples}
        - n_series              : {n_series}
        - cycle_period          : {cycle_period}
        - noise_level           : {noise_level:.2f} (base for y_true)
        - amplitude_true        : {amplitude_true:.2f}
        - offset_true           : {offset_true:.2f}
        - pred_bias             : {processed_params['pred_bias']}
        - pred_noise_factor     : {processed_params['pred_noise_factor']}
        - pred_amplitude_factor : {processed_params['pred_amplitude_factor']}
        - pred_phase_shift      : {processed_params['pred_phase_shift']} (radians)
        - prefix                : '{prefix}'
        - seed                  : {seed}

        **Data Structure (Bunch object):**
        - frame             : Complete pandas DataFrame.
        - feature_names     : List of feature column names (['time_step']).
        - target_names      : List containing the target column name (['y_true']).
        - target            : NumPy array of 'y_true' values.
        - series_names      : List of prediction series names.
        - prediction_columns: List of prediction column names in the frame.
        - DESCR             : This description.

        This dataset is suitable for visualizing relationships or temporal
        patterns in a polar context using functions like plot_relationship
        or plot_temporal_uncertainty.
        """
    )

    # Build arrays with a uniform dtype to avoid pandas -> np.find_common_type
    num_cols = feature_names + prediction_cols_list
    target_array = df[target_name[0]].to_numpy(dtype=np.float64, copy=True)
    data_array = df[num_cols].to_numpy(dtype=np.float64, copy=True)

    return Bunch(
        frame=df[ordered_cols],
        data=data_array,
        feature_names=feature_names,
        target_names=target_name,
        target=target_array,
        series_names=series_names_list,
        prediction_columns=prediction_cols_list,
        DESCR=descr,
    )
make_cyclical_data.__doc__ = r"""
Generate synthetic cyclical data for relationship and temporal plots.

Creates a dataset with a single **true** cyclical signal and one or
more **prediction** series that can differ in amplitude, phase, bias,
and noise relative to the truth. This is useful for demos of polar
relationship and temporal-uncertainty plots in `k-diagram`
:footcite:p:`harris2020array, 2020SciPy-NMeth, Hunter:2007`.

This data is useful for demonstrating and testing functions like
:func:`~kdiagram.plot.relationship.plot_relationship` or
:func:`~kdiagram.plot.uncertainty.plot_temporal_uncertainty` where
visualizing behavior over a cycle is important.

Parameters
----------
n_samples : int, default=365
    Number of time steps to generate. Interpreted as evenly spaced
    samples over one or more cycles.
n_series : int, default=2
    Number of simulated prediction series (e.g., models).
cycle_period : float, default=365
    Samples per full cycle :math:`P`. The angular frequency is
    :math:`\omega = 2\pi / P`. Use ``365`` for daily data over one
    year, ``12`` for monthly data over one year, etc.
noise_level : float, default=0.5
    Standard deviation of Gaussian noise added to the **true**
    signal. Prediction series scale this by ``pred_noise_factor``.
amplitude_true : float, default=10.0
    Amplitude of the sinusoidal **true** signal.
offset_true : float, default=20.0
    Vertical offset (mean level) of the **true** signal.
pred_bias : float or list of float, optional
    Additive bias for each prediction series. If a scalar is provided
    it is broadcast to all ``n_series``. If a list is provided, its
    length must equal ``n_series``. Defaults to ``[0.0, 1.5]`` when
    ``None``.
pred_noise_factor : float or list of float, optional
    Multiplier for ``noise_level`` per series. Scalar values are
    broadcast; lists must match ``n_series`` in length. Defaults to
    ``[1.0, 1.5]`` when ``None``.
pred_amplitude_factor : float or list of float, optional
    Multiplier of ``amplitude_true`` per series (allows under/
    over-estimation of the cycle amplitude). Scalar broadcast is
    supported. Defaults to ``[1.0, 0.8]`` when ``None``.
pred_phase_shift : float or list of float, optional
    Phase shift (radians) added to each series inside the sine, i.e.
    :math:`\sin(\omega t + \phi)`. Positive values make the series
    peak earlier than the truth (a phase lead); negative values
    produce a lag. Scalar broadcast is supported. Defaults to
    ``[0.0, np.pi / 6]`` when ``None``.
prefix : str, default='model'
    Prefix used to generate prediction column names, e.g.,
    ``model_A``, ``model_B``, …
series_names : list of str, optional
    Explicit names for prediction columns. If omitted, names are
    generated from ``prefix`` as ``prefix_A``, ``prefix_B``, … Must
    have length ``n_series`` if provided.
seed : int or None, default=404
    Seed for NumPy’s random generator. If ``None``, a fresh RNG is
    used.
as_frame : bool, default=False
    If ``False``, return a :class:`~kdiagram.bunch.Bunch` with
    metadata and arrays. If ``True``, return only the pandas
    ``DataFrame``.

Returns
-------
data : :class:`~kdiagram.bunch.Bunch` or pandas.DataFrame
    If ``as_frame=False`` (default), a Bunch with:

    - ``frame`` : pandas ``DataFrame`` containing ``'y_true'``,
      ``'time_step'``, and the prediction columns (in that order).
    - ``data`` : ``float64`` ``ndarray`` holding ``'time_step'``
      followed by the prediction columns.
    - ``feature_names`` : ``['time_step']``.
    - ``target_names`` : ``['y_true']``.
    - ``target`` : ``ndarray`` of shape ``(n_samples,)`` with the
      true signal.
    - ``series_names`` : list of prediction series names.
    - ``prediction_columns`` : list of prediction column names.
    - ``DESCR`` : human-readable description.

    If ``as_frame=True``, only the pandas ``DataFrame`` is returned.

Raises
------
ValueError
    If a provided list for prediction parameters does not match
    ``n_series`` in length.
TypeError
    If prediction parameters are not float or list of float.

Notes
-----
**Signal model.** Let :math:`P` be the cycle period and
:math:`\omega = 2\pi/P`. The **true** signal at time step
:math:`t \in \{0,\dots,n\_samples-1\}` is

.. math::

   y_{\text{true}}(t) \;=\; \texttt{offset\_true} \;+\;
   \texttt{amplitude\_true}\,\sin(\omega t) \;+\; \varepsilon_t,
   \qquad \varepsilon_t \sim \mathcal{N}(0,\sigma^2),
   \;\; \sigma=\texttt{noise\_level}.

For series :math:`k=1,\dots,n\_{\text{series}}`, the prediction is

.. math::

   y_{\text{pred}}^{(k)}(t) \;=\; \texttt{offset\_true} \;+\; b_k
   \;+\; \big(\texttt{amplitude\_true}\,\alpha_k\big)
   \sin(\omega t + \phi_k) \;+\; \eta^{(k)}_t,

with :math:`\eta^{(k)}_t \sim \mathcal{N}\!\big(0,\,
(\sigma\,\gamma_k)^2\big)`. Here :math:`b_k` is the bias
(``pred_bias``), :math:`\alpha_k` the amplitude factor
(``pred_amplitude_factor``), :math:`\phi_k` the phase shift
(``pred_phase_shift``), and :math:`\gamma_k` the noise factor
(``pred_noise_factor``).

Numerical generation and plotting typically rely on array/scientific
and graphics stacks
:footcite:p:`harris2020array, 2020SciPy-NMeth, Hunter:2007`.

See Also
--------
kdiagram.plot.relationship.plot_relationship
    Polar relationship scatter for true vs. predictions.
kdiagram.plot.uncertainty.plot_temporal_uncertainty
    General-purpose polar series plot; useful for Q10/Q50/Q90 and
    cyclical visualizations.

Examples
--------
Generate a small cyclical dataset as a Bunch:

>>> from kdiagram.datasets import make_cyclical_data
>>> ds = make_cyclical_data(
...     n_samples=24, n_series=2, cycle_period=12, seed=7
... )
>>> ds.frame.head().columns.tolist()[:3]
['y_true', 'time_step', 'model_A']

Return only a DataFrame and supply custom names (scalars are
broadcast to every series, lists must have ``n_series`` entries):

>>> df = make_cyclical_data(
...     n_samples=50,
...     n_series=3,
...     series_names=['A', 'B', 'C'],
...     pred_bias=0.0,
...     pred_noise_factor=1.0,
...     pred_amplitude_factor=1.0,
...     pred_phase_shift=[0.0, 0.3, 0.6],
...     as_frame=True,
...     seed=1,
... )
>>> set(['time_step', 'y_true']).issubset(df.columns)
True

References
----------
.. footbibliography::
"""
def make_fingerprint_data(
    n_layers: int = 3,
    n_features: int = 8,
    layer_names: list[str] | None = None,
    feature_names: list[str] | None = None,
    value_range: tuple[float, float] = (0.0, 1.0),
    sparsity: float = 0.1,
    add_structure: bool = True,
    seed: int | None = 303,
    as_frame: bool = False,
) -> Bunch | pd.DataFrame:
    # Builds an (n_layers x n_features) matrix of uniform random importance
    # scores, optionally emphasises/de-emphasises one feature per layer, and
    # zeroes a random ``sparsity`` fraction of entries. Returns a DataFrame
    # (layers as index) when ``as_frame`` is true, otherwise a Bunch.
    # Raises ValueError for an out-of-range sparsity, a malformed
    # value_range, or name lists whose length does not match the dimensions.

    # --- Input Validation & Setup ---
    if not (0.0 <= sparsity <= 1.0):
        raise ValueError("sparsity must be between 0.0 and 1.0")
    if not (
        isinstance(value_range, tuple)
        and len(value_range) == 2
        and value_range[0] <= value_range[1]
    ):
        raise ValueError(
            "value_range must be a tuple (min, max)" " with min <= max."
        )

    # default_rng(None) draws fresh entropy, so no branching on seed needed.
    rng = np.random.default_rng(seed)

    # Generate names if needed
    if feature_names is None:
        feature_names = [f"Feature_{i+1}" for i in range(n_features)]
    elif len(feature_names) != n_features:
        raise ValueError(
            f"Length of feature_names ({len(feature_names)}) "
            f"must match n_features ({n_features})."
        )

    if layer_names is None:
        layer_names = [f"Layer_{chr(65+i)}" for i in range(n_layers)]
    elif len(layer_names) != n_layers:
        raise ValueError(
            f"Length of layer_names ({len(layer_names)}) "
            f"must match n_layers ({n_layers})."
        )

    # --- Generate Importance Matrix ---
    min_val, max_val = value_range
    importances = rng.uniform(min_val, max_val, size=(n_layers, n_features))

    # Add optional structure
    if add_structure and n_layers > 1 and n_features > 1:
        for i in range(n_layers):
            # Layer i emphasises feature i (cycling through the features).
            emphasized_feature = i % n_features
            importances[i, emphasized_feature] = rng.uniform(
                (min_val + max_val) / 1.5,  # Emphasize higher values
                max_val * 1.1,  # Allow slightly exceeding max
            )
            # De-emphasise the feature half a cycle away, unless it is the
            # one just emphasised.
            deemphasized_feature = (i + n_features // 2) % n_features
            if deemphasized_feature != emphasized_feature:
                importances[i, deemphasized_feature] = rng.uniform(
                    min_val * 0.9,  # Allow slightly below min
                    (min_val + max_val) / 2.5,  # Emphasize lower values
                )

        # Keep the structured values within a 20% margin around the range.
        # A plain ``min_val * 0.8`` / ``max_val * 1.2`` moves the bound
        # *into* the requested range when the bound is negative, which
        # would truncate legitimately sampled values; min()/max() pick the
        # outward side whatever the sign and leave non-negative ranges
        # exactly as before.
        # NOTE(review): the emphasis/de-emphasis expressions above are still
        # scaled for non-negative ranges.
        lower_bound = min(min_val * 0.8, min_val * 1.2)
        upper_bound = max(max_val * 0.8, max_val * 1.2)
        importances = np.clip(importances, lower_bound, upper_bound)

    # Introduce sparsity
    if sparsity > 0:
        mask = rng.choice(
            [0, 1], size=importances.shape, p=[sparsity, 1 - sparsity]
        )
        importances *= mask

    # --- Assemble DataFrame ---
    df = pd.DataFrame(importances, index=layer_names, columns=feature_names)

    # --- Return based on as_frame ---
    if as_frame:
        return df

    descr = textwrap.dedent(
        f"""\
        Synthetic Feature Fingerprint Data

        **Description:**
        Simulated feature importance matrix for {n_layers} layers/groups
        and {n_features} features. Values were sampled uniformly from the
        range {value_range} and approximately {sparsity*100:.0f}% were
        randomly set to zero (sparsity).{' Some basic structure was added.' if add_structure else ''}
        This dataset is suitable for use with plot_feature_fingerprint.

        **Generation Parameters:**
        - n_layers       : {n_layers}
        - n_features     : {n_features}
        - value_range    : {value_range}
        - sparsity       : {sparsity:.2f}
        - add_structure  : {add_structure}
        - seed           : {seed}

        **Contents (Bunch object):**
        - importances    : NumPy array ({n_layers}, {n_features}) with scores.
        - frame          : Pandas DataFrame view of importances matrix.
        - layer_names    : List of {n_layers} layer names (index).
        - feature_names  : List of {n_features} feature names (columns).
        - DESCR          : This description.
        """
    )
    return Bunch(
        importances=importances,
        frame=df,
        layer_names=list(layer_names),
        feature_names=list(feature_names),
        DESCR=descr,
    )
make_fingerprint_data.__doc__ = r"""
Generate synthetic feature-importance data for fingerprint plots.

Creates a matrix of feature-importance scores across multiple
**layers** (e.g., models, periods, experimental groups) suitable for
visualization with
:func:`~kdiagram.plot.feature_based.plot_feature_fingerprint`.
This is handy for comparing profiles in a compact polar radar view
and for testing feature-comparison workflows in forecasting and ML
:footcite:p:`scikit-learn, Lim2021, kouadiob2025`.

Parameters
----------
n_layers : int, default=3
    Number of rows (layers) in the importance matrix. Each row
    represents a group such as a model or time period.
n_features : int, default=8
    Number of columns (features) in the importance matrix.
layer_names : list of str, optional
    Names for the layers. If ``None``, generic names like
    ``'Layer_A'``, ``'Layer_B'`` are generated. Must have length
    ``n_layers`` if provided.
feature_names : list of str, optional
    Names for the features. If ``None``, generic names like
    ``'Feature_1'``, ``'Feature_2'`` are generated. Must have length
    ``n_features`` if provided.
value_range : tuple of (float, float), default=(0.0, 1.0)
    Approximate sampling range ``(min_val, max_val)`` for raw
    importance scores. Values are drawn from a uniform distribution
    before structure/sparsity are applied.
sparsity : float, default=0.1
    Fraction in ``[0, 1]`` of entries that are set to zero at random,
    simulating unimportant features for some layers.
add_structure : bool, default=True
    If ``True``, inject simple patterns to make fingerprints
    distinct, e.g., emphasizing one feature per layer and
    de-emphasizing another. If ``False``, the matrix is fully random
    apart from sparsity.
seed : int or None, default=303
    Seed for NumPy’s random generator. If ``None``, a fresh RNG is
    used.
as_frame : bool, default=False
    If ``False``, return a :class:`~kdiagram.bunch.Bunch` with
    metadata and arrays. If ``True``, return only the pandas
    ``DataFrame`` indexed by layers with feature columns.

Returns
-------
data : :class:`~kdiagram.bunch.Bunch` or pandas.DataFrame
    If ``as_frame=False`` (default), a Bunch with:

    - ``importances`` : ``ndarray`` of shape
      ``(n_layers, n_features)``.
    - ``frame`` : pandas ``DataFrame`` view of the matrix with layers
      as index and features as columns.
    - ``layer_names`` : list of layer names.
    - ``feature_names`` : list of feature names.
    - ``DESCR`` : human-readable description.

    If ``as_frame=True``, only the pandas ``DataFrame`` is returned.

Raises
------
ValueError
    If ``layer_names`` or ``feature_names`` lengths do not match the
    specified dimensions, if ``sparsity`` is outside ``[0, 1]``, or
    if ``value_range`` does not satisfy ``min_val <= max_val``.

Notes
-----
**Generation model.** Let :math:`I \in \mathbb{R}^{L \times F}`
denote the importance matrix with :math:`L = \texttt{n\_layers}` and
:math:`F = \texttt{n\_features}`. Raw scores are sampled as

.. math::

   I_{k,j}^{(0)} \sim \mathcal{U}(m, M), \qquad
   m = \texttt{value\_range[0]},\; M = \texttt{value\_range[1]}.

If structure is enabled, a layer-specific emphasis and de-emphasis
may be applied, producing :math:`I^{(1)}`. Finally, a sparsity mask
:math:`\;B_{k,j} \sim \text{Bernoulli}(1-s)\;` with
:math:`s=\texttt{sparsity}` is applied:

.. math::

   I_{k,j} \;=\; I_{k,j}^{(1)} \cdot B_{k,j}.

Scores are left in their original scale; you may normalize per-layer
or per-feature downstream if desired. For practical
feature-importance workflows and attribution in forecasting, see
:footcite:t:`scikit-learn` and :footcite:t:`Lim2021`. The fingerprint
visualization concept is part of our polar analytics framework
:footcite:t:`kouadiob2025`.

See Also
--------
kdiagram.plot.feature_based.plot_feature_fingerprint
    Radar-style comparison of multi-feature profiles across layers.

Examples
--------
Return a Bunch with arrays and a DataFrame view:

>>> from kdiagram.datasets import make_fingerprint_data
>>> fp = make_fingerprint_data(n_layers=4, n_features=10, seed=1)
>>> fp.importances.shape
(4, 10)
>>> list(fp.frame.index)[:2], list(fp.frame.columns)[:3]
(['Layer_A', 'Layer_B'], ['Feature_1', 'Feature_2', 'Feature_3'])

Return only a DataFrame with custom names:

>>> df = make_fingerprint_data(
...     n_layers=3,
...     n_features=5,
...     layer_names=['L1', 'L2', 'L3'],
...     feature_names=['f1', 'f2', 'f3', 'f4', 'f5'],
...     as_frame=True,
...     seed=2,
... )
>>> df.shape
(3, 5)

References
----------
.. footbibliography::
"""
def make_uncertainty_data(
    n_samples: int = 150,
    n_periods: int = 4,
    anomaly_frac: float = 0.15,
    start_year: int = 2022,
    prefix: str = "value",
    base_value: float = 10.0,
    trend_strength: float = 1.5,
    noise_level: float = 2.0,
    interval_width_base: float = 4.0,
    interval_width_noise: float = 1.5,
    interval_width_trend: float = 0.5,
    seed: int | None = 42,
    as_frame: bool = False,
) -> Bunch | pd.DataFrame:
    # Simulates Q10/Q50/Q90 forecasts over ``n_periods`` years for
    # ``n_samples`` locations, plus one "actual" column for the first
    # period in which a fraction ``anomaly_frac`` of rows is pushed outside
    # the first-period Q10-Q90 interval. Returns a DataFrame when
    # ``as_frame`` is true, otherwise a Bunch with column metadata.
    # Raises ValueError when ``anomaly_frac`` is outside [0, 1].

    # Fail early with a clear message: values above 1 would otherwise
    # surface as an opaque sampling error from ``rng.choice`` and negative
    # values would be silently ignored.
    if not (0.0 <= anomaly_frac <= 1.0):
        raise ValueError("anomaly_frac must be between 0.0 and 1.0")

    # default_rng(None) draws fresh entropy, so no branching on seed needed.
    rng = np.random.default_rng(seed)

    # --- Spatial / auxiliary features ---
    location_id = np.arange(n_samples)
    longitude = rng.uniform(-120, -115, n_samples)
    latitude = rng.uniform(33, 36, n_samples)
    elevation = rng.uniform(50, 500, n_samples) + latitude * 5

    # --- Latent signal and first-period actual ---
    base_signal = (
        base_value
        + np.sin(np.linspace(0, 3 * np.pi, n_samples)) * 5
        + rng.normal(0, noise_level / 2, n_samples)
    )
    actual_col_name = f"{prefix}_actual"
    actual = base_signal + rng.normal(0, noise_level / 2, n_samples)

    # --- Quantiles per period ---
    all_q10_cols, all_q50_cols, all_q90_cols = [], [], []
    quantile_values = {}
    for i in range(n_periods):
        year = start_year + i
        q10_col = f"{prefix}_{year}_q0.1"
        q50_col = f"{prefix}_{year}_q0.5"
        q90_col = f"{prefix}_{year}_q0.9"
        all_q10_cols.append(q10_col)
        all_q50_cols.append(q50_col)
        all_q90_cols.append(q90_col)

        q50 = (
            base_signal
            + trend_strength * i
            + rng.normal(0, noise_level / 3, n_samples)
        )
        width = (
            interval_width_base
            + interval_width_trend * i
            + rng.uniform(
                -interval_width_noise / 2, interval_width_noise / 2, n_samples
            )
        )
        # Floor the width so Q10 < Q90 always holds.
        width = np.maximum(0.1, width)

        quantile_values[q10_col] = q50 - width / 2
        quantile_values[q50_col] = q50
        quantile_values[q90_col] = q50 + width / 2

    # --- Anomaly injection (first period only) ---
    # Done after the period loop so the random stream is consumed in the
    # same order as before and seeded output stays reproducible.
    if anomaly_frac > 0 and n_samples > 0:
        n_anomalies = int(anomaly_frac * n_samples)
        if n_anomalies > 0 and all_q10_cols and all_q90_cols:
            anomaly_indices = rng.choice(
                n_samples, size=n_anomalies, replace=False
            )
            n_under = n_anomalies // 2
            under_indices = anomaly_indices[:n_under]
            over_indices = anomaly_indices[n_under:]
            push = interval_width_base / 2 + 1

            first_q10 = quantile_values[all_q10_cols[0]]
            first_q90 = quantile_values[all_q90_cols[0]]
            actual[under_indices] = first_q10[under_indices] - (
                rng.uniform(0.5, 3.0, size=len(under_indices)) * push
            )
            actual[over_indices] = first_q90[over_indices] + (
                rng.uniform(0.5, 3.0, size=len(over_indices)) * push
            )

    # --- Assemble DataFrame in final column order ---
    feature_names = ["location_id", "longitude", "latitude", "elevation"]
    target_names = [actual_col_name]
    pred_cols_sorted = [
        col
        for trio in zip(all_q10_cols, all_q50_cols, all_q90_cols)
        for col in trio
    ]
    ordered_cols = feature_names + target_names + pred_cols_sorted

    data_dict = {
        "location_id": location_id,
        "longitude": longitude,
        "latitude": latitude,
        "elevation": elevation,
        actual_col_name: actual,
    }
    data_dict.update(quantile_values)
    df = pd.DataFrame(data_dict)[ordered_cols]

    # --- Return based on as_frame ---
    if as_frame:
        return df

    # Uniform float64 arrays avoid pandas' mixed-dtype promotion.
    target_array = df[target_names[0]].to_numpy(dtype=np.float64, copy=True)
    data_array = df[ordered_cols].to_numpy(dtype=np.float64, copy=True)

    descr = textwrap.dedent(
        f"""\
        Synthetic Multi-Period Uncertainty Dataset for k-diagram

        **Description:**
        This dataset simulates quantile forecasts (Q10, Q50, Q90) for a
        single variable ('{prefix}') over {n_periods} consecutive time
        periods (starting from {start_year}) across {n_samples} independent
        samples or locations. It includes simulated spatial coordinates and
        an auxiliary feature ('elevation'). An 'actual' value column
        (``{actual_col_name}``) corresponding to the *first* time period is
        provided, with ~{anomaly_frac*100:.0f}% of these values artificially
        placed outside the first period's Q10-Q90 interval to simulate
        prediction anomalies. The Q50 predictions follow a base signal with
        added noise and a linear trend controlled by `trend_strength`. The
        prediction interval width (Q90-Q10) also includes baseline width,
        noise, and a linear trend controlled by `interval_width_trend`.

        **Generation Parameters:**
        - n_samples            : {n_samples}
        - n_periods            : {n_periods}
        - start_year           : {start_year}
        - prefix               : '{prefix}'
        - anomaly_frac         : {anomaly_frac:.2f}
        - base_value           : {base_value:.2f}
        - trend_strength       : {trend_strength:.2f} (for Q50)
        - noise_level          : {noise_level:.2f} (added to Q50/actual)
        - interval_width_base  : {interval_width_base:.2f}
        - interval_width_noise : {interval_width_noise:.2f}
        - interval_width_trend : {interval_width_trend:.2f}
        - seed                 : {seed}

        **Data Structure (Bunch object):**
        - frame           : Complete pandas DataFrame.
        - feature_names   : List of spatial/auxiliary feature column names.
        - target_names    : List containing the target column name.
        - target          : NumPy array of target ('actual') values.
        - quantile_cols   : Dict mapping quantiles ('q0.1', 'q0.5', 'q0.9')
                            to lists of column names across periods.
        - q10_cols        : Convenience list of Q10 column names.
        - q50_cols        : Convenience list of Q50 column names.
        - q90_cols        : Convenience list of Q90 column names.
        - n_periods       : Number of periods with quantile data.
        - prefix          : Prefix used for value/quantile columns.
        - DESCR           : This description.

        This dataset is ideal for testing functions like
        plot_model_drift, plot_uncertainty_drift,
        plot_interval_consistency, plot_anomaly_magnitude,
        plot_coverage_diagnostic, etc.
        """
    )

    # The dict gets its own list copies so mutating one view does not
    # silently change the other.
    quantile_cols_dict = {
        "q0.1": list(all_q10_cols),
        "q0.5": list(all_q50_cols),
        "q0.9": list(all_q90_cols),
    }
    return Bunch(
        frame=df,
        data=data_array,
        feature_names=feature_names,
        target_names=target_names,
        target=target_array,
        quantile_cols=quantile_cols_dict,
        q10_cols=all_q10_cols,
        q50_cols=all_q50_cols,
        q90_cols=all_q90_cols,
        n_periods=n_periods,
        prefix=prefix,
        DESCR=descr,
    )
make_uncertainty_data.__doc__ = r"""
Generate a synthetic multi-period uncertainty dataset.

Creates a compact dataset for testing `k-diagram` uncertainty
visualizations: simulated **actuals** (for the first period),
quantile predictions **Q10/Q50/Q90** over multiple periods,
controllable trends and noise, injected interval-coverage failures
(anomalies), and simple spatial features. This is useful for
coverage, calibration, drift, and consistency diagnostics
:footcite:p:`Jolliffe2012, Gneiting2007b, kouadiob2025`.

Parameters
----------
n_samples : int, default=150
    Number of rows (locations) to generate.
n_periods : int, default=4
    Number of consecutive periods (e.g., years) for which to generate
    quantiles.
anomaly_frac : float, default=0.15
    Fraction in ``[0, 1]`` of rows whose first-period actual is
    forced **outside** the Q10–Q90 interval (half under-, half
    over-prediction, up to rounding).
start_year : int, default=2022
    First period’s year used in column names.
prefix : str, default='value'
    Base prefix for generated value/quantile columns.
base_value : float, default=10.0
    Mean level for the latent signal that drives Q50.
trend_strength : float, default=1.5
    Linear trend added to Q50 by period index (lead time).
noise_level : float, default=2.0
    Standard deviation for Gaussian noise added to the latent signal
    (for Q50 and actuals).
interval_width_base : float, default=4.0
    Baseline width of the Q10–Q90 interval in the first period.
interval_width_noise : float, default=1.5
    Uniform jitter magnitude applied per row/period to the interval
    width.
interval_width_trend : float, default=0.5
    Linear trend added to interval width across periods.
seed : int or None, default=42
    NumPy RNG seed for reproducibility. If ``None``, a fresh RNG is
    used.
as_frame : bool, default=False
    If ``False``, return a :class:`~kdiagram.bunch.Bunch` with arrays
    and metadata. If ``True``, return only the pandas ``DataFrame``.

Returns
-------
data : :class:`~kdiagram.bunch.Bunch` or pandas.DataFrame
    If ``as_frame=False`` (default), a Bunch with:

    - ``frame`` : pandas ``DataFrame`` with spatial features,
      first-period actual, and Q10/Q50/Q90 columns by period.
    - ``data`` : ``float64`` ``ndarray`` holding every column of
      ``frame``, in the same order.
    - ``feature_names`` : ``['location_id','longitude','latitude',
      'elevation']``.
    - ``target_names`` : ``[f'{prefix}_actual']``.
    - ``target`` : ``ndarray`` of actual values.
    - ``quantile_cols`` : dict mapping ``'q0.1'``, ``'q0.5'``,
      ``'q0.9'`` to lists of column names across periods.
    - ``q10_cols``, ``q50_cols``, ``q90_cols`` : convenience lists.
    - ``n_periods`` : number of generated periods.
    - ``prefix`` : the column name prefix.
    - ``DESCR`` : human-readable description.

    If ``as_frame=True``, only the pandas ``DataFrame`` is returned.

Raises
------
TypeError
    If numeric inputs cannot be processed.
ValueError
    If ``anomaly_frac`` is greater than 1, so that more anomalous
    rows are requested than ``n_samples`` can provide.

Notes
-----
**Column naming.** Quantile columns encode the year :math:`y` and
quantile level :math:`q`:

.. math::

   \text{quantile name} \;\equiv\;
   \texttt{<prefix>}\_{y}\_\texttt{q}q, \qquad
   y \in \{\texttt{start\_year},\dots\}, \;\;
   q \in \{0.1,0.5,0.9\}.

The first-period actual is stored once as ``f"{prefix}_actual"``.

**Signal and interval model.** Let period index be
:math:`t \in \{0,\dots,n\_\text{periods}-1\}` and row index
:math:`i \in \{0,\dots,n-1\}` with :math:`n = \texttt{n\_samples}`.
The latent base signal :math:`s_i` is a slow sinusoid across rows
plus noise:

.. math::

   s_i \;=\; \texttt{base\_value}
   \;+\; 5\,\sin\!\Bigl(\tfrac{3\pi\, i}{n-1}\Bigr)
   \;+\; \varepsilon_i, \qquad
   \varepsilon_i \sim \mathcal{N}(0, \sigma^2),\;
   \sigma=\texttt{noise\_level}/2.

Before anomaly injection the first-period actual is
:math:`s_i` plus a further independent
:math:`\mathcal{N}(0, \sigma^2)` draw. The median forecast is

.. math::

   Q50_{i,t} \;=\; s_i \;+\; t\cdot\texttt{trend\_strength}
   \;+\; \eta_{i,t}, \quad
   \eta_{i,t} \sim
   \mathcal{N}\!\big(0, (\texttt{noise\_level}/3)^2\big).

Interval width :math:`w_{i,t}` has baseline, trend, and jitter:

.. math::

   w_{i,t} \;=\; \max\!\Bigl( 0.1,\,
   \texttt{interval\_width\_base}
   + t\cdot\texttt{interval\_width\_trend}
   + u_{i,t} \Bigr), \quad
   u_{i,t} \sim \mathcal{U}\!\Bigl(
   -\tfrac{\texttt{interval\_width\_noise}}{2},\,
   \tfrac{\texttt{interval\_width\_noise}}{2}\Bigr),

and

.. math::

   Q10_{i,t} \;=\; Q50_{i,t} - \tfrac{1}{2}w_{i,t},\qquad
   Q90_{i,t} \;=\; Q50_{i,t} + \tfrac{1}{2}w_{i,t}.

**Anomaly injection (first period).** For a fraction
``anomaly_frac`` of rows we enforce a coverage failure:

.. math::

   y^{\text{actual}}_{i} \notin [\,Q10_{i,0},\,Q90_{i,0}\,],

splitting under/over cases approximately evenly to aid tests of
coverage diagnostics and anomaly magnitude plots.

Use this data to study calibration vs. sharpness trade-offs
:footcite:p:`Gneiting2007b` and operational verification practice
:footcite:p:`Jolliffe2012`.

See Also
--------
kdiagram.plot.uncertainty.plot_coverage
    Aggregate empirical coverage vs. nominal levels.
kdiagram.plot.uncertainty.plot_coverage_diagnostic
    Point-wise success/failure on a polar layout.
kdiagram.plot.uncertainty.plot_interval_consistency
    Temporal stability of interval widths per location.
kdiagram.plot.uncertainty.plot_model_drift
    Lead-time trend of mean interval width.
kdiagram.plot.uncertainty.plot_anomaly_magnitude
    Where and how severely intervals fail.

Examples
--------
Return a Bunch and inspect quantile columns:

>>> from kdiagram.datasets import make_uncertainty_data
>>> ds = make_uncertainty_data(n_samples=12, n_periods=3, seed=7)
>>> sorted(ds.quantile_cols.keys())
['q0.1', 'q0.5', 'q0.9']

Return only a DataFrame and check the column order (features, then
the actual, then Q10/Q50/Q90 per period):

>>> df = make_uncertainty_data(as_frame=True, n_samples=5, seed=0)
>>> df.columns[:6].tolist()
['location_id', 'longitude', 'latitude', 'elevation', 'value_actual', 'value_2022_q0.1']

References
----------
.. footbibliography::
"""
def _taylor_model_label(index: int) -> str:
    """Return the spreadsheet-style letter label for a zero-based index.

    ``0 -> 'A'``, ``25 -> 'Z'``, ``26 -> 'AA'``, ``27 -> 'AB'`` and so on,
    so labels stay alphabetic for any number of models.
    """
    letters = ""
    remaining = index + 1
    while remaining > 0:
        remaining, offset = divmod(remaining - 1, 26)
        letters = chr(65 + offset) + letters
    return letters


def make_taylor_data(
    n_samples: int = 100,
    n_models: int = 3,
    ref_std: float = 1.0,
    corr_range: tuple[float, float] = (0.5, 0.99),
    std_range: tuple[float, float] = (0.7, 1.3),
    noise_level: float = 0.3,
    bias_level: float = 0.1,
    seed: int | None = 101,
    as_frame: bool = False,
) -> Bunch | pd.DataFrame:
    # NOTE: the user-facing NumPy-style docstring is attached right after
    # this definition through ``make_taylor_data.__doc__``.
    #
    # Summary: build one zero-mean reference series with standard deviation
    # ``ref_std`` and ``n_models`` series ``p = a*r + b*noise + bias`` whose
    # coefficients are chosen to hit a sampled target correlation and spread.
    # Returns a DataFrame when ``as_frame`` is true, otherwise a Bunch that
    # also carries the arrays, labels and the empirically measured stats.

    # --- Input Validation & Setup ---
    # ``default_rng(None)`` already draws fresh OS entropy, so no branching
    # on ``seed`` is needed.
    rng = np.random.default_rng(seed)

    # Invalid ranges are repaired (with a warning) rather than rejected.
    if not (0 <= corr_range[0] <= corr_range[1] <= 1.0):
        warnings.warn(
            "corr_range limits should ideally be between 0 and 1 for "
            "standard Taylor Diagrams. Adjusting...",
            stacklevel=2,
        )
        corr_range = (max(0, corr_range[0]), min(1.0, corr_range[1]))
        if corr_range[0] > corr_range[1]:
            # Clipping produced an empty interval: fall back to defaults.
            corr_range = (0.5, 0.99)
    if not (0 <= std_range[0] <= std_range[1]):
        warnings.warn(
            "std_range factors should be non-negative and min <= max."
            " Using defaults.",
            stacklevel=2,
        )
        std_range = (0.7, 1.3)
    if noise_level <= 1e-9 and corr_range[1] < 1.0 - 1e-9:
        raise ValueError(
            "noise_level cannot be zero if target correlation < 1 is possible."
        )

    # --- Generate Reference Data ---
    reference_raw = rng.normal(0, ref_std, n_samples)
    # Center exactly at zero, then rescale so the *sample* standard
    # deviation equals ``ref_std`` (skipped for a degenerate series).
    reference = reference_raw - np.mean(reference_raw)
    current_std = np.std(reference)
    if current_std > 1e-9:
        reference = reference * (ref_std / current_std)
    actual_ref_std = np.std(reference)

    # --- Generate Model Predictions ---
    predictions = []
    model_names = []
    calculated_stds = []
    calculated_corrs = []

    for i in range(n_models):
        # Model_A ... Model_Z, Model_AA, ... (alphabetic for any n_models).
        model_name = f"Model_{_taylor_model_label(i)}"
        model_names.append(model_name)

        # Sample the target statistics for this model.
        target_rho = rng.uniform(corr_range[0], corr_range[1])
        target_std_factor = rng.uniform(std_range[0], std_range[1])
        target_std = target_std_factor * actual_ref_std

        # Coefficients for p = a*r + b*noise + bias.
        a = target_rho * target_std_factor
        # Equals target_std**2 * (1 - rho**2); because rho is kept inside
        # [0, 1] this cannot be meaningfully negative, and the clamp below
        # absorbs floating-point round-off.
        b_squared_term = target_std**2 - (a * actual_ref_std) ** 2
        if noise_level <= 1e-9 and b_squared_term > 1e-9:
            raise ValueError(
                "noise_level cannot be zero if needed to reach target std"
            )
        b = np.sqrt(max(0, b_squared_term)) / max(noise_level, 1e-9)

        # Draw order (noise, then bias) is kept stable so that seeded
        # outputs are reproducible.
        noise = rng.normal(0, noise_level, n_samples)
        bias = rng.uniform(-bias_level, bias_level)

        pred = a * reference + b * noise + bias
        predictions.append(pred)

        # Record the statistics actually achieved by the finite sample.
        calculated_stds.append(np.std(pred))
        corr_val = np.corrcoef(pred, reference)[0, 1]
        # Clip guards against values marginally outside [-1, 1].
        calculated_corrs.append(np.clip(corr_val, -1.0, 1.0))

    # --- Assemble DataFrame (used for both frame and Bunch) ---
    df_dict = {"reference": reference}
    df_dict.update(zip(model_names, predictions))
    df = pd.DataFrame(df_dict)

    # --- Return based on as_frame ---
    if as_frame:
        return df

    stats_df = pd.DataFrame(
        {"stddev": calculated_stds, "corrcoef": calculated_corrs},
        index=model_names,
    )

    descr = textwrap.dedent(
        f"""\
        Synthetic Taylor Diagram Data

        **Generated Parameters:**
        - n_samples   : {n_samples}
        - n_models    : {n_models}
        - ref_std     : {ref_std:.2f} (target), {actual_ref_std:.2f} (actual)
        - corr_range  : ({corr_range[0]:.2f}, {corr_range[1]:.2f}) (target)
        - std_range   : ({std_range[0]:.2f}, {std_range[1]:.2f}) (target factor)
        - noise_level : {noise_level:.2f}
        - bias_level  : {bias_level:.2f}
        - seed        : {seed}

        **Contents (Bunch object):**
        - frame       : DataFrame with reference and prediction columns.
        - reference   : NumPy array (n_samples,) - Reference data.
        - predictions : List of {n_models} NumPy arrays (n_samples,) - Model data.
        - model_names : List of {n_models} strings - Model labels.
        - stats       : DataFrame with actual calculated 'stddev' and
                        'corrcoef' for each model vs reference.
        - ref_std     : Actual standard deviation of the reference data.
        - DESCR       : This description.
        """
    )

    return Bunch(
        frame=df,
        reference=reference,
        predictions=predictions,
        model_names=model_names,
        stats=stats_df,
        ref_std=actual_ref_std,
        DESCR=descr,
    )
make_taylor_data.__doc__ = r""" Generate synthetic data for Taylor diagrams. Taylor diagrams, introduced by :footcite:t:`Taylor2001`, summarize correlation, standard deviation, and centered RMS difference between model outputs and a reference. This routine creates one reference series and several model-like series with controllable correlation and spread, suitable for exercising plotting functions such as :func:`~kdiagram.plot.evaluation.taylor_diagram`. Practical guidance on verification appears in :footcite:p:`Jolliffe2012`. Parameters ---------- n_samples : int, default=100 Number of observations in each generated series. n_models : int, default=3 Number of model (prediction) series to simulate. ref_std : float, default=1.0 Target standard deviation for the reference series (mean is centered to 0). corr_range : tuple of (float, float), default=(0.5, 0.99) Closed interval from which target correlations :math:`\rho` for models are sampled uniformly. Values should be in :math:`[0,1]` for standard Taylor use. std_range : tuple of (float, float), default=(0.7, 1.3) Closed interval for multiplicative factors applied to the reference standard deviation to obtain each model’s target spread. noise_level : float, default=0.3 Standard deviation of the independent noise used to reach the requested spread and correlation. Must be positive if any target correlation is less than 1. bias_level : float, default=0.1 Maximum absolute bias added to each model series (uniform in ``[-bias_level, bias_level]``). Note that Taylor diagrams are insensitive to overall bias. seed : int or None, default=101 NumPy random seed. If ``None``, a fresh RNG is used. as_frame : bool, default=False If ``False``, return a :class:`~kdiagram.bunch.Bunch` with arrays, names, and summary stats. If ``True``, return only a pandas ``DataFrame`` with columns for the reference and each model series. 
Returns ------- data : :class:`~kdiagram.bunch.Bunch` or pandas.DataFrame If ``as_frame=False`` (default), a Bunch with: - ``frame`` : pandas ``DataFrame`` with ``'reference'`` and model columns. - ``reference`` : ``ndarray`` of shape ``(n_samples,)``. - ``predictions`` : list of ``ndarray`` predictions. - ``model_names`` : list of model labels. - ``stats`` : pandas ``DataFrame`` with columns ``'stddev'`` and ``'corrcoef'`` vs the reference. - ``ref_std`` : actual standard deviation of the reference. - ``DESCR`` : human-readable description. If ``as_frame=True``, only the pandas ``DataFrame`` is returned. Raises ------ ValueError If ``noise_level`` is zero (or negative) while a target correlation below 1 can be sampled. Warns ----- UserWarning If ``corr_range`` or ``std_range`` is invalid. The range is then clipped or reset to its default value instead of raising. Notes ----- **Construction.** Let the reference be :math:`r` with :math:`\mathrm{E}[r]=0` and :math:`\mathrm{sd}(r)=\sigma_r` (we target :math:`\sigma_r=\texttt{ref\_std}`). For model :math:`k`, we synthesize .. math:: p^{(k)} \;=\; a^{(k)} r \;+\; b^{(k)} \epsilon^{(k)} \;+\; \text{bias}^{(k)}, with :math:`\epsilon^{(k)} \sim \mathcal{N}(0,\sigma_\epsilon^2)` independent of :math:`r`, where :math:`\sigma_\epsilon=\texttt{noise\_level}`. Ignoring bias (centered statistics), the model spread and correlation satisfy .. math:: \sigma_{p}^{(k)} \;=\; \sqrt{(a^{(k)} \sigma_r)^2 + (b^{(k)} \sigma_\epsilon)^2}, \qquad \rho^{(k)} \;=\; \frac{a^{(k)} \sigma_r}{\sigma_{p}^{(k)}}. We sample a target :math:`\rho^{(k)} \in \texttt{corr\_range}` and a target spread factor :math:`\alpha^{(k)} \in \texttt{std\_range}`, set :math:`\sigma_p^{(k)} = \alpha^{(k)} \sigma_r`, choose .. math:: a^{(k)} \;=\; \rho^{(k)} \alpha^{(k)}, \qquad b^{(k)} \;=\; \frac{\sqrt{\left(\sigma_p^{(k)}\right)^2 - \left(a^{(k)} \sigma_r\right)^2}} {\sigma_\epsilon}, and draw a small constant :math:`\text{bias}^{(k)} \in [-\texttt{bias\_level},\texttt{bias\_level}]`. Centered Taylor statistics are unaffected by bias. 
See :footcite:t:`Taylor2001` for interpretation; broader verification context is covered in :footcite:p:`Jolliffe2012`. See Also -------- kdiagram.plot.evaluation.taylor_diagram Flexible Taylor diagram from raw arrays or pre-computed stats. kdiagram.plot.evaluation.plot_taylor_diagram Standard Taylor diagram from raw arrays. kdiagram.plot.evaluation.plot_taylor_diagram_in Taylor diagram with background shading. Examples -------- >>> # Get arrays and stats as a Bunch: >>> >>> from kdiagram.datasets import make_taylor_data >>> ds = make_taylor_data(n_models=2, seed=0) >>> list(ds.frame.columns) ['reference', 'Model_A', 'Model_B'] >>> set(ds.stats.columns) == {'stddev', 'corrcoef'} True >>> >>> # Return only a DataFrame: >>> >>> df = make_taylor_data(as_frame=True, seed=1) >>> 'reference' in df.columns True References ---------- .. footbibliography:: """
def _quantile_model_label(index: int) -> str:
    """Return the spreadsheet-style letter label for a zero-based index.

    ``0 -> 'A'``, ``25 -> 'Z'``, ``26 -> 'AA'``, ``27 -> 'AB'`` and so on,
    so default model names stay alphabetic for any number of models.
    """
    letters = ""
    remaining = index + 1
    while remaining > 0:
        remaining, offset = divmod(remaining - 1, 26)
        letters = chr(65 + offset) + letters
    return letters


def make_multi_model_quantile_data(
    n_samples: int = 100,
    n_models: int = 3,
    quantiles: list[float] | None = None,
    prefix: str = "pred",
    model_names: list[str] | None = None,
    true_mean: float = 50.0,
    true_std: float = 10.0,
    bias_range: tuple[float, float] = (-2.0, 2.0),
    width_range: tuple[float, float] = (5.0, 15.0),
    noise_level: float = 1.0,
    seed: int | None = 202,
    as_frame: bool = False,
) -> Bunch | pd.DataFrame:
    # NOTE: the user-facing NumPy-style docstring is attached right after
    # this definition through ``make_multi_model_quantile_data.__doc__``.
    #
    # Summary: draw ``y_true`` plus two unrelated feature columns, then for
    # each model build a noisy, biased median and offset the remaining
    # quantiles around it; values are sorted row-wise so quantile columns
    # never cross. Returns a DataFrame when ``as_frame`` is true, otherwise
    # a Bunch with the frame, arrays and column metadata.

    # --- Input Validation ---
    if quantiles is None:
        quantiles = [0.1, 0.5, 0.9]
    if 0.5 not in quantiles:
        # The construction is centred on the median series.
        raise ValueError("The `quantiles` list must contain 0.5 (median).")

    # ``default_rng(None)`` already draws fresh OS entropy.
    rng = np.random.default_rng(seed)

    if not width_range[0] <= width_range[1] or width_range[0] < 0:
        raise ValueError(
            "width_range must be (min, max) with min >= 0 and min <= max."
        )
    if not bias_range[0] <= bias_range[1]:
        raise ValueError("bias_range must be (min, max) with min <= max.")

    # --- Setup ---
    quantiles_sorted = sorted(set(quantiles))
    q_min = quantiles_sorted[0]
    q_max = quantiles_sorted[-1]
    q_median = 0.5
    q_range = q_max - q_min

    # Ratio of the requested outer-quantile span to the Q10-Q90 span.
    # A single quantile (zero span) needs no scaling.
    width_denominator = 0.9 - 0.1
    if abs(q_range) > 1e-9:
        width_scale_factor = q_range / width_denominator
    else:
        width_scale_factor = 1.0

    # --- Data Generation ---
    # The order of RNG draws is part of the reproducibility contract:
    # y_true, feature_1, feature_2, then per model bias, width, median
    # noise and one noise vector per non-median quantile (ascending).
    y_true = rng.normal(true_mean, true_std, n_samples)
    feature_1 = rng.uniform(0, 1, n_samples)
    feature_2 = rng.normal(5, 2, n_samples)

    data_dict = {
        "y_true": y_true,
        "feature_1": feature_1,
        "feature_2": feature_2,
    }

    if model_names is None:
        model_names_list = [
            f"Model_{_quantile_model_label(i)}" for i in range(n_models)
        ]
    elif len(model_names) != n_models:
        raise ValueError(
            f"Length of model_names ({len(model_names)}) must "
            f"match n_models ({n_models})."
        )
    else:
        model_names_list = list(model_names)

    prediction_columns_dict = {name: [] for name in model_names_list}

    # --- Generate predictions for each model ---
    for model_name in model_names_list:
        model_bias = rng.uniform(bias_range[0], bias_range[1])
        model_width = rng.uniform(width_range[0], width_range[1])

        # Median first: every other quantile is an offset from it.
        q50_pred = y_true + model_bias + rng.normal(0, noise_level, n_samples)
        raw_by_level = {q_median: q50_pred}

        for q in quantiles_sorted:
            if q == q_median:
                continue
            if abs(q_range) > 1e-9:
                # Offset proportional to the distance from the median.
                # NOTE(review): with the default quantiles this places the
                # outer quantiles at +/- model_width, i.e. Q90 - Q10 is
                # about 2 * model_width, while the public docstring calls
                # ``width_range`` the overall width. Left unchanged so
                # seeded outputs stay stable - confirm the intent.
                quantile_offset = (
                    (model_width / width_scale_factor)
                    * (q - q_median)
                    / q_range
                    * 2
                )
            else:
                quantile_offset = 0
            # Bounds get half the noise of the median.
            raw_by_level[q] = (
                q50_pred
                + quantile_offset
                + rng.normal(0, noise_level / 2, n_samples)
            )

        # Sort each row so the k-th column is the k-th smallest value,
        # which guarantees non-crossing quantiles.
        sorted_values = np.sort(
            np.column_stack(list(raw_by_level.values())), axis=1
        )

        # Columns are registered in ascending quantile order. ``:g`` keeps
        # the level's own precision (0.025 -> 'q0.025') instead of rounding
        # to two decimals, and matches the old names for levels such as
        # 0.1, 0.25 or 0.5.
        model_columns = []
        for k, q in enumerate(quantiles_sorted):
            col_name = f"{prefix}_{model_name}_q{q:g}"
            data_dict[col_name] = sorted_values[:, k]
            if col_name not in model_columns:
                model_columns.append(col_name)
        prediction_columns_dict[model_name] = model_columns

    df = pd.DataFrame(data_dict)

    # Order columns: target, features, then prediction columns by name.
    # Prediction columns come from the tracking dict rather than a prefix
    # match, so a prefix like "y" or "feature" cannot pull the target or
    # feature columns in a second time.
    feature_names = ["feature_1", "feature_2"]
    target_name = ["y_true"]
    pred_cols_sorted = sorted(
        col
        for model_columns in prediction_columns_dict.values()
        for col in model_columns
    )
    df = df[target_name + feature_names + pred_cols_sorted]

    # --- Return based on as_frame ---
    if as_frame:
        return df

    data_numeric_cols = feature_names + pred_cols_sorted
    data_array = df[data_numeric_cols].values
    target_array = df[target_name[0]].values

    descr = textwrap.dedent(
        f"""\
        Synthetic Multi-Model Quantile Dataset for k-diagram

        **Generated Parameters:**
        - n_samples    : {n_samples}
        - n_models     : {n_models}
        - quantiles    : {quantiles_sorted}
        - prefix       : {prefix}
        - true_mean    : {true_mean:.2f}
        - true_std     : {true_std:.2f}
        - bias_range   : {bias_range}
        - width_range  : {width_range}
        - noise_level  : {noise_level:.2f}
        - seed         : {seed}

        **Data Structure (Bunch object):**
        - frame              : Complete pandas DataFrame.
        - data               : NumPy array of numeric feature & prediction columns.
        - feature_names      : List of auxiliary feature column names.
        - target_names       : List containing the target column name ('y_true').
        - target             : NumPy array of 'y_true' values.
        - model_names        : List of simulated model names.
        - quantile_levels    : Sorted list of quantile levels generated.
        - prediction_columns : Dict mapping model names to their column names.
        - prefix             : Prefix used for prediction columns.
        - DESCR              : This description.

        This dataset simulates quantile predictions from {n_models} models
        for a single time point, allowing comparison of their uncertainty
        characteristics.
        """
    )

    return Bunch(
        frame=df,
        data=data_array,
        feature_names=feature_names,
        target_names=target_name,
        target=target_array,
        model_names=model_names_list,
        quantile_levels=quantiles_sorted,
        prediction_columns=prediction_columns_dict,
        prefix=prefix,
        DESCR=descr,
    )
make_multi_model_quantile_data.__doc__ = r""" Generate multi-model quantile forecast data for a single horizon. Simulates a target variable :math:`y_{\text{true}}` and quantile predictions (e.g., Q10/Q50/Q90) from several models for the **same** forecast time. Each model can have its own systematic bias and characteristic interval width, enabling reproducible examples for coverage/calibration and cross-model comparisons :footcite:p:`Gneiting2007b, Jolliffe2012`. Parameters ---------- n_samples : int, default=100 Number of rows (independent samples/locations). n_models : int, default=3 Number of simulated models providing quantile forecasts. quantiles : list of float, default=[0.1, 0.5, 0.9] Quantile levels in ``(0, 1)`` to generate for **each** model. Must include ``0.5`` (the median). The list is de-duplicated and sorted internally. prefix : str, default='pred' Base prefix for prediction columns. Final names follow ``{prefix}_{model_name}_q{quantile}``. model_names : list of str, optional Custom model names of length ``n_models``. If ``None``, ``'Model_A'``, ``'Model_B'``, … are generated. true_mean : float, default=50.0 Mean of the Normal distribution used to draw ``y_true``. true_std : float, default=10.0 Standard deviation of the Normal distribution for ``y_true``. bias_range : tuple of (float, float), default=(-2.0, 2.0) Uniform range from which a model-specific bias for Q50 is sampled and added to ``y_true``. width_range : tuple of (float, float), default=(5.0, 15.0) Uniform range for the target **overall** interval width (e.g., Q90–Q10) of each model. noise_level : float, default=1.0 Standard deviation of independent Gaussian noise added to each generated quantile series. seed : int or None, default=202 NumPy RNG seed (``default_rng``). If ``None``, a fresh RNG is used. as_frame : bool, default=False If ``False``, return a :class:`~kdiagram.bunch.Bunch` with arrays/metadata; if ``True``, return only the pandas ``DataFrame``. 
Returns ------- data : :class:`~kdiagram.bunch.Bunch` or pandas.DataFrame If ``as_frame=False`` (default), a Bunch with: - ``frame`` : pandas ``DataFrame`` of shape ``(n_samples, 3 + n_models * n_quantiles)`` containing ``'y_true'``, two auxiliary features, and all quantile columns. - ``data`` : ``ndarray`` with numeric feature + prediction columns. - ``feature_names`` : ``['feature_1', 'feature_2']``. - ``target_names`` : ``['y_true']``. - ``target`` : ``ndarray`` of ``y_true`` values. - ``model_names`` : list of model labels. - ``quantile_levels`` : sorted list of unique quantiles. - ``prediction_columns`` : dict mapping each model name to its list of quantile column names. - ``prefix`` : the column prefix. - ``DESCR`` : human-readable description. If ``as_frame=True``, only the pandas ``DataFrame`` is returned. Raises ------ ValueError If ``0.5`` is not in ``quantiles``, if name/range lengths are inconsistent, or if ranges are invalid. TypeError If non-numeric inputs prevent computation. Notes ----- **Generation model.** Draw the truth as :math:`y_{\text{true}} \sim \mathcal{N}(\mu, \sigma^2)` with ``mu=true_mean`` and ``sigma=true_std``. For model :math:`m`, let :math:`b^{(m)}` be a sampled bias and :math:`W^{(m)}` a sampled overall width (e.g., Q90–Q10). The median prediction (Q50) is .. math:: q_{0.5}^{(m)} \;=\; y_{\text{true}} \;+\; b^{(m)} \;+\; \varepsilon^{(m)}, \qquad \varepsilon^{(m)} \sim \mathcal{N}(0, \sigma_\varepsilon^2), with ``sigma_ε = noise_level``. Other quantiles are created by adding offsets proportional to their distance from the median and scaled so that the extreme quantiles span approximately :math:`W^{(m)}`; small independent noise is then added. Finally, for each row we sort the model’s quantile values to enforce :math:`q_{\alpha} \le q_{0.5} \le q_{\beta}` (e.g., Q10 ≤ Q50 ≤ Q90), which is useful for coverage and calibration diagnostics :footcite:p:`Gneiting2007b, Jolliffe2012`. 
Two auxiliary columns (``feature_1``, ``feature_2``) are included for convenience in examples; they do not influence the simulated target or quantiles. See Also -------- make_uncertainty_data Temporal multi-period quantiles with drift/consistency controls. make_taylor_data Synthetic data tailored for Taylor diagram evaluation. kdiagram.plot.uncertainty.plot_coverage Aggregate empirical coverage vs nominal. kdiagram.plot.uncertainty.plot_temporal_uncertainty General polar visualization for multiple series. Examples -------- >>> # As a Bunch with metadata: >>> >>> from kdiagram.datasets import make_multi_model_quantile_data >>> ds = make_multi_model_quantile_data(n_samples=50, n_models=2, seed=1) >>> ds.model_names ['Model_A', 'Model_B'] >>> sorted(ds.quantile_levels) [0.1, 0.5, 0.9] >>> ds.prediction_columns['Model_A'][:3] # doctest: +ELLIPSIS ['pred_Model_A_q0.1', 'pred_Model_A_q0.5', 'pred_Model_A_q0.9'] >>> >>> # As a DataFrame: >>> >>> df = make_multi_model_quantile_data(as_frame=True, seed=2) >>> set(['y_true','feature_1','feature_2']).issubset(df.columns) True References ---------- .. footbibliography:: """