# Source code for kdiagram.datasets.make

# Author: LKouadio <etanoyau@gmail.com>
# License: Apache License 2.0 (see LICENSE file)

"""
Dataset Generation Utilities (:mod:`kdiagram.datasets.make`)
============================================================

This module provides functions to create synthetic datasets tailored
for demonstrating and testing the various plotting functions within
the `k-diagram` package, particularly those focused on uncertainty.
"""
from __future__ import annotations

import textwrap
import warnings

import numpy as np
import pandas as pd

from ..api.bunch import Bunch

# Public names exported by ``from kdiagram.datasets.make import *``.
# NOTE(review): ``make_fingerprint_data`` is defined in this module but is
# not listed here -- confirm whether that omission is intentional.
__all__ = [
    "make_uncertainty_data",
    "make_taylor_data",
    "make_multi_model_quantile_data",
    "make_cyclical_data",
]



def make_cyclical_data(
    n_samples: int = 365,
    n_series: int = 2,
    cycle_period: float = 365,
    noise_level: float = 0.5,
    amplitude_true: float = 10.0,
    offset_true: float = 20.0,
    pred_bias: float | list[float] | None = None,
    pred_noise_factor: float | list[float] | None = None,
    pred_amplitude_factor: float | list[float] | None = None,
    pred_phase_shift: float | list[float] | None = None,
    prefix: str = "model",
    series_names: list[str] | None = None,
    seed: int | None = 404,
    as_frame: bool = False,
) -> Bunch | pd.DataFrame:
    # NOTE: the user-facing docstring (parameters, returns, raises) is
    # attached right after this definition via
    # ``make_cyclical_data.__doc__``.

    # --- Input Validation & Setup ---
    # ``default_rng(None)`` seeds from fresh OS entropy, so a single call
    # covers both the seeded and the unseeded case.
    rng = np.random.default_rng(seed)

    # Built-in two-series patterns used when a prediction parameter is
    # left as ``None``. They are tiled to ``n_series`` entries so the
    # defaults work for any number of series; for ``n_series=2`` this
    # yields exactly the historical default lists.
    default_patterns = {
        "pred_bias": [0, 1.5],
        "pred_noise_factor": [1.0, 1.5],
        "pred_amplitude_factor": [1.0, 0.8],
        "pred_phase_shift": [0, np.pi / 6],
    }
    supplied_params = {
        "pred_bias": pred_bias,
        "pred_noise_factor": pred_noise_factor,
        "pred_amplitude_factor": pred_amplitude_factor,
        "pred_phase_shift": pred_phase_shift,
    }

    # Normalise every prediction parameter to a list of length n_series.
    processed_params = {}
    for name, param in supplied_params.items():
        if param is None:
            pattern = default_patterns[name]
            processed_params[name] = [
                pattern[i % len(pattern)] for i in range(n_series)
            ]
        elif isinstance(param, (int, float)):
            # Scalars are broadcast to every series.
            processed_params[name] = [param] * n_series
        elif isinstance(param, list):
            if len(param) != n_series:
                raise ValueError(
                    f"Length of '{name}' ({len(param)}) must match "
                    f"n_series ({n_series})."
                )
            processed_params[name] = param
        else:
            raise TypeError(f"'{name}' must be float or list of floats.")

    # --- Generate Time Step and True Signal ---
    time_step = np.arange(n_samples)
    # Angular frequency based on cycle period
    omega = 2 * np.pi / cycle_period
    theta = omega * time_step

    # True signal: offset + sine wave + Gaussian noise
    y_true = (
        offset_true
        + amplitude_true * np.sin(theta)
        + rng.normal(0, noise_level, n_samples)
    )

    data_dict = {"time_step": time_step, "y_true": y_true}

    # --- Generate Model Names & Prediction Columns ---
    if series_names is None:
        # Generated names are prefix_A, prefix_B, ... (chr(65) == 'A')
        series_names_list = [f"{prefix}_{chr(65+i)}" for i in range(n_series)]
    elif len(series_names) != n_series:
        raise ValueError(
            f"Length of series_names ({len(series_names)}) must "
            f"match n_series ({n_series})."
        )
    else:
        series_names_list = list(series_names)

    # Prediction columns carry the series names (kept as a separate list).
    prediction_cols_list = list(series_names_list)

    for i, col_name in enumerate(series_names_list):
        # Per-series parameters
        amp = amplitude_true * processed_params["pred_amplitude_factor"][i]
        bias = processed_params["pred_bias"][i]
        noise = noise_level * processed_params["pred_noise_factor"][i]
        phase = processed_params["pred_phase_shift"][i]

        # Biased, rescaled, phase-shifted copy of the true cycle with its
        # own noise draw (one draw per series, in series order).
        data_dict[col_name] = (
            offset_true
            + bias
            + amp * np.sin(theta + phase)
            + rng.normal(0, noise, n_samples)
        )

    # --- Create DataFrame ---
    df = pd.DataFrame(data_dict)

    # Define column categories for Bunch
    feature_names = ["time_step"]
    target_name = ["y_true"]

    # Logical column order: target, features, then predictions
    ordered_cols = target_name + feature_names + prediction_cols_list

    # --- Return based on as_frame ---
    if as_frame:
        return df[ordered_cols]

    # Create Bunch description
    descr = textwrap.dedent(
        f"""\
        Synthetic Cyclical Pattern Data for k-diagram

        **Description:**
        Simulates a dataset with a primary 'true' cyclical signal and
        {n_series} related prediction series over {n_samples} time steps.
        The true signal is a sine wave with added noise. Prediction
        series are generated based on the true signal but may include
        systematic bias, different amplitude scaling, phase shifts (lag/lead),
        and varying noise levels, according to the specified parameters.

        **Generation Parameters:**
        - n_samples             : {n_samples}
        - n_series              : {n_series}
        - cycle_period          : {cycle_period}
        - noise_level           : {noise_level:.2f} (base for y_true)
        - amplitude_true        : {amplitude_true:.2f}
        - offset_true           : {offset_true:.2f}
        - pred_bias             : {processed_params['pred_bias']}
        - pred_noise_factor     : {processed_params['pred_noise_factor']}
        - pred_amplitude_factor : {processed_params['pred_amplitude_factor']}
        - pred_phase_shift      : {processed_params['pred_phase_shift']} (radians)
        - prefix                : '{prefix}'
        - seed                  : {seed}

        **Data Structure (Bunch object):**
        - frame           : Complete pandas DataFrame.
        - feature_names   : List of feature column names (['time_step']).
        - target_names    : List containing the target column name (['y_true']).
        - target          : NumPy array of 'y_true' values.
        - series_names    : List of prediction series names.
        - prediction_columns: List of prediction column names in the frame.
        - DESCR           : This description.

        This dataset is suitable for visualizing relationships or temporal
        patterns in a polar context using functions like plot_relationship
        or plot_temporal_uncertainty.
        """
    )

    # Build arrays with a uniform dtype to avoid pandas -> np.find_common_type
    num_cols = feature_names + prediction_cols_list
    target_array = df[target_name[0]].to_numpy(dtype=np.float64, copy=True)
    data_array = df[num_cols].to_numpy(dtype=np.float64, copy=True)

    return Bunch(
        frame=df[ordered_cols],
        data=data_array,
        feature_names=feature_names,
        target_names=target_name,
        target=target_array,
        series_names=series_names_list,
        prediction_columns=prediction_cols_list,
        DESCR=descr,
    )



make_cyclical_data.__doc__ = r"""
Generate synthetic cyclical data for relationship and temporal plots.

Creates a dataset with a single **true** cyclical signal and one or
more **prediction** series that can differ in amplitude, phase, bias,
and noise relative to the truth. This is useful for demos of
polar relationship and temporal-uncertainty plots in `k-diagram`
:footcite:p:`harris2020array, 2020SciPy-NMeth, Hunter:2007`.

This data is useful for demonstrating and testing functions like
:func:`~kdiagram.plot.relationship.plot_relationship` or
:func:`~kdiagram.plot.uncertainty.plot_temporal_uncertainty` where
visualizing behavior over a cycle is important.

Parameters
----------
n_samples : int, default=365
    Number of time steps to generate. Interpreted as evenly
    spaced samples over one or more cycles.

n_series : int, default=2
    Number of simulated prediction series (e.g., models).

cycle_period : float, default=365
    Samples per full cycle :math:`P`. The angular frequency is
    :math:`\omega = 2\pi / P`. Use ``365`` for daily data over
    one year, ``12`` for monthly data over one year, etc.

noise_level : float, default=0.5
    Standard deviation of Gaussian noise added to the **true**
    signal. Prediction series scale this by ``pred_noise_factor``.

amplitude_true : float, default=10.0
    Amplitude of the sinusoidal **true** signal.

offset_true : float, default=20.0
    Vertical offset (mean level) of the **true** signal.

pred_bias : float or list of float, optional
    Additive bias for each prediction series. If a scalar is
    provided it is broadcast to all ``n_series``. If a list is
    provided, its length must equal ``n_series``. Defaults to
    ``[0.0, 1.5]`` when ``None``.

pred_noise_factor : float or list of float, optional
    Multiplier for ``noise_level`` per series. Scalar values are
    broadcast; lists must match ``n_series`` in length. Defaults
    to ``[1.0, 1.5]`` when ``None``.

pred_amplitude_factor : float or list of float, optional
    Multiplier of ``amplitude_true`` per series (allows under/
    over-estimation of the cycle amplitude). Scalar broadcast is
    supported. Defaults to ``[1.0, 0.8]`` when ``None``.

pred_phase_shift : float or list of float, optional
    Phase shift (radians) added inside the sine for each series.
    Positive values make a series lead the truth (its peaks occur
    earlier); negative values produce a lag. Scalar broadcast is
    supported. Defaults to ``[0.0, np.pi / 6]`` when ``None``.

prefix : str, default='model'
    Prefix used to generate prediction column names, e.g.,
    ``model_A``, ``model_B``, …

series_names : list of str, optional
    Explicit names for prediction columns. If omitted, names are
    generated from ``prefix`` as ``prefix_A``, ``prefix_B``, …
    Must have length ``n_series`` if provided.

seed : int or None, default=404
    Seed for NumPy’s random generator. If ``None``, a fresh RNG
    is used.

as_frame : bool, default=False
    If ``False``, return a :class:`~kdiagram.bunch.Bunch` with
    metadata and arrays. If ``True``, return only the pandas
    ``DataFrame``.

Returns
-------
data : :class:`~kdiagram.bunch.Bunch` or pandas.DataFrame
    If ``as_frame=False`` (default), a Bunch with:

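    - ``frame`` : pandas ``DataFrame`` with columns ordered as
      ``'y_true'``, ``'time_step'``, then the prediction columns.
    - ``data`` : ``ndarray`` of shape ``(n_samples, 1 + n_series)``
      holding ``'time_step'`` followed by the prediction columns.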
    - ``feature_names`` : ``['time_step']``.
    - ``target_names`` : ``['y_true']``.
    - ``target`` : ``ndarray`` of shape ``(n_samples,)`` with the
      true signal.
    - ``series_names`` : list of prediction series names.
    - ``prediction_columns`` : list of prediction column names.
    - ``DESCR`` : human-readable description.

    If ``as_frame=True``, only the pandas ``DataFrame`` is
    returned.

Raises
------
ValueError
    If a provided list for prediction parameters does not match
    ``n_series`` in length.

TypeError
    If prediction parameters are not float or list of float.

Notes
-----
**Signal model.** Let :math:`P` be the cycle period and
:math:`\omega = 2\pi/P`. The **true** signal at time step
:math:`t \in \{0,\dots,n\_samples-1\}` is

.. math::

   y_{\text{true}}(t)
   \;=\;
   \texttt{offset\_true}
   \;+\;
   \texttt{amplitude\_true}\,\sin(\omega t)
   \;+\;
   \varepsilon_t,
   \qquad
   \varepsilon_t \sim \mathcal{N}(0,\sigma^2),
   \;\; \sigma=\texttt{noise\_level}.

For series :math:`k=1,\dots,n\_{\text{series}}`, the prediction is

.. math::

   y_{\text{pred}}^{(k)}(t)
   \;=\;
   \texttt{offset\_true}
   \;+\;
   b_k
   \;+\;
   \big(\texttt{amplitude\_true}\,\alpha_k\big)
   \sin(\omega t + \phi_k)
   \;+\;
   \eta^{(k)}_t,

with :math:`\eta^{(k)}_t \sim \mathcal{N}\!\big(0,\,
(\sigma\,\gamma_k)^2\big)`.
Here :math:`b_k` is the bias (``pred_bias``),
:math:`\alpha_k` the amplitude factor (``pred_amplitude_factor``),
:math:`\phi_k` the phase shift (``pred_phase_shift``), and
:math:`\gamma_k` the noise factor (``pred_noise_factor``).
Numerical generation and plotting typically rely on array/scientific
and graphics stacks :footcite:p:`harris2020array, 2020SciPy-NMeth, Hunter:2007`.

See Also
--------
kdiagram.plot.relationship.plot_relationship
    Polar relationship scatter for true vs. predictions.

kdiagram.plot.uncertainty.plot_temporal_uncertainty
    General-purpose polar series plot; useful for Q10/Q50/Q90 and
    cyclical visualizations.

Examples
--------
Generate a small cyclical dataset as a Bunch:

>>> from kdiagram.datasets import make_cyclical_data
>>> ds = make_cyclical_data(
...     n_samples=24, n_series=2, cycle_period=12, seed=7
... )
>>> ds.frame.columns.tolist()
['y_true', 'time_step', 'model_A', 'model_B']

Return only a DataFrame and supply custom names:

>>> df = make_cyclical_data(
...     n_samples=50,
...     n_series=2,
...     series_names=['A', 'B'],
...     as_frame=True,
...     seed=1
... )
>>> set(['time_step','y_true']).issubset(df.columns)
True

References 
------------

.. footbibliography::
    
"""



def make_fingerprint_data(
    n_layers: int = 3,
    n_features: int = 8,
    layer_names: list[str] | None = None,
    feature_names: list[str] | None = None,
    value_range: tuple[float, float] = (0.0, 1.0),
    sparsity: float = 0.1,
    add_structure: bool = True,
    seed: int | None = 303,
    as_frame: bool = False,
) -> Bunch | pd.DataFrame:
    # NOTE: the user-facing docstring (parameters, returns, raises) is
    # attached right after this definition via
    # ``make_fingerprint_data.__doc__``.

    # --- Input Validation & Setup ---
    if not (0.0 <= sparsity <= 1.0):
        raise ValueError("sparsity must be between 0.0 and 1.0")
    if not (
        isinstance(value_range, tuple)
        and len(value_range) == 2
        and value_range[0] <= value_range[1]
    ):
        raise ValueError(
            "value_range must be a tuple (min, max) with min <= max."
        )

    # ``default_rng(None)`` seeds from fresh OS entropy, so a single call
    # covers both the seeded and the unseeded case.
    rng = np.random.default_rng(seed)

    # Generate names if needed
    if feature_names is None:
        feature_names = [f"Feature_{i+1}" for i in range(n_features)]
    elif len(feature_names) != n_features:
        raise ValueError(
            f"Length of feature_names ({len(feature_names)}) "
            f"must match n_features ({n_features})."
        )

    if layer_names is None:
        # Layer_A, Layer_B, ... (chr(65) == 'A')
        layer_names = [f"Layer_{chr(65+i)}" for i in range(n_layers)]
    elif len(layer_names) != n_layers:
        raise ValueError(
            f"Length of layer_names ({len(layer_names)}) "
            f"must match n_layers ({n_layers})."
        )

    # --- Generate Importance Matrix ---
    min_val, max_val = value_range
    importances = rng.uniform(min_val, max_val, size=(n_layers, n_features))

    # Add optional structure
    if add_structure and n_layers > 1 and n_features > 1:
        # Margins are multiples of the bounds themselves, so the factor
        # must depend on the sign of the bound to move *outward*: scaling
        # a negative minimum by 0.8/0.9 would move it inward and the clip
        # below would then cut off valid values. For non-negative bounds
        # these expressions equal the historical ones exactly.
        below_min = min_val * 0.9 if min_val >= 0 else min_val * 1.1
        above_max = max_val * 1.1 if max_val >= 0 else max_val * 0.9
        clip_low = min_val * 0.8 if min_val >= 0 else min_val * 1.2
        clip_high = max_val * 1.2 if max_val >= 0 else max_val * 0.8

        # NOTE(review): the emphasis thresholds below are fractions of
        # (min_val + max_val), not of the span, so for ranges far from
        # zero they can land outside [min_val, max_val] -- confirm whether
        # that is intended before changing it (it affects seeded output).
        emphasis_floor = (min_val + max_val) / 1.5
        deemphasis_ceiling = (min_val + max_val) / 2.5

        for i in range(n_layers):
            # Layer 'i' emphasizes feature 'i' (cycling over features)
            emphasized_feature = i % n_features
            importances[i, emphasized_feature] = rng.uniform(
                emphasis_floor,  # Emphasize higher values
                above_max,  # Allow slightly exceeding max
            )
            # De-emphasize the feature half a turn away, if distinct
            deemphasized_feature = (i + n_features // 2) % n_features
            if deemphasized_feature != emphasized_feature:
                importances[i, deemphasized_feature] = rng.uniform(
                    below_min,  # Allow slightly below min
                    deemphasis_ceiling,  # Emphasize lower values
                )
        # Keep values within a 20% margin around the requested range
        importances = np.clip(importances, clip_low, clip_high)

    # Introduce sparsity: each entry is zeroed with probability `sparsity`
    if sparsity > 0:
        mask = rng.choice(
            [0, 1], size=importances.shape, p=[sparsity, 1 - sparsity]
        )
        importances *= mask

    # --- Assemble DataFrame ---
    df = pd.DataFrame(importances, index=layer_names, columns=feature_names)

    # --- Return based on as_frame ---
    if as_frame:
        return df

    # Create Bunch description
    structure_note = " Some basic structure was added." if add_structure else ""
    descr = textwrap.dedent(
        f"""\
        Synthetic Feature Fingerprint Data

        **Description:**
        Simulated feature importance matrix for {n_layers} layers/groups
        and {n_features} features. Values were sampled uniformly from
        the range {value_range} and approximately {sparsity*100:.0f}% were
        randomly set to zero (sparsity).{structure_note} This dataset is suitable for use with
        plot_feature_fingerprint.

        **Generation Parameters:**
        - n_layers       : {n_layers}
        - n_features     : {n_features}
        - value_range    : {value_range}
        - sparsity       : {sparsity:.2f}
        - add_structure  : {add_structure}
        - seed           : {seed}

        **Contents (Bunch object):**
        - importances    : NumPy array ({n_layers}, {n_features}) with scores.
        - frame          : Pandas DataFrame view of importances matrix.
        - layer_names    : List of {n_layers} layer names (index).
        - feature_names  : List of {n_features} feature names (columns).
        - DESCR          : This description.
        """
    )

    return Bunch(
        importances=importances,
        frame=df,
        layer_names=list(layer_names),
        feature_names=list(feature_names),
        DESCR=descr,
    )



make_fingerprint_data.__doc__ = r"""
Generate synthetic feature-importance data for fingerprint plots.

Creates a matrix of feature-importance scores across multiple
**layers** (e.g., models, periods, experimental groups) suitable
for visualization with
:func:`~kdiagram.plot.feature_based.plot_feature_fingerprint`.
This is handy for comparing profiles in a compact polar radar
view and for testing feature-comparison workflows in forecasting
and ML :footcite:p:`scikit-learn, Lim2021, kouadiob2025`.

Parameters
----------
n_layers : int, default=3
    Number of rows (layers) in the importance matrix. Each row
    represents a group such as a model or time period.

n_features : int, default=8
    Number of columns (features) in the importance matrix.

layer_names : list of str, optional
    Names for the layers. If ``None``, generic names like
    ``'Layer_A'``, ``'Layer_B'`` are generated. Must have length
    ``n_layers`` if provided.

feature_names : list of str, optional
    Names for the features. If ``None``, generic names like
    ``'Feature_1'``, ``'Feature_2'`` are generated. Must have
    length ``n_features`` if provided.

value_range : tuple of (float, float), default=(0.0, 1.0)
    Approximate sampling range ``(min_val, max_val)`` for raw
    importance scores. Values are drawn from a uniform
    distribution before structure/sparsity are applied.

sparsity : float, default=0.1
    Fraction in ``[0, 1]`` of entries that are set to zero
    at random, simulating unimportant features for some layers.

add_structure : bool, default=True
    If ``True``, inject simple patterns to make fingerprints
    distinct, e.g., emphasizing one feature per layer and
    de-emphasizing another. If ``False``, the matrix is fully
    random apart from sparsity.

seed : int or None, default=303
    Seed for NumPy’s random generator. If ``None``, a fresh RNG
    is used.

as_frame : bool, default=False
    If ``False``, return a :class:`~kdiagram.bunch.Bunch` with
    metadata and arrays. If ``True``, return only the pandas
    ``DataFrame`` indexed by layers with feature columns.

Returns
-------
data : :class:`~kdiagram.bunch.Bunch` or pandas.DataFrame
    If ``as_frame=False`` (default), a Bunch with:

    - ``importances`` : ``ndarray`` of shape
      ``(n_layers, n_features)``.
    - ``frame`` : pandas ``DataFrame`` view of the matrix with
      layers as index and features as columns.
    - ``layer_names`` : list of layer names.
    - ``feature_names`` : list of feature names.
    - ``DESCR`` : human-readable description.

    If ``as_frame=True``, only the pandas ``DataFrame`` is
    returned.

Raises
------
ValueError
    If ``layer_names`` or ``feature_names`` lengths do not match
    the specified dimensions, if ``sparsity`` is outside
    ``[0, 1]``, or if ``value_range`` does not satisfy
    ``min_val <= max_val``.

Notes
-----
**Generation model.** Let :math:`I \in \mathbb{R}^{L \times F}`
denote the importance matrix with :math:`L = \texttt{n\_layers}`
and :math:`F = \texttt{n\_features}`. Raw scores are sampled as

.. math::

   I_{k,j}^{(0)} \sim \mathcal{U}(m, M),
   \qquad m = \texttt{value\_range[0]},\; M = \texttt{value\_range[1]}.

If structure is enabled, a layer-specific emphasis and
de-emphasis may be applied, producing :math:`I^{(1)}`. Finally,
a sparsity mask :math:`\;B_{k,j} \sim \text{Bernoulli}(1-s)\;`
with :math:`s=\texttt{sparsity}` is applied:

.. math::

   I_{k,j} \;=\; I_{k,j}^{(1)} \cdot B_{k,j}.

Scores are left in their original scale; you may normalize
per-layer or per-feature downstream if desired. For practical
feature-importance workflows and attribution in forecasting,
see :footcite:t:`scikit-learn` and :footcite:t:`Lim2021`. The
fingerprint visualization concept is part of our polar analytics
framework :footcite:t:`kouadiob2025`.

See Also
--------
kdiagram.plot.feature_based.plot_feature_fingerprint
    Radar-style comparison of multi-feature profiles across layers.

Examples
--------
Return a Bunch with arrays and a DataFrame view:

>>> from kdiagram.datasets import make_fingerprint_data
>>> fp = make_fingerprint_data(n_layers=4, n_features=10, seed=1)
>>> fp.importances.shape
(4, 10)
>>> list(fp.frame.index)[:2], list(fp.frame.columns)[:3]
(['Layer_A', 'Layer_B'], ['Feature_1', 'Feature_2', 'Feature_3'])

Return only a DataFrame with custom names:

>>> df = make_fingerprint_data(
...     n_layers=3,
...     n_features=5,
...     layer_names=['L1','L2','L3'],
...     feature_names=['f1','f2','f3','f4','f5'],
...     as_frame=True,
...     seed=2,
... )
>>> df.shape
(3, 5)

References
----------
.. footbibliography::
"""



def make_uncertainty_data(
    n_samples: int = 150,
    n_periods: int = 4,
    anomaly_frac: float = 0.15,
    start_year: int = 2022,
    prefix: str = "value",
    base_value: float = 10.0,
    trend_strength: float = 1.5,
    noise_level: float = 2.0,
    interval_width_base: float = 4.0,
    interval_width_noise: float = 1.5,
    interval_width_trend: float = 0.5,
    seed: int | None = 42,
    as_frame: bool = False,
) -> Bunch | pd.DataFrame:
    # NOTE: the user-facing docstring (parameters, returns, raises) is
    # attached right after this definition via
    # ``make_uncertainty_data.__doc__``.

    # --- Input Validation & Setup ---
    # More anomalies than rows cannot be sampled without replacement;
    # fail early with a message that names the offending argument instead
    # of letting ``rng.choice`` fail later. Values <= 0 simply disable
    # anomaly injection.
    if anomaly_frac > 1.0:
        raise ValueError(
            f"anomaly_frac must be at most 1.0, got {anomaly_frac}."
        )

    # ``default_rng(None)`` seeds from fresh OS entropy, so a single call
    # covers both the seeded and the unseeded case.
    rng = np.random.default_rng(seed)

    # --- Spatial features and latent signal ---
    # The order of the random draws below determines seeded output.
    location_id = np.arange(n_samples)
    longitude = rng.uniform(-120, -115, n_samples)
    latitude = rng.uniform(33, 36, n_samples)
    elevation = rng.uniform(50, 500, n_samples) + latitude * 5
    base_signal = (
        base_value
        + np.sin(np.linspace(0, 3 * np.pi, n_samples)) * 5
        + rng.normal(0, noise_level / 2, n_samples)
    )
    actual_first_period = base_signal + rng.normal(
        0, noise_level / 2, n_samples
    )

    actual_col_name = f"{prefix}_actual"
    data_dict = {
        "location_id": location_id,
        "longitude": longitude,
        "latitude": latitude,
        "elevation": elevation,
        # Store actual only once, representing T=0 or reference time
        actual_col_name: actual_first_period.copy(),
    }

    # --- Quantile columns per period ---
    all_q10_cols, all_q50_cols, all_q90_cols = [], [], []

    for i in range(n_periods):
        year = start_year + i
        q10_col = f"{prefix}_{year}_q0.1"
        q50_col = f"{prefix}_{year}_q0.5"
        q90_col = f"{prefix}_{year}_q0.9"

        all_q10_cols.append(q10_col)
        all_q50_cols.append(q50_col)
        all_q90_cols.append(q90_col)

        # Median: latent signal + linear trend + per-period noise
        q50 = (
            base_signal
            + trend_strength * i
            + rng.normal(0, noise_level / 3, n_samples)
        )

        # Interval width: baseline + linear trend + uniform jitter,
        # floored at 0.1 so that Q10 < Q90 always holds.
        current_interval_width = (
            interval_width_base
            + interval_width_trend * i
            + rng.uniform(
                -interval_width_noise / 2, interval_width_noise / 2, n_samples
            )
        )
        current_interval_width = np.maximum(0.1, current_interval_width)

        data_dict[q10_col] = q50 - current_interval_width / 2
        data_dict[q50_col] = q50
        data_dict[q90_col] = q50 + current_interval_width / 2

    # Independent copies so the dict and the convenience lists never alias.
    quantile_cols_dict = {
        "q0.1": list(all_q10_cols),
        "q0.5": list(all_q50_cols),
        "q0.9": list(all_q90_cols),
    }

    df = pd.DataFrame(data_dict)

    # --- Anomaly injection (first period only) ---
    if anomaly_frac > 0 and n_samples > 0:
        n_anomalies = int(anomaly_frac * n_samples)
        if n_anomalies > 0 and all_q10_cols and all_q90_cols:
            anomaly_indices = rng.choice(
                n_samples, size=n_anomalies, replace=False
            )
            # First half falls below Q10, the remainder above Q90.
            n_under = n_anomalies // 2
            under_indices = anomaly_indices[:n_under]
            over_indices = anomaly_indices[n_under:]

            q10_first = df[all_q10_cols[0]].iloc[under_indices]
            q90_first = df[all_q90_cols[0]].iloc[over_indices]

            # Offset size scales with the baseline interval width.
            offset_scale = interval_width_base / 2 + 1

            df.loc[under_indices, actual_col_name] = q10_first - rng.uniform(
                0.5, 3.0, size=len(under_indices)
            ) * offset_scale

            df.loc[over_indices, actual_col_name] = q90_first + rng.uniform(
                0.5, 3.0, size=len(over_indices)
            ) * offset_scale

    # Define final column order: features, actual, then Q10/Q50/Q90
    # grouped period by period.
    feature_names = ["location_id", "longitude", "latitude", "elevation"]
    target_names = [actual_col_name]
    pred_cols_sorted = [
        col
        for triple in zip(all_q10_cols, all_q50_cols, all_q90_cols)
        for col in triple
    ]
    ordered_cols = feature_names + target_names + pred_cols_sorted
    df = df[ordered_cols]

    # --- Return based on as_frame ---
    if as_frame:
        return df

    # Uniform float64 arrays; ``data`` spans every column of the frame.
    target_array = df[target_names[0]].to_numpy(dtype=np.float64, copy=True)
    data_array = df[ordered_cols].to_numpy(dtype=np.float64, copy=True)

    # Create detailed description string
    descr = textwrap.dedent(
        f"""\
        Synthetic Multi-Period Uncertainty Dataset for k-diagram

        **Description:**
        This dataset simulates quantile forecasts (Q10, Q50, Q90) for a
        single variable ('{prefix}') over {n_periods} consecutive time periods
        (starting from {start_year}) across {n_samples} independent samples or
        locations. It includes simulated spatial coordinates and an
        auxiliary feature ('elevation'). An 'actual' value column
        (``{actual_col_name}``) corresponding to the *first* time
        period is provided, with ~{anomaly_frac*100:.0f}% of these values
        artificially placed outside the first period's Q10-Q90 interval
        to simulate prediction anomalies.

        The Q50 predictions follow a base signal with added noise and a
        linear trend controlled by `trend_strength`. The prediction
        interval width (Q90-Q10) also includes baseline width, noise,
        and a linear trend controlled by `interval_width_trend`.

        **Generation Parameters:**
        - n_samples             : {n_samples}
        - n_periods             : {n_periods}
        - start_year            : {start_year}
        - prefix                : '{prefix}'
        - anomaly_frac          : {anomaly_frac:.2f}
        - base_value            : {base_value:.2f}
        - trend_strength        : {trend_strength:.2f} (for Q50)
        - noise_level           : {noise_level:.2f} (added to Q50/actual)
        - interval_width_base   : {interval_width_base:.2f}
        - interval_width_noise  : {interval_width_noise:.2f}
        - interval_width_trend  : {interval_width_trend:.2f}
        - seed                  : {seed}

        **Data Structure (Bunch object):**
        - frame           : Complete pandas DataFrame.
        - feature_names   : List of spatial/auxiliary feature column names.
        - target_names    : List containing the target column name.
        - target          : NumPy array of target ('actual') values.
        - quantile_cols   : Dict mapping quantiles ('q0.1', 'q0.5', 'q0.9')
                          to lists of column names across periods.
        - q10_cols        : Convenience list of Q10 column names.
        - q50_cols        : Convenience list of Q50 column names.
        - q90_cols        : Convenience list of Q90 column names.
        - n_periods       : Number of periods with quantile data.
        - prefix          : Prefix used for value/quantile columns.
        - DESCR           : This description.

        This dataset is ideal for testing functions like plot_model_drift,
        plot_uncertainty_drift, plot_interval_consistency,
        plot_anomaly_magnitude, plot_coverage_diagnostic, etc.
        """
    )

    # Create and return Bunch object
    return Bunch(
        frame=df,
        data=data_array,
        feature_names=feature_names,
        target_names=target_names,
        target=target_array,
        quantile_cols=quantile_cols_dict,
        q10_cols=all_q10_cols,
        q50_cols=all_q50_cols,
        q90_cols=all_q90_cols,
        n_periods=n_periods,
        prefix=prefix,
        DESCR=descr,
    )



make_uncertainty_data.__doc__ = r"""
Generate a synthetic multi-period uncertainty dataset.

Creates a compact dataset for testing `k-diagram` uncertainty
visualizations: simulated **actuals** (for the first period),
quantile predictions **Q10/Q50/Q90** over multiple periods,
controllable trends and noise, injected interval-coverage
failures (anomalies), and simple spatial features. This is
useful for coverage, calibration, drift, and consistency
diagnostics :footcite:p:`Jolliffe2012, Gneiting2007b, kouadiob2025`.

Parameters
----------
n_samples : int, default=150
    Number of rows (locations) to generate.

n_periods : int, default=4
    Number of consecutive periods (e.g., years) for which to
    generate quantiles.

anomaly_frac : float, default=0.15
    Fraction in ``[0, 1]`` of rows whose first-period actual is
    forced **outside** the Q10–Q90 interval (half under-, half
    over-prediction, up to rounding).

start_year : int, default=2022
    First period’s year used in column names.

prefix : str, default='value'
    Base prefix for generated value/quantile columns.

base_value : float, default=10.0
    Mean level for the latent signal that drives Q50.

trend_strength : float, default=1.5
    Linear trend added to Q50 by period index (lead time).

noise_level : float, default=2.0
    Standard deviation for Gaussian noise added to the latent
    signal (for Q50 and actuals).

interval_width_base : float, default=4.0
    Baseline width of the Q10–Q90 interval in the first period.

interval_width_noise : float, default=1.5
    Uniform jitter magnitude applied per row/period to the
    interval width.

interval_width_trend : float, default=0.5
    Linear trend added to interval width across periods.

seed : int or None, default=42
    NumPy RNG seed for reproducibility. If ``None``, a fresh RNG
    is used.

as_frame : bool, default=False
    If ``False``, return a :class:`~kdiagram.bunch.Bunch` with
    arrays and metadata. If ``True``, return only the pandas
    ``DataFrame``.

Returns
-------
data : :class:`~kdiagram.bunch.Bunch` or pandas.DataFrame
    If ``as_frame=False`` (default), a Bunch with:

    - ``frame`` : pandas ``DataFrame`` with spatial features,
      first-period actual, and Q10/Q50/Q90 columns by period.
    - ``feature_names`` : ``['location_id','longitude','latitude',
      'elevation']``.
    - ``target_names`` : ``[f'{prefix}_actual']``.
    - ``target`` : ``ndarray`` of actual values.
    - ``quantile_cols`` : dict mapping ``'q0.1'``, ``'q0.5'``,
      ``'q0.9'`` to lists of column names across periods.
    - ``q10_cols``, ``q50_cols``, ``q90_cols`` : convenience lists.
    - ``n_periods`` : number of generated periods.
    - ``prefix`` : the column name prefix.
    - ``DESCR`` : human-readable description.

    If ``as_frame=True``, only the pandas ``DataFrame`` is
    returned.

Raises
------
TypeError
    If numeric inputs cannot be processed.

Notes
-----
**Column naming.** Quantile columns encode the year :math:`y`
and quantile level :math:`q`:

.. math::

   \text{quantile name}
   \;\equiv\;
   \texttt{<prefix>}\_{y}\_\texttt{q}q,
   \qquad
   y \in \{\texttt{start\_year},\dots\},
   \;\; q \in \{0.1,0.5,0.9\}.

The first-period actual is stored once as
``f"{prefix}_actual"``.

**Signal and interval model.** Let period index be
:math:`t \in \{0,\dots,n\_\text{periods}-1\}` and row index
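:math:`i`. Define latent base signal :math:`s_i` (a sinusoid over
``n_samples`` evenly spaced angles :math:`\theta_i \in [0, 3\pi]`
plus noise) and Q50:

.. math::

   s_i \;=\; \texttt{base\_value}
          \;+\; 5\,\sin(\theta_i)
          \;+\; \varepsilon_i,
   \qquad
   \varepsilon_i \sim \mathcal{N}(0, \sigma^2),\;
   \sigma=\texttt{noise\_level}/2,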

.. math::

   Q50_{i,t} \;=\; s_i \;+\; t\cdot\texttt{trend\_strength}
                   \;+\; \eta_{i,t},
   \quad
   \eta_{i,t} \sim \mathcal{N}\!\big(0,
   (\texttt{noise\_level}/3)^2\big).

Interval width :math:`w_{i,t}` has baseline, trend, and jitter:

.. math::

   w_{i,t}
   \;=\;
   \max\!\Bigl(
     0.1,\,
     \texttt{interval\_width\_base}
     + t\cdot\texttt{interval\_width\_trend}
     + u_{i,t}
   \Bigr),
   \quad
   u_{i,t} \sim \mathcal{U}\!\Bigl(-\tfrac{
   \texttt{interval\_width\_noise}}{2},\,
   \tfrac{\texttt{interval\_width\_noise}}{2}\Bigr),

and

.. math::

   Q10_{i,t} \;=\; Q50_{i,t} - \tfrac{1}{2}w_{i,t},\qquad
   Q90_{i,t} \;=\; Q50_{i,t} + \tfrac{1}{2}w_{i,t}.

**Anomaly injection (first period).** For a fraction
``anomaly_frac`` of rows we enforce a coverage failure:

.. math::

   y^{\text{actual}}_{i}
   \notin
   [\,Q10_{i,0},\,Q90_{i,0}\,],

splitting under/over cases approximately evenly to aid tests of
coverage diagnostics and anomaly magnitude plots. Use this data to
study calibration vs. sharpness trade-offs
:footcite:p:`Gneiting2007b` and operational verification practice
:footcite:p:`Jolliffe2012`.

See Also
--------
kdiagram.plot.uncertainty.plot_coverage
    Aggregate empirical coverage vs. nominal levels.

kdiagram.plot.uncertainty.plot_coverage_diagnostic
    Point-wise success/failure on a polar layout.

kdiagram.plot.uncertainty.plot_interval_consistency
    Temporal stability of interval widths per location.

kdiagram.plot.uncertainty.plot_model_drift
    Lead-time trend of mean interval width.

kdiagram.plot.uncertainty.plot_anomaly_magnitude
    Where and how severely intervals fail.

Examples
--------
>>> # Return a Bunch and inspect quantile columns:
>>> 
>>> from kdiagram.datasets import make_uncertainty_data
>>> ds = make_uncertainty_data(n_samples=12, n_periods=3, seed=7)
>>> sorted(ds.quantile_cols.keys())
['q0.1', 'q0.5', 'q0.9']
>>> 
>>> # Return only a DataFrame and check column order:
>>> 
>>> df = make_uncertainty_data(as_frame=True, n_samples=5, seed=0)
>>> df.columns[:6].tolist()  # doctest: +NORMALIZE_WHITESPACE
['location_id', 'longitude', 'latitude', 'elevation',
 'value_actual', 'value_2022_q0.1']

References
----------
.. footbibliography::
"""



[docs]
def make_taylor_data(
    n_samples: int = 100,
    n_models: int = 3,
    ref_std: float = 1.0,
    corr_range: tuple[float, float] = (0.5, 0.99),
    std_range: tuple[float, float] = (0.7, 1.3),
    noise_level: float = 0.3,
    bias_level: float = 0.1,
    seed: int | None = 101,
    as_frame: bool = False,
) -> Bunch | pd.DataFrame:
    # NOTE: the user-facing docstring is attached right after this
    # definition through ``make_taylor_data.__doc__``.

    # Seeded generator when a seed is given, otherwise a fresh one.
    rng = (
        np.random.default_rng()
        if seed is None
        else np.random.default_rng(seed)
    )

    # --- Repair the sampling ranges (warn, then clip or fall back) ---
    corr_is_valid = 0 <= corr_range[0] <= corr_range[1] <= 1.0
    if not corr_is_valid:
        warnings.warn(
            "corr_range limits should ideally be between 0 and 1 for "
            "standard Taylor Diagrams. Adjusting...",
            stacklevel=2,
        )
        clipped = (max(0, corr_range[0]), min(1.0, corr_range[1]))
        # Clipping can leave the pair unordered; use the defaults then.
        corr_range = (0.5, 0.99) if clipped[0] > clipped[1] else clipped

    std_is_valid = 0 <= std_range[0] <= std_range[1]
    if not std_is_valid:
        warnings.warn(
            "std_range factors should be non-negative and min <= max."
            " Using defaults.",
            stacklevel=2,
        )
        std_range = (0.7, 1.3)

    # Without noise, only a perfect correlation can be produced.
    if noise_level <= 1e-9 and corr_range[1] < 1.0 - 1e-9:
        raise ValueError(
            "noise_level cannot be zero if target correlation < 1 is possible."
        )

    # --- Reference series: zero mean, rescaled to ref_std ---
    raw_reference = rng.normal(0, ref_std, n_samples)
    reference = raw_reference - np.mean(raw_reference)
    sample_std = np.std(reference)
    if sample_std > 1e-9:
        reference = reference * (ref_std / sample_std)
    actual_ref_std = np.std(reference)

    # --- Model series: p = a * reference + b * noise + bias ---
    model_names = [f"Model_{chr(65+i)}" for i in range(n_models)]
    predictions = []
    calculated_stds = []
    calculated_corrs = []

    for label in model_names:
        # Draw order matters for reproducibility: rho, spread factor,
        # noise vector, then bias.
        target_rho = rng.uniform(corr_range[0], corr_range[1])
        target_std_factor = rng.uniform(std_range[0], std_range[1])
        target_std = target_std_factor * actual_ref_std

        # `slope` fixes the correlated part; the remaining variance has
        # to be supplied by the independent noise term.
        slope = target_rho * target_std_factor
        residual_var = target_std**2 - (slope * actual_ref_std) ** 2

        if residual_var < -1e-9:
            warnings.warn(
                f"Model {label}: Cannot achieve target std "
                f"({target_std:.2f}) with target correlation "
                f"({target_rho:.2f}) and noise "
                f"({noise_level:.2f}). Setting b=0.",
                UserWarning,
                stacklevel=2,
            )
            noise_gain = 0
        elif noise_level <= 1e-9 and residual_var > 1e-9:
            raise ValueError(
                "noise_level cannot be zero if needed to reach target std"
            )
        else:
            noise_gain = np.sqrt(max(0, residual_var)) / max(
                noise_level, 1e-9
            )

        noise = rng.normal(0, noise_level, n_samples)
        bias = rng.uniform(-bias_level, bias_level)

        series = slope * reference + noise_gain * noise + bias
        predictions.append(series)

        # Record the statistics actually realised by this sample.
        calculated_stds.append(np.std(series))
        realised_corr = np.corrcoef(series, reference)[0, 1]
        calculated_corrs.append(np.clip(realised_corr, -1.0, 1.0))

    # --- Tabular view shared by both return flavours ---
    df = pd.DataFrame(
        {"reference": reference, **dict(zip(model_names, predictions))}
    )

    if as_frame:
        return df

    stats_df = pd.DataFrame(
        {"stddev": calculated_stds, "corrcoef": calculated_corrs},
        index=model_names,
    )

    descr = textwrap.dedent(
        f"""\
        Synthetic Taylor Diagram Data

        **Generated Parameters:**
        - n_samples    : {n_samples}
        - n_models     : {n_models}
        - ref_std      : {ref_std:.2f} (target), {actual_ref_std:.2f} (actual)
        - corr_range   : ({corr_range[0]:.2f}, {corr_range[1]:.2f}) (target)
        - std_range    : ({std_range[0]:.2f}, {std_range[1]:.2f}) (target factor)
        - noise_level  : {noise_level:.2f}
        - bias_level   : {bias_level:.2f}
        - seed         : {seed}

        **Contents (Bunch object):**
        - frame        : DataFrame with reference and prediction columns.
        - reference    : NumPy array (n_samples,) - Reference data.
        - predictions  : List of {n_models} NumPy arrays (n_samples,) - Model data.
        - model_names  : List of {n_models} strings - Model labels.
        - stats        : DataFrame with actual calculated 'stddev' and
                         'corrcoef' for each model vs reference.
        - ref_std      : Actual standard deviation of the reference data.
        - DESCR        : This description.
        """
    )

    return Bunch(
        frame=df,
        reference=reference,
        predictions=predictions,
        model_names=model_names,
        stats=stats_df,
        ref_std=actual_ref_std,
        DESCR=descr,
    )



make_taylor_data.__doc__ = r"""
Generate synthetic data for Taylor diagrams.

Taylor diagrams, introduced by :footcite:t:`Taylor2001`, summarize
correlation, standard deviation, and centered RMS difference between
model outputs and a reference. This routine creates one reference
series and several model-like series with controllable correlation
and spread, suitable for exercising plotting functions such as
:func:`~kdiagram.plot.evaluation.taylor_diagram`. Practical guidance
on verification appears in :footcite:p:`Jolliffe2012`.

Parameters
----------
n_samples : int, default=100
    Number of observations in each generated series.

n_models : int, default=3
    Number of model (prediction) series to simulate.

ref_std : float, default=1.0
    Target standard deviation for the reference series
    (mean is centered to 0).

corr_range : tuple of (float, float), default=(0.5, 0.99)
    Closed interval from which target correlations :math:`\rho`
    for models are sampled uniformly. Values should be in
    :math:`[0,1]` for standard Taylor use.

std_range : tuple of (float, float), default=(0.7, 1.3)
    Closed interval for multiplicative factors applied to the
    reference standard deviation to obtain each model’s target
    spread.

noise_level : float, default=0.3
    Standard deviation of the independent noise used to reach
    the requested spread and correlation. Must be positive if
    any target correlation is less than 1.

bias_level : float, default=0.1
    Maximum absolute bias added to each model series (uniform
    in ``[-bias_level, bias_level]``). Note that Taylor diagrams
    are insensitive to overall bias.

seed : int or None, default=101
    NumPy random seed. If ``None``, a fresh RNG is used.

as_frame : bool, default=False
    If ``False``, return a :class:`~kdiagram.bunch.Bunch` with
    arrays, names, and summary stats. If ``True``, return only
    a pandas ``DataFrame`` with columns for the reference and
    each model series.

Returns
-------
data : :class:`~kdiagram.bunch.Bunch` or pandas.DataFrame
    If ``as_frame=False`` (default), a Bunch with:

    - ``frame`` : pandas ``DataFrame`` with ``'reference'`` and
      model columns.
    - ``reference`` : ``ndarray`` of shape ``(n_samples,)``.
    - ``predictions`` : list of ``ndarray`` predictions.
    - ``model_names`` : list of model labels.
    - ``stats`` : pandas ``DataFrame`` with columns
      ``'stddev'`` and ``'corrcoef'`` vs the reference.
    - ``ref_std`` : actual standard deviation of the reference.
    - ``DESCR`` : human-readable description.

    If ``as_frame=True``, only the pandas ``DataFrame`` is
    returned.

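Raises
------
ValueError
    If ``noise_level`` is zero or negative (``<= 1e-9``) while a
    target correlation below 1 may be sampled, because independent
    noise is then required to reach the requested spread.

Warns
-----
UserWarning
    If ``corr_range`` is not an ordered pair within :math:`[0, 1]`
    (it is clipped, falling back to ``(0.5, 0.99)`` if the clipped
    pair is unordered), or if ``std_range`` is negative or unordered
    (the default ``(0.7, 1.3)`` is used instead).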

Notes
-----
**Construction.** Let the reference be :math:`r` with
:math:`\mathrm{E}[r]=0` and :math:`\mathrm{sd}(r)=\sigma_r`
(we target :math:`\sigma_r=\texttt{ref\_std}`). For model
:math:`k`, we synthesize

.. math::

   p^{(k)} \;=\; a^{(k)} r \;+\; b^{(k)} \epsilon^{(k)} \;+\; \text{bias}^{(k)},

with :math:`\epsilon^{(k)} \sim \mathcal{N}(0,\sigma_\epsilon^2)`
independent of :math:`r`, where
:math:`\sigma_\epsilon=\texttt{noise\_level}`. Ignoring bias
(centered statistics), the model spread and correlation satisfy

.. math::

   \sigma_{p}^{(k)} \;=\; \sqrt{(a^{(k)} \sigma_r)^2 + (b^{(k)} \sigma_\epsilon)^2},
   \qquad
   \rho^{(k)} \;=\; \frac{a^{(k)} \sigma_r}{\sigma_{p}^{(k)}}.

We sample a target
:math:`\rho^{(k)} \in \texttt{corr\_range}` and a target spread
factor :math:`\alpha^{(k)} \in \texttt{std\_range}`, set
:math:`\sigma_p^{(k)} = \alpha^{(k)} \sigma_r`, choose

.. math::

   a^{(k)} \;=\; \rho^{(k)} \alpha^{(k)}, \qquad
   b^{(k)} \;=\; \frac{\sqrt{\left(\sigma_p^{(k)}\right)^2 -
                           \left(a^{(k)} \sigma_r\right)^2}}
                        {\sigma_\epsilon},

and draw a small constant :math:`\text{bias}^{(k)} \in
[-\texttt{bias\_level},\texttt{bias\_level}]`. Centered Taylor
statistics are unaffected by bias. See :footcite:t:`Taylor2001`
for interpretation; broader verification context is covered in
:footcite:p:`Jolliffe2012`.

See Also
--------
kdiagram.plot.evaluation.taylor_diagram
    Flexible Taylor diagram from raw arrays or pre-computed stats.

kdiagram.plot.evaluation.plot_taylor_diagram
    Standard Taylor diagram from raw arrays.

kdiagram.plot.evaluation.plot_taylor_diagram_in
    Taylor diagram with background shading.

Examples
--------
>>> # Get arrays and stats as a Bunch:
>>> 
>>> from kdiagram.datasets import make_taylor_data
>>> ds = make_taylor_data(n_models=2, seed=0)
>>> list(ds.frame.columns)
['reference', 'Model_A', 'Model_B']
>>> set(ds.stats.columns) == {'stddev', 'corrcoef'}
True
>>> 
>>> # Return only a DataFrame:
>>> 
>>> df = make_taylor_data(as_frame=True, seed=1)
>>> 'reference' in df.columns
True

References
----------
.. footbibliography::
"""



[docs]
def make_multi_model_quantile_data(
    n_samples: int = 100,
    n_models: int = 3,
    quantiles: list[float] | None = None,
    prefix: str = "pred",
    model_names: list[str] | None = None,
    true_mean: float = 50.0,
    true_std: float = 10.0,
    bias_range: tuple[float, float] = (-2.0, 2.0),
    width_range: tuple[float, float] = (5.0, 15.0),
    noise_level: float = 1.0,
    seed: int | None = 202,
    as_frame: bool = False,
) -> Bunch | pd.DataFrame:
    # NOTE: the user-facing docstring is attached right after this
    # definition through ``make_multi_model_quantile_data.__doc__``.
    #
    # Summary: draw ``y_true`` and two unrelated features, then for each
    # model build a noisy, biased median and place the other quantiles
    # around it so that the lowest and highest requested levels are
    # about one sampled ``width`` apart. Returns the DataFrame when
    # ``as_frame`` is true, otherwise a Bunch wrapping it with metadata.
    # Raises ValueError for a missing median, invalid ranges, or a
    # ``model_names`` length that differs from ``n_models``.

    # --- Input validation ---
    if quantiles is None:
        quantiles = [0.1, 0.5, 0.9]
    if 0.5 not in quantiles:
        # The median is the anchor every other quantile is offset from.
        raise ValueError("The `quantiles` list must contain 0.5 (median).")

    if not width_range[0] <= width_range[1] or width_range[0] < 0:
        raise ValueError(
            "width_range must be (min, max) with min >= 0 and min <= max."
        )
    if not bias_range[0] <= bias_range[1]:
        raise ValueError("bias_range must be (min, max) with min <= max.")

    if model_names is None:
        model_names_list = [f"Model_{chr(65+i)}" for i in range(n_models)]
    elif len(model_names) != n_models:
        raise ValueError(
            f"Length of model_names ({len(model_names)}) must "
            f"match n_models ({n_models})."
        )
    else:
        model_names_list = list(model_names)

    # --- Setup ---
    # ``default_rng(None)`` yields a fresh, unseeded generator.
    rng = np.random.default_rng(seed)

    quantiles_sorted = sorted(set(quantiles))
    q_median = 0.5
    # Distance between the extreme levels; zero only when the median is
    # the sole quantile, in which case no offset is ever computed.
    q_span = quantiles_sorted[-1] - quantiles_sorted[0]

    # --- Data generation (draw order is fixed for reproducibility) ---
    y_true = rng.normal(true_mean, true_std, n_samples)
    feature_1 = rng.uniform(0, 1, n_samples)
    feature_2 = rng.normal(5, 2, n_samples)

    data_dict = {
        "y_true": y_true,
        "feature_1": feature_1,
        "feature_2": feature_2,
    }

    prediction_columns_dict = {name: [] for name in model_names_list}

    for model_name in model_names_list:
        model_bias = rng.uniform(bias_range[0], bias_range[1])
        model_width = rng.uniform(width_range[0], width_range[1])

        q50_pred = y_true + model_bias + rng.normal(0, noise_level, n_samples)
        raw_quantiles = {q_median: q50_pred}

        for q in quantiles_sorted:
            if q == q_median:
                continue
            # Offset proportional to the distance from the median and
            # normalised by the full span of levels, so that
            # Q(q_max) - Q(q_min) equals ``model_width`` before noise.
            # (The previous formula divided by a Q10-Q90 scale factor
            # and doubled the result, giving twice the documented width
            # for the defaults and narrower intervals for wider levels.)
            quantile_offset = model_width * (q - q_median) / q_span
            raw_quantiles[q] = (
                q50_pred
                + quantile_offset
                # Bounds get half the noise of the median.
                + rng.normal(0, noise_level / 2, n_samples)
            )

        # Noise can make neighbouring quantiles cross; sorting each row
        # restores a monotone set of quantile values.
        ordered_values = np.sort(
            np.column_stack([raw_quantiles[q] for q in quantiles_sorted]),
            axis=1,
        )

        for k, q in enumerate(quantiles_sorted):
            # ``:g`` keeps every significant digit of the level (0.025
            # stays 0.025) while still printing 0.1, 0.5, 0.9 as before.
            col_name = f"{prefix}_{model_name}_q{q:g}"
            data_dict[col_name] = ordered_values[:, k]
            # Registered in ascending quantile order.
            if col_name not in prediction_columns_dict[model_name]:
                prediction_columns_dict[model_name].append(col_name)

    df = pd.DataFrame(data_dict)

    # --- Column order: target, features, then predictions ---
    feature_names = ["feature_1", "feature_2"]
    target_name = ["y_true"]
    # Select prediction columns by exclusion rather than by prefix match,
    # so a prefix such as "feature" or "y" cannot pull in base columns.
    base_columns = set(target_name + feature_names)
    pred_cols_sorted = sorted(
        col for col in df.columns if col not in base_columns
    )
    df = df[target_name + feature_names + pred_cols_sorted]

    if as_frame:
        return df

    # --- Bunch assembly ---
    data_numeric_cols = feature_names + pred_cols_sorted
    data_array = df[data_numeric_cols].values
    target_array = df[target_name[0]].values

    descr = textwrap.dedent(
        f"""\
        Synthetic Multi-Model Quantile Dataset for k-diagram

        **Generated Parameters:**
        - n_samples    : {n_samples}
        - n_models     : {n_models}
        - quantiles    : {quantiles_sorted}
        - prefix       : {prefix}
        - true_mean    : {true_mean:.2f}
        - true_std     : {true_std:.2f}
        - bias_range   : {bias_range}
        - width_range  : {width_range}
        - noise_level  : {noise_level:.2f}
        - seed         : {seed}

        **Data Structure (Bunch object):**
        - frame           : Complete pandas DataFrame.
        - data            : NumPy array of numeric feature & prediction columns.
        - feature_names   : List of auxiliary feature column names.
        - target_names    : List containing the target column name ('y_true').
        - target          : NumPy array of 'y_true' values.
        - model_names     : List of simulated model names.
        - quantile_levels : Sorted list of quantile levels generated.
        - prediction_columns : Dict mapping model names to their column names.
        - prefix          : Prefix used for prediction columns.
        - DESCR           : This description.

        This dataset simulates quantile predictions from {n_models} models
        for a single time point, allowing comparison of their
        uncertainty characteristics.
        """
    )

    return Bunch(
        frame=df,
        data=data_array,
        feature_names=feature_names,
        target_names=target_name,
        target=target_array,
        model_names=model_names_list,
        quantile_levels=quantiles_sorted,
        prediction_columns=prediction_columns_dict,
        prefix=prefix,
        DESCR=descr,
    )



make_multi_model_quantile_data.__doc__ = r"""
Generate multi-model quantile forecast data for a single horizon.

Simulates a target variable :math:`y_{\text{true}}` and
quantile predictions (e.g., Q10/Q50/Q90) from several models
for the **same** forecast time. Each model can have its own
systematic bias and characteristic interval width, enabling
reproducible examples for coverage/calibration and cross-model
comparisons :footcite:p:`Gneiting2007b, Jolliffe2012`.

Parameters
----------
n_samples : int, default=100
    Number of rows (independent samples/locations).

n_models : int, default=3
    Number of simulated models providing quantile forecasts.

quantiles : list of float, default=[0.1, 0.5, 0.9]
    Quantile levels in ``(0, 1)`` to generate for **each** model.
    Must include ``0.5`` (the median). The list is de-duplicated
    and sorted internally.

prefix : str, default='pred'
    Base prefix for prediction columns. Final names follow
    ``{prefix}_{model_name}_q{quantile}``.

model_names : list of str, optional
    Custom model names of length ``n_models``. If ``None``,
    ``'Model_A'``, ``'Model_B'``, … are generated.

true_mean : float, default=50.0
    Mean of the Normal distribution used to draw ``y_true``.

true_std : float, default=10.0
    Standard deviation of the Normal distribution for ``y_true``.

bias_range : tuple of (float, float), default=(-2.0, 2.0)
    Uniform range from which a model-specific bias for Q50 is
    sampled and added to ``y_true``.

width_range : tuple of (float, float), default=(5.0, 15.0)
    Uniform range for the target **overall** interval width
    (e.g., Q90–Q10) of each model.

noise_level : float, default=1.0
    Standard deviation of the independent Gaussian noise added to
    the median (Q50) series. Every other quantile series receives
    additional independent noise with half this standard deviation.

seed : int or None, default=202
    NumPy RNG seed (``default_rng``). If ``None``, a fresh RNG is used.

as_frame : bool, default=False
    If ``False``, return a :class:`~kdiagram.bunch.Bunch` with
    arrays/metadata; if ``True``, return only the pandas ``DataFrame``.

Returns
-------
data : :class:`~kdiagram.bunch.Bunch` or pandas.DataFrame
    If ``as_frame=False`` (default), a Bunch with:

    - ``frame`` : pandas ``DataFrame`` of shape
      ``(n_samples, 3 + n_models * n_quantiles)`` containing
      ``'y_true'``, two auxiliary features, and all quantile columns.
    - ``data`` : ``ndarray`` with numeric feature + prediction columns.
    - ``feature_names`` : ``['feature_1', 'feature_2']``.
    - ``target_names`` : ``['y_true']``.
    - ``target`` : ``ndarray`` of ``y_true`` values.
    - ``model_names`` : list of model labels.
    - ``quantile_levels`` : sorted list of unique quantiles.
    - ``prediction_columns`` : dict mapping each model name to its
      list of quantile column names.
    - ``prefix`` : the column prefix.
    - ``DESCR`` : human-readable description.

    If ``as_frame=True``, only the pandas ``DataFrame`` is returned.

Raises
------
ValueError
    If ``0.5`` is not in ``quantiles``, if name/range lengths are
    inconsistent, or if ranges are invalid.

TypeError
    If non-numeric inputs prevent computation.

Notes
-----
**Generation model.** Draw the truth as
:math:`y_{\text{true}} \sim \mathcal{N}(\mu, \sigma^2)` with
``mu=true_mean`` and ``sigma=true_std``. For model :math:`m`, let
:math:`b^{(m)}` be a sampled bias
and :math:`W^{(m)}` a sampled overall width (e.g., Q90–Q10). The
median prediction (Q50) is

.. math::

   q_{0.5}^{(m)} \;=\; y_{\text{true}} \;+\; b^{(m)} \;+\;
   \varepsilon^{(m)}, \qquad
   \varepsilon^{(m)} \sim \mathcal{N}(0, \sigma_\varepsilon^2),

with ``sigma_ε = noise_level``. Other quantiles are created by
adding offsets proportional to their distance from the median and
scaled so that the extreme quantiles span approximately
:math:`W^{(m)}`; small independent noise is then added. Finally, for
each row we sort the model’s quantile values to enforce
:math:`q_{\alpha} \le q_{0.5} \le q_{\beta}` (e.g., Q10 ≤ Q50 ≤ Q90),
which is useful for coverage and calibration diagnostics
:footcite:p:`Gneiting2007b, Jolliffe2012`.

Two auxiliary columns (``feature_1``, ``feature_2``) are included
for convenience in examples; they do not influence the simulated
target or quantiles.

See Also
--------
make_uncertainty_data
    Temporal multi-period quantiles with drift/consistency controls.
make_taylor_data
    Synthetic data tailored for Taylor diagram evaluation.
kdiagram.plot.uncertainty.plot_coverage
    Aggregate empirical coverage vs nominal.
kdiagram.plot.uncertainty.plot_temporal_uncertainty
    General polar visualization for multiple series.

Examples
--------
>>> # As a Bunch with metadata:
>>> 
>>> from kdiagram.datasets import make_multi_model_quantile_data
>>> ds = make_multi_model_quantile_data(n_samples=50, n_models=2, seed=1)
>>> ds.model_names
['Model_A', 'Model_B']
>>> sorted(ds.quantile_levels)
[0.1, 0.5, 0.9]
>>> ds.prediction_columns['Model_A'][:3]  # doctest: +ELLIPSIS
['pred_Model_A_q0.1', 'pred_Model_A_q0.5', 'pred_Model_A_q0.9']
>>> 
>>> # As a DataFrame:
>>> 
>>> df = make_multi_model_quantile_data(as_frame=True, seed=2)
>>> set(['y_true','feature_1','feature_2']).issubset(df.columns)
True

References
----------
.. footbibliography::
"""