Source code for kdiagram.datasets.make

# Author: LKouadio <etanoyau@gmail.com>
# License: Apache License 2.0 (see LICENSE file)

"""
Dataset Generation Utilities (:mod:`kdiagram.datasets.make`)
============================================================

This module provides functions to create synthetic datasets tailored
for demonstrating and testing the various plotting functions within
the `k-diagram` package, particularly those focused on uncertainty.
"""

from __future__ import annotations

import re
import textwrap
import warnings
from typing import Any, Callable

import numpy as np
import pandas as pd

from ..api.bunch import Bunch

# Names exported by ``from kdiagram.datasets.make import *``.
# NOTE(review): ``make_fingerprint_data`` is defined in this module but is
# not listed here -- confirm whether that omission is intentional.
__all__ = [
    "make_uncertainty_data",
    "make_taylor_data",
    "make_multi_model_quantile_data",
    "make_cyclical_data",
    "make_regression_data",
    "make_classification_data",
]



[docs]
def make_cyclical_data(
    n_samples: int = 365,
    n_series: int = 2,
    cycle_period: float = 365,
    noise_level: float = 0.5,
    amplitude_true: float = 10.0,
    offset_true: float = 20.0,
    pred_bias: float | list[float] | None = None,
    pred_noise_factor: float | list[float] | None = None,
    pred_amplitude_factor: float | list[float] | None = None,
    pred_phase_shift: float | list[float] | None = None,
    prefix: str = "model",
    series_names: list[str] | None = None,
    seed: int | None = 404,
    as_frame: bool = False,
) -> Bunch | pd.DataFrame:
    """Generate a noisy sine "truth" plus ``n_series`` prediction series.

    The true signal is ``offset_true + amplitude_true * sin(omega * t)``
    plus Gaussian noise, with ``omega = 2 * pi / cycle_period``. Each
    prediction series reuses that model with its own additive bias,
    amplitude factor, phase shift and noise factor.

    Parameters
    ----------
    n_samples, n_series, cycle_period, noise_level, amplitude_true,
    offset_true
        Size and shape of the true signal (see formula above).
    pred_bias, pred_noise_factor, pred_amplitude_factor, pred_phase_shift
        Per-series settings. A scalar is broadcast to every series; a
        list or tuple must have exactly ``n_series`` entries. When left
        as ``None`` the two-element default pattern (``[0, 1.5]``,
        ``[1.0, 1.5]``, ``[1.0, 0.8]``, ``[0, pi / 6]`` respectively) is
        repeated cyclically up to ``n_series`` entries, so omitted
        parameters work for any number of series.
    prefix, series_names
        Prediction column names: either ``series_names`` (length
        ``n_series``) or ``f"{prefix}_A"``, ``f"{prefix}_B"``, ...
    seed
        Seed for ``numpy.random.default_rng``; ``None`` gives a fresh RNG.
    as_frame
        Return only the DataFrame instead of a ``Bunch``.

    Returns
    -------
    Bunch or pandas.DataFrame
        The frame has columns ``y_true``, ``time_step`` and then the
        prediction columns. The Bunch additionally carries ``data``
        (``time_step`` + predictions as float64), ``target``, name lists
        and ``DESCR``.

    Raises
    ------
    ValueError
        If a per-series sequence or ``series_names`` does not have
        ``n_series`` entries.
    TypeError
        If a per-series parameter is neither a number nor a list/tuple.
    """
    rng = np.random.default_rng(seed)

    # Default patterns used when a parameter is omitted. They are tiled to
    # ``n_series`` so that e.g. ``n_series=3`` works without having to pass
    # all four parameters explicitly; for ``n_series == 2`` the result is
    # exactly the historical default.
    default_patterns = {
        "pred_bias": (0, 1.5),
        "pred_noise_factor": (1.0, 1.5),
        "pred_amplitude_factor": (1.0, 0.8),
        "pred_phase_shift": (0, np.pi / 6),
    }
    params_to_check = {
        "pred_bias": pred_bias,
        "pred_noise_factor": pred_noise_factor,
        "pred_amplitude_factor": pred_amplitude_factor,
        "pred_phase_shift": pred_phase_shift,
    }
    # NumPy scalars are accepted alongside plain Python numbers.
    scalar_types = (int, float, np.integer, np.floating)

    processed_params = {}
    for name, param in params_to_check.items():
        if param is None:
            pattern = default_patterns[name]
            processed_params[name] = [
                pattern[i % len(pattern)] for i in range(n_series)
            ]
        elif isinstance(param, scalar_types):
            processed_params[name] = [param] * n_series
        elif isinstance(param, (list, tuple)):
            if len(param) != n_series:
                raise ValueError(
                    f"Length of '{name}' ({len(param)}) must match "
                    f"n_series ({n_series})."
                )
            processed_params[name] = list(param)
        else:
            raise TypeError(f"'{name}' must be float or list of floats.")

    # --- Generate Time Step and True Signal ---
    time_step = np.arange(n_samples)
    omega = 2 * np.pi / cycle_period  # angular frequency per time step
    theta = omega * time_step

    # The noise for y_true is drawn first; keeping this order preserves the
    # values produced for a given seed.
    y_true = (
        offset_true
        + amplitude_true * np.sin(theta)
        + rng.normal(0, noise_level, n_samples)
    )

    data_dict = {"time_step": time_step, "y_true": y_true}

    # --- Generate Model Names & Prediction Columns ---
    if series_names is None:
        series_names_list = [
            f"{prefix}_{chr(65 + i)}" for i in range(n_series)
        ]
    elif len(series_names) != n_series:
        raise ValueError(
            f"Length of series_names ({len(series_names)}) must "
            f"match n_series ({n_series})."
        )
    else:
        series_names_list = list(series_names)

    prediction_cols_list = []

    for i, col_name in enumerate(series_names_list):
        prediction_cols_list.append(col_name)

        amp = amplitude_true * processed_params["pred_amplitude_factor"][i]
        bias = processed_params["pred_bias"][i]
        noise = noise_level * processed_params["pred_noise_factor"][i]
        phase = processed_params["pred_phase_shift"][i]

        data_dict[col_name] = (
            offset_true
            + bias
            + amp * np.sin(theta + phase)
            + rng.normal(0, noise, n_samples)
        )

    # --- Create DataFrame ---
    df = pd.DataFrame(data_dict)

    feature_names = ["time_step"]
    target_name = ["y_true"]
    ordered_cols = target_name + feature_names + prediction_cols_list

    # --- Return based on as_frame ---
    if as_frame:
        return df[ordered_cols]

    descr = textwrap.dedent(
        f"""\
        Synthetic Cyclical Pattern Data for k-diagram

        **Description:**
        Simulates a dataset with a primary 'true' cyclical signal and
        {n_series} related prediction series over {n_samples} time steps.
        The true signal is a sine wave with added noise. Prediction
        series are generated based on the true signal but may include
        systematic bias, different amplitude scaling, phase shifts (lag/lead),
        and varying noise levels, according to the specified parameters.

        **Generation Parameters:**
        - n_samples             : {n_samples}
        - n_series              : {n_series}
        - cycle_period          : {cycle_period}
        - noise_level           : {noise_level:.2f} (base for y_true)
        - amplitude_true        : {amplitude_true:.2f}
        - offset_true           : {offset_true:.2f}
        - pred_bias             : {processed_params["pred_bias"]}
        - pred_noise_factor     : {processed_params["pred_noise_factor"]}
        - pred_amplitude_factor : {processed_params["pred_amplitude_factor"]}
        - pred_phase_shift      : {processed_params["pred_phase_shift"]} (radians)
        - prefix                : '{prefix}'
        - seed                  : {seed}

        **Data Structure (Bunch object):**
        - frame           : Complete pandas DataFrame.
        - feature_names   : List of feature column names (['time_step']).
        - target_names    : List containing the target column name (['y_true']).
        - target          : NumPy array of 'y_true' values.
        - series_names    : List of prediction series names.
        - prediction_columns: List of prediction column names in the frame.
        - DESCR           : This description.

        This dataset is suitable for visualizing relationships or temporal
        patterns in a polar context using functions like plot_relationship
        or plot_temporal_uncertainty.
        """
    )

    # Build arrays with a uniform dtype to avoid pandas -> np.find_common_type
    num_cols = feature_names + prediction_cols_list

    target_array = df[target_name[0]].to_numpy(dtype=np.float64, copy=True)
    data_array = df[num_cols].to_numpy(dtype=np.float64, copy=True)

    return Bunch(
        frame=df[ordered_cols],
        data=data_array,
        feature_names=feature_names,
        target_names=target_name,
        target=target_array,
        series_names=series_names_list,
        prediction_columns=prediction_cols_list,
        DESCR=descr,
    )



make_cyclical_data.__doc__ = r"""
Generate synthetic cyclical data for relationship and temporal plots.

Creates a dataset with a single **true** cyclical signal and one or
more **prediction** series that can differ in amplitude, phase, bias,
and noise relative to the truth. This is useful for demos of
polar relationship and temporal-uncertainty plots in `k-diagram`
:footcite:p:`harris2020array, 2020SciPy-NMeth, Hunter:2007`.

This data is useful for demonstrating and testing functions like
:func:`~kdiagram.plot.relationship.plot_relationship` or
:func:`~kdiagram.plot.uncertainty.plot_temporal_uncertainty` where
visualizing behavior over a cycle is important.

Parameters
----------
n_samples : int, default=365
    Number of time steps to generate. Interpreted as evenly
    spaced samples over one or more cycles.

n_series : int, default=2
    Number of simulated prediction series (e.g., models).

cycle_period : float, default=365
    Samples per full cycle :math:`P`. The angular frequency is
    :math:`\omega = 2\pi / P`. Use ``365`` for daily data over
    one year, ``12`` for monthly data over one year, etc.

noise_level : float, default=0.5
    Standard deviation of Gaussian noise added to the **true**
    signal. Prediction series scale this by ``pred_noise_factor``.

amplitude_true : float, default=10.0
    Amplitude of the sinusoidal **true** signal.

offset_true : float, default=20.0
    Vertical offset (mean level) of the **true** signal.

pred_bias : float or list of float, optional
    Additive bias for each prediction series. If a scalar is
    provided it is broadcast to all ``n_series``. If a list is
    provided, its length must equal ``n_series``. Defaults to
    ``[0.0, 1.5]`` when ``None``.

pred_noise_factor : float or list of float, optional
    Multiplier for ``noise_level`` per series. Scalar values are
    broadcast; lists must match ``n_series`` in length. Defaults
    to ``[1.0, 1.5]`` when ``None``.

pred_amplitude_factor : float or list of float, optional
    Multiplier of ``amplitude_true`` per series (allows under/
    over-estimation of the cycle amplitude). Scalar broadcast is
    supported. Defaults to ``[1.0, 0.8]`` when ``None``.

pred_phase_shift : float or list of float, optional
    Phase shift (radians) added to each series. Positive values
    make the prediction lead the truth (its peaks occur earlier);
    negative values produce a lag. Scalar broadcast is supported.
    Defaults to ``[0.0, np.pi / 6]`` when ``None``.

prefix : str, default='model'
    Prefix used to generate prediction column names, e.g.,
    ``model_A``, ``model_B``, …

series_names : list of str, optional
    Explicit names for prediction columns. If omitted, names are
    generated from ``prefix`` as ``prefix_A``, ``prefix_B``, …
    Must have length ``n_series`` if provided.

seed : int or None, default=404
    Seed for NumPy’s random generator. If ``None``, a fresh RNG
    is used.

as_frame : bool, default=False
    If ``False``, return a :class:`~kdiagram.bunch.Bunch` with
    metadata and arrays. If ``True``, return only the pandas
    ``DataFrame``.

Returns
-------
data : :class:`~kdiagram.bunch.Bunch` or pandas.DataFrame
    If ``as_frame=False`` (default), a Bunch with:

    - ``frame`` : pandas ``DataFrame`` containing ``'time_step'``,
      ``'y_true'``, and prediction columns.
    - ``feature_names`` : ``['time_step']``.
    - ``target_names`` : ``['y_true']``.
    - ``target`` : ``ndarray`` of shape ``(n_samples,)`` with the
      true signal.
    - ``series_names`` : list of prediction series names.
    - ``prediction_columns`` : list of prediction column names.
    - ``DESCR`` : human-readable description.

    If ``as_frame=True``, only the pandas ``DataFrame`` is
    returned.

Raises
------
ValueError
    If a provided list for prediction parameters does not match
    ``n_series`` in length.

TypeError
    If prediction parameters are not float or list of float.

Notes
-----
**Signal model.** Let :math:`P` be the cycle period and
:math:`\omega = 2\pi/P`. The **true** signal at time step
:math:`t \in \{0,\dots,n\_samples-1\}` is

.. math::

   y_{\text{true}}(t)
   \;=\;
   \texttt{offset\_true}
   \;+\;
   \texttt{amplitude\_true}\,\sin(\omega t)
   \;+\;
   \varepsilon_t,
   \qquad
   \varepsilon_t \sim \mathcal{N}(0,\sigma^2),
   \;\; \sigma=\texttt{noise\_level}.

For series :math:`k=1,\dots,n\_{\text{series}}`, the prediction is

.. math::

   y_{\text{pred}}^{(k)}(t)
   \;=\;
   \texttt{offset\_true}
   \;+\;
   b_k
   \;+\;
   \big(\texttt{amplitude\_true}\,\alpha_k\big)
   \sin(\omega t + \phi_k)
   \;+\;
   \eta^{(k)}_t,

with :math:`\eta^{(k)}_t \sim \mathcal{N}\!\big(0,\,
(\sigma\,\gamma_k)^2\big)`.
Here :math:`b_k` is the bias (``pred_bias``),
:math:`\alpha_k` the amplitude factor (``pred_amplitude_factor``),
:math:`\phi_k` the phase shift (``pred_phase_shift``), and
:math:`\gamma_k` the noise factor (``pred_noise_factor``).
Numerical generation and plotting typically rely on array/scientific
and graphics stacks :footcite:p:`harris2020array, 2020SciPy-NMeth, Hunter:2007`.

See Also
--------
kdiagram.plot.relationship.plot_relationship
    Polar relationship scatter for true vs. predictions.

kdiagram.plot.uncertainty.plot_temporal_uncertainty
    General-purpose polar series plot; useful for Q10/Q50/Q90 and
    cyclical visualizations.

Examples
--------
Generate a small cyclical dataset as a Bunch:

>>> from kdiagram.datasets import make_cyclical_data
>>> ds = make_cyclical_data(
...     n_samples=24, n_series=2, cycle_period=12, seed=7
... )
>>> ds.frame.head().columns.tolist()[:3]
['y_true', 'time_step', 'model_A']

Return only a DataFrame and supply custom names (scalar prediction
parameters are broadcast to every series):

>>> df = make_cyclical_data(
...     n_samples=50,
...     n_series=3,
...     pred_bias=0.5,
...     pred_noise_factor=1.0,
...     pred_amplitude_factor=1.0,
...     pred_phase_shift=0.0,
...     series_names=['A','B','C'],
...     as_frame=True,
...     seed=1
... )
>>> set(['time_step','y_true']).issubset(df.columns)
True

References 
----------

.. footbibliography::
    
"""



[docs]
def make_fingerprint_data(
    n_layers: int = 3,
    n_features: int = 8,
    layer_names: list[str] | None = None,
    feature_names: list[str] | None = None,
    value_range: tuple[float, float] = (0.0, 1.0),
    sparsity: float = 0.1,
    add_structure: bool = True,
    seed: int | None = 303,
    as_frame: bool = False,
) -> Bunch | pd.DataFrame:
    """Generate a synthetic ``(n_layers, n_features)`` importance matrix.

    Scores are drawn uniformly from ``value_range``. With
    ``add_structure`` (and more than one layer and feature), layer ``i``
    gets feature ``i % n_features`` emphasized and feature
    ``(i + n_features // 2) % n_features`` de-emphasized. Finally, each
    entry is independently set to zero with probability ``sparsity``.

    All structure bounds are expressed relative to ``min_val`` and the
    range width ``span = max_val - min_val``:

    - emphasized   : uniform in ``[min + span / 1.5, min + 1.1 * span]``
    - de-emphasized: uniform in ``[min, min + span / 2.5]``
    - final clip   : ``[min, min + 1.2 * span]``

    For ranges starting at 0 this is identical to scaling ``max_val``
    directly; for any other range (offset or negative) it keeps the
    emphasized/de-emphasized values inside the intended band instead of
    producing inverted bounds or clipping ordinary samples.

    Parameters
    ----------
    n_layers, n_features
        Matrix dimensions (rows are layers, columns are features).
    layer_names, feature_names
        Optional labels; generated as ``Layer_A``... / ``Feature_1``...
        when omitted. Lengths must match the dimensions.
    value_range
        Tuple ``(min_val, max_val)`` with ``min_val <= max_val``.
    sparsity
        Probability in ``[0, 1]`` that an entry is zeroed.
    add_structure
        Whether to inject the per-layer emphasis pattern.
    seed
        Seed for ``numpy.random.default_rng``; ``None`` gives a fresh RNG.
    as_frame
        Return only the DataFrame instead of a ``Bunch``.

    Returns
    -------
    Bunch or pandas.DataFrame
        DataFrame indexed by layer with feature columns, or a Bunch with
        ``importances``, ``frame``, ``layer_names``, ``feature_names``
        and ``DESCR``.

    Raises
    ------
    ValueError
        On an invalid ``sparsity`` or ``value_range``, or when a name
        list does not match its dimension.
    """
    # --- Input Validation & Setup ---
    if not (0.0 <= sparsity <= 1.0):
        raise ValueError("sparsity must be between 0.0 and 1.0")
    if not (
        isinstance(value_range, tuple)
        and len(value_range) == 2
        and value_range[0] <= value_range[1]
    ):
        raise ValueError(
            "value_range must be a tuple (min, max) with min <= max."
        )

    rng = np.random.default_rng(seed)

    if feature_names is None:
        feature_names = [f"Feature_{i + 1}" for i in range(n_features)]
    elif len(feature_names) != n_features:
        raise ValueError(
            f"Length of feature_names ({len(feature_names)}) "
            f"must match n_features ({n_features})."
        )

    if layer_names is None:
        layer_names = [f"Layer_{chr(65 + i)}" for i in range(n_layers)]
    elif len(layer_names) != n_layers:
        raise ValueError(
            f"Length of layer_names ({len(layer_names)}) "
            f"must match n_layers ({n_layers})."
        )

    # --- Generate Importance Matrix ---
    min_val, max_val = value_range
    span = max_val - min_val
    importances = rng.uniform(min_val, max_val, size=(n_layers, n_features))

    if add_structure and n_layers > 1 and n_features > 1:
        # Bounds are anchored at min_val and scaled by the span so that they
        # stay ordered (low <= high) for any valid value_range.
        emph_low = min_val + span / 1.5
        emph_high = min_val + span * 1.1  # may slightly exceed max_val
        deemph_low = min_val
        deemph_high = min_val + span / 2.5

        for i in range(n_layers):
            # Layer i emphasizes feature i (cycling over the features).
            emphasized_feature = i % n_features
            importances[i, emphasized_feature] = rng.uniform(
                emph_low, emph_high
            )
            # ...and de-emphasizes the feature "opposite" to it.
            deemphasized_feature = (i + n_features // 2) % n_features
            if deemphasized_feature != emphasized_feature:
                importances[i, deemphasized_feature] = rng.uniform(
                    deemph_low, deemph_high
                )
        importances = np.clip(importances, min_val, min_val + span * 1.2)

    # Introduce sparsity: each entry survives with probability 1 - sparsity.
    if sparsity > 0:
        mask = rng.choice(
            [0, 1], size=importances.shape, p=[sparsity, 1 - sparsity]
        )
        importances *= mask

    # --- Assemble DataFrame ---
    df = pd.DataFrame(importances, index=layer_names, columns=feature_names)

    if as_frame:
        return df

    structure_note = " Some basic structure was added." if add_structure else ""
    descr = textwrap.dedent(
        f"""\
        Synthetic Feature Fingerprint Data

        **Description:**
        Simulated feature importance matrix for {n_layers} layers/groups
        and {n_features} features. Values were sampled uniformly from
        the range {value_range} and approximately {sparsity * 100:.0f}% were
        randomly set to zero (sparsity).{structure_note} This dataset is suitable for use with
        plot_feature_fingerprint.

        **Generation Parameters:**
        - n_layers       : {n_layers}
        - n_features     : {n_features}
        - value_range    : {value_range}
        - sparsity       : {sparsity:.2f}
        - add_structure  : {add_structure}
        - seed           : {seed}

        **Contents (Bunch object):**
        - importances    : NumPy array ({n_layers}, {n_features}) with scores.
        - frame          : Pandas DataFrame view of importances matrix.
        - layer_names    : List of {n_layers} layer names (index).
        - feature_names  : List of {n_features} feature names (columns).
        - DESCR          : This description.
        """
    )

    return Bunch(
        importances=importances,
        frame=df,
        layer_names=list(layer_names),
        feature_names=list(feature_names),
        DESCR=descr,
    )



make_fingerprint_data.__doc__ = r"""
Generate synthetic feature-importance data for fingerprint plots.

Creates a matrix of feature-importance scores across multiple
**layers** (e.g., models, periods, experimental groups) suitable
for visualization with
:func:`~kdiagram.plot.feature_based.plot_feature_fingerprint`.
This is handy for comparing profiles in a compact polar radar
view and for testing feature-comparison workflows in forecasting
and ML :footcite:p:`scikit-learn, Lim2021, kouadiob2025`.

Parameters
----------
n_layers : int, default=3
    Number of rows (layers) in the importance matrix. Each row
    represents a group such as a model or time period.

n_features : int, default=8
    Number of columns (features) in the importance matrix.

layer_names : list of str, optional
    Names for the layers. If ``None``, generic names like
    ``'Layer_A'``, ``'Layer_B'`` are generated. Must have length
    ``n_layers`` if provided.

feature_names : list of str, optional
    Names for the features. If ``None``, generic names like
    ``'Feature_1'``, ``'Feature_2'`` are generated. Must have
    length ``n_features`` if provided.

value_range : tuple of (float, float), default=(0.0, 1.0)
    Approximate sampling range ``(min_val, max_val)`` for raw
    importance scores. Values are drawn from a uniform
    distribution before structure/sparsity are applied.

sparsity : float, default=0.1
    Fraction in ``[0, 1]`` of entries that are set to zero
    at random, simulating unimportant features for some layers.

add_structure : bool, default=True
    If ``True``, inject simple patterns to make fingerprints
    distinct, e.g., emphasizing one feature per layer and
    de-emphasizing another. If ``False``, the matrix is fully
    random apart from sparsity.

seed : int or None, default=303
    Seed for NumPy’s random generator. If ``None``, a fresh RNG
    is used.

as_frame : bool, default=False
    If ``False``, return a :class:`~kdiagram.bunch.Bunch` with
    metadata and arrays. If ``True``, return only the pandas
    ``DataFrame`` indexed by layers with feature columns.

Returns
-------
data : :class:`~kdiagram.bunch.Bunch` or pandas.DataFrame
    If ``as_frame=False`` (default), a Bunch with:

    - ``importances`` : ``ndarray`` of shape
      ``(n_layers, n_features)``.
    - ``frame`` : pandas ``DataFrame`` view of the matrix with
      layers as index and features as columns.
    - ``layer_names`` : list of layer names.
    - ``feature_names`` : list of feature names.
    - ``DESCR`` : human-readable description.

    If ``as_frame=True``, only the pandas ``DataFrame`` is
    returned.

Raises
------
ValueError
    If ``layer_names`` or ``feature_names`` lengths do not match
    the specified dimensions, if ``sparsity`` is outside
    ``[0, 1]``, or if ``value_range`` does not satisfy
    ``min_val <= max_val``.

Notes
-----
**Generation model.** Let :math:`I \in \mathbb{R}^{L \times F}`
denote the importance matrix with :math:`L = \texttt{n\_layers}`
and :math:`F = \texttt{n\_features}`. Raw scores are sampled as

.. math::

   I_{k,j}^{(0)} \sim \mathcal{U}(m, M),
   \qquad m = \texttt{value\_range[0]},\; M = \texttt{value\_range[1]}.

If structure is enabled, a layer-specific emphasis and
de-emphasis may be applied, producing :math:`I^{(1)}` (otherwise
:math:`I^{(1)} = I^{(0)}`). Finally, a sparsity mask
:math:`\;B_{k,j} \sim \text{Bernoulli}(1-s)\;` with
:math:`s=\texttt{sparsity}` is applied (written :math:`B` to avoid
a clash with the upper bound :math:`M` of the sampling range):

.. math::

   I_{k,j} \;=\; I_{k,j}^{(1)} \cdot B_{k,j}.

Scores are left in their original scale; you may normalize
per-layer or per-feature downstream if desired. For practical
feature-importance workflows and attribution in forecasting,
see :footcite:t:`scikit-learn` and :footcite:t:`Lim2021`. The
fingerprint visualization concept is part of our polar analytics
framework :footcite:t:`kouadiob2025`.

See Also
--------
kdiagram.plot.feature_based.plot_feature_fingerprint
    Radar-style comparison of multi-feature profiles across layers.

Examples
--------
Return a Bunch with arrays and a DataFrame view:

>>> from kdiagram.datasets import make_fingerprint_data
>>> fp = make_fingerprint_data(n_layers=4, n_features=10, seed=1)
>>> fp.importances.shape
(4, 10)
>>> list(fp.frame.index)[:2], list(fp.frame.columns)[:3]
(['Layer_A', 'Layer_B'], ['Feature_1', 'Feature_2', 'Feature_3'])

Return only a DataFrame with custom names:

>>> df = make_fingerprint_data(
...     n_layers=3,
...     n_features=5,
...     layer_names=['L1','L2','L3'],
...     feature_names=['f1','f2','f3','f4','f5'],
...     as_frame=True,
...     seed=2,
... )
>>> df.shape
(3, 5)

References
----------
.. footbibliography::
"""



[docs]
def make_uncertainty_data(
    n_samples: int = 150,
    n_periods: int = 4,
    anomaly_frac: float = 0.15,
    start_year: int = 2022,
    prefix: str = "value",
    base_value: float = 10.0,
    trend_strength: float = 1.5,
    noise_level: float = 2.0,
    interval_width_base: float = 4.0,
    interval_width_noise: float = 1.5,
    interval_width_trend: float = 0.5,
    seed: int | None = 42,
    as_frame: bool = False,
) -> Bunch | pd.DataFrame:
    """Build a synthetic multi-period Q10/Q50/Q90 forecast table.

    Each row carries four spatial/auxiliary features, one "actual" value
    tied to the first period, and a (Q10, Q50, Q90) triple per period.
    Q50 is a shared latent signal plus ``trend_strength * period`` plus
    noise; the interval is centred on Q50 with a width made of a base
    term, a per-period trend and uniform jitter (floored at 0.1).
    Roughly ``anomaly_frac`` of the actuals are then moved outside the
    first period's interval, the first half below Q10 and the rest
    above Q90.

    Returns the DataFrame when ``as_frame`` is true, otherwise a ``Bunch``
    bundling the frame, float64 arrays, column-name lists and ``DESCR``.
    """
    # default_rng(None) already yields a freshly seeded generator.
    rng = np.random.default_rng(seed)

    actual_col = f"{prefix}_actual"
    spatial_cols = ["location_id", "longitude", "latitude", "elevation"]

    # NOTE: the order of the random draws below determines the output for
    # a given seed and must not be rearranged.
    lon = rng.uniform(-120, -115, n_samples)
    lat = rng.uniform(33, 36, n_samples)
    elev = rng.uniform(50, 500, n_samples) + lat * 5
    latent = (
        base_value
        + np.sin(np.linspace(0, 3 * np.pi, n_samples)) * 5
        + rng.normal(0, noise_level / 2, n_samples)
    )
    observed = latent + rng.normal(0, noise_level / 2, n_samples)

    columns = {
        "location_id": np.arange(n_samples),
        "longitude": lon,
        "latitude": lat,
        "elevation": elev,
        # The actual is stored once and refers to the first period.
        actual_col: observed,
    }

    levels = ("q0.1", "q0.5", "q0.9")
    cols_by_level = {level: [] for level in levels}
    interleaved_cols = []  # q10, q50, q90 of period 0, then period 1, ...

    for step in range(n_periods):
        year = start_year + step
        lo_name, mid_name, hi_name = (
            f"{prefix}_{year}_{level}" for level in levels
        )
        for level, col in zip(levels, (lo_name, mid_name, hi_name)):
            cols_by_level[level].append(col)
        interleaved_cols.extend([lo_name, mid_name, hi_name])

        median = (
            latent
            + trend_strength * step
            + rng.normal(0, noise_level / 3, n_samples)
        )
        width = np.maximum(
            0.1,
            interval_width_base
            + interval_width_trend * step
            + rng.uniform(
                -interval_width_noise / 2, interval_width_noise / 2, n_samples
            ),
        )
        half_width = width / 2

        columns[lo_name] = median - half_width
        columns[mid_name] = median
        columns[hi_name] = median + half_width

    frame = pd.DataFrame(columns)

    # --- Inject coverage failures into the first period ---
    q10_names = cols_by_level["q0.1"]
    q90_names = cols_by_level["q0.9"]
    n_anomalies = (
        int(anomaly_frac * n_samples)
        if anomaly_frac > 0 and n_samples > 0
        else 0
    )
    if n_anomalies > 0 and q10_names and q90_names:
        picked = rng.choice(n_samples, size=n_anomalies, replace=False)
        split = n_anomalies // 2
        below, above = picked[:split], picked[split:]
        push_scale = interval_width_base / 2 + 1

        lower_ref = frame[q10_names[0]].iloc[below]
        upper_ref = frame[q90_names[0]].iloc[above]

        frame.loc[below, actual_col] = (
            lower_ref - rng.uniform(0.5, 3.0, size=len(below)) * push_scale
        )
        frame.loc[above, actual_col] = (
            upper_ref + rng.uniform(0.5, 3.0, size=len(above)) * push_scale
        )

    # Final layout: features, target, then the per-period quantile triples.
    target_names = [actual_col]
    frame = frame[spatial_cols + target_names + interleaved_cols]

    if as_frame:
        return frame

    target_array = frame[actual_col].to_numpy(dtype=np.float64, copy=True)
    data_array = frame.to_numpy(dtype=np.float64, copy=True)

    descr = textwrap.dedent(
        f"""\
        Synthetic Multi-Period Uncertainty Dataset for k-diagram

        **Description:**
        This dataset simulates quantile forecasts (Q10, Q50, Q90) for a
        single variable ('{prefix}') over {n_periods} consecutive time periods
        (starting from {start_year}) across {n_samples} independent samples or
        locations. It includes simulated spatial coordinates and an
        auxiliary feature ('elevation'). An 'actual' value column
        (``{actual_col}``) corresponding to the *first* time
        period is provided, with ~{anomaly_frac * 100:.0f}% of these values
        artificially placed outside the first period's Q10-Q90 interval
        to simulate prediction anomalies.

        The Q50 predictions follow a base signal with added noise and a
        linear trend controlled by `trend_strength`. The prediction
        interval width (Q90-Q10) also includes baseline width, noise,
        and a linear trend controlled by `interval_width_trend`.

        **Generation Parameters:**
        - n_samples             : {n_samples}
        - n_periods             : {n_periods}
        - start_year            : {start_year}
        - prefix                : '{prefix}'
        - anomaly_frac          : {anomaly_frac:.2f}
        - base_value            : {base_value:.2f}
        - trend_strength        : {trend_strength:.2f} (for Q50)
        - noise_level           : {noise_level:.2f} (added to Q50/actual)
        - interval_width_base   : {interval_width_base:.2f}
        - interval_width_noise  : {interval_width_noise:.2f}
        - interval_width_trend  : {interval_width_trend:.2f}
        - seed                  : {seed}

        **Data Structure (Bunch object):**
        - frame           : Complete pandas DataFrame.
        - feature_names   : List of spatial/auxiliary feature column names.
        - target_names    : List containing the target column name.
        - target          : NumPy array of target ('actual') values.
        - quantile_cols   : Dict mapping quantiles ('q0.1', 'q0.5', 'q0.9')
                          to lists of column names across periods.
        - q10_cols        : Convenience list of Q10 column names.
        - q50_cols        : Convenience list of Q50 column names.
        - q90_cols        : Convenience list of Q90 column names.
        - n_periods       : Number of periods with quantile data.
        - prefix          : Prefix used for value/quantile columns.
        - DESCR           : This description.

        This dataset is ideal for testing functions like plot_model_drift,
        plot_uncertainty_drift, plot_interval_consistency,
        plot_anomaly_magnitude, plot_coverage_diagnostic, etc.
        """
    )

    # The convenience lists are independent copies of the dict's lists.
    return Bunch(
        frame=frame,
        data=data_array,
        feature_names=spatial_cols,
        target_names=target_names,
        target=target_array,
        quantile_cols=cols_by_level,
        q10_cols=list(cols_by_level["q0.1"]),
        q50_cols=list(cols_by_level["q0.5"]),
        q90_cols=list(cols_by_level["q0.9"]),
        n_periods=n_periods,
        prefix=prefix,
        DESCR=descr,
    )



make_uncertainty_data.__doc__ = r"""
Generate a synthetic multi-period uncertainty dataset.

Creates a compact dataset for testing `k-diagram` uncertainty
visualizations: simulated **actuals** (for the first period),
quantile predictions **Q10/Q50/Q90** over multiple periods,
controllable trends and noise, injected interval-coverage
failures (anomalies), and simple spatial features. This is
useful for coverage, calibration, drift, and consistency
diagnostics :footcite:p:`Jolliffe2012, Gneiting2007b, kouadiob2025`.

Parameters
----------
n_samples : int, default=150
    Number of rows (locations) to generate.

n_periods : int, default=4
    Number of consecutive periods (e.g., years) for which to
    generate quantiles.

anomaly_frac : float, default=0.15
    Fraction in ``[0, 1]`` of rows whose first-period actual is
    forced **outside** the Q10–Q90 interval (half under-, half
    over-prediction, up to rounding).

start_year : int, default=2022
    First period’s year used in column names.

prefix : str, default='value'
    Base prefix for generated value/quantile columns.

base_value : float, default=10.0
    Mean level for the latent signal that drives Q50.

trend_strength : float, default=1.5
    Linear trend added to Q50 by period index (lead time).

noise_level : float, default=2.0
    Standard deviation for Gaussian noise added to the latent
    signal (for Q50 and actuals).

interval_width_base : float, default=4.0
    Baseline width of the Q10–Q90 interval in the first period.

interval_width_noise : float, default=1.5
    Uniform jitter magnitude applied per row/period to the
    interval width.

interval_width_trend : float, default=0.5
    Linear trend added to interval width across periods.

seed : int or None, default=42
    NumPy RNG seed for reproducibility. If ``None``, a fresh RNG
    is used.

as_frame : bool, default=False
    If ``False``, return a :class:`~kdiagram.bunch.Bunch` with
    arrays and metadata. If ``True``, return only the pandas
    ``DataFrame``.

Returns
-------
data : :class:`~kdiagram.bunch.Bunch` or pandas.DataFrame
    If ``as_frame=False`` (default), a Bunch with:

    - ``frame`` : pandas ``DataFrame`` with spatial features,
      first-period actual, and Q10/Q50/Q90 columns by period.
    - ``feature_names`` : ``['location_id','longitude','latitude',
      'elevation']``.
    - ``target_names`` : ``[f'{prefix}_actual']``.
    - ``target`` : ``ndarray`` of actual values.
    - ``quantile_cols`` : dict mapping ``'q0.1'``, ``'q0.5'``,
      ``'q0.9'`` to lists of column names across periods.
    - ``q10_cols``, ``q50_cols``, ``q90_cols`` : convenience lists.
    - ``n_periods`` : number of generated periods.
    - ``prefix`` : the column name prefix.
    - ``DESCR`` : human-readable description.

    If ``as_frame=True``, only the pandas ``DataFrame`` is
    returned.

Raises
------
TypeError
    If numeric inputs cannot be processed.

Notes
-----
**Column naming.** Quantile columns encode the year :math:`y`
and quantile level :math:`q`:

.. math::

   \text{quantile name}
   \;\equiv\;
   \texttt{<prefix>}\_{y}\_\texttt{q}q,
   \qquad
   y \in \{\texttt{start\_year},\dots\},
   \;\; q \in \{0.1,0.5,0.9\}.

The first-period actual is stored once as
``f"{prefix}_actual"``.

**Signal and interval model.** Let period index be
:math:`t \in \{0,\dots,n\_\text{periods}-1\}` and row index
:math:`i`. Define latent base signal :math:`s_i` and Q50:

.. math::

   s_i \;=\; \texttt{base\_value}
          \;+\; \varepsilon_i,
   \qquad
   \varepsilon_i \sim \mathcal{N}(0, \sigma^2),\;
   \sigma=\texttt{noise\_level}/2,

.. math::

   Q50_{i,t} \;=\; s_i \;+\; t\cdot\texttt{trend\_strength}
                   \;+\; \eta_{i,t},
   \quad
   \eta_{i,t} \sim \mathcal{N}\!\big(0,
   (\texttt{noise\_level}/3)^2\big).

Interval width :math:`w_{i,t}` has baseline, trend, and jitter:

.. math::

   w_{i,t}
   \;=\;
   \max\!\Bigl(
     0.1,\,
     \texttt{interval\_width\_base}
     + t\cdot\texttt{interval\_width\_trend}
     + u_{i,t}
   \Bigr),
   \quad
   u_{i,t} \sim \mathcal{U}\!\Bigl(-\tfrac{
   \texttt{interval\_width\_noise}}{2},\,
   \tfrac{\texttt{interval\_width\_noise}}{2}\Bigr),

and

.. math::

   Q10_{i,t} \;=\; Q50_{i,t} - \tfrac{1}{2}w_{i,t},\qquad
   Q90_{i,t} \;=\; Q50_{i,t} + \tfrac{1}{2}w_{i,t}.

**Anomaly injection (first period).** For a fraction
``anomaly_frac`` of rows we enforce a coverage failure:

.. math::

   y^{\text{actual}}_{i}
   \notin
   [\,Q10_{i,0},\,Q90_{i,0}\,],

splitting under/over cases approximately evenly to aid tests of
coverage diagnostics and anomaly magnitude plots. Use this data to
study calibration vs. sharpness trade-offs
:footcite:p:`Gneiting2007b` and operational verification practice
:footcite:p:`Jolliffe2012`.

See Also
--------
kdiagram.plot.uncertainty.plot_coverage
    Aggregate empirical coverage vs. nominal levels.

kdiagram.plot.uncertainty.plot_coverage_diagnostic
    Point-wise success/failure on a polar layout.

kdiagram.plot.uncertainty.plot_interval_consistency
    Temporal stability of interval widths per location.

kdiagram.plot.uncertainty.plot_model_drift
    Lead-time trend of mean interval width.

kdiagram.plot.uncertainty.plot_anomaly_magnitude
    Where and how severely intervals fail.

Examples
--------
>>> # Return a Bunch and inspect quantile columns:
>>> 
>>> from kdiagram.datasets import make_uncertainty_data
>>> ds = make_uncertainty_data(n_samples=12, n_periods=3, seed=7)
>>> sorted(ds.quantile_cols.keys())
['q0.1', 'q0.5', 'q0.9']
>>> 
>>> # Return only a DataFrame and check column order:
>>> 
>>> df = make_uncertainty_data(as_frame=True, n_samples=5, seed=0)
>>> # Features + actual first, then Q10/Q50/Q90 columns:
>>> df.columns[:6].tolist()  # doctest: +NORMALIZE_WHITESPACE
['location_id', 'longitude', 'latitude', 'elevation',
 'value_actual', 'value_2022_q0.1']

References
----------
.. footbibliography::
"""



[docs]
def make_taylor_data(
    n_samples: int = 100,
    n_models: int = 3,
    ref_std: float = 1.0,
    corr_range: tuple[float, float] = (0.5, 0.99),
    std_range: tuple[float, float] = (0.7, 1.3),
    noise_level: float = 0.3,
    bias_level: float = 0.1,
    seed: int | None = 101,
    as_frame: bool = False,
) -> Bunch | pd.DataFrame:
    # Implementation notes only: the user-facing docstring is attached to
    # ``make_taylor_data.__doc__`` right after this definition.

    # ``default_rng(None)`` seeds itself from fresh entropy, so a single
    # call covers both the seeded and the unseeded case.
    rng = np.random.default_rng(seed)

    # --- Sanitize the correlation range (warn and repair, never raise) ---
    corr_lo, corr_hi = corr_range[0], corr_range[1]
    if not (0 <= corr_lo <= corr_hi <= 1.0):
        warnings.warn(
            "corr_range limits should ideally be between 0 and 1 for "
            "standard Taylor Diagrams. Adjusting...",
            stacklevel=2,
        )
        corr_lo, corr_hi = max(0, corr_lo), min(1.0, corr_hi)
        # Clamping can still leave an inverted range; use the defaults then.
        if corr_lo > corr_hi:
            corr_lo, corr_hi = 0.5, 0.99

    # --- Sanitize the spread-factor range ---
    std_lo, std_hi = std_range[0], std_range[1]
    if not (0 <= std_lo <= std_hi):
        warnings.warn(
            "std_range factors should be non-negative and min <= max."
            " Using defaults.",
            stacklevel=2,
        )
        std_lo, std_hi = 0.7, 1.3

    # Without noise only a perfect correlation can be produced.
    if noise_level <= 1e-9 and corr_hi < 1.0 - 1e-9:
        raise ValueError(
            "noise_level cannot be zero if target correlation < 1 is possible."
        )

    # --- Reference series: zero mean, rescaled to the requested spread ---
    raw_reference = rng.normal(0, ref_std, n_samples)
    reference = raw_reference - np.mean(raw_reference)
    sample_std = np.std(reference)
    if sample_std > 1e-9:  # skip rescaling for a (near-)constant series
        reference = reference * (ref_std / sample_std)
    actual_ref_std = np.std(reference)

    # --- Model series: p = a * reference + b * noise + bias ---
    model_names = [f"Model_{chr(65 + k)}" for k in range(n_models)]
    predictions = []
    realized_stds = []
    realized_corrs = []

    for label in model_names:
        # Draw this model's target correlation and spread factor.
        rho = rng.uniform(corr_lo, corr_hi)
        spread_factor = rng.uniform(std_lo, std_hi)
        target_std = spread_factor * actual_ref_std

        # ``a`` fixes the part explained by the reference; the remaining
        # variance has to be supplied by the independent noise term.
        a = rho * spread_factor
        residual_var = target_std**2 - (a * actual_ref_std) ** 2

        if residual_var < -1e-9:
            warnings.warn(
                f"Model {label}: Cannot achieve target std "
                f"({target_std:.2f}) with target correlation "
                f"({rho:.2f}) and noise "
                f"({noise_level:.2f}). Setting b=0.",
                UserWarning,
                stacklevel=2,
            )
            b = 0
        else:
            if noise_level <= 1e-9 and residual_var > 1e-9:
                raise ValueError(
                    "noise_level cannot be zero if needed to reach target std"
                )
            # Floor the divisor so a tiny noise_level cannot divide by zero.
            b = np.sqrt(max(0, residual_var)) / max(noise_level, 1e-9)

        noise = rng.normal(0, noise_level, n_samples)
        bias = rng.uniform(-bias_level, bias_level)

        series = a * reference + b * noise + bias
        predictions.append(series)

        # Record the statistics actually realized by this sample.
        realized_stds.append(np.std(series))
        rho_hat = np.corrcoef(series, reference)[0, 1]
        realized_corrs.append(np.clip(rho_hat, -1.0, 1.0))

    # --- Tabular view shared by both return flavours ---
    df = pd.DataFrame(
        {"reference": reference, **dict(zip(model_names, predictions))}
    )

    if as_frame:
        return df

    stats_df = pd.DataFrame(
        {"stddev": realized_stds, "corrcoef": realized_corrs},
        index=model_names,
    )

    descr = textwrap.dedent(
        f"""\
        Synthetic Taylor Diagram Data

        **Generated Parameters:**
        - n_samples    : {n_samples}
        - n_models     : {n_models}
        - ref_std      : {ref_std:.2f} (target), {actual_ref_std:.2f} (actual)
        - corr_range   : ({corr_lo:.2f}, {corr_hi:.2f}) (target)
        - std_range    : ({std_lo:.2f}, {std_hi:.2f}) (target factor)
        - noise_level  : {noise_level:.2f}
        - bias_level   : {bias_level:.2f}
        - seed         : {seed}

        **Contents (Bunch object):**
        - frame        : DataFrame with reference and prediction columns.
        - reference    : NumPy array (n_samples,) - Reference data.
        - predictions  : List of {n_models} NumPy arrays (n_samples,) - Model data.
        - model_names  : List of {n_models} strings - Model labels.
        - stats        : DataFrame with actual calculated 'stddev' and
                         'corrcoef' for each model vs reference.
        - ref_std      : Actual standard deviation of the reference data.
        - DESCR        : This description.
        """
    )

    return Bunch(
        frame=df,
        reference=reference,
        predictions=predictions,
        model_names=model_names,
        stats=stats_df,
        ref_std=actual_ref_std,
        DESCR=descr,
    )



make_taylor_data.__doc__ = r"""
Generate synthetic data for Taylor diagrams.

Taylor diagrams, introduced by :footcite:t:`Taylor2001`, summarize
correlation, standard deviation, and centered RMS difference between
model outputs and a reference. This routine creates one reference
series and several model-like series with controllable correlation
and spread, suitable for exercising plotting functions such as
:func:`~kdiagram.plot.evaluation.taylor_diagram`. Practical guidance
on verification appears in :footcite:p:`Jolliffe2012`.

Parameters
----------
n_samples : int, default=100
    Number of observations in each generated series.

n_models : int, default=3
    Number of model (prediction) series to simulate.

ref_std : float, default=1.0
    Target standard deviation for the reference series
    (mean is centered to 0).

corr_range : tuple of (float, float), default=(0.5, 0.99)
    Closed interval from which target correlations :math:`\rho`
    for models are sampled uniformly. Values should be in
    :math:`[0,1]` for standard Taylor use.

std_range : tuple of (float, float), default=(0.7, 1.3)
    Closed interval for multiplicative factors applied to the
    reference standard deviation to obtain each model’s target
    spread.

noise_level : float, default=0.3
    Standard deviation of the independent noise used to reach
    the requested spread and correlation. Must be positive if
    any target correlation is less than 1.

bias_level : float, default=0.1
    Maximum absolute bias added to each model series (uniform
    in ``[-bias_level, bias_level]``). Note that Taylor diagrams
    are insensitive to overall bias.

seed : int or None, default=101
    NumPy random seed. If ``None``, a fresh RNG is used.

as_frame : bool, default=False
    If ``False``, return a :class:`~kdiagram.bunch.Bunch` with
    arrays, names, and summary stats. If ``True``, return only
    a pandas ``DataFrame`` with columns for the reference and
    each model series.

Returns
-------
data : :class:`~kdiagram.bunch.Bunch` or pandas.DataFrame
    If ``as_frame=False`` (default), a Bunch with:

    - ``frame`` : pandas ``DataFrame`` with ``'reference'`` and
      model columns.
    - ``reference`` : ``ndarray`` of shape ``(n_samples,)``.
    - ``predictions`` : list of ``ndarray`` predictions.
    - ``model_names`` : list of model labels.
    - ``stats`` : pandas ``DataFrame`` with columns
      ``'stddev'`` and ``'corrcoef'`` vs the reference.
    - ``ref_std`` : actual standard deviation of the reference.
    - ``DESCR`` : human-readable description.

    If ``as_frame=True``, only the pandas ``DataFrame`` is
    returned.

Raises
------
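ValueError
    If ``noise_level`` is non-positive (below ``1e-9``) while a
    target correlation below 1 is possible. Invalid ``corr_range``
    or ``std_range`` values do not raise: a warning is issued and
    the range is clamped or reset to its default.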

Notes
-----
**Construction.** Let the reference be :math:`r` with
:math:`\mathrm{E}[r]=0` and :math:`\mathrm{sd}(r)=\sigma_r`
(we target :math:`\sigma_r=\texttt{ref\_std}`). For model
:math:`k`, we synthesize

.. math::

   p^{(k)} \;=\; a^{(k)} r \;+\; b^{(k)} \epsilon^{(k)} \;+\; \text{bias}^{(k)},

with :math:`\epsilon^{(k)} \sim \mathcal{N}(0,\sigma_\epsilon^2)`
independent of :math:`r`, where
:math:`\sigma_\epsilon=\texttt{noise\_level}`. Ignoring bias
(centered statistics), the model spread and correlation satisfy

.. math::

   \sigma_{p}^{(k)} \;=\; \sqrt{(a^{(k)} \sigma_r)^2 + (b^{(k)} \sigma_\epsilon)^2},
   \qquad
   \rho^{(k)} \;=\; \frac{a^{(k)} \sigma_r}{\sigma_{p}^{(k)}}.

We sample a target
:math:`\rho^{(k)} \in \texttt{corr\_range}` and a target spread
factor :math:`\alpha^{(k)} \in \texttt{std\_range}`, set
:math:`\sigma_p^{(k)} = \alpha^{(k)} \sigma_r`, choose

.. math::

   a^{(k)} \;=\; \rho^{(k)} \alpha^{(k)}, \qquad
   b^{(k)} \;=\; \frac{\sqrt{\left(\sigma_p^{(k)}\right)^2 -
                           \left(a^{(k)} \sigma_r\right)^2}}
                        {\sigma_\epsilon},

and draw a small constant :math:`\text{bias}^{(k)} \in
[-\texttt{bias\_level},\texttt{bias\_level}]`. Centered Taylor
statistics are unaffected by bias. See :footcite:t:`Taylor2001`
for interpretation; broader verification context is covered in
:footcite:p:`Jolliffe2012`.

See Also
--------
kdiagram.plot.evaluation.taylor_diagram
    Flexible Taylor diagram from raw arrays or pre-computed stats.

kdiagram.plot.evaluation.plot_taylor_diagram
    Standard Taylor diagram from raw arrays.

kdiagram.plot.evaluation.plot_taylor_diagram_in
    Taylor diagram with background shading.

Examples
--------
>>> # Get arrays and stats as a Bunch:
>>> 
>>> from kdiagram.datasets import make_taylor_data
>>> ds = make_taylor_data(n_models=2, seed=0)
>>> list(ds.frame.columns)
['reference', 'Model_A', 'Model_B']
>>> set(ds.stats.columns) == {'stddev', 'corrcoef'}
True
>>> 
>>> # Return only a DataFrame:
>>> 
>>> df = make_taylor_data(as_frame=True, seed=1)
>>> 'reference' in df.columns
True

References
----------
.. footbibliography::
"""



[docs]
def make_multi_model_quantile_data(
    n_samples: int = 100,
    n_models: int = 3,
    quantiles: list[float] | None = None,
    prefix: str = "pred",
    model_names: list[str] | None = None,
    true_mean: float = 50.0,
    true_std: float = 10.0,
    bias_range: tuple[float, float] = (-2.0, 2.0),
    width_range: tuple[float, float] = (5.0, 15.0),
    noise_level: float = 1.0,
    seed: int | None = 202,
    as_frame: bool = False,
) -> Bunch | pd.DataFrame:
    # Implementation notes only: the user-facing docstring is attached to
    # ``make_multi_model_quantile_data.__doc__`` after this definition.

    # --- Input Validation ---
    if quantiles is None:
        quantiles = [0.1, 0.5, 0.9]
    if 0.5 not in quantiles:
        # Every other quantile is generated as an offset from the median.
        raise ValueError("The `quantiles` list must contain 0.5 (median).")

    # ``default_rng(None)`` seeds itself from fresh entropy, so one call
    # handles both the seeded and the unseeded case.
    rng = np.random.default_rng(seed)

    _validate_range_or_list(
        width_range, "width_range", require_nonneg_min=True
    )
    _validate_range_or_list(
        bias_range, "bias_range", require_nonneg_min=False
    )

    # Expand once to per-model entries; the loop below indexes each entry
    # as a ``(low, high)`` pair.
    bias_ranges = _expand_param(bias_range, n_models, "bias_range")
    width_ranges = _expand_param(width_range, n_models, "width_range")

    # --- Setup ---
    # De-duplicate and order the levels; with a single level min == max.
    quantiles_sorted = sorted(set(quantiles))
    q_min, q_max = quantiles_sorted[0], quantiles_sorted[-1]
    q_median = 0.5
    q_range = q_max - q_min

    # Ratio of the requested level span to the Q10-Q90 reference span.
    # Falls back to 1.0 when there is effectively no span to scale by.
    if len(quantiles_sorted) > 1 and abs(q_range) > 1e-9:
        width_scale_factor = q_range / (0.9 - 0.1)
    else:
        width_scale_factor = 1.0

    # Offsets are only meaningful when both divisors are safely non-zero.
    can_offset = abs(q_range) > 1e-9 and abs(width_scale_factor) > 1e-9

    # --- Data Generation ---
    y_true = rng.normal(true_mean, true_std, n_samples)
    feature_1 = rng.uniform(0, 1, n_samples)
    feature_2 = rng.normal(5, 2, n_samples)

    feature_names = ["feature_1", "feature_2"]
    target_name = ["y_true"]
    data_dict = {
        "y_true": y_true,
        "feature_1": feature_1,
        "feature_2": feature_2,
    }

    # Resolve model labels (auto-generated as Model_A, Model_B, ...).
    if model_names is None:
        model_names_list = [f"Model_{chr(65 + i)}" for i in range(n_models)]
    elif len(model_names) != n_models:
        raise ValueError(
            f"Length of model_names ({len(model_names)}) must "
            f"match n_models ({n_models})."
        )
    else:
        model_names_list = list(model_names)

    prediction_columns_dict = {name: [] for name in model_names_list}

    # --- Generate predictions for each model ---
    for idx, model_name in enumerate(model_names_list):
        bias_bounds = bias_ranges[idx]
        width_bounds = width_ranges[idx]

        model_bias = rng.uniform(bias_bounds[0], bias_bounds[1])
        model_width = rng.uniform(width_bounds[0], width_bounds[1])

        # The median is drawn first so the random stream (and therefore
        # the generated values for a given seed) stays unchanged.
        median_pred = (
            y_true + model_bias + rng.normal(0, noise_level, n_samples)
        )
        draws = {q_median: median_pred}

        for q in quantiles_sorted:
            if q == q_median:
                continue

            # NOTE(review): with the default levels (0.1, 0.5, 0.9) this
            # puts Q10/Q90 at -/+ model_width around the median, i.e. a
            # Q90-Q10 span of about 2 * model_width, whereas the public
            # docstring describes ``width_range`` as the overall width.
            # Kept as-is to preserve generated values - confirm intent.
            if can_offset:
                quantile_offset = (
                    (model_width / width_scale_factor)
                    * (q - q_median)
                    / q_range
                    * 2
                )
            else:
                quantile_offset = 0

            # Bounds get half the noise of the median.
            draws[q] = (
                median_pred
                + quantile_offset
                + rng.normal(0, noise_level / 2, n_samples)
            )

        # Sort each row so the quantile columns are monotone in the level
        # even when the independent noise made neighbouring draws cross.
        stacked = np.column_stack([draws[q] for q in quantiles_sorted])
        sorted_data = np.sort(stacked, axis=1)

        # Name columns with the ``g`` format: it drops trailing zeros
        # (0.1 -> 'q0.1', 0.5 -> 'q0.5') without rounding levels such as
        # 0.025 or 0.975 to two decimals, which mislabelled them and could
        # make two distinct levels collide on one column name.
        model_columns = []
        for k, q in enumerate(quantiles_sorted):
            col_name = f"{prefix}_{model_name}_q{q:g}"
            data_dict[col_name] = sorted_data[:, k]
            model_columns.append(col_name)
        # Columns are listed in ascending quantile order, matching
        # ``quantile_levels``.
        prediction_columns_dict[model_name] = model_columns

    # Order columns: target, auxiliary features, then predictions sorted
    # by name. Prediction columns come from the tracked names rather than
    # a ``startswith(prefix)`` scan, so a prefix such as "" or "feature"
    # cannot pull the target/feature columns in a second time.
    pred_cols_sorted = sorted(
        col for cols in prediction_columns_dict.values() for col in cols
    )
    ordered_cols = target_name + feature_names + pred_cols_sorted
    df = pd.DataFrame(data_dict)[ordered_cols]

    # --- Return based on as_frame ---
    if as_frame:
        return df

    data_numeric_cols = feature_names + pred_cols_sorted
    data_array = df[data_numeric_cols].values
    target_array = df[target_name[0]].values

    descr = textwrap.dedent(
        f"""\
        Synthetic Multi-Model Quantile Dataset for k-diagram

        **Generated Parameters:**
        - n_samples    : {n_samples}
        - n_models     : {n_models}
        - quantiles    : {quantiles_sorted}
        - prefix       : {prefix}
        - true_mean    : {true_mean:.2f}
        - true_std     : {true_std:.2f}
        - bias_range   : {bias_range}
        - width_range  : {width_range}
        - noise_level  : {noise_level:.2f}
        - seed         : {seed}

        **Data Structure (Bunch object):**
        - frame           : Complete pandas DataFrame.
        - data            : NumPy array of numeric feature & prediction columns.
        - feature_names   : List of auxiliary feature column names.
        - target_names    : List containing the target column name ('y_true').
        - target          : NumPy array of 'y_true' values.
        - model_names     : List of simulated model names.
        - quantile_levels : Sorted list of quantile levels generated.
        - prediction_columns : Dict mapping model names to their column names.
        - prefix          : Prefix used for prediction columns.
        - DESCR           : This description.

        This dataset simulates quantile predictions from {n_models} models
        for a single time point, allowing comparison of their
        uncertainty characteristics.
        """
    )

    return Bunch(
        frame=df,
        data=data_array,
        feature_names=feature_names,
        target_names=target_name,
        target=target_array,
        model_names=model_names_list,
        quantile_levels=quantiles_sorted,
        prediction_columns=prediction_columns_dict,
        prefix=prefix,
        DESCR=descr,
    )



make_multi_model_quantile_data.__doc__ = r"""
Generate multi-model quantile forecast data for a single horizon.

Simulates a target variable :math:`y_{\text{true}}` and
quantile predictions (e.g., Q10/Q50/Q90) from several models
for the **same** forecast time. Each model can have its own
systematic bias and characteristic interval width, enabling
reproducible examples for coverage/calibration and cross-model
comparisons :footcite:p:`Gneiting2007b, Jolliffe2012`.

Parameters
----------
n_samples : int, default=100
    Number of rows (independent samples/locations).

n_models : int, default=3
    Number of simulated models providing quantile forecasts.

quantiles : list of float, default=[0.1, 0.5, 0.9]
    Quantile levels in ``(0, 1)`` to generate for **each** model.
    Must include ``0.5`` (the median). The list is de-duplicated
    and sorted internally.

prefix : str, default='pred'
    Base prefix for prediction columns. Final names follow
    ``{prefix}_{model_name}_q{quantile}``.

model_names : list of str, optional
    Custom model names of length ``n_models``. If ``None``,
    ``'Model_A'``, ``'Model_B'``, … are generated.

true_mean : float, default=50.0
    Mean of the Normal distribution used to draw ``y_true``.

true_std : float, default=10.0
    Standard deviation of the Normal distribution for ``y_true``.

bias_range : tuple of (float, float), default=(-2.0, 2.0)
    Uniform range from which a model-specific bias for Q50 is
    sampled and added to ``y_true``.

width_range : tuple of (float, float), default=(5.0, 15.0)
    Uniform range for the target **overall** interval width
    (e.g., Q90–Q10) of each model.

noise_level : float, default=1.0
    Standard deviation of independent Gaussian noise added to
    each generated quantile series.

seed : int or None, default=202
    NumPy RNG seed (``default_rng``). If ``None``, a fresh RNG is used.

as_frame : bool, default=False
    If ``False``, return a :class:`~kdiagram.bunch.Bunch` with
    arrays/metadata; if ``True``, return only the pandas ``DataFrame``.

Returns
-------
data : :class:`~kdiagram.bunch.Bunch` or pandas.DataFrame
    If ``as_frame=False`` (default), a Bunch with:

    - ``frame`` : pandas ``DataFrame`` of shape
      ``(n_samples, 3 + n_models * n_quantiles)`` containing
      ``'y_true'``, two auxiliary features, and all quantile columns.
    - ``data`` : ``ndarray`` with numeric feature + prediction columns.
    - ``feature_names`` : ``['feature_1', 'feature_2']``.
    - ``target_names`` : ``['y_true']``.
    - ``target`` : ``ndarray`` of ``y_true`` values.
    - ``model_names`` : list of model labels.
    - ``quantile_levels`` : sorted list of unique quantiles.
    - ``prediction_columns`` : dict mapping each model name to its
      list of quantile column names.
    - ``prefix`` : the column prefix.
    - ``DESCR`` : human-readable description.

    If ``as_frame=True``, only the pandas ``DataFrame`` is returned.

Raises
------
ValueError
    If ``0.5`` is not in ``quantiles``, if name/range lengths are
    inconsistent, or if ranges are invalid.

TypeError
    If non-numeric inputs prevent computation.

Notes
-----
**Generation model.** Draw the truth as
:math:`y_{\text{true}} \sim \mathcal{N}(\mu, \sigma^2)` with
``mu=true_mean`` and ``sigma=true_std``. For model :math:`m`, let
:math:`b^{(m)}` be a sampled bias
and :math:`W^{(m)}` a sampled overall width (e.g., Q90–Q10). The
median prediction (Q50) is

.. math::

   q_{0.5}^{(m)} \;=\; y_{\text{true}} \;+\; b^{(m)} \;+\;
   \varepsilon^{(m)}, \qquad
   \varepsilon^{(m)} \sim \mathcal{N}(0, \sigma_\varepsilon^2),

with ``sigma_ε = noise_level``. Other quantiles are created by
adding offsets proportional to their distance from the median and
scaled so that the extreme quantiles span approximately
:math:`W^{(m)}`; small independent noise is then added. Finally, for
each row we sort the model’s quantile values to enforce
:math:`q_{\alpha} \le q_{0.5} \le q_{\beta}` (e.g., Q10 ≤ Q50 ≤ Q90),
which is useful for coverage and calibration diagnostics
:footcite:p:`Gneiting2007b, Jolliffe2012`.

Two auxiliary columns (``feature_1``, ``feature_2``) are included
for convenience in examples; they do not influence the simulated
target or quantiles.

See Also
--------
make_uncertainty_data
    Temporal multi-period quantiles with drift/consistency controls.
make_taylor_data
    Synthetic data tailored for Taylor diagram evaluation.
kdiagram.plot.uncertainty.plot_coverage
    Aggregate empirical coverage vs nominal.
kdiagram.plot.uncertainty.plot_temporal_uncertainty
    General polar visualization for multiple series.

Examples
--------
>>> # As a Bunch with metadata:
>>> 
>>> from kdiagram.datasets import make_multi_model_quantile_data
>>> ds = make_multi_model_quantile_data(n_samples=50, n_models=2, seed=1)
>>> ds.model_names
['Model_A', 'Model_B']
>>> sorted(ds.quantile_levels)
[0.1, 0.5, 0.9]
>>> ds.prediction_columns['Model_A'][:3]  # doctest: +ELLIPSIS
['pred_Model_A_q0.1', 'pred_Model_A_q0.5', 'pred_Model_A_q0.9']
>>> 
>>> # As a DataFrame:
>>> 
>>> df = make_multi_model_quantile_data(as_frame=True, seed=2)
>>> set(['y_true','feature_1','feature_2']).issubset(df.columns)
True

References
----------
.. footbibliography::
"""



[docs]
def make_regression_data(
    n_samples: int = 200,
    n_features: int = 1,
    feature_range: tuple[float, float] = (0.0, 10.0),
    n_models: int = 3,
    model_profiles: dict[str, dict[str, Any]] | None = None,
    true_func: Callable[[np.ndarray], np.ndarray] | None = None,
    true_kind: str = "linear",  # 'linear'|'quadratic'|'sine'
    true_coeff_range: tuple[float, float] = (-5.0, 5.0),
    intercept: float = 5.0,
    noise_on_true: float | Callable[[np.ndarray], np.ndarray] = 1.0,
    heteroskedastic: bool = False,
    hetero_strength: float = 0.5,
    prefix: str = "pred_",
    seed: int | None = 0,
    as_frame: bool = False,
    clip_negative: bool = False,
    shuffle: bool = True,
    model_names: list[str] | None = None,
    feature_names: list[str] | None = None,
) -> Bunch | pd.DataFrame:
    # NOTE: the public docstring is attached right after this definition
    # through ``make_regression_data.__doc__``.
    #
    # Pipeline: uniform features -> true signal -> noisy ``y_true`` ->
    # one prediction column per model profile -> DataFrame (optionally
    # shuffled) -> DataFrame or Bunch.
    #
    # All random draws come from a single generator and are made in a
    # fixed order, so the output for a given ``seed`` only stays stable as
    # long as that order is not changed.

    # ---------- RNG ----------
    rng = np.random.default_rng(seed)

    # ---------- features ----------
    lo, hi = float(feature_range[0]), float(feature_range[1])
    if hi <= lo:
        raise ValueError("feature_range must satisfy hi > lo.")

    X = rng.uniform(lo, hi, size=(n_samples, n_features))

    if not feature_names:
        feature_names = [f"feature_{i + 1}" for i in range(n_features)]
    elif len(feature_names) != n_features:
        raise ValueError("len(feature_names) must equal n_features.")

    # ---------- true signal ----------
    # allow user supplied function(X) -> shape (n_samples,)
    if true_func is not None:
        y_signal = np.asarray(true_func(X))
        if y_signal.shape != (n_samples,):
            raise ValueError("true_func(X) must return shape (n_samples,).")
    else:
        # built-in shapes controlled by true_kind
        if true_kind not in {"linear", "quadratic", "sine"}:
            raise ValueError("true_kind must be linear|quadratic|sine")

        # Both coefficient vectors are always drawn (even when a shape
        # uses only one) so the random stream does not depend on
        # ``true_kind``.
        a = rng.uniform(true_coeff_range[0], true_coeff_range[1], n_features)
        b = rng.uniform(true_coeff_range[0], true_coeff_range[1], n_features)

        if true_kind == "linear":
            y_signal = X @ a + intercept
        elif true_kind == "quadratic":
            # sum_i (a_i * x_i^2 + b_i * x_i) + intercept
            y_signal = (
                (a * (X**2)).sum(axis=1) + (b * X).sum(axis=1) + intercept
            )
        else:  # "sine"
            # sine on the first feature; add small linear mix if >1 feat
            wave = np.sin(X[:, 0] / max(1.0, (hi - lo) / np.pi))
            if n_features > 1:
                mix = (b * X).sum(axis=1) / max(1.0, n_features)
            else:
                mix = 0.0
            y_signal = 10.0 * wave + mix + intercept

    # ---------- irreducible noise on truth ----------
    if callable(noise_on_true):
        noise = np.asarray(noise_on_true(X))
        if noise.shape != (n_samples,):
            raise ValueError(
                "noise_on_true(X) must return shape (n_samples,)."
            )
    else:
        scale = float(noise_on_true)
        if scale < 0:
            raise ValueError("noise_on_true must be >= 0.")
        # optional heteroskedasticity w.r.t. first feature
        if heteroskedastic:
            f1 = X[:, 0] if n_features > 0 else np.zeros(n_samples)
            # position of the first feature inside feature_range, in [0, 1]
            f1n = (f1 - lo) / max(1e-9, (hi - lo))
            # multiplier spans 1 - strength .. 1 + strength across the range
            mult = 1.0 + hetero_strength * (f1n - 0.5) * 2.0
            noise = rng.normal(0.0, scale * np.clip(mult, 0.1, 5.0))
        else:
            noise = rng.normal(0.0, scale, n_samples)

    y_true = y_signal + noise

    # clip negatives if requested
    if clip_negative:
        y_true = np.clip(y_true, 0.0, None)

    # ---------- default model profiles ----------
    # fields: bias (float), noise_std (float),
    #         error_type: "additive"|"multiplicative"|"hetero"
    if model_profiles is None:
        defaults = [
            (
                "Good Model",
                {
                    "bias": 0.0,
                    "noise_std": 5.0,
                    "error_type": "additive",
                },
            ),
            (
                "Biased Model",
                {
                    "bias": -10.0,
                    "noise_std": 2.0,
                    "error_type": "additive",
                },
            ),
            (
                "High Variance",
                {
                    "bias": 0.0,
                    "noise_std": 15.0,
                    "error_type": "additive",
                },
            ),
        ]
        model_profiles = dict(defaults[:n_models])
        # pad with reasonable defaults for models beyond the three above
        for i in range(3, n_models):
            model_profiles[f"Model_{i + 1}"] = {
                "bias": 0.0,
                "noise_std": 10.0,
                "error_type": "additive",
            }

    # preserve insertion order for deterministic behavior
    base_names = list(model_profiles.keys())
    profiles_list = [model_profiles[k] for k in base_names]

    # resolve display vs column names
    display_names, column_names = _resolve_model_labels(
        base_names=base_names,
        user_names=model_names,
        prefix=prefix,
    )

    # ---------- predictions per model ----------
    data_dict: dict[str, Any] = {}
    for i, fn in enumerate(feature_names):
        data_dict[fn] = X[:, i]
    data_dict["y_true"] = y_true

    pred_cols: list[str] = []
    for i, prof in enumerate(profiles_list):
        col = column_names[i]  # DataFrame column label

        bias = float(prof.get("bias", 0.0))
        noise_std = float(prof.get("noise_std", 5.0))
        error_type = str(prof.get("error_type", "additive"))

        if error_type not in ("additive", "multiplicative", "hetero"):
            raise ValueError(
                "unknown error_type "
                f"'{error_type}' for model '{base_names[i]}'"
            )

        if error_type == "additive":
            err = bias + rng.normal(0.0, noise_std, n_samples)
            y_pred = y_true + err
        elif error_type == "multiplicative":
            # here ``bias`` shifts the mean of the multiplicative factor
            mul = 1.0 + rng.normal(bias, noise_std, n_samples)
            y_pred = y_true * mul
        else:  # "hetero"
            f1 = X[:, 0]
            # np.ptp() is used instead of the ndarray.ptp() method, which
            # no longer exists in NumPy 2.0.
            het_scale = 1.0 + hetero_strength * (f1 - f1.min()) / (
                max(np.ptp(f1), 1e-9)
            )
            err = bias + rng.normal(0.0, noise_std * het_scale, n_samples)
            y_pred = y_true + err

        if clip_negative:
            y_pred = np.clip(y_pred, 0.0, None)

        data_dict[col] = y_pred
        pred_cols.append(col)

    # ---------- dataframe ----------
    df = pd.DataFrame(data_dict)

    # place columns in a tidy order
    ordered = ["y_true"] + feature_names + pred_cols
    df = df[ordered]

    # shuffle rows if requested
    if shuffle:
        df = df.sample(frac=1.0, random_state=seed).reset_index(drop=True)

    # ---------- return ----------
    if as_frame:
        return df

    def _pv(seq, k=4):
        # preview of at most k items, with an ellipsis when truncated
        seq = [str(s) for s in seq]
        return ", ".join(seq[:k]) + (" …" if len(seq) > k else "")

    _noise_tag = (
        "callable" if callable(noise_on_true) else f"{float(noise_on_true):g}"
    )
    _truth_tag = "custom" if true_func is not None else true_kind

    # Report the models that were actually generated (display_names), so
    # the count/names in DESCR always agree with the Bunch contents.
    descr = textwrap.dedent(
        f"""
        Synthetic regression dataset.
        samples : {n_samples}
        feats   : {n_features}  range=({lo:g}, {hi:g})
        truth   : {_truth_tag}  intercept={intercept:g}
        noise   : {_noise_tag}
        hetero  : {heteroskedastic}  strength={hetero_strength:g}
        models  : {len(display_names)}  names=[{_pv(display_names)}]
        prefix  : {prefix}  clipped={clip_negative}
        shuffle : {shuffle}  seed={seed}
        """
    ).strip()

    return Bunch(
        frame=df,
        data=df[pred_cols].values,
        feature_names=feature_names,
        target_names=["y_true"],
        target=df["y_true"].values,
        model_names=display_names,
        prediction_columns=pred_cols,
        prefix=prefix,
        DESCR=descr,
    )



make_regression_data.__doc__ = r"""
Generate a synthetic regression dataset with a configurable
true process and multiple model prediction profiles.

This helper builds features, a noisy ground truth, and one
or more model predictions with user-controlled bias and
noise. It supports additive, multiplicative, and hetero-
skedastic error, custom true functions, and deterministic
column naming when ``model_names`` is provided.

Parameters
----------
n_samples : int, default=200
    Number of rows to generate.

n_features : int, default=1
    Number of feature columns.

feature_range : tuple of float, default=(0.0, 10.0)
    Closed interval for uniform feature sampling. Must
    satisfy ``hi > lo``.

n_models : int, default=3
    Number of built-in default model profiles to generate
    when ``model_profiles`` is ``None``. If ``model_profiles``
    is given, one prediction column is created for every
    entry (in insertion order), regardless of ``n_models``.

model_profiles : dict or None, default=None
    Per-model configuration. Keys are base model names and
    values are dicts with fields:
    ``bias`` (float), ``noise_std`` (float), and
    ``error_type`` in ``{"additive","multiplicative",
    "hetero"}``. If ``None``, built-in defaults are used.

true_func : callable or None, default=None
    Custom function with signature
    ``true_func(X: ndarray) -> ndarray shape (n_samples,)``.
    If ``None``, a built-in shape is chosen via
    ``true_kind``.

true_kind : {"linear","quadratic","sine"}, default="linear"
    Family of the built-in true process when ``true_func``
    is ``None``.

true_coeff_range : tuple of float, default=(-5.0, 5.0)
    Range used to draw coefficients for built-in shapes.

intercept : float, default=5.0
    Intercept term added to the true process.

noise_on_true : float or callable, default=1.0
    If float, standard deviation of additive Gaussian
    noise on the ground truth. If callable, it must accept
    ``X`` and return an array of shape ``(n_samples,)``.

heteroskedastic : bool, default=False
    If ``True`` and ``noise_on_true`` is a float, scales the
    ground-truth noise by a function of the first feature.

hetero_strength : float, default=0.5
    Strength parameter used for hetero scaling (both for
    ground-truth noise when ``heteroskedastic=True`` and for
    ``error_type="hetero"`` in model profiles).

prefix : str, default="pred\_"
    Prefix used for auto-named prediction columns when a
    user name is not supplied for a model.

seed : int or None, default=0
    Seed for the internal random generator. ``None`` uses
    non-deterministic entropy.

as_frame : bool, default=False
    If ``True``, return a ``pandas.DataFrame`` with tidy
    columns. Otherwise return a
    :class:`~kdiagram.api.bunch.Bunch`.

clip_negative : bool, default=False
    If ``True``, clip the ground truth and predictions at
    zero.

shuffle : bool, default=True
    If ``True``, row-shuffle the output with ``seed``.

model_names : list of str or None, default=None
    Explicit display names for the first ``k`` models, where
    ``k = len(model_names)``. When provided, the prediction
    columns for those models are named **exactly** as given,
    without ``prefix``. Remaining models (if any) use
    ``prefix`` followed by the base name with every run of
    non-word characters replaced by ``_`` (e.g.
    ``"pred_Good_Model"``). Extra names beyond the number of
    models are ignored with a warning.

feature_names : list of str or None, default=None
    Names for feature columns. Must have length equal to
    ``n_features``. If ``None``, uses ``["feature_1", ...]``.

Returns
-------
pandas.DataFrame or kdiagram.api.bunch.Bunch
    If ``as_frame=True``:
        
        A DataFrame with columns
        ``["y_true"] + feature_names + prediction_cols``.
        
    If ``as_frame=False``:
        A Bunch with fields:
            
            ``frame`` : the same DataFrame,
            ``data`` : ndarray of shape
            ``(n_samples, n_models)``, containing predictions
            ordered as in ``prediction_columns``,
            ``feature_names`` : list of str,
            ``target_names`` : ``["y_true"]``,
            ``target`` : ndarray of shape ``(n_samples,)``,
            ``model_names`` : list of display names,
            ``prediction_columns`` : list of column labels,
            ``prefix`` : str,
            ``DESCR`` : short description.

Raises
------
ValueError
    If ``feature_range`` is invalid, if shapes returned by
    ``true_func`` or a noise callable are not
    ``(n_samples,)``, if ``true_kind`` is unknown, if a
    ``model_profiles`` entry has an unknown ``error_type``,
    or if ``feature_names`` length mismatches ``n_features``.

Notes
-----
- Python dicts preserve insertion order. The order of
  models is taken from ``model_profiles`` keys, or from the
  built-in defaults when profiles are not supplied.

- When ``model_names`` is provided, those names are used as
  the **column labels** verbatim for the first ``k`` models.
  This allows clean, human-readable headers in a DataFrame
  and consistent legend labels downstream.

- For ``error_type="multiplicative"``, prediction noise is
  applied as a multiplicative factor around 1 [1]_. For
  ``"hetero"``, the model’s noise is scaled by a normalized
  transform of the first feature and ``hetero_strength`` [2]_.

- Reproducibility is controlled by ``seed``. Set it to an
  integer for deterministic output.

Examples
--------
Create two models with explicit names and return a frame.

>>> from kdiagram.datasets.make import make_regression_data
>>> profiles = {
...     "Good Model": {"bias": 0.0, "noise_std": 5.0,
...                    "error_type": "additive"},
...     "Biased Model": {"bias": -10.0, "noise_std": 2.0,
...                      "error_type": "additive"},
... }
>>> df = make_regression_data(
...     n_samples=200,
...     n_features=1,
...     n_models=2,
...     model_profiles=profiles,
...     model_names=["Good Model", "Biased Model"],
...     as_frame=True,
...     seed=42,
... )
>>> list(df.columns)[:3]
['y_true', 'feature_1', 'Good Model']

Use a custom true function and heteroskedastic noise.

>>> def ftrue(X):
...     return 3.0 * X[:, 0] + 2.0
>>> df = make_regression_data(
...     n_samples=100,
...     true_func=ftrue,
...     noise_on_true=1.5,
...     heteroskedastic=True,
...     as_frame=True,
... )

Return a Bunch for direct array access.

>>> b = make_regression_data(
...     n_samples=50,
...     n_models=3,
...     as_frame=False,
... )
>>> b.data.shape
(50, 3)

See Also
--------
sklearn.datasets.make_regression
    Classic linear regression toy dataset.
numpy.random.Generator
    Modern NumPy RNG used for reproducibility.

References
----------
.. [1] Hastie, Tibshirani, Friedman.
       The Elements of Statistical Learning.
       Springer, 2009.
.. [2] Hyndman, Athanasopoulos.
       Forecasting: Principles and Practice.
       OTexts, 3rd ed., 2021.
"""



[docs]
def make_classification_data(
    n_samples: int = 600,
    n_features: int = 10,
    n_classes: int = 2,
    weights: list[float] | None = None,
    class_sep: float = 1.0,
    flip_y: float = 0.0,
    informative_frac: float = 0.6,
    redundant_frac: float = 0.2,
    seed: int | None = 42,
    # models / output shape
    n_models: int = 2,
    model_profiles: dict[str, dict[str, Any]] | None = None,
    model_names: list[str] | None = None,
    true_col: str = "y",
    prefix_label: str = "pred_",
    prefix_proba: str = "proba_",
    add_compat_cols: bool = False,
    include_binary_pred_cols: bool = False,
    as_frame: bool = False,
) -> Bunch | pd.DataFrame:
    # NOTE: the public docstring is attached right after this definition
    # through ``make_classification_data.__doc__``.
    #
    # Pipeline: class priors -> labels -> informative / redundant / noise
    # features -> optional label flipping -> per-model logits and
    # probabilities -> DataFrame or Bunch.
    #
    # All random draws come from a single generator and are made in a
    # fixed order, so the output for a given ``seed`` only stays stable as
    # long as that order is not changed.
    rng = np.random.default_rng(seed)

    # -------- class priors --------
    if weights is None:
        weights = [1.0 / float(n_classes)] * n_classes
    w_sum = float(sum(weights))
    if w_sum <= 0:
        raise ValueError("weights must sum to > 0.")
    # normalize so the priors sum to 1
    weights = [float(x) / w_sum for x in weights]
    if len(weights) != n_classes:
        raise ValueError("len(weights) must equal n_classes.")

    # -------- features ----------
    # This draw is never used, but it is kept on purpose: dropping it
    # would shift every later draw and change the data produced for a
    # given seed.
    _ = rng.normal(0.0, 1.0, size=(n_samples, n_features))

    # split the feature budget into informative / redundant / noise
    n_inf = max(1, int(round(n_features * informative_frac)))
    n_inf = min(n_inf, n_features)
    n_red = max(0, int(round(n_features * redundant_frac)))
    n_red = min(n_red, max(0, n_features - n_inf))
    n_noise = n_features - n_inf - n_red

    # build informative subspace that separates classes
    # use class means spaced along a random unit direction
    dir_vec = rng.normal(0.0, 1.0, size=(n_inf,))
    dir_vec /= np.clip(np.linalg.norm(dir_vec), 1e-9, None)

    # assign class centers on a line, scaled by class_sep
    centers = np.linspace(-1.0, 1.0, n_classes) * class_sep
    Z_inf = rng.normal(0.0, 1.0, size=(n_samples, n_inf))

    # initial labels by priors
    y = rng.choice(np.arange(n_classes), size=n_samples, p=weights)

    # push informative dims towards class centers
    Z_inf = Z_inf + np.outer(centers[y], dir_vec)

    # redundant features = noisy linear combos of informative
    if n_red > 0:
        A = rng.normal(0.0, 0.5, size=(n_inf, n_red))
        Z_red = Z_inf @ A + rng.normal(0.0, 0.3, size=(n_samples, n_red))
    else:
        Z_red = np.zeros((n_samples, 0))

    # noise features (pure noise)
    if n_noise > 0:
        Z_noise = rng.normal(0.0, 1.0, size=(n_samples, n_noise))
    else:
        Z_noise = np.zeros((n_samples, 0))

    # assemble final design matrix (permute cols for realism)
    Z = np.concatenate([Z_inf, Z_red, Z_noise], axis=1)
    perm = rng.permutation(Z.shape[1])
    Z = Z[:, perm]

    # flip labels (label noise)
    if flip_y > 0.0:
        mask = rng.random(n_samples) < float(flip_y)
        if n_classes == 2:
            y[mask] = 1 - y[mask]
        else:
            # shift by 1..n_classes-1 (mod n_classes): always lands on a
            # different class than the current one
            alt = rng.integers(0, n_classes - 1, mask.sum())
            y[mask] = (y[mask] + 1 + alt) % n_classes

    # -------- model profiles ----------
    # fields:
    #  - logit_scale : float (larger => better separation)
    #  - noise_std   : float (logit noise)
    #  - bias        : float or list/ndarray per-class
    #  - temp        : float > 0 (temperature scaling)
    if model_profiles is None:
        model_profiles = {}
        scales = np.linspace(0.8, 1.6, n_models)
        stdevs = np.linspace(0.6, 0.2, n_models)
        temps = np.linspace(1.2, 0.8, n_models)
        for i in range(n_models):
            model_profiles[f"Model_{i + 1}"] = {
                "logit_scale": float(scales[i]),
                "noise_std": float(stdevs[i]),
                "bias": 0.0 if n_classes == 2 else [0.0] * n_classes,
                "temp": float(temps[i]),
            }

    if model_names is None:
        # convenient names for CLI tests (m1, m2, ...). One name per
        # profile, so user-supplied profiles work even when their count
        # differs from ``n_models``.
        model_names = [f"m{i + 1}" for i in range(len(model_profiles))]
    if len(model_names) != len(model_profiles):
        raise ValueError("len(model_names) must match model_profiles.")

    # base linear weights for logits
    if n_classes == 2:
        w = rng.normal(0.0, 1.0, size=(n_features,))
        w /= np.clip(np.linalg.norm(w), 1e-9, None)
        base_logit = Z @ w
    else:
        W = rng.normal(0.0, 1.0, size=(n_features, n_classes))
        # normalize columns
        W = W / np.clip(np.linalg.norm(W, axis=0, keepdims=True), 1e-9, None)
        base_logits = Z @ W

    # -------- build dataframe --------
    feature_cols = [f"x{i + 1}" for i in range(n_features)]
    df = pd.DataFrame(Z, columns=feature_cols)
    df[true_col] = y.astype(int)

    pred_label_cols: list[str] = []
    proba_cols: list[str] = []

    for name, prof in zip(model_names, model_profiles.values()):
        scale = float(prof.get("logit_scale", 1.0))
        nstd = float(prof.get("noise_std", 0.4))
        # guard against zero / negative temperatures
        temp = max(1e-6, float(prof.get("temp", 1.0)))
        bias = prof.get("bias", 0.0)

        if n_classes == 2:
            # z = scaled + noise + bias
            z = scale * base_logit + rng.normal(0.0, nstd, n_samples)
            z = z + float(bias)
            p1 = _sigmoid(z / temp)
            # probability column named like tests (m1, m2)
            df[name] = p1
            proba_cols.append(name)

            if include_binary_pred_cols:
                lbl = (p1 > 0.5).astype(int)
                df[f"{prefix_label}{name}"] = lbl
                pred_label_cols.append(f"{prefix_label}{name}")
        else:
            # per-class bias vector (a scalar bias is broadcast)
            B = (
                np.asarray(bias)
                if np.ndim(bias)
                else np.full((n_classes,), float(bias))
            )
            noise = rng.normal(0.0, nstd, size=(n_samples, n_classes))
            logits = scale * base_logits + noise + B
            probs = _softmax(logits / temp)
            # per-class probs
            for k in range(n_classes):
                col = f"{prefix_proba}{name}_{k}"
                df[col] = probs[:, k]
                proba_cols.append(col)
            # predicted labels
            lbl = probs.argmax(axis=1).astype(int)
            col_lbl = f"{prefix_label}{name}"
            df[col_lbl] = lbl
            pred_label_cols.append(col_lbl)

    # add yt/yp aliases for the first model if requested
    if add_compat_cols and n_classes > 2:
        if true_col != "yt":
            df["yt"] = df[true_col]
        first_pred = f"{prefix_label}{model_names[0]}"
        if first_pred in df.columns and "yp" not in df.columns:
            df["yp"] = df[first_pred]

    # -------- return -----------
    if as_frame:
        return df

    def _pv(seq, k=4):
        # preview of at most k items, with an ellipsis when truncated
        seq = [str(s) for s in seq]
        return ", ".join(seq[:k]) + (" …" if len(seq) > k else "")

    def _pw(ws, k=5):
        # preview of the (normalized) class priors
        if ws is None:
            return "auto"
        vals = [f"{float(w):.3f}" for w in ws]
        return ", ".join(vals[:k]) + (" …" if len(vals) > k else "")

    # ``model_names`` holds exactly one name per generated model here, so
    # it is what DESCR reports.
    descr = textwrap.dedent(
        f"""
        Synthetic classification dataset.
        samples : {n_samples}  feats={n_features}
        classes : {n_classes}  weights={_pw(weights)}
        sep     : {class_sep:g}  flip_y={flip_y:g}
        info/fr : {informative_frac:g}/{redundant_frac:g}
        models  : {len(model_names)}  names=[{_pv(model_names)}]
        labels  : {true_col}
        prefix  : lbl={prefix_label}  proba={prefix_proba}
        options : binpred={include_binary_pred_cols}
                  compat={add_compat_cols}
        seed    : {seed}
        """
    ).strip()

    return Bunch(
        frame=df,
        data=df.drop(columns=[true_col]).values,
        feature_names=feature_cols,
        target_names=[true_col],
        target=df[true_col].values,
        model_names=model_names,
        # for binary, probas live in names (m1, m2, ...)
        # for multiclass, they live under prefix_proba_*.
        prediction_columns=proba_cols,
        label_columns=pred_label_cols,
        n_classes=n_classes,
        DESCR=descr,
    )



make_classification_data.__doc__ = r"""
Generate a synthetic classification dataset with a configurable
feature process and multiple model outputs (labels and/or
probabilities).

This helper wraps a standard separable feature generator and
then synthesizes the outputs of one or more "models" whose
behavior can be controlled via ``model_profiles`` or via a
simple count ``n_models``. It supports binary and multiclass
targets, class imbalance, label noise, explicit model names,
and convenient, deterministic column naming.

Parameters
----------
n_samples : int, default=600
    Number of rows to generate.

n_features : int, default=10
    Total number of feature columns.

n_classes : int, default=2
    Number of classes. Use ``2`` for binary classification
    and values greater than 2 for multiclass.

weights : list of float or None, default=None
    Class priors that should sum (approximately) to 1. If
    ``None``, classes are (approximately) balanced.

class_sep : float, default=1.0
    Separation between classes in feature space. Larger
    values create an easier problem.

flip_y : float, default=0.0
    Fraction of labels to randomly flip as label noise.
    Must be in ``[0, 1]``.

informative_frac : float, default=0.6
    Fraction of features that are informative. Must be in
    ``[0, 1]`` and should satisfy
    ``informative_frac + redundant_frac <= 1`` [1]_.

redundant_frac : float, default=0.2
    Fraction of features that are linear combinations of
    informative features. Must be in ``[0, 1]`` and should
    satisfy
    ``informative_frac + redundant_frac <= 1``.

seed : int or None, default=42
    Random seed for reproducibility. ``None`` uses
    non-deterministic entropy.

n_models : int, default=2
    Number of built-in default model profiles to generate
    when ``model_profiles`` is ``None``. If
    ``model_profiles`` is provided, every entry (in
    insertion order) produces one model.

model_profiles : dict or None, default=None
    Optional per-model configuration. Keys are base model
    names and values are dicts with the optional fields
    ``logit_scale`` (float, default 1.0; larger means
    better separation), ``noise_std`` (float, default 0.4;
    noise added to the logits), ``bias`` (float, or one
    value per class for multiclass; default 0.0) and
    ``temp`` (float > 0, temperature scaling; default 1.0).
    If ``None``, built-in defaults are used.

model_names : list of str or None, default=None
    Display names, one per model. The length must match
    the number of models, otherwise ``ValueError`` is
    raised. If ``None``, the names ``"m1", "m2", ...`` are
    used. For binary targets the probability column of
    each model is named **exactly** as given; label columns
    and multiclass probability columns combine the name
    with ``prefix_label`` / ``prefix_proba``.

true_col : str, default="y"
    Column name for the ground-truth labels.

prefix_label : str, default="pred\_"
    Prefix for discrete label columns, which are named
    ``f"{prefix_label}{name}"``. Label columns are always
    created for multiclass targets and, for binary targets,
    only when ``include_binary_pred_cols=True``.

prefix_proba : str, default="proba\_"
    Prefix for multiclass probability columns, which are
    named ``f"{prefix_proba}{name}_{k}"`` for class index
    ``k``. Not used for binary targets.

add_compat_cols : bool, default=False
    If ``True`` and ``n_classes > 2``, add compatibility
    aliases that some plotting utilities expect: ``yt`` as
    a copy of ``true_col`` and ``yp`` as a copy of the
    first model's predicted-label column. Has no effect
    for binary targets.

include_binary_pred_cols : bool, default=False
    If ``True`` and ``n_classes == 2``, add one discrete
    label column per model (the probability thresholded at
    0.5) in addition to probabilities. The columns are
    named ``f"{prefix_label}{name}"``.

as_frame : bool, default=False
    If ``True``, return a ``pandas.DataFrame`` with tidy
    columns. Otherwise return a
    :class:`~kdiagram.api.bunch.Bunch`.

Returns
-------
pandas.DataFrame or kdiagram.api.bunch.Bunch
    If ``as_frame=True``:
        
        A DataFrame with columns:
            
        ``feature columns (x1, x2, ...) + [true_col] + proba/label columns``.
        For binary, each model contributes a single
        probability column interpreted as the
        positive-class probability. For multiclass, each
        model contributes one probability column per class
        (``f"{prefix_proba}{name}_{k}"``) and one label
        column, plus optional compatibility columns if
        requested.
        
    If ``as_frame=False``:
        
        A Bunch with fields:
            
            ``frame`` : the same DataFrame,
            ``data`` : ndarray with every column of
            ``frame`` except ``true_col`` (features followed
            by model outputs),
            ``feature_names`` : list of str,
            ``target_names`` : ``[true_col]``,
            ``target`` : ndarray of shape ``(n_samples,)``,
            ``model_names`` : list of display names,
            ``prediction_columns`` : list of probability
            column labels,
            ``label_columns`` : list of discrete label
            column labels (may be empty),
            ``n_classes`` : int,
            ``DESCR`` : short description.

Raises
------
ValueError
    If ``weights`` sum to a non-positive value, if
    ``len(weights) != n_classes``, or if the length of
    ``model_names`` does not match the number of models.

Notes
-----
- Dicts preserve insertion order. Model order follows
  ``model_profiles`` keys, or built-in defaults if profiles
  are not provided.
- For binary targets the names in ``model_names`` are used
  verbatim as the probability **column labels**, allowing
  clean DataFrames and legends downstream.
- Probability column layout differs between binary and
  multiclass. In binary, there is one column per model.
  In multiclass, there is one column per class per model,
  using class indices ``0..n_classes-1`` [2]_.

Examples
--------
Binary classification with two named models and explicit
label columns.

>>> from kdiagram.datasets.make import make_classification_data
>>> df = make_classification_data(
...     n_samples=400,
...     n_features=8,
...     n_classes=2,
...     n_models=2,
...     model_names=["Good", "Biased"],
...     include_binary_pred_cols=True,
...     as_frame=True,
...     seed=7,
... )
>>> [c for c in df.columns if c.startswith("Good")][:1]
['Good']

Multiclass with three models and compatibility columns.

>>> df = make_classification_data(
...     n_samples=600,
...     n_features=12,
...     n_classes=4,
...     n_models=3,
...     add_compat_cols=True,
...     as_frame=True,
... )
>>> {"yt", "yp"}.issubset(df.columns)
True

See Also
--------
sklearn.datasets.make_classification
    Classic feature generator for classification problems.
sklearn.metrics
    Utilities to evaluate classification (e.g., AUC,
    log-loss, accuracy, F1).

References
----------

.. [1] Bishop, C.
       Pattern Recognition and Machine Learning.
       Springer, 2006.
.. [2] Pedregosa et al.
       Scikit-learn: Machine Learning in Python.
       JMLR 12, 2825–2830, 2011.
"""


# --- helpers -----
def _resolve_model_labels(
    base_names: list[str],
    user_names: list[str] | None,
    prefix: str,
) -> tuple[list[str], list[str]]:
    """
    Map model base names to display names and DataFrame column names.

    For each base name, in order:

    - if a non-empty user name exists at the same index, it is used as
      both the display name *and* the column name (exact, no prefix);
    - otherwise the display name is the base name and the column name is
      ``prefix + _safe_name(base_name)``.

    Parameters
    ----------
    base_names : list of str
        Base model names; one output entry is produced per base name.
    user_names : list of str or None
        Optional user-supplied names, matched to ``base_names`` by index.
    prefix : str
        Prefix for column names of models without a user name.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(display_names, column_names)``, each as long as ``base_names``.

    Warns
    -----
    UserWarning
        If ``user_names`` has more entries than ``base_names``; the extra
        names are ignored.
    """
    n_user = len(user_names) if user_names else 0
    if n_user > len(base_names):
        warnings.warn(
            (
                "Received more model_names than models. "
                "Extra names will be ignored."
            ),
            UserWarning,
            stacklevel=2,
        )

    disp: list[str] = []
    cols: list[str] = []
    for i, bname in enumerate(base_names):
        # An empty or missing user name falls back to the base name.
        chosen = user_names[i] if i < n_user else None
        if chosen:
            label = str(chosen)
            disp.append(label)
            cols.append(label)  # exact, no prefix
        else:
            disp.append(bname)
            cols.append(f"{prefix}{_safe_name(bname)}")

    return disp, cols


def _softmax(z: np.ndarray) -> np.ndarray:
    z = z - z.max(axis=1, keepdims=True)
    e = np.exp(z)
    s = e.sum(axis=1, keepdims=True)
    return e / np.clip(s, 1e-12, None)


def _sigmoid(z: np.ndarray) -> np.ndarray:
    return 1.0 / (1.0 + np.exp(-z))


def _expand_param(
    param_value: Any, n_models: int, param_name: str
) -> list[Any]:
    """
    Expands a single parameter value to a list for each model.

    If the parameter is already a list, it validates its length, issues a
    warning on mismatch, and handles it by padding or truncating.
    """
    if not isinstance(param_value, list):
        # It's a single value, so we broadcast it for each model.
        return [param_value] * n_models

    # It's a list, so we check the length.
    current_len = len(param_value)

    if current_len == n_models:
        # The length is perfect, return as is.
        return param_value

    elif current_len < n_models:
        # The list is too short.
        warnings.warn(
            f"Length of `{param_name}` ({current_len}) is less than "
            f"`n_models` ({n_models}). Padding with the last value.",
            UserWarning,
            stacklevel=2,
        )
        # Pad the list by repeating the last element.
        padding_needed = n_models - current_len
        last_value = param_value[-1] if current_len > 0 else None
        return param_value + [last_value] * padding_needed

    else:  # current_len > n_models
        # The list is too long.
        warnings.warn(
            f"Length of `{param_name}` ({current_len}) is greater than "
            f"`n_models` ({n_models}). Truncating the extra values.",
            UserWarning,
            stacklevel=2,
        )
        # Truncate the list to the correct length.
        return param_value[:n_models]


def _safe_name(s: str) -> str:
    # turn any string into a simple identifier for column names
    s = re.sub(r"\W+", "_", str(s).strip())
    return s.strip("_") or "model"


def _validate_range_or_list(val, name, require_nonneg_min: bool):
    """
    Accept (min, max) or a list of (min, max).
    Only check ordering and (optionally) non-negativity of min.
    """

    def _check(lo, hi):
        if require_nonneg_min and lo < 0:
            raise ValueError(
                f"{name} must be (min, max) with min >= 0 and min <= max."
            )
        if lo > hi:
            raise ValueError(f"{name} must be (min, max) with min <= max.")

    if isinstance(val, list):
        if len(val) == 0:
            raise ValueError(f"{name} list must be non-empty.")
        for pair in val:
            if not (isinstance(pair, (list, tuple)) and len(pair) == 2):
                raise TypeError(
                    f"Each element of `{name}` must be a (min, max) tuple."
                )
            lo, hi = float(pair[0]), float(pair[1])
            _check(lo, hi)
    else:
        lo, hi = float(val[0]), float(val[1])  # tuple-like
        _check(lo, hi)