# Source code for kdiagram.datasets.load

# Author: LKouadio <etanoyau@gmail.com>
# License: Apache License 2.0

"""
Dataset Loading and Generation Utilities (:mod:`kdiagram.datasets.load`)
==========================================================================

Functions to load sample or included datasets, or generate synthetic
datasets suitable for demonstrating and testing `k-diagram`
visualizations. Datasets can be returned as pandas DataFrames or
structured Bunch objects.
"""

from __future__ import annotations

import os
import re
import shutil
import textwrap
import warnings
from importlib import resources

import numpy as np
import pandas as pd

from ..api.bunch import Bunch
from ._property import (
    KD_DMODULE,
    KD_REMOTE_DATA_URL,
    RemoteMetadata,
    download_file_if,
    get_data,
)

# Names exported by a star-import of this module.
__all__ = ["load_uncertainty_data", "load_zhongshan_subsidence"]

# Descriptor of the Zhongshan sample CSV consumed by
# ``load_zhongshan_subsidence``: the file name, the remote URL it can be
# fetched from, and the package module that bundles a local copy. No
# checksum or description module is recorded for this file.
_ZHONGSHAN_METADATA = RemoteMetadata(
    file="min_zhongshan.csv",
    url=KD_REMOTE_DATA_URL,
    checksum=None,  # TODO: Add SHA256 checksum here if available
    descr_module=None,
    data_module=KD_DMODULE,
)



# [docs]
def load_zhongshan_subsidence(
    *,
    as_frame: bool = False,
    years: list[int] | None = None,
    quantiles: list[float] | None = None,
    include_coords: bool = True,
    include_target: bool = True,
    data_home: str | None = None,
    download_if_missing: bool = True,
    force_download: bool = False,
) -> Bunch | pd.DataFrame:
    # The user-facing docstring is assigned to ``__doc__`` right after this
    # definition; the comments below describe the implementation only.
    #
    # Outline:
    #   1. resolve a readable CSV path (cache, package resource, download);
    #   2. read it with pandas;
    #   3. keep only the coordinate / target / quantile columns that match
    #      the requested years and quantile levels;
    #   4. return the filtered frame, or wrap it in a Bunch with metadata.

    # --- Step 1: Determine file location (Cache > Package > Download) ---
    if quantiles is None:
        quantiles = [0.1, 0.5, 0.9]
    data_dir = get_data(data_home)
    filename = _ZHONGSHAN_METADATA.file
    cached_path = os.path.join(data_dir, filename)
    if os.path.exists(cached_path):
        local_filepath = cached_path
    else:
        try:
            # Not cached yet: point at the copy bundled with the package
            # and use that directory as the working data directory.
            local_filepath = str(
                resources.files(KD_DMODULE).joinpath(filename)
            )
            data_dir = os.path.dirname(local_filepath)
        except Exception as e:
            # Best effort: fall back to the cache location.
            print(f"An unexpected error occurred: {e}")
            local_filepath = cached_path

    package_module_path = _ZHONGSHAN_METADATA.data_module

    filepath_to_load = None

    # Force download if requested
    if force_download:
        if download_if_missing:
            print(f"Forcing download of '{filename}'...")
            dl_path = download_file_if(
                _ZHONGSHAN_METADATA,
                data_home=data_dir,
                download_if_missing=True,
                error="warn",
                verbose=1,
            )
            if dl_path and os.path.exists(dl_path):
                filepath_to_load = dl_path
            else:
                # Not fatal: the cache / package lookups below still run.
                warnings.warn(
                    f"Forced download failed for {filename}.", stacklevel=2
                )
        else:
            warnings.warn(
                f"Cannot force download for {filename}, "
                f"download_if_missing is False.",
                stacklevel=2,
            )
            # Proceed to check local cache/package only

    # Check cache first (unless download was forced and succeeded)
    if filepath_to_load is None and os.path.exists(local_filepath):
        print(f"Loading dataset from cache: {local_filepath}")
        filepath_to_load = local_filepath

    # Check package resources if not found in cache
    if filepath_to_load is None:
        try:
            pkg_root = resources.files(package_module_path)
            candidate = pkg_root.joinpath(filename)

            if candidate.is_file():
                print(
                    "Loading dataset from installed"
                    f" package: {package_module_path}"
                )
                # Get a real filesystem path even if inside a wheel/zip
                with resources.as_file(candidate) as rpath:
                    filepath_to_load = str(rpath)

                    # Copy to cache for future use if not already there
                    if not os.path.exists(local_filepath):
                        try:
                            os.makedirs(
                                os.path.dirname(local_filepath), exist_ok=True
                            )
                            shutil.copyfile(filepath_to_load, local_filepath)
                        except Exception as copy_err:
                            warnings.warn(
                                f"Could not copy dataset to cache: {copy_err}",
                                stacklevel=2,
                            )
                        else:
                            print(
                                f"Copied dataset to cache: {local_filepath}"
                            )
                            # ``rpath`` may be a temporary extraction that
                            # is removed when the ``with`` block exits, so
                            # read from the persistent copy instead.
                            filepath_to_load = local_filepath
            else:
                print(
                    f"Dataset not found in package resources: "
                    f"{package_module_path}/{filename}"
                )
        except ModuleNotFoundError:
            print(f"Package data module not found: {package_module_path}")
        except Exception as res_err:
            warnings.warn(
                f"Error accessing package resources: {res_err}",
                stacklevel=2,
            )

    # Attempt download if still not found and allowed
    if filepath_to_load is None and download_if_missing:
        print(f"Attempting download of '{filename}' to cache: {data_dir}")
        filepath_to_load = download_file_if(
            _ZHONGSHAN_METADATA,
            data_home=data_dir,
            download_if_missing=True,
            error="warn",
            verbose=1,
        )

    # Final check if we have a path
    if filepath_to_load is None or not os.path.exists(filepath_to_load):
        raise FileNotFoundError(
            f"Zhongshan subsidence dataset ('{filename}') not found in "
            f"cache ('{data_dir}'), package resources, and could not be "
            f"downloaded. Try setting download_if_missing=True or check "
            f"internet connection."
        )

    # --- Step 2: Load data ---
    try:
        df = pd.read_csv(filepath_to_load)
    except Exception as e:
        raise OSError(
            f"Error reading dataset file at {filepath_to_load}: {e}"
        ) from e

    # --- Step 3: Subsetting / Column Selection ---
    # Quantile columns end in ``_<year>_q<level>``; target columns start
    # with ``subsidence`` and end in ``_<year>``.
    q_pattern = re.compile(r"_(\d{4})_q([0-9.]+)$")
    target_pattern = re.compile(r"_(\d{4})$")

    # Identify available years and quantiles from column names
    available_years = set()
    available_quantiles = set()
    for col in df.columns:
        q_match = q_pattern.search(col)
        t_match = target_pattern.search(col)
        if q_match:
            available_years.add(int(q_match.group(1)))
            available_quantiles.add(float(q_match.group(2)))
        elif (
            t_match
            and col.endswith(t_match.group(1))
            and col.startswith("subsidence")
        ):  # Be specific for target
            available_years.add(int(t_match.group(1)))

    known_years = set(available_years)
    known_quantiles = set(available_quantiles)
    available_years = sorted(available_years)
    available_quantiles = sorted(available_quantiles)

    # Validate requested years and quantiles. ``quantiles`` is never None
    # at this point because the default was applied in Step 1.
    requested_years = set(years) if years is not None else set(known_years)
    requested_quantiles = set(quantiles)

    invalid_years = requested_years - known_years
    invalid_quantiles = requested_quantiles - known_quantiles

    if invalid_years:
        warnings.warn(
            f"Requested years not available: {invalid_years}. "
            f"Available: {available_years}",
            UserWarning,
            stacklevel=2,
        )
        requested_years &= known_years  # Keep only valid ones
    if invalid_quantiles:
        warnings.warn(
            f"Requested quantiles not available: {invalid_quantiles}. "
            f"Available: {available_quantiles}",
            UserWarning,
            stacklevel=2,
        )
        requested_quantiles &= known_quantiles  # Keep only valid ones

    # Select columns based on flags and validated requests
    cols_to_keep = []
    if include_coords:
        cols_to_keep.extend(
            name for name in ("longitude", "latitude") if name in df.columns
        )

    target_cols_found = []
    for col in df.columns:
        q_match = q_pattern.search(col)
        t_match = target_pattern.search(col)

        if include_target and t_match and col.startswith("subsidence"):
            if int(t_match.group(1)) in requested_years:
                cols_to_keep.append(col)
                target_cols_found.append(col)
        elif q_match:
            year = int(q_match.group(1))
            q_val = float(q_match.group(2))
            if year in requested_years and q_val in requested_quantiles:
                cols_to_keep.append(col)

    # Deterministic column order: coordinates first, then quantile columns
    # whose name starts with ``subsidence_``, then everything else; ties
    # are broken alphabetically.
    # NOTE(review): the previous comment here said base targets come right
    # after the coordinates, but this key sorts ``subsidence_*`` quantile
    # columns ahead of them. The order is kept unchanged; confirm which
    # one is intended.
    cols_to_keep = sorted(
        set(cols_to_keep),
        key=lambda x: (
            not x.startswith("lon") and not x.startswith("lat"),
            not x.startswith("subsidence_") or q_pattern.search(x) is None,
            x,
        ),
    )
    df_subset = df[cols_to_keep].copy()

    # --- Step 4: Return DataFrame or Bunch ---
    if as_frame:
        return df_subset

    feature_names = []
    if include_coords:
        feature_names.extend(
            name
            for name in ("longitude", "latitude")
            if name in df_subset.columns
        )
    target_names = target_cols_found
    target_array = df_subset[target_names].values if target_names else None

    # Group the quantile columns that survived subsetting under keys such
    # as 'q0.1', 'q0.5', 'q0.9'. ``float`` cannot fail here: every
    # quantile column was already parsed during the scan in Step 3.
    q_cols_found = {}
    all_q_cols_found = []
    for col in df_subset.columns:
        q_match = q_pattern.search(col)
        if q_match:
            q_key = f"q{float(q_match.group(2)):.1f}"
            q_cols_found.setdefault(q_key, []).append(col)
            all_q_cols_found.append(col)

    # Create description
    descr = textwrap.dedent(
        f"""\
        Zhongshan Land Subsidence Prediction Dataset

        **Origin:**
        This dataset contains processed outputs from a land subsidence
        forecasting study focused on Zhongshan, China. It includes
        simulated quantile predictions (Q10, Q50, Q90) for multiple
        future years (2022-2026) and base 'target' subsidence values
        for reference years (2022, 2023) at 898 locations.

        **Data Characteristics:**
        - Samples: {len(df_subset)} (Locations)
        - Features: {len(feature_names)} ({", ".join(feature_names)})
        - Target Columns: {len(target_names)} ({", ".join(target_names)})
        - Quantile Columns: {len(all_q_cols_found)} (Subset based on request)
        - Available Years (in original file): {available_years}
        - Available Quantiles (in original file): {available_quantiles}
        - Loaded Years: {sorted(list(requested_years))}
        - Loaded Quantiles: {sorted(list(requested_quantiles))}

        **Contents (Bunch object):**
        - frame           : Filtered pandas DataFrame based on parameters.
        - feature_names   : List of coordinate column names.
        - target_names    : List of loaded target column names.
        - target          : NumPy array of target values (if loaded).
        - longitude       : NumPy array of longitude values (if loaded).
        - latitude        : NumPy array of latitude values (if loaded).
        - quantile_cols   : Dict mapping requested/loaded quantiles
                          ('q0.1', etc.) to lists of column names.
        - q10_cols        : List of loaded Q10 column names.
        - q50_cols        : List of loaded Q50 column names.
        - q90_cols        : List of loaded Q90 column names.
        - years_available : List of all years detected in original columns.
        - quantiles_available: List of all quantiles detected.
        - n_periods       : Number of periods with quantile data.
        - start_year      : Starting year for period columns.
        - DESCR           : This description.

        This dataset is suitable for demonstrating uncertainty plots like
        plot_model_drift, plot_uncertainty_drift, plot_coverage_diagnostic,
        plot_anomaly_magnitude (using target cols), etc.
        """
    )

    # ``requested_years`` is a set, so its iteration order is arbitrary;
    # take the minimum explicitly so ``start_year`` really is the earliest
    # loaded year. An empty selection keeps the "" placeholder.
    start_year = min(requested_years) if requested_years else ""

    bunch_dict = {
        "frame": df_subset,
        "feature_names": feature_names,
        "target_names": target_names,
        "target": target_array,
        "quantile_cols": q_cols_found,
        "q10_cols": q_cols_found.get("q0.1", []),
        "q50_cols": q_cols_found.get("q0.5", []),
        "q90_cols": q_cols_found.get("q0.9", []),
        "years_available": available_years,
        "quantiles_available": available_quantiles,
        "start_year": start_year,
        "n_periods": len(requested_years),
        "DESCR": descr,
    }
    # Add coordinates as top-level attributes if included
    if include_coords:
        if "longitude" in df_subset:
            bunch_dict["longitude"] = df_subset["longitude"].values
        if "latitude" in df_subset:
            bunch_dict["latitude"] = df_subset["latitude"].values

    return Bunch(**bunch_dict)



load_zhongshan_subsidence.__doc__ = r"""
Load the Zhongshan land subsidence prediction dataset.

This dataset contains sample multi-period quantile predictions
(Q10, Q50, Q90 for 2022–2026) and simulated actual subsidence for
2022 and 2023, along with geographic coordinates for 898 locations
in Zhongshan, China. It is intended for demonstrating and testing
`k-diagram`'s uncertainty and evaluation plots and for reproducing
examples related to spatiotemporal uncertainty diagnostics
:footcite:p:`Liu2024, kouadiob2025`.

The function searches a local cache directory, bundled package
resources, and optionally a remote repository (in that order).
On success it returns either a pandas ``DataFrame`` or a
:class:`~kdiagram.bunch.Bunch` with convenient attributes.

Parameters
----------
as_frame : bool, default=False
    If ``False``, return a :class:`~kdiagram.bunch.Bunch` that
    includes the filtered ``DataFrame`` plus metadata and sliced
    arrays (e.g., coordinates, target, and quantile columns).
    If ``True``, return only the filtered ``DataFrame``.

years : list of int, optional
    Subset to these calendar years (e.g., ``[2023, 2025]``) when
    selecting target and quantile columns. If ``None``, load all
    years found in the file (quantiles typically 2022–2026; targets
    typically 2022/2023).

quantiles : list of float, optional
    Subset to these quantile levels in ``[0, 1]`` (e.g.,
    ``[0.1, 0.5, 0.9]``). If ``None`` (the default), the levels
    ``[0.1, 0.5, 0.9]`` are requested.

include_coords : bool, default=True
    If ``True``, include coordinate columns ``'longitude'`` and
    ``'latitude'`` when present.

include_target : bool, default=True
    If ``True``, include base target columns (e.g.,
    ``'subsidence_2022'``, ``'subsidence_2023'``) when present
    and consistent with the requested ``years``.

data_home : str, optional
    Directory path for caching datasets. If ``None``, the path is
    resolved by :func:`~kdiagram.datasets._property.get_data`. You
    may also configure the root via the ``KDIAGRAM_DATA`` environment
    variable. Example default is ``~/kdiagram_data``.

download_if_missing : bool, default=True
    If ``True``, attempt to download the dataset into the cache when
    it is not found locally nor in package resources.

force_download : bool, default=False
    If ``True``, attempt to fetch a fresh copy even if a local file
    exists. Useful to refresh data during development.

Returns
-------
data : :class:`~kdiagram.bunch.Bunch` or pandas.DataFrame
    If ``as_frame=False`` (default) a Bunch with:
        
    - ``frame`` : pandas ``DataFrame`` filtered by the request.
    - ``feature_names`` : list of included coordinate column names.
    - ``target_names`` : list of included target column names.
    - ``target`` : NumPy array of target values (or ``None``).
    - ``longitude``, ``latitude`` : NumPy arrays when coordinates
      are included.
    - ``quantile_cols`` : dict mapping keys like ``'q0.1'`` to
      lists of matching column names.
    - ``q10_cols``, ``q50_cols``, ``q90_cols`` : convenience lists.
    - ``years_available``, ``quantiles_available`` : lists detected
      in the original file.
    - ``start_year`` : smallest year in the loaded subset (if any).
    - ``n_periods`` : number of loaded years.
    - ``DESCR`` : human-readable dataset description.

    If ``as_frame=True``, only the filtered pandas ``DataFrame`` is
    returned.

Raises
------
FileNotFoundError
    When the dataset cannot be resolved from cache or package
    resources and either downloading is disabled or the download
    fails.

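OSError
    If the resolved dataset file cannot be read as CSV.

Warns
-----
UserWarning
    If requested ``years`` or ``quantiles`` are not present in the
    data file; unavailable values are dropped from the selection.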

Notes
-----
**Search order.** The loader resolves a file path using the
following order: (1) local cache under ``data_home``;
(2) installed package resources; (3) optional remote download
when ``download_if_missing=True``. You can force step (3) with
``force_download=True``.

**Column detection.** Quantile columns encode a year :math:`y` and a
quantile level :math:`q` in their names.

.. math::

   \text{quantile name}
   \;\equiv\;
   \texttt{<prefix>}\_{y}\_\texttt{q}q,
   \qquad
   y \in \{2022,\dots,2026\},\; q \in (0,1)

Target columns encode only the year :math:`y`:

.. math::

   \text{target name}
   \;\equiv\;
   \texttt{subsidence}\_{y}

In code, the implementation detects these with the following regular
expressions (kept as literals, not math):
``r"_(\d{4})_q([0-9.]+)$"`` (quantile columns) and
``r"_(\d{4})$"`` (target columns).

This design enables flexible subsetting by year and quantile without
hard-coding headers.

**Coordinate handling.** When present and ``include_coords=True``,
the columns ``'longitude'`` and ``'latitude'`` are included and
exposed both in the returned frame and as top-level arrays in the
Bunch for convenience.

**Intended use.** The dataset is a compact sample designed for
tutorials, documentation figures, and regression tests of k-diagram
uncertainty diagnostics :footcite:p:`kouadiob2025`. It is not a
comprehensive research release.

See Also
--------
load_uncertainty_data
    Generate a synthetic dataset with controllable anomalies and
    quantiles for testing visual diagnostics.

kdiagram.plot.uncertainty.plot_model_drift
kdiagram.plot.uncertainty.plot_uncertainty_drift
kdiagram.plot.uncertainty.plot_coverage_diagnostic
kdiagram.plot.uncertainty.plot_anomaly_magnitude
    Example consumers of this dataset in documentation figures.

Examples
--------
Basic usage returning a Bunch with metadata:

>>> from kdiagram.datasets import load_zhongshan_subsidence
>>> ds = load_zhongshan_subsidence()
>>> isinstance(ds.frame, type(__import__('pandas').DataFrame()))
True
>>> list(ds.quantile_cols.keys())[:3]
['q0.1', 'q0.5', 'q0.9']
>>>
>>> # Return only the DataFrame and subset to selected years/quantiles:
>>>
>>> df = load_zhongshan_subsidence(
...     as_frame=True, years=[2023, 2025], quantiles=[0.1, 0.9]
... )
>>> set(c.split('_')[-1] for c in df.columns if '_q' in c) <= {'q0.1','q0.9'}
True
>>>
>>> # Force a fresh download into the cache:
>>> _ = load_zhongshan_subsidence(force_download=True)

References
----------

.. footbibliography::
    
"""



# [docs]
def load_uncertainty_data(
    *,
    as_frame: bool = False,
    n_samples: int = 150,
    n_periods: int = 4,
    anomaly_frac: float = 0.15,
    start_year: int = 2022,
    prefix: str = "value",
    base_value: float = 10.0,
    trend_strength: float = 1.5,
    noise_level: float = 2.0,
    interval_width_base: float = 4.0,
    interval_width_noise: float = 1.5,
    interval_width_trend: float = 0.5,
    seed: int | None = 42,
) -> Bunch | pd.DataFrame:
    # The user-facing docstring is assigned to ``__doc__`` after this
    # definition. Implementation outline:
    #   1. draw per-sample features and a base signal;
    #   2. for each period derive Q50 from the base signal plus a linear
    #      trend, and place Q10/Q90 symmetrically around it;
    #   3. push a fraction of the actual values outside the first period's
    #      Q10-Q90 interval;
    #   4. return the frame, or a Bunch carrying the frame plus metadata.
    # The sequence of random draws determines the output for a given seed,
    # so the statement order below must be preserved.
    rng = np.random.default_rng(seed)  # ``None`` yields an unseeded RNG

    actual_col_name = f"{prefix}_actual"
    feature_names = ["location_id", "longitude", "latitude", "elevation"]
    target_names = [actual_col_name]

    # --- Per-sample features and base signal ---
    sample_ids = np.arange(n_samples)
    lon = rng.uniform(-120, -115, n_samples)
    lat = rng.uniform(33, 36, n_samples)
    elev = rng.uniform(50, 500, n_samples) + lat * 5
    wave = np.sin(np.linspace(0, 3 * np.pi, n_samples)) * 5
    base_signal = (
        base_value + wave + rng.normal(0, noise_level / 2, n_samples)
    )
    first_actual = base_signal + rng.normal(0, noise_level / 2, n_samples)

    columns = dict(zip(feature_names, (sample_ids, lon, lat, elev)))
    columns[actual_col_name] = first_actual.copy()

    # --- Quantile columns, one (Q10, Q50, Q90) trio per period ---
    levels = ("q0.1", "q0.5", "q0.9")
    quantile_cols_dict = {level: [] for level in levels}
    for step in range(n_periods):
        year = start_year + step
        low_name, mid_name, high_name = (
            f"{prefix}_{year}_{level}" for level in levels
        )
        for level, name in zip(levels, (low_name, mid_name, high_name)):
            quantile_cols_dict[level].append(name)

        median = (
            base_signal
            + trend_strength * step
            + rng.normal(0, noise_level / 3, n_samples)
        )
        jitter = rng.uniform(
            -interval_width_noise / 2, interval_width_noise / 2, n_samples
        )
        # Keep every interval strictly positive in width.
        width = np.maximum(
            0.1, interval_width_base + interval_width_trend * step + jitter
        )

        columns[low_name] = median - width / 2
        columns[mid_name] = median
        columns[high_name] = median + width / 2

    # Independent list copies so the Bunch attributes do not alias the
    # lists stored in ``quantile_cols_dict``.
    all_q10_cols = list(quantile_cols_dict["q0.1"])
    all_q50_cols = list(quantile_cols_dict["q0.5"])
    all_q90_cols = list(quantile_cols_dict["q0.9"])

    df = pd.DataFrame(columns)

    # --- Inject anomalies relative to the first period's interval ---
    n_anomalies = 0
    if anomaly_frac > 0 and n_samples > 0:
        n_anomalies = int(anomaly_frac * n_samples)
    if n_anomalies > 0 and all_q10_cols and all_q90_cols:
        picked = rng.choice(n_samples, size=n_anomalies, replace=False)
        n_below = n_anomalies // 2
        below, above = picked[:n_below], picked[n_below:]
        scale = interval_width_base / 2 + 1

        lower_bound = df[all_q10_cols[0]].iloc[below]
        upper_bound = df[all_q90_cols[0]].iloc[above]

        # First half of the picks falls below Q10, the rest above Q90.
        df.loc[below, actual_col_name] = (
            lower_bound - rng.uniform(0.5, 3.0, size=len(below)) * scale
        )
        df.loc[above, actual_col_name] = (
            upper_bound + rng.uniform(0.5, 3.0, size=len(above)) * scale
        )

    # --- Final column order: features, target, then per-period trios ---
    interleaved = []
    for trio in zip(all_q10_cols, all_q50_cols, all_q90_cols):
        interleaved.extend(trio)
    df = df[feature_names + target_names + interleaved]

    if as_frame:
        return df

    descr = textwrap.dedent(
        f"""\
        Synthetic Multi-Period Uncertainty Dataset for k-diagram

        **Description:**
        Generates synthetic data simulating quantile forecasts (Q10,
        Q50, Q90) for '{prefix}' over {n_periods} periods starting
        from {start_year} across {n_samples} samples/locations. Includes
        spatial coordinates, an 'elevation' feature, and an 'actual'
        value (``{actual_col_name}``) for the first period. Anomalies
        (actual values outside the first period's Q10-Q90 interval)
        are introduced for ~{anomaly_frac * 100:.0f}% of samples. Both the
        median (Q50) and the interval width can exhibit configurable
        trends and noise.

        **Generation Parameters:**
        - n_samples             : {n_samples}
        - n_periods             : {n_periods}
        - start_year            : {start_year}
        - prefix                : '{prefix}'
        - anomaly_frac          : {anomaly_frac:.2f}
        - base_value            : {base_value:.2f}
        - trend_strength        : {trend_strength:.2f}
        - noise_level           : {noise_level:.2f}
        - interval_width_base   : {interval_width_base:.2f}
        - interval_width_noise  : {interval_width_noise:.2f}
        - interval_width_trend  : {interval_width_trend:.2f}
        - seed                  : {seed}

        **Bunch Attributes:**
        - frame           : Complete pandas DataFrame.
        - feature_names   : List of coordinate/feature column names.
        - target_names    : List containing the target column name.
        - target          : NumPy array of target values.
        - quantile_cols   : Dict mapping quantiles to column name lists.
        - q10_cols        : List of Q10 column names.
        - q50_cols        : List of Q50 column names.
        - q90_cols        : List of Q90 column names.
        - n_periods       : Number of periods with quantile data.
        - prefix          : Prefix used for value/quantile columns.
        - start_year      : Starting year for period columns.
        - DESCR           : This description.
        """
    )

    return Bunch(
        frame=df,
        feature_names=feature_names,
        target_names=target_names,
        target=df[target_names[0]].values,
        quantile_cols=quantile_cols_dict,
        q10_cols=all_q10_cols,
        q50_cols=all_q50_cols,
        q90_cols=all_q90_cols,
        n_periods=n_periods,
        prefix=prefix,
        start_year=start_year,
        DESCR=descr,
    )



load_uncertainty_data.__doc__ = r"""
Generate a synthetic dataset for uncertainty diagnostics.

Creates a compact, controllable dataset for demonstrating
`k-diagram` plots: one period of actuals, multi-period predicted
quantiles (Q10, Q50, Q90), configurable trends and noise, injected
interval failures (anomalies), and synthetic coordinates. Useful for
examples, unit tests, and performance checks :footcite:p:`kouadiob2025`.

Parameters
----------
as_frame : bool, default=False
    If ``False``, return a :class:`~kdiagram.bunch.Bunch` with the
    generated frame and metadata; if ``True``, return only the
    pandas ``DataFrame``.

n_samples : int, default=150
    Number of rows (locations) to generate.

n_periods : int, default=4
    Number of consecutive periods (e.g., years) for which quantiles
    are generated.

anomaly_frac : float, default=0.15
    Approximate fraction in ``[0, 1]`` where the actual value lies
    outside the first period’s Q10–Q90 interval.

start_year : int, default=2022
    Starting year used when naming time-dependent columns.

prefix : str, default='value'
    Base prefix for value and quantile column names.

base_value : float, default=10.0
    Approximate mean of the signal in the first period.

trend_strength : float, default=1.5
    Linear trend added to the Q50 trajectory across periods.

noise_level : float, default=2.0
    Standard deviation of base random noise added to values.

interval_width_base : float, default=4.0
    Base width of the Q10–Q90 interval in the first period.

interval_width_noise : float, default=1.5
    Random variability added to the interval width per sample/period.

interval_width_trend : float, default=0.5
    Linear trend added to the interval width across periods.

seed : int or None, default=42
    Random seed for reproducibility. If ``None``, use an
    unconstrained RNG state.

Returns
-------
data : :class:`~kdiagram.bunch.Bunch` or pandas.DataFrame
    If ``as_frame=False`` (default) a Bunch with:
        
    - ``frame`` : pandas ``DataFrame`` of synthesized values.
    - ``feature_names`` : included feature columns (e.g., coords).
    - ``target_names`` : names of actual/target columns.
    - ``target`` : NumPy array of target values (if present).
    - ``quantile_cols`` : dict mapping ``'q0.1'``, ``'q0.5'``,
      ``'q0.9'`` to lists of columns across periods.
    - ``q10_cols``, ``q50_cols``, ``q90_cols`` : convenience lists.
    - ``n_periods``, ``prefix``, ``start_year``, and ``DESCR``.

    If ``as_frame=True``, return only the ``DataFrame``.

Notes
-----
The generator injects a user-controlled fraction of interval
failures to create meaningful examples for coverage and anomaly
diagnostics. Use this dataset to exercise plots such as coverage
rates, point-wise coverage diagnostics, anomaly magnitude,
temporal consistency, and drift views 
:footcite:p:`Jolliffe2012, Gneiting2007b, kouadiob2025`.

See Also
--------
load_zhongshan_subsidence
    Real-world sample for Zhongshan subsidence with quantiles and
    coordinates.

kdiagram.plot.uncertainty.plot_coverage
kdiagram.plot.uncertainty.plot_coverage_diagnostic
kdiagram.plot.uncertainty.plot_anomaly_magnitude
kdiagram.plot.uncertainty.plot_interval_consistency
kdiagram.plot.uncertainty.plot_model_drift
    Visual diagnostics this dataset was designed to support.

Examples
--------
>>> # Create a small dataset and explore quantile columns:
>>> 
>>> from kdiagram.datasets import load_uncertainty_data
>>> ds = load_uncertainty_data(n_samples=10, n_periods=3, seed=0)
>>> sorted(ds.quantile_cols.keys())
['q0.1', 'q0.5', 'q0.9']
>>> 
>>> # Return a ``DataFrame`` only:
>>> 
>>> df = load_uncertainty_data(as_frame=True, n_samples=5, seed=1)
>>> df.shape[0] == 5
True

References 
----------

.. footbibliography::
    
"""