# Source code for kdiagram.datasets.load

# Author: LKouadio <etanoyau@gmail.com>
# License: Apache License 2.0

"""
Dataset Loading and Generation Utilities (:mod:`kdiagram.datasets.load`)
==========================================================================

Functions to load sample or included datasets, or generate synthetic
datasets suitable for demonstrating and testing `k-diagram`
visualizations. Datasets can be returned as pandas DataFrames or
structured Bunch objects.
"""

from __future__ import annotations

import os
import re
import shutil
import textwrap
import warnings
from importlib import resources

import numpy as np
import pandas as pd

from ..api.bunch import Bunch
from ._property import (
    KD_DMODULE,
    KD_REMOTE_DATA_URL,
    RemoteMetadata,
    download_file_if,
    get_data,
)

# Names exported by ``from kdiagram.datasets.load import *``.
__all__ = ["load_uncertainty_data", "load_zhongshan_subsidence"]

# Metadata record for the Zhongshan sample CSV. ``load_zhongshan_subsidence``
# reads ``file`` and ``data_module`` from it and hands the whole record to
# ``download_file_if`` when a download is attempted.
_ZHONGSHAN_METADATA = RemoteMetadata(
    file="min_zhongshan.csv",
    url=KD_REMOTE_DATA_URL,
    checksum=None,  # TODO: add a SHA256 checksum once one is available
    descr_module=None,
    data_module=KD_DMODULE,
)


def load_zhongshan_subsidence(
    *,
    as_frame: bool = False,
    years: list[int] | None = None,
    quantiles: list[float] | None = None,
    include_coords: bool = True,
    include_target: bool = True,
    data_home: str | None = None,
    download_if_missing: bool = True,
    force_download: bool = False,
) -> Bunch | pd.DataFrame:
    # Resolve the Zhongshan sample CSV (cache, package resources, download),
    # read it, keep the requested coordinate/target/quantile columns and
    # return either the filtered DataFrame or a Bunch with metadata.
    # The user-facing docstring is attached to ``__doc__`` right after this
    # definition.

    # --- Step 1: Determine file location (Cache > Package > Download) ---
    if quantiles is None:
        quantiles = [0.1, 0.5, 0.9]

    data_dir = get_data(data_home)
    filename = _ZHONGSHAN_METADATA.file
    cached_path = os.path.join(data_dir, filename)

    if os.path.exists(cached_path):
        local_filepath = cached_path
    else:
        # NOTE(review): when the cache has no copy, both ``local_filepath``
        # and ``data_dir`` are redirected to the installed package's data
        # directory, so later "cache" messages and downloads refer to that
        # directory. Kept as-is; confirm this is intended.
        try:
            local_filepath = str(
                resources.files(KD_DMODULE).joinpath(filename)
            )
            data_dir = os.path.dirname(local_filepath)
        except Exception as e:
            # Best-effort fallback: stay with the cache location.
            print(f"An unexpected error occurred: {e}")
            local_filepath = cached_path

    package_module_path = _ZHONGSHAN_METADATA.data_module
    filepath_to_load = None

    # Force download if requested
    if force_download:
        if download_if_missing:
            print(f"Forcing download of '{filename}'...")
            dl_path = download_file_if(
                _ZHONGSHAN_METADATA,
                data_home=data_dir,
                download_if_missing=True,
                error="warn",
                verbose=1,
            )
            if dl_path and os.path.exists(dl_path):
                filepath_to_load = dl_path
            else:
                # Not fatal: the cache / package lookups below still run.
                warnings.warn(
                    f"Forced download failed for {filename}.", stacklevel=2
                )
        else:
            warnings.warn(
                f"Cannot force download for {filename}, "
                f"download_if_missing is False.",
                stacklevel=2,
            )

    # Check cache first (unless download was forced and succeeded)
    if filepath_to_load is None and os.path.exists(local_filepath):
        print(f"Loading dataset from cache: {local_filepath}")
        filepath_to_load = local_filepath

    # Check package resources if not found in cache
    if filepath_to_load is None:
        try:
            candidate = resources.files(package_module_path).joinpath(
                filename
            )
            if candidate.is_file():
                print(
                    f"Loading dataset from installed"
                    f" package: {package_module_path}"
                )
                # Get a real filesystem path even if inside a wheel/zip
                with resources.as_file(candidate) as rpath:
                    filepath_to_load = str(rpath)
                    # Copy to cache while the resource path is guaranteed
                    # to exist.
                    if not os.path.exists(local_filepath):
                        try:
                            os.makedirs(
                                os.path.dirname(local_filepath),
                                exist_ok=True,
                            )
                            shutil.copyfile(filepath_to_load, local_filepath)
                            print(
                                f"Copied dataset to cache: {local_filepath}"
                            )
                        except OSError as copy_err:
                            warnings.warn(
                                f"Could not copy dataset to cache: {copy_err}",
                                stacklevel=2,
                            )
                # ``as_file`` may have handed out a temporary file that is
                # removed on exit (zipped packages). Fall back to the cache
                # copy, or to None so the download step can still run.
                if not os.path.exists(filepath_to_load):
                    filepath_to_load = (
                        local_filepath
                        if os.path.exists(local_filepath)
                        else None
                    )
            else:
                print(
                    f"Dataset not found in package resources: "
                    f"{package_module_path}/{filename}"
                )
        except ModuleNotFoundError:
            print(f"Package data module not found: {package_module_path}")
        except Exception as res_err:
            warnings.warn(
                f"Error accessing package resources: {res_err}",
                stacklevel=2,
            )

    # Attempt download if still not found and allowed
    if filepath_to_load is None and download_if_missing:
        print(f"Attempting download of '{filename}' to cache: {data_dir}")
        filepath_to_load = download_file_if(
            _ZHONGSHAN_METADATA,
            data_home=data_dir,
            download_if_missing=True,
            error="warn",
            verbose=1,
        )

    # Final check if we have a path
    if filepath_to_load is None or not os.path.exists(filepath_to_load):
        raise FileNotFoundError(
            f"Zhongshan subsidence dataset ('{filename}') not found in "
            f"cache ('{data_dir}'), package resources, and could not be "
            f"downloaded. Try setting download_if_missing=True or check "
            f"internet connection."
        )

    # --- Step 2: Load data ---
    try:
        df = pd.read_csv(filepath_to_load)
    except Exception as e:
        raise OSError(
            f"Error reading dataset file at {filepath_to_load}: {e}"
        ) from e

    # --- Step 3: Subsetting / Column Selection ---
    q_pattern = re.compile(r"_(\d{4})_q([0-9.]+)$")
    target_pattern = re.compile(r"_(\d{4})$")

    # Parse every column name once. A name cannot match both patterns: the
    # quantile pattern requires "_q..." after the last underscore, the
    # target pattern requires four digits there.
    quantile_info = {}  # column -> (year, quantile level)
    target_info = {}  # column -> year
    for col in df.columns:
        q_match = q_pattern.search(col)
        if q_match:
            q_val_str = q_match.group(2)
            try:
                q_val = float(q_val_str)
            except ValueError:
                # "[0-9.]+" also matches strings such as ".." or "1.2.3".
                warnings.warn(
                    f"Could not parse quantile value '{q_val_str}'"
                    f" from column '{col}'. Skipping.",
                    stacklevel=2,
                )
                continue
            quantile_info[col] = (int(q_match.group(1)), q_val)
            continue
        t_match = target_pattern.search(col)
        if t_match and col.startswith("subsidence"):
            target_info[col] = int(t_match.group(1))

    available_years = sorted(
        {year for year, _ in quantile_info.values()}
        | set(target_info.values())
    )
    available_quantiles = sorted({q for _, q in quantile_info.values()})

    # Validate requested years and quantiles; unavailable ones are reported
    # and dropped rather than treated as errors.
    requested_years = (
        set(years) if years is not None else set(available_years)
    )
    requested_quantiles = set(quantiles)

    invalid_years = requested_years - set(available_years)
    invalid_quantiles = requested_quantiles - set(available_quantiles)

    if invalid_years:
        warnings.warn(
            f"Requested years not available: {invalid_years}. "
            f"Available: {available_years}",
            UserWarning,
            stacklevel=2,
        )
        requested_years &= set(available_years)
    if invalid_quantiles:
        warnings.warn(
            f"Requested quantiles not available: {invalid_quantiles}. "
            f"Available: {available_quantiles}",
            UserWarning,
            stacklevel=2,
        )
        requested_quantiles &= set(available_quantiles)

    # Select columns based on flags and validated requests
    cols_to_keep = []
    if include_coords:
        cols_to_keep.extend(
            name for name in ("longitude", "latitude") if name in df.columns
        )

    # Targets keep the order in which they appear in the file.
    target_names = [
        col
        for col, year in target_info.items()
        if include_target and year in requested_years
    ]
    cols_to_keep.extend(target_names)
    cols_to_keep.extend(
        col
        for col, (year, q_val) in quantile_info.items()
        if year in requested_years and q_val in requested_quantiles
    )

    # Column order: coordinates, then base targets, then quantile columns,
    # each group sorted by name.
    target_set = set(target_names)
    cols_to_keep = sorted(
        set(cols_to_keep),
        key=lambda name: (
            not name.startswith(("lon", "lat")),
            name not in target_set,
            name,
        ),
    )

    df_subset = df[cols_to_keep].copy()

    # --- Step 4: Return DataFrame or Bunch ---
    if as_frame:
        return df_subset

    feature_names = []
    if include_coords:
        feature_names = [
            name
            for name in ("longitude", "latitude")
            if name in df_subset.columns
        ]

    target_array = df_subset[target_names].values if target_names else None

    # Group the kept quantile columns under keys such as 'q0.1', following
    # the column order of the subset frame.
    q_cols_found = {}
    all_q_cols_found = []
    for col in df_subset.columns:
        if col not in quantile_info:
            continue
        q_key = f"q{quantile_info[col][1]:.1f}"
        q_cols_found.setdefault(q_key, []).append(col)
        all_q_cols_found.append(col)

    loaded_years = sorted(requested_years)
    loaded_quantiles = sorted(requested_quantiles)
    features_txt = ", ".join(feature_names)
    targets_txt = ", ".join(target_names)

    descr = textwrap.dedent(
        f"""\
        Zhongshan Land Subsidence Prediction Dataset

        **Origin:**
        This dataset contains processed outputs from a land subsidence
        forecasting study focused on Zhongshan, China. It includes
        simulated quantile predictions (Q10, Q50, Q90) for multiple
        future years (2022-2026) and base 'target' subsidence values
        for reference years (2022, 2023) at 898 locations.

        **Data Characteristics:**
        - Samples: {len(df_subset)} (Locations)
        - Features: {len(feature_names)} ({features_txt})
        - Target Columns: {len(target_names)} ({targets_txt})
        - Quantile Columns: {len(all_q_cols_found)} (Subset based on request)
        - Available Years (in original file): {available_years}
        - Available Quantiles (in original file): {available_quantiles}
        - Loaded Years: {loaded_years}
        - Loaded Quantiles: {loaded_quantiles}

        **Contents (Bunch object):**
        - frame           : Filtered pandas DataFrame based on parameters.
        - feature_names   : List of coordinate column names.
        - target_names    : List of loaded target column names.
        - target          : NumPy array of target values (if loaded).
        - longitude       : NumPy array of longitude values (if loaded).
        - latitude        : NumPy array of latitude values (if loaded).
        - quantile_cols   : Dict mapping requested/loaded quantiles
                            ('q0.1', etc.) to lists of column names.
        - q10_cols        : List of loaded Q10 column names.
        - q50_cols        : List of loaded Q50 column names.
        - q90_cols        : List of loaded Q90 column names.
        - years_available : List of all years detected in original columns.
        - quantiles_available: List of all quantiles detected.
        - n_periods       : Number of periods with quantile data.
        - start_year      : Starting year for period columns.
        - DESCR           : This description.

        This dataset is suitable for demonstrating uncertainty plots like
        plot_model_drift, plot_uncertainty_drift, plot_coverage_diagnostic,
        plot_anomaly_magnitude (using target cols), etc.
        """
    )

    # ``requested_years`` is a set, so its iteration order is arbitrary;
    # take the minimum to report the earliest loaded year ("" when empty).
    start_year = min(requested_years) if requested_years else ""

    bunch_dict = {
        "frame": df_subset,
        "feature_names": feature_names,
        "target_names": target_names,
        "target": target_array,
        "quantile_cols": q_cols_found,
        "q10_cols": q_cols_found.get("q0.1", []),
        "q50_cols": q_cols_found.get("q0.5", []),
        "q90_cols": q_cols_found.get("q0.9", []),
        "years_available": available_years,
        "quantiles_available": available_quantiles,
        "start_year": start_year,
        "n_periods": len(requested_years),
        "DESCR": descr,
    }
    # Add coordinates as top-level attributes if included
    if include_coords:
        if "longitude" in df_subset:
            bunch_dict["longitude"] = df_subset["longitude"].values
        if "latitude" in df_subset:
            bunch_dict["latitude"] = df_subset["latitude"].values

    return Bunch(**bunch_dict)
# The docstring is attached after the definition so the raw string can hold
# the reST/LaTeX markup without escaping.
load_zhongshan_subsidence.__doc__ = r"""
Load the Zhongshan land subsidence prediction dataset.

This dataset contains sample multi-period quantile predictions
(Q10, Q50, Q90 for 2022–2026) and simulated actual subsidence for
2022 and 2023, along with geographic coordinates for 898 locations
in Zhongshan, China. It is intended for demonstrating and testing
`k-diagram`'s uncertainty and evaluation plots and for reproducing
examples related to spatiotemporal uncertainty diagnostics
:footcite:p:`Liu2024, kouadiob2025`.

The function searches a local cache directory, bundled package
resources, and optionally a remote repository (in that order). On
success it returns either a pandas ``DataFrame`` or a
:class:`~kdiagram.bunch.Bunch` with convenient attributes.

Parameters
----------
as_frame : bool, default=False
    If ``False``, return a :class:`~kdiagram.bunch.Bunch` that
    includes the filtered ``DataFrame`` plus metadata and sliced
    arrays (e.g., coordinates, target, and quantile columns).
    If ``True``, return only the filtered ``DataFrame``.

years : list of int, optional
    Subset to these calendar years (e.g., ``[2023, 2025]``) when
    selecting target and quantile columns. If ``None``, load all
    years found in the file (quantiles typically 2022–2026;
    targets typically 2022/2023).

quantiles : list of float, optional
    Subset to these quantile levels in ``[0, 1]`` (e.g.,
    ``[0.1, 0.9]``). If ``None`` (the default), the levels
    ``[0.1, 0.5, 0.9]`` are used.

include_coords : bool, default=True
    If ``True``, include coordinate columns ``'longitude'`` and
    ``'latitude'`` when present.

include_target : bool, default=True
    If ``True``, include base target columns (e.g.,
    ``'subsidence_2022'``, ``'subsidence_2023'``) when present and
    consistent with the requested ``years``.

data_home : str, optional
    Directory path for caching datasets. If ``None``, the path is
    resolved by :func:`~kdiagram.datasets._property.get_data`.
    You may also configure the root via the ``KDIAGRAM_DATA``
    environment variable. Example default is ``~/kdiagram_data``.

download_if_missing : bool, default=True
    If ``True``, attempt to download the dataset into the cache
    when it is not found locally nor in package resources.

force_download : bool, default=False
    If ``True``, attempt to fetch a fresh copy even if a local file
    exists. Useful to refresh data during development. Has no
    effect (other than a warning) when ``download_if_missing`` is
    ``False``.

Returns
-------
data : :class:`~kdiagram.bunch.Bunch` or pandas.DataFrame
    If ``as_frame=False`` (default) a Bunch with:

    - ``frame`` : pandas ``DataFrame`` filtered by the request.
    - ``feature_names`` : list of included coordinate column names.
    - ``target_names`` : list of included target column names.
    - ``target`` : NumPy array of target values (or ``None``).
    - ``longitude``, ``latitude`` : NumPy arrays when coordinates
      are included.
    - ``quantile_cols`` : dict mapping keys like ``'q0.1'`` to
      lists of matching column names.
    - ``q10_cols``, ``q50_cols``, ``q90_cols`` : convenience lists.
    - ``years_available``, ``quantiles_available`` : lists detected
      in the original file.
    - ``start_year`` : smallest year in the loaded subset (if any).
    - ``n_periods`` : number of loaded years.
    - ``DESCR`` : human-readable dataset description.

    If ``as_frame=True``, only the filtered pandas ``DataFrame`` is
    returned.

Raises
------
FileNotFoundError
    When the dataset cannot be resolved from cache or package
    resources and either downloading is disabled or the download
    fails.
OSError
    When the resolved file exists but cannot be read as CSV.

Warns
-----
UserWarning
    If some requested ``years`` or ``quantiles`` are not present in
    the data file. The unavailable values are reported and dropped;
    loading continues with the remaining ones.

Notes
-----
**Search order.** The loader resolves a file path using the
following order: (1) local cache under ``data_home``; (2) installed
package resources; (3) optional remote download when
``download_if_missing=True``. You can force step (3) with
``force_download=True``.

**Column detection.** Quantile columns encode a year :math:`y` and
a quantile level :math:`q` in their names.

.. math::

   \text{quantile name} \;\equiv\;
   \texttt{<prefix>}\_{y}\_\texttt{q}q,
   \qquad y \in \{2022,\dots,2026\},\; q \in (0,1)

Target columns encode only the year :math:`y`:

.. math::

   \text{target name} \;\equiv\; \texttt{subsidence}\_{y}

In code, the implementation detects these with the following
regular expressions (kept as literals, not math):
``r"_(\d{4})_q([0-9.]+)$"`` (quantile columns) and
``r"_(\d{4})$"`` (target columns). This design enables flexible
subsetting by year and quantile without hard-coding headers.

**Coordinate handling.** When present and ``include_coords=True``,
the columns ``'longitude'`` and ``'latitude'`` are included and
exposed both in the returned frame and as top-level arrays in the
Bunch for convenience.

**Intended use.** The dataset is a compact sample designed for
tutorials, documentation figures, and regression tests of k-diagram
uncertainty diagnostics :footcite:p:`kouadiob2025`. It is not a
comprehensive research release.

See Also
--------
load_uncertainty_data
    Generate a synthetic dataset with controllable anomalies and
    quantiles for testing visual diagnostics.
kdiagram.plot.uncertainty.plot_model_drift
kdiagram.plot.uncertainty.plot_uncertainty_drift
kdiagram.plot.uncertainty.plot_coverage_diagnostic
kdiagram.plot.uncertainty.plot_anomaly_magnitude
    Example consumers of this dataset in documentation figures.

Examples
--------
Basic usage returning a Bunch with metadata:

>>> from kdiagram.datasets import load_zhongshan_subsidence
>>> ds = load_zhongshan_subsidence()
>>> isinstance(ds.frame, type(__import__('pandas').DataFrame()))
True
>>> list(ds.quantile_cols.keys())[:3]
['q0.1', 'q0.5', 'q0.9']
>>>
>>> # Return only the DataFrame and subset to selected years/quantiles:
>>>
>>> df = load_zhongshan_subsidence(
...     as_frame=True, years=[2023, 2025], quantiles=[0.1, 0.9]
... )
>>> set(c.split('_')[-1] for c in df.columns if '_q' in c) <= {'q0.1','q0.9'}
True
>>>
>>> # Force a fresh download into the cache:
>>> _ = load_zhongshan_subsidence(force_download=True)

References
----------
.. footbibliography::
"""
def load_uncertainty_data(
    *,
    as_frame: bool = False,
    n_samples: int = 150,
    n_periods: int = 4,
    anomaly_frac: float = 0.15,
    start_year: int = 2022,
    prefix: str = "value",
    base_value: float = 10.0,
    trend_strength: float = 1.5,
    noise_level: float = 2.0,
    interval_width_base: float = 4.0,
    interval_width_noise: float = 1.5,
    interval_width_trend: float = 0.5,
    seed: int | None = 42,
) -> Bunch | pd.DataFrame:
    # Build a synthetic frame of per-location features, one "actual" column
    # and Q10/Q50/Q90 columns for each period, then return it either bare or
    # wrapped in a Bunch. The user-facing docstring is attached to
    # ``__doc__`` right after this definition.
    #
    # The order of the RNG draws below determines the generated values for
    # a given seed, so it must not be rearranged.
    rng = np.random.default_rng(seed)  # seed=None gives a fresh RNG state

    # --- Static per-location features ---
    lon = rng.uniform(-120, -115, n_samples)
    lat = rng.uniform(33, 36, n_samples)
    elev = rng.uniform(50, 500, n_samples) + lat * 5

    # --- Underlying signal and first-period actuals ---
    wave = np.sin(np.linspace(0, 3 * np.pi, n_samples)) * 5
    signal = base_value + wave + rng.normal(0, noise_level / 2, n_samples)

    actual_col_name = f"{prefix}_actual"
    columns = {
        "location_id": np.arange(n_samples),
        "longitude": lon,
        "latitude": lat,
        "elevation": elev,
        actual_col_name: signal + rng.normal(0, noise_level / 2, n_samples),
    }

    # --- Quantile columns, one triple per period ---
    level_tags = ("q0.1", "q0.5", "q0.9")
    quantile_cols_dict = {tag: [] for tag in level_tags}
    for step in range(n_periods):
        lo_name, mid_name, hi_name = (
            f"{prefix}_{start_year + step}_{tag}" for tag in level_tags
        )
        quantile_cols_dict["q0.1"].append(lo_name)
        quantile_cols_dict["q0.5"].append(mid_name)
        quantile_cols_dict["q0.9"].append(hi_name)

        median = (
            signal
            + trend_strength * step
            + rng.normal(0, noise_level / 3, n_samples)
        )
        width = (
            interval_width_base
            + interval_width_trend * step
            + rng.uniform(
                -interval_width_noise / 2,
                interval_width_noise / 2,
                n_samples,
            )
        )
        # Keep every interval strictly positive in width.
        half_width = np.maximum(0.1, width) / 2

        columns[lo_name] = median - half_width
        columns[mid_name] = median
        columns[hi_name] = median + half_width

    # Independent copies so the convenience lists and the dict do not alias.
    all_q10_cols = list(quantile_cols_dict["q0.1"])
    all_q50_cols = list(quantile_cols_dict["q0.5"])
    all_q90_cols = list(quantile_cols_dict["q0.9"])

    df = pd.DataFrame(columns)

    # --- Inject anomalies relative to the first period's interval ---
    n_anomalies = 0
    if anomaly_frac > 0 and n_samples > 0:
        n_anomalies = int(anomaly_frac * n_samples)
    if n_anomalies > 0 and all_q10_cols and all_q90_cols:
        picked = rng.choice(n_samples, size=n_anomalies, replace=False)
        split_at = n_anomalies // 2
        below_idx = picked[:split_at]
        above_idx = picked[split_at:]
        push = interval_width_base / 2 + 1

        lower_bound = df[all_q10_cols[0]].iloc[below_idx]
        upper_bound = df[all_q90_cols[0]].iloc[above_idx]

        # First part of the picked rows goes below Q10, the rest above Q90.
        df.loc[below_idx, actual_col_name] = (
            lower_bound - rng.uniform(0.5, 3.0, size=len(below_idx)) * push
        )
        df.loc[above_idx, actual_col_name] = (
            upper_bound + rng.uniform(0.5, 3.0, size=len(above_idx)) * push
        )

    # --- Final column order: features, target, then per-period triples ---
    feature_names = ["location_id", "longitude", "latitude", "elevation"]
    target_names = [actual_col_name]
    interleaved = [
        name
        for step in range(n_periods)
        for name in (
            all_q10_cols[step],
            all_q50_cols[step],
            all_q90_cols[step],
        )
    ]
    df = df[feature_names + target_names + interleaved]

    if as_frame:
        return df

    descr = textwrap.dedent(
        f"""\
        Synthetic Multi-Period Uncertainty Dataset for k-diagram

        **Description:**
        Generates synthetic data simulating quantile forecasts (Q10,
        Q50, Q90) for '{prefix}' over {n_periods} periods starting
        from {start_year} across {n_samples} samples/locations. Includes
        spatial coordinates, an 'elevation' feature, and an 'actual'
        value (``{actual_col_name}``) for the first period.
        Anomalies (actual values outside the first period's Q10-Q90
        interval) are introduced for ~{anomaly_frac * 100:.0f}% of samples.
        Both the median (Q50) and the interval width can exhibit
        configurable trends and noise.

        **Generation Parameters:**
        - n_samples            : {n_samples}
        - n_periods            : {n_periods}
        - start_year           : {start_year}
        - prefix               : '{prefix}'
        - anomaly_frac         : {anomaly_frac:.2f}
        - base_value           : {base_value:.2f}
        - trend_strength       : {trend_strength:.2f}
        - noise_level          : {noise_level:.2f}
        - interval_width_base  : {interval_width_base:.2f}
        - interval_width_noise : {interval_width_noise:.2f}
        - interval_width_trend : {interval_width_trend:.2f}
        - seed                 : {seed}

        **Bunch Attributes:**
        - frame           : Complete pandas DataFrame.
        - feature_names   : List of coordinate/feature column names.
        - target_names    : List containing the target column name.
        - target          : NumPy array of target values.
        - quantile_cols   : Dict mapping quantiles to column name lists.
        - q10_cols        : List of Q10 column names.
        - q50_cols        : List of Q50 column names.
        - q90_cols        : List of Q90 column names.
        - n_periods       : Number of periods with quantile data.
        - prefix          : Prefix used for value/quantile columns.
        - start_year      : Starting year for period columns.
        - DESCR           : This description.
        """
    )

    return Bunch(
        frame=df,
        feature_names=feature_names,
        target_names=target_names,
        target=df[target_names[0]].values,
        quantile_cols=quantile_cols_dict,
        q10_cols=all_q10_cols,
        q50_cols=all_q50_cols,
        q90_cols=all_q90_cols,
        n_periods=n_periods,
        prefix=prefix,
        start_year=start_year,
        DESCR=descr,
    )
# The docstring is attached after the definition so the raw string can hold
# the reST markup without escaping.
load_uncertainty_data.__doc__ = r"""
Generate a synthetic dataset for uncertainty diagnostics.

Creates a compact, controllable dataset for demonstrating
`k-diagram` plots: one period of actuals, multi-period predicted
quantiles (Q10, Q50, Q90), configurable trends and noise, injected
interval failures (anomalies), and synthetic coordinates. Useful
for examples, unit tests, and performance checks
:footcite:p:`kouadiob2025`.

Parameters
----------
as_frame : bool, default=False
    If ``False``, return a :class:`~kdiagram.bunch.Bunch` with the
    generated frame and metadata; if ``True``, return only the
    pandas ``DataFrame``.

n_samples : int, default=150
    Number of rows (locations) to generate.

n_periods : int, default=4
    Number of consecutive periods (e.g., years) for which quantiles
    are generated.

anomaly_frac : float, default=0.15
    Approximate fraction in ``[0, 1]`` where the actual value lies
    outside the first period’s Q10–Q90 interval. The number of
    injected anomalies is ``int(anomaly_frac * n_samples)``.

start_year : int, default=2022
    Starting year used when naming time-dependent columns.

prefix : str, default='value'
    Base prefix for value and quantile column names.

base_value : float, default=10.0
    Approximate mean of the signal in the first period.

trend_strength : float, default=1.5
    Linear trend added to the Q50 trajectory across periods.

noise_level : float, default=2.0
    Scale of the Gaussian noise added to values. The base signal
    and the actuals are drawn with standard deviation
    ``noise_level / 2``; each period's Q50 uses
    ``noise_level / 3``.

interval_width_base : float, default=4.0
    Base width of the Q10–Q90 interval in the first period.

interval_width_noise : float, default=1.5
    Random variability added to the interval width per
    sample/period.

interval_width_trend : float, default=0.5
    Linear trend added to the interval width across periods.

seed : int or None, default=42
    Random seed for reproducibility. If ``None``, use an
    unconstrained RNG state.

Returns
-------
data : :class:`~kdiagram.bunch.Bunch` or pandas.DataFrame
    If ``as_frame=False`` (default) a Bunch with:

    - ``frame`` : pandas ``DataFrame`` of synthesized values.
    - ``feature_names`` : included feature columns
      (``location_id``, coordinates and ``elevation``).
    - ``target_names`` : names of actual/target columns.
    - ``target`` : NumPy array of target values.
    - ``quantile_cols`` : dict mapping ``'q0.1'``, ``'q0.5'``,
      ``'q0.9'`` to lists of columns across periods.
    - ``q10_cols``, ``q50_cols``, ``q90_cols`` : convenience lists.
    - ``n_periods``, ``prefix``, ``start_year``, and ``DESCR``.

    If ``as_frame=True``, return only the ``DataFrame``.

Notes
-----
The generator injects a user-controlled fraction of interval
failures to create meaningful examples for coverage and anomaly
diagnostics. Use this dataset to exercise plots such as coverage
rates, point-wise coverage diagnostics, anomaly magnitude, temporal
consistency, and drift views
:footcite:p:`Jolliffe2012, Gneiting2007b, kouadiob2025`.

See Also
--------
load_zhongshan_subsidence
    Real-world sample for Zhongshan subsidence with quantiles and
    coordinates.
kdiagram.plot.uncertainty.plot_coverage
kdiagram.plot.uncertainty.plot_coverage_diagnostic
kdiagram.plot.uncertainty.plot_anomaly_magnitude
kdiagram.plot.uncertainty.plot_interval_consistency
kdiagram.plot.uncertainty.plot_model_drift
    Visual diagnostics this dataset was designed to support.

Examples
--------
>>> # Create a small dataset and explore quantile columns:
>>>
>>> from kdiagram.datasets import load_uncertainty_data
>>> ds = load_uncertainty_data(n_samples=10, n_periods=3, seed=0)
>>> sorted(ds.quantile_cols.keys())
['q0.1', 'q0.5', 'q0.9']
>>>
>>> # Return a ``DataFrame`` only:
>>>
>>> df = load_uncertainty_data(as_frame=True, n_samples=5, seed=1)
>>> df.shape[0] == 5
True

References
----------
.. footbibliography::
"""