Source code for kdiagram.plot.feature_based

# License: Apache 2.0
# Author: LKouadio <etanoyau@gmail.com>

from __future__ import annotations

import warnings
from typing import Any

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from ..compat.matplotlib import get_cmap
from ..decorators import check_non_emptiness, isdf
from ..utils.handlers import columns_manager
from ..utils.plot import set_axis_grid
from ..utils.validator import ensure_2d, exist_features

__all__ = ["plot_feature_fingerprint", "plot_feature_interaction"]


[docs] @check_non_emptiness(params=["df"]) @isdf def plot_feature_interaction( df: pd.DataFrame, theta_col: str, r_col: str, color_col: str, *, statistic: str = "mean", theta_period: float | None = None, theta_bins: int = 24, r_bins: int = 10, title: str | None = None, figsize: tuple[float, float] = (8, 8), cmap: str = "viridis", show_grid: bool = True, grid_props: dict[str, Any] | None = None, mask_radius: bool = False, savefig: str | None = None, dpi: int = 300, ): # --- Input Validation --- required_cols = [theta_col, r_col, color_col] exist_features(df, features=required_cols) data = df[required_cols].dropna().copy() if data.empty: warnings.warn( "DataFrame is empty after dropping NaNs.", UserWarning, stacklevel=2, ) return None # --- Binning and Aggregation --- if theta_period: data["theta_rad"] = ( ((data[theta_col] % theta_period) / theta_period) * 2 * np.pi ) else: min_theta, max_theta = data[theta_col].min(), data[theta_col].max() if (max_theta - min_theta) > 1e-9: data["theta_rad"] = ( ((data[theta_col] - min_theta) / (max_theta - min_theta)) * 2 * np.pi ) else: data["theta_rad"] = 0 data["theta_bin"] = pd.cut(data["theta_rad"], bins=theta_bins) data["r_bin"] = pd.cut(data[r_col], bins=r_bins) # Group by both bins and calculate the statistic heatmap_data = ( data.groupby(["theta_bin", "r_bin"], observed=False)[color_col] .agg(statistic) .unstack() ) # Get bin edges for plotting theta_edges = np.linspace(0, 2 * np.pi, theta_bins + 1) r_edges = np.linspace(data[r_col].min(), data[r_col].max(), r_bins + 1) # --- Plotting --- fig, ax = plt.subplots( figsize=figsize, subplot_kw={"projection": "polar"} ) T, R = np.meshgrid(theta_edges, r_edges) # Note: pcolormesh expects C with shape (num_r_bins, num_theta_bins) # The unstacked heatmap_data has shape (num_theta_bins, num_r_bins) ax.grid(False) # New version set to False explicitly pcm = ax.pcolormesh( T, R, heatmap_data.T.values, cmap=cmap, shading="auto" ) # --- Formatting --- cbar = fig.colorbar(pcm, ax=ax, pad=0.1, shrink=0.75) cbar.set_label(f"{statistic.capitalize()} of {color_col}", fontsize=10) ax.set_title( title or f"Interaction between {theta_col} and {r_col}", fontsize=14, y=1.1, ) ax.set_xlabel(theta_col) ax.set_ylabel(r_col, labelpad=25) set_axis_grid(ax, show_grid=show_grid, grid_props=grid_props) if mask_radius: # Hide radial tick labels (often preferred for normalized data) ax.set_yticklabels([]) plt.tight_layout() if savefig: plt.savefig(savefig, dpi=dpi, bbox_inches="tight") plt.close(fig) else: plt.show() return ax
plot_feature_interaction.__doc__ = r""" Plots a polar heatmap of feature interactions. This function visualizes how a target variable (``color_col``) changes based on the interaction between two features, one mapped to the angle and one to the radius. It is a powerful tool for discovering non-linear relationships and conditional patterns in the data. - The **angular position (θ)** represents the binned values of the first feature (``theta_col``). - The **radial distance (r)** represents the binned values of the second feature (``r_col``). - The **color** of each polar sector represents the aggregated value (e.g., mean) of the target variable (``color_col``) for all data points that fall into that specific bin. This plot is useful for identifying "hot spots" where a particular combination of feature values leads to a specific outcome, revealing complex interactions that are not visible from one-dimensional feature importance plots. Parameters ---------- df : pd.DataFrame The input DataFrame containing the feature and target data. theta_col : str The name of the feature to be mapped to the angular axis. This is often a cyclical feature like "month" or "hour". r_col : str The name of the feature to be mapped to the radial axis. color_col : str The name of the target column whose value will be represented by the color in each bin. statistic : str, default='mean' The aggregation function to apply to ``color_col`` within each bin (e.g., 'mean', 'median', 'std'). theta_period : float, optional The period of the cyclical data in ``theta_col`` (e.g., 24 for hours, 12 for months). This ensures the data wraps correctly around the polar plot. theta_bins : int, default=24 The number of bins to create for the angular feature. r_bins : int, default=10 The number of bins to create for the radial feature. title : str, optional The title for the plot. If ``None``, a default is generated. figsize : tuple of (float, float), default=(8, 8) The figure size in inches. cmap : str, default='viridis' The colormap for the heatmap. show_grid : bool, default=True Toggle the visibility of the polar grid lines. grid_props : dict, optional Custom keyword arguments passed to the grid for styling. mask_radius : bool, default=False If ``True``, hide the radial tick labels. savefig : str, optional The file path to save the plot. If ``None``, the plot is displayed interactively. dpi : int, default=300 The resolution (dots per inch) for the saved figure. Returns ------- ax : matplotlib.axes.Axes The Matplotlib Axes object containing the plot. Raises ------ ValueError If any of the specified columns are not found in the DataFrame. See Also -------- pandas.cut : Bin values into discrete intervals. pandas.DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns. matplotlib.pyplot.pcolormesh : Create a pseudocolor plot with a non-regular rectangular grid. Notes ----- This plot is a novel visualization method developed as part of the analytics framework in :footcite:p:`kouadiob2025`. The heatmap is constructed by first binning the 2D polar space defined by ``theta_col`` and ``r_col``. For each resulting polar sector, the specified ``statistic`` (e.g., mean) is calculated for all data points whose feature values fall within that sector. The resulting aggregate value is then mapped to a color, creating the heatmap effect. **Coordinate Mapping and Binning**: 1. The angular data from ``theta_col``, :math:`\theta_{data}`, is converted to radians :math:`[0, 2\pi]`. If a period :math:`P` is given, the mapping is: .. math:: :label: eq:theta_mapping \theta_{rad} = \left( \frac{\theta_{data} \pmod P}{P} \right) \cdot 2\pi 2. The data space is then divided into a grid of :math:`K_r \times K_{\theta}` bins, where :math:`K_r` is ``r_bins`` and :math:`K_{\theta}` is ``theta_bins``. 3. For each bin :math:`B_{ij}`, the aggregate value :math:`C_{ij}` is computed from the target column ``color_col`` (:math:`z`): .. math:: :label: eq:bin_aggregation C_{ij} = \text{statistic}(\{z_k \mid (r_k, \theta_k) \in B_{ij}\}) Examples -------- >>> import numpy as np >>> import pandas as pd >>> from kdiagram.plot.feature_based import plot_feature_interaction >>> >>> # Simulate solar panel output data >>> np.random.seed(0) >>> n_points = 5000 >>> hour = np.random.uniform(0, 24, n_points) >>> cloud = np.random.rand(n_points) >>> >>> # Output depends on the interaction of daylight and cloud cover >>> daylight = np.sin(hour * np.pi / 24)**2 >>> cloud_factor = (1 - cloud**0.5) >>> output = 100 * daylight * cloud_factor + np.random.rand(n_points) * 5 >>> output[(hour < 6) | (hour > 18)] = 0 # No output at night >>> >>> df_solar = pd.DataFrame({ ... 'hour_of_day': hour, ... 'cloud_cover': cloud, ... 'panel_output': output ... }) >>> >>> # Generate the plot >>> ax = plot_feature_interaction( ... df=df_solar, ... theta_col='hour_of_day', ... r_col='cloud_cover', ... color_col='panel_output', ... theta_period=24, ... theta_bins=24, ... r_bins=8, ... cmap='inferno', ... title='Solar Panel Output by Hour and Cloud Cover' ... ) References ---------- .. footbibliography:: """
[docs] @check_non_emptiness(params=["importances"]) def plot_feature_fingerprint( importances, features: list[str] | None = None, labels: list[str] | None = None, normalize: bool = True, fill: bool = True, cmap: str | list[Any] = "tab10", title: str = "Feature Impact Fingerprint", figsize: tuple[float, float] | None = None, show_grid: bool = True, savefig: str | None = None, ): # Ensure importances is a 2D NumPy array importance_matrix = ensure_2d(importances) n_layers, n_features_data = importance_matrix.shape # Manage feature names if features is None: # Generate default feature names if none provided features_list = [f"feature {i+1}" for i in range(n_features_data)] else: # Ensure features is a list and handle potential discrepancies features_list = columns_manager(features, empty_as_none=False) # If user provided fewer feature names than data columns, append # generic names if len(features_list) < n_features_data: features_list.extend( [ f"feature {ix + 1}" for ix in range(len(features_list), n_features_data) ] ) # Truncate if user provided more names than needed (optional, # could also raise error) elif len(features_list) > n_features_data: warnings.warn( f"More feature names ({len(features_list)}) provided " f"than data columns ({n_features_data}). " "Extra names ignored.", UserWarning, stacklevel=2, ) features_list = features_list[:n_features_data] n_features = len(features_list) # Final number of features used # Manage labels if labels is None: # Generate default layer labels if none provided labels_list = [f"Layer {idx+1}" for idx in range(n_layers)] else: labels_list = list(labels) # Ensure it's a list # Check label count consistency if len(labels_list) < n_layers: warnings.warn( f"Fewer labels ({len(labels_list)}) provided than " f"layers ({n_layers}). Using generic names for the rest.", UserWarning, stacklevel=2, ) labels_list.extend( [ f"Layer {ix + 1}" for ix in range(len(labels_list), n_layers) ] ) elif len(labels_list) > n_layers: warnings.warn( f"More labels ({len(labels_list)}) provided than " f"layers ({n_layers}). Extra labels ignored.", UserWarning, stacklevel=2, ) labels_list = labels_list[:n_layers] # --- Normalization (if requested) --- if normalize: # Calculate max per row (layer), keep dimensions for broadcasting # max_per_row shape: (n_layers, 1), e.g., (3, 1) importance_matrix = ( importance_matrix.values if isinstance(importance_matrix, pd.DataFrame) else importance_matrix ) max_per_row = importance_matrix.max(axis=1, keepdims=True) # Create a mask for rows with max_val > 0 (where normalization is safe) # valid_max_mask shape: (n_layers, 1), e.g., (3, 1) valid_max_mask = max_per_row > 1e-9 # Initialize normalized matrix normalized_matrix = np.zeros_like(importance_matrix, dtype=float) # Get boolean index for valid rows, shape (n_layers,) e.g., (3,) valid_rows_indices = valid_max_mask[:, 0] # Proceed only if there are any rows to normalize if np.any(valid_rows_indices): # Select the rows from the original matrix that need normalization # Shape: (n_valid_rows, n_features), e.g., (3, 6) rows_to_normalize = importance_matrix[valid_rows_indices] # Select the corresponding max values for these rows # Since max_per_row is (n_layers, 1) and valid_rows_indices is (n_layers,), # this indexing correctly results in shape (n_valid_rows, 1), e.g., (3, 1) max_values_for_valid_rows = max_per_row[valid_rows_indices] # Perform the division using broadcasting: (MxN / Mx1 works) normalized_rows = rows_to_normalize / max_values_for_valid_rows # Place the normalized rows back into the result matrix normalized_matrix[valid_rows_indices] = normalized_rows # Rows where max_val <= 0 remain zero (already initialized) # Update importance_matrix with normalized values importance_matrix = normalized_matrix # --- Angle Calculation for Radar Axes --- # Calculate evenly spaced angles for each feature axis angles = np.linspace(0, 2 * np.pi, n_features, endpoint=False).tolist() # Add the first angle to the end to close the loop for plotting angles_closed = angles + angles[:1] # --- Plotting Setup --- fig, ax = plt.subplots(figsize=figsize, subplot_kw=dict(polar=True)) # Get colors from specified colormap or list try: cmap_obj = get_cmap(cmap, default="tab10", failsafe="discrete") # Sample colors if it's a standard Matplotlib cmap colors = [cmap_obj(i / n_layers) for i in range(n_layers)] except ValueError: # Handle case where cmap might be a list of colors if isinstance(cmap, list): colors = cmap if len(colors) < n_layers: warnings.warn( f"Provided color list has fewer colors " f"({len(colors)}) than layers ({n_layers}). " f"Colors will repeat.", UserWarning, stacklevel=2, ) else: # Fallback if cmap is invalid string or list warnings.warn( f"Invalid cmap '{cmap}'. Falling back to 'tab10'.", UserWarning, stacklevel=2, ) cmap_obj = get_cmap("tab10", default="tab10", failsafe="discrete") colors = [cmap_obj(i / n_layers) for i in range(n_layers)] # --- Plot Each Layer --- for idx, row in enumerate(importance_matrix): # Get the importance values for the current layer values = row.tolist() # Add the first value to the end to close the loop values_closed = values + values[:1] # Determine the label for the legend label = labels_list[idx] # Determine the color, cycling if necessary color = colors[idx % len(colors)] # Plot the outline ax.plot( angles_closed, values_closed, label=label, color=color, linewidth=2, ) # Fill the area if requested if fill: ax.fill(angles_closed, values_closed, color=color, alpha=0.25) # --- Customize Plot Appearance --- ax.set_title(title, size=16, y=1.1) # Adjust title position # Set feature labels on the angular axes ax.set_xticks(angles) ax.set_xticklabels(features_list, fontsize=11) # Hide radial tick labels (often preferred for normalized data) ax.set_yticklabels([]) # Set radial limits (optional, e.g., enforce 0 start) ax.set_ylim(bottom=0) if normalize: # Optionally add a single radial label for the max value (1.0) ax.set_yticks([0.25, 0.5, 0.75, 1.0]) ax.set_yticklabels( ["0.25", "0.50", "0.75", "1.00"], fontsize=9, color="gray" ) # Show grid lines if requested if show_grid: ax.grid(True, linestyle="--", linewidth=0.5, alpha=0.6) else: ax.grid(False) # Add legend, positioned outside the plot area ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1), fontsize=10) # Adjust layout to prevent labels/title overlapping plt.tight_layout(pad=2.0) # --- Save or Show --- if savefig: try: plt.savefig(savefig, bbox_inches="tight", dpi=300) print(f"Plot saved to {savefig}") except Exception as e: print(f"Error saving plot to {savefig}: {e}") else: plt.show() return ax
plot_feature_fingerprint.__doc__ = r""" Create a radar chart visualizing feature-importance profiles. This function draws a polar (radar) chart that compares how the importance of a common set of features varies across multiple groups/layers (e.g., different models, years, or spatial zones). Each group is drawn as a closed polygon, producing an interpretable "fingerprint" of relative influence across features (see also the dataset helper :func:`~kdiagram.datasets.make_fingerprint_data`; concept introduced in :footcite:t:`kouadiob2025`. The angular position encodes the feature index, and the radius encodes its (optionally normalized) importance value. Normalization allows shape-only comparison across layers, independent of absolute scale. Parameters ---------- importances : array-like of shape (n_layers, n_features) The importance matrix. Each row corresponds to one layer/group and each column to a feature. Accepts a list of lists, a NumPy array, or a pandas DataFrame. features : list of str, optional Names of the features (length must match the number of columns in ``importances``). If ``None``, generic names ``['feature 1', ..., 'feature N']`` are generated. labels : list of str, optional Display names for layers (length should match ``n_layers``). If ``None``, generic names ``['Layer 1', ..., 'Layer M']`` are generated. When counts mismatch, the function pads/truncates and issues a warning. normalize : bool, default=True If ``True``, normalize each row to the unit interval via :math:`r'_{ij} = r_{ij}/\max_k r_{ik}` (safe-dividing by zero yields zeros). This highlights *shape* differences across layers. If ``False``, raw magnitudes are plotted. fill : bool, default=True If ``True``, fill each polygon with a translucent color; otherwise draw outlines only. cmap : str or list, default='tab10' Either a Matplotlib colormap name (e.g., ``'viridis'``, ``'plasma'``, ``'tab10'``) or an explicit list of colors. Lists shorter than the number of layers will cycle with a warning. title : str, default='Feature Impact Fingerprint' Figure title. figsize : tuple of (float, float), optional Figure size in inches. If ``None``, a sensible default is used. show_grid : bool, default=True Whether to show polar grid lines. savefig : str, optional Path to save the figure (e.g., ``'fingerprint.png'``). If omitted, the plot is shown interactively. Returns ------- ax : matplotlib.axes.Axes The polar axes containing the radar chart (useful for further customization). Notes ----- **Angular encoding.** With :math:`N` features, angular positions are equally spaced as: .. math:: \theta_j \;=\; \frac{2\pi j}{N}, \qquad j = 0, \dots, N-1. **Closing polygons.** To draw closed fingerprints, the first vertex :math:`(\theta_0, r_{i0})` is appended again at :math:`2\pi` for each layer :math:`i`. **Row-wise normalization (default).** If ``normalize=True``, each row :math:`\mathbf r_i=(r_{i0},\dots,r_{i,N-1})` is scaled to its maximum: .. math:: r'_{ij} \;=\; \begin{cases} \dfrac{r_{ij}}{\max_k r_{ik}}, & \max_k r_{ik} > 0,\\[6pt] 0, & \text{otherwise,} \end{cases} which emphasizes *shape* differences between layers but removes absolute magnitude information. Set ``normalize=False`` to compare magnitudes. **Alternative min–max scaling (pre-processing).** If you prefer values distributed over :math:`[0,1]` using the local range, apply this transformation per row before calling the function as: .. math:: r''_{ij} \;=\; \frac{r_{ij} - \min_k r_{ik}} {\max_k r_{ik} - \min_k r_{ik} + \varepsilon}, with a small :math:`\varepsilon>0` to avoid division by zero. **Data assumptions.** Importance values are expected to be non-negative. Rows with a non-positive maximum (all zeros or all negative) become zeros under the default normalization. If your data can be negative, either: (1) set ``normalize=False`` and choose appropriate radial limits, or (2) shift/scale to non-negative values (e.g., min–max per row). **Missing/invalid values.** ``NaN`` or ``inf`` entries propagate to the plot and may render gaps. Clean data beforehand, e.g.: .. code-block:: python import numpy as np X = np.asarray(importances, float) X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0) **Radial limits and ticks.** The plot enforces a non-negative radius (``ax.set_ylim(bottom=0)``). For unnormalized data, you may set a custom maximum: .. code-block:: python ax.set_rmax( np.nanmax(importances) ) Optionally add/readjust radial ticks for readability: .. code-block:: python ax.set_yticks([0.25, 0.5, 0.75, 1.0]) ax.set_yticklabels(["0.25", "0.50", "0.75", "1.00"]) **Feature order matters.** The perceived shape depends on feature ordering around the circle. Keep a consistent, meaningful order across comparisons (e.g., domain grouping or sorted by average importance). **Many features or layers.** With large :math:`N`, tick labels can overlap. Consider thinning labels or rotating them: .. code-block:: python angles = ax.get_xticks() ax.set_xticks(angles[::2]) ax.set_xticklabels([lbl for i, lbl in enumerate(features) if i % 2 == 0], rotation=25, ha="right") For many layers, prefer a discrete colormap and a multi-column legend or move it outside: .. code-block:: python ax.legend(loc="upper left", bbox_to_anchor=(1.02, 1.0), ncol=2) **Color and accessibility.** Use colorblind-friendly palettes (e.g., ``'tab10'``, ``'tab20'``) or pass an explicit color list. Avoid relying on color alone when printing in grayscale—consider distinct linestyles. **Complexity.** Runtime and memory scale as :math:`\mathcal O(MN)` for :math:`M` layers and :math:`N` features. For very large inputs, down-select features or layers for clarity. **Utilities.** Inputs are coerced to a numeric 2D array and feature names managed via lightweight helpers (e.g., ``ensure_2d``, ``columns_manager``). Name count mismatches are padded/truncated with a warning rather than raising. See Also -------- kdiagram.datasets.make_fingerprint_data : Generate a synthetic importance matrix suitable for this plot. kdiagram.plot.relationship.plot_relationship : Polar scatter for true–predicted relationships. matplotlib.pyplot.polar : Underlying polar plotting primitives. Examples -------- Generate random importances and plot with normalization and fills. >>> import numpy as np >>> from kdiagram.plot.feature_based import plot_feature_fingerprint >>> rng = np.random.default_rng(42) >>> imp = rng.random((3, 6)) # 3 layers, 6 features >>> feats = [f'Feature {i+1}' for i in range(6)] >>> labels = ['Model A', 'Model B', 'Model C'] >>> ax = plot_feature_fingerprint( ... importances=imp, ... features=feats, ... labels=labels, ... title='Random Feature Importance Comparison', ... cmap='Set3', ... normalize=True, ... fill=True ... ) Year-over-year weights without normalization. >>> features = ['rainfall', 'GWL', 'seismic', 'density', 'geo'] >>> weights = [ ... [0.2, 0.4, 0.1, 0.6, 0.3], # 2023 ... [0.3, 0.5, 0.2, 0.4, 0.4], # 2024 ... [0.1, 0.6, 0.2, 0.5, 0.3], # 2025 ... ] >>> years = ['2023', '2024', '2025'] >>> ax = plot_feature_fingerprint( ... importances=weights, ... features=features, ... labels=years, ... title='Feature Influence Over Years', ... cmap='tab10', ... normalize=False ... ) References ---------- .. footbibliography:: """