Source code for kim.data

"""The general data class."""

# Author: Peishi Jiang <shixijps@gmail.com>

import numpy as np

import json
import pickle
from pathlib import Path, PosixPath

from typing import Optional

from .pre_analysis import analyze_interdependency
from .utils import get_scaler

from jaxtyping import Array

    # Attributes
    # ----------
    # xdata (array-like): the copy of xdata
    # ydata (array-like): the copy of ydata
    # Ns (int): the number of samples
    # Nx (int): the number of predictors
    # Ny (int): the number of predictands
    # xscaler_type (str): the type of xdata scaler, either 'minmax', 'normalize', 'standard', or 'log'
    # yscaler_type (str): the type of ydata scaler, either 'minmax', 'normalize', 'standard', or 'log'
    # xscaler (object): the xdata scaler (provides `transform`)
    # yscaler (object): the ydata scaler (provides `transform`)
    # sensitivity_config (dict): the sensitivity analysis configuration
    # sensitivity_done (bool): whether the sensitivity analysis is performed
    # sensitivity (array-like): the calculated sensitivity with shape (Nx, Ny)
    # sensitivity_mask (array-like): the calculated sensitivity mask with shape (Nx, Ny)
    # cond_sensitivity_mask (array-like): the calculated conditional sensitivity mask with shape (Nx, Ny)


[docs]
class Data(object):
    """The Data object.

    Holds the predictor and predictand arrays, their scalers, and the
    configuration and results of the sensitivity analysis performed on them.

    Attributes
    ----------
    xdata : array-like
        the predictors with shape (Ns, Nx); stored by reference, not copied
    ydata : array-like
        the predictands with shape (Ns, Ny); stored by reference, not copied
    Ns : int
        the number of samples
    Nx : int
        the number of predictors
    Ny : int
        the number of predictands
    xscaler_type : str
        the type of xdata scaler, either 'minmax', 'normalize', 'standard', or 'log'
    yscaler_type : str
        the type of ydata scaler, either 'minmax', 'normalize', 'standard', or 'log'
    xscaler : object
        the xdata scaler; its ``transform`` method produces ``xdata_scaled``
    yscaler : object
        the ydata scaler; its ``transform`` method produces ``ydata_scaled``
    sensitivity_config : dict
        the sensitivity analysis configuration
    sensitivity_done : bool
        whether the sensitivity analysis is performed
    sensitivity : array-like
        the calculated sensitivity with shape (Nx, Ny)
    sensitivity_mask : array-like
        the calculated sensitivity mask with shape (Nx, Ny)
    cond_sensitivity_mask : array-like
        the calculated conditional sensitivity mask with shape (Nx, Ny)
    loaded_from_other_sources : bool
        whether the instance state was populated by ``load``

    """


[docs]
    def __init__(self, xdata: Optional[Array]=None, ydata: Optional[Array]=None, 
                 fdata: Optional[PosixPath]=None, xscaler_type: str='', yscaler_type: str=''):
        """Initialization function.

        Args:
            xdata (array-like): the predictors with shape (Ns, Nx)
            fdata (PosixPath): the root path where an existing data instance will be loaded
            ydata (array-like): the predictands with shape (Ns, Ny)
            xscaler_type (str): the type of xdata scaler, either `minmax`, `normalize`, `standard`, `log`, or ``
            yscaler_type (str): the type of ydata scaler, either `minmax`, `normalize`, `standard`, `log`, or ``
        """
        if fdata is not None:
           self.sensitivity_done = True
           self.load(fdata, check_xy=False, overwrite=True) 
        
        elif xdata is None or ydata is None:
            raise Exception("xdata and ydata are not given!")
        
        else:
            # Data array
            self.xdata = xdata
            self.ydata = ydata

            # Data dimensions
            assert xdata.shape[0] == ydata.shape[0], \
                "xdata and ydata must be the same number of samples"
            self.Ns = xdata.shape[0]
            self.Nx = xdata.shape[1]
            self.Ny = ydata.shape[1]

            # Create the transformer of the data
            self.xscaler_type = xscaler_type.lower()
            self.yscaler_type = yscaler_type.lower()
            self.xscaler = get_scaler(self.xdata, self.xscaler_type)
            self.yscaler = get_scaler(self.ydata, self.yscaler_type)

            # Data sensitivity
            self.sensitivity_config = {
                "method": None,
                "metric": None,
                "sst": None,
                "ntest": None,
                "alpha": None,
                "bins": None,
                "k": None,
                "n_jobs": None,
                "seed_shuffle": None,
            }
            self.sensitivity = np.zeros([self.Nx, self.Ny])
            self.sensitivity_mask = np.zeros([self.Nx, self.Ny], dtype='bool')
            self.cond_sensitivity_mask = np.zeros([self.Nx, self.Ny], dtype='bool')
            self.sensitivity_done = False
            self.loaded_from_other_sources = False

    


[docs]
    def calculate_sensitivity(
        self, method: str='gsa', metric: str='it-bins',
        sst: bool=False, ntest: int=100, alpha: float=0.05,
        bins: int=10, k: int=5, n_jobs=-1, seed_shuffle: int=1234,
        verbose: int=0
    ):
        """Run the sensitivity analysis between the scaled predictors and predictands.

        The scaled data (`self.xdata_scaled`, `self.ydata_scaled`) are passed to
        `analyze_interdependency`. The outcome is stored in `self.sensitivity`,
        `self.sensitivity_mask`, and `self.cond_sensitivity_mask`; the arguments
        used (except `verbose`) are recorded in `self.sensitivity_config`, and
        `self.sensitivity_done` is set to True.

        Args:
            method (str): The preliminary analysis method, including:
                `gsa`: the pairwise global sensitivity analysis
                `pc`: a modified PC algorithm that includes a conditional independence test after gsa
                Defaults to `gsa`.
            metric (str): The metric calculating the sensitivity, including:
                `it-bins`: the information-theoretic measures (MI and CMI) using binning approach
                `it-knn`: the information-theoretic measures (MI and CMI) using knn approach
                `corr`: the correlation coefficient
                Defaults to `it-bins`.
            sst (bool): Whether to perform the statistical significance test or the shuffle test. Defaults to False.
            ntest (int): The number of shuffled samples in sst. Defaults to 100.
            alpha (float): The significance level. Defaults to 0.05.
            bins (int): The number of bins for each dimension when metric == "it-bins". Defaults to 10.
            k (int): The number of nearest neighbors when metric == "it-knn". Defaults to 5.
            n_jobs (int): The number of processors/threads used by joblib.Parallel. Defaults to -1.
            seed_shuffle (int): The random seed number for doing shuffle test. Defaults to 1234.
            verbose (int): The verbosity level (0: normal, 1: debug). Defaults to 0.
        """
        config = self.sensitivity_config

        # The analysis runs on the scaled data, not on the raw arrays.
        scaled_x = self.xdata_scaled
        scaled_y = self.ydata_scaled
        outcome = analyze_interdependency(
            scaled_x, scaled_y, method, metric, sst,
            ntest, alpha, bins, k, n_jobs, seed_shuffle, verbose=verbose
        )
        sensitivity, sensitivity_mask, cond_sensitivity_mask = outcome

        # Record the settings in place, only after the analysis has succeeded.
        config.update({
            'method': method,
            'metric': metric,
            'sst': sst,
            'ntest': ntest,
            'alpha': alpha,
            'bins': bins,
            'k': k,
            'n_jobs': n_jobs,
            'seed_shuffle': seed_shuffle,
        })
        self.sensitivity_config = config

        # Publish the analysis result.
        self.sensitivity = sensitivity
        self.sensitivity_mask = sensitivity_mask
        self.cond_sensitivity_mask = cond_sensitivity_mask
        self.sensitivity_done = True

    
    @property
    def xdata_scaled(self):
        """Perform normalization on `self.xdata` based on the given normalization type `self.xscaler_type`.
        
        Returns:
            array-like: the scaled `self.xdata`
        """
        return self.xscaler.transform(self.xdata)

    @property
    def ydata_scaled(self):
        """Perform normalization on `self.ydata` based on the given normalization type `self.yscaler_type`.

        Returns:
            array-like: the scaled `self.ydata`
        """
        return self.yscaler.transform(self.ydata)
    

[docs]
    def save(self, rootpath: PosixPath=Path("./")):
        """Save data and sensitivity analysis results to specified location, including:
            - data (x, y) and scaler
            - sensitivity analysis configuration
            - sensitivity analysis results

        Args:
            rootpath (PosixPath): the root path where data will be saved

        """
        if not self.sensitivity_done:
            raise Exception("Sensitivity analysis is not done yet.")

        if not rootpath.exists():
            rootpath.mkdir(parents=True)

        # xdata and ydata
        f_x, f_y = rootpath / "x.npy", rootpath / "y.npy"
        np.save(f_x, self.xdata)
        np.save(f_y, self.ydata)

        # x and y scalers
        f_scaler = rootpath / "scaler.pkl"
        scaler = {"x": self.xscaler, "y": self.yscaler, 
                  "xtype": self.xscaler_type, "ytype": self.yscaler_type}
        with open(f_scaler, "wb") as f:
            pickle.dump(scaler, f)
        
        # sensitivity configurations
        f_sensitivity_config = rootpath / "sens_configs.json"
        with open(f_sensitivity_config, "w") as f:
            json.dump(self.sensitivity_config, f)

        # sensitivity results
        f_s = rootpath / "sensitivity.npy"
        f_mask = rootpath / "sensitivity_mask.npy"
        f_cond_mask = rootpath / "cond_sensitivity_mask.npy"
        np.save(f_s, self.sensitivity)
        np.save(f_mask, self.sensitivity_mask)
        np.save(f_cond_mask, self.cond_sensitivity_mask)

    

[docs]
    def load(self, rootpath: PosixPath=Path("./"), check_xy: bool=True, overwrite: bool=False):
        """load data and sensitivity analysis results from specified location, including:
            - data (x, y) and scaler
            - sensitivity analysis configuration
            - sensitivity analysis results

        Args:
            rootpath (PosixPath): the root path where data will be loaded

        """
        if self.sensitivity_done and not overwrite:
            raise Exception("Sensitivity analysis has been performed.")
        
        # Load xdata and ydata
        f_x, f_y = rootpath / "x.npy", rootpath / "y.npy"
        xdata = np.load(f_x)
        ydata = np.load(f_y)
        if check_xy:
            assert np.allclose(xdata, self.xdata)
            assert np.allclose(ydata, self.ydata)
        self.xdata, self.ydata = xdata, ydata
        self.Ns = xdata.shape[0]
        self.Nx = xdata.shape[1]
        self.Ny = ydata.shape[1]

        # x and y scalers
        f_scaler = rootpath / "scaler.pkl"
        with open(f_scaler, "rb") as f:
            scaler = pickle.load(f)
        self.xscaler = scaler['x']
        self.yscaler = scaler['y']
        self.xscaler_type = scaler['xtype']
        self.yscaler_type = scaler['ytype']
        
        # sensitivity configurations
        f_sensitivity_config = rootpath / "sens_configs.json"
        with open(f_sensitivity_config, "r") as f:
            self.sensitivity_config = json.load(f)

        # sensitivity results
        f_s = rootpath / "sensitivity.npy"
        f_mask = rootpath / "sensitivity_mask.npy"
        f_cond_mask = rootpath / "cond_sensitivity_mask.npy"
        sensitivity = np.load(f_s)
        sensitivity_mask = np.load(f_mask)
        cond_sensitivity_mask = np.load(f_cond_mask)
        assert sensitivity.shape == (self.Nx, self.Ny)
        assert sensitivity_mask.shape == (self.Nx, self.Ny)
        assert cond_sensitivity_mask.shape == (self.Nx, self.Ny)
        self.sensitivity = sensitivity
        self.sensitivity_mask = sensitivity_mask
        self.cond_sensitivity_mask = cond_sensitivity_mask

        self.loaded_from_other_sources = True
        self.sensitivity_done = True