# Source code for rc.data.models

#  BSD 3-Clause License.
# 
#  Copyright (c) 2019-2025 Robert A. Milton. All rights reserved.
# 
#  Redistribution and use in source and binary forms, with or without modification, are permitted provided that
#  the following conditions are met:
#
#  1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
#  following disclaimer.
#
#  2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
#  following disclaimer in the documentation and/or other materials provided with the distribution.
#
#  3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or
#  promote products derived from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
#  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
#  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
#  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
#  OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
#  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

""" Models for data storage. """

from __future__ import annotations

from torchvision.transforms.v2 import Normalize

from rc.base import *
from copy import deepcopy
import itertools
import random
import shutil
import scipy.stats
from enum import IntEnum


#: Selector for ``n`` (row) in a Repo Table: first slice takes everything, second keeps only position 0.
n: Tuple[slice, slice] = (slice(None), slice(None, 1))

#: Selector for ``x`` (inputs) in a Repo Table: first slice takes everything, second runs from position 1
#: up to (but excluding) the last two positions.
x: Tuple[slice, slice] = (slice(None), slice(1, -2))

#: Selector for ``l`` (categorical state) in a Repo Table: first slice takes everything, second keeps only
#: the second-to-last position.
l: Tuple[slice, slice] = (slice(None), slice(-2, -1))

#: Selector for ``y`` (output) in a Repo Table: first slice takes everything, second keeps only the last position.
y: Tuple[slice, slice] = (slice(None), slice(-1, None))



[docs]
class DesignMatrix(Table):
    """ The familiar user format of ``DesignMatrix`` which is fat (has many columns). """

    Label = Path | str
    """ Class attribute aliasing acceptable Types for column (or index) labels. """

    class Options(NamedTuple):

        read: MetaData =  {'index_col': 0, 'header': [0, 1]}  # Read options passed to ``pd.read_csv``.
        write: MetaData =  {}   # Write options passed to ``pd.DataFrame.to_csv``.

    skeleton: Pd.DataFrame = pd.DataFrame(columns=pd.MultiIndex.from_tuples(
                                    (('Input', 'float'),('Category', 'int'),
                                     ('Column', 'str'), ('Output', 'func'))))
    """ DataFrame of the minimal, skeleton ``DesignMatrix``."""

    defaultOptions: MetaData = Options().read | Options().write
    """ Default file handling ``DesignMatrix.Options()``."""

    @classmethod
    def headers(cls, label: str) -> str:
        match label.lower():
            case 'x' | 'input' | 'continuous' | 'float' :
                return 'x'
            case 'i' | 'category' | 'discrete' | 'int' :
                return 'i'
            case 'l' | 'column' | 'label' | 'str' :
                return 'l'
            case 'y' | 'output' | 'map' | 'func' :
                return 'y'
        return '~'


[docs]
    @classmethod
    def create(cls, path: Store.Path, src: NormalDesignMatrix, columns_in_l: Label = '') -> Self:
        """ Reformat the ``NormalDesignMatrix`` in ``src`` as a ``Self(DesignMatrix)``.

        Args:
            path: The ``Path`` to store the ``DesignMatrix`` created, overwritten if existing.
            src: The ``NormalDesignMatrix`` to reformat.

        Returns: The ``NormalDesignMatrix`` created at ``dst``.
        """



[docs]
    @classmethod
    def copy(cls, src: Self, dst: Store.Path = '') -> NormalDesignMatrix:
        """ Reformat this ``DesignMatrix`` to a ``NormalDesignMatrix``.

        Args:
            src: The ``DesignMatrix`` to reformat.
            dst: Optional ``Path`` to the ``NormalDesignMatrix``.
                Defaults to ``''``, which overwrites ``src``.

        Returns: The ``NormalDesignMatrix`` created at ``dst``.
        """





class NormalDesignMatrix(DesignMatrix):
    """ The internal format of ``DesignMatrix``, which is thin (has few columns). """

    # NOTE(review): ``defaultOptions`` is not redefined here, so the inherited value is still the one computed
    # from ``DesignMatrix.Options`` (two header rows), not from the ``Options`` below -- confirm this is intended.
    class Options(NamedTuple):
        """ File handling options for a ``NormalDesignMatrix``: a single header row. """

        read: MetaData = {'index_col': 0, 'header': 0}  # Read options passed to ``pd.read_csv``.
        write: MetaData = {}   # Write options passed to ``pd.DataFrame.to_csv``.

    @classmethod
    def create(cls, path: Store.Path, src: NormalDesignMatrix) -> Self:
        """ Construct ``cls`` at ``path`` from the data in ``src``.

        Declared a ``classmethod`` like the ``DesignMatrix.create`` it overrides, so that
        ``NormalDesignMatrix.create(path, src)`` binds ``cls`` to the class rather than to ``path``.

        Args:
            path: Passed as the first argument to the ``cls`` constructor.
            src: Passed to the ``cls`` constructor as ``data``.

        Returns: ``cls(path, data=src)``.
        """
        return cls(path, data=src)

    @classmethod
    def copy(cls, src: Self, dst: Store.Path = '') -> NormalDesignMatrix:
        """ Copy ``src`` to ``dst``, or hand back ``src`` untouched when no ``dst`` is given.

        Declared a ``classmethod`` like the ``DesignMatrix.copy`` it overrides.

        Args:
            src: The ``NormalDesignMatrix`` to copy.
            dst: Optional destination. Defaults to ``''`` (falsy), in which case nothing is constructed.

        Returns: ``cls(dst, data=src)`` if ``dst`` is truthy, otherwise ``src`` itself.
        """
        return cls(dst, data=src) if dst else src





class Normalization(DataBase):
    """ Normalization of a Repo. """

    class NamedTables(NamedTuple):
        """ The named tables of a ``Normalization``: a single field, ``data``. """

        data: DesignMatrix | MetaData = DesignMatrix.skeleton

        def __call__(self, name: str) -> Table | Matrix | MetaData:
            """ Returns the Table named ``name``."""
            return getattr(self, name)

    # Per-table file handling options: ``data`` uses ``DesignMatrix.defaultOptions``.
    options: NamedTables[MetaData] = NamedTables(data=DesignMatrix.defaultOptions)

    # Defaults merged underneath any ``meta`` passed to ``create``.
    defaultMetaData: MetaData = {'category delimiter': '│'}

    def __call__(self, **meta: Any) -> Self:
        """ Optimize and update ``self``.

        Calls ``self._tables`` with ``meta`` as keyword arguments.
        NOTE(review): ``_tables`` is not defined in this class -- presumably supplied by ``DataBase``; confirm
        that it accepts keyword ``MetaData``.

        Args:
            **meta: Optimization ``MetaData``.

        Returns: ``self``
        """
        self._tables(**meta)
        return self

    def __init__(self, path: Store.Path, **tables: Table | pd.DataFrame):
        """ Forward ``path`` and ``tables`` unchanged to the ``DataBase`` constructor.

        Args:
            path: Passed through to ``super().__init__``.
            **tables: Passed through to ``super().__init__``.
        """
        super().__init__(path, **tables)

    @classmethod
    def create(cls, path: Store.Path, data: DesignMatrix, **meta: Any) -> Self:
        """ Create a ``Normalization`` in ``path``.

        Args:
            path: The folder to store the ``Normalization`` in. Need not exist,
                any existing ``Tables`` will be overwritten if it does.
            data: The ``DesignMatrix`` passed to the constructor as the ``data`` table.
            **meta: Optimization ``MetaData``. Entries here override those in ``cls.defaultMetaData``.

        Returns: The ``Normalization`` created.
        """
        # The right-hand operand of ``|`` wins, so caller-supplied ``meta`` overrides the defaults.
        Meta.create(cls._meta_in(path), **(cls.defaultMetaData | meta))
        return cls(path, data=data)





class Repo(DataBase):
    """ A Repository of data and models. Informally a dataset and all the things we'd like to do to it. """


    class NamedTables(NamedTuple):
        """ The named tables of a ``Repo``: a single field, ``data``. """

        # Defaults to an empty DataFrame with columns ``x``, ``l``, ``y``.
        data: Table | Matrix | MetaData = pd.DataFrame(columns=('x', 'l', 'y'))

        def __call__(self, name: str) -> Table | Matrix | MetaData:
            """ Returns the Table named ``name``."""
            return getattr(self, name)



    # Per-table options held in a ``NamedTables``: ``data`` is given ``Table.Options.default()``.
    options: NamedTables[MetaData] = NamedTables(data = Table.Options.default())

    # Default ``MetaData``. ``__len__`` reads key ``'K'`` from ``self._meta``; presumably this default seeds
    # that entry with zero proper folds -- TODO confirm how ``_meta`` is populated.
    defaultMetaData: MetaData = {'K': 0}

    @property
    def fold(self):
        """ The current fold, exactly as last stored by the setter (read from ``self._fold``). """
        return self._fold

    @fold.setter
    def fold(self, value: int):
        """ Set the current fold. A negative value refers to the test data in the fold numbered ``abs(value)``.
            In case ``abs(value)`` is 0 or greater than ``len(self)-1`` the current fold is ``self``. """
        # Stored as given; no range check or normalisation happens here.
        self._fold = value


    def __len__(self) -> int:
        """ 1 + K proper folds in ``self``. """
        return self._meta['K'] + 1



    def __getitem__(self, fold: int | slice) -> Path | Tuple[Path, ...]:
        """ Indexer returns the ``Path`` (s) to the Folds indexed or sliced by ``fold``. """
        if isinstance(fold, int):
            return self.path  if fold == 0 else self.path / f'{abs(fold)}'
        else:
            return tuple((self[i] for i in range(len(self))))[fold]



    def __setitem__(self, fold: int | slice , tables: Table | Matrix | Tuple[Table | Matrix, ...]):
        """ Indexer creates the ``Fold`` (s) indexed or sliced by ``fold``.

        Args:
            fold: The index or slice of the ``Fold`` (s) to create.
            tables: The table (s) to assign.
        """
        # NOTE(review): item assignment on ``self`` dispatches straight back to this method, so as written
        # this recurses until ``RecursionError``. The intended delegate is not visible here -- needs a real
        # implementation.
        self[fold] = tables



    def __call__(self, **meta: Any) -> Self:
        """ Optimize and update ``self``.

        Args:
            **meta: Optimization ``MetaData``.

        Returns: ``self``
        """
        # NOTE(review): calling ``self`` dispatches straight back to this method, so as written this recurses
        # until ``RecursionError`` and the ``return`` below is never reached. The intended delegate is not
        # visible here -- needs a real implementation.
        self(**meta)
        return self