# BSD 3-Clause License.
#
# Copyright (c) 2019-2025 Robert A. Milton. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted provided that
# the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
# following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
# following disclaimer in the documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or
# promote products derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
""" Models for data storage. """
from __future__ import annotations
from torchvision.transforms.v2 import Normalize
from rc.base import *
from copy import deepcopy
import itertools
import random
import shutil
import scipy.stats
from enum import IntEnum
#: ``n`` (row) selector for a Repo Table: every row, first column only.
n: Tuple[slice, slice] = (slice(None), slice(None, 1))
#: ``x`` (inputs) selector for a Repo Table: every row, second column up to (excluding) the last two columns.
x: Tuple[slice, slice] = (slice(None), slice(1, -2))
#: ``l`` (categorical state) selector for a Repo Table: every row, second-to-last column only.
l: Tuple[slice, slice] = (slice(None), slice(-2, -1))
#: ``y`` (output) selector for a Repo Table: every row, last column only.
y: Tuple[slice, slice] = (slice(None), slice(-1, None))
[docs]
class DesignMatrix(Table):
""" The familiar user format of ``DesignMatrix`` which is fat (has many columns). """
Label = Path | str
""" Class attribute aliasing acceptable Types for column (or index) labels. """
class Options(NamedTuple):
read: MetaData = {'index_col': 0, 'header': [0, 1]} # Read options passed to ``pd.read_csv``.
write: MetaData = {} # Write options passed to ``pd.DataFrame.to_csv``.
skeleton: Pd.DataFrame = pd.DataFrame(columns=pd.MultiIndex.from_tuples(
(('Input', 'float'),('Category', 'int'),
('Column', 'str'), ('Output', 'func'))))
""" DataFrame of the minimal, skeleton ``DesignMatrix``."""
defaultOptions: MetaData = Options().read | Options().write
""" Default file handling ``DesignMatrix.Options()``."""
@classmethod
def headers(cls, label: str) -> str:
match label.lower():
case 'x' | 'input' | 'continuous' | 'float' :
return 'x'
case 'i' | 'category' | 'discrete' | 'int' :
return 'i'
case 'l' | 'column' | 'label' | 'str' :
return 'l'
case 'y' | 'output' | 'map' | 'func' :
return 'y'
return '~'
[docs]
@classmethod
def create(cls, path: Store.Path, src: NormalDesignMatrix, columns_in_l: Label = '') -> Self:
""" Reformat the ``NormalDesignMatrix`` in ``src`` as a ``Self(DesignMatrix)``.
Args:
path: The ``Path`` to store the ``DesignMatrix`` created, overwritten if existing.
src: The ``NormalDesignMatrix`` to reformat.
Returns: The ``NormalDesignMatrix`` created at ``dst``.
"""
[docs]
@classmethod
def copy(cls, src: Self, dst: Store.Path = '') -> NormalDesignMatrix:
""" Reformat this ``DesignMatrix`` to a ``NormalDesignMatrix``.
Args:
src: The ``DesignMatrix`` to reformat.
dst: Optional ``Path`` to the ``NormalDesignMatrix``.
Defaults to ``''``, which overwrites ``src``.
Returns: The ``NormalDesignMatrix`` created at ``dst``.
"""
[docs]
class NormalDesignMatrix(DesignMatrix):
    """ The internal format of ``DesignMatrix``, which is thin (has few columns). """

    class Options(NamedTuple):
        read: MetaData = {'index_col': 0, 'header': 0}    # Read options passed to ``pd.read_csv``.
        write: MetaData = {}    # Write options passed to ``pd.DataFrame.to_csv``.
    # NOTE(review): ``defaultOptions`` is inherited as computed from ``DesignMatrix.Options``;
    # it is not recomputed from the ``Options`` above - confirm this is intended.

    @classmethod
    def create(cls, path: Store.Path, src: NormalDesignMatrix) -> Self:
        """ Construct a ``NormalDesignMatrix`` at ``path`` from ``src``. No reformatting is performed.

        Declared a ``classmethod``, like the ``DesignMatrix.create`` it overrides, so that it may be called on
        the class itself.

        Args:
            path: The ``Path`` passed to the constructor.
            src: The ``NormalDesignMatrix`` passed to the constructor as ``data``.
        Returns: ``cls(path, data=src)``.
        """
        return cls(path, data=src)

    @classmethod
    def copy(cls, src: Self, dst: Store.Path = '') -> NormalDesignMatrix:
        """ Copy ``src`` to ``dst``. No reformatting is performed.

        Declared a ``classmethod``, like the ``DesignMatrix.copy`` it overrides, so that it may be called on
        the class itself.

        Args:
            src: The ``NormalDesignMatrix`` to copy.
            dst: Optional ``Path`` to the copy. Defaults to ``''``, in which case no copy is made.
        Returns: ``cls(dst, data=src)`` if ``dst`` is truthy, otherwise ``src`` itself.
        """
        return cls(dst, data=src) if dst else src
[docs]
class Normalization(DataBase):
    """ Normalization of a Repo. """

    class NamedTables(NamedTuple):
        # The single named Table ``data``, which defaults to the skeleton ``DesignMatrix``.
        data: DesignMatrix | MetaData = DesignMatrix.skeleton

        def __call__(self, name: str) -> Table | Matrix | MetaData:
            """ Look up a Table by ``name``.

            Args:
                name: The attribute name of the Table wanted.
            Returns: The attribute of ``self`` called ``name``.
            """
            table = getattr(self, name)
            return table

    options: NamedTables[MetaData] = NamedTables(data=DesignMatrix.defaultOptions)
    defaultMetaData: MetaData = {'category delimiter' : '│'}

    def __call__(self, **meta: Any) -> Self:
        """ Optimize and update ``self``, by calling ``self._tables`` with ``meta`` as keyword arguments.

        Args:
            **meta: Optimization ``MetaData``.
        Returns: ``self``
        """
        self._tables(**meta)
        return self

    def __init__(self, path: Store.Path, **tables: Table | PD.DataFrame):
        # Construction is delegated entirely to the base class.
        super().__init__(path, **tables)

    @classmethod
    def create(cls, path: Store.Path, data: DesignMatrix, **meta: Any) -> Self:
        """ Create a ``Normalization`` in ``path``.

        Args:
            path: The folder to store the ``Normalization`` in. Need not exist,
                any existing ``Tables`` will be overwritten if it does.
            data: The ``DesignMatrix`` passed to the constructor as ``data``.
            **meta: Optimization ``MetaData``, taking precedence over ``cls.defaultMetaData``.
        Returns: The ``Normalization`` created.
        """
        meta_target = cls._meta_in(path)
        merged_meta = cls.defaultMetaData | meta    # Entries in ``meta`` override the defaults.
        Meta.create(meta_target, **merged_meta)
        return cls(path, data=data)
[docs]
class Repo(DataBase):
""" A Repository of data and models. Informally a dataset and all the things we'd like to do to it. """
[docs]
class NamedTables(NamedTuple):
    # The single named Table ``data``, which defaults to an empty DataFrame with columns ``x``, ``l``, ``y``.
    data: Table | Matrix | MetaData = pd.DataFrame(columns=('x', 'l', 'y'))

    def __call__(self, name: str) -> Table | Matrix | MetaData:
        """ Look up a Table by ``name``.

        Args:
            name: The attribute name of the Table wanted.
        Returns: The attribute of ``self`` called ``name``.
        """
        table = getattr(self, name)
        return table
# Options for each named Table; ``data`` takes ``Table.Options.default()``.
# NOTE(review): presumably these are file-handling (read/write) options - confirm against ``Table.Options``.
options: NamedTables[MetaData] = NamedTables(data = Table.Options.default())
# Default ``MetaData``. ``__len__`` reads the ``'K'`` entry of ``self._meta`` to count folds.
# NOTE(review): presumably ``self._meta`` is seeded from this default - confirm in ``DataBase``.
defaultMetaData: MetaData = {'K': 0}
@property
def fold(self):
    """ The current fold, as last assigned. """
    return self._fold

@fold.setter
def fold(self, value: int):
    """ Set the current fold. A negative ``value`` refers to the test data in the fold numbered ``abs(value)``.
    In case ``abs(value)`` is 0 or greater than ``len(self)-1`` the current fold is ``self``.
    The ``value`` is stored as given, without validation. """
    self._fold = value
[docs]
def __len__(self) -> int:
    """ The number of folds in ``self``: one more than the ``'K'`` entry of ``self._meta``. """
    proper_folds = self._meta['K']
    return proper_folds + 1
[docs]
def __getitem__(self, fold: int | slice) -> Path | Tuple[Path, ...]:
    """ Indexer returns the ``Path`` (s) to the Folds indexed or sliced by ``fold``.

    Args:
        fold: A fold number, of which only the absolute value is used, or a slice of fold numbers.
    Returns: For an ``int``, ``self.path`` when ``fold`` is 0, otherwise the sub-folder of ``self.path`` named
        ``abs(fold)``. Otherwise, ``fold`` applied to the tuple of the ``Path`` s of folds ``0`` to ``len(self)-1``.
    """
    if not isinstance(fold, int):
        # Enumerate every fold's Path through the int branch, then let ``fold`` pick from them.
        every_path = tuple(self[number] for number in range(len(self)))
        return every_path[fold]
    if fold == 0:
        return self.path
    return self.path / f'{abs(fold)}'
[docs]
def __setitem__(self, fold: int | slice, tables: Table | Matrix | Tuple[Table | Matrix, ...]):
    """ Indexer intended to create the ``Fold`` (s) indexed or sliced by ``fold``. Not implemented yet.

    Args:
        fold: The fold number, or slice of fold numbers, to create.
        tables: Presumably the Table (s) to create the ``Fold`` (s) from - confirm when implementing.
    Raises:
        NotImplementedError: Always.
    """
    # Assigning ``self[fold] = tables`` here would re-enter this very method and recurse until the interpreter
    # raises ``RecursionError``. Fail immediately instead; both exceptions are ``RuntimeError`` s.
    raise NotImplementedError(f'{type(self).__name__}.__setitem__ is not implemented: cannot create fold {fold!r}.')
[docs]
def __call__(self, **meta: Any) -> Self:
    """ Intended to optimize and update ``self``. Not implemented yet.

    Args:
        **meta: Optimization ``MetaData``.
    Returns: ``self``, once implemented.
    Raises:
        NotImplementedError: Always.
    """
    # Calling ``self(**meta)`` here would re-enter this very method and recurse until the interpreter
    # raises ``RecursionError``. Fail immediately instead; both exceptions are ``RuntimeError`` s.
    # NOTE(review): ``Normalization.__call__`` delegates to ``self._tables(**meta)`` - confirm whether the same
    # is wanted here before implementing.
    raise NotImplementedError(f'{type(self).__name__}.__call__ is not implemented.')