# Source code for fedot.utilities.synth_dataset_generator

from typing import Dict, Optional, Tuple

import numpy as np
from sklearn import datasets


def classification_dataset(samples_amount: int, features_amount: int, classes_amount: int,
                           features_options: Dict, noise_fraction: float = 0.1,
                           full_shuffle: bool = True,
                           weights: Optional[list] = None) -> Tuple[np.ndarray, np.ndarray]:
    """Generates a random dataset for ``n-class`` classification problem
    using scikit-learn API.

    This is a thin wrapper around ``sklearn.datasets.make_classification``:
    every argument is forwarded to it unchanged.

    Args:
        samples_amount: total amount of samples in the resulted dataset
        features_amount: total amount of features per sample
        classes_amount: the amount of classes in the dataset
        features_options: the dictionary containing features options in key-value format.
            All four keys listed below are required.

            .. details:: possible ``features_options`` variants:

                - ``informative`` -> the amount of informative features
                - ``redundant`` -> the amount of redundant features
                - ``repeated`` -> the amount of features that repeat the informative features
                - ``clusters_per_class`` -> the amount of clusters for each class

        noise_fraction: the fraction of noisy labels in the dataset
            (forwarded as ``flip_y``)
        full_shuffle: if ``True`` then all features and samples will be shuffled
        weights: the proportions of samples assigned to each class.
            If ``None``, then classes are balanced

    Returns:
        tuple: features and target as numpy-arrays

    Raises:
        KeyError: if ``features_options`` lacks one of the required keys
    """

    # Read the options up front so a missing key fails before any generation starts.
    informative = features_options['informative']
    redundant = features_options['redundant']
    repeated = features_options['repeated']
    clusters_per_class = features_options['clusters_per_class']

    features, target = datasets.make_classification(n_samples=samples_amount,
                                                    n_features=features_amount,
                                                    n_informative=informative,
                                                    n_redundant=redundant,
                                                    n_repeated=repeated,
                                                    n_classes=classes_amount,
                                                    n_clusters_per_class=clusters_per_class,
                                                    weights=weights,
                                                    flip_y=noise_fraction,
                                                    shuffle=full_shuffle)

    return features, target


def regression_dataset(samples_amount: int, features_amount: int, features_options: Dict,
                       n_targets: int, noise: float = 0.0,
                       shuffle: bool = True) -> Tuple[np.ndarray, np.ndarray]:
    """Generates a random dataset for regression problem using scikit-learn API.

    This is a thin wrapper around ``sklearn.datasets.make_regression``:
    every argument is forwarded to it unchanged.

    Args:
        samples_amount: total amount of samples in the resulted dataset
        features_amount: total amount of features per sample
        features_options: the dictionary containing features options in key-value format.
            Both keys listed below are required.

            .. details:: possible ``features_options`` variants:

                - ``informative`` -> the amount of informative features
                - ``bias`` -> bias term in the underlying linear model

        n_targets: the amount of target variables
        noise: the standard deviation of the gaussian noise applied to the output
        shuffle: if ``True`` then all features and samples will be shuffled

    Returns:
        tuple: features and target as numpy-arrays

    Raises:
        KeyError: if ``features_options`` lacks ``informative`` or ``bias``
    """

    # Read the options up front so a missing key fails before any generation starts.
    informative = features_options['informative']
    bias = features_options['bias']

    features, target = datasets.make_regression(n_samples=samples_amount,
                                                n_features=features_amount,
                                                n_informative=informative,
                                                bias=bias,
                                                n_targets=n_targets,
                                                noise=noise,
                                                shuffle=shuffle)

    return features, target


def gauss_quantiles_dataset(samples_amount: int, features_amount: int,
                            classes_amount: int, full_shuffle: bool = True,
                            **kwargs) -> Tuple[np.ndarray, np.ndarray]:
    """Generates a random dataset for n-class classification problem
    based on multi-dimensional gaussian distribution quantiles
    using scikit-learn API.

    This is a thin wrapper around ``sklearn.datasets.make_gaussian_quantiles``.

    Args:
        samples_amount: total amount of samples in the resulted dataset
        features_amount: total amount of features per sample
        classes_amount: the amount of classes in the dataset
        full_shuffle: if ``True`` then all features and samples will be shuffled
        kwargs: only the optional ``gauss_params`` key is read. It must be a
            two-element ``(mean, cov)`` sequence with the mean and covariance
            values of the distribution. When it is absent, ``mean=None`` and
            ``cov=1.0`` are used. Any other keyword is ignored.

    Returns:
        tuple: features and target as numpy-arrays
    """

    # Falls back to (None, 1.) only when the key is absent; an explicitly passed
    # value is always unpacked, exactly like the original membership check.
    mean, cov = kwargs.get('gauss_params', (None, 1.))

    features, target = datasets.make_gaussian_quantiles(n_samples=samples_amount,
                                                        n_features=features_amount,
                                                        n_classes=classes_amount,
                                                        shuffle=full_shuffle,
                                                        mean=mean, cov=cov)
    return features, target


def generate_synthetic_data(length: int = 2200, periods: int = 5,
                            noise_std: float = 0.1) -> np.ndarray:
    """Generates a synthetic one-dimensional array without gaps:
    a sine wave with additive gaussian noise.

    The sine argument runs from ``-periods * pi`` to ``periods * pi``,
    so the result contains ``periods`` full sine periods.

    Args:
        length: the length of the array
        periods: the number of periods in the sine wave
        noise_std: the standard deviation of the zero-mean gaussian noise
            added to the sine wave; ``0`` yields a clean sine wave

    Returns:
        array: an array of shape ``(length,)`` without gaps
    """

    sine_argument = np.linspace(-periods * np.pi, periods * np.pi, length)
    sine_wave = np.sin(sine_argument)

    # The noise comes from numpy's global random state, so callers can make
    # the output reproducible with ``np.random.seed``.
    random_noise = np.random.normal(loc=0.0, scale=noise_std, size=length)

    # Combining a sine wave and random noise
    return sine_wave + random_noise