Source code for fedot.utilities.synth_dataset_generator

from typing import Dict

import numpy as np
from sklearn import datasets


def classification_dataset(samples_amount: int, features_amount: int, classes_amount: int,
                           features_options: Dict, noise_fraction: float = 0.1,
                           full_shuffle: bool = True, weights: list = None):
    """Generates a random dataset for an ``n-class`` classification problem using the scikit-learn API.

    Args:
        samples_amount: total amount of samples in the resulting dataset
        features_amount: total amount of features per sample
        classes_amount: the amount of classes in the dataset
        features_options: the dictionary containing features options in key-value format

            .. details:: possible ``features_options`` variants:

                - ``informative`` -> the amount of informative features
                - ``redundant`` -> the amount of redundant features
                - ``repeated`` -> the amount of features that repeat the informative features
                - ``clusters_per_class`` -> the amount of clusters for each class

        noise_fraction: the fraction of noisy labels in the dataset
        full_shuffle: if ``True`` then all features and samples will be shuffled
        weights: the proportions of samples assigned to each class. If ``None``, then classes are balanced

    Returns:
        array: features and target as numpy-arrays
    """
    features, target = datasets.make_classification(n_samples=samples_amount,
                                                    n_features=features_amount,
                                                    n_informative=features_options['informative'],
                                                    n_redundant=features_options['redundant'],
                                                    n_repeated=features_options['repeated'],
                                                    n_classes=classes_amount,
                                                    n_clusters_per_class=features_options['clusters_per_class'],
                                                    weights=weights,
                                                    flip_y=noise_fraction,
                                                    shuffle=full_shuffle)
    return features, target
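
# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of calling ``classification_dataset``; the concrete
# parameter values below are assumptions chosen for demonstration only.
def _example_classification_dataset():
    features, target = classification_dataset(
        samples_amount=1000,
        features_amount=10,
        classes_amount=2,
        features_options={'informative': 5, 'redundant': 2,
                          'repeated': 1, 'clusters_per_class': 1},
        noise_fraction=0.05)
    # Expected shapes: (1000, 10) for features, (1000,) for target
    print(features.shape, target.shape)
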
def regression_dataset(samples_amount: int, features_amount: int, features_options: Dict,
                       n_targets: int, noise: float = 0.0, shuffle: bool = True):
    """Generates a random dataset for a regression problem using the scikit-learn API.

    Args:
        samples_amount: total amount of samples in the resulting dataset
        features_amount: total amount of features per sample
        features_options: the dictionary containing features options in key-value format

            .. details:: possible ``features_options`` variants:

                - ``informative`` -> the amount of informative features
                - ``bias`` -> bias term in the underlying linear model

        n_targets: the amount of target variables
        noise: the standard deviation of the gaussian noise applied to the output
        shuffle: if ``True`` then all features and samples will be shuffled

    Returns:
        array: features and target as numpy-arrays
    """
    features, target = datasets.make_regression(n_samples=samples_amount,
                                                n_features=features_amount,
                                                n_informative=features_options['informative'],
                                                bias=features_options['bias'],
                                                n_targets=n_targets,
                                                noise=noise,
                                                shuffle=shuffle)
    return features, target
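
# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of calling ``regression_dataset``; the parameter values
# are assumptions chosen for demonstration only.
def _example_regression_dataset():
    features, target = regression_dataset(
        samples_amount=300,
        features_amount=5,
        features_options={'informative': 3, 'bias': 0.5},
        n_targets=1,
        noise=0.1)
    # Expected shapes: (300, 5) for features, (300,) for target
    print(features.shape, target.shape)
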
def gauss_quantiles_dataset(samples_amount: int, features_amount: int, classes_amount: int,
                            full_shuffle=True, **kwargs):
    """Generates a random dataset for an n-class classification problem
    based on multi-dimensional gaussian distribution quantiles
    using the scikit-learn API.

    Args:
        samples_amount: total amount of samples in the resulting dataset
        features_amount: total amount of features per sample
        classes_amount: the amount of classes in the dataset
        full_shuffle: if ``True`` then all features and samples will be shuffled
        kwargs: Optional['gauss_params'] mean and covariance values of the distribution

    Returns:
        array: features and target as numpy-arrays
    """
    if 'gauss_params' in kwargs:
        mean, cov = kwargs['gauss_params']
    else:
        mean, cov = None, 1.

    features, target = datasets.make_gaussian_quantiles(n_samples=samples_amount,
                                                        n_features=features_amount,
                                                        n_classes=classes_amount,
                                                        shuffle=full_shuffle,
                                                        mean=mean,
                                                        cov=cov)
    return features, target
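
# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of calling ``gauss_quantiles_dataset`` with an explicit
# ``gauss_params`` tuple of (mean, covariance); the values are assumptions
# chosen for demonstration only.
def _example_gauss_quantiles_dataset():
    features, target = gauss_quantiles_dataset(
        samples_amount=600,
        features_amount=2,
        classes_amount=3,
        gauss_params=([0., 0.], 2.))
    # Expected shapes: (600, 2) for features, (600,) for target
    print(features.shape, target.shape)
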
def generate_synthetic_data(length: int = 2200, periods: int = 5):
    """Generates a synthetic one-dimensional array without omissions.

    Args:
        length: the length of the array
        periods: the number of periods in the sine wave

    Returns:
        array: an array without gaps
    """
    sinusoidal_data = np.linspace(-periods * np.pi, periods * np.pi, length)
    sinusoidal_data = np.sin(sinusoidal_data)
    random_noise = np.random.normal(loc=0.0, scale=0.1, size=length)
    # Combining a sine wave and random noise
    synthetic_data = sinusoidal_data + random_noise
    return synthetic_data
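
# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of calling ``generate_synthetic_data``; it produces a
# noisy sine series of the requested length, useful e.g. as a gap-free
# time series for demonstrations.
def _example_generate_synthetic_data():
    series = generate_synthetic_data(length=2200, periods=5)
    print(series.shape)  # expected: (2200,)
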