Source code for lpspline.datasets

import numpy as np
import polars as pl
from typing import Tuple



[docs] def load_by_dataset(samples: int = 1000, type: str = 'linear') -> Tuple[pl.DataFrame, pl.Series]: """ Generate a synthetic dataset for demonstrating LPSpline features including group effects. Parameters ---------- samples : int, default=1000 Number of samples to generate. type : str, default='cubic' Type of structural relationship for `y`. Options include: 'linear', 'cubic', 'cyclic'. Returns ------- X : pl.DataFrame A DataFrame containing the predictive feature 'x' and grouping structure 'by'. y : pl.Series The response Target series. """ x = np.linspace(-10, 10, samples) by = np.random.randint(low=0, high=3, size=samples) X = pl.DataFrame({ 'x': x, 'by': by }) if type == 'linear': y = pl.Series(x * (1 + by) + np.random.normal(size=samples) * 2) elif type == 'cubic': y = pl.Series(x**3 * (1 + by) + np.random.normal(size=samples) * 10) elif type == 'cyclic': y = pl.Series(np.sin(x) * (1 + by) + np.random.normal(size=samples) * 0.5) else: raise ValueError("type must be one of: 'linear', 'cubic', 'cyclic'") return X, y
[docs] def load_demo_dataset(samples: int = 1000) -> Tuple[pl.DataFrame, pl.Series]: """ Generate a diverse synthetic dataset for demonstrating multiple LPSpline component features. Parameters ---------- samples : int, default=1000 Number of samples to generate. Returns ------- X : pl.DataFrame A DataFrame containing multiple features tracking various generative relationships. y_series : pl.Series The synthesized composite target variable. """ x_linear = np.linspace(0, 10, samples) x_pwl = np.linspace(0, 10, samples) x_bs = np.linspace(0, 10, samples) x_cyc = np.linspace(0, 2*np.pi, samples) x_factor = np.random.randint(0, 3, samples) # True functions y_pwl = np.where(x_pwl < 5, 0, x_pwl - 5) # Hinge at 5 y_bs = np.sin(x_bs) # Sine wave y_cyc = np.cos(x_cyc) # Cosine wave y_factor = np.array([0, 2, -1])[x_factor] # Categorical effects slopes = np.array([-1, 0.5, 2]) y_linear = slopes[x_factor] * x_linear # Combined target with noise y = ( y_linear + y_pwl + y_bs + y_cyc + y_factor + np.random.normal(0, 0.2, samples) ) X = pl.DataFrame({ "xl": x_linear, "xpwl": x_pwl, "xbs": x_bs, "xcyc": x_cyc, "xfactor": x_factor }) y_series = pl.Series("target", y) return X, y_series