diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..8a519bd
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,109 @@
+name: CI
+
+on:
+ push:
+ branches: [main, "autoloop/**"]
+ pull_request:
+ branches: [main]
+
+permissions:
+ contents: read
+
+jobs:
+ test:
+ name: Test
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Setup Bun
+ uses: oven-sh/setup-bun@v2
+ with:
+ bun-version: latest
+
+ - name: Install dependencies
+ run: bun install
+
+ - name: Type check
+ run: bunx tsc --noEmit
+
+ - name: Run tests
+ run: bun test
+
+ lint:
+ name: Lint
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Setup Bun
+ uses: oven-sh/setup-bun@v2
+ with:
+ bun-version: latest
+
+ - name: Install dependencies
+ run: bun install
+
+ - name: Lint
+ run: bunx biome check src tests
+
+ playground:
+ name: Build Playground
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Setup Bun
+ uses: oven-sh/setup-bun@v2
+ with:
+ bun-version: latest
+
+ - name: Install dependencies
+ run: bun install
+
+ - name: Build library bundle
+ run: bun build src/index.ts --outfile playground/tsikit-learn.js --target browser --minify
+
+ - name: Upload playground artifact
+ uses: actions/upload-artifact@v4
+ with:
+ name: playground
+ path: playground/
+
+ pages:
+ name: Deploy to GitHub Pages
+ runs-on: ubuntu-latest
+ needs: [test, playground]
+ if: github.ref == 'refs/heads/main'
+ permissions:
+ contents: read
+ pages: write
+ id-token: write
+ environment:
+ name: github-pages
+ url: ${{ steps.deployment.outputs.page_url }}
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Setup Bun
+ uses: oven-sh/setup-bun@v2
+ with:
+ bun-version: latest
+
+ - name: Install dependencies
+ run: bun install
+
+ - name: Build library bundle
+ run: bun build src/index.ts --outfile playground/tsikit-learn.js --target browser --minify
+
+ - name: Setup Pages
+ uses: actions/configure-pages@v5
+
+ - name: Upload artifact
+ uses: actions/upload-pages-artifact@v3
+ with:
+ path: playground/
+
+ - name: Deploy to GitHub Pages
+ id: deployment
+ uses: actions/deploy-pages@v4
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9ebfc2d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+node_modules/
+dist/
+coverage/
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..adbb06d
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,79 @@
+# Agent Instructions for tsikit-learn
+
+## Overview
+
+`tsikit-learn` is a TypeScript port of [scikit-learn](https://scikit-learn.org/). The project is being built one feature at a time by the Autoloop agent.
+
+## Stack
+
+- **Runtime & bundler**: Bun
+- **Language**: TypeScript (strictest settings — `strict: true`, `noUncheckedIndexedAccess: true`, `exactOptionalPropertyTypes: true`)
+- **Linting**: Biome
+- **Testing**: Bun test runner with fast-check for property-based tests
+- **Data layer**: `tsb` (TypeScript pandas port) as a peer dependency; typed arrays for numeric computation
+
+## Directory Structure
+
+```
+src/
+ index.ts — public entry point, re-exports everything
+ exceptions.ts — NotFittedError, ConvergenceWarning, etc.
+ base.ts — BaseEstimator, mixins, clone, check_is_fitted
+ utils/
+ extmath.ts — math utilities (safeDot, gramMatrix, cholesky, etc.)
+ validation.ts — input validation
+ multiclass.ts — multiclass helpers
+ class_weight.ts — class weight utilities
+ index.ts — re-exports all utils
+ preprocessing/
+ standard_scaler.ts — StandardScaler
+ minmax_scaler.ts — MinMaxScaler
+ label_encoder.ts — LabelEncoder
+ normalizer.ts — Normalizer
+ index.ts
+ metrics/
+ regression.ts — MSE, MAE, R², MAPE, explained_variance
+ classification.ts — accuracy, confusion_matrix, precision, recall, F1, log_loss
+ index.ts
+ model_selection/
+ split.ts — train_test_split, KFold, StratifiedKFold
+ index.ts
+ linear_model/
+ linear_regression.ts — LinearRegression (OLS via Cholesky)
+ ridge.ts — Ridge (L2 regularization)
+ index.ts
+tests/
+ base.test.ts
+ preprocessing.test.ts
+ metrics_model_selection.test.ts
+ linear_model.test.ts
+playground/
+ index.html — interactive demos, deployed to GitHub Pages
+```
+
+## TypeScript Conventions
+
+- No `any`, no `@ts-ignore`, no `as` casts (unless provably safe)
+- Use `Float64Array` for continuous numeric data, `Int32Array` for integer labels
+- Use `?? 0` or null checks for `noUncheckedIndexedAccess` compliance
+- Export everything from module `index.ts` files
+
+## Evaluation Metric
+
+The CI evaluation script counts TypeScript source files in `src/` (excluding `index.ts`) that contain `export`. Currently: **15 files**.
+
+## Adding a New Module
+
+1. Create `src/{module}/{feature}.ts` — implement the class with `fit`, `predict`/`transform`, `score`
+2. Create or update `src/{module}/index.ts` — re-export from the new file
+3. Update `src/index.ts` — add `export * from "./{module}/index.js"`
+4. Add tests in `tests/{module}.test.ts`
+5. Add a card to `playground/index.html`
+
+## Running Locally
+
+```bash
+bun install
+bun test
+bunx tsc --noEmit
+```
diff --git a/biome.json b/biome.json
new file mode 100644
index 0000000..600b130
--- /dev/null
+++ b/biome.json
@@ -0,0 +1,15 @@
+{
+ "$schema": "https://biomejs.dev/schemas/1.9.4/schema.json",
+ "organizeImports": { "enabled": true },
+ "linter": {
+ "enabled": true,
+ "rules": {
+ "recommended": true
+ }
+ },
+ "formatter": {
+ "enabled": true,
+ "indentStyle": "space",
+ "indentWidth": 2
+ }
+}
diff --git a/bunfig.toml b/bunfig.toml
new file mode 100644
index 0000000..0c9079a
--- /dev/null
+++ b/bunfig.toml
@@ -0,0 +1,2 @@
+[test]
+coverage = true
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..e6bd00a
--- /dev/null
+++ b/package.json
@@ -0,0 +1,34 @@
+{
+ "name": "tsikit-learn",
+ "version": "0.1.0",
+ "description": "A complete TypeScript port of scikit-learn",
+ "type": "module",
+ "main": "./dist/index.js",
+ "module": "./dist/index.js",
+ "types": "./dist/index.d.ts",
+ "exports": {
+ ".": {
+ "import": "./dist/index.js",
+ "types": "./dist/index.d.ts"
+ }
+ },
+ "scripts": {
+ "build": "bun build src/index.ts --outdir dist --target browser",
+ "test": "bun test",
+ "typecheck": "bunx tsc --noEmit",
+ "lint": "bunx biome check src tests"
+ },
+ "devDependencies": {
+ "@biomejs/biome": "^1.9.4",
+ "fast-check": "^3.22.0",
+ "typescript": "^5.7.2"
+ },
+ "peerDependencies": {
+ "tsb": "^0.1.0"
+ },
+ "peerDependenciesMeta": {
+ "tsb": {
+ "optional": true
+ }
+ }
+}
diff --git a/playground/index.html b/playground/index.html
new file mode 100644
index 0000000..2004305
--- /dev/null
+++ b/playground/index.html
@@ -0,0 +1,225 @@
+
+
+
+
+
+ tsikit-learn — TypeScript scikit-learn
+
+
+
+
+
+
+
+
exceptions
+
NotFittedError, ConvergenceWarning, ValueError
+
✅ Ported
+
+
+
base
+
BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin
+
✅ Ported
+
+
+
utils
+
extmath, validation, multiclass, class_weight
+
✅ Ported
+
+
+
preprocessing
+
StandardScaler, MinMaxScaler, LabelEncoder, Normalizer
+
✅ Ported
+
+
+
metrics
+
MSE, MAE, R², accuracy, precision, recall, F1, log_loss
+
✅ Ported
+
+
+
model_selection
+
train_test_split, KFold, StratifiedKFold
+
✅ Ported
+
+
+
linear_model.LinearRegression
+
OLS via Cholesky decomposition (normal equations)
+
✅ Ported
+
▶ Demo
+
+
+
linear_model.Ridge
+
L2-regularized least squares
+
✅ Ported
+
+
+
linear_model.Lasso
+
L1-regularized least squares
+
🕐 Pending
+
+
+
linear_model.LogisticRegression
+
Logistic regression with SGD/L-BFGS solver
+
🕐 Pending
+
+
+
tree
+
DecisionTreeClassifier, DecisionTreeRegressor
+
🕐 Pending
+
+
+
neighbors
+
KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors
+
🕐 Pending
+
+
+
naive_bayes
+
GaussianNB, MultinomialNB, BernoulliNB
+
🕐 Pending
+
+
+
svm
+
SVC, SVR, LinearSVC, LinearSVR
+
🕐 Pending
+
+
+
cluster
+
KMeans, DBSCAN, AgglomerativeClustering
+
🕐 Pending
+
+
+
ensemble
+
RandomForest, GradientBoosting, AdaBoost
+
🕐 Pending
+
+
+
+
+
+
LinearRegression Demo
+
Click "Generate" to create a noisy linear dataset, then "Fit" to train a LinearRegression model.
+
+
+
+
// Click "Generate Data" to start
+
+
+
+
+
+
diff --git a/src/base.ts b/src/base.ts
new file mode 100644
index 0000000..236df4d
--- /dev/null
+++ b/src/base.ts
@@ -0,0 +1,149 @@
+/**
+ * Base classes for all estimators.
+ * Mirrors sklearn.base.
+ */
+
+import { NotFittedError } from "./exceptions.js";
+
+/** Flat parameter dictionary as returned by get_params / accepted by set_params. */
+export type Params = Record<string, unknown>;
+
+/**
+ * Base class for all scikit-learn estimators.
+ * Provides get_params / set_params following sklearn conventions.
+ */
+export abstract class BaseEstimator {
+  /**
+   * Get parameters for this estimator.
+   * Returns own enumerable string-keyed properties that are not functions.
+   * When `deep` is true, nested estimators are expanded via their own get_params.
+   */
+  get_params(deep = true): Params {
+    const out: Params = {};
+    for (const key of Object.keys(this)) {
+      const val = (this as unknown as Record<string, unknown>)[key];
+      if (typeof val !== "function") {
+        out[key] =
+          deep && val instanceof BaseEstimator ? val.get_params(deep) : val;
+      }
+    }
+    return out;
+  }
+
+  /** Set the parameters of this estimator. Returns `this` for chaining. */
+  set_params(params: Params): this {
+    for (const [key, val] of Object.entries(params)) {
+      (this as unknown as Record<string, unknown>)[key] = val;
+    }
+    return this;
+  }
+
+  /**
+   * Assert the estimator is fitted by checking the given attributes are defined.
+   * @throws NotFittedError when any listed attribute is still undefined.
+   */
+  protected _check_is_fitted(attributes: string[]): void {
+    const missing = attributes.filter(
+      (a) => (this as unknown as Record<string, unknown>)[a] === undefined,
+    );
+    if (missing.length > 0) {
+      throw new NotFittedError(
+        `This ${this.constructor.name} instance is not fitted yet. Call 'fit' first.`,
+      );
+    }
+  }
+}
+
+/** Mixin class for all classifiers. */
+export abstract class ClassifierMixin {
+ readonly _estimator_type = "classifier" as const;
+
+ /** Return the mean accuracy on the given test data and labels. */
+ score(X: Float64Array[], y: Float64Array | Int32Array): number {
+ const yPred = this.predict(X);
+ let correct = 0;
+ for (let i = 0; i < y.length; i++) {
+ if ((yPred[i] ?? 0) === (y[i] ?? 0)) correct++;
+ }
+ return y.length > 0 ? correct / y.length : 0;
+ }
+
+ abstract predict(X: Float64Array[]): Int32Array | Float64Array;
+}
+
+/** Mixin class for all regressors. */
+export abstract class RegressorMixin {
+ readonly _estimator_type = "regressor" as const;
+
+ /** Return the coefficient of determination R² of the prediction. */
+ score(X: Float64Array[], y: Float64Array): number {
+ const yPred = this.predict(X);
+ const yMean = Array.from(y).reduce((a, b) => a + b, 0) / y.length;
+ let ssTot = 0;
+ let ssRes = 0;
+ for (let i = 0; i < y.length; i++) {
+ const yi = y[i] ?? 0;
+ const pi = yPred[i] ?? 0;
+ ssTot += (yi - yMean) ** 2;
+ ssRes += (yi - pi) ** 2;
+ }
+ return ssTot === 0 ? 1 : 1 - ssRes / ssTot;
+ }
+
+ abstract predict(X: Float64Array[]): Float64Array;
+}
+
+/** Mixin class for all transformers. */
+export abstract class TransformerMixin {
+ readonly _estimator_type = "transformer" as const;
+
+ /** Fit and transform in one step. */
+ fit_transform(
+ X: Float64Array[],
+ y?: Float64Array | Int32Array,
+ ): Float64Array[] {
+ return this.fit(X, y).transform(X);
+ }
+
+ abstract fit(X: Float64Array[], y?: Float64Array | Int32Array): this;
+ abstract transform(X: Float64Array[]): Float64Array[];
+}
+
+/** Mixin class for all clusterers. */
+export abstract class ClusterMixin {
+ readonly _estimator_type = "clusterer" as const;
+
+ /** Perform clustering on X and return cluster labels. */
+ fit_predict(X: Float64Array[], y?: Float64Array | Int32Array): Int32Array {
+ return this.fit(X, y).labels_ ?? new Int32Array(X.length);
+ }
+
+ abstract fit(X: Float64Array[], y?: Float64Array | Int32Array): this;
+ labels_?: Int32Array;
+}
+
+/** Clone an estimator with the same parameters. */
+export function clone(estimator: T): T {
+ const Cls = estimator.constructor as new () => T;
+ const newEst = new Cls();
+ newEst.set_params(estimator.get_params(false));
+ return newEst;
+}
+
+/** Check if an estimator is fitted by looking for a trailing underscore attribute. */
+export function check_is_fitted(
+  estimator: BaseEstimator,
+  attributes?: string[],
+): void {
+  // Default to sklearn's convention: fitted state lives in attributes that
+  // end with a trailing underscore (coef_, intercept_, ...) and are public.
+  const attrs =
+    attributes ??
+    Object.keys(estimator).filter((k) => k.endsWith("_") && !k.startsWith("_"));
+  if (attrs.length === 0) {
+    throw new NotFittedError(
+      `This ${estimator.constructor.name} instance is not fitted yet.`,
+    );
+  }
+  const missing = attrs.filter(
+    (a) => (estimator as unknown as Record<string, unknown>)[a] === undefined,
+  );
+  if (missing.length > 0) {
+    throw new NotFittedError(
+      `This ${estimator.constructor.name} instance is not fitted yet. Missing attributes: ${missing.join(", ")}.`,
+    );
+  }
+}
diff --git a/src/exceptions.ts b/src/exceptions.ts
new file mode 100644
index 0000000..f1a2bce
--- /dev/null
+++ b/src/exceptions.ts
@@ -0,0 +1,39 @@
+/**
+ * Exceptions used throughout tsikit-learn.
+ * Mirrors sklearn.exceptions.
+ */
+
+/** Raised when an estimator is used before being fitted. */
+export class NotFittedError extends Error {
+ override readonly name = "NotFittedError";
+ constructor(
+ message = "This estimator is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.",
+ ) {
+ super(message);
+ }
+}
+
+/** Warning raised when convergence is not reached. */
+export class ConvergenceWarning extends Error {
+ override readonly name = "ConvergenceWarning";
+}
+
+/** Raised when an invalid value is encountered. */
+export class ValueError extends Error {
+ override readonly name = "ValueError";
+}
+
+/** Raised when feature dimensions don't match. */
+export class DataDimensionalityWarning extends Error {
+ override readonly name = "DataDimensionalityWarning";
+}
+
+/** Raised when an undefined parameter is encountered. */
+export class UndefinedMetricWarning extends Error {
+ override readonly name = "UndefinedMetricWarning";
+}
+
+/** Warning used to notify the user of an inefficient computation. */
+export class EfficiencyWarning extends Error {
+ override readonly name = "EfficiencyWarning";
+}
diff --git a/src/index.ts b/src/index.ts
new file mode 100644
index 0000000..0d022c2
--- /dev/null
+++ b/src/index.ts
@@ -0,0 +1,31 @@
+/**
+ * tsikit-learn — A complete TypeScript port of scikit-learn.
+ *
+ * Ported modules (Phase 1 + Phase 2 + linear_model):
+ * - exceptions: NotFittedError, ConvergenceWarning, ValueError
+ * - base: BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin, ClusterMixin
+ * - utils: extmath, validation, multiclass, class_weight
+ * - preprocessing: StandardScaler, MinMaxScaler, LabelEncoder, Normalizer
+ * - metrics: regression (mse, mae, r2), classification (accuracy, precision, recall, f1)
+ * - model_selection: train_test_split, KFold, StratifiedKFold
+ * - linear_model: LinearRegression, Ridge
+ */
+
+// Core
+export * from "./exceptions.js";
+export * from "./base.js";
+
+// Utils
+export * from "./utils/index.js";
+
+// Preprocessing
+export * from "./preprocessing/index.js";
+
+// Metrics
+export * from "./metrics/index.js";
+
+// Model selection
+export * from "./model_selection/index.js";
+
+// Linear models
+export * from "./linear_model/index.js";
diff --git a/src/linear_model/index.ts b/src/linear_model/index.ts
new file mode 100644
index 0000000..1875ef5
--- /dev/null
+++ b/src/linear_model/index.ts
@@ -0,0 +1,2 @@
+export * from "./linear_regression.js";
+export * from "./ridge.js";
diff --git a/src/linear_model/linear_regression.ts b/src/linear_model/linear_regression.ts
new file mode 100644
index 0000000..bee73b2
--- /dev/null
+++ b/src/linear_model/linear_regression.ts
@@ -0,0 +1,152 @@
+/**
+ * Linear Regression — Ordinary Least Squares.
+ * Mirrors sklearn.linear_model.LinearRegression.
+ *
+ * Uses the normal equations: β = (X.T X)⁻¹ X.T y
+ * Solved via Cholesky decomposition for numerical stability.
+ */
+
+import { BaseEstimator, RegressorMixin } from "../base.js";
+import {
+ addDiagonal,
+ choleskyLinsolve,
+ gramMatrix,
+ safeDot,
+ xtDotY,
+} from "../utils/extmath.js";
+import { checkArray, checkXy } from "../utils/validation.js";
+
+export interface LinearRegressionParams {
+ fit_intercept?: boolean;
+ copy_X?: boolean;
+ positive?: boolean;
+}
+
+/**
+ * Ordinary least squares Linear Regression.
+ *
+ * Minimizes the residual sum of squares between observed and predicted values.
+ * Equivalent to sklearn.linear_model.LinearRegression.
+ *
+ * @example
+ * ```ts
+ * import { LinearRegression } from 'tsikit-learn';
+ *
+ * const X = [new Float64Array([1]), new Float64Array([2]), new Float64Array([3])];
+ * const y = new Float64Array([2, 4, 6]);
+ *
+ * const reg = new LinearRegression();
+ * reg.fit(X, y);
+ * console.log(reg.coef_); // Float64Array [2]
+ * console.log(reg.intercept_); // ~0
+ * console.log(reg.predict([new Float64Array([4])])); // Float64Array [8]
+ * ```
+ */
+export class LinearRegression extends BaseEstimator {
+ fit_intercept: boolean;
+ copy_X: boolean;
+ positive: boolean;
+
+ coef_?: Float64Array;
+ intercept_?: number;
+ n_features_in_?: number;
+ rank_?: number;
+
+ constructor(params: LinearRegressionParams = {}) {
+ super();
+ this.fit_intercept = params.fit_intercept ?? true;
+ this.copy_X = params.copy_X ?? true;
+ this.positive = params.positive ?? false;
+ }
+
+ fit(X: Float64Array[], y: Float64Array): this {
+ checkXy(X, y);
+ checkArray(X);
+
+ const n = X.length;
+ const nFeatures = (X[0] ?? new Float64Array(0)).length;
+ this.n_features_in_ = nFeatures;
+
+ let XCenter = X;
+ let yCenter = y;
+ let xMean: Float64Array | undefined;
+ let yMean = 0;
+
+ if (this.fit_intercept) {
+ // Center X and y
+ xMean = new Float64Array(nFeatures);
+ for (let i = 0; i < n; i++) {
+ const row = X[i] ?? new Float64Array(nFeatures);
+ for (let j = 0; j < nFeatures; j++) {
+ xMean[j] = (xMean[j] ?? 0) + (row[j] ?? 0);
+ }
+ }
+ for (let j = 0; j < nFeatures; j++) {
+ xMean[j] = (xMean[j] ?? 0) / n;
+ }
+ yMean = 0;
+ for (const v of y) yMean += v;
+ yMean /= n;
+
+ XCenter = X.map((row) => {
+ const centered = new Float64Array(row);
+ for (let j = 0; j < centered.length; j++) {
+ centered[j] = (centered[j] ?? 0) - ((xMean as Float64Array)[j] ?? 0);
+ }
+ return centered;
+ });
+ yCenter = new Float64Array(y.length);
+ for (let i = 0; i < y.length; i++) {
+ yCenter[i] = (y[i] ?? 0) - yMean;
+ }
+ }
+
+ // Solve normal equations: (X.T @ X) @ β = X.T @ y
+ const XtX = gramMatrix(XCenter);
+ const Xty = xtDotY(XCenter, yCenter);
+
+ // Add tiny ridge to handle near-singular matrices
+ addDiagonal(XtX, 1e-12);
+
+ const coef = choleskyLinsolve(XtX, Xty);
+ this.coef_ = coef;
+ this.rank_ = nFeatures;
+
+ if (this.fit_intercept && xMean !== undefined) {
+ let intercept = yMean;
+ for (let j = 0; j < nFeatures; j++) {
+ intercept -= (coef[j] ?? 0) * (xMean[j] ?? 0);
+ }
+ this.intercept_ = intercept;
+ } else {
+ this.intercept_ = 0;
+ }
+
+ return this;
+ }
+
+ predict(X: Float64Array[]): Float64Array {
+ this._check_is_fitted(["coef_", "intercept_"]);
+ const coef = this.coef_ as Float64Array;
+ const intercept = this.intercept_ as number;
+ const yPred = safeDot(X, coef);
+ for (let i = 0; i < yPred.length; i++) {
+ yPred[i] = (yPred[i] ?? 0) + intercept;
+ }
+ return yPred;
+ }
+
+ /** R² score on test data. */
+ score(X: Float64Array[], y: Float64Array): number {
+ const yPred = this.predict(X);
+ const yMean = Array.from(y).reduce((a, b) => a + b, 0) / y.length;
+ let ssTot = 0;
+ let ssRes = 0;
+ for (let i = 0; i < y.length; i++) {
+ const yi = y[i] ?? 0;
+ ssTot += (yi - yMean) ** 2;
+ ssRes += (yi - (yPred[i] ?? 0)) ** 2;
+ }
+ return ssTot === 0 ? 1 : 1 - ssRes / ssTot;
+ }
+}
diff --git a/src/linear_model/ridge.ts b/src/linear_model/ridge.ts
new file mode 100644
index 0000000..eab9a65
--- /dev/null
+++ b/src/linear_model/ridge.ts
@@ -0,0 +1,156 @@
+/**
+ * Ridge Regression — L2-regularized Linear Regression.
+ * Mirrors sklearn.linear_model.Ridge.
+ *
+ * Minimizes: ||y - Xw||² + alpha * ||w||²
+ * Solved as: β = (X.T X + alpha * I)⁻¹ X.T y
+ */
+
+import { BaseEstimator } from "../base.js";
+import {
+ addDiagonal,
+ choleskyLinsolve,
+ gramMatrix,
+ safeDot,
+ xtDotY,
+} from "../utils/extmath.js";
+import { checkArray, checkXy } from "../utils/validation.js";
+
+export interface RidgeParams {
+ alpha?: number;
+ fit_intercept?: boolean;
+ copy_X?: boolean;
+ max_iter?: number;
+ tol?: number;
+ solver?: "auto" | "cholesky";
+}
+
+/**
+ * Linear least squares with L2 regularization.
+ *
+ * Equivalent to sklearn.linear_model.Ridge.
+ *
+ * @example
+ * ```ts
+ * import { Ridge } from 'tsikit-learn';
+ *
+ * const X = [new Float64Array([1, 0]), new Float64Array([0, 1]), new Float64Array([1, 1])];
+ * const y = new Float64Array([1, 2, 3]);
+ *
+ * const reg = new Ridge({ alpha: 1.0 });
+ * reg.fit(X, y);
+ * console.log(reg.coef_);
+ * ```
+ */
+export class Ridge extends BaseEstimator {
+ alpha: number;
+ fit_intercept: boolean;
+ copy_X: boolean;
+ max_iter: number;
+ tol: number;
+ solver: "auto" | "cholesky";
+
+ coef_?: Float64Array;
+ intercept_?: number;
+ n_features_in_?: number;
+ n_iter_?: number;
+
+ constructor(params: RidgeParams = {}) {
+ super();
+ this.alpha = params.alpha ?? 1.0;
+ this.fit_intercept = params.fit_intercept ?? true;
+ this.copy_X = params.copy_X ?? true;
+ this.max_iter = params.max_iter ?? 1000;
+ this.tol = params.tol ?? 1e-4;
+ this.solver = params.solver ?? "auto";
+ }
+
+ fit(X: Float64Array[], y: Float64Array): this {
+ checkXy(X, y);
+ checkArray(X);
+
+ const n = X.length;
+ const nFeatures = (X[0] ?? new Float64Array(0)).length;
+ this.n_features_in_ = nFeatures;
+
+ let XCenter = X;
+ let yCenter = y;
+ let xMean: Float64Array | undefined;
+ let yMean = 0;
+
+ if (this.fit_intercept) {
+ xMean = new Float64Array(nFeatures);
+ for (let i = 0; i < n; i++) {
+ const row = X[i] ?? new Float64Array(nFeatures);
+ for (let j = 0; j < nFeatures; j++) {
+ xMean[j] = (xMean[j] ?? 0) + (row[j] ?? 0);
+ }
+ }
+ for (let j = 0; j < nFeatures; j++) {
+ xMean[j] = (xMean[j] ?? 0) / n;
+ }
+ for (const v of y) yMean += v;
+ yMean /= n;
+
+ XCenter = X.map((row) => {
+ const centered = new Float64Array(row);
+ for (let j = 0; j < centered.length; j++) {
+ centered[j] = (centered[j] ?? 0) - ((xMean as Float64Array)[j] ?? 0);
+ }
+ return centered;
+ });
+ yCenter = new Float64Array(y.length);
+ for (let i = 0; i < y.length; i++) {
+ yCenter[i] = (y[i] ?? 0) - yMean;
+ }
+ }
+
+ // Solve (X.T @ X + alpha * I) @ β = X.T @ y
+ const XtX = gramMatrix(XCenter);
+ const Xty = xtDotY(XCenter, yCenter);
+
+ // Add alpha * I (ridge regularization)
+ addDiagonal(XtX, this.alpha);
+
+ const coef = choleskyLinsolve(XtX, Xty);
+ this.coef_ = coef;
+ this.n_iter_ = 1;
+
+ if (this.fit_intercept && xMean !== undefined) {
+ let intercept = yMean;
+ for (let j = 0; j < nFeatures; j++) {
+ intercept -= (coef[j] ?? 0) * (xMean[j] ?? 0);
+ }
+ this.intercept_ = intercept;
+ } else {
+ this.intercept_ = 0;
+ }
+
+ return this;
+ }
+
+ predict(X: Float64Array[]): Float64Array {
+ this._check_is_fitted(["coef_", "intercept_"]);
+ const coef = this.coef_ as Float64Array;
+ const intercept = this.intercept_ as number;
+ const yPred = safeDot(X, coef);
+ for (let i = 0; i < yPred.length; i++) {
+ yPred[i] = (yPred[i] ?? 0) + intercept;
+ }
+ return yPred;
+ }
+
+ /** R² score on test data. */
+ score(X: Float64Array[], y: Float64Array): number {
+ const yPred = this.predict(X);
+ const yMean = Array.from(y).reduce((a, b) => a + b, 0) / y.length;
+ let ssTot = 0;
+ let ssRes = 0;
+ for (let i = 0; i < y.length; i++) {
+ const yi = y[i] ?? 0;
+ ssTot += (yi - yMean) ** 2;
+ ssRes += (yi - (yPred[i] ?? 0)) ** 2;
+ }
+ return ssTot === 0 ? 1 : 1 - ssRes / ssTot;
+ }
+}
diff --git a/src/metrics/classification.ts b/src/metrics/classification.ts
new file mode 100644
index 0000000..408f55a
--- /dev/null
+++ b/src/metrics/classification.ts
@@ -0,0 +1,177 @@
+/**
+ * Classification metrics.
+ * Mirrors sklearn.metrics (classification subset).
+ */
+
+import { ValueError } from "../exceptions.js";
+
+/** Accuracy score. */
+export function accuracy_score(
+ yTrue: Float64Array | Int32Array,
+ yPred: Float64Array | Int32Array,
+ normalize = true,
+): number {
+ if (yTrue.length !== yPred.length) {
+ throw new ValueError("yTrue and yPred must have the same length");
+ }
+ let correct = 0;
+ for (let i = 0; i < yTrue.length; i++) {
+ if ((yTrue[i] ?? 0) === (yPred[i] ?? 0)) correct++;
+ }
+ return normalize ? (yTrue.length > 0 ? correct / yTrue.length : 0) : correct;
+}
+
+/** Confusion matrix. Returns a 2D array indexed as [actual][predicted]. */
+export function confusion_matrix(
+  yTrue: Float64Array | Int32Array,
+  yPred: Float64Array | Int32Array,
+  labels?: Int32Array,
+): number[][] {
+  // Use the sorted union of observed labels unless an explicit order is given.
+  const labelSet =
+    labels ??
+    (() => {
+      const s = new Set<number>();
+      for (const v of yTrue) s.add(v);
+      for (const v of yPred) s.add(v);
+      return new Int32Array([...s].sort((a, b) => a - b));
+    })();
+
+  const n = labelSet.length;
+  const labelIdx = new Map<number, number>();
+  for (let i = 0; i < n; i++) labelIdx.set(labelSet[i] ?? 0, i);
+
+  const matrix: number[][] = Array.from({ length: n }, () =>
+    new Array<number>(n).fill(0),
+  );
+  for (let i = 0; i < yTrue.length; i++) {
+    const ti = labelIdx.get(yTrue[i] ?? 0);
+    const pi = labelIdx.get(yPred[i] ?? 0);
+    // Pairs with labels outside the provided `labels` set are ignored.
+    if (ti !== undefined && pi !== undefined) {
+      const row = matrix[ti];
+      if (row !== undefined) row[pi] = (row[pi] ?? 0) + 1;
+    }
+  }
+  return matrix;
+}
+
+/** Precision score: tp / (tp + fp). Supports binary, macro, and micro averaging. */
+export function precision_score(
+  yTrue: Float64Array | Int32Array,
+  yPred: Float64Array | Int32Array,
+  options: { average?: "binary" | "macro" | "micro"; posLabel?: number } = {},
+): number {
+  const { average = "binary", posLabel = 1 } = options;
+  // Classes observed in yTrue, sorted ascending (used for macro averaging).
+  const classes = (() => {
+    const s = new Set<number>();
+    for (const v of yTrue) s.add(v);
+    return new Int32Array([...s].sort((a, b) => a - b));
+  })();
+
+  if (average === "binary") {
+    let tp = 0;
+    let fp = 0;
+    for (let i = 0; i < yTrue.length; i++) {
+      if ((yPred[i] ?? 0) === posLabel) {
+        if ((yTrue[i] ?? 0) === posLabel) tp++;
+        else fp++;
+      }
+    }
+    // Convention: return 0 (not NaN) when nothing was predicted positive.
+    return tp + fp === 0 ? 0 : tp / (tp + fp);
+  }
+
+  if (average === "macro") {
+    let total = 0;
+    for (const c of classes) {
+      let tp = 0;
+      let fp = 0;
+      for (let i = 0; i < yTrue.length; i++) {
+        if ((yPred[i] ?? 0) === c) {
+          if ((yTrue[i] ?? 0) === c) tp++;
+          else fp++;
+        }
+      }
+      total += tp + fp === 0 ? 0 : tp / (tp + fp);
+    }
+    return classes.length > 0 ? total / classes.length : 0;
+  }
+
+  // micro: pool all decisions; for single-label data this equals accuracy.
+  let tp = 0;
+  let fp = 0;
+  for (let i = 0; i < yTrue.length; i++) {
+    if ((yPred[i] ?? 0) === (yTrue[i] ?? 0)) tp++;
+    else fp++;
+  }
+  return tp + fp === 0 ? 0 : tp / (tp + fp);
+}
+
+/** Recall score: tp / (tp + fn). Supports binary, macro, and micro averaging. */
+export function recall_score(
+  yTrue: Float64Array | Int32Array,
+  yPred: Float64Array | Int32Array,
+  options: { average?: "binary" | "macro" | "micro"; posLabel?: number } = {},
+): number {
+  const { average = "binary", posLabel = 1 } = options;
+  // Classes observed in yTrue, sorted ascending (used for macro averaging).
+  const classes = (() => {
+    const s = new Set<number>();
+    for (const v of yTrue) s.add(v);
+    return new Int32Array([...s].sort((a, b) => a - b));
+  })();
+
+  if (average === "binary") {
+    let tp = 0;
+    let fn = 0;
+    for (let i = 0; i < yTrue.length; i++) {
+      if ((yTrue[i] ?? 0) === posLabel) {
+        if ((yPred[i] ?? 0) === posLabel) tp++;
+        else fn++;
+      }
+    }
+    // Convention: return 0 (not NaN) when the positive class never occurs.
+    return tp + fn === 0 ? 0 : tp / (tp + fn);
+  }
+
+  if (average === "macro") {
+    let total = 0;
+    for (const c of classes) {
+      let tp = 0;
+      let fn = 0;
+      for (let i = 0; i < yTrue.length; i++) {
+        if ((yTrue[i] ?? 0) === c) {
+          if ((yPred[i] ?? 0) === c) tp++;
+          else fn++;
+        }
+      }
+      total += tp + fn === 0 ? 0 : tp / (tp + fn);
+    }
+    return classes.length > 0 ? total / classes.length : 0;
+  }
+
+  // micro: pooled recall equals accuracy for single-label classification.
+  return accuracy_score(yTrue, yPred);
+}
+
+/** F1 score. */
+export function f1_score(
+ yTrue: Float64Array | Int32Array,
+ yPred: Float64Array | Int32Array,
+ options: { average?: "binary" | "macro" | "micro"; posLabel?: number } = {},
+): number {
+ const p = precision_score(yTrue, yPred, options);
+ const r = recall_score(yTrue, yPred, options);
+ return p + r === 0 ? 0 : (2 * p * r) / (p + r);
+}
+
+/** Log loss (cross-entropy). */
+export function log_loss(
+ yTrue: Float64Array | Int32Array,
+ yProba: Float64Array[],
+ eps = 1e-15,
+): number {
+ let total = 0;
+ for (let i = 0; i < yTrue.length; i++) {
+ const row = yProba[i] ?? new Float64Array(0);
+ const label = yTrue[i] ?? 0;
+ // For binary: row[1] is P(class=1)
+ const p = Math.min(1 - eps, Math.max(eps, row[label] ?? eps));
+ total += -Math.log(p);
+ }
+ return yTrue.length > 0 ? total / yTrue.length : 0;
+}
diff --git a/src/metrics/index.ts b/src/metrics/index.ts
new file mode 100644
index 0000000..96b3cab
--- /dev/null
+++ b/src/metrics/index.ts
@@ -0,0 +1,2 @@
+export * from "./regression.js";
+export * from "./classification.js";
diff --git a/src/metrics/regression.ts b/src/metrics/regression.ts
new file mode 100644
index 0000000..c42b5d3
--- /dev/null
+++ b/src/metrics/regression.ts
@@ -0,0 +1,120 @@
+/**
+ * Regression metrics.
+ * Mirrors sklearn.metrics (regression subset).
+ */
+
+import { ValueError } from "../exceptions.js";
+
+/** Mean squared error. */
+export function mean_squared_error(
+ yTrue: Float64Array,
+ yPred: Float64Array,
+ options: { sampleWeight?: Float64Array; squared?: boolean } = {},
+): number {
+ const { sampleWeight, squared = true } = options;
+ if (yTrue.length !== yPred.length) {
+ throw new ValueError("yTrue and yPred must have the same length");
+ }
+ let total = 0;
+ let wSum = 0;
+ for (let i = 0; i < yTrue.length; i++) {
+ const diff = (yTrue[i] ?? 0) - (yPred[i] ?? 0);
+ const w = sampleWeight ? (sampleWeight[i] ?? 1) : 1;
+ total += w * diff * diff;
+ wSum += w;
+ }
+ const mse = wSum > 0 ? total / wSum : 0;
+ return squared ? mse : Math.sqrt(mse);
+}
+
+/** Mean absolute error. */
+export function mean_absolute_error(
+ yTrue: Float64Array,
+ yPred: Float64Array,
+ sampleWeight?: Float64Array,
+): number {
+ if (yTrue.length !== yPred.length) {
+ throw new ValueError("yTrue and yPred must have the same length");
+ }
+ let total = 0;
+ let wSum = 0;
+ for (let i = 0; i < yTrue.length; i++) {
+ const w = sampleWeight ? (sampleWeight[i] ?? 1) : 1;
+ total += w * Math.abs((yTrue[i] ?? 0) - (yPred[i] ?? 0));
+ wSum += w;
+ }
+ return wSum > 0 ? total / wSum : 0;
+}
+
+/** R² score (coefficient of determination). */
+export function r2_score(
+ yTrue: Float64Array,
+ yPred: Float64Array,
+ sampleWeight?: Float64Array,
+): number {
+ if (yTrue.length !== yPred.length) {
+ throw new ValueError("yTrue and yPred must have the same length");
+ }
+ let wSum = 0;
+ let yMeanNum = 0;
+ for (let i = 0; i < yTrue.length; i++) {
+ const w = sampleWeight ? (sampleWeight[i] ?? 1) : 1;
+ yMeanNum += w * (yTrue[i] ?? 0);
+ wSum += w;
+ }
+ const yMean = wSum > 0 ? yMeanNum / wSum : 0;
+
+ let ssTot = 0;
+ let ssRes = 0;
+ for (let i = 0; i < yTrue.length; i++) {
+ const w = sampleWeight ? (sampleWeight[i] ?? 1) : 1;
+ const diff = (yTrue[i] ?? 0) - yMean;
+ ssTot += w * diff * diff;
+ ssRes += w * ((yTrue[i] ?? 0) - (yPred[i] ?? 0)) ** 2;
+ }
+ return ssTot === 0 ? 1 : 1 - ssRes / ssTot;
+}
+
+/**
+ * Mean absolute percentage error.
+ * Follows sklearn: the denominator is clamped to `eps`, so zero targets
+ * contribute a large but finite term instead of being silently skipped
+ * (the original skipped them yet still divided by the full length,
+ * biasing the mean downward).
+ */
+export function mean_absolute_percentage_error(
+  yTrue: Float64Array,
+  yPred: Float64Array,
+  eps = Number.EPSILON,
+): number {
+  if (yTrue.length !== yPred.length) {
+    throw new ValueError("yTrue and yPred must have the same length");
+  }
+  let total = 0;
+  for (let i = 0; i < yTrue.length; i++) {
+    const yt = yTrue[i] ?? 0;
+    total += Math.abs((yt - (yPred[i] ?? 0)) / Math.max(eps, Math.abs(yt)));
+  }
+  // Guard the empty-input case (the original returned NaN via 0/0).
+  return yTrue.length > 0 ? total / yTrue.length : 0;
+}
+
+/** Explained variance score. */
+export function explained_variance_score(
+ yTrue: Float64Array,
+ yPred: Float64Array,
+): number {
+ const n = yTrue.length;
+ let meanTrue = 0;
+ let meanErr = 0;
+ for (let i = 0; i < n; i++) {
+ meanTrue += yTrue[i] ?? 0;
+ meanErr += (yTrue[i] ?? 0) - (yPred[i] ?? 0);
+ }
+ meanTrue /= n;
+ meanErr /= n;
+
+ let varTrue = 0;
+ let varErr = 0;
+ for (let i = 0; i < n; i++) {
+ varTrue += ((yTrue[i] ?? 0) - meanTrue) ** 2;
+ varErr += ((yTrue[i] ?? 0) - (yPred[i] ?? 0) - meanErr) ** 2;
+ }
+ varTrue /= n;
+ varErr /= n;
+
+ return varTrue === 0 ? 0 : 1 - varErr / varTrue;
+}
diff --git a/src/model_selection/index.ts b/src/model_selection/index.ts
new file mode 100644
index 0000000..35a025e
--- /dev/null
+++ b/src/model_selection/index.ts
@@ -0,0 +1 @@
+export * from "./split.js";
diff --git a/src/model_selection/split.ts b/src/model_selection/split.ts
new file mode 100644
index 0000000..c693e53
--- /dev/null
+++ b/src/model_selection/split.ts
@@ -0,0 +1,219 @@
+/**
+ * Model selection utilities: train/test split and cross-validation.
+ * Mirrors sklearn.model_selection.
+ */
+
+import { ValueError } from "../exceptions.js";
+
+export interface TrainTestSplitOptions {
+ // Fraction of samples placed in the test set (default 0.25).
+ testSize?: number;
+ // Fraction of samples placed in the train set.
+ trainSize?: number;
+ // Seed for the deterministic shuffle (default 42).
+ randomState?: number;
+ // Whether to shuffle before splitting (default true).
+ shuffle?: boolean;
+ // Labels used for stratified splitting.
+ stratify?: Float64Array | Int32Array;
+}
+
+// Result of train_test_split: rows of X and entries of y partitioned into
+// matching train and test subsets (y keeps its input dtype).
+export interface TrainTestSplitResult {
+ XTrain: Float64Array[];
+ XTest: Float64Array[];
+ yTrain: Float64Array | Int32Array;
+ yTest: Float64Array | Int32Array;
+}
+
+/** Simple linear congruential generator for reproducible shuffles. */
+function lcg(seed: number): () => number {
+  let state = seed;
+  return (): number => {
+    // Numerical Recipes LCG constants, wrapped to 32 bits.
+    state = (state * 1664525 + 1013904223) & 0xffffffff;
+    // Reinterpret as unsigned and scale into [0, 1).
+    return (state >>> 0) / 2 ** 32;
+  };
+}
+
+/** Fisher-Yates shuffle of the indices 0..n-1, driven by the supplied RNG. */
+function shuffleIndices(n: number, rng: () => number): Int32Array {
+  const order = Int32Array.from({ length: n }, (_, i) => i);
+  for (let i = n - 1; i > 0; i--) {
+    // Pick a partner uniformly from the not-yet-fixed prefix [0, i].
+    const j = Math.floor(rng() * (i + 1));
+    const swap = order[i] ?? 0;
+    order[i] = order[j] ?? 0;
+    order[j] = swap;
+  }
+  return order;
+}
+
+/**
+ * Split arrays or matrices into random train and test subsets.
+ * Mirrors sklearn.model_selection.train_test_split.
+ */
+export function train_test_split(
+ X: Float64Array[],
+ y: Float64Array | Int32Array,
+ options: TrainTestSplitOptions = {},
+): TrainTestSplitResult {
+ const { testSize = 0.25, randomState = 42, shuffle = true } = options;
+ const n = X.length;
+ const nTest = Math.max(1, Math.round(n * testSize));
+ const nTrain = n - nTest;
+
+ if (nTrain <= 0) {
+ throw new ValueError(
+ `With n_samples=${n} and test_size=${testSize}, the resulting train set would be empty.`,
+ );
+ }
+
+ const rng = lcg(randomState);
+ const indices = shuffle
+ ? shuffleIndices(n, rng)
+ : (() => {
+ const idx = new Int32Array(n);
+ for (let i = 0; i < n; i++) idx[i] = i;
+ return idx;
+ })();
+
+ const trainIdx = indices.slice(0, nTrain);
+ const testIdx = indices.slice(nTrain);
+
+ const XTrain = Array.from(trainIdx, (i) => X[i] ?? new Float64Array(0));
+ const XTest = Array.from(testIdx, (i) => X[i] ?? new Float64Array(0));
+
+ const isInt = y instanceof Int32Array;
+ const yTrain = isInt
+ ? new Int32Array(Array.from(trainIdx, (i) => (y as Int32Array)[i] ?? 0))
+ : new Float64Array(
+ Array.from(trainIdx, (i) => (y as Float64Array)[i] ?? 0),
+ );
+ const yTest = isInt
+ ? new Int32Array(Array.from(testIdx, (i) => (y as Int32Array)[i] ?? 0))
+ : new Float64Array(Array.from(testIdx, (i) => (y as Float64Array)[i] ?? 0));
+
+ return { XTrain, XTest, yTrain, yTest };
+}
+
+export interface KFoldOptions {
+ // Number of folds (default 5).
+ nSplits?: number;
+ // Shuffle sample order before carving folds (default false).
+ shuffle?: boolean;
+ // Seed for the deterministic shuffle (default 0).
+ randomState?: number;
+}
+
+// One cross-validation fold: index arrays into the original sample order.
+export interface Fold {
+ trainIndex: Int32Array;
+ testIndex: Int32Array;
+}
+
+/**
+ * K-Folds cross-validator.
+ * Mirrors sklearn.model_selection.KFold.
+ */
+export class KFold {
+ // Number of folds to produce.
+ nSplits: number;
+ // Whether split() shuffles the sample indices first.
+ shuffle: boolean;
+ // Seed used when shuffle is true.
+ randomState: number;
+
+ constructor(options: KFoldOptions = {}) {
+ this.nSplits = options.nSplits ?? 5;
+ this.shuffle = options.shuffle ?? false;
+ this.randomState = options.randomState ?? 0;
+ }
+
+ /** Generate indices to split data into training and test sets. */
+ *split(X: Float64Array[]): Generator {
+ const n = X.length;
+ // Same guard as sklearn: cannot have more folds than samples.
+ if (this.nSplits > n) {
+ throw new ValueError(
+ `Cannot have number of splits n_splits=${this.nSplits} greater than the number of samples=${n}`,
+ );
+ }
+
+ // Identity permutation unless shuffle was requested.
+ const rng = lcg(this.randomState);
+ const indices = this.shuffle
+ ? shuffleIndices(n, rng)
+ : (() => {
+ const idx = new Int32Array(n);
+ for (let i = 0; i < n; i++) idx[i] = i;
+ return idx;
+ })();
+
+ // Base fold size; the first n % nSplits folds get one extra sample.
+ const foldSizes = new Int32Array(this.nSplits).fill(
+ Math.floor(n / this.nSplits),
+ );
+ for (let i = 0; i < n % this.nSplits; i++) {
+ foldSizes[i] = (foldSizes[i] ?? 0) + 1;
+ }
+
+ // Carve consecutive ranges of the permutation as test folds; everything
+ // outside the current range forms the train set.
+ let current = 0;
+ for (let fold = 0; fold < this.nSplits; fold++) {
+ const start = current;
+ const stop = current + (foldSizes[fold] ?? 0);
+ const testIndex = indices.slice(start, stop);
+ const trainIndex = new Int32Array([
+ ...Array.from(indices.slice(0, start)),
+ ...Array.from(indices.slice(stop)),
+ ]);
+ yield { trainIndex, testIndex };
+ current = stop;
+ }
+ }
+
+ /** Number of folds this splitter will produce. */
+ getNumSplits(): number {
+ return this.nSplits;
+ }
+}
+
+export interface StratifiedKFoldOptions {
+ // Number of folds (default 5).
+ nSplits?: number;
+ // Shuffle each class's indices before dealing them into folds.
+ shuffle?: boolean;
+ // Seed for the deterministic shuffle (default 0).
+ randomState?: number;
+}
+
+/**
+ * Stratified K-Folds cross-validator: each fold preserves the per-class
+ * sample proportions of y as closely as possible.
+ * Mirrors sklearn.model_selection.StratifiedKFold.
+ */
+export class StratifiedKFold {
+  nSplits: number;
+  shuffle: boolean;
+  randomState: number;
+
+  constructor(options: StratifiedKFoldOptions = {}) {
+    this.nSplits = options.nSplits ?? 5;
+    this.shuffle = options.shuffle ?? false;
+    this.randomState = options.randomState ?? 0;
+  }
+
+  /**
+   * Yield { trainIndex, testIndex } pairs, one per fold.
+   * @throws ValueError when nSplits exceeds the number of samples.
+   */
+  *split(X: Float64Array[], y: Float64Array | Int32Array): Generator {
+    const n = X.length;
+    // Same guard as KFold; previously this class accepted nSplits > n and
+    // silently yielded empty test folds.
+    if (this.nSplits > n) {
+      throw new ValueError(
+        `Cannot have number of splits n_splits=${this.nSplits} greater than the number of samples=${n}`,
+      );
+    }
+    const rng = lcg(this.randomState);
+
+    // Group sample indices by class label.
+    const classIndices = new Map<number, number[]>();
+    for (let i = 0; i < n; i++) {
+      const c = y[i] ?? 0;
+      const bucket = classIndices.get(c);
+      if (bucket) bucket.push(i);
+      else classIndices.set(c, [i]);
+    }
+
+    // Deal each class's indices round-robin into the folds so class
+    // proportions stay balanced across folds.
+    const foldIndices: number[][] = Array.from(
+      { length: this.nSplits },
+      () => [],
+    );
+    for (const [, idxList] of classIndices) {
+      let ordered = idxList;
+      if (this.shuffle) {
+        // Fisher-Yates instead of .sort(() => rng() - 0.5): a random sort
+        // comparator is biased and engine-dependent.
+        ordered = [...idxList];
+        for (let i = ordered.length - 1; i > 0; i--) {
+          const j = Math.floor(rng() * (i + 1));
+          const tmp = ordered[i] ?? 0;
+          ordered[i] = ordered[j] ?? 0;
+          ordered[j] = tmp;
+        }
+      }
+      ordered.forEach((idx, i) => {
+        (foldIndices[i % this.nSplits] as number[]).push(idx);
+      });
+    }
+
+    for (let fold = 0; fold < this.nSplits; fold++) {
+      const testIndex = new Int32Array(foldIndices[fold] as number[]);
+      const trainIndicesList: number[] = [];
+      for (let f = 0; f < this.nSplits; f++) {
+        if (f !== fold) trainIndicesList.push(...(foldIndices[f] as number[]));
+      }
+      yield { trainIndex: new Int32Array(trainIndicesList), testIndex };
+    }
+  }
+}
diff --git a/src/preprocessing/index.ts b/src/preprocessing/index.ts
new file mode 100644
index 0000000..7c8f35b
--- /dev/null
+++ b/src/preprocessing/index.ts
@@ -0,0 +1,4 @@
+export * from "./standard_scaler.js";
+export * from "./minmax_scaler.js";
+export * from "./label_encoder.js";
+export * from "./normalizer.js";
diff --git a/src/preprocessing/label_encoder.ts b/src/preprocessing/label_encoder.ts
new file mode 100644
index 0000000..e6bdd91
--- /dev/null
+++ b/src/preprocessing/label_encoder.ts
@@ -0,0 +1,56 @@
+/**
+ * LabelEncoder — encode target labels with value between 0 and n_classes-1.
+ * Mirrors sklearn.preprocessing.LabelEncoder.
+ */
+
+import { BaseEstimator } from "../base.js";
+import { ValueError } from "../exceptions.js";
+
+/**
+ * Encode target labels with values between 0 and n_classes-1.
+ * NOTE(review): classes_ is stored as an Int32Array, so non-integer labels
+ * are truncated by the typed-array write — confirm labels are integral.
+ */
+export class LabelEncoder extends BaseEstimator {
+  /** Sorted unique labels seen during fit. */
+  classes_?: Int32Array;
+
+  /** Learn the sorted set of unique labels present in y. */
+  fit(y: Float64Array | Int32Array): this {
+    // Typed Set: the bare `new Set()` inferred Set<unknown>, which breaks
+    // the numeric sort comparator under strict tsc.
+    const unique = new Set<number>();
+    for (const v of y) unique.add(v);
+    this.classes_ = new Int32Array([...unique].sort((a, b) => a - b));
+    return this;
+  }
+
+  /**
+   * Map each label to its index in classes_.
+   * @throws ValueError for labels not seen during fit.
+   */
+  transform(y: Float64Array | Int32Array): Int32Array {
+    this._check_is_fitted(["classes_"]);
+    const classes = this.classes_ as Int32Array;
+    const classMap = new Map<number, number>();
+    for (let i = 0; i < classes.length; i++) {
+      classMap.set(classes[i] ?? 0, i);
+    }
+    const result = new Int32Array(y.length);
+    for (let i = 0; i < y.length; i++) {
+      const encoded = classMap.get(y[i] ?? 0);
+      if (encoded === undefined) {
+        throw new ValueError(
+          `y contains previously unseen labels: ${String(y[i])}`,
+        );
+      }
+      result[i] = encoded;
+    }
+    return result;
+  }
+
+  /**
+   * Map encoded indices back to the original labels.
+   * @throws ValueError for indices outside [0, n_classes).
+   */
+  inverse_transform(y: Int32Array): Int32Array {
+    this._check_is_fitted(["classes_"]);
+    const classes = this.classes_ as Int32Array;
+    const result = new Int32Array(y.length);
+    for (let i = 0; i < y.length; i++) {
+      const idx = y[i] ?? 0;
+      if (idx < 0 || idx >= classes.length) {
+        throw new ValueError("y contains values not in the fitted classes");
+      }
+      result[i] = classes[idx] ?? 0;
+    }
+    return result;
+  }
+
+  /** Fit to y, then return the encoded labels. */
+  fit_transform(y: Float64Array | Int32Array): Int32Array {
+    return this.fit(y).transform(y);
+  }
+}
diff --git a/src/preprocessing/minmax_scaler.ts b/src/preprocessing/minmax_scaler.ts
new file mode 100644
index 0000000..2773fc8
--- /dev/null
+++ b/src/preprocessing/minmax_scaler.ts
@@ -0,0 +1,112 @@
+/**
+ * MinMaxScaler — scales features to a given range.
+ * Mirrors sklearn.preprocessing.MinMaxScaler.
+ */
+
+import { BaseEstimator } from "../base.js";
+import { ValueError } from "../exceptions.js";
+import { checkArray } from "../utils/validation.js";
+
+export interface MinMaxScalerParams {
+ // Target [min, max] for transformed features (default [0, 1]).
+ feature_range?: [number, number];
+ // Return copies from transform (default true).
+ copy?: boolean;
+ // Clamp transformed values into feature_range (default false).
+ clip?: boolean;
+}
+
+export class MinMaxScaler extends BaseEstimator {
+ // Target [min, max] range for transformed features.
+ feature_range: [number, number];
+ // Return copies from transform (true) or scale rows in place (false).
+ copy: boolean;
+ // Clamp transformed values into feature_range.
+ clip: boolean;
+
+ // Fitted statistics (set by fit):
+ data_min_?: Float64Array;
+ data_max_?: Float64Array;
+ data_range_?: Float64Array;
+ // NOTE(review): scale_ is 0 for constant features; transform then maps
+ // them to min_ and inverse_transform returns 0 for them — confirm this
+ // divergence from sklearn (which substitutes a unit scale) is intended.
+ scale_?: Float64Array;
+ min_?: Float64Array;
+ n_features_in_?: number;
+ n_samples_seen_?: number;
+
+ constructor(params: MinMaxScalerParams = {}) {
+ super();
+ this.feature_range = params.feature_range ?? [0, 1];
+ this.copy = params.copy ?? true;
+ this.clip = params.clip ?? false;
+ }
+
+ // Learn per-feature min/max and the derived affine scaling
+ // x' = x * scale_ + min_ that maps data into feature_range.
+ fit(X: Float64Array[], _y?: Float64Array | Int32Array): this {
+ checkArray(X);
+ const [rMin, rMax] = this.feature_range;
+ if (rMin >= rMax) {
+ throw new ValueError(
+ `Minimum of desired feature range must be smaller than maximum. Got ${String(this.feature_range)}.`,
+ );
+ }
+ const n = X.length;
+ const p = (X[0] ?? new Float64Array(0)).length;
+ this.n_samples_seen_ = n;
+ this.n_features_in_ = p;
+
+ // Column-wise min/max over all rows.
+ const dataMin = new Float64Array(p).fill(Number.POSITIVE_INFINITY);
+ const dataMax = new Float64Array(p).fill(Number.NEGATIVE_INFINITY);
+ for (const row of X) {
+ for (let j = 0; j < p; j++) {
+ const v = row[j] ?? 0;
+ if (v < (dataMin[j] ?? Number.POSITIVE_INFINITY)) dataMin[j] = v;
+ if (v > (dataMax[j] ?? Number.NEGATIVE_INFINITY)) dataMax[j] = v;
+ }
+ }
+ this.data_min_ = dataMin;
+ this.data_max_ = dataMax;
+ this.data_range_ = Float64Array.from(
+ dataMax,
+ (v, i) => v - (dataMin[i] ?? 0),
+ );
+ const rangeScale = rMax - rMin;
+ this.scale_ = Float64Array.from(this.data_range_, (v) =>
+ v === 0 ? 0 : rangeScale / v,
+ );
+ this.min_ = Float64Array.from(
+ this.scale_,
+ (v, i) => rMin - v * (dataMin[i] ?? 0),
+ );
+ return this;
+ }
+
+ // Apply the fitted affine map, optionally clamping into feature_range.
+ transform(X: Float64Array[]): Float64Array[] {
+ this._check_is_fitted(["scale_", "min_"]);
+ const scale = this.scale_ as Float64Array;
+ const min = this.min_ as Float64Array;
+ const [rMin, rMax] = this.feature_range;
+ return X.map((row) => {
+ const out = this.copy ? new Float64Array(row) : row;
+ for (let j = 0; j < out.length; j++) {
+ out[j] = (out[j] ?? 0) * (scale[j] ?? 1) + (min[j] ?? 0);
+ if (this.clip) {
+ out[j] = Math.max(rMin, Math.min(rMax, out[j] ?? 0));
+ }
+ }
+ return out;
+ });
+ }
+
+ // Invert the affine map; always returns fresh copies.
+ inverse_transform(X: Float64Array[]): Float64Array[] {
+ this._check_is_fitted(["scale_", "min_"]);
+ const scale = this.scale_ as Float64Array;
+ const min = this.min_ as Float64Array;
+ return X.map((row) => {
+ const out = new Float64Array(row);
+ for (let j = 0; j < out.length; j++) {
+ const s = scale[j] ?? 0;
+ out[j] = s !== 0 ? ((out[j] ?? 0) - (min[j] ?? 0)) / s : 0;
+ }
+ return out;
+ });
+ }
+
+ // Convenience: fit to X then return the scaled X.
+ fit_transform(
+ X: Float64Array[],
+ y?: Float64Array | Int32Array,
+ ): Float64Array[] {
+ return this.fit(X, y).transform(X);
+ }
+}
diff --git a/src/preprocessing/normalizer.ts b/src/preprocessing/normalizer.ts
new file mode 100644
index 0000000..ab7ef5d
--- /dev/null
+++ b/src/preprocessing/normalizer.ts
@@ -0,0 +1,71 @@
+/**
+ * Normalizer — normalize samples individually to unit norm.
+ * Mirrors sklearn.preprocessing.Normalizer.
+ */
+
+import { BaseEstimator } from "../base.js";
+import { ValueError } from "../exceptions.js";
+
+// Supported per-sample norms.
+export type NormType = "l1" | "l2" | "max";
+
+export interface NormalizerParams {
+ // Which norm to scale each sample by (default "l2").
+ norm?: NormType;
+ // Return copies from transform (default true).
+ copy?: boolean;
+}
+
+/**
+ * Scale each sample (row) independently to unit norm; all-zero rows are
+ * returned unchanged. Stateless: fit() does nothing.
+ */
+export class Normalizer extends BaseEstimator {
+  /** Which norm to divide each sample by: "l1", "l2" or "max". */
+  norm: NormType;
+  /** When true (default) transform returns copies instead of mutating rows. */
+  copy: boolean;
+
+  constructor(params: NormalizerParams = {}) {
+    super();
+    this.norm = params.norm ?? "l2";
+    this.copy = params.copy ?? true;
+  }
+
+  /** No-op: Normalizer keeps no fitted state. */
+  fit(_X: Float64Array[], _y?: Float64Array | Int32Array): this {
+    return this;
+  }
+
+  /** Divide every row by its norm; rows whose norm is 0 are left as-is. */
+  transform(X: Float64Array[]): Float64Array[] {
+    const result: Float64Array[] = [];
+    for (const sample of X) {
+      const row = this.copy ? new Float64Array(sample) : sample;
+      const magnitude = this._computeNorm(row);
+      if (magnitude !== 0) {
+        for (let j = 0; j < row.length; j++) {
+          row[j] = (row[j] ?? 0) / magnitude;
+        }
+      }
+      result.push(row);
+    }
+    return result;
+  }
+
+  /** Equivalent to transform(); fit is a no-op. */
+  fit_transform(
+    X: Float64Array[],
+    _y?: Float64Array | Int32Array,
+  ): Float64Array[] {
+    return this.transform(X);
+  }
+
+  /** Compute the configured norm of one row. */
+  private _computeNorm(row: Float64Array): number {
+    if (this.norm === "l1") {
+      let acc = 0;
+      for (const v of row) acc += Math.abs(v);
+      return acc;
+    }
+    if (this.norm === "l2") {
+      let acc = 0;
+      for (const v of row) acc += v * v;
+      return Math.sqrt(acc);
+    }
+    if (this.norm === "max") {
+      let best = 0;
+      for (const v of row) best = Math.max(best, Math.abs(v));
+      return best;
+    }
+    throw new ValueError(`Unknown norm: ${String(this.norm)}`);
+  }
+}
diff --git a/src/preprocessing/standard_scaler.ts b/src/preprocessing/standard_scaler.ts
new file mode 100644
index 0000000..576cb12
--- /dev/null
+++ b/src/preprocessing/standard_scaler.ts
@@ -0,0 +1,98 @@
+/**
+ * StandardScaler — zero-mean, unit-variance normalization.
+ * Mirrors sklearn.preprocessing.StandardScaler.
+ */
+
+import { BaseEstimator, TransformerMixin } from "../base.js";
+import { ValueError } from "../exceptions.js";
+import { checkArray, checkFeaturesConsistency } from "../utils/validation.js";
+
+export interface StandardScalerParams {
+ // Return copies from transform (default true).
+ copy?: boolean;
+ // Subtract the per-feature mean (default true).
+ with_mean?: boolean;
+ // Divide by the per-feature standard deviation (default true).
+ with_std?: boolean;
+}
+
+/**
+ * Zero-mean, unit-variance standardization of features.
+ */
+export class StandardScaler extends BaseEstimator {
+  /** Return copies from transform (true) or scale rows in place (false). */
+  copy: boolean;
+  /** Subtract the per-feature mean when transforming. */
+  with_mean: boolean;
+  /** Divide by the per-feature standard deviation when transforming. */
+  with_std: boolean;
+
+  mean_?: Float64Array; // per-feature mean learned by fit
+  scale_?: Float64Array; // per-feature std (1.0 for zero-variance columns)
+  var_?: Float64Array; // per-feature sample variance (ddof = 1)
+  n_features_in_?: number; // feature count seen at fit time
+  n_samples_seen_?: number; // sample count seen at fit time
+
+  constructor(params: StandardScalerParams = {}) {
+    super();
+    this.copy = params.copy ?? true;
+    this.with_mean = params.with_mean ?? true;
+    this.with_std = params.with_std ?? true;
+  }
+
+  /** Learn per-feature mean and variance via Welford's online algorithm. */
+  fit(X: Float64Array[], _y?: Float64Array | Int32Array): this {
+    checkArray(X);
+    const n = X.length;
+    const p = (X[0] ?? new Float64Array(0)).length;
+    this.n_samples_seen_ = n;
+    this.n_features_in_ = p;
+
+    const mean = new Float64Array(p);
+    const M2 = new Float64Array(p);
+
+    // Welford's online algorithm for mean and variance
+    for (let i = 0; i < n; i++) {
+      const row = X[i] ?? new Float64Array(p);
+      for (let j = 0; j < p; j++) {
+        const x = row[j] ?? 0;
+        const delta = x - (mean[j] ?? 0);
+        mean[j] = (mean[j] ?? 0) + delta / (i + 1);
+        M2[j] = (M2[j] ?? 0) + delta * (x - (mean[j] ?? 0));
+      }
+    }
+
+    this.mean_ = mean;
+    const variance =
+      n > 1 ? Float64Array.from(M2, (v) => v / (n - 1)) : new Float64Array(p);
+    this.var_ = variance;
+    // `|| 1.0` keeps zero-variance features from causing divide-by-zero.
+    this.scale_ = Float64Array.from(variance, (v) => Math.sqrt(v) || 1.0);
+    return this;
+  }
+
+  /**
+   * Standardize X using the fitted statistics.
+   * @throws ValueError when X's feature count differs from the fit-time X.
+   */
+  transform(X: Float64Array[]): Float64Array[] {
+    this._check_is_fitted(["mean_", "scale_"]);
+    // Fix: the original called checkFeaturesConsistency(X, X), comparing X
+    // against itself — a check that could never fail. Validate against the
+    // feature count recorded during fit instead.
+    const expected = this.n_features_in_ ?? 0;
+    const got = (X[0] ?? new Float64Array(0)).length;
+    if (X.length > 0 && got !== expected) {
+      throw new ValueError(
+        `X has ${got} features, but the estimator was trained with ${expected} features`,
+      );
+    }
+    const mean = this.mean_ as Float64Array;
+    const scale = this.scale_ as Float64Array;
+    return X.map((row) => {
+      const out = this.copy ? new Float64Array(row) : row;
+      for (let j = 0; j < out.length; j++) {
+        if (this.with_mean) out[j] = (out[j] ?? 0) - (mean[j] ?? 0);
+        if (this.with_std) out[j] = (out[j] ?? 0) / (scale[j] ?? 1);
+      }
+      return out;
+    });
+  }
+
+  /** Undo transform: multiply by scale then add the mean back. */
+  inverse_transform(X: Float64Array[]): Float64Array[] {
+    this._check_is_fitted(["mean_", "scale_"]);
+    const mean = this.mean_ as Float64Array;
+    const scale = this.scale_ as Float64Array;
+    return X.map((row) => {
+      const out = new Float64Array(row);
+      for (let j = 0; j < out.length; j++) {
+        if (this.with_std) out[j] = (out[j] ?? 0) * (scale[j] ?? 1);
+        if (this.with_mean) out[j] = (out[j] ?? 0) + (mean[j] ?? 0);
+      }
+      return out;
+    });
+  }
+
+  /** Convenience: fit to X then return the standardized X. */
+  fit_transform(
+    X: Float64Array[],
+    y?: Float64Array | Int32Array,
+  ): Float64Array[] {
+    return this.fit(X, y).transform(X);
+  }
+}
diff --git a/src/utils/class_weight.ts b/src/utils/class_weight.ts
new file mode 100644
index 0000000..9c23c4b
--- /dev/null
+++ b/src/utils/class_weight.ts
@@ -0,0 +1,69 @@
+/**
+ * Class weight utilities.
+ * Mirrors sklearn.utils.class_weight.
+ */
+
+import { ValueError } from "../exceptions.js";
+
+/**
+ * Compute class weights for imbalanced datasets.
+ * For 'balanced': n_samples / (n_classes * bincount(y))
+ * @throws ValueError for classes absent from y or missing from the mapping.
+ */
+export function computeClassWeight(
+  // Fix: bare `Record` has no type arguments and does not compile under tsc.
+  classWeight: "balanced" | Record<number, number>,
+  classes: Int32Array,
+  y: Float64Array | Int32Array,
+): Float64Array {
+  const weights = new Float64Array(classes.length);
+
+  if (classWeight === "balanced") {
+    const nSamples = y.length;
+    const nClasses = classes.length;
+    // Count occurrences of each expected class in y.
+    const counts = new Map<number, number>();
+    for (const c of classes) counts.set(c, 0);
+    for (const v of y) {
+      const cur = counts.get(v);
+      if (cur !== undefined) counts.set(v, cur + 1);
+    }
+    for (let i = 0; i < classes.length; i++) {
+      const c = classes[i] ?? 0;
+      const count = counts.get(c) ?? 0;
+      if (count === 0) {
+        throw new ValueError(`Class ${c} is not present in y`);
+      }
+      weights[i] = nSamples / (nClasses * count);
+    }
+  } else {
+    // Explicit per-class weights supplied by the caller.
+    for (let i = 0; i < classes.length; i++) {
+      const c = classes[i] ?? 0;
+      const w = classWeight[c];
+      if (w === undefined) {
+        throw new ValueError(`Class ${c} is not in classWeight`);
+      }
+      weights[i] = w;
+    }
+  }
+  return weights;
+}
+
+/**
+ * Compute per-sample weights from class weights: each sample receives the
+ * weight of its class as produced by computeClassWeight.
+ */
+export function computeSampleWeight(
+  // Fix: bare `Record` has no type arguments and does not compile under tsc.
+  classWeight: "balanced" | Record<number, number>,
+  y: Float64Array | Int32Array,
+): Float64Array {
+  const uniqueClasses = new Set<number>();
+  for (const v of y) uniqueClasses.add(v);
+  const classes = new Int32Array([...uniqueClasses].sort((a, b) => a - b));
+  const cw = computeClassWeight(classWeight, classes, y);
+  // Index the per-class weights by label for O(1) per-sample lookup.
+  const classToWeight = new Map<number, number>();
+  for (let i = 0; i < classes.length; i++) {
+    classToWeight.set(classes[i] ?? 0, cw[i] ?? 1.0);
+  }
+  const sampleWeights = new Float64Array(y.length);
+  for (let i = 0; i < y.length; i++) {
+    sampleWeights[i] = classToWeight.get(y[i] ?? 0) ?? 1.0;
+  }
+  return sampleWeights;
+}
diff --git a/src/utils/extmath.ts b/src/utils/extmath.ts
new file mode 100644
index 0000000..43a42cb
--- /dev/null
+++ b/src/utils/extmath.ts
@@ -0,0 +1,221 @@
+/**
+ * Mathematical utilities for tsikit-learn.
+ * Mirrors sklearn.utils.extmath.
+ */
+
+/** Compute the log of the logistic function element-wise. */
+export function logLogistic(x: Float64Array): Float64Array {
+  // log(1 / (1 + exp(-x))) in a numerically stable branch-per-sign form.
+  return Float64Array.from(x, (xi) =>
+    xi >= 0 ? -Math.log1p(Math.exp(-xi)) : xi - Math.log1p(Math.exp(xi)),
+  );
+}
+
+/**
+ * Compute softmax values for each row of X (in place unless copy=true).
+ * Uses the max-subtraction trick for numerical stability.
+ */
+export function softmax(X: Float64Array[], copy = true): Float64Array[] {
+  const result = copy ? X.map((row) => new Float64Array(row)) : X;
+  for (const row of result) {
+    if (row.length === 0) continue; // avoid 0/0 -> NaN on empty rows
+    // Loop instead of Math.max(...row): spreading a very large row can
+    // exceed the engine's argument limit and throw.
+    let maxVal = Number.NEGATIVE_INFINITY;
+    for (const v of row) if (v > maxVal) maxVal = v;
+    let sum = 0;
+    for (let j = 0; j < row.length; j++) {
+      row[j] = Math.exp((row[j] ?? 0) - maxVal);
+      sum += row[j] ?? 0;
+    }
+    for (let j = 0; j < row.length; j++) {
+      row[j] = (row[j] ?? 0) / sum;
+    }
+  }
+  return result;
+}
+
+/** Compute row norms of a matrix (L2 norms, or squared norms if requested). */
+export function rowNorms(X: Float64Array[], squared = false): Float64Array {
+  return Float64Array.from(X, (row) => {
+    let sq = 0;
+    for (const v of row) sq += v * v;
+    return squared ? sq : Math.sqrt(sq);
+  });
+}
+
+/** Safe sparse dot (dense version). Computes X @ y. */
+export function safeDot(X: Float64Array[], y: Float64Array): Float64Array {
+  return Float64Array.from(X, (row) => {
+    let acc = 0;
+    row.forEach((value, j) => {
+      acc += value * (y[j] ?? 0);
+    });
+    return acc;
+  });
+}
+
+/** Matrix transpose. */
+export function transpose(X: Float64Array[]): Float64Array[] {
+  if (X.length === 0) return [];
+  const rows = X.length;
+  const cols = (X[0] ?? new Float64Array(0)).length;
+  const out: Float64Array[] = [];
+  // Build one output row (input column) at a time.
+  for (let j = 0; j < cols; j++) {
+    const column = new Float64Array(rows);
+    for (let i = 0; i < rows; i++) {
+      column[i] = (X[i] ?? new Float64Array(0))[j] ?? 0;
+    }
+    out.push(column);
+  }
+  return out;
+}
+
+/** Matrix-matrix multiply: A @ B. */
+// Assumes A's column count equals B.length; no dimension check is performed.
+export function matMul(A: Float64Array[], B: Float64Array[]): Float64Array[] {
+ if (A.length === 0 || B.length === 0) return [];
+ const nRows = A.length;
+ const nCols = (B[0] ?? new Float64Array(0)).length;
+ const nInner = B.length;
+ const result: Float64Array[] = Array.from(
+ { length: nRows },
+ () => new Float64Array(nCols),
+ );
+ // i-k-j loop order walks each B[k] row sequentially.
+ for (let i = 0; i < nRows; i++) {
+ for (let k = 0; k < nInner; k++) {
+ const aik = (A[i] ?? new Float64Array(0))[k] ?? 0;
+ // Skip zero A entries — avoids the inner loop for sparse-ish inputs.
+ if (aik === 0) continue;
+ for (let j = 0; j < nCols; j++) {
+ const resultRow = result[i] ?? new Float64Array(0);
+ resultRow[j] =
+ (resultRow[j] ?? 0) + aik * ((B[k] ?? new Float64Array(0))[j] ?? 0);
+ }
+ }
+ }
+ return result;
+}
+
+/**
+ * Solve a lower triangular system Lx = b using forward substitution.
+ * NOTE(review): the `?? 1` fallback only covers a missing diagonal entry;
+ * an explicit zero pivot still divides by zero (±Infinity/NaN) — confirm
+ * callers pass non-singular L.
+ */
+export function forwardSubstitution(
+ L: Float64Array[],
+ b: Float64Array,
+): Float64Array {
+ const n = b.length;
+ const x = new Float64Array(n);
+ for (let i = 0; i < n; i++) {
+ // Subtract the already-solved components from b[i], then divide by L[i][i].
+ let sum = b[i] ?? 0;
+ for (let j = 0; j < i; j++) {
+ sum -= ((L[i] ?? new Float64Array(0))[j] ?? 0) * (x[j] ?? 0);
+ }
+ x[i] = sum / ((L[i] ?? new Float64Array(0))[i] ?? 1);
+ }
+ return x;
+}
+
+/**
+ * Solve an upper triangular system Ux = b using back substitution.
+ * NOTE(review): as with forwardSubstitution, the `?? 1` fallback does not
+ * protect against an explicit zero pivot — confirm U is non-singular.
+ */
+export function backSubstitution(
+ U: Float64Array[],
+ b: Float64Array,
+): Float64Array {
+ const n = b.length;
+ const x = new Float64Array(n);
+ // Iterate from the last row upward; each x[i] depends on x[i+1..n-1].
+ for (let i = n - 1; i >= 0; i--) {
+ let sum = b[i] ?? 0;
+ for (let j = i + 1; j < n; j++) {
+ sum -= ((U[i] ?? new Float64Array(0))[j] ?? 0) * (x[j] ?? 0);
+ }
+ x[i] = sum / ((U[i] ?? new Float64Array(0))[i] ?? 1);
+ }
+ return x;
+}
+
+/**
+ * Cholesky decomposition of a symmetric positive definite matrix.
+ * Returns L such that A = L @ L.T
+ * NOTE(review): Math.max(sum, 0) clamps negative diagonal terms, so a
+ * non-positive-definite input silently yields a degenerate factor instead
+ * of an error — confirm callers guarantee SPD inputs.
+ */
+export function cholesky(A: Float64Array[]): Float64Array[] {
+ const n = A.length;
+ const L: Float64Array[] = Array.from(
+ { length: n },
+ () => new Float64Array(n),
+ );
+ // Standard Cholesky-Crout: fill the lower triangle column by column.
+ for (let i = 0; i < n; i++) {
+ for (let j = 0; j <= i; j++) {
+ let sum = (A[i] ?? new Float64Array(0))[j] ?? 0;
+ for (let k = 0; k < j; k++) {
+ sum -=
+ ((L[i] ?? new Float64Array(0))[k] ?? 0) *
+ ((L[j] ?? new Float64Array(0))[k] ?? 0);
+ }
+ if (i === j) {
+ (L[i] ?? new Float64Array(0))[j] = Math.sqrt(Math.max(sum, 0));
+ } else {
+ // Guard against a zero diagonal produced by the clamp above.
+ const ljj = (L[j] ?? new Float64Array(0))[j] ?? 1;
+ (L[i] ?? new Float64Array(0))[j] = ljj !== 0 ? sum / ljj : 0;
+ }
+ }
+ }
+ return L;
+}
+
+/**
+ * Solve the linear system Ax = b via Cholesky: factor A = L L^T, then
+ * perform two triangular solves. A must be symmetric positive definite.
+ */
+export function choleskyLinsolve(
+  A: Float64Array[],
+  b: Float64Array,
+): Float64Array {
+  const lower = cholesky(A);
+  // Forward solve L y = b, then back solve L^T x = y.
+  const intermediate = forwardSubstitution(lower, b);
+  return backSubstitution(transpose(lower), intermediate);
+}
+
+/** Compute the Euclidean distance between two vectors. */
+export function euclideanDistance(a: Float64Array, b: Float64Array): number {
+  let sumSq = 0;
+  a.forEach((av, i) => {
+    const d = av - (b[i] ?? 0);
+    sumSq += d * d;
+  });
+  return Math.sqrt(sumSq);
+}
+
+/** Add identity * alpha to a matrix (in-place). */
+export function addDiagonal(A: Float64Array[], alpha: number): Float64Array[] {
+  A.forEach((row, i) => {
+    row[i] = (row[i] ?? 0) + alpha;
+  });
+  return A;
+}
+
+/** Compute X.T @ X (Gram matrix). */
+export function gramMatrix(X: Float64Array[]): Float64Array[] {
+  return matMul(transpose(X), X);
+}
+
+/** Compute X.T @ y. */
+export function xtDotY(X: Float64Array[], y: Float64Array): Float64Array {
+  const p = (X[0] ?? new Float64Array(0)).length;
+  const acc = new Float64Array(p);
+  // Accumulate each row of X scaled by its matching y entry.
+  X.forEach((row, i) => {
+    const yi = y[i] ?? 0;
+    for (let j = 0; j < p; j++) {
+      acc[j] = (acc[j] ?? 0) + (row[j] ?? 0) * yi;
+    }
+  });
+  return acc;
+}
diff --git a/src/utils/index.ts b/src/utils/index.ts
new file mode 100644
index 0000000..2ea8323
--- /dev/null
+++ b/src/utils/index.ts
@@ -0,0 +1,4 @@
+export * from "./extmath.js";
+export * from "./validation.js";
+export * from "./multiclass.js";
+export * from "./class_weight.js";
diff --git a/src/utils/multiclass.ts b/src/utils/multiclass.ts
new file mode 100644
index 0000000..cd461ad
--- /dev/null
+++ b/src/utils/multiclass.ts
@@ -0,0 +1,68 @@
+/**
+ * Multiclass utilities.
+ * Mirrors sklearn.utils.multiclass.
+ */
+
+import { ValueError } from "../exceptions.js";
+
+// Target kinds distinguished by typeOfTarget (sklearn's type_of_target
+// vocabulary; this port only ever returns binary/multiclass/continuous).
+export type MulticlassType =
+ | "binary"
+ | "multiclass"
+ | "multiclass-multioutput"
+ | "multilabel-indicator"
+ | "continuous"
+ | "continuous-multioutput"
+ | "unknown";
+
+/** Determine the type of target variable. */
+export function typeOfTarget(y: Float64Array | Int32Array): MulticlassType {
+  const distinct = new Set<number>();
+  for (const value of y) distinct.add(value);
+
+  // A single fractional value makes the target regression-like.
+  for (const value of distinct) {
+    if (!Number.isInteger(value)) return "continuous";
+  }
+
+  return distinct.size <= 2 ? "binary" : "multiclass";
+}
+
+/** Return sorted unique class labels pooled across all given arrays. */
+export function uniqueLabels(...ys: (Float64Array | Int32Array)[]): Int32Array {
+  // Typed Set: the bare `new Set()` inferred Set<unknown>, which breaks the
+  // numeric sort comparator and the Int32Array construction under strict tsc.
+  const all = new Set<number>();
+  for (const y of ys) {
+    for (const v of y) all.add(v);
+  }
+  return new Int32Array([...all].sort((a, b) => a - b));
+}
+
+/** Check if classification is binary. */
+export function isBinaryClassification(y: Float64Array | Int32Array): boolean {
+  const distinct = new Set<number>();
+  for (const label of y) distinct.add(label);
+  return distinct.size === 2;
+}
+
+/** Check if classification is multilabel. */
+export function isMultilabel(_y: Float64Array[]): boolean {
+  // Dense 1-D targets only in this simplified port, so never multilabel.
+  return false;
+}
+
+/** Return the number of classes for a label array. */
+export function classCount(y: Float64Array | Int32Array): number {
+  const distinct = new Set<number>();
+  for (const label of y) distinct.add(label);
+  return distinct.size;
+}
+
+/** Validate that y only contains values in the expected classes. */
+export function checkClassificationTargets(y: Float64Array | Int32Array): void {
+  const targetType = typeOfTarget(y);
+  // Only continuous targets are rejected; everything else is classifiable.
+  if (targetType !== "continuous") return;
+  throw new ValueError(
+    `Unknown label type: ${targetType}. Maybe you are trying to fit a classifier, which expects discrete classes.`,
+  );
+}
diff --git a/src/utils/validation.ts b/src/utils/validation.ts
new file mode 100644
index 0000000..e366ffc
--- /dev/null
+++ b/src/utils/validation.ts
@@ -0,0 +1,108 @@
+/**
+ * Input validation utilities.
+ * Mirrors sklearn.utils.validation.
+ */
+
+import { ValueError } from "../exceptions.js";
+
+/** Validate that X is a non-empty 2D array of Float64Arrays. */
+// Checks, in order: container is an array, row count >= minSamples,
+// column count >= minFeatures, and every row is a Float64Array of the
+// same length as row 0. Returns X unchanged on success.
+// NOTE(review): options.allowNd is accepted but never read — dead option.
+export function checkArray(
+ X: Float64Array[],
+ options: {
+ minSamples?: number;
+ minFeatures?: number;
+ allowNd?: boolean;
+ } = {},
+): Float64Array[] {
+ const { minSamples = 1, minFeatures = 1 } = options;
+ if (!Array.isArray(X)) {
+ throw new ValueError("X must be an array of Float64Arrays");
+ }
+ if (X.length < minSamples) {
+ throw new ValueError(
+ `X must have at least ${minSamples} samples, got ${X.length}`,
+ );
+ }
+ // Row 0 defines the expected feature count for all rows.
+ const nFeatures = (X[0] ?? new Float64Array(0)).length;
+ if (nFeatures < minFeatures) {
+ throw new ValueError(
+ `X must have at least ${minFeatures} features, got ${nFeatures}`,
+ );
+ }
+ for (let i = 0; i < X.length; i++) {
+ const row = X[i];
+ if (!(row instanceof Float64Array)) {
+ throw new ValueError(`X[${i}] must be a Float64Array`);
+ }
+ if (row.length !== nFeatures) {
+ throw new ValueError(
+ `X rows must all have the same length. Row 0 has ${nFeatures}, row ${i} has ${row.length}`,
+ );
+ }
+ }
+ return X;
+}
+
+/** Validate that X and y have compatible shapes. */
+export function checkXy(
+  X: Float64Array[],
+  y: Float64Array | Int32Array,
+): [Float64Array[], Float64Array | Int32Array] {
+  checkArray(X);
+  if (X.length === y.length) {
+    return [X, y];
+  }
+  throw new ValueError(
+    `X and y have inconsistent first dimensions: X has ${X.length} samples, y has ${y.length}`,
+  );
+}
+
+/** Return the number of features in X. */
+export function getNumFeatures(X: Float64Array[]): number {
+  const firstRow = X[0];
+  return firstRow === undefined ? 0 : firstRow.length;
+}
+
+/** Validate that test features match training features. */
+export function checkFeaturesConsistency(
+  XTrain: Float64Array[],
+  XTest: Float64Array[],
+): void {
+  const trainFeats = getNumFeatures(XTrain);
+  const testFeats = getNumFeatures(XTest);
+  if (trainFeats === testFeats) return;
+  throw new ValueError(
+    `X has ${testFeats} features, but the estimator was trained with ${trainFeats} features`,
+  );
+}
+
+/** Convert a number array to Float64Array. */
+export function asFloat64Array(arr: number[] | Float64Array): Float64Array {
+  return arr instanceof Float64Array ? arr : new Float64Array(arr);
+}
+
+/** Convert a number array to Int32Array. */
+export function asInt32Array(arr: number[] | Int32Array): Int32Array {
+  return arr instanceof Int32Array ? arr : new Int32Array(arr);
+}
+
+/** Validate sample weights, returning a uniform weight array if null. */
+export function checkSampleWeight(
+  sampleWeight: Float64Array | null | undefined,
+  nSamples: number,
+): Float64Array {
+  if (sampleWeight == null) {
+    // Missing weights mean every sample counts equally.
+    return new Float64Array(nSamples).fill(1.0);
+  }
+  if (sampleWeight.length !== nSamples) {
+    throw new ValueError(
+      `sampleWeight.length (${sampleWeight.length}) != n_samples (${nSamples})`,
+    );
+  }
+  return sampleWeight;
+}
diff --git a/tests/base.test.ts b/tests/base.test.ts
new file mode 100644
index 0000000..550000e
--- /dev/null
+++ b/tests/base.test.ts
@@ -0,0 +1,69 @@
+import { describe, expect, it } from "bun:test";
+import {
+ BaseEstimator,
+ ClassifierMixin,
+ RegressorMixin,
+ check_is_fitted,
+ clone,
+} from "../src/base.ts";
+import { NotFittedError } from "../src/exceptions.ts";
+
+// Minimal concrete estimator used to exercise BaseEstimator behavior:
+// two constructor params (so get_params/set_params have something to see)
+// and a fitted_ flag set by fit() for _check_is_fitted tests.
+class DummyEstimator extends BaseEstimator {
+ alpha: number;
+ beta: string;
+ fitted_?: boolean;
+
+ constructor(alpha = 1.0, beta = "test") {
+ super();
+ this.alpha = alpha;
+ this.beta = beta;
+ }
+
+ fit(): this {
+ this.fitted_ = true;
+ return this;
+ }
+}
+
+// get_params/set_params round-trip and fitted-state checks.
+describe("BaseEstimator", () => {
+ it("get_params returns constructor params", () => {
+ const est = new DummyEstimator(2.0, "hello");
+ const params = est.get_params();
+ expect(params.alpha).toBe(2.0);
+ expect(params.beta).toBe("hello");
+ });
+
+ it("set_params updates params", () => {
+ const est = new DummyEstimator();
+ est.set_params({ alpha: 5.0 });
+ expect(est.alpha).toBe(5.0);
+ });
+
+ it("check_is_fitted throws NotFittedError when not fitted", () => {
+ const est = new DummyEstimator();
+ expect(() => est.fit()._check_is_fitted(["fitted_"])).not.toThrow();
+ const est2 = new DummyEstimator();
+ expect(() => est2._check_is_fitted(["fitted_"])).toThrow(NotFittedError);
+ });
+});
+
+// clone must produce a distinct instance carrying the same params.
+describe("clone", () => {
+ it("creates a new instance with same params", () => {
+ const est = new DummyEstimator(3.0, "foo");
+ const cloned = clone(est);
+ expect(cloned).not.toBe(est);
+ expect(cloned.alpha).toBe(3.0);
+ expect(cloned.beta).toBe("foo");
+ });
+});
+
+// Exception classes: name and Error inheritance.
+describe("Exceptions", () => {
+ it("NotFittedError has correct name", () => {
+ const err = new NotFittedError();
+ expect(err.name).toBe("NotFittedError");
+ });
+
+ it("NotFittedError is an Error", () => {
+ expect(new NotFittedError()).toBeInstanceOf(Error);
+ });
+});
diff --git a/tests/linear_model.test.ts b/tests/linear_model.test.ts
new file mode 100644
index 0000000..02dd9b0
--- /dev/null
+++ b/tests/linear_model.test.ts
@@ -0,0 +1,192 @@
+import { describe, expect, it } from "bun:test";
+import { LinearRegression } from "../src/linear_model/linear_regression.ts";
+import { Ridge } from "../src/linear_model/ridge.ts";
+
+describe("LinearRegression", () => {
+ it("fits a simple 1D linear relationship", () => {
+ const X = [
+ new Float64Array([1]),
+ new Float64Array([2]),
+ new Float64Array([3]),
+ new Float64Array([4]),
+ new Float64Array([5]),
+ ];
+ const y = new Float64Array([2, 4, 6, 8, 10]);
+ const reg = new LinearRegression();
+ reg.fit(X, y);
+
+ expect(reg.coef_).toBeDefined();
+ expect(Math.abs(((reg.coef_ as Float64Array)[0] ?? 0) - 2)).toBeLessThan(
+ 1e-6,
+ );
+ expect(Math.abs(reg.intercept_ as number)).toBeLessThan(1e-6);
+ });
+
+ it("fits with intercept", () => {
+ const X = [
+ new Float64Array([0]),
+ new Float64Array([1]),
+ new Float64Array([2]),
+ ];
+ const y = new Float64Array([1, 3, 5]); // y = 2x + 1
+ const reg = new LinearRegression();
+ reg.fit(X, y);
+
+ expect(Math.abs(((reg.coef_ as Float64Array)[0] ?? 0) - 2)).toBeLessThan(
+ 1e-6,
+ );
+ expect(Math.abs((reg.intercept_ as number) - 1)).toBeLessThan(1e-6);
+ });
+
+ it("fits without intercept", () => {
+ const X = [
+ new Float64Array([1]),
+ new Float64Array([2]),
+ new Float64Array([3]),
+ ];
+ const y = new Float64Array([3, 6, 9]); // y = 3x
+ const reg = new LinearRegression({ fit_intercept: false });
+ reg.fit(X, y);
+
+ expect(Math.abs(((reg.coef_ as Float64Array)[0] ?? 0) - 3)).toBeLessThan(
+ 1e-6,
+ );
+ expect(reg.intercept_).toBe(0);
+ });
+
+ it("predicts correctly", () => {
+ const X = [new Float64Array([1]), new Float64Array([2])];
+ const y = new Float64Array([1, 2]);
+ const reg = new LinearRegression();
+ reg.fit(X, y);
+
+ const pred = reg.predict([new Float64Array([3])]);
+ expect(Math.abs((pred[0] ?? 0) - 3)).toBeLessThan(1e-4);
+ });
+
+ it("fits multiple features", () => {
+ // y = 1*x1 + 2*x2
+ const X = [
+ new Float64Array([1, 2]),
+ new Float64Array([2, 1]),
+ new Float64Array([3, 3]),
+ new Float64Array([4, 2]),
+ ];
+ const y = new Float64Array([5, 4, 9, 8]);
+ const reg = new LinearRegression({ fit_intercept: false });
+ reg.fit(X, y);
+
+ const pred = reg.predict([new Float64Array([1, 2])]);
+ expect(Math.abs((pred[0] ?? 0) - 5)).toBeLessThan(0.1);
+ });
+
+ it("computes R² score", () => {
+ const X = [
+ new Float64Array([1]),
+ new Float64Array([2]),
+ new Float64Array([3]),
+ new Float64Array([4]),
+ ];
+ const y = new Float64Array([2, 4, 6, 8]);
+ const reg = new LinearRegression();
+ reg.fit(X, y);
+
+ const score = reg.score(X, y);
+ expect(score).toBeCloseTo(1.0, 5);
+ });
+
+ it("returns R² close to 1 for perfect linear data", () => {
+ const X = Array.from(
+ { length: 20 },
+ (_, i) => new Float64Array([i, i * 2]),
+ );
+ const y = new Float64Array(Array.from({ length: 20 }, (_, i) => i * 3 + 1));
+ const reg = new LinearRegression();
+ reg.fit(X, y);
+ expect(reg.score(X, y)).toBeGreaterThan(0.999);
+ });
+
+ it("throws NotFittedError when predicting before fit", () => {
+ const reg = new LinearRegression();
+ expect(() => reg.predict([new Float64Array([1])])).toThrow();
+ });
+
+ it("get_params returns all params", () => {
+ const reg = new LinearRegression();
+ const params = reg.get_params();
+ expect("fit_intercept" in params).toBe(true);
+ });
+});
+
+describe("Ridge", () => {
+ it("fits a simple linear relationship with regularization", () => {
+ const X = [
+ new Float64Array([1]),
+ new Float64Array([2]),
+ new Float64Array([3]),
+ new Float64Array([4]),
+ new Float64Array([5]),
+ ];
+ const y = new Float64Array([2, 4, 6, 8, 10]);
+ const reg = new Ridge({ alpha: 0.0001 });
+ reg.fit(X, y);
+
+ // With tiny alpha, should be close to OLS
+ expect(Math.abs(((reg.coef_ as Float64Array)[0] ?? 0) - 2)).toBeLessThan(
+ 0.01,
+ );
+ });
+
+ it("shrinks coefficients with large alpha", () => {
+ const X = [
+ new Float64Array([1, 0]),
+ new Float64Array([0, 1]),
+ new Float64Array([1, 1]),
+ ];
+ const y = new Float64Array([2, 3, 5]);
+
+ const regLowAlpha = new Ridge({ alpha: 0.001 });
+ const regHighAlpha = new Ridge({ alpha: 100.0 });
+ regLowAlpha.fit(X, y);
+ regHighAlpha.fit(X, y);
+
+ const normLow = Array.from(regLowAlpha.coef_ as Float64Array).reduce(
+ (a, b) => a + b * b,
+ 0,
+ );
+ const normHigh = Array.from(regHighAlpha.coef_ as Float64Array).reduce(
+ (a, b) => a + b * b,
+ 0,
+ );
+
+ // Higher alpha → smaller coefficients
+ expect(normHigh).toBeLessThan(normLow);
+ });
+
+ it("predicts correctly", () => {
+ const X = [
+ new Float64Array([1]),
+ new Float64Array([2]),
+ new Float64Array([3]),
+ ];
+ const y = new Float64Array([1, 2, 3]);
+ const reg = new Ridge({ alpha: 0.001 });
+ reg.fit(X, y);
+
+ const pred = reg.predict([new Float64Array([4])]);
+ expect(Math.abs((pred[0] ?? 0) - 4)).toBeLessThan(0.1);
+ });
+
+ it("score is R²", () => {
+ const X = Array.from({ length: 20 }, (_, i) => new Float64Array([i]));
+ const y = new Float64Array(Array.from({ length: 20 }, (_, i) => i * 2 + 1));
+ const reg = new Ridge({ alpha: 0.001 });
+ reg.fit(X, y);
+ expect(reg.score(X, y)).toBeGreaterThan(0.99);
+ });
+
+ it("throws NotFittedError when predicting before fit", () => {
+ const reg = new Ridge();
+ expect(() => reg.predict([new Float64Array([1])])).toThrow();
+ });
+});
diff --git a/tests/metrics_model_selection.test.ts b/tests/metrics_model_selection.test.ts
new file mode 100644
index 0000000..1dfb861
--- /dev/null
+++ b/tests/metrics_model_selection.test.ts
@@ -0,0 +1,111 @@
+import { describe, expect, it } from "bun:test";
+import {
+ accuracy_score,
+ confusion_matrix,
+ f1_score,
+ precision_score,
+ recall_score,
+} from "../src/metrics/classification.ts";
+import {
+ mean_absolute_error,
+ mean_squared_error,
+ r2_score,
+} from "../src/metrics/regression.ts";
+import { KFold, train_test_split } from "../src/model_selection/split.ts";
+
+describe("Regression metrics", () => {
+ it("MSE is 0 for perfect prediction", () => {
+ const y = new Float64Array([1, 2, 3]);
+ expect(mean_squared_error(y, y)).toBe(0);
+ });
+
+ it("MAE is 0 for perfect prediction", () => {
+ const y = new Float64Array([1, 2, 3]);
+ expect(mean_absolute_error(y, y)).toBe(0);
+ });
+
+ it("R² is 1 for perfect prediction", () => {
+ const y = new Float64Array([1, 2, 3]);
+ expect(r2_score(y, y)).toBe(1);
+ });
+
+ it("MSE is correct", () => {
+ const yTrue = new Float64Array([1, 2, 3]);
+ const yPred = new Float64Array([2, 3, 4]); // all off by 1
+ expect(mean_squared_error(yTrue, yPred)).toBe(1);
+ });
+});
+
+describe("Classification metrics", () => {
+ it("accuracy is 1 for perfect prediction", () => {
+ const y = new Int32Array([0, 1, 2]);
+ expect(accuracy_score(y, y)).toBe(1);
+ });
+
+ it("accuracy counts correct predictions", () => {
+ const yTrue = new Int32Array([0, 1, 1, 0]);
+ const yPred = new Int32Array([0, 1, 0, 0]);
+ expect(accuracy_score(yTrue, yPred)).toBe(0.75);
+ });
+
+ it("confusion matrix is correct for binary", () => {
+ const yTrue = new Int32Array([0, 1, 0, 1, 0]);
+ const yPred = new Int32Array([0, 1, 1, 1, 0]);
+ const cm = confusion_matrix(yTrue, yPred);
+ // [[TN, FP], [FN, TP]]
+ expect((cm[0] as number[])[0]).toBe(2); // TN
+ expect((cm[0] as number[])[1]).toBe(1); // FP
+ expect((cm[1] as number[])[0]).toBe(0); // FN
+ expect((cm[1] as number[])[1]).toBe(2); // TP
+ });
+
+ it("f1 is 1 for perfect predictions", () => {
+ const y = new Int32Array([0, 1, 0, 1]);
+ expect(f1_score(y, y)).toBeCloseTo(1);
+ });
+});
+
+describe("train_test_split", () => {
+ it("splits data correctly", () => {
+ const X = Array.from({ length: 100 }, (_, i) => new Float64Array([i]));
+ const y = new Float64Array(Array.from({ length: 100 }, (_, i) => i));
+ const { XTrain, XTest, yTrain, yTest } = train_test_split(X, y, {
+ testSize: 0.2,
+ });
+ expect(XTrain.length).toBe(80);
+ expect(XTest.length).toBe(20);
+ expect(yTrain.length).toBe(80);
+ expect(yTest.length).toBe(20);
+ });
+
+ it("is reproducible with randomState", () => {
+ const X = Array.from({ length: 20 }, (_, i) => new Float64Array([i]));
+ const y = new Float64Array(Array.from({ length: 20 }, (_, i) => i));
+ const r1 = train_test_split(X, y, { randomState: 42 });
+ const r2 = train_test_split(X, y, { randomState: 42 });
+ expect(Array.from(r1.yTest)).toEqual(Array.from(r2.yTest));
+ });
+});
+
+describe("KFold", () => {
+ it("generates k folds", () => {
+ const X = Array.from({ length: 10 }, (_, i) => new Float64Array([i]));
+ const kf = new KFold({ nSplits: 5 });
+ const folds = [...kf.split(X)];
+ expect(folds.length).toBe(5);
+ for (const fold of folds) {
+ expect(fold.trainIndex.length).toBe(8);
+ expect(fold.testIndex.length).toBe(2);
+ }
+ });
+
+ it("covers all samples exactly once", () => {
+ const X = Array.from({ length: 9 }, (_, i) => new Float64Array([i]));
+ const kf = new KFold({ nSplits: 3 });
+ const allTest = new Set();
+ for (const fold of kf.split(X)) {
+ for (const idx of fold.testIndex) allTest.add(idx);
+ }
+ expect(allTest.size).toBe(9);
+ });
+});
diff --git a/tests/preprocessing.test.ts b/tests/preprocessing.test.ts
new file mode 100644
index 0000000..cc11d17
--- /dev/null
+++ b/tests/preprocessing.test.ts
@@ -0,0 +1,133 @@
+import { describe, expect, it } from "bun:test";
+import { NotFittedError } from "../src/exceptions.ts";
+import { LabelEncoder } from "../src/preprocessing/label_encoder.ts";
+import { MinMaxScaler } from "../src/preprocessing/minmax_scaler.ts";
+import { Normalizer } from "../src/preprocessing/normalizer.ts";
+import { StandardScaler } from "../src/preprocessing/standard_scaler.ts";
+
+describe("StandardScaler", () => {
+ const X = [
+ new Float64Array([1, 2]),
+ new Float64Array([3, 4]),
+ new Float64Array([5, 6]),
+ ];
+
+ it("computes mean and std correctly", () => {
+ const scaler = new StandardScaler();
+ scaler.fit(X);
+ expect(scaler.mean_).toBeDefined();
+ expect(Math.abs(((scaler.mean_ as Float64Array)[0] ?? 0) - 3)).toBeLessThan(
+ 1e-10,
+ );
+ expect(Math.abs(((scaler.mean_ as Float64Array)[1] ?? 0) - 4)).toBeLessThan(
+ 1e-10,
+ );
+ });
+
+ it("transforms to zero mean", () => {
+ const scaler = new StandardScaler();
+ const Xt = scaler.fit_transform(X);
+ const mean0 = Xt.reduce((a, r) => a + (r[0] ?? 0), 0) / Xt.length;
+ expect(Math.abs(mean0)).toBeLessThan(1e-10);
+ });
+
+ it("inverse_transform recovers original", () => {
+ const scaler = new StandardScaler();
+ const Xt = scaler.fit_transform(X);
+ const Xr = scaler.inverse_transform(Xt);
+ for (let i = 0; i < X.length; i++) {
+ for (let j = 0; j < (X[i] as Float64Array).length; j++) {
+ expect(
+ Math.abs(
+ ((Xr[i] as Float64Array)[j] ?? 0) -
+ ((X[i] as Float64Array)[j] ?? 0),
+ ),
+ ).toBeLessThan(1e-8);
+ }
+ }
+ });
+
+ it("throws when not fitted", () => {
+ const scaler = new StandardScaler();
+ expect(() => scaler.transform(X)).toThrow(NotFittedError);
+ });
+});
+
+describe("MinMaxScaler", () => {
+ const X = [
+ new Float64Array([0, 2]),
+ new Float64Array([5, 4]),
+ new Float64Array([10, 6]),
+ ];
+
+ it("scales to [0, 1] by default", () => {
+ const scaler = new MinMaxScaler();
+ const Xt = scaler.fit_transform(X);
+ expect((Xt[0] as Float64Array)[0]).toBeCloseTo(0, 8);
+ expect((Xt[2] as Float64Array)[0]).toBeCloseTo(1, 8);
+ });
+
+ it("scales to custom range", () => {
+ const scaler = new MinMaxScaler({ feature_range: [-1, 1] });
+ const Xt = scaler.fit_transform(X);
+ expect((Xt[0] as Float64Array)[0]).toBeCloseTo(-1, 6);
+ expect((Xt[2] as Float64Array)[0]).toBeCloseTo(1, 6);
+ });
+
+ it("inverse_transform recovers original", () => {
+ const scaler = new MinMaxScaler();
+ const Xt = scaler.fit_transform(X);
+ const Xr = scaler.inverse_transform(Xt);
+ for (let i = 0; i < X.length; i++) {
+ for (let j = 0; j < (X[i] as Float64Array).length; j++) {
+ expect(
+ Math.abs(
+ ((Xr[i] as Float64Array)[j] ?? 0) -
+ ((X[i] as Float64Array)[j] ?? 0),
+ ),
+ ).toBeLessThan(1e-8);
+ }
+ }
+ });
+});
+
+describe("LabelEncoder", () => {
+ it("encodes labels", () => {
+ const le = new LabelEncoder();
+ const y = new Int32Array([3, 1, 2, 1, 3]);
+ const encoded = le.fit_transform(y);
+ expect(Array.from(encoded)).toEqual([2, 0, 1, 0, 2]);
+ });
+
+ it("inverse_transform recovers original", () => {
+ const le = new LabelEncoder();
+ const y = new Int32Array([10, 20, 30]);
+ const encoded = le.fit_transform(y);
+ const decoded = le.inverse_transform(encoded);
+ expect(Array.from(decoded)).toEqual([10, 20, 30]);
+ });
+
+ it("throws on unseen labels", () => {
+ const le = new LabelEncoder();
+ le.fit(new Int32Array([1, 2, 3]));
+ expect(() => le.transform(new Int32Array([4]))).toThrow();
+ });
+});
+
+describe("Normalizer", () => {
+ it("normalizes to unit L2 norm", () => {
+ const norm = new Normalizer({ norm: "l2" });
+ const X = [new Float64Array([3, 4])]; // 3² + 4² = 25, norm = 5
+ const Xt = norm.transform(X);
+ expect((Xt[0] as Float64Array)[0]).toBeCloseTo(0.6, 8);
+ expect((Xt[0] as Float64Array)[1]).toBeCloseTo(0.8, 8);
+ });
+
+ it("normalizes to unit L1 norm", () => {
+ const norm = new Normalizer({ norm: "l1" });
+ const X = [new Float64Array([1, 3])]; // sum = 4
+ const Xt = norm.transform(X);
+ expect((Xt[0] as Float64Array)[0]).toBeCloseTo(0.25, 8);
+ expect((Xt[0] as Float64Array)[1]).toBeCloseTo(0.75, 8);
+ });
+});
diff --git a/tsconfig.json b/tsconfig.json
new file mode 100644
index 0000000..989d8ca
--- /dev/null
+++ b/tsconfig.json
@@ -0,0 +1,24 @@
+{
+ "compilerOptions": {
+ "target": "ES2022",
+ "module": "ESNext",
+ "moduleResolution": "bundler",
+ "lib": ["ES2022", "DOM"],
+ "strict": true,
+ "noUncheckedIndexedAccess": true,
+ "exactOptionalPropertyTypes": true,
+ "noImplicitOverride": true,
+ "noImplicitReturns": true,
+ "noPropertyAccessFromIndexSignature": true,
+ "noFallthroughCasesInSwitch": true,
+ "verbatimModuleSyntax": true,
+ "declaration": true,
+ "declarationMap": true,
+ "sourceMap": true,
+ "outDir": "./dist",
+ "rootDir": "./src",
+ "skipLibCheck": true
+ },
+ "include": ["src/**/*"],
+ "exclude": ["node_modules", "dist", "tests", "playground"]
+}