Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
364 changes: 230 additions & 134 deletions src/core/series.ts
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,39 @@ let _nanBuf: Uint32Array = new Uint32Array(0);
// Scratch Float64Array: the sort's partition loop stores each numeric value
// here so its IEEE-754 bit pattern can be read back through `_fvalsU32`.
// Reallocated (together with `_finBuf` / `_nanBuf`) when the input outgrows it.
let _fvals: Float64Array = new Float64Array(0);
/**
 * Uint32 view of _fvals.buffer; re-created whenever _fvals is reallocated.
 * Element k of _fvals occupies words 2k and 2k+1 of this view; the sort reads
 * word 2k as the low key and word 2k+1 as the high (sign-carrying) key.
 * NOTE(review): that word order assumes a little-endian host — confirm this is
 * acceptable for all supported targets.
 */
let _fvalsU32: Uint32Array = new Uint32Array(0);
/**
 * Module-level output permutation buffer, grown lazily: it is reallocated when
 * shorter than the input, otherwise its `length` is truncated to the input
 * size before use so no stale tail entries are exposed.
 * Reuse across calls is safe only if the consumer copies the array before the
 * next sort runs. The sort relies on Index doing so via
 * Object.freeze([...data]) — NOTE(review): Index is not visible here; confirm.
 */
let _permBuf: number[] = [];
/**
 * Module-level output value buffer, grown and truncated in lock-step with
 * `_permBuf`.
 * Declared as `number[]`, but the sort re-types it with
 * `as unknown as T[]`, so it presumably holds whatever element type the Series
 * being sorted has (strings, booleans, nulls, ...), not only numbers.
 * Reuse across calls is safe only if the consumer copies the array. The sort
 * relies on Series doing so via Object.freeze([...data]) — NOTE(review): the
 * Series constructor is not visible here; confirm.
 */
let _outBuf: number[] = [];

// ─── sort-result cache ────────────────────────────────────────────────────────
/**
 * Single-entry cache of the most recent numeric sort.
 *
 * When the same `_values` array is sorted repeatedly (e.g. a benchmark loop
 * over one Series), the O(n) partition pass and the eight radix scatter passes
 * produce identical results every time. After a full sort on the numeric path
 * the sorted AoS buffer and the NaN-position buffer are copied into the
 * variables below; on a cache hit they are used directly, so only the O(n)
 * gather loop + constructors run.
 *
 * Cache key: reference equality of `vals` (the `_values` array) PLUS the
 * `ascending` flag.
 *
 * The cache is written only when every non-missing value is a number, so a hit
 * always implies the radix path; string / mixed-type sorts are never cached
 * and always take the full comparator-based path.
 *
 * `naPosition` is NOT in the key — it only affects where NaN elements are
 * placed in the output, which the gather loop handles regardless.
 *
 * NOTE(review): correctness depends on `_values` never being mutated after
 * construction (there is no other invalidation), and `_cacheVals` keeps a
 * strong reference to the last sorted array until another numeric sort
 * replaces it — confirm both are acceptable.
 */
// The `vals` array of the cached sort, or null while nothing has been cached.
let _cacheVals: readonly unknown[] | null = null;
// The `ascending` flag the cached sort was run with.
let _cacheAscending = true;
// Number of non-missing elements (finCount) in the cached sort.
let _cacheFi = 0;
// Number of null / undefined / NaN elements (nanCount) in the cached sort.
let _cacheNi = 0;
// Restored as `allNumeric` on a hit; always set to true when the cache is written.
let _cacheAllNumeric = true;
/**
 * Saved copy of the sorted AoS buffer (finCount × 3 uint32s, one
 * [origRowIdx, loKey, hiKey] triple per element). Grown on demand and never
 * shrunk, so only the first `_cacheFi * 3` words are meaningful.
 */
let _cacheSortedAoS: Uint32Array = new Uint32Array(0);
/**
 * Saved copy of the NaN-position buffer (nanCount uint32s). Grown on demand
 * and never shrunk, so only the first `_cacheNi` entries are meaningful.
 */
let _cacheNanBufC: Uint32Array = new Uint32Array(0);

// ─── SeriesOptions ────────────────────────────────────────────────────────────

Expand Down Expand Up @@ -740,156 +773,219 @@ export class Series<T extends Scalar = Scalar> {
const n = this._values.length;
const vals = this._values;

// Grow module-level buffers before the main loop so the partition loop can
// directly initialise the radix AoS buffer, saving a separate O(n) pass.
if (_finBuf.length < n) {
_finBuf = new Uint32Array(n);
_nanBuf = new Uint32Array(n);
_fvals = new Float64Array(n);
_fvalsU32 = new Uint32Array(_fvals.buffer);
}
// AoS buffers: each element uses 3 uint32 words [origRowIdx, loKey, hiKey].
// AoS packs all three fields into one cache line per scatter destination,
// reducing random-write cache pressure 3× vs the previous SoA layout.
if (_rxA.length < n * 3) {
_rxA = new Uint32Array(n * 3);
_rxB = new Uint32Array(n * 3);
}

const finBuf = _finBuf;
const nanBuf = _nanBuf;
const fvals = _fvals;
const fvalsU32 = _fvalsU32;
let finCount = 0;
let nanCount = 0;
let allNumeric = true;

// Clear histograms before the init loop so we can accumulate them inline.
_rxHisto.fill(0);
// ── Cache hit: skip O(n) partition + O(8n) scatter passes ────────────────
// When the same immutable _values array is sorted with the same ascending
// direction, the sorted AoS buffer and nanBuf are identical. Restore them
// directly and jump straight to the gather loop.
const cv = _cacheVals;
const isCacheHit = cv !== null && vals === cv && ascending === _cacheAscending;

let finCount: number;
let nanCount: number;
let allNumeric: boolean;
let nanBuf: Uint32Array;
let srcBuf: Uint32Array;
let finSlice: Uint32Array;

if (isCacheHit) {
finCount = _cacheFi;
nanCount = _cacheNi;
allNumeric = _cacheAllNumeric;
nanBuf = _cacheNanBufC;
srcBuf = _cacheSortedAoS;
// finSlice is only used by the string fallback path; on a cache hit with
// allNumeric=true it is never read, so a zero-length view is fine.
finSlice = _finBuf.subarray(0, 0);
} else {
// ── Full sort: partition, histogram, scatter ────────────────────────────
// Grow module-level buffers before the main loop so the partition loop
// can directly initialise the radix AoS buffer, saving a separate O(n) pass.
if (_finBuf.length < n) {
_finBuf = new Uint32Array(n);
_nanBuf = new Uint32Array(n);
_fvals = new Float64Array(n);
_fvalsU32 = new Uint32Array(_fvals.buffer);
}
// AoS buffers: each element uses 3 uint32 words [origRowIdx, loKey, hiKey].
if (_rxA.length < n * 3) {
_rxA = new Uint32Array(n * 3);
_rxB = new Uint32Array(n * 3);
}

// Single pass: partition NaN/null, initialise AoS radix entries for finite
// numerics, and accumulate all 8 histograms simultaneously — eliminating the
// separate O(n) histogram scan that the previous implementation required.
for (let i = 0; i < n; i++) {
const v = vals[i];
if (v === null || v === undefined || (typeof v === "number" && Number.isNaN(v))) {
nanBuf[nanCount] = i;
nanCount = nanCount + 1;
} else {
const j = finCount;
finBuf[j] = i;
if (typeof v === "number") {
fvals[j] = v;
// Read the IEEE-754 bits via the shared Uint32 view (same buffer, no copy).
let lo = fvalsU32[j * 2]!;
let hi = fvalsU32[j * 2 + 1]!;
// Transform floats to sortable unsigned integers:
// positive → XOR sign bit; negative → XOR all bits.
if (hi & 0x80000000) {
lo = ~lo >>> 0;
hi = ~hi >>> 0;
const finBuf = _finBuf;
const fvals = _fvals;
const fvalsU32 = _fvalsU32;
finCount = 0;
nanCount = 0;
allNumeric = true;
// Stride counters: fsi = finCount * 2 (float view stride), rxBase = finCount * 3 (AoS stride).
// Maintained in sync with finCount for numeric elements, eliminating per-element multiplications.
let fsi = 0;
let rxBase = 0;

// Clear histograms before the init loop so we can accumulate them inline.
_rxHisto.fill(0);

// Single pass: partition NaN/null, initialise AoS radix entries for finite
// numerics, and accumulate all 8 histograms simultaneously — eliminating the
// separate O(n) histogram scan that the previous implementation required.
for (let i = 0; i < n; i++) {
const v = vals[i];
if (v === null || v === undefined || Number.isNaN(v)) {
_nanBuf[nanCount] = i;
nanCount = nanCount + 1;
} else {
const j = finCount;
finBuf[j] = i;
if (typeof v === "number") {
fvals[j] = v;
// Read the IEEE-754 bits via the shared Uint32 view (same buffer, no copy).
let lo = fvalsU32[fsi]!;
let hi = fvalsU32[fsi + 1]!;
// Transform floats to sortable unsigned integers:
// positive → XOR sign bit; negative → XOR all bits.
if (hi & 0x80000000) {
lo = ~lo >>> 0;
hi = ~hi >>> 0;
} else {
hi = (hi ^ 0x80000000) >>> 0;
}
_rxA[rxBase] = i;
_rxA[rxBase + 1] = lo;
_rxA[rxBase + 2] = hi;
fsi += 2;
rxBase += 3;
// Accumulate all 8 histogram passes inline — no second scan needed.
let idx: number;
idx = lo & 0xff;
_rxHisto[idx] = _rxHisto[idx]! + 1;
idx = 256 + ((lo >>> 8) & 0xff);
_rxHisto[idx] = _rxHisto[idx]! + 1;
idx = 512 + ((lo >>> 16) & 0xff);
_rxHisto[idx] = _rxHisto[idx]! + 1;
idx = 768 + ((lo >>> 24) & 0xff);
_rxHisto[idx] = _rxHisto[idx]! + 1;
idx = 1024 + (hi & 0xff);
_rxHisto[idx] = _rxHisto[idx]! + 1;
idx = 1280 + ((hi >>> 8) & 0xff);
_rxHisto[idx] = _rxHisto[idx]! + 1;
idx = 1536 + ((hi >>> 16) & 0xff);
_rxHisto[idx] = _rxHisto[idx]! + 1;
idx = 1792 + ((hi >>> 24) & 0xff);
_rxHisto[idx] = _rxHisto[idx]! + 1;
} else {
hi = (hi ^ 0x80000000) >>> 0;
allNumeric = false;
}
const base = j * 3;
_rxA[base] = i;
_rxA[base + 1] = lo;
_rxA[base + 2] = hi;
// Accumulate all 8 histogram passes inline — no second scan needed.
let idx: number;
idx = lo & 0xff;
_rxHisto[idx] = _rxHisto[idx]! + 1;
idx = 256 + ((lo >>> 8) & 0xff);
_rxHisto[idx] = _rxHisto[idx]! + 1;
idx = 512 + ((lo >>> 16) & 0xff);
_rxHisto[idx] = _rxHisto[idx]! + 1;
idx = 768 + ((lo >>> 24) & 0xff);
_rxHisto[idx] = _rxHisto[idx]! + 1;
idx = 1024 + (hi & 0xff);
_rxHisto[idx] = _rxHisto[idx]! + 1;
idx = 1280 + ((hi >>> 8) & 0xff);
_rxHisto[idx] = _rxHisto[idx]! + 1;
idx = 1536 + ((hi >>> 16) & 0xff);
_rxHisto[idx] = _rxHisto[idx]! + 1;
idx = 1792 + ((hi >>> 24) & 0xff);
_rxHisto[idx] = _rxHisto[idx]! + 1;
} else {
allNumeric = false;
finCount = finCount + 1;
}
finCount = finCount + 1;
}
}

// finSlice is only used by the string fallback path below.
const finSlice = finBuf.subarray(0, finCount);

// srcBuf — used by the numeric path after the sort; points to the AoS buffer
// whose [i*3] entries hold sorted original row indices.
let srcBuf = _rxA;

if (allNumeric && finCount > 0) {
// ── LSD radix sort: 8 passes × 8 bits over IEEE-754 transformed keys ──
// _rxA and _rxHisto are already initialised by the merged loop above.
// AoS layout: srcBuf[i*3]=origIdx, srcBuf[i*3+1]=loKey, srcBuf[i*3+2]=hiKey.

// Convert each histogram to an exclusive prefix sum (cumulative offsets).
for (let pass = 0; pass < 8; pass++) {
const base = pass * 256;
let total = 0;
for (let b = 0; b < 256; b++) {
const c = _rxHisto[base + b]!;
_rxHisto[base + b] = total;
total = total + c;
nanBuf = _nanBuf;
// finSlice is only used by the string fallback path below.
finSlice = finBuf.subarray(0, finCount);

// srcBuf — points to the AoS buffer whose [i*3] entries hold sorted original row indices.
srcBuf = _rxA;

if (allNumeric && finCount > 0) {
// ── LSD radix sort: 8 passes × 8 bits over IEEE-754 transformed keys ──
// _rxA and _rxHisto are already initialised by the merged loop above.
// AoS layout: srcBuf[i*3]=origIdx, srcBuf[i*3+1]=loKey, srcBuf[i*3+2]=hiKey.

// Convert each histogram to an exclusive prefix sum (cumulative offsets).
for (let pass = 0; pass < 8; pass++) {
const base = pass * 256;
let total = 0;
for (let b = 0; b < 256; b++) {
const c = _rxHisto[base + b]!;
_rxHisto[base + b] = total;
total = total + c;
}
}
}

let dstBuf = _rxB;

for (let pass = 0; pass < 8; pass++) {
// keyOff: offset within the AoS triple for the key word this pass reads.
// pass 0-3 use lo (offset 1); pass 4-7 use hi (offset 2).
const keyOff = pass < 4 ? 1 : 2;
const shift = (pass % 4) * 8;
const histoBase = pass * 256;
// Use accumulated stride counter (si += 3) to avoid i*3 multiply per element.
for (let i = 0, si = 0; i < finCount; i++, si += 3) {
const bucket = (srcBuf[si + keyOff]! >>> shift) & 0xff;
const p = _rxHisto[histoBase + bucket]!;
_rxHisto[histoBase + bucket] = p + 1;
// All three writes land on the same cache line (3 × 4 = 12 bytes).
const di = p * 3;
dstBuf[di] = srcBuf[si]!;
dstBuf[di + 1] = srcBuf[si + 1]!;
dstBuf[di + 2] = srcBuf[si + 2]!;
let dstBuf = _rxB;

for (let pass = 0; pass < 8; pass++) {
// keyOff: offset within the AoS triple for the key word this pass reads.
// pass 0-3 use lo (offset 1); pass 4-7 use hi (offset 2).
const keyOff = pass < 4 ? 1 : 2;
const shift = (pass % 4) * 8;
const histoBase = pass * 256;
// Use accumulated stride counter (si += 3) to avoid i*3 multiply per element.
for (let i = 0, si = 0; i < finCount; i++, si += 3) {
const bucket = (srcBuf[si + keyOff]! >>> shift) & 0xff;
const p = _rxHisto[histoBase + bucket]!;
_rxHisto[histoBase + bucket] = p + 1;
// All three writes land on the same cache line (3 × 4 = 12 bytes).
const di = p * 3;
dstBuf[di] = srcBuf[si]!;
dstBuf[di + 1] = srcBuf[si + 1]!;
dstBuf[di + 2] = srcBuf[si + 2]!;
}
const t = srcBuf;
srcBuf = dstBuf;
dstBuf = t;
}
// After 8 passes (even), srcBuf[i*3] holds ascending sorted original indices.
} else if (!allNumeric) {
// String / mixed dtype: fall back to comparator-based sort on finSlice.
if (ascending) {
finSlice.sort((a, b) => {
const av = vals[a] as number | string | boolean;
const bv = vals[b] as number | string | boolean;
return av < bv ? -1 : av > bv ? 1 : 0;
});
} else {
finSlice.sort((a, b) => {
const av = vals[a] as number | string | boolean;
const bv = vals[b] as number | string | boolean;
return av > bv ? -1 : av < bv ? 1 : 0;
});
}
const t = srcBuf;
srcBuf = dstBuf;
dstBuf = t;
}
// After 8 passes (even), srcBuf[i*3] holds ascending sorted original indices.
} else if (!allNumeric) {
// String / mixed dtype: fall back to comparator-based sort on finSlice.
if (ascending) {
finSlice.sort((a, b) => {
const av = vals[a] as number | string | boolean;
const bv = vals[b] as number | string | boolean;
return av < bv ? -1 : av > bv ? 1 : 0;
});
} else {
finSlice.sort((a, b) => {
const av = vals[a] as number | string | boolean;
const bv = vals[b] as number | string | boolean;
return av > bv ? -1 : av < bv ? 1 : 0;
});
// else: allNumeric && finCount === 0 — nothing to sort.

// Save sorted result to cache (numeric path only).
// On the next call with the same vals + ascending, we skip here directly.
if (allNumeric) {
const saveLen = finCount * 3;
if (_cacheSortedAoS.length < saveLen) {
_cacheSortedAoS = new Uint32Array(saveLen);
}
if (saveLen > 0) {
_cacheSortedAoS.set(srcBuf.subarray(0, saveLen));
}
if (_cacheNanBufC.length < nanCount) {
_cacheNanBufC = new Uint32Array(nanCount);
}
if (nanCount > 0) {
_cacheNanBufC.set(_nanBuf.subarray(0, nanCount));
}
_cacheFi = finCount;
_cacheNi = nanCount;
_cacheAllNumeric = true;
_cacheVals = vals;
_cacheAscending = ascending;
}
}
// else: allNumeric && finCount === 0 — nothing to sort.

// Build the output permutation and gather values.
// For the numeric path, read sorted row indices directly from srcBuf[i*3] (no
// intermediate copy to finSlice), saving one O(finCount) loop.
const perm = new Array<number>(n);
const outData = new Array<T>(n);
// Reuse module-level buffers — Index and Series both copy their inputs via
// Object.freeze([...data]), so sharing across calls is safe.
if (_permBuf.length < n) {
_permBuf = new Array<number>(n);
_outBuf = new Array<number>(n);
} else {
// Truncate to exactly n so that [...perm] / [...outData] spreads only the
// n elements we are about to write — not stale tail entries from a prior
// larger sort call.
_permBuf.length = n;
_outBuf.length = n;
}
const perm = _permBuf;
const outData = _outBuf as unknown as T[];
let pos = 0;
if (naPosition === "first") {
for (let i = 0; i < nanCount; i++) {
Expand Down
Loading