Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .jules/thunderbolt.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,8 @@
**Evidence:** Microbenchmarking showed a 2x speedup (99ms -> 49ms) for max_v3 over max_v2 on L1-hot arrays. End-to-end framework benchmarks showed an 8% throughput increase (4.03 -> 4.36 GFLOP/s) on large fixed-memory allocations (N=6553600).

**Action:** For reductions using instructions with >2 cycle latency (like max_ps or add_ps), default to 8x unrolling over 4x unrolling to fully saturate modern out-of-order execution engines.

## 2024-05-18 - Single FMA Range Reduction for Softmax
**Learning:** In transcendental AVX2 SIMD approximations (like exp256 for softmax kernels), combining the constants for range reduction `r = x - n * ln(2)` into a single FMA instruction instead of splitting `ln(2)` for exact precision can significantly boost throughput. The precision loss is negligible within typical ML numerical tolerances (e.g., 1e-4) due to the shift-invariant nature of operations like softmax.
**Evidence:** `softmax_v6` achieved 3.97 GFLOP/s compared to 3.73 GFLOP/s of `softmax_v5` on a 1MB array (N=1048576) due to the reduction of instructions on the critical path, representing a ~6.4% throughput improvement. Tests for 1e-4 correctness passed.
**Action:** Always check if split-precision constants can be combined into a single FMA when exact float-level precision is not strictly required, specifically in shift-invariant domains like softmax.
143 changes: 143 additions & 0 deletions ml_kernels/include/ml_kernels/softmax.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,37 @@ inline __m256 exp256_ps_estrin(__m256 x) {

return _mm256_mul_ps(p, exp2n);
}

inline __m256 exp256_ps_v3(__m256 x) {
x = _mm256_max_ps(x, _mm256_set1_ps(-87.3f));
__m256 x_log2e = _mm256_mul_ps(x, _mm256_set1_ps(1.4426950408889634f));

__m256i n_int = _mm256_cvtps_epi32(x_log2e);
__m256 n = _mm256_cvtepi32_ps(n_int);

// Combine ln2 split into single FMA
// ln2 = 0.6931471805599453
__m256 r = _mm256_fnmadd_ps(n, _mm256_set1_ps(0.6931471805599453f), x);

// Horner's scheme
__m256 c1 = _mm256_set1_ps(1.0f);
__m256 c2 = _mm256_set1_ps(1.0f / 2.0f);
__m256 c3 = _mm256_set1_ps(1.0f / 6.0f);
__m256 c4 = _mm256_set1_ps(1.0f / 24.0f);
__m256 c5 = _mm256_set1_ps(1.0f / 120.0f);

__m256 p = _mm256_fmadd_ps(c5, r, c4);
p = _mm256_fmadd_ps(p, r, c3);
p = _mm256_fmadd_ps(p, r, c2);
p = _mm256_fmadd_ps(p, r, c1);
p = _mm256_fmadd_ps(p, r, c1);

__m256i exp_shift = _mm256_add_epi32(n_int, _mm256_set1_epi32(127));
__m256i exp_shifted = _mm256_slli_epi32(exp_shift, 23);
__m256 exp2n = _mm256_castsi256_ps(exp_shifted);

return _mm256_mul_ps(p, exp2n);
}
inline __m256 exp256_ps(__m256 x) {
// Range reduction: exp(x) = 2^(x * log2(e)) = 2^(n + f)
// Clamp x to avoid underflow
Expand Down Expand Up @@ -133,6 +164,118 @@ inline void softmax_v2(const float *input, float *output, std::size_t n) {
}
}

// ⚡ Thunderbolt: AVX2 Vectorized Softmax with single-FMA range reduction
// Target: AVX2 (Haswell+)
// Reason: Avoids `round_ps` by leveraging `cvtps_epi32` rounding mode, and replaces Estrin's scheme with Horner's.
// Replaces the two-instruction range reduction (r = x - n*ln2_hi - n*ln2_lo) with a single `_mm256_fnmadd_ps` instruction.
// Since x is already shift-reduced relative to the maximum element, the small precision loss from using a single
// ln2 constant doesn't affect the final float precision of softmax up to 1e-4 tolerance.
// Expected gain: ~5-10% over softmax_v5 due to reduced instruction count and latency on the critical path.

inline float reduce_max(__m256 v);
inline float reduce_sum(__m256 v);

inline void softmax_v6(const float *input, float *output, std::size_t n) {
if (n == 0) return;

// 1. Find max
std::size_t i = 0;
__m256 max_v = _mm256_set1_ps(std::numeric_limits<float>::lowest());
__m256 max0 = max_v, max1 = max_v, max2 = max_v, max3 = max_v;

for (; i + 31 < n; i += 32) {
max0 = _mm256_max_ps(max0, _mm256_loadu_ps(input + i));
max1 = _mm256_max_ps(max1, _mm256_loadu_ps(input + i + 8));
max2 = _mm256_max_ps(max2, _mm256_loadu_ps(input + i + 16));
max3 = _mm256_max_ps(max3, _mm256_loadu_ps(input + i + 24));
}
max0 = _mm256_max_ps(max0, max1);
max2 = _mm256_max_ps(max2, max3);
max0 = _mm256_max_ps(max0, max2);
for (; i + 7 < n; i += 8) {
max0 = _mm256_max_ps(max0, _mm256_loadu_ps(input + i));
}
float max_val = reduce_max(max0);
for (; i < n; ++i) max_val = std::max(max_val, input[i]);

__m256 max_vec = _mm256_set1_ps(max_val);

// 2. Compute exp and sum
i = 0;
__m256 sum0 = _mm256_setzero_ps();
__m256 sum1 = _mm256_setzero_ps();
__m256 sum2 = _mm256_setzero_ps();
__m256 sum3 = _mm256_setzero_ps();

for (; i + 31 < n; i += 32) {
__m256 x0 = _mm256_sub_ps(_mm256_loadu_ps(input + i), max_vec);
__m256 x1 = _mm256_sub_ps(_mm256_loadu_ps(input + i + 8), max_vec);
__m256 x2 = _mm256_sub_ps(_mm256_loadu_ps(input + i + 16), max_vec);
__m256 x3 = _mm256_sub_ps(_mm256_loadu_ps(input + i + 24), max_vec);

__m256 e0 = exp256_ps_v3(x0);
__m256 e1 = exp256_ps_v3(x1);
__m256 e2 = exp256_ps_v3(x2);
__m256 e3 = exp256_ps_v3(x3);

_mm256_storeu_ps(output + i, e0);
_mm256_storeu_ps(output + i + 8, e1);
_mm256_storeu_ps(output + i + 16, e2);
_mm256_storeu_ps(output + i + 24, e3);

sum0 = _mm256_add_ps(sum0, e0);
sum1 = _mm256_add_ps(sum1, e1);
sum2 = _mm256_add_ps(sum2, e2);
sum3 = _mm256_add_ps(sum3, e3);
}
sum0 = _mm256_add_ps(sum0, sum1);
sum2 = _mm256_add_ps(sum2, sum3);
sum0 = _mm256_add_ps(sum0, sum2);

for (; i + 7 < n; i += 8) {
__m256 x = _mm256_loadu_ps(input + i);
__m256 e = exp256_ps_v3(_mm256_sub_ps(x, max_vec));
_mm256_storeu_ps(output + i, e);
sum0 = _mm256_add_ps(sum0, e);
}

float sum_val = reduce_sum(sum0);
for (; i < n; ++i) {
float e = std::exp(input[i] - max_val);
output[i] = e;
sum_val += e;
}

if (sum_val == 0.0f) return;

// 3. Normalize
float inv_sum = 1.0f / sum_val;
__m256 inv_sum_v = _mm256_set1_ps(inv_sum);
i = 0;
for (; i + 31 < n; i += 32) {
__m256 o0 = _mm256_loadu_ps(output + i);
__m256 o1 = _mm256_loadu_ps(output + i + 8);
__m256 o2 = _mm256_loadu_ps(output + i + 16);
__m256 o3 = _mm256_loadu_ps(output + i + 24);

__m256 m0 = _mm256_mul_ps(o0, inv_sum_v);
__m256 m1 = _mm256_mul_ps(o1, inv_sum_v);
__m256 m2 = _mm256_mul_ps(o2, inv_sum_v);
__m256 m3 = _mm256_mul_ps(o3, inv_sum_v);

_mm256_storeu_ps(output + i, m0);
_mm256_storeu_ps(output + i + 8, m1);
_mm256_storeu_ps(output + i + 16, m2);
_mm256_storeu_ps(output + i + 24, m3);
}
for (; i + 7 < n; i += 8) {
_mm256_storeu_ps(output + i, _mm256_mul_ps(_mm256_loadu_ps(output + i), inv_sum_v));
}
for (; i < n; ++i) {
output[i] *= inv_sum;
}
}

// ⚡ Thunderbolt: AVX2 Vectorized Softmax with 4x unrolling and instruction interleaving
// Target: AVX2 (Haswell+)
// Reason: Explicit interleaving of loads/subs and exp evaluations breaks FMA latency chains, giving the out-of-order scheduler 4 independent streams.
Expand Down
11 changes: 11 additions & 0 deletions ml_kernels/src/kernel_bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,17 @@ class SoftmaxV5Benchmark : public SoftmaxBenchmark {
};
REGISTER_BENCHMARK(SoftmaxV5Benchmark);

class SoftmaxV6Benchmark : public SoftmaxBenchmark {
public:
const char *name() const override { return "softmax_v6"; }

void run() override {
ml_kernels::softmax_v6(inputs_[current_idx_].data(), outputs_[current_idx_].data(), inputs_[0].size());
current_idx_ = (current_idx_ + 1) % pool_size_;
}
};
REGISTER_BENCHMARK(SoftmaxV6Benchmark);

} // namespace

int main(int argc, char **argv) {
Expand Down
30 changes: 30 additions & 0 deletions ml_kernels/src/test_naive_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,11 +181,41 @@ void test_softmax_v5() {
std::cout << "test_softmax_v5 passed!" << std::endl;
}

void test_softmax_v6() {
std::cout << "Running test_softmax_v6..." << std::endl;
std::vector<float> input = {
-2.0f, -0.5f, 1.0f, 3.0f,
0.0f, 0.0f, 0.0f, 0.0f,
100.0f, 100.0f, -100.0f, -100.0f,
5.0f, -5.0f, 2.0f, -2.0f,
1.1f, 1.2f, 1.3f, 1.4f,
-1.1f, -1.2f, -1.3f, -1.4f,
10.0f, 20.0f, 30.0f, 40.0f,
-10.0f, -20.0f, -30.0f, -40.0f
};

std::vector<float> output_naive(input.size(), 0.0f);
std::vector<float> output_v6(input.size(), 0.0f);

ml_kernels::softmax_naive(input.data(), output_naive.data(), input.size());
ml_kernels::softmax_v6(input.data(), output_v6.data(), input.size());

float sum = 0.0f;
for (std::size_t i = 0; i < input.size(); ++i) {
assert(std::fabs(output_naive[i] - output_v6[i]) < 1e-4f);
sum += output_v6[i];
}
assert(std::fabs(sum - 1.0f) < 1e-4f);

std::cout << "test_softmax_v6 passed!" << std::endl;
}

int main() {
test_relu_naive();
test_max_naive();
test_softmax_v3();
test_softmax_v4();
test_softmax_v5();
test_softmax_v6();
std::cout << "All tests passed successfully!" << std::endl;
}
Loading