From aeee83b2766d30ce627945a53456e8455dee4dd7 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Mon, 8 Dec 2025 14:00:00 +0100
Subject: [PATCH 001/123] Very preliminary attempt to use the miniexpr library

---
 CMakeLists.txt            |    6 +-
 src/blosc2/blosc2_ext.pyx |  248 ++-
 src/blosc2/lazyexpr.py    |   31 +
 src/blosc2/miniexpr.c     | 3343 +++++++++++++++++++++++++++++++++++++
 src/blosc2/miniexpr.h     |  168 ++
 5 files changed, 3787 insertions(+), 9 deletions(-)
 create mode 100755 src/blosc2/miniexpr.c
 create mode 100644 src/blosc2/miniexpr.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 097ae709..f9e65667 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,10 +24,14 @@ add_custom_command(
   DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/blosc2_ext.pyx"
   VERBATIM)
 # ...and add it to the target
-Python_add_library(blosc2_ext MODULE blosc2_ext.c WITH_SOABI)
+Python_add_library(blosc2_ext MODULE blosc2_ext.c
+                   "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/miniexpr.c" WITH_SOABI)
 # We need to link against NumPy
 target_link_libraries(blosc2_ext PRIVATE Python::NumPy)
 
+# Add include directory for miniexpr.h
+target_include_directories(blosc2_ext PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2")
+
 if(DEFINED ENV{USE_SYSTEM_BLOSC2})
     set(USE_SYSTEM_BLOSC2 ON)
 endif()
diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 91276883..c43c6681 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -23,6 +23,7 @@ from cpython cimport (
     PyBytes_FromStringAndSize,
     PyObject_GetBuffer,
 )
+from cpython.ref cimport Py_INCREF, Py_DECREF
 from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_New
 from cython.operator cimport dereference
 from libc.stdint cimport uintptr_t
@@ -492,7 +493,7 @@ cdef extern from "b2nd.h":
     int b2nd_free(b2nd_array_t *array)
     int b2nd_get_slice_cbuffer(b2nd_array_t *array,
                                int64_t *start, int64_t *stop,
-                               void *buffer, int64_t *buffershape, int64_t buffersize)
+                               void *buffer, int64_t *buffershape, int64_t buffersize) nogil
     int b2nd_set_slice_cbuffer(void *buffer, int64_t *buffershape, int64_t buffersize,
                                int64_t *start, int64_t *stop, b2nd_array_t *array)
     int b2nd_get_slice(b2nd_context_t *ctx, b2nd_array_t **array, b2nd_array_t *src, const int64_t *start,
@@ -524,7 +525,56 @@ cdef extern from "b2nd.h":
                           const void *src, const int64_t *src_pad_shape,
                           const int64_t *src_start, const int64_t *src_stop,
                           void *dst, const int64_t *dst_pad_shape,
-                          const int64_t *dst_start);
+                          const int64_t *dst_start) nogil;
+
+
+# miniexpr C API declarations
+cdef extern from "miniexpr.h":
+    ctypedef enum me_dtype:
+        ME_BOOL
+        ME_INT8
+        ME_INT16
+        ME_INT32
+        ME_INT64
+        ME_UINT8
+        ME_UINT16
+        ME_UINT32
+        ME_UINT64
+        ME_FLOAT32
+        ME_FLOAT64
+        ME_COMPLEX64
+        ME_COMPLEX128
+
+    # typedef struct me_variable
+    ctypedef struct me_variable:
+        const char *name
+        const void *address
+        int type
+        void *context
+        me_dtype dtype
+
+    ctypedef struct me_expr:
+        int type
+        double value
+        const double *bound
+        const void *function
+        void *output
+        int nitems
+        me_dtype dtype
+        me_dtype input_dtype
+        void *bytecode
+        int ncode
+        void *parameters[1]
+
+
+    me_expr *me_compile(const char *expression, const me_variable *variables,
+                        int var_count, void *output, int nitems, me_dtype dtype,
+                        int *error) nogil
+
+    void me_eval(const me_expr *n) nogil
+    void me_eval_fused(const me_expr *n) nogil
+    void me_print(const me_expr *n) nogil
+    void me_free(me_expr *n) nogil
 
 
 ctypedef struct user_filters_udata:
@@ -546,6 +596,7 @@ ctypedef struct udf_udata:
     b2nd_array_t *array
     int64_t chunks_in_array[B2ND_MAX_DIM]
     int64_t blocks_in_chunk[B2ND_MAX_DIM]
+    void* miniexpr_handle  # Cached miniexpr compiled expression handle
 
 MAX_TYPESIZE = BLOSC2_MAXTYPESIZE
 MAX_BUFFERSIZE = BLOSC2_MAX_BUFFERSIZE
@@ -770,7 +821,8 @@ cdef _check_cparams(blosc2_cparams *cparams):
             if ufilters[i] and cparams.filters[i] in blosc2.ufilters_registry.keys():
                 raise ValueError("Cannot use multi-threading with user defined Python filters")
 
-        if cparams.prefilter != NULL:
+        if cparams.prefilter != NULL and cparams.prefilter != <blosc2_prefilter_fn>miniexpr_prefilter:
+            # Note: miniexpr_prefilter uses miniexpr C API which is thread-friendly,
             raise ValueError("`nthreads` must be 1 when a prefilter is set")
 
 cdef _check_dparams(blosc2_dparams* dparams, blosc2_cparams* cparams=NULL):
@@ -1667,13 +1719,25 @@ cdef class SChunk:
             raise RuntimeError("Could not create compression context")
 
     cpdef remove_prefilter(self, func_name, _new_ctx=True):
-        if func_name is not None:
+        cdef udf_udata* udf_data
+        cdef user_filters_udata* udata
+
+        if func_name is not None and func_name in blosc2.prefilter_funcs:
             del blosc2.prefilter_funcs[func_name]
 
-        # From Python the preparams->udata with always have the field py_func
-        cdef user_filters_udata * udata = <user_filters_udata *>self.schunk.storage.cparams.preparams.user_data
-        free(udata.py_func)
-        free(self.schunk.storage.cparams.preparams.user_data)
+        # Clean up the miniexpr handle if this is a miniexpr_prefilter
+        if self.schunk.storage.cparams.prefilter == <blosc2_prefilter_fn>miniexpr_prefilter:
+            udf_data = <udf_udata*>self.schunk.storage.cparams.preparams.user_data
+            if udf_data.miniexpr_handle != NULL:
+                Py_DECREF(<object>udf_data.miniexpr_handle)
+            free(udf_data.py_func)
+            free(udf_data)
+        else:
+            # From Python the preparams->udata with always have the field py_func
+            udata = <user_filters_udata*>self.schunk.storage.cparams.preparams.user_data
+            free(udata.py_func)
+            free(udata)
+
         free(self.schunk.storage.cparams.preparams)
         self.schunk.storage.cparams.preparams = NULL
         self.schunk.storage.cparams.prefilter = NULL
@@ -1741,6 +1805,133 @@ cdef int general_filler(blosc2_prefilter_params *params):
     return 0
 
 
+# Aux function for prefilter and postfilter for last expression
+cdef int aux_miniexpr(udf_udata *udata, int64_t nchunk, int32_t nblock,
+                      c_bool is_postfilter, uint8_t *params_output, int32_t typesize):
+    # Declare all C variables at the beginning
+    cdef int64_t chunk_ndim[B2ND_MAX_DIM]
+    cdef int64_t block_ndim[B2ND_MAX_DIM]
+    cdef Py_buffer view
+    cdef int64_t start_ndim[B2ND_MAX_DIM]
+    cdef int64_t stop_ndim[B2ND_MAX_DIM]
+    cdef int64_t[B2ND_MAX_DIM] buffershape_
+    cdef np.npy_intp dims[B2ND_MAX_DIM]
+    cdef b2nd_array_t* ndarr
+    cdef int rc
+    cdef void* miniexpr_handle
+    cdef int n_inputs
+    cdef void** input_arrays
+    cdef int64_t start[B2ND_MAX_DIM]
+    cdef int64_t slice_shape[B2ND_MAX_DIM]
+    cdef int64_t blockshape_int64[B2ND_MAX_DIM]
+    cdef Py_buffer buf
+
+    blosc2_unidim_to_multidim(udata.array.ndim, udata.chunks_in_array, nchunk, chunk_ndim)
+    blosc2_unidim_to_multidim(udata.array.ndim, udata.blocks_in_chunk, nblock, block_ndim)
+    for i in range(udata.array.ndim):
+        start_ndim[i] = chunk_ndim[i] * udata.array.chunkshape[i] + block_ndim[i] * udata.array.blockshape[i]
+
+    padding = False
+    blockshape = []
+    for i in range(udata.array.ndim):
+        if start_ndim[i] + udata.array.blockshape[i] > udata.array.shape[i]:
+            padding = True
+            blockshape.append(udata.array.shape[i] - start_ndim[i])
+            if blockshape[i] <= 0:
+                # This block contains only padding, skip it
+                return 0
+        else:
+            blockshape.append(udata.array.blockshape[i])
+    for i in range(udata.array.ndim):
+        dims[i] = blockshape[i]
+    #print("blockshape ->", blockshape)
+
+    # if padding:
+    #     output = np.empty(blockshape, udata.array.dtype)
+    # else:
+    #     output = np.PyArray_SimpleNewFromData(udata.array.ndim, dims, udata.output_cdtype, <void*>params_output)
+
+    inputs_dict = _ctypes.PyObj_FromPtr(udata.inputs_id)
+    #print("inputs_dict ->", inputs_dict)
+    inputs_slice = {}
+    # Get slice of each operand
+    l = []
+    for i in range(udata.array.ndim):
+        stop_ndim[i] = start_ndim[i] + dims[i]
+        l.append(slice(start_ndim[i], stop_ndim[i]))
+    slices = tuple(l)
+    #print("slices ->", slices)
+    for key, obj in inputs_dict.items():
+        if isinstance(obj, NDArray):
+            # inputs_slice[key] = obj[slices]
+            arr = np.empty(blockshape, dtype=obj.dtype)
+            # inputs_slice[key] = obj.get_slice_numpy(arr, (start_ndim, stop_ndim))
+            # This is *slightly* faster than using get_slice_numpy; my hope is that,
+            # with multithreading enabled, this should go faster.
+            ndarr = <b2nd_array_t*><uintptr_t>obj.c_array
+            PyObject_GetBuffer(arr, &view, PyBUF_SIMPLE)
+            with nogil:
+                for i in range(udata.array.ndim):
+                    buffershape_[i] = stop_ndim[i] - start_ndim[i]
+
+                rc = b2nd_get_slice_cbuffer(ndarr, start_ndim, stop_ndim,
+                                                 <void *> view.buf,
+                                                 buffershape_, view.len)
+            _check_rc(rc, "Error while getting the buffer")
+            PyBuffer_Release(&view)
+            inputs_slice[key] = arr
+
+        elif isinstance(obj, np.ndarray | blosc2.C2Array):
+            inputs_slice[key] = obj[slices]
+        elif np.isscalar(obj):
+            inputs_slice[key] = obj
+        else:
+            raise ValueError("Unsupported operand")
+    #print("inputs_slice ->", inputs_slice)
+
+    # Call miniexpr C API directly
+    func_id = udata.py_func.decode("utf-8")
+    offset = tuple(start_ndim[i] for i in range(udata.array.ndim))
+    cdef int linear_offset = sum(start_ndim) * typesize + nblock * udata.array.sc.blocksize
+
+    # Use miniexpr C API for faster evaluation
+    # Use the cached handle from udata (set during _set_pref_expr)
+    # This allows multi-threading since all threads share the same handle
+    miniexpr_handle = udata.miniexpr_handle
+    if miniexpr_handle != NULL:
+        # Get the variable names order from the compiled expression
+        compiled_ex = <object>miniexpr_handle
+        input_names = compiled_ex.input_names  # tuple of variable names in order
+
+        # Build list of input arrays in the correct order
+        n_inputs = len(input_names)
+        input_list = []
+        for i in range(n_inputs):
+            var_name = input_names[i]
+            input_list.append(inputs_slice[var_name])
+
+        # Convert to array of void pointers (PyObject*)
+        input_arrays = <void**>malloc(n_inputs * sizeof(void*))
+        try:
+            for i in range(n_inputs):
+                input_arrays[i] = <void*>input_list[i]
+
+            # XXX Call numexpr C API  XXXX
+            # output = numexpr_run_compiled_simple(miniexpr_handle, input_arrays, n_inputs)
+            # Call miniexpr C API
+            # me_eval_expr(miniexpr_handle, input_arrays, n_inputs, <void*>output, typesize)
+        finally:
+            free(input_arrays)
+    else:
+        # Fallback to Python callback if C API not available
+        if is_postfilter:
+            output = blosc2.postfilter_funcs[func_id](inputs_slice)
+        else:
+            output = blosc2.prefilter_funcs[func_id](inputs_slice)
+
+    return 0
+
+
 # Aux function for prefilter and postfilter udf
 cdef int aux_udf(udf_udata *udata, int64_t nchunk, int32_t nblock,
                  c_bool is_postfilter, uint8_t *params_output, int32_t typesize):
@@ -1814,6 +2005,11 @@ cdef int aux_udf(udf_udata *udata, int64_t nchunk, int32_t nblock,
     return 0
 
 
+cdef int miniexpr_prefilter(blosc2_prefilter_params *params):
+    cdef udf_udata *udata = <udf_udata *> params.user_data
+    return aux_miniexpr(udata, params.nchunk, params.nblock, False, params.output, params.output_typesize)
+
+
 cdef int general_udf_prefilter(blosc2_prefilter_params *params):
     cdef udf_udata *udata = <udf_udata *> params.user_data
     return aux_udf(udata, params.nchunk, params.nblock, False, params.output, params.output_typesize)
@@ -2361,6 +2557,10 @@ cdef class NDArray:
         self.array = <b2nd_array_t *> PyCapsule_GetPointer(array, <char *> "b2nd_array_t*")
         self.base = base # add reference to base if NDArray is a view
 
+    @property
+    def c_array(self):
+        return <uintptr_t> self.array
+
     @property
     def shape(self) -> tuple[int]:
         return tuple([self.array.shape[i] for i in range(self.array.ndim)])
@@ -2621,6 +2821,38 @@ cdef class NDArray:
 
         return udata
 
+    def _set_pref_expr(self, func, expression, inputs_id):
+        # Support both function objects and string identifiers
+        if isinstance(func, str):
+            func_id = func
+            # No need to register in prefilter_funcs - C API will be used directly
+        else:
+            func_id = func.__name__
+            blosc2.prefilter_funcs[func_id] = func
+        func_id = func_id.encode("utf-8") if isinstance(func_id, str) else func_id
+
+        # Set prefilter
+        cdef blosc2_cparams* cparams = self.array.sc.storage.cparams
+        cparams.prefilter = <blosc2_prefilter_fn> miniexpr_prefilter
+
+        cdef blosc2_prefilter_params* preparams = <blosc2_prefilter_params *> malloc(sizeof(blosc2_prefilter_params))
+        cdef udf_udata* udata = self._fill_udf_udata(func_id, inputs_id)
+
+        # XXX Get the compiled expression handle for multi-threading
+        # udata.miniexpr_handle = me_compile(expression, inputs_id)
+        # # Increment reference count to keep the expression alive across threads
+        # if udata.miniexpr_handle != NULL:
+        #     Py_INCREF(<object>udata.miniexpr_handle)
+
+        preparams.user_data = udata
+        cparams.preparams = preparams
+        _check_cparams(cparams)
+
+        blosc2_free_ctx(self.array.sc.cctx)
+        self.array.sc.cctx = blosc2_create_cctx(dereference(cparams))
+        if self.array.sc.cctx == NULL:
+            raise RuntimeError("Could not create compression context")
+
     def _set_pref_udf(self, func, inputs_id):
         if self.array.sc.storage.cparams.nthreads > 1:
             raise AttributeError("compress `nthreads` must be 1 when assigning a prefilter")
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 7613c9f2..a0974c96 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1261,6 +1261,35 @@ def fast_eval(  # noqa: C901
         # WebAssembly does not support threading, so we cannot use the iter_disk option
         iter_disk = False
 
+    if True:
+        cparams = kwargs.pop("cparams", blosc2.CParams())
+        # Force single-threaded execution for prefilter evaluation
+        # The prefilter callback accesses Python objects which aren't thread-safe
+        # across blosc2's C threads. numexpr does its own multi-threading internally.
+        if cparams.nthreads > 1:
+            prev_nthreads = cparams.nthreads
+            cparams.nthreads = 1
+        res_eval = blosc2.empty(shape, dtype, cparams=cparams, **kwargs)
+        # Validate expression so that it will be cached in numexpr
+        # numexpr.validate(expression, local_dict=operands)
+        # Register a prefilter for last expression using C API
+        # We use a placeholder function name since the actual evaluation
+        # is done directly via numexpr C API in blosc2_ext.pyx
+        # func_name = "numexpr_last_compiled"
+        # res_eval._set_pref_expr(func_name, id(operands))
+        func_name = "miniexpr"
+        res_eval._set_pref_expr(func_name, expression, id(operands))
+
+        # This line would NOT allocate physical RAM on any modern OS:
+        aux = np.empty(res_eval.shape, res_eval.dtype)
+        # Physical allocation happens here (when writing):
+        res_eval[...] = aux
+        res_eval.schunk.remove_prefilter(func_name)
+        if cparams.nthreads > 1:
+            res_eval.schunk.cparams.nthreads = prev_nthreads
+
+        return res_eval
+
     chunk_operands = {}
     # Check which chunks intersect with _slice
     all_chunks = get_intersecting_chunks((), shape, chunks)  # if _slice is (), returns all chunks
@@ -3363,7 +3392,9 @@ def compute(self, item=(), **kwargs):
         # # Register a prefilter for eval
         # res_eval._set_pref_udf(self.func, id(self.inputs))
 
+        # This line would NOT allocate physical RAM on any modern OS:
         # aux = np.empty(res_eval.shape, res_eval.dtype)
+        # Physical allocation happens here (when writing):
         # res_eval[...] = aux
         # res_eval.schunk.remove_prefilter(self.func.__name__)
         # res_eval.schunk.cparams.nthreads = self._cnthreads
diff --git a/src/blosc2/miniexpr.c b/src/blosc2/miniexpr.c
new file mode 100755
index 00000000..7185bb80
--- /dev/null
+++ b/src/blosc2/miniexpr.c
@@ -0,0 +1,3343 @@
+/*********************************************************************
+  Blosc - Blocked Shuffling and Compression Library
+
+  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
+  https://blosc.org
+  License: BSD 3-Clause (see LICENSE.txt)
+
+  See LICENSE.txt for details about copyright and rights to use.
+**********************************************************************/
+
+// Loosely based on https://github.com/CodePlea/tinyexpr. License follows:
+// SPDX-License-Identifier: Zlib
+/*
+ * TINYEXPR - Tiny recursive descent parser and evaluation engine in C
+ *
+ * Copyright (c) 2015-2020 Lewis Van Winkle
+ *
+ * http://CodePlea.com
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgement in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* COMPILE TIME OPTIONS */
+
+/* Exponentiation associativity:
+For a**b**c = (a**b)**c and -a**b = (-a)**b do nothing.
+For a**b**c = a**(b**c) and -a**b = -(a**b) uncomment the next line.*/
+/* #define ME_POW_FROM_RIGHT */
+
+/* Logarithms
+For log = base 10 log do nothing
+For log = natural log uncomment the next line. */
+/* #define ME_NAT_LOG */
+
+#include "miniexpr.h"
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <complex.h>
+
+#ifndef NAN
+#define NAN (0.0/0.0)
+#endif
+
+#ifndef INFINITY
+#define INFINITY (1.0/0.0)
+#endif
+
+
+typedef double (*me_fun2)(double, double);
+
+enum {
+    TOK_NULL = ME_CLOSURE7 + 1, TOK_ERROR, TOK_END, TOK_SEP,
+    TOK_OPEN, TOK_CLOSE, TOK_NUMBER, TOK_VARIABLE, TOK_INFIX,
+    TOK_BITWISE, TOK_SHIFT, TOK_COMPARE, TOK_POW
+};
+
+
+/* Type promotion table following NumPy rules */
+static const me_dtype type_promotion_table[13][13] = {
+    /* Rows: left operand, Columns: right operand */
+    /* BOOL,  INT8,    INT16,   INT32,   INT64,   UINT8,   UINT16,  UINT32,  UINT64,  FLOAT32, FLOAT64, COMPLEX64, COMPLEX128 */
+    {
+        ME_BOOL, ME_INT8, ME_INT16, ME_INT32, ME_INT64, ME_UINT8, ME_UINT16, ME_UINT32, ME_UINT64, ME_FLOAT32,
+        ME_FLOAT64, ME_COMPLEX64, ME_COMPLEX128
+    }, /* BOOL */
+    {
+        ME_INT8, ME_INT8, ME_INT16, ME_INT32, ME_INT64, ME_INT16, ME_INT32, ME_INT64, ME_FLOAT64, ME_FLOAT32,
+        ME_FLOAT64, ME_COMPLEX64, ME_COMPLEX128
+    }, /* INT8 */
+    {
+        ME_INT16, ME_INT16, ME_INT16, ME_INT32, ME_INT64, ME_INT32, ME_INT32, ME_INT64, ME_FLOAT64, ME_FLOAT32,
+        ME_FLOAT64, ME_COMPLEX64, ME_COMPLEX128
+    }, /* INT16 */
+    {
+        ME_INT32, ME_INT32, ME_INT32, ME_INT32, ME_INT64, ME_INT64, ME_INT64, ME_INT64, ME_FLOAT64, ME_FLOAT64,
+        ME_FLOAT64, ME_COMPLEX128, ME_COMPLEX128
+    }, /* INT32 */
+    {
+        ME_INT64, ME_INT64, ME_INT64, ME_INT64, ME_INT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64,
+        ME_FLOAT64, ME_COMPLEX128, ME_COMPLEX128
+    }, /* INT64 */
+    {
+        ME_UINT8, ME_INT16, ME_INT32, ME_INT64, ME_FLOAT64, ME_UINT8, ME_UINT16, ME_UINT32, ME_UINT64, ME_FLOAT32,
+        ME_FLOAT64, ME_COMPLEX64, ME_COMPLEX128
+    }, /* UINT8 */
+    {
+        ME_UINT16, ME_INT32, ME_INT32, ME_INT64, ME_FLOAT64, ME_UINT16, ME_UINT16, ME_UINT32, ME_UINT64, ME_FLOAT32,
+        ME_FLOAT64, ME_COMPLEX64, ME_COMPLEX128
+    }, /* UINT16 */
+    {
+        ME_UINT32, ME_INT64, ME_INT64, ME_INT64, ME_FLOAT64, ME_UINT32, ME_UINT32, ME_UINT32, ME_UINT64, ME_FLOAT64,
+        ME_FLOAT64, ME_COMPLEX128, ME_COMPLEX128
+    }, /* UINT32 */
+    {
+        ME_UINT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64, ME_UINT64, ME_UINT64, ME_UINT64, ME_UINT64,
+        ME_FLOAT64, ME_FLOAT64, ME_COMPLEX128, ME_COMPLEX128
+    }, /* UINT64 */
+    {
+        ME_FLOAT32, ME_FLOAT32, ME_FLOAT32, ME_FLOAT64, ME_FLOAT64, ME_FLOAT32, ME_FLOAT32, ME_FLOAT64, ME_FLOAT64,
+        ME_FLOAT32, ME_FLOAT64, ME_COMPLEX64, ME_COMPLEX128
+    }, /* FLOAT32 */
+    {
+        ME_FLOAT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64,
+        ME_FLOAT64, ME_FLOAT64, ME_COMPLEX128, ME_COMPLEX128
+    }, /* FLOAT64 */
+    {
+        ME_COMPLEX64, ME_COMPLEX64, ME_COMPLEX64, ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX64, ME_COMPLEX64,
+        ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX64, ME_COMPLEX128, ME_COMPLEX64, ME_COMPLEX128
+    }, /* COMPLEX64 */
+    {
+        ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX128,
+        ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX128
+    } /* COMPLEX128 */
+};
+
+/* Promote two types according to NumPy rules */
+static me_dtype promome_types(me_dtype a, me_dtype b) {
+    if (a >= 0 && a < 13 && b >= 0 && b < 13) {
+        return type_promotion_table[a][b];
+    }
+    return ME_FLOAT64; // Fallback
+}
+
+/* Get size of a type in bytes */
+static size_t dtype_size(me_dtype dtype) {
+    switch (dtype) {
+        case ME_BOOL: return sizeof(bool);
+        case ME_INT8: return sizeof(int8_t);
+        case ME_INT16: return sizeof(int16_t);
+        case ME_INT32: return sizeof(int32_t);
+        case ME_INT64: return sizeof(int64_t);
+        case ME_UINT8: return sizeof(uint8_t);
+        case ME_UINT16: return sizeof(uint16_t);
+        case ME_UINT32: return sizeof(uint32_t);
+        case ME_UINT64: return sizeof(uint64_t);
+        case ME_FLOAT32: return sizeof(float);
+        case ME_FLOAT64: return sizeof(double);
+        case ME_COMPLEX64: return sizeof(float complex);
+        case ME_COMPLEX128: return sizeof(double complex);
+        default: return 0;
+    }
+}
+
+
+enum { ME_CONSTANT = 1 };
+
+
+typedef struct state {
+    const char *start;
+    const char *next;
+    int type;
+
+    union {
+        double value;
+        const double *bound;
+        const void *function;
+    };
+
+    void *context;
+    me_dtype dtype; // Type of current token
+    me_dtype target_dtype; // Target dtype for the overall expression
+
+    const me_variable *lookup;
+    int lookup_len;
+} state;
+
+
+#define TYPE_MASK(TYPE) ((TYPE)&0x0000001F)
+
+#define IS_PURE(TYPE) (((TYPE) & ME_FLAG_PURE) != 0)
+#define IS_FUNCTION(TYPE) (((TYPE) & ME_FUNCTION0) != 0)
+#define IS_CLOSURE(TYPE) (((TYPE) & ME_CLOSURE0) != 0)
+#define ARITY(TYPE) ( ((TYPE) & (ME_FUNCTION0 | ME_CLOSURE0)) ? ((TYPE) & 0x00000007) : 0 )
+#define NEW_EXPR(type, ...) new_expr((type), (const me_expr*[]){__VA_ARGS__})
+#define CHECK_NULL(ptr, ...) if ((ptr) == NULL) { __VA_ARGS__; return NULL; }
+
+/* Forward declaration */
+static me_expr *new_expr(const int type, const me_expr *parameters[]);
+
+/* Infer result type from expression tree */
+static me_dtype infer_result_type(const me_expr *n) {
+    if (!n) return ME_FLOAT64;
+
+    switch (TYPE_MASK(n->type)) {
+        case ME_CONSTANT:
+            return n->dtype;
+
+        case ME_VARIABLE:
+            return n->dtype;
+
+        case ME_FUNCTION0:
+        case ME_FUNCTION1:
+        case ME_FUNCTION2:
+        case ME_FUNCTION3:
+        case ME_FUNCTION4:
+        case ME_FUNCTION5:
+        case ME_FUNCTION6:
+        case ME_FUNCTION7:
+        case ME_CLOSURE0:
+        case ME_CLOSURE1:
+        case ME_CLOSURE2:
+        case ME_CLOSURE3:
+        case ME_CLOSURE4:
+        case ME_CLOSURE5:
+        case ME_CLOSURE6:
+        case ME_CLOSURE7: {
+            const int arity = ARITY(n->type);
+            me_dtype result = ME_BOOL;
+
+            for (int i = 0; i < arity; i++) {
+                me_dtype param_type = infer_result_type((const me_expr *) n->parameters[i]);
+                result = promome_types(result, param_type);
+            }
+
+            return result;
+        }
+    }
+
+    return ME_FLOAT64;
+}
+
+/* Apply type promotion to a binary operation node */
+static me_expr *creame_conversion_node(me_expr *source, me_dtype target_dtype) {
+    /* Create a unary conversion node that converts source to target_dtype */
+    me_expr *conv = NEW_EXPR(ME_FUNCTION1 | ME_FLAG_PURE, source);
+    if (conv) {
+        conv->function = NULL; // Mark as conversion
+        conv->dtype = target_dtype;
+        conv->input_dtype = source->dtype;
+    }
+    return conv;
+}
+
+static void apply_type_promotion(me_expr *node) {
+    if (!node || ARITY(node->type) < 2) return;
+
+    me_expr *left = (me_expr *) node->parameters[0];
+    me_expr *right = (me_expr *) node->parameters[1];
+
+    if (left && right) {
+        me_dtype left_type = left->dtype;
+        me_dtype right_type = right->dtype;
+        me_dtype promoted = promome_types(left_type, right_type);
+
+        // Store the promoted output type
+        node->dtype = promoted;
+
+        // TODO: Conversion nodes not fully implemented yet
+        // See TYPE_PROMOTION_IMPLEMENTATION.md for details
+        /*
+        // Insert conversion nodes if needed
+        if (left_type != promoted) {
+            me_expr *conv_left = creame_conversion_node(left, promoted);
+            if (conv_left) {
+                node->parameters[0] = conv_left;
+            }
+        }
+
+        if (right_type != promoted) {
+            me_expr *conv_right = creame_conversion_node(right, promoted);
+            if (conv_right) {
+                node->parameters[1] = conv_right;
+            }
+        }
+        */
+    }
+}
+
+static me_expr *new_expr(const int type, const me_expr *parameters[]) {
+    const int arity = ARITY(type);
+    const int psize = sizeof(void *) * arity;
+    const int size = (sizeof(me_expr) - sizeof(void *)) + psize + (IS_CLOSURE(type) ? sizeof(void *) : 0);
+    me_expr *ret = malloc(size);
+    CHECK_NULL(ret);
+
+    memset(ret, 0, size);
+    if (arity && parameters) {
+        memcpy(ret->parameters, parameters, psize);
+    }
+    ret->type = type;
+    ret->bound = 0;
+    ret->output = NULL;
+    ret->nitems = 0;
+    ret->dtype = ME_FLOAT64; // Default to double
+    ret->bytecode = NULL;
+    ret->ncode = 0;
+    return ret;
+}
+
+
+void me_free_parameters(me_expr *n) {
+    if (!n) return;
+    switch (TYPE_MASK(n->type)) {
+        case ME_FUNCTION7:
+        case ME_CLOSURE7:
+            if (n->parameters[6] && ((me_expr *) n->parameters[6])->output &&
+                ((me_expr *) n->parameters[6])->output != n->output) {
+                free(((me_expr *) n->parameters[6])->output);
+            }
+            me_free(n->parameters[6]);
+        case ME_FUNCTION6:
+        case ME_CLOSURE6:
+            if (n->parameters[5] && ((me_expr *) n->parameters[5])->output &&
+                ((me_expr *) n->parameters[5])->output != n->output) {
+                free(((me_expr *) n->parameters[5])->output);
+            }
+            me_free(n->parameters[5]);
+        case ME_FUNCTION5:
+        case ME_CLOSURE5:
+            if (n->parameters[4] && ((me_expr *) n->parameters[4])->output &&
+                ((me_expr *) n->parameters[4])->output != n->output) {
+                free(((me_expr *) n->parameters[4])->output);
+            }
+            me_free(n->parameters[4]);
+        case ME_FUNCTION4:
+        case ME_CLOSURE4:
+            if (n->parameters[3] && ((me_expr *) n->parameters[3])->output &&
+                ((me_expr *) n->parameters[3])->output != n->output) {
+                free(((me_expr *) n->parameters[3])->output);
+            }
+            me_free(n->parameters[3]);
+        case ME_FUNCTION3:
+        case ME_CLOSURE3:
+            if (n->parameters[2] && ((me_expr *) n->parameters[2])->output &&
+                ((me_expr *) n->parameters[2])->output != n->output) {
+                free(((me_expr *) n->parameters[2])->output);
+            }
+            me_free(n->parameters[2]);
+        case ME_FUNCTION2:
+        case ME_CLOSURE2:
+            if (n->parameters[1] && ((me_expr *) n->parameters[1])->output &&
+                ((me_expr *) n->parameters[1])->output != n->output) {
+                free(((me_expr *) n->parameters[1])->output);
+            }
+            me_free(n->parameters[1]);
+        case ME_FUNCTION1:
+        case ME_CLOSURE1:
+            if (n->parameters[0] && ((me_expr *) n->parameters[0])->output &&
+                ((me_expr *) n->parameters[0])->output != n->output) {
+                free(((me_expr *) n->parameters[0])->output);
+            }
+            me_free(n->parameters[0]);
+    }
+}
+
+
+void me_free(me_expr *n) {
+    if (!n) return;
+    me_free_parameters(n);
+    if (n->bytecode) {
+        free(n->bytecode);
+    }
+    free(n);
+}
+
+
+static double pi(void) { return 3.14159265358979323846; }
+static double e(void) { return 2.71828182845904523536; }
+
+static double fac(double a) {
+    /* simplest version of fac */
+    if (a < 0.0)
+        return NAN;
+    if (a > UINT_MAX)
+        return INFINITY;
+    unsigned int ua = (unsigned int) (a);
+    unsigned long int result = 1, i;
+    for (i = 1; i <= ua; i++) {
+        if (i > ULONG_MAX / result)
+            return INFINITY;
+        result *= i;
+    }
+    return (double) result;
+}
+
+static double ncr(double n, double r) {
+    if (n < 0.0 || r < 0.0 || n < r) return NAN;
+    if (n > UINT_MAX || r > UINT_MAX) return INFINITY;
+    unsigned long int un = (unsigned int) (n), ur = (unsigned int) (r), i;
+    unsigned long int result = 1;
+    if (ur > un / 2) ur = un - ur;
+    for (i = 1; i <= ur; i++) {
+        if (result > ULONG_MAX / (un - ur + i))
+            return INFINITY;
+        result *= un - ur + i;
+        result /= i;
+    }
+    return result;
+}
+
+static double npr(double n, double r) { return ncr(n, r) * fac(r); }
+
+#ifdef _MSC_VER
+#pragma function (ceil)
+#pragma function (floor)
+#endif
+
+static const me_variable functions[] = {
+    /* must be in alphabetical order */
+    {"abs", fabs, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"acos", acos, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"asin", asin, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"atan", atan, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"atan2", atan2, ME_FUNCTION2 | ME_FLAG_PURE, 0},
+    {"ceil", ceil, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"cos", cos, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"cosh", cosh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"e", e, ME_FUNCTION0 | ME_FLAG_PURE, 0},
+    {"exp", exp, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"fac", fac, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"floor", floor, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"ln", log, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+#ifdef ME_NAT_LOG
+    {"log", log, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+#else
+    {"log", log10, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+#endif
+    {"log10", log10, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"ncr", ncr, ME_FUNCTION2 | ME_FLAG_PURE, 0},
+    {"npr", npr, ME_FUNCTION2 | ME_FLAG_PURE, 0},
+    {"pi", pi, ME_FUNCTION0 | ME_FLAG_PURE, 0},
+    {"pow", pow, ME_FUNCTION2 | ME_FLAG_PURE, 0},
+    {"sin", sin, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"sinh", sinh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"sqrt", sqrt, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"tan", tan, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"tanh", tanh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {0, 0, 0, 0}
+};
+
+static const me_variable *find_builtin(const char *name, int len) {
+    int imin = 0;
+    int imax = sizeof(functions) / sizeof(me_variable) - 2;
+
+    /*Binary search.*/
+    while (imax >= imin) {
+        const int i = (imin + ((imax - imin) / 2));
+        int c = strncmp(name, functions[i].name, len);
+        if (!c) c = '\0' - functions[i].name[len];
+        if (c == 0) {
+            return functions + i;
+        } else if (c > 0) {
+            imin = i + 1;
+        } else {
+            imax = i - 1;
+        }
+    }
+
+    return 0;
+}
+
+static const me_variable *find_lookup(const state *s, const char *name, int len) {
+    int iters;
+    const me_variable *var;
+    if (!s->lookup) return 0;
+
+    for (var = s->lookup, iters = s->lookup_len; iters; ++var, --iters) {
+        if (strncmp(name, var->name, len) == 0 && var->name[len] == '\0') {
+            return var;
+        }
+    }
+    return 0;
+}
+
+
+static double add(double a, double b) { return a + b; }
+static double sub(double a, double b) { return a - b; }
+static double mul(double a, double b) { return a * b; }
+static double divide(double a, double b) { return a / b; }
+static double negate(double a) { return -a; }
+
+static double comma(double a, double b) {
+    (void) a;
+    return b;
+}
+
+/* Bitwise operators (for integer types) */
+static double bit_and(double a, double b) { return (double) ((int64_t) a & (int64_t) b); }
+static double bit_or(double a, double b) { return (double) ((int64_t) a | (int64_t) b); }
+static double bit_xor(double a, double b) { return (double) ((int64_t) a ^ (int64_t) b); }
+static double bit_not(double a) { return (double) (~(int64_t) a); }
+static double bit_shl(double a, double b) { return (double) ((int64_t) a << (int64_t) b); }
+static double bit_shr(double a, double b) { return (double) ((int64_t) a >> (int64_t) b); }
+
+/* Comparison operators (return 1.0 for true, 0.0 for false) */
+static double cmp_eq(double a, double b) { return a == b ? 1.0 : 0.0; }
+static double cmp_ne(double a, double b) { return a != b ? 1.0 : 0.0; }
+static double cmp_lt(double a, double b) { return a < b ? 1.0 : 0.0; }
+static double cmp_le(double a, double b) { return a <= b ? 1.0 : 0.0; }
+static double cmp_gt(double a, double b) { return a > b ? 1.0 : 0.0; }
+static double cmp_ge(double a, double b) { return a >= b ? 1.0 : 0.0; }
+
+/* Logical operators (for bool type) - short-circuit via OR/AND */
+static double logical_and(double a, double b) { return ((int) a) && ((int) b) ? 1.0 : 0.0; }
+static double logical_or(double a, double b) { return ((int) a) || ((int) b) ? 1.0 : 0.0; }
+static double logical_not(double a) { return !(int) a ? 1.0 : 0.0; }
+static double logical_xor(double a, double b) { return ((int) a) != ((int) b) ? 1.0 : 0.0; }
+
+
+void next_token(state *s) {
+    s->type = TOK_NULL;
+
+    do {
+        if (!*s->next) {
+            s->type = TOK_END;
+            return;
+        }
+
+        /* Try reading a number. */
+        if ((s->next[0] >= '0' && s->next[0] <= '9') || s->next[0] == '.') {
+            s->value = strtod(s->next, (char **) &s->next);
+            s->type = TOK_NUMBER;
+        } else {
+            /* Look for a variable or builtin function call. */
+            if (isalpha(s->next[0])) {
+                const char *start;
+                start = s->next;
+                while (isalpha(s->next[0]) || isdigit(s->next[0]) || (s->next[0] == '_')) s->next++;
+
+                const me_variable *var = find_lookup(s, start, s->next - start);
+                if (!var) var = find_builtin(start, s->next - start);
+
+                if (!var) {
+                    s->type = TOK_ERROR;
+                } else {
+                    switch (TYPE_MASK(var->type)) {
+                        case ME_VARIABLE:
+                            s->type = TOK_VARIABLE;
+                            s->bound = var->address;
+                            s->dtype = var->dtype; // Store the variable's type
+                            break;
+
+                        case ME_CLOSURE0:
+                        case ME_CLOSURE1:
+                        case ME_CLOSURE2:
+                        case ME_CLOSURE3: /* Falls through. */
+                        case ME_CLOSURE4:
+                        case ME_CLOSURE5:
+                        case ME_CLOSURE6:
+                        case ME_CLOSURE7: /* Falls through. */
+                            s->context = var->context; /* Falls through. */
+
+                        case ME_FUNCTION0:
+                        case ME_FUNCTION1:
+                        case ME_FUNCTION2:
+                        case ME_FUNCTION3: /* Falls through. */
+                        case ME_FUNCTION4:
+                        case ME_FUNCTION5:
+                        case ME_FUNCTION6:
+                        case ME_FUNCTION7: /* Falls through. */
+                            s->type = var->type;
+                            s->function = var->address;
+                            break;
+                    }
+                }
+            } else {
+                /* Look for an operator or special character. */
+                char c = s->next[0];
+                char next_c = s->next[1];
+
+                /* Multi-character operators */
+                if (c == '*' && next_c == '*') {
+                    s->type = TOK_POW;
+                    s->function = (const void *) pow;
+                    s->next += 2;
+                } else if (c == '<' && next_c == '<') {
+                    s->type = TOK_SHIFT;
+                    s->function = bit_shl;
+                    s->next += 2;
+                } else if (c == '>' && next_c == '>') {
+                    s->type = TOK_SHIFT;
+                    s->function = bit_shr;
+                    s->next += 2;
+                } else if (c == '=' && next_c == '=') {
+                    s->type = TOK_COMPARE;
+                    s->function = cmp_eq;
+                    s->next += 2;
+                } else if (c == '!' && next_c == '=') {
+                    s->type = TOK_COMPARE;
+                    s->function = cmp_ne;
+                    s->next += 2;
+                } else if (c == '<' && next_c == '=') {
+                    s->type = TOK_COMPARE;
+                    s->function = cmp_le;
+                    s->next += 2;
+                } else if (c == '>' && next_c == '=') {
+                    s->type = TOK_COMPARE;
+                    s->function = cmp_ge;
+                    s->next += 2;
+                } else {
+                    /* Single-character operators */
+                    s->next++;
+                    switch (c) {
+                        case '+': s->type = TOK_INFIX;
+                            s->function = add;
+                            break;
+                        case '-': s->type = TOK_INFIX;
+                            s->function = sub;
+                            break;
+                        case '*': s->type = TOK_INFIX;
+                            s->function = mul;
+                            break;
+                        case '/': s->type = TOK_INFIX;
+                            s->function = divide;
+                            break;
+                        case '%': s->type = TOK_INFIX;
+                            s->function = fmod;
+                            break;
+                        case '&': s->type = TOK_BITWISE;
+                            s->function = bit_and;
+                            break;
+                        case '|': s->type = TOK_BITWISE;
+                            s->function = bit_or;
+                            break;
+                        case '^': s->type = TOK_BITWISE;
+                            s->function = bit_xor;
+                            break; /* XOR for ints/bools */
+                        case '~': s->type = TOK_BITWISE;
+                            s->function = bit_not;
+                            break;
+                        case '<': s->type = TOK_COMPARE;
+                            s->function = cmp_lt;
+                            break;
+                        case '>': s->type = TOK_COMPARE;
+                            s->function = cmp_gt;
+                            break;
+                        case '(': s->type = TOK_OPEN;
+                            break;
+                        case ')': s->type = TOK_CLOSE;
+                            break;
+                        case ',': s->type = TOK_SEP;
+                            break;
+                        case ' ':
+                        case '\t':
+                        case '\n':
+                        case '\r': s->type = TOK_NULL;
+                            break;
+                        default: s->type = TOK_ERROR;
+                            break;
+                    }
+                }
+            }
+        }
+    } while (s->type == TOK_NULL);
+}
+
+
+static me_expr *list(state *s);
+
+static me_expr *expr(state *s);
+
+static me_expr *power(state *s);
+
+static me_expr *shift_expr(state *s);
+
+static me_expr *bitwise_and(state *s);
+
+static me_expr *bitwise_xor(state *s);
+
+static me_expr *bitwise_or(state *s);
+
+static me_expr *comparison(state *s);
+
+
+static me_expr *base(state *s) {
+    /* <base>      =    <constant> | <variable> | <function-0> {"(" ")"} | <function-1> <power> | <function-X> "(" <expr> {"," <expr>} ")" | "(" <list> ")" */
+    me_expr *ret;
+    int arity;
+
+    switch (TYPE_MASK(s->type)) {
+        case TOK_NUMBER:
+            ret = new_expr(ME_CONSTANT, 0);
+            CHECK_NULL(ret);
+
+            ret->value = s->value;
+            ret->dtype = s->target_dtype; // Use target dtype for constants
+            next_token(s);
+            break;
+
+        case TOK_VARIABLE:
+            ret = new_expr(ME_VARIABLE, 0);
+            CHECK_NULL(ret);
+
+            ret->bound = s->bound;
+            ret->dtype = s->dtype; // Set the variable's type
+            ret->input_dtype = s->dtype;
+            next_token(s);
+            break;
+
+        case ME_FUNCTION0:
+        case ME_CLOSURE0:
+            ret = new_expr(s->type, 0);
+            CHECK_NULL(ret);
+
+            ret->function = s->function;
+            if (IS_CLOSURE(s->type)) ret->parameters[0] = s->context;
+            next_token(s);
+            if (s->type == TOK_OPEN) {
+                next_token(s);
+                if (s->type != TOK_CLOSE) {
+                    s->type = TOK_ERROR;
+                } else {
+                    next_token(s);
+                }
+            }
+            break;
+
+        case ME_FUNCTION1:
+        case ME_CLOSURE1:
+            ret = new_expr(s->type, 0);
+            CHECK_NULL(ret);
+
+            ret->function = s->function;
+            if (IS_CLOSURE(s->type)) ret->parameters[1] = s->context;
+            next_token(s);
+            ret->parameters[0] = power(s);
+            CHECK_NULL(ret->parameters[0], me_free(ret));
+            break;
+
+        case ME_FUNCTION2:
+        case ME_FUNCTION3:
+        case ME_FUNCTION4:
+        case ME_FUNCTION5:
+        case ME_FUNCTION6:
+        case ME_FUNCTION7:
+        case ME_CLOSURE2:
+        case ME_CLOSURE3:
+        case ME_CLOSURE4:
+        case ME_CLOSURE5:
+        case ME_CLOSURE6:
+        case ME_CLOSURE7:
+            arity = ARITY(s->type);
+
+            ret = new_expr(s->type, 0);
+            CHECK_NULL(ret);
+
+            ret->function = s->function;
+            if (IS_CLOSURE(s->type)) ret->parameters[arity] = s->context;
+            next_token(s);
+
+            if (s->type != TOK_OPEN) {
+                s->type = TOK_ERROR;
+            } else {
+                int i;
+                for (i = 0; i < arity; i++) {
+                    next_token(s);
+                    ret->parameters[i] = expr(s);
+                    CHECK_NULL(ret->parameters[i], me_free(ret));
+
+                    if (s->type != TOK_SEP) {
+                        break;
+                    }
+                }
+                if (s->type != TOK_CLOSE || i != arity - 1) {
+                    s->type = TOK_ERROR;
+                } else {
+                    next_token(s);
+                }
+            }
+
+            break;
+
+        case TOK_OPEN:
+            next_token(s);
+            ret = list(s);
+            CHECK_NULL(ret);
+
+            if (s->type != TOK_CLOSE) {
+                s->type = TOK_ERROR;
+            } else {
+                next_token(s);
+            }
+            break;
+
+        default:
+            ret = new_expr(0, 0);
+            CHECK_NULL(ret);
+
+            s->type = TOK_ERROR;
+            ret->value = NAN;
+            break;
+    }
+
+    return ret;
+}
+
+
+static me_expr *power(state *s) {
+    /* <power>     =    {("-" | "+")} <base> */
+    int sign = 1;
+    while (s->type == TOK_INFIX && (s->function == add || s->function == sub)) {
+        if (s->function == sub) sign = -sign;
+        next_token(s);
+    }
+
+    me_expr *ret;
+
+    if (sign == 1) {
+        ret = base(s);
+    } else {
+        me_expr *b = base(s);
+        CHECK_NULL(b);
+
+        ret = NEW_EXPR(ME_FUNCTION1 | ME_FLAG_PURE, b);
+        CHECK_NULL(ret, me_free(b));
+
+        ret->function = negate;
+    }
+
+    return ret;
+}
+
+#ifdef ME_POW_FROM_RIGHT
+static me_expr *factor(state *s) {
+    /* <factor>    =    <power> {"**" <factor>}  (right associative) */
+    me_expr *ret = power(s);
+    CHECK_NULL(ret);
+
+    if (s->type == TOK_POW) {
+        me_fun2 t = s->function;
+        next_token(s);
+        me_expr *f = factor(s); /* Right associative: recurse */
+        CHECK_NULL(f, me_free(ret));
+
+        me_expr *prev = ret;
+        ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, f);
+        CHECK_NULL(ret, me_free(f), me_free(prev));
+
+        ret->function = t;
+        apply_type_promotion(ret);
+    }
+
+    return ret;
+}
+#else
+static me_expr *factor(state *s) {
+    /* <factor>    =    <power> {"**" <power>}  (left associative) */
+    me_expr *ret = power(s);
+    CHECK_NULL(ret);
+
+    while (s->type == TOK_POW) {
+        me_fun2 t = s->function;
+        next_token(s);
+        me_expr *f = power(s);
+        CHECK_NULL(f, me_free(ret));
+
+        me_expr *prev = ret;
+        ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, f);
+        CHECK_NULL(ret, me_free(f), me_free(prev));
+
+        ret->function = t;
+        apply_type_promotion(ret);
+    }
+
+    return ret;
+}
+#endif
+
+
+static me_expr *term(state *s) {
+    /* <term>      =    <factor> {("*" | "/" | "%") <factor>} */
+    me_expr *ret = factor(s);
+    CHECK_NULL(ret);
+
+    while (s->type == TOK_INFIX && (s->function == mul || s->function == divide || s->function == fmod)) {
+        me_fun2 t = s->function;
+        next_token(s);
+        me_expr *f = factor(s);
+        CHECK_NULL(f, me_free(ret));
+
+        me_expr *prev = ret;
+        ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, f);
+        CHECK_NULL(ret, me_free(f), me_free(prev));
+
+        ret->function = t;
+        apply_type_promotion(ret);
+    }
+
+    return ret;
+}
+
+
+static me_expr *expr(state *s) {
+    /* <expr>      =    <term> {("+" | "-") <term>} */
+    me_expr *ret = term(s);
+    CHECK_NULL(ret);
+
+    while (s->type == TOK_INFIX && (s->function == add || s->function == sub)) {
+        me_fun2 t = s->function;
+        next_token(s);
+        me_expr *te = term(s);
+        CHECK_NULL(te, me_free(ret));
+
+        me_expr *prev = ret;
+        ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, te);
+        CHECK_NULL(ret, me_free(te), me_free(prev));
+
+        ret->function = t;
+        apply_type_promotion(ret); // Apply type promotion
+    }
+
+    return ret;
+}
+
+
+static me_expr *shift_expr(state *s) {
+    /* <shift_expr> =    <expr> {("<<" | ">>") <expr>} */
+    me_expr *ret = expr(s);
+    CHECK_NULL(ret);
+
+    while (s->type == TOK_SHIFT) {
+        me_fun2 t = s->function;
+        next_token(s);
+        me_expr *e = expr(s);
+        CHECK_NULL(e, me_free(ret));
+
+        me_expr *prev = ret;
+        ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
+        CHECK_NULL(ret, me_free(e), me_free(prev));
+
+        ret->function = t;
+        apply_type_promotion(ret);
+    }
+
+    return ret;
+}
+
+
+static me_expr *bitwise_and(state *s) {
+    /* <bitwise_and> =    <shift_expr> {"&" <shift_expr>} */
+    me_expr *ret = shift_expr(s);
+    CHECK_NULL(ret);
+
+    while (s->type == TOK_BITWISE && s->function == bit_and) {
+        next_token(s);
+        me_expr *e = shift_expr(s);
+        CHECK_NULL(e, me_free(ret));
+
+        me_expr *prev = ret;
+        ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
+        CHECK_NULL(ret, me_free(e), me_free(prev));
+
+        ret->function = bit_and;
+        apply_type_promotion(ret);
+    }
+
+    return ret;
+}
+
+
+static me_expr *bitwise_xor(state *s) {
+    /* <bitwise_xor> =    <bitwise_and> {"^" <bitwise_and>} */
+    /* Note: ^ is XOR for integers/bools. Use ** for power */
+    me_expr *ret = bitwise_and(s);
+    CHECK_NULL(ret);
+
+    while (s->type == TOK_BITWISE && s->function == bit_xor) {
+        next_token(s);
+        me_expr *e = bitwise_and(s);
+        CHECK_NULL(e, me_free(ret));
+
+        me_expr *prev = ret;
+        ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
+        CHECK_NULL(ret, me_free(e), me_free(prev));
+
+        ret->function = bit_xor;
+        apply_type_promotion(ret);
+    }
+
+    return ret;
+}
+
+
+static me_expr *bitwise_or(state *s) {
+    /* <bitwise_or> =    <bitwise_xor> {"|" <bitwise_xor>} */
+    me_expr *ret = bitwise_xor(s);
+    CHECK_NULL(ret);
+
+    while (s->type == TOK_BITWISE && (s->function == bit_or)) {
+        me_fun2 t = s->function;
+        next_token(s);
+        me_expr *e = bitwise_xor(s);
+        CHECK_NULL(e, me_free(ret));
+
+        me_expr *prev = ret;
+        ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
+        CHECK_NULL(ret, me_free(e), me_free(prev));
+
+        ret->function = t;
+        apply_type_promotion(ret);
+    }
+
+    return ret;
+}
+
+
+static me_expr *comparison(state *s) {
+    /* <comparison> =    <bitwise_or> {("<" | ">" | "<=" | ">=" | "==" | "!=") <bitwise_or>} */
+    me_expr *ret = bitwise_or(s);
+    CHECK_NULL(ret);
+
+    while (s->type == TOK_COMPARE) {
+        me_fun2 t = s->function;
+        next_token(s);
+        me_expr *e = bitwise_or(s);
+        CHECK_NULL(e, me_free(ret));
+
+        me_expr *prev = ret;
+        ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
+        CHECK_NULL(ret, me_free(e), me_free(prev));
+
+        ret->function = t;
+        apply_type_promotion(ret);
+        /* Comparisons always return bool */
+        ret->dtype = ME_BOOL;
+    }
+
+    return ret;
+}
+
+
+static me_expr *list(state *s) {
+    /* <list>      =    <comparison> {"," <comparison>} */
+    me_expr *ret = comparison(s);
+    CHECK_NULL(ret);
+
+    while (s->type == TOK_SEP) {
+        next_token(s);
+        me_expr *e = comparison(s);
+        CHECK_NULL(e, me_free(ret));
+
+        me_expr *prev = ret;
+        ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
+        CHECK_NULL(ret, me_free(e), me_free(prev));
+
+        ret->function = comma;
+        apply_type_promotion(ret);
+    }
+
+    return ret;
+}
+
+
+#define ME_FUN(...) ((double(*)(__VA_ARGS__))n->function)
+#define M(e) me_eval_scalar(n->parameters[e])
+
+static double me_eval_scalar(const me_expr *n) {
+    if (!n) return NAN;
+
+    switch (TYPE_MASK(n->type)) {
+        case ME_CONSTANT: return n->value;
+        case ME_VARIABLE: return *n->bound;
+
+        case ME_FUNCTION0:
+        case ME_FUNCTION1:
+        case ME_FUNCTION2:
+        case ME_FUNCTION3:
+        case ME_FUNCTION4:
+        case ME_FUNCTION5:
+        case ME_FUNCTION6:
+        case ME_FUNCTION7:
+            switch (ARITY(n->type)) {
+                case 0: return ME_FUN(void)();
+                case 1: return ME_FUN(double)(M(0));
+                case 2: return ME_FUN(double, double)(M(0), M(1));
+                case 3: return ME_FUN(double, double, double)(M(0), M(1), M(2));
+                case 4: return ME_FUN(double, double, double, double)(M(0), M(1), M(2), M(3));
+                case 5: return ME_FUN(double, double, double, double, double)(M(0), M(1), M(2), M(3), M(4));
+                case 6: return ME_FUN(double, double, double, double, double, double)(
+                        M(0), M(1), M(2), M(3), M(4), M(5));
+                case 7: return ME_FUN(double, double, double, double, double, double, double)(
+                        M(0), M(1), M(2), M(3), M(4), M(5), M(6));
+                default: return NAN;
+            }
+
+        case ME_CLOSURE0:
+        case ME_CLOSURE1:
+        case ME_CLOSURE2:
+        case ME_CLOSURE3:
+        case ME_CLOSURE4:
+        case ME_CLOSURE5:
+        case ME_CLOSURE6:
+        case ME_CLOSURE7:
+            switch (ARITY(n->type)) {
+                case 0: return ME_FUN(void*)(n->parameters[0]);
+                case 1: return ME_FUN(void*, double)(n->parameters[1], M(0));
+                case 2: return ME_FUN(void*, double, double)(n->parameters[2], M(0), M(1));
+                case 3: return ME_FUN(void*, double, double, double)(n->parameters[3], M(0), M(1), M(2));
+                case 4: return ME_FUN(void*, double, double, double, double)(n->parameters[4], M(0), M(1), M(2), M(3));
+                case 5: return ME_FUN(void*, double, double, double, double, double)(
+                        n->parameters[5], M(0), M(1), M(2), M(3), M(4));
+                case 6: return ME_FUN(void*, double, double, double, double, double, double)(
+                        n->parameters[6], M(0), M(1), M(2), M(3), M(4), M(5));
+                case 7: return ME_FUN(void*, double, double, double, double, double, double, double)(
+                        n->parameters[7], M(0), M(1), M(2), M(3), M(4), M(5), M(6));
+                default: return NAN;
+            }
+
+        default: return NAN;
+    }
+}
+
+#undef ME_FUN
+#undef M
+
+/* Specialized vector operations for better performance */
+static void vec_add(const double *a, const double *b, double *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = a[i] + b[i];
+    }
+}
+
+static void vec_sub(const double *a, const double *b, double *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = a[i] - b[i];
+    }
+}
+
+static void vec_mul(const double *a, const double *b, double *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = a[i] * b[i];
+    }
+}
+
+static void vec_div(const double *a, const double *b, double *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = a[i] / b[i];
+    }
+}
+
+static void vec_add_scalar(const double *a, double b, double *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = a[i] + b;
+    }
+}
+
+static void vec_mul_scalar(const double *a, double b, double *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = a[i] * b;
+    }
+}
+
+static void vec_pow(const double *a, const double *b, double *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = pow(a[i], b[i]);
+    }
+}
+
+static void vec_pow_scalar(const double *a, double b, double *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = pow(a[i], b);
+    }
+}
+
+static void vec_sqrt(const double *a, double *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = sqrt(a[i]);
+    }
+}
+
+static void vec_sin(const double *a, double *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = sin(a[i]);
+    }
+}
+
+static void vec_cos(const double *a, double *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = cos(a[i]);
+    }
+}
+
+static void vec_negate(const double *a, double *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = -a[i];
+    }
+}
+
+/* ============================================================================
+ * FLOAT32 VECTOR OPERATIONS
+ * ============================================================================ */
+
+static void vec_add_f32(const float *a, const float *b, float *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = a[i] + b[i];
+    }
+}
+
+static void vec_sub_f32(const float *a, const float *b, float *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = a[i] - b[i];
+    }
+}
+
+static void vec_mul_f32(const float *a, const float *b, float *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = a[i] * b[i];
+    }
+}
+
+static void vec_div_f32(const float *a, const float *b, float *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = a[i] / b[i];
+    }
+}
+
+static void vec_add_scalar_f32(const float *a, float b, float *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = a[i] + b;
+    }
+}
+
+static void vec_mul_scalar_f32(const float *a, float b, float *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = a[i] * b;
+    }
+}
+
+static void vec_pow_f32(const float *a, const float *b, float *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = powf(a[i], b[i]);
+    }
+}
+
+static void vec_pow_scalar_f32(const float *a, float b, float *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = powf(a[i], b);
+    }
+}
+
+static void vec_sqrt_f32(const float *a, float *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = sqrtf(a[i]);
+    }
+}
+
+static void vec_sin_f32(const float *a, float *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = sinf(a[i]);
+    }
+}
+
+static void vec_cos_f32(const float *a, float *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = cosf(a[i]);
+    }
+}
+
+static void vec_negame_f32(const float *a, float *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) {
+        out[i] = -a[i];
+    }
+}
+
+/* ============================================================================
+ * INTEGER VECTOR OPERATIONS (int8_t through uint64_t)
+ * ============================================================================ */
+
+/* Macros to generate integer vector operations */
+#define DEFINE_INT_VEC_OPS(SUFFIX, TYPE) \
+static void vec_add_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = a[i] + b[i]; \
+} \
+static void vec_sub_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = a[i] - b[i]; \
+} \
+static void vec_mul_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = a[i] * b[i]; \
+} \
+static void vec_div_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = (b[i] != 0) ? (a[i] / b[i]) : 0; \
+} \
+static void vec_add_scalar_##SUFFIX(const TYPE *a, TYPE b, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = a[i] + b; \
+} \
+static void vec_mul_scalar_##SUFFIX(const TYPE *a, TYPE b, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = a[i] * b; \
+} \
+static void vec_pow_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = (TYPE)pow((double)a[i], (double)b[i]); \
+} \
+static void vec_pow_scalar_##SUFFIX(const TYPE *a, TYPE b, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = (TYPE)pow((double)a[i], (double)b); \
+} \
+static void vec_sqrt_##SUFFIX(const TYPE *a, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = (TYPE)sqrt((double)a[i]); \
+} \
+static void vec_negame_##SUFFIX(const TYPE *a, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = -a[i]; \
+} \
+static void vec_and_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = a[i] & b[i]; \
+} \
+static void vec_or_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = a[i] | b[i]; \
+} \
+static void vec_xor_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = a[i] ^ b[i]; \
+} \
+static void vec_not_##SUFFIX(const TYPE *a, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = ~a[i]; \
+} \
+static void vec_shl_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = a[i] << b[i]; \
+} \
+static void vec_shr_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = a[i] >> b[i]; \
+}
+
+/* Generate ops for all integer types */
+DEFINE_INT_VEC_OPS(i8, int8_t)
+DEFINE_INT_VEC_OPS(i16, int16_t)
+DEFINE_INT_VEC_OPS(i32, int32_t)
+DEFINE_INT_VEC_OPS(i64, int64_t)
+DEFINE_INT_VEC_OPS(u8, uint8_t)
+DEFINE_INT_VEC_OPS(u16, uint16_t)
+DEFINE_INT_VEC_OPS(u32, uint32_t)
+DEFINE_INT_VEC_OPS(u64, uint64_t)
+
+/* Boolean logical operations */
+static void vec_and_bool(const bool *a, const bool *b, bool *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = a[i] && b[i];
+}
+
+static void vec_or_bool(const bool *a, const bool *b, bool *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = a[i] || b[i];
+}
+
+static void vec_xor_bool(const bool *a, const bool *b, bool *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = a[i] != b[i];
+}
+
+static void vec_not_bool(const bool *a, bool *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = !a[i];
+}
+
+/* Comparison operations - generate for all numeric types */
+/* Note: These return bool arrays, but we'll store them as the same type for simplicity */
+#define DEFINE_COMPARE_OPS(SUFFIX, TYPE) \
+static void vec_cmp_eq_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = (a[i] == b[i]) ? 1 : 0; \
+} \
+static void vec_cmp_ne_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = (a[i] != b[i]) ? 1 : 0; \
+} \
+static void vec_cmp_lt_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = (a[i] < b[i]) ? 1 : 0; \
+} \
+static void vec_cmp_le_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = (a[i] <= b[i]) ? 1 : 0; \
+} \
+static void vec_cmp_gt_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = (a[i] > b[i]) ? 1 : 0; \
+} \
+static void vec_cmp_ge_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = (a[i] >= b[i]) ? 1 : 0; \
+}
+
+/* Generate comparison ops for all types */
+DEFINE_COMPARE_OPS(i8, int8_t)
+DEFINE_COMPARE_OPS(i16, int16_t)
+DEFINE_COMPARE_OPS(i32, int32_t)
+DEFINE_COMPARE_OPS(i64, int64_t)
+DEFINE_COMPARE_OPS(u8, uint8_t)
+DEFINE_COMPARE_OPS(u16, uint16_t)
+DEFINE_COMPARE_OPS(u32, uint32_t)
+DEFINE_COMPARE_OPS(u64, uint64_t)
+DEFINE_COMPARE_OPS(f32, float)
+DEFINE_COMPARE_OPS(f64, double)
+
+/* Complex operations */
+static void vec_add_c64(const float complex *a, const float complex *b, float complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = a[i] + b[i];
+}
+
+static void vec_sub_c64(const float complex *a, const float complex *b, float complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = a[i] - b[i];
+}
+
+static void vec_mul_c64(const float complex *a, const float complex *b, float complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = a[i] * b[i];
+}
+
+static void vec_div_c64(const float complex *a, const float complex *b, float complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = a[i] / b[i];
+}
+
+static void vec_add_scalar_c64(const float complex *a, float complex b, float complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = a[i] + b;
+}
+
+static void vec_mul_scalar_c64(const float complex *a, float complex b, float complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = a[i] * b;
+}
+
+static void vec_pow_c64(const float complex *a, const float complex *b, float complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = cpowf(a[i], b[i]);
+}
+
+static void vec_pow_scalar_c64(const float complex *a, float complex b, float complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = cpowf(a[i], b);
+}
+
+static void vec_sqrt_c64(const float complex *a, float complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = csqrtf(a[i]);
+}
+
+static void vec_negame_c64(const float complex *a, float complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = -a[i];
+}
+
+static void vec_add_c128(const double complex *a, const double complex *b, double complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = a[i] + b[i];
+}
+
+static void vec_sub_c128(const double complex *a, const double complex *b, double complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = a[i] - b[i];
+}
+
+static void vec_mul_c128(const double complex *a, const double complex *b, double complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = a[i] * b[i];
+}
+
+static void vec_div_c128(const double complex *a, const double complex *b, double complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = a[i] / b[i];
+}
+
+static void vec_add_scalar_c128(const double complex *a, double complex b, double complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = a[i] + b;
+}
+
+static void vec_mul_scalar_c128(const double complex *a, double complex b, double complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = a[i] * b;
+}
+
+static void vec_pow_c128(const double complex *a, const double complex *b, double complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = cpow(a[i], b[i]);
+}
+
+static void vec_pow_scalar_c128(const double complex *a, double complex b, double complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = cpow(a[i], b);
+}
+
+static void vec_sqrt_c128(const double complex *a, double complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = csqrt(a[i]);
+}
+
+static void vec_negame_c128(const double complex *a, double complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = -a[i];
+}
+
+/* ============================================================================
+ * TYPE CONVERSION FUNCTIONS
+ * ============================================================================
+ * These functions convert between different data types for mixed-type expressions.
+ */
+
+#define DEFINE_VEC_CONVERT(FROM_SUFFIX, TO_SUFFIX, FROM_TYPE, TO_TYPE) \
+static void vec_convert_##FROM_SUFFIX##_to_##TO_SUFFIX(const FROM_TYPE *in, TO_TYPE *out, int n) { \
+    int i; \
+    _Pragma("GCC ivdep") \
+    for (i = 0; i < n; i++) out[i] = (TO_TYPE)in[i]; \
+}
+
+/* Generate all conversion functions */
+DEFINE_VEC_CONVERT(bool, i8, bool, int8_t)
+DEFINE_VEC_CONVERT(bool, i16, bool, int16_t)
+DEFINE_VEC_CONVERT(bool, i32, bool, int32_t)
+DEFINE_VEC_CONVERT(bool, i64, bool, int64_t)
+DEFINE_VEC_CONVERT(bool, u8, bool, uint8_t)
+DEFINE_VEC_CONVERT(bool, u16, bool, uint16_t)
+DEFINE_VEC_CONVERT(bool, u32, bool, uint32_t)
+DEFINE_VEC_CONVERT(bool, u64, bool, uint64_t)
+DEFINE_VEC_CONVERT(bool, f32, bool, float)
+DEFINE_VEC_CONVERT(bool, f64, bool, double)
+
+DEFINE_VEC_CONVERT(i8, i16, int8_t, int16_t)
+DEFINE_VEC_CONVERT(i8, i32, int8_t, int32_t)
+DEFINE_VEC_CONVERT(i8, i64, int8_t, int64_t)
+DEFINE_VEC_CONVERT(i8, f32, int8_t, float)
+DEFINE_VEC_CONVERT(i8, f64, int8_t, double)
+
+DEFINE_VEC_CONVERT(i16, i32, int16_t, int32_t)
+DEFINE_VEC_CONVERT(i16, i64, int16_t, int64_t)
+DEFINE_VEC_CONVERT(i16, f32, int16_t, float)
+DEFINE_VEC_CONVERT(i16, f64, int16_t, double)
+
+DEFINE_VEC_CONVERT(i32, i64, int32_t, int64_t)
+DEFINE_VEC_CONVERT(i32, f32, int32_t, float)
+DEFINE_VEC_CONVERT(i32, f64, int32_t, double)
+
+DEFINE_VEC_CONVERT(i64, f64, int64_t, double)
+
+DEFINE_VEC_CONVERT(u8, u16, uint8_t, uint16_t)
+DEFINE_VEC_CONVERT(u8, u32, uint8_t, uint32_t)
+DEFINE_VEC_CONVERT(u8, u64, uint8_t, uint64_t)
+DEFINE_VEC_CONVERT(u8, i16, uint8_t, int16_t)
+DEFINE_VEC_CONVERT(u8, i32, uint8_t, int32_t)
+DEFINE_VEC_CONVERT(u8, i64, uint8_t, int64_t)
+DEFINE_VEC_CONVERT(u8, f32, uint8_t, float)
+DEFINE_VEC_CONVERT(u8, f64, uint8_t, double)
+
+DEFINE_VEC_CONVERT(u16, u32, uint16_t, uint32_t)
+DEFINE_VEC_CONVERT(u16, u64, uint16_t, uint64_t)
+DEFINE_VEC_CONVERT(u16, i32, uint16_t, int32_t)
+DEFINE_VEC_CONVERT(u16, i64, uint16_t, int64_t)
+DEFINE_VEC_CONVERT(u16, f32, uint16_t, float)
+DEFINE_VEC_CONVERT(u16, f64, uint16_t, double)
+
+DEFINE_VEC_CONVERT(u32, u64, uint32_t, uint64_t)
+DEFINE_VEC_CONVERT(u32, i64, uint32_t, int64_t)
+DEFINE_VEC_CONVERT(u32, f64, uint32_t, double)
+
+DEFINE_VEC_CONVERT(u64, f64, uint64_t, double)
+
+DEFINE_VEC_CONVERT(f32, f64, float, double)
+DEFINE_VEC_CONVERT(f32, c64, float, float complex)
+DEFINE_VEC_CONVERT(f32, c128, float, double complex)
+
+DEFINE_VEC_CONVERT(f64, c128, double, double complex)
+
+DEFINE_VEC_CONVERT(c64, c128, float complex, double complex)
+
+/* Function to get conversion function pointer */
+typedef void (*convert_func_t)(const void *, void *, int);
+
+static convert_func_t get_convert_func(me_dtype from, me_dtype to) {
+    /* Return conversion function for a specific type pair */
+    if (from == to) return NULL; // No conversion needed
+
+#define CONV_CASE(FROM, TO, FROM_S, TO_S) \
+        if (from == FROM && to == TO) return (convert_func_t)vec_convert_##FROM_S##_to_##TO_S;
+
+    CONV_CASE(ME_BOOL, ME_INT8, bool, i8)
+    CONV_CASE(ME_BOOL, ME_INT16, bool, i16)
+    CONV_CASE(ME_BOOL, ME_INT32, bool, i32)
+    CONV_CASE(ME_BOOL, ME_INT64, bool, i64)
+    CONV_CASE(ME_BOOL, ME_UINT8, bool, u8)
+    CONV_CASE(ME_BOOL, ME_UINT16, bool, u16)
+    CONV_CASE(ME_BOOL, ME_UINT32, bool, u32)
+    CONV_CASE(ME_BOOL, ME_UINT64, bool, u64)
+    CONV_CASE(ME_BOOL, ME_FLOAT32, bool, f32)
+    CONV_CASE(ME_BOOL, ME_FLOAT64, bool, f64)
+
+    CONV_CASE(ME_INT8, ME_INT16, i8, i16)
+    CONV_CASE(ME_INT8, ME_INT32, i8, i32)
+    CONV_CASE(ME_INT8, ME_INT64, i8, i64)
+    CONV_CASE(ME_INT8, ME_FLOAT32, i8, f32)
+    CONV_CASE(ME_INT8, ME_FLOAT64, i8, f64)
+
+    CONV_CASE(ME_INT16, ME_INT32, i16, i32)
+    CONV_CASE(ME_INT16, ME_INT64, i16, i64)
+    CONV_CASE(ME_INT16, ME_FLOAT32, i16, f32)
+    CONV_CASE(ME_INT16, ME_FLOAT64, i16, f64)
+
+    CONV_CASE(ME_INT32, ME_INT64, i32, i64)
+    CONV_CASE(ME_INT32, ME_FLOAT32, i32, f32)
+    CONV_CASE(ME_INT32, ME_FLOAT64, i32, f64)
+
+    CONV_CASE(ME_INT64, ME_FLOAT64, i64, f64)
+
+    CONV_CASE(ME_UINT8, ME_UINT16, u8, u16)
+    CONV_CASE(ME_UINT8, ME_UINT32, u8, u32)
+    CONV_CASE(ME_UINT8, ME_UINT64, u8, u64)
+    CONV_CASE(ME_UINT8, ME_INT16, u8, i16)
+    CONV_CASE(ME_UINT8, ME_INT32, u8, i32)
+    CONV_CASE(ME_UINT8, ME_INT64, u8, i64)
+    CONV_CASE(ME_UINT8, ME_FLOAT32, u8, f32)
+    CONV_CASE(ME_UINT8, ME_FLOAT64, u8, f64)
+
+    CONV_CASE(ME_UINT16, ME_UINT32, u16, u32)
+    CONV_CASE(ME_UINT16, ME_UINT64, u16, u64)
+    CONV_CASE(ME_UINT16, ME_INT32, u16, i32)
+    CONV_CASE(ME_UINT16, ME_INT64, u16, i64)
+    CONV_CASE(ME_UINT16, ME_FLOAT32, u16, f32)
+    CONV_CASE(ME_UINT16, ME_FLOAT64, u16, f64)
+
+    CONV_CASE(ME_UINT32, ME_UINT64, u32, u64)
+    CONV_CASE(ME_UINT32, ME_INT64, u32, i64)
+    CONV_CASE(ME_UINT32, ME_FLOAT64, u32, f64)
+
+    CONV_CASE(ME_UINT64, ME_FLOAT64, u64, f64)
+
+    CONV_CASE(ME_FLOAT32, ME_FLOAT64, f32, f64)
+    CONV_CASE(ME_FLOAT32, ME_COMPLEX64, f32, c64)
+    CONV_CASE(ME_FLOAT32, ME_COMPLEX128, f32, c128)
+
+    CONV_CASE(ME_FLOAT64, ME_COMPLEX128, f64, c128)
+
+    CONV_CASE(ME_COMPLEX64, ME_COMPLEX128, c64, c128)
+
+#undef CONV_CASE
+
+    return NULL; // Unsupported conversion
+}
+
+
+typedef double (*me_fun1)(double);
+
+typedef float (*me_fun1_f32)(float);
+
+/* Template for type-specific evaluator */
+#define DEFINE_ME_EVAL(SUFFIX, TYPE, VEC_ADD, VEC_SUB, VEC_MUL, VEC_DIV, VEC_POW, \
+    VEC_ADD_SCALAR, VEC_MUL_SCALAR, VEC_POW_SCALAR, \
+    VEC_SQRT, VEC_SIN, VEC_COS, VEC_NEGATE, \
+    SQRT_FUNC, SIN_FUNC, COS_FUNC, EXP_FUNC, LOG_FUNC, FABS_FUNC, POW_FUNC) \
+static void me_eval_##SUFFIX(const me_expr *n) { \
+    if (!n || !n->output || n->nitems <= 0) return; \
+    \
+    int i, j; \
+    const int arity = ARITY(n->type); \
+    TYPE *output = (TYPE*)n->output; \
+    \
+    switch(TYPE_MASK(n->type)) { \
+        case ME_CONSTANT: \
+            { \
+                TYPE val = (TYPE)n->value; \
+                for (i = 0; i < n->nitems; i++) { \
+                    output[i] = val; \
+                } \
+            } \
+            break; \
+            \
+        case ME_VARIABLE: \
+            { \
+                const TYPE *src = (const TYPE*)n->bound; \
+                for (i = 0; i < n->nitems; i++) { \
+                    output[i] = src[i]; \
+                } \
+            } \
+            break; \
+        \
+        case ME_FUNCTION0: case ME_FUNCTION1: case ME_FUNCTION2: case ME_FUNCTION3: \
+        case ME_FUNCTION4: case ME_FUNCTION5: case ME_FUNCTION6: case ME_FUNCTION7: \
+        case ME_CLOSURE0: case ME_CLOSURE1: case ME_CLOSURE2: case ME_CLOSURE3: \
+        case ME_CLOSURE4: case ME_CLOSURE5: case ME_CLOSURE6: case ME_CLOSURE7: \
+            for (j = 0; j < arity; j++) { \
+                me_expr *param = (me_expr*)n->parameters[j]; \
+                if (param->type != ME_CONSTANT && param->type != ME_VARIABLE) { \
+                    if (!param->output) { \
+                        param->output = malloc(n->nitems * sizeof(TYPE)); \
+                        param->nitems = n->nitems; \
+                        param->dtype = n->dtype; \
+                    } \
+                    me_eval_##SUFFIX(param); \
+                } \
+            } \
+            \
+            if (arity == 2 && IS_FUNCTION(n->type)) { \
+                me_expr *left = (me_expr*)n->parameters[0]; \
+                me_expr *right = (me_expr*)n->parameters[1]; \
+                \
+                const TYPE *ldata = (left->type == ME_CONSTANT) ? NULL : \
+                                   (left->type == ME_VARIABLE) ? (const TYPE*)left->bound : (const TYPE*)left->output; \
+                const TYPE *rdata = (right->type == ME_CONSTANT) ? NULL : \
+                                    (right->type == ME_VARIABLE) ? (const TYPE*)right->bound : (const TYPE*)right->output; \
+                \
+                me_fun2 func = (me_fun2)n->function; \
+                \
+                if (func == add) { \
+                    if (ldata && rdata) { \
+                        VEC_ADD(ldata, rdata, output, n->nitems); \
+                    } else if (ldata && right->type == ME_CONSTANT) { \
+                        VEC_ADD_SCALAR(ldata, (TYPE)right->value, output, n->nitems); \
+                    } else if (left->type == ME_CONSTANT && rdata) { \
+                        VEC_ADD_SCALAR(rdata, (TYPE)left->value, output, n->nitems); \
+                    } else { \
+                        goto general_case_binary_##SUFFIX; \
+                    } \
+                } else if (func == sub) { \
+                    if (ldata && rdata) { \
+                        VEC_SUB(ldata, rdata, output, n->nitems); \
+                    } else { \
+                        goto general_case_binary_##SUFFIX; \
+                    } \
+                } else if (func == mul) { \
+                    if (ldata && rdata) { \
+                        VEC_MUL(ldata, rdata, output, n->nitems); \
+                    } else if (ldata && right->type == ME_CONSTANT) { \
+                        VEC_MUL_SCALAR(ldata, (TYPE)right->value, output, n->nitems); \
+                    } else if (left->type == ME_CONSTANT && rdata) { \
+                        VEC_MUL_SCALAR(rdata, (TYPE)left->value, output, n->nitems); \
+                    } else { \
+                        goto general_case_binary_##SUFFIX; \
+                    } \
+                } else if (func == divide) { \
+                    if (ldata && rdata) { \
+                        VEC_DIV(ldata, rdata, output, n->nitems); \
+                    } else { \
+                        goto general_case_binary_##SUFFIX; \
+                    } \
+                } else if (func == (me_fun2)pow) { \
+                    if (ldata && rdata) { \
+                        VEC_POW(ldata, rdata, output, n->nitems); \
+                    } else if (ldata && right->type == ME_CONSTANT) { \
+                        VEC_POW_SCALAR(ldata, (TYPE)right->value, output, n->nitems); \
+                    } else { \
+                        goto general_case_binary_##SUFFIX; \
+                    } \
+                } else { \
+                    general_case_binary_##SUFFIX: \
+                    for (i = 0; i < n->nitems; i++) { \
+                        double a = (left->type == ME_CONSTANT) ? left->value : \
+                                  (left->type == ME_VARIABLE) ? (double)ldata[i] : (double)ldata[i]; \
+                        double b = (right->type == ME_CONSTANT) ? right->value : \
+                                  (right->type == ME_VARIABLE) ? (double)rdata[i] : (double)rdata[i]; \
+                        output[i] = (TYPE)func(a, b); \
+                    } \
+                } \
+            } \
+            else if (arity == 1 && IS_FUNCTION(n->type)) { \
+                me_expr *arg = (me_expr*)n->parameters[0]; \
+                \
+                const TYPE *adata = (arg->type == ME_CONSTANT) ? NULL : \
+                                   (arg->type == ME_VARIABLE) ? (const TYPE*)arg->bound : (const TYPE*)arg->output; \
+                \
+                const void *func_ptr = n->function; \
+                \
+                if (func_ptr == (void*)sqrt) { \
+                    if (adata) VEC_SQRT(adata, output, n->nitems); \
+                } else if (func_ptr == (void*)sin) { \
+                    if (adata) VEC_SIN(adata, output, n->nitems); \
+                } else if (func_ptr == (void*)cos) { \
+                    if (adata) VEC_COS(adata, output, n->nitems); \
+                } else if (func_ptr == (void*)negate) { \
+                    if (adata) VEC_NEGATE(adata, output, n->nitems); \
+                } else { \
+                    me_fun1 func = (me_fun1)func_ptr; \
+                    if (arg->type == ME_CONSTANT) { \
+                        TYPE val = (TYPE)func(arg->value); \
+                        for (i = 0; i < n->nitems; i++) { \
+                            output[i] = val; \
+                        } \
+                    } else { \
+                        for (i = 0; i < n->nitems; i++) { \
+                            output[i] = (TYPE)func((double)adata[i]); \
+                        } \
+                    } \
+                } \
+            } \
+            else { \
+                for (i = 0; i < n->nitems; i++) { \
+                    double args[7]; \
+                    \
+                    for (j = 0; j < arity; j++) { \
+                        me_expr *param = (me_expr*)n->parameters[j]; \
+                        const TYPE *pdata = (const TYPE*)((param->type == ME_VARIABLE) ? param->bound : param->output); \
+                        if (param->type == ME_CONSTANT) { \
+                            args[j] = param->value; \
+                        } else { \
+                            args[j] = (double)pdata[i]; \
+                        } \
+                    } \
+                    \
+                    if (IS_FUNCTION(n->type)) { \
+                        switch(arity) { \
+                            case 0: output[i] = (TYPE)((double(*)(void))n->function)(); break; \
+                            case 3: output[i] = (TYPE)((double(*)(double,double,double))n->function)(args[0], args[1], args[2]); break; \
+                            case 4: output[i] = (TYPE)((double(*)(double,double,double,double))n->function)(args[0], args[1], args[2], args[3]); break; \
+                            case 5: output[i] = (TYPE)((double(*)(double,double,double,double,double))n->function)(args[0], args[1], args[2], args[3], args[4]); break; \
+                            case 6: output[i] = (TYPE)((double(*)(double,double,double,double,double,double))n->function)(args[0], args[1], args[2], args[3], args[4], args[5]); break; \
+                            case 7: output[i] = (TYPE)((double(*)(double,double,double,double,double,double,double))n->function)(args[0], args[1], args[2], args[3], args[4], args[5], args[6]); break; \
+                        } \
+                    } else if (IS_CLOSURE(n->type)) { \
+                        void *context = n->parameters[arity]; \
+                        switch(arity) { \
+                            case 0: output[i] = (TYPE)((double(*)(void*))n->function)(context); break; \
+                            case 1: output[i] = (TYPE)((double(*)(void*,double))n->function)(context, args[0]); break; \
+                            case 2: output[i] = (TYPE)((double(*)(void*,double,double))n->function)(context, args[0], args[1]); break; \
+                            case 3: output[i] = (TYPE)((double(*)(void*,double,double,double))n->function)(context, args[0], args[1], args[2]); break; \
+                            case 4: output[i] = (TYPE)((double(*)(void*,double,double,double,double))n->function)(context, args[0], args[1], args[2], args[3]); break; \
+                            case 5: output[i] = (TYPE)((double(*)(void*,double,double,double,double,double))n->function)(context, args[0], args[1], args[2], args[3], args[4]); break; \
+                            case 6: output[i] = (TYPE)((double(*)(void*,double,double,double,double,double,double))n->function)(context, args[0], args[1], args[2], args[3], args[4], args[5]); break; \
+                            case 7: output[i] = (TYPE)((double(*)(void*,double,double,double,double,double,double,double))n->function)(context, args[0], args[1], args[2], args[3], args[4], args[5], args[6]); break; \
+                        } \
+                    } \
+                } \
+            } \
+            break; \
+        \
+        default: \
+            for (i = 0; i < n->nitems; i++) { \
+                output[i] = (TYPE)NAN; \
+            } \
+            break; \
+    } \
+}
+
+/* Vector operation macros - expand to inline loops */
+#define vec_add(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
+#define vec_sub(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
+#define vec_mul(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
+#define vec_div(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
+#define vec_pow(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = pow((a)[_i], (b)[_i]); } while(0)
+#define vec_add_scalar(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
+#define vec_mul_scalar(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
+#define vec_pow_scalar(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = pow((a)[_i], (b)); } while(0)
+#define vec_sqrt(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = sqrt((a)[_i]); } while(0)
+#define vec_sin(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = sin((a)[_i]); } while(0)
+#define vec_cos(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cos((a)[_i]); } while(0)
+#define vec_negate(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
+
+#define vec_add_f32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
+#define vec_sub_f32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
+#define vec_mul_f32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
+#define vec_div_f32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
+#define vec_pow_f32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = powf((a)[_i], (b)[_i]); } while(0)
+#define vec_add_scalar_f32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
+#define vec_mul_scalar_f32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
+#define vec_pow_scalar_f32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = powf((a)[_i], (b)); } while(0)
+#define vec_sqrt_f32(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = sqrtf((a)[_i]); } while(0)
+#define vec_sin_f32(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = sinf((a)[_i]); } while(0)
+#define vec_cos_f32(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cosf((a)[_i]); } while(0)
+#define vec_negame_f32(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
+
+#define vec_add_i8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
+#define vec_sub_i8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
+#define vec_mul_i8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
+#define vec_div_i8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
+#define vec_pow_i8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int8_t)pow((a)[_i], (b)[_i]); } while(0)
+#define vec_add_scalar_i8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
+#define vec_mul_scalar_i8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
+#define vec_pow_scalar_i8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int8_t)pow((a)[_i], (b)); } while(0)
+#define vec_sqrt_i8(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int8_t)sqrt((a)[_i]); } while(0)
+#define vec_negame_i8(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
+
+#define vec_add_i16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
+#define vec_sub_i16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
+#define vec_mul_i16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
+#define vec_div_i16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
+#define vec_pow_i16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int16_t)pow((a)[_i], (b)[_i]); } while(0)
+#define vec_add_scalar_i16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
+#define vec_mul_scalar_i16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
+#define vec_pow_scalar_i16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int16_t)pow((a)[_i], (b)); } while(0)
+#define vec_sqrt_i16(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int16_t)sqrt((a)[_i]); } while(0)
+#define vec_negame_i16(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
+
+#define vec_add_i32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
+#define vec_sub_i32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
+#define vec_mul_i32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
+#define vec_div_i32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
+#define vec_pow_i32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int32_t)pow((a)[_i], (b)[_i]); } while(0)
+#define vec_add_scalar_i32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
+#define vec_mul_scalar_i32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
+#define vec_pow_scalar_i32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int32_t)pow((a)[_i], (b)); } while(0)
+#define vec_sqrt_i32(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int32_t)sqrt((a)[_i]); } while(0)
+#define vec_negame_i32(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
+
+#define vec_add_i64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
+#define vec_sub_i64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
+#define vec_mul_i64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
+#define vec_div_i64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
+#define vec_pow_i64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int64_t)pow((a)[_i], (b)[_i]); } while(0)
+#define vec_add_scalar_i64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
+#define vec_mul_scalar_i64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
+#define vec_pow_scalar_i64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int64_t)pow((a)[_i], (b)); } while(0)
+#define vec_sqrt_i64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int64_t)sqrt((a)[_i]); } while(0)
+#define vec_negame_i64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
+
+#define vec_add_u8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
+#define vec_sub_u8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
+#define vec_mul_u8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
+#define vec_div_u8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
+#define vec_pow_u8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint8_t)pow((a)[_i], (b)[_i]); } while(0)
+#define vec_add_scalar_u8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
+#define vec_mul_scalar_u8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
+#define vec_pow_scalar_u8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint8_t)pow((a)[_i], (b)); } while(0)
+#define vec_sqrt_u8(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint8_t)sqrt((a)[_i]); } while(0)
+#define vec_negame_u8(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
+
+#define vec_add_u16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
+#define vec_sub_u16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
+#define vec_mul_u16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
+#define vec_div_u16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
+#define vec_pow_u16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint16_t)pow((a)[_i], (b)[_i]); } while(0)
+#define vec_add_scalar_u16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
+#define vec_mul_scalar_u16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
+#define vec_pow_scalar_u16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint16_t)pow((a)[_i], (b)); } while(0)
+#define vec_sqrt_u16(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint16_t)sqrt((a)[_i]); } while(0)
+#define vec_negame_u16(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
+
+#define vec_add_u32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
+#define vec_sub_u32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
+#define vec_mul_u32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
+#define vec_div_u32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
+#define vec_pow_u32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint32_t)pow((a)[_i], (b)[_i]); } while(0)
+#define vec_add_scalar_u32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
+#define vec_mul_scalar_u32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
+#define vec_pow_scalar_u32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint32_t)pow((a)[_i], (b)); } while(0)
+#define vec_sqrt_u32(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint32_t)sqrt((a)[_i]); } while(0)
+#define vec_negame_u32(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
+
+#define vec_add_u64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
+#define vec_sub_u64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
+#define vec_mul_u64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
+#define vec_div_u64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
+#define vec_pow_u64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint64_t)pow((a)[_i], (b)[_i]); } while(0)
+#define vec_add_scalar_u64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
+#define vec_mul_scalar_u64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
+#define vec_pow_scalar_u64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint64_t)pow((a)[_i], (b)); } while(0)
+#define vec_sqrt_u64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint64_t)sqrt((a)[_i]); } while(0)
+#define vec_negame_u64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
+
+#define vec_add_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
+#define vec_sub_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
+#define vec_mul_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
+#define vec_div_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
+#define vec_pow_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cpowf((a)[_i], (b)[_i]); } while(0)
+#define vec_add_scalar_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
+#define vec_mul_scalar_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
+#define vec_pow_scalar_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cpowf((a)[_i], (b)); } while(0)
+#define vec_sqrt_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = csqrtf((a)[_i]); } while(0)
+#define vec_negame_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
+
+#define vec_add_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
+#define vec_sub_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
+#define vec_mul_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
+#define vec_div_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
+#define vec_pow_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cpow((a)[_i], (b)[_i]); } while(0)
+#define vec_add_scalar_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
+#define vec_mul_scalar_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
+#define vec_pow_scalar_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cpow((a)[_i], (b)); } while(0)
+#define vec_sqrt_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = csqrt((a)[_i]); } while(0)
+#define vec_negame_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
+
+/* Generate float32 evaluator */
+DEFINE_ME_EVAL(f32, float,
+               vec_add_f32, vec_sub_f32, vec_mul_f32, vec_div_f32, vec_pow_f32,
+               vec_add_scalar_f32, vec_mul_scalar_f32, vec_pow_scalar_f32,
+               vec_sqrt_f32, vec_sin_f32, vec_cos_f32, vec_negame_f32,
+               sqrtf, sinf, cosf, expf, logf, fabsf, powf)
+
+/* Generate float64 (double) evaluator */
+DEFINE_ME_EVAL(f64, double,
+               vec_add, vec_sub, vec_mul, vec_div, vec_pow,
+               vec_add_scalar, vec_mul_scalar, vec_pow_scalar,
+               vec_sqrt, vec_sin, vec_cos, vec_negate,
+               sqrt, sin, cos, exp, log, fabs, pow)
+
+/* Generate integer evaluators - sin/cos cast to double and back */
+DEFINE_ME_EVAL(i8, int8_t,
+               vec_add_i8, vec_sub_i8, vec_mul_i8, vec_div_i8, vec_pow_i8,
+               vec_add_scalar_i8, vec_mul_scalar_i8, vec_pow_scalar_i8,
+               vec_sqrt_i8, vec_sqrt_i8, vec_sqrt_i8, vec_negame_i8,
+               sqrt, sin, cos, exp, log, fabs, pow)
+
+DEFINE_ME_EVAL(i16, int16_t,
+               vec_add_i16, vec_sub_i16, vec_mul_i16, vec_div_i16, vec_pow_i16,
+               vec_add_scalar_i16, vec_mul_scalar_i16, vec_pow_scalar_i16,
+               vec_sqrt_i16, vec_sqrt_i16, vec_sqrt_i16, vec_negame_i16,
+               sqrt, sin, cos, exp, log, fabs, pow)
+
+DEFINE_ME_EVAL(i32, int32_t,
+               vec_add_i32, vec_sub_i32, vec_mul_i32, vec_div_i32, vec_pow_i32,
+               vec_add_scalar_i32, vec_mul_scalar_i32, vec_pow_scalar_i32,
+               vec_sqrt_i32, vec_sqrt_i32, vec_sqrt_i32, vec_negame_i32,
+               sqrt, sin, cos, exp, log, fabs, pow)
+
+DEFINE_ME_EVAL(i64, int64_t,
+               vec_add_i64, vec_sub_i64, vec_mul_i64, vec_div_i64, vec_pow_i64,
+               vec_add_scalar_i64, vec_mul_scalar_i64, vec_pow_scalar_i64,
+               vec_sqrt_i64, vec_sqrt_i64, vec_sqrt_i64, vec_negame_i64,
+               sqrt, sin, cos, exp, log, fabs, pow)
+
+DEFINE_ME_EVAL(u8, uint8_t,
+               vec_add_u8, vec_sub_u8, vec_mul_u8, vec_div_u8, vec_pow_u8,
+               vec_add_scalar_u8, vec_mul_scalar_u8, vec_pow_scalar_u8,
+               vec_sqrt_u8, vec_sqrt_u8, vec_sqrt_u8, vec_negame_u8,
+               sqrt, sin, cos, exp, log, fabs, pow)
+
+DEFINE_ME_EVAL(u16, uint16_t,
+               vec_add_u16, vec_sub_u16, vec_mul_u16, vec_div_u16, vec_pow_u16,
+               vec_add_scalar_u16, vec_mul_scalar_u16, vec_pow_scalar_u16,
+               vec_sqrt_u16, vec_sqrt_u16, vec_sqrt_u16, vec_negame_u16,
+               sqrt, sin, cos, exp, log, fabs, pow)
+
+DEFINE_ME_EVAL(u32, uint32_t,
+               vec_add_u32, vec_sub_u32, vec_mul_u32, vec_div_u32, vec_pow_u32,
+               vec_add_scalar_u32, vec_mul_scalar_u32, vec_pow_scalar_u32,
+               vec_sqrt_u32, vec_sqrt_u32, vec_sqrt_u32, vec_negame_u32,
+               sqrt, sin, cos, exp, log, fabs, pow)
+
+DEFINE_ME_EVAL(u64, uint64_t,
+               vec_add_u64, vec_sub_u64, vec_mul_u64, vec_div_u64, vec_pow_u64,
+               vec_add_scalar_u64, vec_mul_scalar_u64, vec_pow_scalar_u64,
+               vec_sqrt_u64, vec_sqrt_u64, vec_sqrt_u64, vec_negame_u64,
+               sqrt, sin, cos, exp, log, fabs, pow)
+
+/* Generate complex evaluators */
+DEFINE_ME_EVAL(c64, float complex,
+               vec_add_c64, vec_sub_c64, vec_mul_c64, vec_div_c64, vec_pow_c64,
+               vec_add_scalar_c64, vec_mul_scalar_c64, vec_pow_scalar_c64,
+               vec_sqrt_c64, vec_sqrt_c64, vec_sqrt_c64, vec_negame_c64,
+               csqrtf, csqrtf, csqrtf, cexpf, clogf, cabsf, cpowf)
+
+DEFINE_ME_EVAL(c128, double complex,
+               vec_add_c128, vec_sub_c128, vec_mul_c128, vec_div_c128, vec_pow_c128,
+               vec_add_scalar_c128, vec_mul_scalar_c128, vec_pow_scalar_c128,
+               vec_sqrt_c128, vec_sqrt_c128, vec_sqrt_c128, vec_negame_c128,
+               csqrt, csqrt, csqrt, cexp, clog, cabs, cpow)
+
+/* Public API - dispatches to correct type-specific evaluator */
+/* Structure to track promoted variables */
+typedef struct {
+    void *promoted_data; // Temporary buffer for promoted data
+    me_dtype original_type;
+    bool needs_free;
+} promoted_var_t;
+
+/* Helper to save original variable bindings */
+static void save_variable_bindings(const me_expr *node,
+                                   const void **original_bounds,
+                                   me_dtype *original_types,
+                                   int *save_idx) {
+    if (!node) return;
+    switch (TYPE_MASK(node->type)) {
+        case ME_VARIABLE:
+            original_bounds[*save_idx] = node->bound;
+            original_types[*save_idx] = node->dtype;
+            (*save_idx)++;
+            break;
+        case ME_FUNCTION0:
+        case ME_FUNCTION1:
+        case ME_FUNCTION2:
+        case ME_FUNCTION3:
+        case ME_FUNCTION4:
+        case ME_FUNCTION5:
+        case ME_FUNCTION6:
+        case ME_FUNCTION7:
+        case ME_CLOSURE0:
+        case ME_CLOSURE1:
+        case ME_CLOSURE2:
+        case ME_CLOSURE3:
+        case ME_CLOSURE4:
+        case ME_CLOSURE5:
+        case ME_CLOSURE6:
+        case ME_CLOSURE7: {
+            const int arity = ARITY(node->type);
+            for (int i = 0; i < arity; i++) {
+                save_variable_bindings((const me_expr *) node->parameters[i],
+                                       original_bounds, original_types, save_idx);
+            }
+            break;
+        }
+    }
+}
+
+/* Recursively promote variables in expression tree */
+static void promome_variables_in_tree(me_expr *n, me_dtype target_type,
+                                      promoted_var_t *promotions, int *promo_count,
+                                      int nitems) {
+    if (!n) return;
+
+    switch (TYPE_MASK(n->type)) {
+        case ME_CONSTANT:
+            // Constants are promoted on-the-fly during evaluation
+            break;
+
+        case ME_VARIABLE:
+            if (n->dtype != target_type) {
+                // Need to promote this variable
+                void *promoted = malloc(nitems * dtype_size(target_type));
+                if (promoted) {
+                    convert_func_t conv = get_convert_func(n->dtype, target_type);
+                    if (conv) {
+                        conv(n->bound, promoted, nitems);
+
+                        // Track this promotion for later cleanup
+                        promotions[*promo_count].promoted_data = promoted;
+                        promotions[*promo_count].original_type = n->dtype;
+                        promotions[*promo_count].needs_free = true;
+                        (*promo_count)++;
+
+                        // Temporarily replace bound pointer
+                        n->bound = promoted;
+                        n->dtype = target_type;
+                    } else {
+                        free(promoted);
+                    }
+                }
+            }
+            break;
+
+        case ME_FUNCTION0:
+        case ME_FUNCTION1:
+        case ME_FUNCTION2:
+        case ME_FUNCTION3:
+        case ME_FUNCTION4:
+        case ME_FUNCTION5:
+        case ME_FUNCTION6:
+        case ME_FUNCTION7:
+        case ME_CLOSURE0:
+        case ME_CLOSURE1:
+        case ME_CLOSURE2:
+        case ME_CLOSURE3:
+        case ME_CLOSURE4:
+        case ME_CLOSURE5:
+        case ME_CLOSURE6:
+        case ME_CLOSURE7: {
+            const int arity = ARITY(n->type);
+            for (int i = 0; i < arity; i++) {
+                promome_variables_in_tree((me_expr *) n->parameters[i], target_type,
+                                          promotions, promo_count, nitems);
+            }
+            break;
+        }
+    }
+}
+
+/* Restore original variable bindings after promotion */
+static void restore_variables_in_tree(me_expr *n, const void **original_bounds,
+                                      const me_dtype *original_types, int *restore_idx) {
+    if (!n) return;
+
+    switch (TYPE_MASK(n->type)) {
+        case ME_VARIABLE:
+            if (original_bounds[*restore_idx] != NULL) {
+                n->bound = original_bounds[*restore_idx];
+                n->dtype = original_types[*restore_idx];
+                (*restore_idx)++;
+            }
+            break;
+
+        case ME_FUNCTION0:
+        case ME_FUNCTION1:
+        case ME_FUNCTION2:
+        case ME_FUNCTION3:
+        case ME_FUNCTION4:
+        case ME_FUNCTION5:
+        case ME_FUNCTION6:
+        case ME_FUNCTION7:
+        case ME_CLOSURE0:
+        case ME_CLOSURE1:
+        case ME_CLOSURE2:
+        case ME_CLOSURE3:
+        case ME_CLOSURE4:
+        case ME_CLOSURE5:
+        case ME_CLOSURE6:
+        case ME_CLOSURE7: {
+            const int arity = ARITY(n->type);
+            for (int i = 0; i < arity; i++) {
+                restore_variables_in_tree((me_expr *) n->parameters[i],
+                                          original_bounds, original_types, restore_idx);
+            }
+            break;
+        }
+    }
+}
+
+/* Check if all variables in tree match target type */
+static bool all_variables_match_type(const me_expr *n, me_dtype target_type) {
+    if (!n) return true;
+
+    switch (TYPE_MASK(n->type)) {
+        case ME_CONSTANT:
+            return true; // Constants are always OK
+
+        case ME_VARIABLE:
+            return n->dtype == target_type;
+
+        case ME_FUNCTION0:
+        case ME_FUNCTION1:
+        case ME_FUNCTION2:
+        case ME_FUNCTION3:
+        case ME_FUNCTION4:
+        case ME_FUNCTION5:
+        case ME_FUNCTION6:
+        case ME_FUNCTION7:
+        case ME_CLOSURE0:
+        case ME_CLOSURE1:
+        case ME_CLOSURE2:
+        case ME_CLOSURE3:
+        case ME_CLOSURE4:
+        case ME_CLOSURE5:
+        case ME_CLOSURE6:
+        case ME_CLOSURE7: {
+            const int arity = ARITY(n->type);
+            for (int i = 0; i < arity; i++) {
+                if (!all_variables_match_type((const me_expr *) n->parameters[i], target_type)) {
+                    return false;
+                }
+            }
+            return true;
+        }
+    }
+
+    return true;
+}
+
+void me_eval(const me_expr *n) {
+    if (!n) return;
+
+    // Infer the result type from the expression tree
+    me_dtype result_type = infer_result_type(n);
+
+    // If all variables already match result type, use fast path
+    bool all_match = all_variables_match_type(n, result_type);
+    if (result_type == n->dtype && all_match) {
+        // Fast path: no promotion needed
+        switch (n->dtype) {
+            case ME_BOOL: me_eval_i8(n);
+                break;
+            case ME_INT8: me_eval_i8(n);
+                break;
+            case ME_INT16: me_eval_i16(n);
+                break;
+            case ME_INT32: me_eval_i32(n);
+                break;
+            case ME_INT64: me_eval_i64(n);
+                break;
+            case ME_UINT8: me_eval_u8(n);
+                break;
+            case ME_UINT16: me_eval_u16(n);
+                break;
+            case ME_UINT32: me_eval_u32(n);
+                break;
+            case ME_UINT64: me_eval_u64(n);
+                break;
+            case ME_FLOAT32: me_eval_f32(n);
+                break;
+            case ME_FLOAT64: me_eval_f64(n);
+                break;
+            case ME_COMPLEX64: me_eval_c64(n);
+                break;
+            case ME_COMPLEX128: me_eval_c128(n);
+                break;
+        }
+        return;
+    }
+
+    // Slow path: need to promote variables
+    // Allocate tracking structures (max 100 variables)
+    promoted_var_t promotions[100];
+    int promo_count = 0;
+
+    // Save original variable bindings
+    const void *original_bounds[100];
+    me_dtype original_types[100];
+    int save_idx = 0;
+
+    save_variable_bindings(n, original_bounds, original_types, &save_idx);
+
+    // Promote variables
+    promome_variables_in_tree((me_expr *) n, result_type, promotions, &promo_count, n->nitems);
+
+    // Update expression type
+    me_dtype saved_dtype = n->dtype;
+    ((me_expr *) n)->dtype = result_type;
+
+    // Evaluate with promoted types
+    switch (result_type) {
+        case ME_BOOL: me_eval_i8(n);
+            break;
+        case ME_INT8: me_eval_i8(n);
+            break;
+        case ME_INT16: me_eval_i16(n);
+            break;
+        case ME_INT32: me_eval_i32(n);
+            break;
+        case ME_INT64: me_eval_i64(n);
+            break;
+        case ME_UINT8: me_eval_u8(n);
+            break;
+        case ME_UINT16: me_eval_u16(n);
+            break;
+        case ME_UINT32: me_eval_u32(n);
+            break;
+        case ME_UINT64: me_eval_u64(n);
+            break;
+        case ME_FLOAT32: me_eval_f32(n);
+            break;
+        case ME_FLOAT64: me_eval_f64(n);
+            break;
+        case ME_COMPLEX64: me_eval_c64(n);
+            break;
+        case ME_COMPLEX128: me_eval_c128(n);
+            break;
+    }
+
+    // Restore original variable bindings
+    int restore_idx = 0;
+    restore_variables_in_tree((me_expr *) n, original_bounds, original_types, &restore_idx);
+
+    // Restore expression type
+    ((me_expr *) n)->dtype = saved_dtype;
+
+    // Free promoted buffers
+    for (int i = 0; i < promo_count; i++) {
+        if (promotions[i].needs_free) {
+            free(promotions[i].promoted_data);
+        }
+    }
+}
+
+/* Helper to update variable bindings and nitems in tree */
+static void save_nitems_in_tree(const me_expr *node, int *nitems_array, int *idx) {
+    if (!node) return;
+    nitems_array[(*idx)++] = node->nitems;
+
+    switch (TYPE_MASK(node->type)) {
+        case ME_FUNCTION0:
+        case ME_FUNCTION1:
+        case ME_FUNCTION2:
+        case ME_FUNCTION3:
+        case ME_FUNCTION4:
+        case ME_FUNCTION5:
+        case ME_FUNCTION6:
+        case ME_FUNCTION7:
+        case ME_CLOSURE0:
+        case ME_CLOSURE1:
+        case ME_CLOSURE2:
+        case ME_CLOSURE3:
+        case ME_CLOSURE4:
+        case ME_CLOSURE5:
+        case ME_CLOSURE6:
+        case ME_CLOSURE7: {
+            const int arity = ARITY(node->type);
+            for (int i = 0; i < arity; i++) {
+                save_nitems_in_tree((const me_expr *) node->parameters[i], nitems_array, idx);
+            }
+            break;
+        }
+        default:
+            break;
+    }
+}
+
+static void restore_nitems_in_tree(me_expr *node, const int *nitems_array, int *idx) {
+    if (!node) return;
+    node->nitems = nitems_array[(*idx)++];
+
+    switch (TYPE_MASK(node->type)) {
+        case ME_FUNCTION0:
+        case ME_FUNCTION1:
+        case ME_FUNCTION2:
+        case ME_FUNCTION3:
+        case ME_FUNCTION4:
+        case ME_FUNCTION5:
+        case ME_FUNCTION6:
+        case ME_FUNCTION7:
+        case ME_CLOSURE0:
+        case ME_CLOSURE1:
+        case ME_CLOSURE2:
+        case ME_CLOSURE3:
+        case ME_CLOSURE4:
+        case ME_CLOSURE5:
+        case ME_CLOSURE6:
+        case ME_CLOSURE7: {
+            const int arity = ARITY(node->type);
+            for (int i = 0; i < arity; i++) {
+                restore_nitems_in_tree((me_expr *) node->parameters[i], nitems_array, idx);
+            }
+            break;
+        }
+        default:
+            break;
+    }
+}
+
+/* Helper to free intermediate output buffers */
+static void free_intermediate_buffers(me_expr *node) {
+    if (!node) return;
+
+    switch (TYPE_MASK(node->type)) {
+        case ME_CONSTANT:
+        case ME_VARIABLE:
+            // These don't have intermediate buffers
+            break;
+
+        case ME_FUNCTION0:
+        case ME_FUNCTION1:
+        case ME_FUNCTION2:
+        case ME_FUNCTION3:
+        case ME_FUNCTION4:
+        case ME_FUNCTION5:
+        case ME_FUNCTION6:
+        case ME_FUNCTION7:
+        case ME_CLOSURE0:
+        case ME_CLOSURE1:
+        case ME_CLOSURE2:
+        case ME_CLOSURE3:
+        case ME_CLOSURE4:
+        case ME_CLOSURE5:
+        case ME_CLOSURE6:
+        case ME_CLOSURE7: {
+            const int arity = ARITY(node->type);
+            for (int i = 0; i < arity; i++) {
+                me_expr *param = (me_expr *) node->parameters[i];
+                free_intermediate_buffers(param);
+
+                // Free intermediate buffer (but not for root or variables/constants)
+                if (param->type != ME_CONSTANT && param->type != ME_VARIABLE && param->output) {
+                    free(param->output);
+                    param->output = NULL;
+                }
+            }
+            break;
+        }
+    }
+}
+
+/* Helper to save original variable bindings with their pointers */
+static void save_variable_pointers(const me_expr *node, const void **var_pointers, int *var_count) {
+    if (!node) return;
+    switch (TYPE_MASK(node->type)) {
+        case ME_VARIABLE:
+            // Check if this pointer is already in the list
+            for (int i = 0; i < *var_count; i++) {
+                if (var_pointers[i] == node->bound) return; // Already saved
+            }
+            var_pointers[*var_count] = node->bound;
+            (*var_count)++;
+            break;
+        case ME_FUNCTION0:
+        case ME_FUNCTION1:
+        case ME_FUNCTION2:
+        case ME_FUNCTION3:
+        case ME_FUNCTION4:
+        case ME_FUNCTION5:
+        case ME_FUNCTION6:
+        case ME_FUNCTION7:
+        case ME_CLOSURE0:
+        case ME_CLOSURE1:
+        case ME_CLOSURE2:
+        case ME_CLOSURE3:
+        case ME_CLOSURE4:
+        case ME_CLOSURE5:
+        case ME_CLOSURE6:
+        case ME_CLOSURE7: {
+            const int arity = ARITY(node->type);
+            for (int i = 0; i < arity; i++) {
+                save_variable_pointers((const me_expr *) node->parameters[i], var_pointers, var_count);
+            }
+            break;
+        }
+    }
+}
+
+/* Helper to update variable bindings by matching original pointers */
+static void update_vars_by_pointer(me_expr *node, const void **old_pointers, const void **new_pointers, int n_vars) {
+    if (!node) return;
+    switch (TYPE_MASK(node->type)) {
+        case ME_VARIABLE:
+            // Find which variable this is and update to new pointer
+            for (int i = 0; i < n_vars; i++) {
+                if (node->bound == old_pointers[i]) {
+                    node->bound = new_pointers[i];
+                    break;
+                }
+            }
+            break;
+        case ME_FUNCTION0:
+        case ME_FUNCTION1:
+        case ME_FUNCTION2:
+        case ME_FUNCTION3:
+        case ME_FUNCTION4:
+        case ME_FUNCTION5:
+        case ME_FUNCTION6:
+        case ME_FUNCTION7:
+        case ME_CLOSURE0:
+        case ME_CLOSURE1:
+        case ME_CLOSURE2:
+        case ME_CLOSURE3:
+        case ME_CLOSURE4:
+        case ME_CLOSURE5:
+        case ME_CLOSURE6:
+        case ME_CLOSURE7: {
+            const int arity = ARITY(node->type);
+            for (int i = 0; i < arity; i++) {
+                update_vars_by_pointer((me_expr *) node->parameters[i], old_pointers, new_pointers, n_vars);
+            }
+            break;
+        }
+    }
+}
+
+/* Helper to update variable bindings and nitems in tree */
+static void update_variable_bindings(me_expr *node, const void **new_bounds, int *var_idx, int new_nitems) {
+    if (!node) return;
+
+    // Update nitems for all nodes to handle intermediate buffers
+    if (new_nitems > 0) {
+        node->nitems = new_nitems;
+    }
+
+    switch (TYPE_MASK(node->type)) {
+        case ME_VARIABLE:
+            if (new_bounds && *var_idx >= 0) {
+                node->bound = new_bounds[*var_idx];
+                (*var_idx)++;
+            }
+            break;
+        case ME_FUNCTION0:
+        case ME_FUNCTION1:
+        case ME_FUNCTION2:
+        case ME_FUNCTION3:
+        case ME_FUNCTION4:
+        case ME_FUNCTION5:
+        case ME_FUNCTION6:
+        case ME_FUNCTION7:
+        case ME_CLOSURE0:
+        case ME_CLOSURE1:
+        case ME_CLOSURE2:
+        case ME_CLOSURE3:
+        case ME_CLOSURE4:
+        case ME_CLOSURE5:
+        case ME_CLOSURE6:
+        case ME_CLOSURE7: {
+            const int arity = ARITY(node->type);
+            for (int i = 0; i < arity; i++) {
+                update_variable_bindings((me_expr *) node->parameters[i], new_bounds, var_idx, new_nitems);
+            }
+            break;
+        }
+    }
+}
+
+/* Evaluate compiled expression with new variable and output pointers */
+void me_eval_chunk(const me_expr *expr, const void **vars_chunk, int n_vars,
+                   void *output_chunk, int chunk_nitems) {
+    if (!expr) return;
+
+    // Save original variable pointers (unique list)
+    const void *original_var_pointers[100];
+    int actual_var_count = 0;
+    save_variable_pointers(expr, original_var_pointers, &actual_var_count);
+
+    // Verify variable count matches
+    if (actual_var_count != n_vars) {
+        // Mismatch in variable count
+        return;
+    }
+
+    // Save original state
+    int original_nitems_array[100];
+    void *original_output = expr->output;
+
+    // Save original nitems for all nodes
+    int nitems_idx = 0;
+    save_nitems_in_tree(expr, original_nitems_array, &nitems_idx);
+
+    // Free intermediate buffers so they can be reallocated with correct size
+    free_intermediate_buffers((me_expr *) expr);
+
+    // Update variable bindings to new chunk pointers (by matching old pointers)
+    update_vars_by_pointer((me_expr *) expr, original_var_pointers, vars_chunk, n_vars);
+
+    // Update nitems throughout the tree
+    int update_idx = 0; // dummy variable
+    update_variable_bindings((me_expr *) expr, NULL, &update_idx, chunk_nitems);
+
+    // Update output pointer
+    ((me_expr *) expr)->output = output_chunk;
+
+    // Evaluate with new pointers
+    me_eval(expr);
+
+    // Restore original variable bindings
+    update_vars_by_pointer((me_expr *) expr, vars_chunk, original_var_pointers, n_vars);
+
+    // Restore output
+    ((me_expr *) expr)->output = original_output;
+
+    // Restore nitems for all nodes
+    nitems_idx = 0;
+    restore_nitems_in_tree((me_expr *) expr, original_nitems_array, &nitems_idx);
+}
+
+/* Clone an expression tree (deep copy of structure, shallow copy of data) */
+static me_expr *clone_expr(const me_expr *src) {
+    if (!src) return NULL;
+
+    const int arity = ARITY(src->type);
+    const int psize = sizeof(void *) * arity;
+    const int size = (sizeof(me_expr) - sizeof(void *)) + psize + (IS_CLOSURE(src->type) ? sizeof(void *) : 0);
+    me_expr *clone = malloc(size);
+    if (!clone) return NULL;
+
+    // Copy the entire structure
+    memcpy(clone, src, size);
+
+    // Clone children recursively
+    if (arity > 0) {
+        for (int i = 0; i < arity; i++) {
+            clone->parameters[i] = clone_expr((const me_expr *) src->parameters[i]);
+            if (src->parameters[i] && !clone->parameters[i]) {
+                // Clone failed, clean up
+                for (int j = 0; j < i; j++) {
+                    me_free((me_expr *) clone->parameters[j]);
+                }
+                free(clone);
+                return NULL;
+            }
+        }
+    }
+
+    // Don't clone output buffer - it will be set by caller
+    // Don't clone bytecode - not needed for clones
+    clone->output = NULL;
+    clone->bytecode = NULL;
+    clone->ncode = 0;
+
+    return clone;
+}
+
+/* Thread-safe chunked evaluation using expression cloning.
+ * This function is safe to call from multiple threads simultaneously,
+ * even on the same expression object. Each call creates a temporary
+ * clone of the expression tree to avoid race conditions. */
+void me_eval_chunk_threadsafe(const me_expr *expr, const void **vars_chunk,
+                              int n_vars, void *output_chunk, int chunk_nitems) {
+    if (!expr) return;
+
+    // Verify variable count matches
+    const void *original_var_pointers[100];
+    int actual_var_count = 0;
+    save_variable_pointers(expr, original_var_pointers, &actual_var_count);
+
+    if (actual_var_count != n_vars) {
+        return;
+    }
+
+    // Clone the expression tree
+    me_expr *clone = clone_expr(expr);
+    if (!clone) return;
+
+    // Update clone's variable bindings
+    update_vars_by_pointer(clone, original_var_pointers, vars_chunk, n_vars);
+
+    // Update clone's nitems throughout the tree
+    int update_idx = 0;
+    update_variable_bindings(clone, NULL, &update_idx, chunk_nitems);
+
+    // Set output pointer
+    clone->output = output_chunk;
+
+    // Evaluate the clone
+    me_eval(clone);
+
+    // Free the clone (including any intermediate buffers it allocated)
+    me_free(clone);
+}
+
+
+static void optimize(me_expr *n) {
+    /* Evaluates as much as possible. */
+    if (!n) return;
+    if (n->type == ME_CONSTANT) return;
+    if (n->type == ME_VARIABLE) return;
+
+    /* Only optimize out functions flagged as pure. */
+    if (IS_PURE(n->type)) {
+        const int arity = ARITY(n->type);
+        int known = 1;
+        int i;
+        for (i = 0; i < arity; ++i) {
+            optimize(n->parameters[i]);
+            if (((me_expr *) (n->parameters[i]))->type != ME_CONSTANT) {
+                known = 0;
+            }
+        }
+        if (known) {
+            const double value = me_eval_scalar(n);
+            me_free_parameters(n);
+            n->type = ME_CONSTANT;
+            n->value = value;
+        }
+    }
+}
+
+
+me_expr *me_compile(const char *expression, const me_variable *variables, int var_count,
+                    void *output, int nitems, me_dtype dtype, int *error) {
+    // Create a copy of variables with dtype filled in (if not already set)
+    me_variable *vars_copy = NULL;
+    if (variables && var_count > 0) {
+        vars_copy = malloc(var_count * sizeof(me_variable));
+        if (!vars_copy) {
+            if (error) *error = -1;
+            return NULL;
+        }
+        for (int i = 0; i < var_count; i++) {
+            vars_copy[i] = variables[i];
+            // If dtype not set (0 = ME_BOOL, which is unlikely for user variables),
+            // use the expression's dtype
+            if (vars_copy[i].dtype == 0 && vars_copy[i].type == 0) {
+                vars_copy[i].dtype = dtype;
+                vars_copy[i].type = ME_VARIABLE;
+            }
+        }
+    }
+
+    state s;
+    s.start = s.next = expression;
+    s.lookup = vars_copy ? vars_copy : variables;
+    s.lookup_len = var_count;
+    s.target_dtype = dtype; // Set target dtype for constants
+
+    next_token(&s);
+    me_expr *root = list(&s);
+
+    if (vars_copy) free(vars_copy);
+
+    if (root == NULL) {
+        if (error) *error = -1;
+        return NULL;
+    }
+
+    if (s.type != TOK_END) {
+        me_free(root);
+        if (error) {
+            *error = (s.next - s.start);
+            if (*error == 0) *error = 1;
+        }
+        return 0;
+    } else {
+        optimize(root);
+        root->output = output;
+        root->nitems = nitems;
+        root->dtype = dtype;
+        if (error) *error = 0;
+        return root;
+    }
+}
+
+static void pn(const me_expr *n, int depth) {
+    int i, arity;
+    printf("%*s", depth, "");
+
+    if (!n) {
+        printf("NULL\n");
+        return;
+    }
+
+    switch (TYPE_MASK(n->type)) {
+        case ME_CONSTANT: printf("%f\n", n->value);
+            break;
+        case ME_VARIABLE: printf("bound %p\n", n->bound);
+            break;
+
+        case ME_FUNCTION0:
+        case ME_FUNCTION1:
+        case ME_FUNCTION2:
+        case ME_FUNCTION3:
+        case ME_FUNCTION4:
+        case ME_FUNCTION5:
+        case ME_FUNCTION6:
+        case ME_FUNCTION7:
+        case ME_CLOSURE0:
+        case ME_CLOSURE1:
+        case ME_CLOSURE2:
+        case ME_CLOSURE3:
+        case ME_CLOSURE4:
+        case ME_CLOSURE5:
+        case ME_CLOSURE6:
+        case ME_CLOSURE7:
+            arity = ARITY(n->type);
+            printf("f%d", arity);
+            for (i = 0; i < arity; i++) {
+                printf(" %p", n->parameters[i]);
+            }
+            printf("\n");
+            for (i = 0; i < arity; i++) {
+                pn(n->parameters[i], depth + 1);
+            }
+            break;
+    }
+}
+
+
+void me_print(const me_expr *n) {
+    pn(n, 0);
+}
+
+
+/* ============================================================================
+ * BYTECODE COMPILER AND FUSED EXECUTOR
+ * ============================================================================
+ * This implements expression flattening for optimal performance.
+ * The bytecode is type-agnostic and enables loop fusion.
+ */
+
+typedef enum {
+    BC_LOAD_VAR, // Load from variable array: reg[dst] = vars[src1][i]
+    BC_LOAD_CONST, // Load constant: reg[dst] = constant
+    BC_ADD, // reg[dst] = reg[src1] + reg[src2]
+    BC_SUB, // reg[dst] = reg[src1] - reg[src2]
+    BC_MUL, // reg[dst] = reg[src1] * reg[src2]
+    BC_DIV, // reg[dst] = reg[src1] / reg[src2]
+    BC_POW, // reg[dst] = pow(reg[src1], reg[src2])
+    BC_NEG, // reg[dst] = -reg[src1]
+    BC_SQRT, // reg[dst] = sqrt(reg[src1])
+    BC_SIN, // reg[dst] = sin(reg[src1])
+    BC_COS, // reg[dst] = cos(reg[src1])
+    BC_EXP, // reg[dst] = exp(reg[src1])
+    BC_LOG, // reg[dst] = log(reg[src1])
+    BC_ABS, // reg[dst] = fabs(reg[src1])
+    BC_CALL1, // reg[dst] = function(reg[src1])
+    BC_CALL2, // reg[dst] = function(reg[src1], reg[src2])
+    BC_STORE, // output[i] = reg[src1]
+    BC_CONVERT // Type conversion: reg[dst] = convert(reg[src1])
+} bc_opcode;
+
+typedef struct {
+    bc_opcode op;
+    int src1; // First source register/variable index
+    int src2; // Second source register (for binary ops)
+    int dst; // Destination register
+    union {
+        double constant; // For BC_LOAD_CONST
+        const void *function; // For BC_CALL1/BC_CALL2
+        struct {
+            me_dtype from_type;
+            me_dtype to_type;
+        } convert; // For BC_CONVERT
+    } data;
+} bc_instruction;
+
+typedef struct {
+    bc_instruction *code;
+    int capacity;
+    int count;
+    int next_reg; // Next available register
+    const double **var_ptrs; // Array of variable pointers
+    int var_count; // Number of variables
+    int var_capacity;
+} bc_compiler;
+
+static bc_compiler *bc_new() {
+    bc_compiler *bc = malloc(sizeof(bc_compiler));
+    bc->capacity = 16;
+    bc->code = malloc(bc->capacity * sizeof(bc_instruction));
+    bc->count = 0;
+    bc->next_reg = 0;
+    bc->var_capacity = 16;
+    bc->var_ptrs = malloc(bc->var_capacity * sizeof(double *));
+    bc->var_count = 0;
+    return bc;
+}
+
+static void bc_free(bc_compiler *bc) {
+    if (bc) {
+        free(bc->code);
+        free(bc->var_ptrs);
+        free(bc);
+    }
+}
+
+static void bc_emit(bc_compiler *bc, bc_instruction inst) {
+    if (bc->count >= bc->capacity) {
+        bc->capacity *= 2;
+        bc->code = realloc(bc->code, bc->capacity * sizeof(bc_instruction));
+    }
+    bc->code[bc->count++] = inst;
+}
+
+static int bc_alloc_reg(bc_compiler *bc) {
+    return bc->next_reg++;
+}
+
+/* Find or add variable to mapping */
+static int bc_get_var_index(bc_compiler *bc, const double *var_ptr) {
+    for (int i = 0; i < bc->var_count; i++) {
+        if (bc->var_ptrs[i] == var_ptr) {
+            return i;
+        }
+    }
+    // Add new variable
+    if (bc->var_count >= bc->var_capacity) {
+        bc->var_capacity *= 2;
+        bc->var_ptrs = realloc(bc->var_ptrs, bc->var_capacity * sizeof(double *));
+    }
+    bc->var_ptrs[bc->var_count] = var_ptr;
+    return bc->var_count++;
+}
+
+/* Compile expression tree to bytecode */
+static int bc_compile_expr(bc_compiler *bc, const me_expr *n) {
+    if (!n) return -1;
+
+    int dst_reg;
+
+    switch (TYPE_MASK(n->type)) {
+        case ME_CONSTANT:
+            dst_reg = bc_alloc_reg(bc);
+            bc_emit(bc, (bc_instruction){BC_LOAD_CONST, -1, -1, dst_reg, {.constant = n->value}});
+            return dst_reg;
+
+        case ME_VARIABLE:
+            dst_reg = bc_alloc_reg(bc); {
+                int var_idx = bc_get_var_index(bc, n->bound);
+                bc_emit(bc, (bc_instruction){BC_LOAD_VAR, var_idx, -1, dst_reg, {.constant = 0}});
+            }
+            return dst_reg;
+
+        case ME_FUNCTION0:
+        case ME_FUNCTION0 | ME_FLAG_PURE:
+            // Constants like pi(), e()
+            dst_reg = bc_alloc_reg(bc); {
+                double (*func)(void) = (double(*)(void)) n->function;
+                double val = func();
+                bc_emit(bc, (bc_instruction){BC_LOAD_CONST, -1, -1, dst_reg, {.constant = val}});
+            }
+            return dst_reg;
+
+        case ME_FUNCTION1:
+        case ME_FUNCTION1 | ME_FLAG_PURE: {
+            int src = bc_compile_expr(bc, n->parameters[0]);
+            dst_reg = bc_alloc_reg(bc);
+
+            const void *func_ptr = n->function;
+
+            // Recognize common functions
+            if (func_ptr == (void *) sqrt) {
+                bc_emit(bc, (bc_instruction){BC_SQRT, src, -1, dst_reg, {.constant = 0}});
+            } else if (func_ptr == (void *) sin) {
+                bc_emit(bc, (bc_instruction){BC_SIN, src, -1, dst_reg, {.constant = 0}});
+            } else if (func_ptr == (void *) cos) {
+                bc_emit(bc, (bc_instruction){BC_COS, src, -1, dst_reg, {.constant = 0}});
+            } else if (func_ptr == (void *) exp) {
+                bc_emit(bc, (bc_instruction){BC_EXP, src, -1, dst_reg, {.constant = 0}});
+            } else if (func_ptr == (void *) log) {
+                bc_emit(bc, (bc_instruction){BC_LOG, src, -1, dst_reg, {.constant = 0}});
+            } else if (func_ptr == (void *) fabs) {
+                bc_emit(bc, (bc_instruction){BC_ABS, src, -1, dst_reg, {.constant = 0}});
+            } else if (func_ptr == (void *) negate) {
+                bc_emit(bc, (bc_instruction){BC_NEG, src, -1, dst_reg, {.constant = 0}});
+            } else {
+                // Generic call
+                bc_emit(bc, (bc_instruction){BC_CALL1, src, -1, dst_reg, {.function = func_ptr}});
+            }
+            return dst_reg;
+        }
+
+        case ME_FUNCTION2:
+        case ME_FUNCTION2 | ME_FLAG_PURE: {
+            int src1 = bc_compile_expr(bc, n->parameters[0]);
+            int src2 = bc_compile_expr(bc, n->parameters[1]);
+            dst_reg = bc_alloc_reg(bc);
+
+            me_fun2 func = (me_fun2) n->function;
+
+            // Recognize common functions
+            if (func == add) {
+                bc_emit(bc, (bc_instruction){BC_ADD, src1, src2, dst_reg, {.constant = 0}});
+            } else if (func == sub) {
+                bc_emit(bc, (bc_instruction){BC_SUB, src1, src2, dst_reg, {.constant = 0}});
+            } else if (func == mul) {
+                bc_emit(bc, (bc_instruction){BC_MUL, src1, src2, dst_reg, {.constant = 0}});
+            } else if (func == divide) {
+                bc_emit(bc, (bc_instruction){BC_DIV, src1, src2, dst_reg, {.constant = 0}});
+            } else if (func == (me_fun2) pow) {
+                bc_emit(bc, (bc_instruction){BC_POW, src1, src2, dst_reg, {.constant = 0}});
+            } else {
+                // Generic call
+                bc_emit(bc, (bc_instruction){BC_CALL2, src1, src2, dst_reg, {.function = (void *) func}});
+            }
+            return dst_reg;
+        }
+
+        default:
+            // For more complex cases, fall back to tree evaluation
+            return -1;
+    }
+}
+
+/* Compile expression to bytecode and attach to me_expr */
+static void me_compile_bytecode(me_expr *n) {
+    if (!n) return;
+
+    bc_compiler *bc = bc_new();
+
+    // Compile expression
+    int result_reg = bc_compile_expr(bc, n);
+
+    if (result_reg >= 0) {
+        // Emit store instruction
+        bc_emit(bc, (bc_instruction){BC_STORE, result_reg, -1, 0, {.constant = 0}});
+
+        // Attach to expression
+        n->bytecode = bc->code;
+        n->ncode = bc->count;
+
+        // Free compiler but keep code and var mapping
+        free((void *) bc->var_ptrs);
+        free(bc);
+    } else {
+        // Compilation failed, clean up
+        bc_free(bc);
+        n->bytecode = NULL;
+        n->ncode = 0;
+    }
+}
+
+/* Recursive helper for building variable array */
+static void me_traverse_vars(const me_expr *node, const double **vars, int *var_count, int max_vars) {
+    if (!node || *var_count >= max_vars) return;
+
+    if (node->type == ME_VARIABLE) {
+        // Check if already in array
+        for (int i = 0; i < *var_count; i++) {
+            if (vars[i] == node->bound) return;
+        }
+        vars[*var_count] = node->bound;
+        (*var_count)++;
+        return;
+    }
+
+    if (IS_FUNCTION(node->type) || IS_CLOSURE(node->type)) {
+        int arity = ARITY(node->type);
+        for (int i = 0; i < arity; i++) {
+            me_traverse_vars(node->parameters[i], vars, var_count, max_vars);
+        }
+    }
+}
+
+/* Build variable array from expression tree */
+static int me_build_var_array(const me_expr *n, const double **vars, int max_vars) {
+    int var_count = 0;
+    me_traverse_vars(n, vars, &var_count, max_vars);
+    return var_count;
+}
+
+/* Execute bytecode with fused loop - OPTIMIZED VERSION */
+void me_eval_fused(const me_expr *n) {
+    if (!n || !n->output || n->nitems <= 0) return;
+
+    // Compile bytecode if not already done
+    if (!n->bytecode) {
+        me_compile_bytecode((me_expr *) n);
+    }
+
+    // Fall back to regular eval if compilation failed
+    if (!n->bytecode) {
+        me_eval(n);
+        return;
+    }
+
+    const bc_instruction *code = n->bytecode;
+    const int ncode = n->ncode;
+    const int nitems = n->nitems;
+
+    // Build variable array - same order as during compilation
+    const double *vars[16];
+    me_build_var_array(n, vars, 16);
+
+    // Determine max register used
+    int max_reg = 0;
+    for (int pc = 0; pc < ncode; pc++) {
+        if (code[pc].dst > max_reg) max_reg = code[pc].dst;
+        if (code[pc].src1 > max_reg && code[pc].src1 >= 0) max_reg = code[pc].src1;
+        if (code[pc].src2 > max_reg && code[pc].src2 >= 0) max_reg = code[pc].src2;
+    }
+    max_reg++; // Convert to count
+
+    // Allocate temporary arrays for registers
+    double **temps = malloc(max_reg * sizeof(double *));
+    for (int r = 0; r < max_reg; r++) {
+        temps[r] = malloc(nitems * sizeof(double));
+    }
+
+    // Execute each instruction across ALL elements (loop fusion!)
+    for (int pc = 0; pc < ncode; pc++) {
+        bc_instruction inst = code[pc];
+        int i;
+
+        switch (inst.op) {
+            case BC_LOAD_VAR:
+                // Copy variable data to temp register
+                memcpy(temps[inst.dst], vars[inst.src1], nitems * sizeof(double));
+                break;
+
+            case BC_LOAD_CONST:
+                // Broadcast constant to all elements
+#pragma GCC ivdep
+                for (i = 0; i < nitems; i++) {
+                    temps[inst.dst][i] = inst.data.constant;
+                }
+                break;
+
+            case BC_ADD:
+                vec_add(temps[inst.src1], temps[inst.src2], temps[inst.dst], nitems);
+                break;
+
+            case BC_SUB:
+                vec_sub(temps[inst.src1], temps[inst.src2], temps[inst.dst], nitems);
+                break;
+
+            case BC_MUL:
+                vec_mul(temps[inst.src1], temps[inst.src2], temps[inst.dst], nitems);
+                break;
+
+            case BC_DIV:
+                vec_div(temps[inst.src1], temps[inst.src2], temps[inst.dst], nitems);
+                break;
+
+            case BC_POW:
+                vec_pow(temps[inst.src1], temps[inst.src2], temps[inst.dst], nitems);
+                break;
+
+            case BC_NEG:
+                vec_negate(temps[inst.src1], temps[inst.dst], nitems);
+                break;
+
+            case BC_SQRT:
+                vec_sqrt(temps[inst.src1], temps[inst.dst], nitems);
+                break;
+
+            case BC_SIN:
+                vec_sin(temps[inst.src1], temps[inst.dst], nitems);
+                break;
+
+            case BC_COS:
+                vec_cos(temps[inst.src1], temps[inst.dst], nitems);
+                break;
+
+            case BC_EXP:
+#pragma GCC ivdep
+                for (i = 0; i < nitems; i++) {
+                    temps[inst.dst][i] = exp(temps[inst.src1][i]);
+                }
+                break;
+
+            case BC_LOG:
+#pragma GCC ivdep
+                for (i = 0; i < nitems; i++) {
+                    temps[inst.dst][i] = log(temps[inst.src1][i]);
+                }
+                break;
+
+            case BC_ABS:
+#pragma GCC ivdep
+                for (i = 0; i < nitems; i++) {
+                    temps[inst.dst][i] = fabs(temps[inst.src1][i]);
+                }
+                break;
+
+            case BC_CALL1: {
+                double (*func)(double) = inst.data.function;
+#pragma GCC ivdep
+                for (i = 0; i < nitems; i++) {
+                    temps[inst.dst][i] = func(temps[inst.src1][i]);
+                }
+                break;
+            }
+
+            case BC_CALL2: {
+                double (*func)(double, double) = inst.data.function;
+#pragma GCC ivdep
+                for (i = 0; i < nitems; i++) {
+                    temps[inst.dst][i] = func(temps[inst.src1][i], temps[inst.src2][i]);
+                }
+                break;
+            }
+
+            case BC_CONVERT: {
+                // Type conversion - for now, this is a placeholder
+                // Full implementation requires type-aware bytecode execution
+                convert_func_t conv_func = get_convert_func(inst.data.convert.from_type,
+                                                            inst.data.convert.to_type);
+                if (conv_func) {
+                    conv_func(temps[inst.src1], temps[inst.dst], nitems);
+                } else {
+                    // No conversion needed or unsupported
+                    memcpy(temps[inst.dst], temps[inst.src1],
+                           nitems * dtype_size(inst.data.convert.from_type));
+                }
+                break;
+            }
+
+            case BC_STORE:
+                // Copy result to output
+                memcpy(n->output, temps[inst.src1], nitems * sizeof(double));
+                break;
+        }
+    }
+
+    // Free temporary arrays
+    for (int r = 0; r < max_reg; r++) {
+        free(temps[r]);
+    }
+    free(temps);
+}
diff --git a/src/blosc2/miniexpr.h b/src/blosc2/miniexpr.h
new file mode 100644
index 00000000..e8022974
--- /dev/null
+++ b/src/blosc2/miniexpr.h
@@ -0,0 +1,168 @@
+/*********************************************************************
+  Blosc - Blocked Shuffling and Compression Library
+
+  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
+  https://blosc.org
+  License: BSD 3-Clause (see LICENSE.txt)
+
+  See LICENSE.txt for details about copyright and rights to use.
+**********************************************************************/
+
+// Loosely based on https://github.com/CodePlea/tinyexpr. License follows:
+// SPDX-License-Identifier: Zlib
+/*
+ * TINYEXPR - Tiny recursive descent parser and evaluation engine in C
+ *
+ * Copyright (c) 2015-2020 Lewis Van Winkle
+ *
+ * http://CodePlea.com
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgement in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#ifndef MINIEXPR_H
+#define MINIEXPR_H
+
+
+#ifdef __cplusplus
+extern "C" {
+
+#endif
+
+
+/* Data type enumeration - Full C99 support */
+typedef enum {
+    /* Boolean */
+    ME_BOOL,
+
+    /* Signed integers */
+    ME_INT8,
+    ME_INT16,
+    ME_INT32,
+    ME_INT64,
+
+    /* Unsigned integers */
+    ME_UINT8,
+    ME_UINT16,
+    ME_UINT32,
+    ME_UINT64,
+
+    /* Floating point */
+    ME_FLOAT32,
+    ME_FLOAT64,
+
+    /* Complex (C99) */
+    ME_COMPLEX64, /* float complex */
+    ME_COMPLEX128 /* double complex */
+} me_dtype;
+
+typedef struct me_expr {
+    int type;
+
+    union {
+        double value;
+        const double *bound;
+        const void *function;
+    };
+
+    /* Vector operation info */
+    void *output; // Generic pointer (can be float* or double*)
+    int nitems;
+    me_dtype dtype; // Data type for this expression (result type after promotion)
+    me_dtype input_dtype; // Original input type (for variables/constants)
+    /* Bytecode info (for fused evaluation) */
+    void *bytecode; // Pointer to compiled bytecode
+    int ncode; // Number of instructions
+    void *parameters[1]; // Must be last (flexible array member)
+} me_expr;
+
+
+enum {
+    ME_VARIABLE = 0,
+
+    ME_FUNCTION0 = 8, ME_FUNCTION1, ME_FUNCTION2, ME_FUNCTION3,
+    ME_FUNCTION4, ME_FUNCTION5, ME_FUNCTION6, ME_FUNCTION7,
+
+    ME_CLOSURE0 = 16, ME_CLOSURE1, ME_CLOSURE2, ME_CLOSURE3,
+    ME_CLOSURE4, ME_CLOSURE5, ME_CLOSURE6, ME_CLOSURE7,
+
+    ME_FLAG_PURE = 32
+};
+
+typedef struct me_variable {
+    const char *name;
+    const void *address;
+    int type;
+    void *context;
+    me_dtype dtype; // Data type of this variable
+} me_variable;
+
+
+/* Parses the input expression and binds variables. */
+/* Returns NULL on error. */
+/* dtype parameter is ignored - result type is inferred from variable types */
+/* The actual result type is returned in n->dtype */
+me_expr *me_compile(const char *expression, const me_variable *variables, int var_count,
+                    void *output, int nitems, me_dtype dtype, int *error);
+
+/* Evaluates the expression on vectors. */
+void me_eval(const me_expr *n);
+
+/* Evaluates using fused bytecode (faster for complex expressions). */
+void me_eval_fused(const me_expr *n);
+
+/* Evaluates compiled expression with new variable and output pointers.
+ * This allows processing large arrays in chunks without recompiling.
+ *
+ * Parameters:
+ *   expr: Compiled expression (from me_compile)
+ *   vars_chunk: Array of pointers to variable data chunks (same order as in me_compile)
+ *   n_vars: Number of variables (must match the number used in me_compile)
+ *   output_chunk: Pointer to output buffer for this chunk
+ *   chunk_nitems: Number of elements in this chunk
+ *
+ * Note: The chunks must have the same data types as the original variables.
+ * WARNING: This function is NOT thread-safe. Use me_eval_chunk_threadsafe() for
+ *          concurrent evaluation from multiple threads.
+ */
+void me_eval_chunk(const me_expr *expr, const void **vars_chunk, int n_vars,
+                   void *output_chunk, int chunk_nitems);
+
+/* Thread-safe version of me_eval_chunk.
+ * This function can be safely called from multiple threads simultaneously on the
+ * same compiled expression. It creates a temporary clone of the expression tree
+ * for each call, eliminating race conditions at the cost of some memory allocation.
+ *
+ * Use this when you need to evaluate the same expression in parallel across
+ * different chunks from multiple threads.
+ */
+void me_eval_chunk_threadsafe(const me_expr *expr, const void **vars_chunk,
+                              int n_vars, void *output_chunk, int chunk_nitems);
+
+/* Prints debugging information on the syntax tree. */
+void me_print(const me_expr *n);
+
+/* Frees the expression. */
+/* This is safe to call on NULL pointers. */
+void me_free(me_expr *n);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*MINIEXPR_H*/

From 8358c5920fea8d12d71740754738942b279df548 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 12 Dec 2025 07:44:16 +0100
Subject: [PATCH 002/123] Upgrade version of miniexpr

---
 src/blosc2/miniexpr.c       | 197 +++++++++++++++++++++++++++++-------
 src/blosc2/miniexpr.h       |  60 +++++++++--
 src/blosc2/miniexpr_numpy.h | 157 ++++++++++++++++++++++++++++
 3 files changed, 374 insertions(+), 40 deletions(-)
 create mode 100644 src/blosc2/miniexpr_numpy.h

diff --git a/src/blosc2/miniexpr.c b/src/blosc2/miniexpr.c
index 7185bb80..63ce1e7b 100755
--- a/src/blosc2/miniexpr.c
+++ b/src/blosc2/miniexpr.c
@@ -56,6 +56,7 @@ For log = natural log uncomment the next line. */
 #include <stdint.h>
 #include <stdbool.h>
 #include <complex.h>
+#include <assert.h>
 
 #ifndef NAN
 #define NAN (0.0/0.0)
@@ -76,6 +77,7 @@ enum {
 
 
 /* Type promotion table following NumPy rules */
+/* Note: ME_AUTO (0) should never appear in type promotion, so we index from 1 */
 static const me_dtype type_promotion_table[13][13] = {
     /* Rows: left operand, Columns: right operand */
     /* BOOL,  INT8,    INT16,   INT32,   INT64,   UINT8,   UINT16,  UINT32,  UINT64,  FLOAT32, FLOAT64, COMPLEX64, COMPLEX128 */
@@ -135,10 +137,24 @@ static const me_dtype type_promotion_table[13][13] = {
 
 /* Promote two types according to NumPy rules */
 static me_dtype promome_types(me_dtype a, me_dtype b) {
-    if (a >= 0 && a < 13 && b >= 0 && b < 13) {
-        return type_promotion_table[a][b];
+    // ME_AUTO should have been resolved during compilation
+    if (a == ME_AUTO || b == ME_AUTO) {
+        fprintf(stderr, "FATAL: ME_AUTO in type promotion (a=%d, b=%d). This is a bug.\n", a, b);
+#ifdef NDEBUG
+        abort(); // Release build: terminate immediately
+#else
+        assert(0 && "ME_AUTO should be resolved during compilation"); // Debug: trigger debugger
+#endif
+    }
+
+    // Adjust indices since table starts at ME_BOOL (index 1), not ME_AUTO (index 0)
+    int a_idx = a - 1;
+    int b_idx = b - 1;
+    if (a_idx >= 0 && a_idx < 13 && b_idx >= 0 && b_idx < 13) {
+        return type_promotion_table[a_idx][b_idx];
     }
-    return ME_FLOAT64; // Fallback
+    fprintf(stderr, "WARNING: Invalid dtype in type promotion (a=%d, b=%d). Falling back to FLOAT64.\n", a, b);
+    return ME_FLOAT64; // Fallback for out-of-range types
 }
 
 /* Get size of a type in bytes */
@@ -417,35 +433,36 @@ static double npr(double n, double r) { return ncr(n, r) * fac(r); }
 
 static const me_variable functions[] = {
     /* must be in alphabetical order */
-    {"abs", fabs, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"acos", acos, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"asin", asin, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"atan", atan, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"atan2", atan2, ME_FUNCTION2 | ME_FLAG_PURE, 0},
-    {"ceil", ceil, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"cos", cos, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"cosh", cosh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"e", e, ME_FUNCTION0 | ME_FLAG_PURE, 0},
-    {"exp", exp, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"fac", fac, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"floor", floor, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"ln", log, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    /* Format: {name, dtype, address, type, context} */
+    {"abs", 0, fabs, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"acos", 0, acos, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"asin", 0, asin, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"atan", 0, atan, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"atan2", 0, atan2, ME_FUNCTION2 | ME_FLAG_PURE, 0},
+    {"ceil", 0, ceil, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"cos", 0, cos, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"cosh", 0, cosh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"e", 0, e, ME_FUNCTION0 | ME_FLAG_PURE, 0},
+    {"exp", 0, exp, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"fac", 0, fac, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"floor", 0, floor, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"ln", 0, log, ME_FUNCTION1 | ME_FLAG_PURE, 0},
 #ifdef ME_NAT_LOG
-    {"log", log, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"log", 0, log, ME_FUNCTION1 | ME_FLAG_PURE, 0},
 #else
-    {"log", log10, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"log", 0, log10, ME_FUNCTION1 | ME_FLAG_PURE, 0},
 #endif
-    {"log10", log10, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"ncr", ncr, ME_FUNCTION2 | ME_FLAG_PURE, 0},
-    {"npr", npr, ME_FUNCTION2 | ME_FLAG_PURE, 0},
-    {"pi", pi, ME_FUNCTION0 | ME_FLAG_PURE, 0},
-    {"pow", pow, ME_FUNCTION2 | ME_FLAG_PURE, 0},
-    {"sin", sin, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"sinh", sinh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"sqrt", sqrt, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"tan", tan, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"tanh", tanh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {0, 0, 0, 0}
+    {"log10", 0, log10, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"ncr", 0, ncr, ME_FUNCTION2 | ME_FLAG_PURE, 0},
+    {"npr", 0, npr, ME_FUNCTION2 | ME_FLAG_PURE, 0},
+    {"pi", 0, pi, ME_FUNCTION0 | ME_FLAG_PURE, 0},
+    {"pow", 0, pow, ME_FUNCTION2 | ME_FLAG_PURE, 0},
+    {"sin", 0, sin, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"sinh", 0, sinh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"sqrt", 0, sqrt, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"tan", 0, tan, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"tanh", 0, tanh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {0, 0, 0, 0, 0}
 };
 
 static const me_variable *find_builtin(const char *name, int len) {
@@ -2357,6 +2374,14 @@ void me_eval(const me_expr *n) {
     bool all_match = all_variables_match_type(n, result_type);
     if (result_type == n->dtype && all_match) {
         // Fast path: no promotion needed
+        if (n->dtype == ME_AUTO) {
+            fprintf(stderr, "FATAL: ME_AUTO dtype in evaluation. This is a bug.\n");
+#ifdef NDEBUG
+            abort(); // Release build: terminate immediately
+#else
+            assert(0 && "ME_AUTO should be resolved during compilation"); // Debug: trigger debugger
+#endif
+        }
         switch (n->dtype) {
             case ME_BOOL: me_eval_i8(n);
                 break;
@@ -2384,6 +2409,13 @@ void me_eval(const me_expr *n) {
                 break;
             case ME_COMPLEX128: me_eval_c128(n);
                 break;
+            default:
+                fprintf(stderr, "FATAL: Invalid dtype %d in evaluation.\n", n->dtype);
+#ifdef NDEBUG
+                abort(); // Release build: terminate immediately
+#else
+                assert(0 && "Invalid dtype"); // Debug: trigger debugger
+#endif
         }
         return;
     }
@@ -2408,6 +2440,14 @@ void me_eval(const me_expr *n) {
     ((me_expr *) n)->dtype = result_type;
 
     // Evaluate with promoted types
+    if (result_type == ME_AUTO) {
+        fprintf(stderr, "FATAL: ME_AUTO result type in evaluation. This is a bug.\n");
+#ifdef NDEBUG
+        abort(); // Release build: terminate immediately
+#else
+        assert(0 && "ME_AUTO should be resolved during compilation"); // Debug: trigger debugger
+#endif
+    }
     switch (result_type) {
         case ME_BOOL: me_eval_i8(n);
             break;
@@ -2435,6 +2475,13 @@ void me_eval(const me_expr *n) {
             break;
         case ME_COMPLEX128: me_eval_c128(n);
             break;
+        default:
+            fprintf(stderr, "FATAL: Invalid result type %d in evaluation.\n", result_type);
+#ifdef NDEBUG
+            abort(); // Release build: terminate immediately
+#else
+            assert(0 && "Invalid dtype"); // Debug: trigger debugger
+#endif
     }
 
     // Restore original variable bindings
@@ -2831,6 +2878,37 @@ static void optimize(me_expr *n) {
 
 me_expr *me_compile(const char *expression, const me_variable *variables, int var_count,
                     void *output, int nitems, me_dtype dtype, int *error) {
+    // Validate dtype usage: either all vars are ME_AUTO (use dtype), or dtype is ME_AUTO (use var dtypes)
+    if (variables && var_count > 0) {
+        int auto_count = 0;
+        int specified_count = 0;
+
+        for (int i = 0; i < var_count; i++) {
+            if (variables[i].dtype == ME_AUTO) {
+                auto_count++;
+            } else {
+                specified_count++;
+            }
+        }
+
+        // Check the two valid modes
+        if (dtype == ME_AUTO) {
+            // Mode 1: Output dtype is ME_AUTO, all variables must have explicit dtypes
+            if (auto_count > 0) {
+                fprintf(stderr, "Error: When output dtype is ME_AUTO, all variable dtypes must be specified (not ME_AUTO)\n");
+                if (error) *error = -1;
+                return NULL;
+            }
+        } else {
+            // Mode 2: Output dtype is specified, all variables must be ME_AUTO
+            if (specified_count > 0) {
+                fprintf(stderr, "Error: When output dtype is specified, all variable dtypes must be ME_AUTO\n");
+                if (error) *error = -1;
+                return NULL;
+            }
+        }
+    }
+
     // Create a copy of variables with dtype filled in (if not already set)
     me_variable *vars_copy = NULL;
     if (variables && var_count > 0) {
@@ -2841,9 +2919,8 @@ me_expr *me_compile(const char *expression, const me_variable *variables, int va
         }
         for (int i = 0; i < var_count; i++) {
             vars_copy[i] = variables[i];
-            // If dtype not set (0 = ME_BOOL, which is unlikely for user variables),
-            // use the expression's dtype
-            if (vars_copy[i].dtype == 0 && vars_copy[i].type == 0) {
+            // If dtype not set (ME_AUTO), use the provided dtype
+            if (vars_copy[i].dtype == ME_AUTO && vars_copy[i].type == 0) {
                 vars_copy[i].dtype = dtype;
                 vars_copy[i].type = ME_VARIABLE;
             }
@@ -2854,7 +2931,7 @@ me_expr *me_compile(const char *expression, const me_variable *variables, int va
     s.start = s.next = expression;
     s.lookup = vars_copy ? vars_copy : variables;
     s.lookup_len = var_count;
-    s.target_dtype = dtype; // Set target dtype for constants
+    s.target_dtype = (dtype != ME_AUTO) ? dtype : ME_FLOAT64; // Set target dtype for constants
 
     next_token(&s);
     me_expr *root = list(&s);
@@ -2877,12 +2954,64 @@ me_expr *me_compile(const char *expression, const me_variable *variables, int va
         optimize(root);
         root->output = output;
         root->nitems = nitems;
-        root->dtype = dtype;
+
+        // If dtype is ME_AUTO, infer from expression; otherwise use provided dtype
+        if (dtype == ME_AUTO) {
+            root->dtype = infer_result_type(root);
+        } else {
+            root->dtype = dtype;
+        }
+
         if (error) *error = 0;
         return root;
     }
 }
 
+// Synthetic addresses for ordinal matching (when user provides NULL addresses)
+static char synthetic_var_addresses[100];
+
+me_expr *me_compile_chunk(const char *expression, const me_variable *variables,
+                          int var_count, me_dtype dtype, int *error) {
+    // For chunked evaluation, we compile without specific output/nitems
+    // If variables have NULL addresses, assign synthetic unique addresses for ordinal matching
+    me_variable *vars_copy = NULL;
+    int needs_synthetic = 0;
+
+    if (variables && var_count > 0) {
+        // Check if any variables have NULL addresses
+        for (int i = 0; i < var_count; i++) {
+            if (variables[i].address == NULL) {
+                needs_synthetic = 1;
+                break;
+            }
+        }
+
+        if (needs_synthetic) {
+            // Create copy with synthetic addresses
+            vars_copy = malloc(var_count * sizeof(me_variable));
+            if (!vars_copy) {
+                if (error) *error = -1;
+                return NULL;
+            }
+
+            for (int i = 0; i < var_count; i++) {
+                vars_copy[i] = variables[i];
+                if (vars_copy[i].address == NULL) {
+                    // Use address in synthetic array (each index is unique)
+                    vars_copy[i].address = &synthetic_var_addresses[i];
+                }
+            }
+
+            me_expr *result = me_compile(expression, vars_copy, var_count, NULL, 0, dtype, error);
+            free(vars_copy);
+            return result;
+        }
+    }
+
+    // No NULL addresses, use variables as-is
+    return me_compile(expression, variables, var_count, NULL, 0, dtype, error);
+}
+
 static void pn(const me_expr *n, int depth) {
     int i, arity;
     printf("%*s", depth, "");
diff --git a/src/blosc2/miniexpr.h b/src/blosc2/miniexpr.h
index e8022974..a499b01d 100644
--- a/src/blosc2/miniexpr.h
+++ b/src/blosc2/miniexpr.h
@@ -46,6 +46,9 @@ extern "C" {
 
 /* Data type enumeration - Full C99 support */
 typedef enum {
+    /* Automatic type inference */
+    ME_AUTO,
+
     /* Boolean */
     ME_BOOL,
 
@@ -105,20 +108,65 @@ enum {
 
 typedef struct me_variable {
     const char *name;
-    const void *address;
-    int type;
-    void *context;
-    me_dtype dtype; // Data type of this variable
+    me_dtype dtype;      // Data type of this variable (ME_AUTO = use output dtype)
+    const void *address; // Pointer to data (NULL for me_compile_chunk)
+    int type;            // ME_VARIABLE for user variables (0 = auto-set to ME_VARIABLE)
+    void *context;       // For closures/functions (NULL for normal variables)
 } me_variable;
 
+/* Note: When initializing variables, only name/dtype/address are typically needed.
+ * Unspecified fields default to 0/NULL, which is correct for normal use:
+ *   {"varname"}                          → defaults all fields
+ *   {"varname", ME_FLOAT64}              → for me_compile_chunk with mixed types
+ *   {"varname", ME_FLOAT64, var_array}   → for me_compile with address
+ * Advanced users can specify type for closures/functions if needed.
+ */
+
 
 /* Parses the input expression and binds variables. */
 /* Returns NULL on error. */
-/* dtype parameter is ignored - result type is inferred from variable types */
-/* The actual result type is returned in n->dtype */
+/*
+ * The dtype parameter controls variable type handling:
+ *   - If dtype is ME_AUTO: All variables must have explicit dtypes (not ME_AUTO).
+ *                          Output dtype is inferred from the expression.
+ *   - If dtype is specified: All variables must be ME_AUTO.
+ *                            Both variables and output use this dtype.
+ * The actual result type is available in expr->dtype after compilation.
+ */
 me_expr *me_compile(const char *expression, const me_variable *variables, int var_count,
                     void *output, int nitems, me_dtype dtype, int *error);
 
+/* Compile expression for chunked evaluation.
+ * This variant is optimized for use with me_eval_chunk() and me_eval_chunk_threadsafe(),
+ * where variable and output pointers are provided later during evaluation.
+ *
+ * Parameters:
+ *   expression: The expression string to compile
+ *   variables: Array of variable definitions. Only the 'name' field is required.
+ *              Variables will be matched by position (ordinal order) during me_eval_chunk().
+ *   var_count: Number of variables
+ *   dtype: Data type handling (same rules as me_compile):
+ *          - ME_AUTO: All variables must specify their dtypes, output is inferred
+ *          - Specific type: All variables must be ME_AUTO, this type is used for all
+ *   error: Optional pointer to receive error position (0 on success, >0 on error)
+ *
+ * Returns: Compiled expression ready for chunked evaluation, or NULL on error
+ *
+ * Example 1 (simple - all same type):
+ *   me_variable vars[] = {{"x"}, {"y"}};  // Both ME_AUTO
+ *   me_expr *expr = me_compile_chunk("x + y", vars, 2, ME_FLOAT64, &err);
+ *
+ * Example 2 (mixed types):
+ *   me_variable vars[] = {{"x", ME_INT32}, {"y", ME_FLOAT64}};
+ *   me_expr *expr = me_compile_chunk("x + y", vars, 2, ME_AUTO, &err);
+ *
+ *   // Later, provide data in same order as variable definitions
+ *   const void *data[] = {x_array, y_array};  // x first, y second
+ *   me_eval_chunk(expr, data, 2, output, nitems);
+ */
+me_expr *me_compile_chunk(const char *expression, const me_variable *variables,
+                          int var_count, me_dtype dtype, int *error);
+
 /* Evaluates the expression on vectors. */
 void me_eval(const me_expr *n);
 
diff --git a/src/blosc2/miniexpr_numpy.h b/src/blosc2/miniexpr_numpy.h
new file mode 100644
index 00000000..8250c812
--- /dev/null
+++ b/src/blosc2/miniexpr_numpy.h
@@ -0,0 +1,157 @@
+/*********************************************************************
+  Blosc - Blocked Shuffling and Compression Library
+
+  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
+  https://blosc.org
+  License: BSD 3-Clause (see LICENSE.txt)
+
+  NumPy Integration Utilities for MiniExpr
+
+  This file provides conversion functions between miniexpr dtypes
+  and NumPy type numbers for Python bindings.
+**********************************************************************/
+
+#ifndef MINIEXPR_NUMPY_H
+#define MINIEXPR_NUMPY_H
+
+#include <stdio.h>
+#include "miniexpr.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Convert miniexpr dtype to NumPy type number
+ *
+ * Returns the NumPy dtype.num value corresponding to a miniexpr dtype.
+ * Returns -1 for ME_AUTO (which has no NumPy equivalent).
+ *
+ * Example:
+ *   int numpy_num = me_dtype_to_numpy(ME_INT64);  // Returns 7
+ */
+static inline int me_dtype_to_numpy(me_dtype dtype) {
+    static const int numpy_type_nums[] = {
+        -1,  // ME_AUTO (0) -> No NumPy equivalent
+        0,   // ME_BOOL (1) -> NPY_BOOL
+        1,   // ME_INT8 (2) -> NPY_BYTE
+        3,   // ME_INT16 (3) -> NPY_SHORT
+        5,   // ME_INT32 (4) -> NPY_INT
+        7,   // ME_INT64 (5) -> NPY_LONGLONG
+        2,   // ME_UINT8 (6) -> NPY_UBYTE
+        4,   // ME_UINT16 (7) -> NPY_USHORT
+        6,   // ME_UINT32 (8) -> NPY_UINT
+        8,   // ME_UINT64 (9) -> NPY_ULONGLONG
+        11,  // ME_FLOAT32 (10) -> NPY_FLOAT
+        12,  // ME_FLOAT64 (11) -> NPY_DOUBLE
+        14,  // ME_COMPLEX64 (12) -> NPY_CFLOAT
+        15   // ME_COMPLEX128 (13) -> NPY_CDOUBLE
+    };
+
+    if (dtype >= 0 && dtype <= ME_COMPLEX128) {
+        return numpy_type_nums[dtype];
+    }
+    return -1;  // Invalid dtype
+}
+
+/* Get a string name for a NumPy type number (for error messages)
+ *
+ * Returns a human-readable name for common NumPy types.
+ * Returns "unknown" for unsupported types.
+ */
+static inline const char* me_numpy_type_name(int numpy_type_num) {
+    switch (numpy_type_num) {
+        case 0:  return "bool";
+        case 1:  return "int8";
+        case 2:  return "uint8";
+        case 3:  return "int16";
+        case 4:  return "uint16";
+        case 5:  return "int32";
+        case 6:  return "uint32";
+        case 7:  return "int64";
+        case 8:  return "uint64";
+        case 9:  return "float16";      // Not supported
+        case 10: return "longdouble";   // Not supported
+        case 11: return "float32";
+        case 12: return "float64";
+        case 13: return "clongdouble";  // Not supported
+        case 14: return "complex64";
+        case 15: return "complex128";
+        default: return "unknown";
+    }
+}
+
+/* Convert NumPy type number to miniexpr dtype
+ *
+ * Returns the miniexpr dtype corresponding to a NumPy dtype.num value.
+ * Returns -1 and prints an error message for unsupported NumPy types.
+ *
+ * Example:
+ *   me_dtype dtype = me_dtype_from_numpy(7);  // Returns ME_INT64
+ *   if (dtype < 0) {
+ *       // Unsupported type, error already printed
+ *       return NULL;
+ *   }
+ *
+ * Note: This function only supports the subset of NumPy types that
+ * miniexpr implements. Other types (float16, longdouble, etc.) will
+ * return -1 and print an error message to stderr.
+ */
+static inline me_dtype me_dtype_from_numpy(int numpy_type_num) {
+    switch (numpy_type_num) {
+        case 0:  return ME_BOOL;
+        case 1:  return ME_INT8;
+        case 2:  return ME_UINT8;
+        case 3:  return ME_INT16;
+        case 4:  return ME_UINT16;
+        case 5:  return ME_INT32;
+        case 6:  return ME_UINT32;
+        case 7:  return ME_INT64;
+        case 8:  return ME_UINT64;
+        case 11: return ME_FLOAT32;
+        case 12: return ME_FLOAT64;
+        case 14: return ME_COMPLEX64;
+        case 15: return ME_COMPLEX128;
+        default:
+            fprintf(stderr, "Error: Unsupported NumPy dtype.num = %d (%s)\n",
+                    numpy_type_num, me_numpy_type_name(numpy_type_num));
+            return -1;  // Return -1 to indicate error
+    }
+}
+
+/* Check if a NumPy type is supported by miniexpr
+ *
+ * Returns 1 if the NumPy type number is supported, 0 otherwise.
+ * This function does not print error messages.
+ *
+ * Example:
+ *   if (me_numpy_type_supported(numpy_dtype_num)) {
+ *       // Can use this type with miniexpr
+ *   }
+ */
+static inline int me_numpy_type_supported(int numpy_type_num) {
+    // Check directly without calling me_dtype_from_numpy to avoid error messages
+    switch (numpy_type_num) {
+        case 0:   // bool
+        case 1:   // int8
+        case 2:   // uint8
+        case 3:   // int16
+        case 4:   // uint16
+        case 5:   // int32
+        case 6:   // uint32
+        case 7:   // int64
+        case 8:   // uint64
+        case 11:  // float32
+        case 12:  // float64
+        case 14:  // complex64
+        case 15:  // complex128
+            return 1;
+        default:
+            return 0;
+    }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MINIEXPR_NUMPY_H */

From 0483cd11c1798428c9b151dc0160c7ff6f515d03 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 12 Dec 2025 08:51:22 +0100
Subject: [PATCH 003/123] Using me_compile_chunk in register prefilter

---
 src/blosc2/blosc2_ext.pyx | 55 ++++++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 21 deletions(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index c43c6681..f2015164 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -531,6 +531,7 @@ cdef extern from "b2nd.h":
 # miniexpr C API declarations
 cdef extern from "miniexpr.h":
     ctypedef enum me_dtype:
+        ME_AUTO,
         ME_BOOL
         ME_INT8
         ME_INT16
@@ -548,10 +549,10 @@ cdef extern from "miniexpr.h":
     # typedef struct me_variable
     ctypedef struct me_variable:
         const char *name
+        me_dtype dtype
         const void *address
         int type
         void *context
-        me_dtype dtype
 
     ctypedef struct me_expr:
         int type
@@ -566,10 +567,12 @@ cdef extern from "miniexpr.h":
         int ncode
         void *parameters[1]
 
+    # me_expr *me_compile(const char *expression, const me_variable *variables,
+    #                     int var_count, void *output, int nitems, me_dtype dtype,
+    #                     int *error) nogil
 
-    me_expr *me_compile(const char *expression, const me_variable *variables,
-                        int var_count, void *output, int nitems, me_dtype dtype,
-                        int *error) nogil
+    me_expr *me_compile_chunk(const char *expression, const me_variable *variables,
+                              int var_count, me_dtype dtype, int *error)
 
     void me_eval(const me_expr *n) nogil
     void me_eval_fused(const me_expr *n) nogil
@@ -2807,11 +2810,11 @@ cdef class NDArray:
     def as_ffi_ptr(self):
         return PyCapsule_New(self.array, <char *> "b2nd_array_t*", NULL)
 
-    cdef udf_udata *_fill_udf_udata(self, func_id, inputs_id):
+    cdef udf_udata *_fill_udf_udata(self, func_id, inputs):
         cdef udf_udata *udata = <udf_udata *> malloc(sizeof(udf_udata))
         udata.py_func = <char *> malloc(strlen(func_id) + 1)
         strcpy(udata.py_func, func_id)
-        udata.inputs_id = inputs_id
+        udata.inputs_id = id(inputs)
         udata.output_cdtype = np.dtype(self.dtype).num
         udata.array = self.array
         # Save these in udf_udata to avoid computing them for each block
@@ -2821,29 +2824,39 @@ cdef class NDArray:
 
         return udata
 
-    def _set_pref_expr(self, func, expression, inputs_id):
-        # Support both function objects and string identifiers
-        if isinstance(func, str):
-            func_id = func
-            # No need to register in prefilter_funcs - C API will be used directly
-        else:
-            func_id = func.__name__
-            blosc2.prefilter_funcs[func_id] = func
-        func_id = func_id.encode("utf-8") if isinstance(func_id, str) else func_id
-
-        # Set prefilter
+    def _set_pref_expr(self, func, expression, inputs):
+        # Set prefilter for miniexpr
         cdef blosc2_cparams* cparams = self.array.sc.storage.cparams
         cparams.prefilter = <blosc2_prefilter_fn> miniexpr_prefilter
 
-        cdef blosc2_prefilter_params* preparams = <blosc2_prefilter_params *> malloc(sizeof(blosc2_prefilter_params))
-        cdef udf_udata* udata = self._fill_udf_udata(func_id, inputs_id)
+        func_id = func.encode("utf-8") if isinstance(func, str) else func
+        cdef udf_udata* udata = self._fill_udf_udata(func_id, inputs)
+
+        # Get the compiled expression handle for multi-threading
+        cdef Py_ssize_t n = len(inputs)
+        cdef me_variable **variables = <me_variable **> malloc(sizeof(me_variable *) * n)
+        if variables == NULL:
+            raise MemoryError()
+        cdef me_variable *var
+        for i, (k, v) in enumerate(inputs.items()):
+            var = <me_variable *> malloc(sizeof(me_variable))  # XXX devise a way to free this
+            if var == NULL:
+                raise MemoryError()
+            var_name = k.encode("utf-8") if isinstance(k, str) else k
+            var.name = <char *> malloc(strlen(var_name) + 1)
+            strcpy(var.name, var_name)
+            var.dtype = v.dtype.num
+            variables[i] = var
+        cdef int error = 0
+        udata.miniexpr_handle = me_compile_chunk(expression, variables, n, ME_AUTO, &error)
+        if udata.miniexpr_handle == NULL:
+            raise ValueError(f"Cannot compile expression: {expression}")
 
-        # XXX Get the compiled expression handle for multi-threading
-        # udata.miniexpr_handle = me_compile(expression, inputs_id)
         # # Increment reference count to keep the expression alive across threads
         # if udata.miniexpr_handle != NULL:
         #     Py_INCREF(<object>udata.miniexpr_handle)
 
+        cdef blosc2_prefilter_params* preparams = <blosc2_prefilter_params *> malloc(sizeof(blosc2_prefilter_params))
         preparams.user_data = udata
         cparams.preparams = preparams
         _check_cparams(cparams)

From f1986cc061708f768b3a622495366e3bc8a26616 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 12 Dec 2025 10:19:08 +0100
Subject: [PATCH 004/123] Use me_dtype_from_numpy and others

---
 src/blosc2/blosc2_ext.pyx | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index f2015164..442b6fec 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -580,6 +580,10 @@ cdef extern from "miniexpr.h":
     void me_free(me_expr *n) nogil
 
 
+cdef extern from "miniexpr-numpy.h":
+    me_dtype me_dtype_from_numpy(int numpy_type_num)
+
+
 ctypedef struct user_filters_udata:
     char* py_func
     int input_cdtype
@@ -1869,8 +1873,8 @@ cdef int aux_miniexpr(udf_udata *udata, int64_t nchunk, int32_t nblock,
             # inputs_slice[key] = obj[slices]
             arr = np.empty(blockshape, dtype=obj.dtype)
             # inputs_slice[key] = obj.get_slice_numpy(arr, (start_ndim, stop_ndim))
-            # This is *slightly* faster than using get_slice_numpy; my hope is that,
-            # with multithreading enabled, this should go faster.
+            # This is *slightly* faster than using get_slice_numpy;
+            # hopefully, with multithreading enabled, this should go faster.
             ndarr = <b2nd_array_t*><uintptr_t>obj.c_array
             PyObject_GetBuffer(arr, &view, PyBUF_SIMPLE)
             with nogil:
@@ -1923,8 +1927,10 @@ cdef int aux_miniexpr(udf_udata *udata, int64_t nchunk, int32_t nblock,
             # output = numexpr_run_compiled_simple(miniexpr_handle, input_arrays, n_inputs)
             # Call miniexpr C API
             # me_eval_expr(miniexpr_handle, input_arrays, n_inputs, <void*>output, typesize)
-        finally:
-            free(input_arrays)
+            me_eval_chunk_threadsafe("", input_arrays, n_inputs, <void*>output, chunk_nitems)  # XXX remove expression
+
+            finally:
+                free(input_arrays)
     else:
         # Fallback to Python callback if C API not available
         if is_postfilter:
@@ -2845,7 +2851,7 @@ cdef class NDArray:
             var_name = k.encode("utf-8") if isinstance(k, str) else k
             var.name = <char *> malloc(strlen(var_name) + 1)
             strcpy(var.name, var_name)
-            var.dtype = v.dtype.num
+            var.dtype = me_dtype_from_numpy(v.dtype.num)
             variables[i] = var
         cdef int error = 0
         udata.miniexpr_handle = me_compile_chunk(expression, variables, n, ME_AUTO, &error)

From 8c6249d0bdc7c64223e7a6570b1f743a9fbc84ba Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 12 Dec 2025 14:29:43 +0100
Subject: [PATCH 005/123] First preliminary version that works with miniexpr
 (for simple cases)

---
 CMakeLists.txt                     |  2 +-
 bench/ndarray/expr-blocked-eval.py | 21 +++++++
 src/blosc2/blosc2_ext.pyx          | 95 ++++++++++++++++--------------
 src/blosc2/lazyexpr.py             |  2 +-
 4 files changed, 75 insertions(+), 45 deletions(-)
 create mode 100644 bench/ndarray/expr-blocked-eval.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f9e65667..65f61065 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,7 +29,7 @@ Python_add_library(blosc2_ext MODULE blosc2_ext.c
 # We need to link against NumPy
 target_link_libraries(blosc2_ext PRIVATE Python::NumPy)
 
-# Add include directory for miniexpr.h
+# Add include directory for miniexpr.h and others
 target_include_directories(blosc2_ext PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2")
 
 if(DEFINED ENV{USE_SYSTEM_BLOSC2})
diff --git a/bench/ndarray/expr-blocked-eval.py b/bench/ndarray/expr-blocked-eval.py
new file mode 100644
index 00000000..d4d23e0e
--- /dev/null
+++ b/bench/ndarray/expr-blocked-eval.py
@@ -0,0 +1,21 @@
+from time import time
+import blosc2
+import numpy as np
+
+N = 10_000
+dtype= np.float32
+
+t0 = time()
+a = blosc2.ones((N, N), dtype=dtype)
+print(f"Time to create data: {(time() - t0) * 1000 :.4f} ms")
+t0 = time()
+b = a.copy()
+c = a.copy()
+print(f"Time to copy data: {(time() - t0) * 1000 :.4f} ms")
+
+t0 = time()
+res = ((a + b) * c).compute()
+print(f"Time to evaluate: {(time() - t0) * 1000 :.4f} ms")
+# print(res.info)
+
+np.testing.assert_allclose(res, a[:] * 2)
diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 442b6fec..5dd30446 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -576,11 +576,14 @@ cdef extern from "miniexpr.h":
 
     void me_eval(const me_expr *n) nogil
     void me_eval_fused(const me_expr *n) nogil
+    void me_eval_chunk_threadsafe(const me_expr *expr, const void ** vars_chunk,
+                                  int n_vars, void *output_chunk,
+                                  int chunk_nitems) nogil
     void me_print(const me_expr *n) nogil
     void me_free(me_expr *n) nogil
 
 
-cdef extern from "miniexpr-numpy.h":
+cdef extern from "miniexpr_numpy.h":
     me_dtype me_dtype_from_numpy(int numpy_type_num)
 
 
@@ -603,7 +606,7 @@ ctypedef struct udf_udata:
     b2nd_array_t *array
     int64_t chunks_in_array[B2ND_MAX_DIM]
     int64_t blocks_in_chunk[B2ND_MAX_DIM]
-    void* miniexpr_handle  # Cached miniexpr compiled expression handle
+    me_expr* miniexpr_handle
 
 MAX_TYPESIZE = BLOSC2_MAXTYPESIZE
 MAX_BUFFERSIZE = BLOSC2_MAX_BUFFERSIZE
@@ -1735,9 +1738,9 @@ cdef class SChunk:
         # Clean up the miniexpr handle if this is a miniexpr_prefilter
         if self.schunk.storage.cparams.prefilter == <blosc2_prefilter_fn>miniexpr_prefilter:
             udf_data = <udf_udata*>self.schunk.storage.cparams.preparams.user_data
-            if udf_data.miniexpr_handle != NULL:
-                Py_DECREF(<object>udf_data.miniexpr_handle)
             free(udf_data.py_func)
+            if udf_data.miniexpr_handle != NULL:
+                free(udf_data.miniexpr_handle)
             free(udf_data)
         else:
             # From Python the preparams->udata with always have the field py_func
@@ -1825,9 +1828,7 @@ cdef int aux_miniexpr(udf_udata *udata, int64_t nchunk, int32_t nblock,
     cdef np.npy_intp dims[B2ND_MAX_DIM]
     cdef b2nd_array_t* ndarr
     cdef int rc
-    cdef void* miniexpr_handle
     cdef int n_inputs
-    cdef void** input_arrays
     cdef int64_t start[B2ND_MAX_DIM]
     cdef int64_t slice_shape[B2ND_MAX_DIM]
     cdef int64_t blockshape_int64[B2ND_MAX_DIM]
@@ -1867,6 +1868,7 @@ cdef int aux_miniexpr(udf_udata *udata, int64_t nchunk, int32_t nblock,
         stop_ndim[i] = start_ndim[i] + dims[i]
         l.append(slice(start_ndim[i], stop_ndim[i]))
     slices = tuple(l)
+    cdef int nelems_block
     #print("slices ->", slices)
     for key, obj in inputs_dict.items():
         if isinstance(obj, NDArray):
@@ -1878,14 +1880,17 @@ cdef int aux_miniexpr(udf_udata *udata, int64_t nchunk, int32_t nblock,
             ndarr = <b2nd_array_t*><uintptr_t>obj.c_array
             PyObject_GetBuffer(arr, &view, PyBUF_SIMPLE)
             with nogil:
+                nelems_block = 1
                 for i in range(udata.array.ndim):
                     buffershape_[i] = stop_ndim[i] - start_ndim[i]
+                    nelems_block *= buffershape_[i]
 
                 rc = b2nd_get_slice_cbuffer(ndarr, start_ndim, stop_ndim,
                                                  <void *> view.buf,
                                                  buffershape_, view.len)
             _check_rc(rc, "Error while getting the buffer")
             PyBuffer_Release(&view)
+            print(f"nelems_block -> {nelems_block}")
             inputs_slice[key] = arr
 
         elif isinstance(obj, np.ndarray | blosc2.C2Array):
@@ -1894,7 +1899,7 @@ cdef int aux_miniexpr(udf_udata *udata, int64_t nchunk, int32_t nblock,
             inputs_slice[key] = obj
         else:
             raise ValueError("Unsupported operand")
-    #print("inputs_slice ->", inputs_slice)
+    print("inputs_slice ->", inputs_slice)
 
     # Call miniexpr C API directly
     func_id = udata.py_func.decode("utf-8")
@@ -1902,41 +1907,37 @@ cdef int aux_miniexpr(udf_udata *udata, int64_t nchunk, int32_t nblock,
     cdef int linear_offset = sum(start_ndim) * typesize + nblock * udata.array.sc.blocksize
 
     # Use miniexpr C API for faster evaluation
-    # Use the cached handle from udata (set during _set_pref_expr)
+    # Use the expression handle from udata (set during _set_pref_expr)
     # This allows multi-threading since all threads share the same handle
-    miniexpr_handle = udata.miniexpr_handle
-    if miniexpr_handle != NULL:
-        # Get the variable names order from the compiled expression
-        compiled_ex = <object>miniexpr_handle
-        input_names = compiled_ex.input_names  # tuple of variable names in order
-
-        # Build list of input arrays in the correct order
-        n_inputs = len(input_names)
-        input_list = []
+    cdef me_expr* miniexpr_handle = udata.miniexpr_handle
+    if miniexpr_handle == NULL:
+        raise ValueError("miniexpr handle not assigned")
+    # Get the variable names order from the compiled expression
+    input_names = list(inputs_dict)  # list of variable names. XXX Check order.
+    print(f"input_names -> {input_names}")
+
+    # Build list of input arrays in the correct order
+    n_inputs = len(input_names)
+    input_list = []
+    for i in range(n_inputs):
+        var_name = input_names[i]
+        input_list.append(inputs_slice[var_name])
+
+    # Convert to array of void pointers
+    cdef void** input_arrays = <void**>malloc(n_inputs * sizeof(void*))
+    cdef np.ndarray nparr
+    try:
         for i in range(n_inputs):
-            var_name = input_names[i]
-            input_list.append(inputs_slice[var_name])
-
-        # Convert to array of void pointers (PyObject*)
-        input_arrays = <void**>malloc(n_inputs * sizeof(void*))
-        try:
-            for i in range(n_inputs):
-                input_arrays[i] = <void*>input_list[i]
+            nparr = <np.ndarray> input_list[i]
+            if not nparr.flags['C_CONTIGUOUS']:
+                raise ValueError("All input arrays must be C-contiguous")
+            input_arrays[i] = <const void*>nparr.data
 
-            # XXX Call numexpr C API  XXXX
-            # output = numexpr_run_compiled_simple(miniexpr_handle, input_arrays, n_inputs)
-            # Call miniexpr C API
-            # me_eval_expr(miniexpr_handle, input_arrays, n_inputs, <void*>output, typesize)
-            me_eval_chunk_threadsafe("", input_arrays, n_inputs, <void*>output, chunk_nitems)  # XXX remove expression
+        # Call miniexpr C API
+        me_eval_chunk_threadsafe(miniexpr_handle, input_arrays, n_inputs, <void*>params_output, nelems_block)
 
-            finally:
-                free(input_arrays)
-    else:
-        # Fallback to Python callback if C API not available
-        if is_postfilter:
-            output = blosc2.postfilter_funcs[func_id](inputs_slice)
-        else:
-            output = blosc2.prefilter_funcs[func_id](inputs_slice)
+    finally:
+        free(input_arrays)
 
     return 0
 
@@ -2840,24 +2841,32 @@ cdef class NDArray:
 
         # Get the compiled expression handle for multi-threading
         cdef Py_ssize_t n = len(inputs)
-        cdef me_variable **variables = <me_variable **> malloc(sizeof(me_variable *) * n)
+        cdef me_variable* variables = <me_variable *> malloc(sizeof(me_variable) * n)
         if variables == NULL:
             raise MemoryError()
         cdef me_variable *var
+        print(f"variables: {inputs.keys()}")
         for i, (k, v) in enumerate(inputs.items()):
-            var = <me_variable *> malloc(sizeof(me_variable))  # XXX devise a way to free this
-            if var == NULL:
-                raise MemoryError()
+            var = &variables[i]
             var_name = k.encode("utf-8") if isinstance(k, str) else k
             var.name = <char *> malloc(strlen(var_name) + 1)
             strcpy(var.name, var_name)
             var.dtype = me_dtype_from_numpy(v.dtype.num)
-            variables[i] = var
+            var.address = NULL  # chunked compile: addresses provided later
+            var.type = 0  # auto-set to ME_VARIABLE inside compiler
+            var.context = NULL
+
         cdef int error = 0
+        expression = expression.encode("utf-8") if isinstance(expression, str) else expression
         udata.miniexpr_handle = me_compile_chunk(expression, variables, n, ME_AUTO, &error)
         if udata.miniexpr_handle == NULL:
             raise ValueError(f"Cannot compile expression: {expression}")
 
+        # Free resources
+        for i in range(len(inputs)):
+            free(variables[i].name)
+        free(variables)
+
         # # Increment reference count to keep the expression alive across threads
         # if udata.miniexpr_handle != NULL:
         #     Py_INCREF(<object>udata.miniexpr_handle)
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index a0974c96..7ad2ded5 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1278,7 +1278,7 @@ def fast_eval(  # noqa: C901
         # func_name = "numexpr_last_compiled"
         # res_eval._set_pref_expr(func_name, id(operands))
         func_name = "miniexpr"
-        res_eval._set_pref_expr(func_name, expression, id(operands))
+        res_eval._set_pref_expr(func_name, expression, operands)
 
         # This line would NOT allocate physical RAM on any modern OS:
         aux = np.empty(res_eval.shape, res_eval.dtype)

From cd0c4da6fd8b14e26c8723f8c8dd8acfdf19413e Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sat, 13 Dec 2025 06:31:35 +0100
Subject: [PATCH 006/123] miniexpr prefilter doesn't hold the gil anymore

---
 src/blosc2/blosc2_ext.pyx | 185 +++++++++++++-------------------------
 src/blosc2/lazyexpr.py    |  12 +--
 2 files changed, 67 insertions(+), 130 deletions(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 5dd30446..abe96af3 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -519,13 +519,13 @@ cdef extern from "b2nd.h":
                                       int64_t *buffershape, int64_t buffersize)
     int b2nd_from_schunk(blosc2_schunk *schunk, b2nd_array_t **array)
 
-    void blosc2_unidim_to_multidim(uint8_t ndim, int64_t *shape, int64_t i, int64_t *index)
+    void blosc2_unidim_to_multidim(uint8_t ndim, int64_t *shape, int64_t i, int64_t *index) nogil
     int b2nd_copy_buffer2(int8_t ndim,
                           int32_t itemsize,
                           const void *src, const int64_t *src_pad_shape,
                           const int64_t *src_start, const int64_t *src_stop,
                           void *dst, const int64_t *dst_pad_shape,
-                          const int64_t *dst_start) nogil;
+                          const int64_t *dst_start) nogil
 
 
 # miniexpr C API declarations
@@ -606,6 +606,13 @@ ctypedef struct udf_udata:
     b2nd_array_t *array
     int64_t chunks_in_array[B2ND_MAX_DIM]
     int64_t blocks_in_chunk[B2ND_MAX_DIM]
+
+ctypedef struct me_udata:
+    b2nd_array_t** inputs
+    int ninputs
+    b2nd_array_t *array
+    int64_t chunks_in_array[B2ND_MAX_DIM]
+    int64_t blocks_in_chunk[B2ND_MAX_DIM]
     me_expr* miniexpr_handle
 
 MAX_TYPESIZE = BLOSC2_MAXTYPESIZE
@@ -1737,11 +1744,11 @@ cdef class SChunk:
 
         # Clean up the miniexpr handle if this is a miniexpr_prefilter
         if self.schunk.storage.cparams.prefilter == <blosc2_prefilter_fn>miniexpr_prefilter:
-            udf_data = <udf_udata*>self.schunk.storage.cparams.preparams.user_data
-            free(udf_data.py_func)
-            if udf_data.miniexpr_handle != NULL:
-                free(udf_data.miniexpr_handle)
-            free(udf_data)
+            me_data = <me_udata*>self.schunk.storage.cparams.preparams.user_data
+            free(me_data.inputs)
+            if me_data.miniexpr_handle != NULL:  # XXX do we really need the conditional?
+                free(me_data.miniexpr_handle)
+            free(me_data)
         else:
             # From Python the preparams->udata with always have the field py_func
             udata = <user_filters_udata*>self.schunk.storage.cparams.preparams.user_data
@@ -1815,96 +1822,39 @@ cdef int general_filler(blosc2_prefilter_params *params):
     return 0
 
 
-# Aux function for prefilter and postfilter for last expression
-cdef int aux_miniexpr(udf_udata *udata, int64_t nchunk, int32_t nblock,
-                      c_bool is_postfilter, uint8_t *params_output, int32_t typesize):
+# Auxiliary function for just miniexpr as a prefilter
+# Only meant for (input and output) arrays that:
+# 1) Are blosc2.NDArray objects
+# 2) Do not have padding
+cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
+                      c_bool is_postfilter, uint8_t *params_output, int32_t typesize) nogil:
     # Declare all C variables at the beginning
     cdef int64_t chunk_ndim[B2ND_MAX_DIM]
     cdef int64_t block_ndim[B2ND_MAX_DIM]
-    cdef Py_buffer view
     cdef int64_t start_ndim[B2ND_MAX_DIM]
     cdef int64_t stop_ndim[B2ND_MAX_DIM]
-    cdef int64_t[B2ND_MAX_DIM] buffershape_
-    cdef np.npy_intp dims[B2ND_MAX_DIM]
-    cdef b2nd_array_t* ndarr
-    cdef int rc
-    cdef int n_inputs
-    cdef int64_t start[B2ND_MAX_DIM]
-    cdef int64_t slice_shape[B2ND_MAX_DIM]
-    cdef int64_t blockshape_int64[B2ND_MAX_DIM]
-    cdef Py_buffer buf
+    cdef int64_t buffershape[B2ND_MAX_DIM]
 
+    # Get the right slice for each operand
     blosc2_unidim_to_multidim(udata.array.ndim, udata.chunks_in_array, nchunk, chunk_ndim)
     blosc2_unidim_to_multidim(udata.array.ndim, udata.blocks_in_chunk, nblock, block_ndim)
     for i in range(udata.array.ndim):
         start_ndim[i] = chunk_ndim[i] * udata.array.chunkshape[i] + block_ndim[i] * udata.array.blockshape[i]
+        stop_ndim[i] = start_ndim[i] + udata.array.blockshape[i]
+        buffershape[i] = udata.array.blockshape[i]
 
-    padding = False
-    blockshape = []
-    for i in range(udata.array.ndim):
-        if start_ndim[i] + udata.array.blockshape[i] > udata.array.shape[i]:
-            padding = True
-            blockshape.append(udata.array.shape[i] - start_ndim[i])
-            if blockshape[i] <= 0:
-                # This block contains only padding, skip it
-                return 0
-        else:
-            blockshape.append(udata.array.blockshape[i])
-    for i in range(udata.array.ndim):
-        dims[i] = blockshape[i]
-    #print("blockshape ->", blockshape)
-
-    # if padding:
-    #     output = np.empty(blockshape, udata.array.dtype)
-    # else:
-    #     output = np.PyArray_SimpleNewFromData(udata.array.ndim, dims, udata.output_cdtype, <void*>params_output)
-
-    inputs_dict = _ctypes.PyObj_FromPtr(udata.inputs_id)
-    #print("inputs_dict ->", inputs_dict)
-    inputs_slice = {}
-    # Get slice of each operand
-    l = []
-    for i in range(udata.array.ndim):
-        stop_ndim[i] = start_ndim[i] + dims[i]
-        l.append(slice(start_ndim[i], stop_ndim[i]))
-    slices = tuple(l)
-    cdef int nelems_block
-    #print("slices ->", slices)
-    for key, obj in inputs_dict.items():
-        if isinstance(obj, NDArray):
-            # inputs_slice[key] = obj[slices]
-            arr = np.empty(blockshape, dtype=obj.dtype)
-            # inputs_slice[key] = obj.get_slice_numpy(arr, (start_ndim, stop_ndim))
-            # This is *slightly* faster than using get_slice_numpy;
-            # hopefully, with multithreading enabled, this should go faster.
-            ndarr = <b2nd_array_t*><uintptr_t>obj.c_array
-            PyObject_GetBuffer(arr, &view, PyBUF_SIMPLE)
-            with nogil:
-                nelems_block = 1
-                for i in range(udata.array.ndim):
-                    buffershape_[i] = stop_ndim[i] - start_ndim[i]
-                    nelems_block *= buffershape_[i]
-
-                rc = b2nd_get_slice_cbuffer(ndarr, start_ndim, stop_ndim,
-                                                 <void *> view.buf,
-                                                 buffershape_, view.len)
-            _check_rc(rc, "Error while getting the buffer")
-            PyBuffer_Release(&view)
-            print(f"nelems_block -> {nelems_block}")
-            inputs_slice[key] = arr
-
-        elif isinstance(obj, np.ndarray | blosc2.C2Array):
-            inputs_slice[key] = obj[slices]
-        elif np.isscalar(obj):
-            inputs_slice[key] = obj
-        else:
-            raise ValueError("Unsupported operand")
-    print("inputs_slice ->", inputs_slice)
-
-    # Call miniexpr C API directly
-    func_id = udata.py_func.decode("utf-8")
-    offset = tuple(start_ndim[i] for i in range(udata.array.ndim))
-    cdef int linear_offset = sum(start_ndim) * typesize + nblock * udata.array.sc.blocksize
+    cdef b2nd_array_t* ndarr
+    cdef int rc
+    cdef void** input_buffers = <void**> malloc(udata.ninputs * sizeof(uint8_t*))
+    for i in range(udata.ninputs):
+        ndarr = udata.inputs[i]
+        input_buffers[i] = malloc(ndarr.sc.blocksize)
+        rc = b2nd_get_slice_cbuffer(
+                ndarr, start_ndim, stop_ndim, input_buffers[i],
+                buffershape, ndarr.sc.blocksize)
+        if rc < 0:
+            return rc
+    #print("nelems in block:", ndarr.blocknitems)
 
     # Use miniexpr C API for faster evaluation
     # Use the expression handle from udata (set during _set_pref_expr)
@@ -1912,32 +1862,12 @@ cdef int aux_miniexpr(udf_udata *udata, int64_t nchunk, int32_t nblock,
     cdef me_expr* miniexpr_handle = udata.miniexpr_handle
     if miniexpr_handle == NULL:
         raise ValueError("miniexpr handle not assigned")
-    # Get the variable names order from the compiled expression
-    input_names = list(inputs_dict)  # list of variable names. XXX Check order.
-    print(f"input_names -> {input_names}")
-
-    # Build list of input arrays in the correct order
-    n_inputs = len(input_names)
-    input_list = []
-    for i in range(n_inputs):
-        var_name = input_names[i]
-        input_list.append(inputs_slice[var_name])
-
-    # Convert to array of void pointers
-    cdef void** input_arrays = <void**>malloc(n_inputs * sizeof(void*))
-    cdef np.ndarray nparr
-    try:
-        for i in range(n_inputs):
-            nparr = <np.ndarray> input_list[i]
-            if not nparr.flags['C_CONTIGUOUS']:
-                raise ValueError("All input arrays must be C-contiguous")
-            input_arrays[i] = <const void*>nparr.data
-
-        # Call miniexpr C API
-        me_eval_chunk_threadsafe(miniexpr_handle, input_arrays, n_inputs, <void*>params_output, nelems_block)
-
-    finally:
-        free(input_arrays)
+    # Call miniexpr C API
+    me_eval_chunk_threadsafe(miniexpr_handle, input_buffers, udata.ninputs, <void*>params_output, ndarr.blocknitems)
+
+    for i in range(udata.ninputs):
+        free(input_buffers[i])
+    free(input_buffers)
 
     return 0
 
@@ -2016,8 +1946,8 @@ cdef int aux_udf(udf_udata *udata, int64_t nchunk, int32_t nblock,
 
 
 cdef int miniexpr_prefilter(blosc2_prefilter_params *params):
-    cdef udf_udata *udata = <udf_udata *> params.user_data
-    return aux_miniexpr(udata, params.nchunk, params.nblock, False, params.output, params.output_typesize)
+    return aux_miniexpr(<me_udata *> params.user_data, params.nchunk, params.nblock, False,
+                        params.output, params.output_typesize)
 
 
 cdef int general_udf_prefilter(blosc2_prefilter_params *params):
@@ -2831,13 +2761,30 @@ cdef class NDArray:
 
         return udata
 
-    def _set_pref_expr(self, func, expression, inputs):
+    cdef me_udata *_fill_me_udata(self, inputs):
+        cdef me_udata *udata = <me_udata *> malloc(sizeof(me_udata))
+        operands = list(inputs.values())
+        ninputs = len(operands)
+        cdef b2nd_array_t** inputs_ = <b2nd_array_t**> malloc(ninputs * sizeof(b2nd_array_t*))
+        for i, operand in enumerate(operands):
+            inputs_[i] = <b2nd_array_t*><uintptr_t>operand.c_array
+        udata.inputs = inputs_
+        udata.ninputs = ninputs
+        udata.array = self.array
+        # Save these in udf_udata to avoid computing them for each block
+        for i in range(self.array.ndim):
+            udata.chunks_in_array[i] = udata.array.extshape[i] // udata.array.chunkshape[i]
+            udata.blocks_in_chunk[i] = udata.array.extchunkshape[i] // udata.array.blockshape[i]
+
+        return udata
+
+    def _set_pref_expr(self, expression, inputs):
         # Set prefilter for miniexpr
         cdef blosc2_cparams* cparams = self.array.sc.storage.cparams
         cparams.prefilter = <blosc2_prefilter_fn> miniexpr_prefilter
 
-        func_id = func.encode("utf-8") if isinstance(func, str) else func
-        cdef udf_udata* udata = self._fill_udf_udata(func_id, inputs)
+        # cdef udf_udata* udata = self._fill_udf_udata(func_id, inputs)
+        cdef me_udata* udata = self._fill_me_udata(inputs)
 
         # Get the compiled expression handle for multi-threading
         cdef Py_ssize_t n = len(inputs)
@@ -2867,10 +2814,6 @@ cdef class NDArray:
             free(variables[i].name)
         free(variables)
 
-        # # Increment reference count to keep the expression alive across threads
-        # if udata.miniexpr_handle != NULL:
-        #     Py_INCREF(<object>udata.miniexpr_handle)
-
         cdef blosc2_prefilter_params* preparams = <blosc2_prefilter_params *> malloc(sizeof(blosc2_prefilter_params))
         preparams.user_data = udata
         cparams.preparams = preparams
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 7ad2ded5..4d6696b4 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1270,21 +1270,15 @@ def fast_eval(  # noqa: C901
             prev_nthreads = cparams.nthreads
             cparams.nthreads = 1
         res_eval = blosc2.empty(shape, dtype, cparams=cparams, **kwargs)
-        # Validate expression so that it will be cached in numexpr
+        # XXX Validate expression before using it
         # numexpr.validate(expression, local_dict=operands)
-        # Register a prefilter for last expression using C API
-        # We use a placeholder function name since the actual evaluation
-        # is done directly via numexpr C API in blosc2_ext.pyx
-        # func_name = "numexpr_last_compiled"
-        # res_eval._set_pref_expr(func_name, id(operands))
-        func_name = "miniexpr"
-        res_eval._set_pref_expr(func_name, expression, operands)
+        res_eval._set_pref_expr(expression, operands)
 
         # This line would NOT allocate physical RAM on any modern OS:
         aux = np.empty(res_eval.shape, res_eval.dtype)
         # Physical allocation happens here (when writing):
         res_eval[...] = aux
-        res_eval.schunk.remove_prefilter(func_name)
+        res_eval.schunk.remove_prefilter("miniexpr")
         if cparams.nthreads > 1:
             res_eval.schunk.cparams.nthreads = prev_nthreads
 

From 42189d9ffe1588fa54a644cb5d7a025ed751d59e Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sat, 13 Dec 2025 09:41:57 +0100
Subject: [PATCH 007/123] miniexpr prefilter finally works in multithread mode!

---
 CMakeLists.txt                     |  2 ++
 bench/ndarray/expr-blocked-eval.py |  5 +--
 src/blosc2/blosc2_ext.pyx          | 57 ++++++++++++++++++++++--------
 src/blosc2/lazyexpr.py             | 10 +++---
 4 files changed, 52 insertions(+), 22 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 65f61065..6ac7da37 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,6 +55,8 @@ else()
     FetchContent_Declare(blosc2
         GIT_REPOSITORY https://github.com/Blosc/c-blosc2
         GIT_TAG 5a2b0ed9c4d801230c118fbc5811817055b5a3f5  # v2.22.0
+        # in case you want to use a local copy of c-blosc2 for development, uncomment the line below
+        # SOURCE_DIR "/Users/faltet/blosc/c-blosc2"
     )
     FetchContent_MakeAvailable(blosc2)
     include_directories("${blosc2_SOURCE_DIR}/include")
diff --git a/bench/ndarray/expr-blocked-eval.py b/bench/ndarray/expr-blocked-eval.py
index d4d23e0e..10ab3701 100644
--- a/bench/ndarray/expr-blocked-eval.py
+++ b/bench/ndarray/expr-blocked-eval.py
@@ -6,7 +6,8 @@
 dtype= np.float32
 
 t0 = time()
-a = blosc2.ones((N, N), dtype=dtype)
+#a = blosc2.ones((N, N), dtype=dtype)
+a = blosc2.arange(np.prod((N, N)), shape=(N, N), dtype=dtype)
 print(f"Time to create data: {(time() - t0) * 1000 :.4f} ms")
 t0 = time()
 b = a.copy()
@@ -18,4 +19,4 @@
 print(f"Time to evaluate: {(time() - t0) * 1000 :.4f} ms")
 # print(res.info)
 
-np.testing.assert_allclose(res, a[:] * 2)
+np.testing.assert_allclose(res, (a[:] + b[:]) * c[:])
diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index abe96af3..8bc92798 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -27,7 +27,7 @@ from cpython.ref cimport Py_INCREF, Py_DECREF
 from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_New
 from cython.operator cimport dereference
 from libc.stdint cimport uintptr_t
-from libc.stdlib cimport free, malloc, realloc
+from libc.stdlib cimport free, malloc, realloc, calloc
 from libc.stdlib cimport abs as c_abs
 from libc.string cimport memcpy, strcpy, strdup, strlen
 from libcpp cimport bool as c_bool
@@ -180,7 +180,7 @@ cdef extern from "blosc2.h":
     int blosc2_free_resources()
 
     int blosc2_cbuffer_sizes(const void* cbuffer, int32_t* nbytes,
-                             int32_t* cbytes, int32_t* blocksize)
+                             int32_t* cbytes, int32_t* blocksize) nogil
 
     int blosc1_cbuffer_validate(const void* cbuffer, size_t cbytes, size_t* nbytes)
 
@@ -258,7 +258,7 @@ cdef extern from "blosc2.h":
 
     blosc2_context* blosc2_create_cctx(blosc2_cparams cparams) nogil
 
-    blosc2_context* blosc2_create_dctx(blosc2_dparams dparams)
+    blosc2_context* blosc2_create_dctx(blosc2_dparams dparams) nogil
 
     void blosc2_free_ctx(blosc2_context * context) nogil
 
@@ -281,7 +281,7 @@ cdef extern from "blosc2.h":
 
     int blosc2_getitem_ctx(blosc2_context* context, const void* src,
                            int32_t srcsize, int start, int nitems, void* dest,
-                           int32_t destsize)
+                           int32_t destsize) nogil
 
 
 
@@ -1846,28 +1846,55 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
     cdef b2nd_array_t* ndarr
     cdef int rc
     cdef void** input_buffers = <void**> malloc(udata.ninputs * sizeof(uint8_t*))
+    cdef float *buf
+    cdef void* src
+    cdef int32_t chunk_nbytes, chunk_cbytes, block_nbytes
+    cdef int start
+    cdef blosc2_context** input_dctxs = <blosc2_context**> calloc(udata.ninputs, sizeof(blosc2_context*))
+    cdef blosc2_context* dctx
     for i in range(udata.ninputs):
         ndarr = udata.inputs[i]
         input_buffers[i] = malloc(ndarr.sc.blocksize)
-        rc = b2nd_get_slice_cbuffer(
-                ndarr, start_ndim, stop_ndim, input_buffers[i],
-                buffershape, ndarr.sc.blocksize)
-        if rc < 0:
-            return rc
-    #print("nelems in block:", ndarr.blocknitems)
+        # A way to check for top speed
+        if False:
+            buf = <float *>input_buffers[i]
+            for j in range(ndarr.blocknitems):
+                buf[j] = 1.
+        else:
+            src = ndarr.sc.data[nchunk]
+            rc = blosc2_cbuffer_sizes(src, &chunk_nbytes, &chunk_cbytes, &block_nbytes)
+            if rc < 0:
+                raise ValueError("miniexpr: error getting cbuffer sizes")
+            start = nblock * ndarr.blocknitems
+            # A way to check for top speed
+            if False:
+                # Unsafe, but it works for special arrays (e.g. blosc2.ones), and can be fast
+                dctx = ndarr.sc.dctx
+            else:
+                # This can add a significant overhead, but it is needed for thread safety.
+                # Perhaps one can create a specific (serial) context just for blosc2_getitem_ctx?
+                input_dctxs[i] = blosc2_create_dctx(BLOSC2_DPARAMS_DEFAULTS)
+                dctx = input_dctxs[i]
+            rc = blosc2_getitem_ctx(dctx, src, chunk_cbytes, start, ndarr.blocknitems,
+                                    input_buffers[i], block_nbytes)
+            if rc < 0:
+                raise ValueError("miniexpr: error decompressing the chunk")
 
-    # Use miniexpr C API for faster evaluation
-    # Use the expression handle from udata (set during _set_pref_expr)
-    # This allows multi-threading since all threads share the same handle
     cdef me_expr* miniexpr_handle = udata.miniexpr_handle
     if miniexpr_handle == NULL:
-        raise ValueError("miniexpr handle not assigned")
-    # Call miniexpr C API
+        raise ValueError("miniexpr: handle not assigned")
+    # Call thread-safe miniexpr C API
+    # XXX Add error checking inside the function?
     me_eval_chunk_threadsafe(miniexpr_handle, input_buffers, udata.ninputs, <void*>params_output, ndarr.blocknitems)
 
+    # Free resources
     for i in range(udata.ninputs):
         free(input_buffers[i])
+        if input_dctxs[i] != NULL:
+            # When doing profiling (see above code), this can be NULL
+            blosc2_free_ctx(input_dctxs[i])
     free(input_buffers)
+    free(input_dctxs)
 
     return 0
 
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 4d6696b4..e5d701b3 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1266,9 +1266,9 @@ def fast_eval(  # noqa: C901
         # Force single-threaded execution for prefilter evaluation
         # The prefilter callback accesses Python objects which aren't thread-safe
         # across blosc2's C threads. numexpr does its own multi-threading internally.
-        if cparams.nthreads > 1:
-            prev_nthreads = cparams.nthreads
-            cparams.nthreads = 1
+        # if cparams.nthreads > 1:
+        #     prev_nthreads = cparams.nthreads
+        #     cparams.nthreads = 1
         res_eval = blosc2.empty(shape, dtype, cparams=cparams, **kwargs)
         # XXX Validate expression before using it
         # numexpr.validate(expression, local_dict=operands)
@@ -1279,8 +1279,8 @@ def fast_eval(  # noqa: C901
         # Physical allocation happens here (when writing):
         res_eval[...] = aux
         res_eval.schunk.remove_prefilter("miniexpr")
-        if cparams.nthreads > 1:
-            res_eval.schunk.cparams.nthreads = prev_nthreads
+        # if cparams.nthreads > 1:
+        #     res_eval.schunk.cparams.nthreads = prev_nthreads
 
         return res_eval
 

From a2a725942cd5060023bf78da7de05c9f0eac87c6 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sat, 13 Dec 2025 10:00:00 +0100
Subject: [PATCH 008/123] Fix a compilation error on linux

---
 src/blosc2/blosc2_ext.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 8bc92798..2bcb933c 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -1885,7 +1885,8 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
         raise ValueError("miniexpr: handle not assigned")
     # Call thread-safe miniexpr C API
     # XXX Add error checking inside the function?
-    me_eval_chunk_threadsafe(miniexpr_handle, input_buffers, udata.ninputs, <void*>params_output, ndarr.blocknitems)
+    me_eval_chunk_threadsafe(miniexpr_handle, <const void**>input_buffers, udata.ninputs,
+                             <void*>params_output, ndarr.blocknitems)
 
     # Free resources
     for i in range(udata.ninputs):

From f599cdb5c470c0b29991d5e4fff3f446d5364f50 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sat, 13 Dec 2025 11:10:25 +0100
Subject: [PATCH 009/123] Be more conservative for blocksize in apple silicon

---
 src/blosc2/core.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/blosc2/core.py b/src/blosc2/core.py
index 4b348c79..15f58ed3 100644
--- a/src/blosc2/core.py
+++ b/src/blosc2/core.py
@@ -1547,12 +1547,14 @@ def compute_chunks_blocks(  # noqa: C901
             # min_blocksize = blosc2.cpu_info["l1_data_cache_size"] * 4
         elif platform.system() == "Darwin" and "arm" in platform.machine():
             # For Apple Silicon, experiments say we can use 4x the L1 size
-            min_blocksize = blosc2.cpu_info["l1_data_cache_size"] * 4
+            # min_blocksize = blosc2.cpu_info["l1_data_cache_size"] * 4
+            # However, let's adjust for several operands in cache, so let's use just L1
+            min_blocksize = blosc2.cpu_info["l1_data_cache_size"] * 1
         elif "l1_data_cache_size" in blosc2.cpu_info and isinstance(
             blosc2.cpu_info["l1_data_cache_size"], int
         ):
-            # For other archs, we don't have hints; be conservative and use 2x the L1 size
-            min_blocksize = blosc2.cpu_info["l1_data_cache_size"] * 2
+            # For other archs, we don't have hints; be conservative and use 1x the L1 size
+            min_blocksize = blosc2.cpu_info["l1_data_cache_size"] * 1
 
         if blocksize < min_blocksize:
             blocksize = min_blocksize

From 3bf54c956077a3638c9b5c7e270883aef8a75c41 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sat, 13 Dec 2025 11:11:32 +0100
Subject: [PATCH 010/123] Add a comparsion against numexpr

---
 bench/ndarray/expr-blocked-eval.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/bench/ndarray/expr-blocked-eval.py b/bench/ndarray/expr-blocked-eval.py
index 10ab3701..7caa2f67 100644
--- a/bench/ndarray/expr-blocked-eval.py
+++ b/bench/ndarray/expr-blocked-eval.py
@@ -1,13 +1,15 @@
 from time import time
 import blosc2
 import numpy as np
+import numexpr as ne
 
 N = 10_000
 dtype= np.float32
+cparams = blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=5)
 
 t0 = time()
 #a = blosc2.ones((N, N), dtype=dtype)
-a = blosc2.arange(np.prod((N, N)), shape=(N, N), dtype=dtype)
+a = blosc2.arange(np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
 print(f"Time to create data: {(time() - t0) * 1000 :.4f} ms")
 t0 = time()
 b = a.copy()
@@ -15,8 +17,15 @@
 print(f"Time to copy data: {(time() - t0) * 1000 :.4f} ms")
 
 t0 = time()
-res = ((a + b) * c).compute()
+res = ((a + b) * c).compute(cparams=cparams)
 print(f"Time to evaluate: {(time() - t0) * 1000 :.4f} ms")
 # print(res.info)
 
-np.testing.assert_allclose(res, (a[:] + b[:]) * c[:])
+na = a[:]
+nb = b[:]
+nc = c[:]
+np.testing.assert_allclose(res, (na + nb) * nc)
+
+t0 = time()
+res = ne.evaluate("(na + nb) * nc")
+print(f"Time to evaluate with NumExpr: {(time() - t0) * 1000 :.4f} ms")

From ef9dc80dde9af005faa635fff22d7b99a41e9ce2 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 17 Dec 2025 13:11:32 +0100
Subject: [PATCH 011/123] Update to latest miniexpr sources

---
 src/blosc2/blosc2_ext.pyx |  23 +-
 src/blosc2/miniexpr.c     | 819 ++++++++++----------------------------
 src/blosc2/miniexpr.h     | 100 ++---
 3 files changed, 244 insertions(+), 698 deletions(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 2bcb933c..5ff21fca 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -567,18 +567,11 @@ cdef extern from "miniexpr.h":
         int ncode
         void *parameters[1]
 
-    # me_expr *me_compile(const char *expression, const me_variable *variables,
-    #                     int var_count, void *output, int nitems, me_dtype dtype,
-    #                     int *error) nogil
-
-    me_expr *me_compile_chunk(const char *expression, const me_variable *variables,
-                              int var_count, me_dtype dtype, int *error)
-
-    void me_eval(const me_expr *n) nogil
-    void me_eval_fused(const me_expr *n) nogil
-    void me_eval_chunk_threadsafe(const me_expr *expr, const void ** vars_chunk,
-                                  int n_vars, void *output_chunk,
-                                  int chunk_nitems) nogil
+    me_expr *me_compile(const char *expression, const me_variable *variables,
+                        int var_count, me_dtype dtype, int *error)
+
+    void me_eval(const me_expr *expr, const void ** vars_chunk,
+                 int n_vars, void *output_chunk, int chunk_nitems) nogil
     void me_print(const me_expr *n) nogil
     void me_free(me_expr *n) nogil
 
@@ -1885,8 +1878,8 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
         raise ValueError("miniexpr: handle not assigned")
     # Call thread-safe miniexpr C API
     # XXX Add error checking inside the function?
-    me_eval_chunk_threadsafe(miniexpr_handle, <const void**>input_buffers, udata.ninputs,
-                             <void*>params_output, ndarr.blocknitems)
+    me_eval(miniexpr_handle, <const void**>input_buffers, udata.ninputs,
+            <void*>params_output, ndarr.blocknitems)
 
     # Free resources
     for i in range(udata.ninputs):
@@ -2833,7 +2826,7 @@ cdef class NDArray:
 
         cdef int error = 0
         expression = expression.encode("utf-8") if isinstance(expression, str) else expression
-        udata.miniexpr_handle = me_compile_chunk(expression, variables, n, ME_AUTO, &error)
+        udata.miniexpr_handle = me_compile(expression, variables, n, ME_AUTO, &error)
         if udata.miniexpr_handle == NULL:
             raise ValueError(f"Cannot compile expression: {expression}")
 
diff --git a/src/blosc2/miniexpr.c b/src/blosc2/miniexpr.c
index 63ce1e7b..25760364 100755
--- a/src/blosc2/miniexpr.c
+++ b/src/blosc2/miniexpr.c
@@ -75,6 +75,27 @@ enum {
     TOK_BITWISE, TOK_SHIFT, TOK_COMPARE, TOK_POW
 };
 
+/* Internal definition of me_expr (opaque to users) */
+struct me_expr {
+    int type;
+
+    union {
+        double value;
+        const void *bound;
+        const void *function;
+    };
+
+    /* Vector operation info */
+    void *output; // Generic pointer (can be float* or double*)
+    int nitems;
+    me_dtype dtype; // Data type for this expression (result type after promotion)
+    me_dtype input_dtype; // Original input type (for variables/constants)
+    /* Bytecode info (for fused evaluation) */
+    void *bytecode; // Pointer to compiled bytecode
+    int ncode; // Number of instructions
+    void *parameters[1]; // Must be last (flexible array member)
+};
+
 
 /* Type promotion table following NumPy rules */
 /* Note: ME_AUTO (0) should never appear in type promotion, so we index from 1 */
@@ -136,7 +157,7 @@ static const me_dtype type_promotion_table[13][13] = {
 };
 
 /* Promote two types according to NumPy rules */
-static me_dtype promome_types(me_dtype a, me_dtype b) {
+static me_dtype promote_types(me_dtype a, me_dtype b) {
     // ME_AUTO should have been resolved during compilation
     if (a == ME_AUTO || b == ME_AUTO) {
         fprintf(stderr, "FATAL: ME_AUTO in type promotion (a=%d, b=%d). This is a bug.\n", a, b);
@@ -245,7 +266,7 @@ static me_dtype infer_result_type(const me_expr *n) {
 
             for (int i = 0; i < arity; i++) {
                 me_dtype param_type = infer_result_type((const me_expr *) n->parameters[i]);
-                result = promome_types(result, param_type);
+                result = promote_types(result, param_type);
             }
 
             return result;
@@ -256,7 +277,7 @@ static me_dtype infer_result_type(const me_expr *n) {
 }
 
 /* Apply type promotion to a binary operation node */
-static me_expr *creame_conversion_node(me_expr *source, me_dtype target_dtype) {
+static me_expr *create_conversion_node(me_expr *source, me_dtype target_dtype) {
     /* Create a unary conversion node that converts source to target_dtype */
     me_expr *conv = NEW_EXPR(ME_FUNCTION1 | ME_FLAG_PURE, source);
     if (conv) {
@@ -276,7 +297,7 @@ static void apply_type_promotion(me_expr *node) {
     if (left && right) {
         me_dtype left_type = left->dtype;
         me_dtype right_type = right->dtype;
-        me_dtype promoted = promome_types(left_type, right_type);
+        me_dtype promoted = promote_types(left_type, right_type);
 
         // Store the promoted output type
         node->dtype = promoted;
@@ -533,150 +554,179 @@ static double logical_or(double a, double b) { return ((int) a) || ((int) b) ? 1
 static double logical_not(double a) { return !(int) a ? 1.0 : 0.0; }
 static double logical_xor(double a, double b) { return ((int) a) != ((int) b) ? 1.0 : 0.0; }
 
+static bool is_identifier_start(char c) {
+    return isalpha((unsigned char) c) || c == '_';
+}
+
+static bool is_identifier_char(char c) {
+    return isalnum((unsigned char) c) || c == '_';
+}
+
+static void skip_whitespace(state *s) {
+    while (*s->next && isspace((unsigned char) *s->next)) {
+        s->next++;
+    }
+}
+
+static void read_number_token(state *s) {
+    s->value = strtod(s->next, (char **) &s->next);
+    s->type = TOK_NUMBER;
+}
+
+static void read_identifier_token(state *s) {
+    const char *start = s->next;
+    while (is_identifier_char(*s->next)) {
+        s->next++;
+    }
+
+    const me_variable *var = find_lookup(s, start, s->next - start);
+    if (!var) {
+        var = find_builtin(start, s->next - start);
+    }
+
+    if (!var) {
+        s->type = TOK_ERROR;
+        return;
+    }
+
+    switch (TYPE_MASK(var->type)) {
+        case ME_VARIABLE:
+            s->type = TOK_VARIABLE;
+            s->bound = var->address;
+            s->dtype = var->dtype;
+            break;
+
+        case ME_CLOSURE0:
+        case ME_CLOSURE1:
+        case ME_CLOSURE2:
+        case ME_CLOSURE3:
+        case ME_CLOSURE4:
+        case ME_CLOSURE5:
+        case ME_CLOSURE6:
+        case ME_CLOSURE7:
+            s->context = var->context;
+        /* Falls through. */
+        case ME_FUNCTION0:
+        case ME_FUNCTION1:
+        case ME_FUNCTION2:
+        case ME_FUNCTION3:
+        case ME_FUNCTION4:
+        case ME_FUNCTION5:
+        case ME_FUNCTION6:
+        case ME_FUNCTION7:
+            s->type = var->type;
+            s->function = var->address;
+            break;
+    }
+}
+
+typedef struct {
+    const char *literal;
+    int token_type;
+    me_fun2 function;
+} operator_spec;
+
+static bool handle_multi_char_operator(state *s) {
+    static const operator_spec multi_ops[] = {
+        {"**", TOK_POW, pow},
+        {"<<", TOK_SHIFT, bit_shl},
+        {">>", TOK_SHIFT, bit_shr},
+        {"==", TOK_COMPARE, cmp_eq},
+        {"!=", TOK_COMPARE, cmp_ne},
+        {"<=", TOK_COMPARE, cmp_le},
+        {">=", TOK_COMPARE, cmp_ge},
+    };
+
+    for (size_t i = 0; i < sizeof(multi_ops) / sizeof(multi_ops[0]); i++) {
+        const operator_spec *op = &multi_ops[i];
+        size_t len = strlen(op->literal);
+        if (strncmp(s->next, op->literal, len) == 0) {
+            s->type = op->token_type;
+            s->function = op->function;
+            s->next += len;
+            return true;
+        }
+    }
+    return false;
+}
+
+static void handle_single_char_operator(state *s, char c) {
+    s->next++;
+    switch (c) {
+        case '+': s->type = TOK_INFIX;
+            s->function = add;
+            break;
+        case '-': s->type = TOK_INFIX;
+            s->function = sub;
+            break;
+        case '*': s->type = TOK_INFIX;
+            s->function = mul;
+            break;
+        case '/': s->type = TOK_INFIX;
+            s->function = divide;
+            break;
+        case '%': s->type = TOK_INFIX;
+            s->function = fmod;
+            break;
+        case '&': s->type = TOK_BITWISE;
+            s->function = bit_and;
+            break;
+        case '|': s->type = TOK_BITWISE;
+            s->function = bit_or;
+            break;
+        case '^': s->type = TOK_BITWISE;
+            s->function = bit_xor;
+            break;
+        case '~': s->type = TOK_BITWISE;
+            s->function = bit_not;
+            break;
+        case '<': s->type = TOK_COMPARE;
+            s->function = cmp_lt;
+            break;
+        case '>': s->type = TOK_COMPARE;
+            s->function = cmp_gt;
+            break;
+        case '(': s->type = TOK_OPEN;
+            break;
+        case ')': s->type = TOK_CLOSE;
+            break;
+        case ',': s->type = TOK_SEP;
+            break;
+        default: s->type = TOK_ERROR;
+            break;
+    }
+}
+
+static void read_operator_token(state *s) {
+    if (handle_multi_char_operator(s)) {
+        return;
+    }
+
+    if (!*s->next) {
+        s->type = TOK_END;
+        return;
+    }
+
+    handle_single_char_operator(s, *s->next);
+}
 
 void next_token(state *s) {
     s->type = TOK_NULL;
 
     do {
+        skip_whitespace(s);
+
         if (!*s->next) {
             s->type = TOK_END;
             return;
         }
 
-        /* Try reading a number. */
         if ((s->next[0] >= '0' && s->next[0] <= '9') || s->next[0] == '.') {
-            s->value = strtod(s->next, (char **) &s->next);
-            s->type = TOK_NUMBER;
+            read_number_token(s);
+        } else if (is_identifier_start(s->next[0])) {
+            read_identifier_token(s);
         } else {
-            /* Look for a variable or builtin function call. */
-            if (isalpha(s->next[0])) {
-                const char *start;
-                start = s->next;
-                while (isalpha(s->next[0]) || isdigit(s->next[0]) || (s->next[0] == '_')) s->next++;
-
-                const me_variable *var = find_lookup(s, start, s->next - start);
-                if (!var) var = find_builtin(start, s->next - start);
-
-                if (!var) {
-                    s->type = TOK_ERROR;
-                } else {
-                    switch (TYPE_MASK(var->type)) {
-                        case ME_VARIABLE:
-                            s->type = TOK_VARIABLE;
-                            s->bound = var->address;
-                            s->dtype = var->dtype; // Store the variable's type
-                            break;
-
-                        case ME_CLOSURE0:
-                        case ME_CLOSURE1:
-                        case ME_CLOSURE2:
-                        case ME_CLOSURE3: /* Falls through. */
-                        case ME_CLOSURE4:
-                        case ME_CLOSURE5:
-                        case ME_CLOSURE6:
-                        case ME_CLOSURE7: /* Falls through. */
-                            s->context = var->context; /* Falls through. */
-
-                        case ME_FUNCTION0:
-                        case ME_FUNCTION1:
-                        case ME_FUNCTION2:
-                        case ME_FUNCTION3: /* Falls through. */
-                        case ME_FUNCTION4:
-                        case ME_FUNCTION5:
-                        case ME_FUNCTION6:
-                        case ME_FUNCTION7: /* Falls through. */
-                            s->type = var->type;
-                            s->function = var->address;
-                            break;
-                    }
-                }
-            } else {
-                /* Look for an operator or special character. */
-                char c = s->next[0];
-                char next_c = s->next[1];
-
-                /* Multi-character operators */
-                if (c == '*' && next_c == '*') {
-                    s->type = TOK_POW;
-                    s->function = (const void *) pow;
-                    s->next += 2;
-                } else if (c == '<' && next_c == '<') {
-                    s->type = TOK_SHIFT;
-                    s->function = bit_shl;
-                    s->next += 2;
-                } else if (c == '>' && next_c == '>') {
-                    s->type = TOK_SHIFT;
-                    s->function = bit_shr;
-                    s->next += 2;
-                } else if (c == '=' && next_c == '=') {
-                    s->type = TOK_COMPARE;
-                    s->function = cmp_eq;
-                    s->next += 2;
-                } else if (c == '!' && next_c == '=') {
-                    s->type = TOK_COMPARE;
-                    s->function = cmp_ne;
-                    s->next += 2;
-                } else if (c == '<' && next_c == '=') {
-                    s->type = TOK_COMPARE;
-                    s->function = cmp_le;
-                    s->next += 2;
-                } else if (c == '>' && next_c == '=') {
-                    s->type = TOK_COMPARE;
-                    s->function = cmp_ge;
-                    s->next += 2;
-                } else {
-                    /* Single-character operators */
-                    s->next++;
-                    switch (c) {
-                        case '+': s->type = TOK_INFIX;
-                            s->function = add;
-                            break;
-                        case '-': s->type = TOK_INFIX;
-                            s->function = sub;
-                            break;
-                        case '*': s->type = TOK_INFIX;
-                            s->function = mul;
-                            break;
-                        case '/': s->type = TOK_INFIX;
-                            s->function = divide;
-                            break;
-                        case '%': s->type = TOK_INFIX;
-                            s->function = fmod;
-                            break;
-                        case '&': s->type = TOK_BITWISE;
-                            s->function = bit_and;
-                            break;
-                        case '|': s->type = TOK_BITWISE;
-                            s->function = bit_or;
-                            break;
-                        case '^': s->type = TOK_BITWISE;
-                            s->function = bit_xor;
-                            break; /* XOR for ints/bools */
-                        case '~': s->type = TOK_BITWISE;
-                            s->function = bit_not;
-                            break;
-                        case '<': s->type = TOK_COMPARE;
-                            s->function = cmp_lt;
-                            break;
-                        case '>': s->type = TOK_COMPARE;
-                            s->function = cmp_gt;
-                            break;
-                        case '(': s->type = TOK_OPEN;
-                            break;
-                        case ')': s->type = TOK_CLOSE;
-                            break;
-                        case ',': s->type = TOK_SEP;
-                            break;
-                        case ' ':
-                        case '\t':
-                        case '\n':
-                        case '\r': s->type = TOK_NULL;
-                            break;
-                        default: s->type = TOK_ERROR;
-                            break;
-                    }
-                }
-            }
+            read_operator_token(s);
         }
     } while (s->type == TOK_NULL);
 }
@@ -1086,7 +1136,7 @@ static double me_eval_scalar(const me_expr *n) {
 
     switch (TYPE_MASK(n->type)) {
         case ME_CONSTANT: return n->value;
-        case ME_VARIABLE: return *n->bound;
+        case ME_VARIABLE: return *(const double *) n->bound;
 
         case ME_FUNCTION0:
         case ME_FUNCTION1:
@@ -2223,7 +2273,7 @@ static void save_variable_bindings(const me_expr *node,
 }
 
 /* Recursively promote variables in expression tree */
-static void promome_variables_in_tree(me_expr *n, me_dtype target_type,
+static void promote_variables_in_tree(me_expr *n, me_dtype target_type,
                                       promoted_var_t *promotions, int *promo_count,
                                       int nitems) {
     if (!n) return;
@@ -2276,7 +2326,7 @@ static void promome_variables_in_tree(me_expr *n, me_dtype target_type,
         case ME_CLOSURE7: {
             const int arity = ARITY(n->type);
             for (int i = 0; i < arity; i++) {
-                promome_variables_in_tree((me_expr *) n->parameters[i], target_type,
+                promote_variables_in_tree((me_expr *) n->parameters[i], target_type,
                                           promotions, promo_count, nitems);
             }
             break;
@@ -2316,8 +2366,7 @@ static void restore_variables_in_tree(me_expr *n, const void **original_bounds,
         case ME_CLOSURE7: {
             const int arity = ARITY(n->type);
             for (int i = 0; i < arity; i++) {
-                restore_variables_in_tree((me_expr *) n->parameters[i],
-                                          original_bounds, original_types, restore_idx);
+                restore_variables_in_tree((me_expr *) n->parameters[i], original_bounds, original_types, restore_idx);
             }
             break;
         }
@@ -2364,7 +2413,7 @@ static bool all_variables_match_type(const me_expr *n, me_dtype target_type) {
     return true;
 }
 
-void me_eval(const me_expr *n) {
+static void private_eval(const me_expr *n) {
     if (!n) return;
 
     // Infer the result type from the expression tree
@@ -2433,7 +2482,7 @@ void me_eval(const me_expr *n) {
     save_variable_bindings(n, original_bounds, original_types, &save_idx);
 
     // Promote variables
-    promome_variables_in_tree((me_expr *) n, result_type, promotions, &promo_count, n->nitems);
+    promote_variables_in_tree((me_expr *) n, result_type, promotions, &promo_count, n->nitems);
 
     // Update expression type
     me_dtype saved_dtype = n->dtype;
@@ -2723,57 +2772,6 @@ static void update_variable_bindings(me_expr *node, const void **new_bounds, int
 }
 
 /* Evaluate compiled expression with new variable and output pointers */
-void me_eval_chunk(const me_expr *expr, const void **vars_chunk, int n_vars,
-                   void *output_chunk, int chunk_nitems) {
-    if (!expr) return;
-
-    // Save original variable pointers (unique list)
-    const void *original_var_pointers[100];
-    int actual_var_count = 0;
-    save_variable_pointers(expr, original_var_pointers, &actual_var_count);
-
-    // Verify variable count matches
-    if (actual_var_count != n_vars) {
-        // Mismatch in variable count
-        return;
-    }
-
-    // Save original state
-    int original_nitems_array[100];
-    void *original_output = expr->output;
-
-    // Save original nitems for all nodes
-    int nitems_idx = 0;
-    save_nitems_in_tree(expr, original_nitems_array, &nitems_idx);
-
-    // Free intermediate buffers so they can be reallocated with correct size
-    free_intermediate_buffers((me_expr *) expr);
-
-    // Update variable bindings to new chunk pointers (by matching old pointers)
-    update_vars_by_pointer((me_expr *) expr, original_var_pointers, vars_chunk, n_vars);
-
-    // Update nitems throughout the tree
-    int update_idx = 0; // dummy variable
-    update_variable_bindings((me_expr *) expr, NULL, &update_idx, chunk_nitems);
-
-    // Update output pointer
-    ((me_expr *) expr)->output = output_chunk;
-
-    // Evaluate with new pointers
-    me_eval(expr);
-
-    // Restore original variable bindings
-    update_vars_by_pointer((me_expr *) expr, vars_chunk, original_var_pointers, n_vars);
-
-    // Restore output
-    ((me_expr *) expr)->output = original_output;
-
-    // Restore nitems for all nodes
-    nitems_idx = 0;
-    restore_nitems_in_tree((me_expr *) expr, original_nitems_array, &nitems_idx);
-}
-
-/* Clone an expression tree (deep copy of structure, shallow copy of data) */
 static me_expr *clone_expr(const me_expr *src) {
     if (!src) return NULL;
 
@@ -2814,8 +2812,8 @@ static me_expr *clone_expr(const me_expr *src) {
  * This function is safe to call from multiple threads simultaneously,
  * even on the same expression object. Each call creates a temporary
  * clone of the expression tree to avoid race conditions. */
-void me_eval_chunk_threadsafe(const me_expr *expr, const void **vars_chunk,
-                              int n_vars, void *output_chunk, int chunk_nitems) {
+void me_eval(const me_expr *expr, const void **vars_chunk,
+             int n_vars, void *output_chunk, int chunk_nitems) {
     if (!expr) return;
 
     // Verify variable count matches
@@ -2842,7 +2840,7 @@ void me_eval_chunk_threadsafe(const me_expr *expr, const void **vars_chunk,
     clone->output = output_chunk;
 
     // Evaluate the clone
-    me_eval(clone);
+    private_eval(clone);
 
     // Free the clone (including any intermediate buffers it allocated)
     me_free(clone);
@@ -2876,8 +2874,8 @@ static void optimize(me_expr *n) {
 }
 
 
-me_expr *me_compile(const char *expression, const me_variable *variables, int var_count,
-                    void *output, int nitems, me_dtype dtype, int *error) {
+static me_expr *private_compile(const char *expression, const me_variable *variables, int var_count,
+                                void *output, int nitems, me_dtype dtype, int *error) {
     // Validate dtype usage: either all vars are ME_AUTO (use dtype), or dtype is ME_AUTO (use var dtypes)
     if (variables && var_count > 0) {
         int auto_count = 0;
@@ -2895,7 +2893,9 @@ me_expr *me_compile(const char *expression, const me_variable *variables, int va
         if (dtype == ME_AUTO) {
             // Mode 1: Output dtype is ME_AUTO, all variables must have explicit dtypes
             if (auto_count > 0) {
-                fprintf(stderr, "Error: When output dtype is ME_AUTO, all variable dtypes must be specified (not ME_AUTO)\n");
+                fprintf(
+                    stderr,
+                    "Error: When output dtype is ME_AUTO, all variable dtypes must be specified (not ME_AUTO)\n");
                 if (error) *error = -1;
                 return NULL;
             }
@@ -2970,8 +2970,8 @@ me_expr *me_compile(const char *expression, const me_variable *variables, int va
 // Synthetic addresses for ordinal matching (when user provides NULL addresses)
 static char synthetic_var_addresses[100];
 
-me_expr *me_compile_chunk(const char *expression, const me_variable *variables,
-                          int var_count, me_dtype dtype, int *error) {
+me_expr *me_compile(const char *expression, const me_variable *variables,
+                    int var_count, me_dtype dtype, int *error) {
     // For chunked evaluation, we compile without specific output/nitems
     // If variables have NULL addresses, assign synthetic unique addresses for ordinal matching
     me_variable *vars_copy = NULL;
@@ -3002,14 +3002,14 @@ me_expr *me_compile_chunk(const char *expression, const me_variable *variables,
                 }
             }
 
-            me_expr *result = me_compile(expression, vars_copy, var_count, NULL, 0, dtype, error);
+            me_expr *result = private_compile(expression, vars_copy, var_count, NULL, 0, dtype, error);
             free(vars_copy);
             return result;
         }
     }
 
     // No NULL addresses, use variables as-is
-    return me_compile(expression, variables, var_count, NULL, 0, dtype, error);
+    return private_compile(expression, variables, var_count, NULL, 0, dtype, error);
 }
 
 static void pn(const me_expr *n, int depth) {
@@ -3056,417 +3056,10 @@ static void pn(const me_expr *n, int depth) {
     }
 }
 
-
 void me_print(const me_expr *n) {
     pn(n, 0);
 }
 
-
-/* ============================================================================
- * BYTECODE COMPILER AND FUSED EXECUTOR
- * ============================================================================
- * This implements expression flattening for optimal performance.
- * The bytecode is type-agnostic and enables loop fusion.
- */
-
-typedef enum {
-    BC_LOAD_VAR, // Load from variable array: reg[dst] = vars[src1][i]
-    BC_LOAD_CONST, // Load constant: reg[dst] = constant
-    BC_ADD, // reg[dst] = reg[src1] + reg[src2]
-    BC_SUB, // reg[dst] = reg[src1] - reg[src2]
-    BC_MUL, // reg[dst] = reg[src1] * reg[src2]
-    BC_DIV, // reg[dst] = reg[src1] / reg[src2]
-    BC_POW, // reg[dst] = pow(reg[src1], reg[src2])
-    BC_NEG, // reg[dst] = -reg[src1]
-    BC_SQRT, // reg[dst] = sqrt(reg[src1])
-    BC_SIN, // reg[dst] = sin(reg[src1])
-    BC_COS, // reg[dst] = cos(reg[src1])
-    BC_EXP, // reg[dst] = exp(reg[src1])
-    BC_LOG, // reg[dst] = log(reg[src1])
-    BC_ABS, // reg[dst] = fabs(reg[src1])
-    BC_CALL1, // reg[dst] = function(reg[src1])
-    BC_CALL2, // reg[dst] = function(reg[src1], reg[src2])
-    BC_STORE, // output[i] = reg[src1]
-    BC_CONVERT // Type conversion: reg[dst] = convert(reg[src1])
-} bc_opcode;
-
-typedef struct {
-    bc_opcode op;
-    int src1; // First source register/variable index
-    int src2; // Second source register (for binary ops)
-    int dst; // Destination register
-    union {
-        double constant; // For BC_LOAD_CONST
-        const void *function; // For BC_CALL1/BC_CALL2
-        struct {
-            me_dtype from_type;
-            me_dtype to_type;
-        } convert; // For BC_CONVERT
-    } data;
-} bc_instruction;
-
-typedef struct {
-    bc_instruction *code;
-    int capacity;
-    int count;
-    int next_reg; // Next available register
-    const double **var_ptrs; // Array of variable pointers
-    int var_count; // Number of variables
-    int var_capacity;
-} bc_compiler;
-
-static bc_compiler *bc_new() {
-    bc_compiler *bc = malloc(sizeof(bc_compiler));
-    bc->capacity = 16;
-    bc->code = malloc(bc->capacity * sizeof(bc_instruction));
-    bc->count = 0;
-    bc->next_reg = 0;
-    bc->var_capacity = 16;
-    bc->var_ptrs = malloc(bc->var_capacity * sizeof(double *));
-    bc->var_count = 0;
-    return bc;
-}
-
-static void bc_free(bc_compiler *bc) {
-    if (bc) {
-        free(bc->code);
-        free(bc->var_ptrs);
-        free(bc);
-    }
-}
-
-static void bc_emit(bc_compiler *bc, bc_instruction inst) {
-    if (bc->count >= bc->capacity) {
-        bc->capacity *= 2;
-        bc->code = realloc(bc->code, bc->capacity * sizeof(bc_instruction));
-    }
-    bc->code[bc->count++] = inst;
-}
-
-static int bc_alloc_reg(bc_compiler *bc) {
-    return bc->next_reg++;
-}
-
-/* Find or add variable to mapping */
-static int bc_get_var_index(bc_compiler *bc, const double *var_ptr) {
-    for (int i = 0; i < bc->var_count; i++) {
-        if (bc->var_ptrs[i] == var_ptr) {
-            return i;
-        }
-    }
-    // Add new variable
-    if (bc->var_count >= bc->var_capacity) {
-        bc->var_capacity *= 2;
-        bc->var_ptrs = realloc(bc->var_ptrs, bc->var_capacity * sizeof(double *));
-    }
-    bc->var_ptrs[bc->var_count] = var_ptr;
-    return bc->var_count++;
-}
-
-/* Compile expression tree to bytecode */
-static int bc_compile_expr(bc_compiler *bc, const me_expr *n) {
-    if (!n) return -1;
-
-    int dst_reg;
-
-    switch (TYPE_MASK(n->type)) {
-        case ME_CONSTANT:
-            dst_reg = bc_alloc_reg(bc);
-            bc_emit(bc, (bc_instruction){BC_LOAD_CONST, -1, -1, dst_reg, {.constant = n->value}});
-            return dst_reg;
-
-        case ME_VARIABLE:
-            dst_reg = bc_alloc_reg(bc); {
-                int var_idx = bc_get_var_index(bc, n->bound);
-                bc_emit(bc, (bc_instruction){BC_LOAD_VAR, var_idx, -1, dst_reg, {.constant = 0}});
-            }
-            return dst_reg;
-
-        case ME_FUNCTION0:
-        case ME_FUNCTION0 | ME_FLAG_PURE:
-            // Constants like pi(), e()
-            dst_reg = bc_alloc_reg(bc); {
-                double (*func)(void) = (double(*)(void)) n->function;
-                double val = func();
-                bc_emit(bc, (bc_instruction){BC_LOAD_CONST, -1, -1, dst_reg, {.constant = val}});
-            }
-            return dst_reg;
-
-        case ME_FUNCTION1:
-        case ME_FUNCTION1 | ME_FLAG_PURE: {
-            int src = bc_compile_expr(bc, n->parameters[0]);
-            dst_reg = bc_alloc_reg(bc);
-
-            const void *func_ptr = n->function;
-
-            // Recognize common functions
-            if (func_ptr == (void *) sqrt) {
-                bc_emit(bc, (bc_instruction){BC_SQRT, src, -1, dst_reg, {.constant = 0}});
-            } else if (func_ptr == (void *) sin) {
-                bc_emit(bc, (bc_instruction){BC_SIN, src, -1, dst_reg, {.constant = 0}});
-            } else if (func_ptr == (void *) cos) {
-                bc_emit(bc, (bc_instruction){BC_COS, src, -1, dst_reg, {.constant = 0}});
-            } else if (func_ptr == (void *) exp) {
-                bc_emit(bc, (bc_instruction){BC_EXP, src, -1, dst_reg, {.constant = 0}});
-            } else if (func_ptr == (void *) log) {
-                bc_emit(bc, (bc_instruction){BC_LOG, src, -1, dst_reg, {.constant = 0}});
-            } else if (func_ptr == (void *) fabs) {
-                bc_emit(bc, (bc_instruction){BC_ABS, src, -1, dst_reg, {.constant = 0}});
-            } else if (func_ptr == (void *) negate) {
-                bc_emit(bc, (bc_instruction){BC_NEG, src, -1, dst_reg, {.constant = 0}});
-            } else {
-                // Generic call
-                bc_emit(bc, (bc_instruction){BC_CALL1, src, -1, dst_reg, {.function = func_ptr}});
-            }
-            return dst_reg;
-        }
-
-        case ME_FUNCTION2:
-        case ME_FUNCTION2 | ME_FLAG_PURE: {
-            int src1 = bc_compile_expr(bc, n->parameters[0]);
-            int src2 = bc_compile_expr(bc, n->parameters[1]);
-            dst_reg = bc_alloc_reg(bc);
-
-            me_fun2 func = (me_fun2) n->function;
-
-            // Recognize common functions
-            if (func == add) {
-                bc_emit(bc, (bc_instruction){BC_ADD, src1, src2, dst_reg, {.constant = 0}});
-            } else if (func == sub) {
-                bc_emit(bc, (bc_instruction){BC_SUB, src1, src2, dst_reg, {.constant = 0}});
-            } else if (func == mul) {
-                bc_emit(bc, (bc_instruction){BC_MUL, src1, src2, dst_reg, {.constant = 0}});
-            } else if (func == divide) {
-                bc_emit(bc, (bc_instruction){BC_DIV, src1, src2, dst_reg, {.constant = 0}});
-            } else if (func == (me_fun2) pow) {
-                bc_emit(bc, (bc_instruction){BC_POW, src1, src2, dst_reg, {.constant = 0}});
-            } else {
-                // Generic call
-                bc_emit(bc, (bc_instruction){BC_CALL2, src1, src2, dst_reg, {.function = (void *) func}});
-            }
-            return dst_reg;
-        }
-
-        default:
-            // For more complex cases, fall back to tree evaluation
-            return -1;
-    }
-}
-
-/* Compile expression to bytecode and attach to me_expr */
-static void me_compile_bytecode(me_expr *n) {
-    if (!n) return;
-
-    bc_compiler *bc = bc_new();
-
-    // Compile expression
-    int result_reg = bc_compile_expr(bc, n);
-
-    if (result_reg >= 0) {
-        // Emit store instruction
-        bc_emit(bc, (bc_instruction){BC_STORE, result_reg, -1, 0, {.constant = 0}});
-
-        // Attach to expression
-        n->bytecode = bc->code;
-        n->ncode = bc->count;
-
-        // Free compiler but keep code and var mapping
-        free((void *) bc->var_ptrs);
-        free(bc);
-    } else {
-        // Compilation failed, clean up
-        bc_free(bc);
-        n->bytecode = NULL;
-        n->ncode = 0;
-    }
-}
-
-/* Recursive helper for building variable array */
-static void me_traverse_vars(const me_expr *node, const double **vars, int *var_count, int max_vars) {
-    if (!node || *var_count >= max_vars) return;
-
-    if (node->type == ME_VARIABLE) {
-        // Check if already in array
-        for (int i = 0; i < *var_count; i++) {
-            if (vars[i] == node->bound) return;
-        }
-        vars[*var_count] = node->bound;
-        (*var_count)++;
-        return;
-    }
-
-    if (IS_FUNCTION(node->type) || IS_CLOSURE(node->type)) {
-        int arity = ARITY(node->type);
-        for (int i = 0; i < arity; i++) {
-            me_traverse_vars(node->parameters[i], vars, var_count, max_vars);
-        }
-    }
-}
-
-/* Build variable array from expression tree */
-static int me_build_var_array(const me_expr *n, const double **vars, int max_vars) {
-    int var_count = 0;
-    me_traverse_vars(n, vars, &var_count, max_vars);
-    return var_count;
-}
-
-/* Execute bytecode with fused loop - OPTIMIZED VERSION */
-void me_eval_fused(const me_expr *n) {
-    if (!n || !n->output || n->nitems <= 0) return;
-
-    // Compile bytecode if not already done
-    if (!n->bytecode) {
-        me_compile_bytecode((me_expr *) n);
-    }
-
-    // Fall back to regular eval if compilation failed
-    if (!n->bytecode) {
-        me_eval(n);
-        return;
-    }
-
-    const bc_instruction *code = n->bytecode;
-    const int ncode = n->ncode;
-    const int nitems = n->nitems;
-
-    // Build variable array - same order as during compilation
-    const double *vars[16];
-    me_build_var_array(n, vars, 16);
-
-    // Determine max register used
-    int max_reg = 0;
-    for (int pc = 0; pc < ncode; pc++) {
-        if (code[pc].dst > max_reg) max_reg = code[pc].dst;
-        if (code[pc].src1 > max_reg && code[pc].src1 >= 0) max_reg = code[pc].src1;
-        if (code[pc].src2 > max_reg && code[pc].src2 >= 0) max_reg = code[pc].src2;
-    }
-    max_reg++; // Convert to count
-
-    // Allocate temporary arrays for registers
-    double **temps = malloc(max_reg * sizeof(double *));
-    for (int r = 0; r < max_reg; r++) {
-        temps[r] = malloc(nitems * sizeof(double));
-    }
-
-    // Execute each instruction across ALL elements (loop fusion!)
-    for (int pc = 0; pc < ncode; pc++) {
-        bc_instruction inst = code[pc];
-        int i;
-
-        switch (inst.op) {
-            case BC_LOAD_VAR:
-                // Copy variable data to temp register
-                memcpy(temps[inst.dst], vars[inst.src1], nitems * sizeof(double));
-                break;
-
-            case BC_LOAD_CONST:
-                // Broadcast constant to all elements
-#pragma GCC ivdep
-                for (i = 0; i < nitems; i++) {
-                    temps[inst.dst][i] = inst.data.constant;
-                }
-                break;
-
-            case BC_ADD:
-                vec_add(temps[inst.src1], temps[inst.src2], temps[inst.dst], nitems);
-                break;
-
-            case BC_SUB:
-                vec_sub(temps[inst.src1], temps[inst.src2], temps[inst.dst], nitems);
-                break;
-
-            case BC_MUL:
-                vec_mul(temps[inst.src1], temps[inst.src2], temps[inst.dst], nitems);
-                break;
-
-            case BC_DIV:
-                vec_div(temps[inst.src1], temps[inst.src2], temps[inst.dst], nitems);
-                break;
-
-            case BC_POW:
-                vec_pow(temps[inst.src1], temps[inst.src2], temps[inst.dst], nitems);
-                break;
-
-            case BC_NEG:
-                vec_negate(temps[inst.src1], temps[inst.dst], nitems);
-                break;
-
-            case BC_SQRT:
-                vec_sqrt(temps[inst.src1], temps[inst.dst], nitems);
-                break;
-
-            case BC_SIN:
-                vec_sin(temps[inst.src1], temps[inst.dst], nitems);
-                break;
-
-            case BC_COS:
-                vec_cos(temps[inst.src1], temps[inst.dst], nitems);
-                break;
-
-            case BC_EXP:
-#pragma GCC ivdep
-                for (i = 0; i < nitems; i++) {
-                    temps[inst.dst][i] = exp(temps[inst.src1][i]);
-                }
-                break;
-
-            case BC_LOG:
-#pragma GCC ivdep
-                for (i = 0; i < nitems; i++) {
-                    temps[inst.dst][i] = log(temps[inst.src1][i]);
-                }
-                break;
-
-            case BC_ABS:
-#pragma GCC ivdep
-                for (i = 0; i < nitems; i++) {
-                    temps[inst.dst][i] = fabs(temps[inst.src1][i]);
-                }
-                break;
-
-            case BC_CALL1: {
-                double (*func)(double) = inst.data.function;
-#pragma GCC ivdep
-                for (i = 0; i < nitems; i++) {
-                    temps[inst.dst][i] = func(temps[inst.src1][i]);
-                }
-                break;
-            }
-
-            case BC_CALL2: {
-                double (*func)(double, double) = inst.data.function;
-#pragma GCC ivdep
-                for (i = 0; i < nitems; i++) {
-                    temps[inst.dst][i] = func(temps[inst.src1][i], temps[inst.src2][i]);
-                }
-                break;
-            }
-
-            case BC_CONVERT: {
-                // Type conversion - for now, this is a placeholder
-                // Full implementation requires type-aware bytecode execution
-                convert_func_t conv_func = get_convert_func(inst.data.convert.from_type,
-                                                            inst.data.convert.to_type);
-                if (conv_func) {
-                    conv_func(temps[inst.src1], temps[inst.dst], nitems);
-                } else {
-                    // No conversion needed or unsupported
-                    memcpy(temps[inst.dst], temps[inst.src1],
-                           nitems * dtype_size(inst.data.convert.from_type));
-                }
-                break;
-            }
-
-            case BC_STORE:
-                // Copy result to output
-                memcpy(n->output, temps[inst.src1], nitems * sizeof(double));
-                break;
-        }
-    }
-
-    // Free temporary arrays
-    for (int r = 0; r < max_reg; r++) {
-        free(temps[r]);
-    }
-    free(temps);
+me_dtype me_get_dtype(const me_expr *expr) {
+    return expr ? expr->dtype : ME_AUTO;
 }
diff --git a/src/blosc2/miniexpr.h b/src/blosc2/miniexpr.h
index a499b01d..fd79d6ac 100644
--- a/src/blosc2/miniexpr.h
+++ b/src/blosc2/miniexpr.h
@@ -41,6 +41,7 @@
 #ifdef __cplusplus
 extern "C" {
 
+
 #endif
 
 
@@ -73,25 +74,8 @@ typedef enum {
     ME_COMPLEX128 /* double complex */
 } me_dtype;
 
-typedef struct me_expr {
-    int type;
-
-    union {
-        double value;
-        const double *bound;
-        const void *function;
-    };
-
-    /* Vector operation info */
-    void *output; // Generic pointer (can be float* or double*)
-    int nitems;
-    me_dtype dtype; // Data type for this expression (result type after promotion)
-    me_dtype input_dtype; // Original input type (for variables/constants)
-    /* Bytecode info (for fused evaluation) */
-    void *bytecode; // Pointer to compiled bytecode
-    int ncode; // Number of instructions
-    void *parameters[1]; // Must be last (flexible array member)
-} me_expr;
+/* Opaque type for compiled expressions */
+typedef struct me_expr me_expr;
 
 
 enum {
@@ -108,44 +92,31 @@ enum {
 
 typedef struct me_variable {
     const char *name;
-    me_dtype dtype;      // Data type of this variable (ME_AUTO = use output dtype)
-    const void *address; // Pointer to data (NULL for me_compile_chunk)
-    int type;            // ME_VARIABLE for user variables (0 = auto-set to ME_VARIABLE)
-    void *context;       // For closures/functions (NULL for normal variables)
+    me_dtype dtype; // Data type of this variable (ME_AUTO = use output dtype)
+    const void *address; // Pointer to data (NULL for me_compile)
+    int type; // ME_VARIABLE for user variables (0 = auto-set to ME_VARIABLE)
+    void *context; // For closures/functions (NULL for normal variables)
 } me_variable;
 
 /* Note: When initializing variables, only name/dtype/address are typically needed.
  * Unspecified fields default to 0/NULL, which is correct for normal use:
  *   {"varname"}                          → defaults all fields
- *   {"varname", ME_FLOAT64}              → for me_compile_chunk with mixed types
+ *   {"varname", ME_FLOAT64}              → for me_compile with mixed types
  *   {"varname", ME_FLOAT64, var_array}   → for me_compile with address
  * Advanced users can specify type for closures/functions if needed.
  */
 
 
-/* Parses the input expression and binds variables. */
-/* Returns NULL on error. */
-/*
- * The dtype parameter controls variable type handling:
- *   - If dtype is ME_AUTO: All variables must have explicit dtypes (not ME_AUTO).
- *                          Output dtype is inferred from the expression.
- *   - If dtype is specified: All variables must be ME_AUTO.
- *                            Both variables and output use this dtype.
- * The actual result type is available in expr->dtype after compilation.
- */
-me_expr *me_compile(const char *expression, const me_variable *variables, int var_count,
-                    void *output, int nitems, me_dtype dtype, int *error);
-
 /* Compile expression for chunked evaluation.
- * This variant is optimized for use with me_eval_chunk() and me_eval_chunk_threadsafe(),
+ * This function is optimized for use with me_eval(),
  * where variable and output pointers are provided later during evaluation.
  *
  * Parameters:
  *   expression: The expression string to compile
  *   variables: Array of variable definitions. Only the 'name' field is required.
- *              Variables will be matched by position (ordinal order) during me_eval_chunk().
+ *              Variables will be matched by position (ordinal order) during me_eval().
  *   var_count: Number of variables
- *   dtype: Data type handling (same rules as me_compile):
+ *   dtype: Data type handling:
  *          - ME_AUTO: All variables must specify their dtypes, output is inferred
  *          - Specific type: All variables must be ME_AUTO, this type is used for all
  *   error: Optional pointer to receive error position (0 on success, >0 on error)
@@ -154,27 +125,23 @@ me_expr *me_compile(const char *expression, const me_variable *variables, int va
  *
  * Example 1 (simple - all same type):
  *   me_variable vars[] = {{"x"}, {"y"}};  // Both ME_AUTO
- *   me_expr *expr = me_compile_chunk("x + y", vars, 2, ME_FLOAT64, &err);
+ *   me_expr *expr = me_compile("x + y", vars, 2, ME_FLOAT64, &err);
  *
  * Example 2 (mixed types):
  *   me_variable vars[] = {{"x", ME_INT32}, {"y", ME_FLOAT64}};
- *   me_expr *expr = me_compile_chunk("x + y", vars, 2, ME_AUTO, &err);
+ *   me_expr *expr = me_compile("x + y", vars, 2, ME_AUTO, &err);
  *
  *   // Later, provide data in same order as variable definitions
  *   const void *data[] = {x_array, y_array};  // x first, y second
- *   me_eval_chunk(expr, data, 2, output, nitems);
+ *   me_eval(expr, data, 2, output, nitems);
  */
-me_expr *me_compile_chunk(const char *expression, const me_variable *variables,
-                          int var_count, me_dtype dtype, int *error);
-
-/* Evaluates the expression on vectors. */
-void me_eval(const me_expr *n);
-
-/* Evaluates using fused bytecode (faster for complex expressions). */
-void me_eval_fused(const me_expr *n);
+me_expr *me_compile(const char *expression, const me_variable *variables,
+                    int var_count, me_dtype dtype, int *error);
 
-/* Evaluates compiled expression with new variable and output pointers.
- * This allows processing large arrays in chunks without recompiling.
+/* Evaluates compiled expression with variable and output pointers.
+ * This function can be safely called from multiple threads simultaneously on the
+ * same compiled expression. It creates a temporary clone of the expression tree
+ * for each call, eliminating race conditions at the cost of some memory allocation.
  *
  * Parameters:
  *   expr: Compiled expression (from me_compile)
@@ -183,31 +150,24 @@ void me_eval_fused(const me_expr *n);
  *   output_chunk: Pointer to output buffer for this chunk
  *   chunk_nitems: Number of elements in this chunk
  *
- * Note: The chunks must have the same data types as the original variables.
- * WARNING: This function is NOT thread-safe. Use me_eval_chunk_threadsafe() for
- *          concurrent evaluation from multiple threads.
+ * Use this function for both serial and parallel evaluation. It is thread-safe
+ * and can be used from multiple threads to process different chunks simultaneously.
  */
-void me_eval_chunk(const me_expr *expr, const void **vars_chunk, int n_vars,
-                   void *output_chunk, int chunk_nitems);
+void me_eval(const me_expr *expr, const void **vars_chunk,
+             int n_vars, void *output_chunk, int chunk_nitems);
 
-/* Thread-safe version of me_eval_chunk.
- * This function can be safely called from multiple threads simultaneously on the
- * same compiled expression. It creates a temporary clone of the expression tree
- * for each call, eliminating race conditions at the cost of some memory allocation.
- *
- * Use this when you need to evaluate the same expression in parallel across
- * different chunks from multiple threads.
- */
-void me_eval_chunk_threadsafe(const me_expr *expr, const void **vars_chunk,
-                              int n_vars, void *output_chunk, int chunk_nitems);
-
-/* Prints debugging information on the syntax tree. */
+/* Prints the expression tree for debugging purposes. */
 void me_print(const me_expr *n);
 
 /* Frees the expression. */
 /* This is safe to call on NULL pointers. */
 void me_free(me_expr *n);
 
+/* Get the result data type of a compiled expression.
+ * Returns the dtype that will be used for the output of me_eval().
+ */
+me_dtype me_get_dtype(const me_expr *expr);
+
 
 #ifdef __cplusplus
 }

From e345d9c04eee30c6f60bddd63116a9579deac6a9 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 17 Dec 2025 13:19:10 +0100
Subject: [PATCH 012/123] Fix a leak

---
 src/blosc2/blosc2_ext.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 5ff21fca..0bfa591a 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -1740,7 +1740,7 @@ cdef class SChunk:
             me_data = <me_udata*>self.schunk.storage.cparams.preparams.user_data
             free(me_data.inputs)
             if me_data.miniexpr_handle != NULL:  # XXX do we really need the conditional?
-                free(me_data.miniexpr_handle)
+                me_free(me_data.miniexpr_handle)
             free(me_data)
         else:
             # From Python the preparams->udata with always have the field py_func

From 655efeccb6d88bf79de5d518a092d5b3dce0a0dc Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 17 Dec 2025 17:04:10 +0100
Subject: [PATCH 013/123] Add a comparison against NumPy

---
 bench/ndarray/expr-blocked-eval.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/bench/ndarray/expr-blocked-eval.py b/bench/ndarray/expr-blocked-eval.py
index 7caa2f67..61219b69 100644
--- a/bench/ndarray/expr-blocked-eval.py
+++ b/bench/ndarray/expr-blocked-eval.py
@@ -5,11 +5,12 @@
 
 N = 10_000
 dtype= np.float32
-cparams = blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=5)
+cparams = blosc2.CParams(codec=blosc2.Codec.BLOSCLZ, clevel=1)
 
 t0 = time()
 #a = blosc2.ones((N, N), dtype=dtype)
 a = blosc2.arange(np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
+# a = blosc2.linspace(0., 1., np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
 print(f"Time to create data: {(time() - t0) * 1000 :.4f} ms")
 t0 = time()
 b = a.copy()
@@ -29,3 +30,7 @@
 t0 = time()
 res = ne.evaluate("(na + nb) * nc")
 print(f"Time to evaluate with NumExpr: {(time() - t0) * 1000 :.4f} ms")
+
+t0 = time()
+res = na + nb * nc
+print(f"Time to evaluate with NumPy: {(time() - t0) * 1000 :.4f} ms")

From 1d56614ddc29765dfefcaad72ed7f9a304abd60c Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 17 Dec 2025 17:54:07 +0100
Subject: [PATCH 014/123] Selectively try miniexpr, if possible

---
 src/blosc2/blosc2_ext.pyx |  2 +-
 src/blosc2/lazyexpr.py    | 38 ++++++++++++++++++++++++++------------
 src/blosc2/miniexpr.c     |  4 ++++
 3 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 0bfa591a..15387d3e 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -2828,7 +2828,7 @@ cdef class NDArray:
         expression = expression.encode("utf-8") if isinstance(expression, str) else expression
         udata.miniexpr_handle = me_compile(expression, variables, n, ME_AUTO, &error)
         if udata.miniexpr_handle == NULL:
-            raise ValueError(f"Cannot compile expression: {expression}")
+            raise NotImplementedError(f"Cannot compile expression: {expression}")
 
         # Free resources
         for i in range(len(inputs)):
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index e5d701b3..a19a4689 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -91,6 +91,9 @@
     safe_numpy_globals["matrix_transpose"] = np.transpose
     safe_numpy_globals["vecdot"] = npvecdot
 
+# Set this to False if miniexpr should not be tried out
+try_miniexpr = True
+
 
 def ne_evaluate(expression, local_dict=None, **kwargs):
     """Safely evaluate expressions using numexpr when possible, falling back to numpy."""
@@ -1216,6 +1219,8 @@ def fast_eval(  # noqa: C901
     :ref:`NDArray` or np.ndarray
         The output array.
     """
+    global try_miniexpr
+
     out = kwargs.pop("_output", None)
     ne_args: dict = kwargs.pop("_ne_args", {})
     if ne_args is None:
@@ -1261,7 +1266,12 @@ def fast_eval(  # noqa: C901
         # WebAssembly does not support threading, so we cannot use the iter_disk option
         iter_disk = False
 
-    if True:
+    # Check whether we can use miniexpr
+    for op in operands.values():
+        if not isinstance(op, blosc2.NDArray):
+            try_miniexpr = False
+
+    if try_miniexpr:
         cparams = kwargs.pop("cparams", blosc2.CParams())
         # Force single-threaded execution for prefilter evaluation
         # The prefilter callback accesses Python objects which aren't thread-safe
@@ -1272,17 +1282,21 @@ def fast_eval(  # noqa: C901
         res_eval = blosc2.empty(shape, dtype, cparams=cparams, **kwargs)
         # XXX Validate expression before using it
         # numexpr.validate(expression, local_dict=operands)
-        res_eval._set_pref_expr(expression, operands)
-
-        # This line would NOT allocate physical RAM on any modern OS:
-        aux = np.empty(res_eval.shape, res_eval.dtype)
-        # Physical allocation happens here (when writing):
-        res_eval[...] = aux
-        res_eval.schunk.remove_prefilter("miniexpr")
-        # if cparams.nthreads > 1:
-        #     res_eval.schunk.cparams.nthreads = prev_nthreads
-
-        return res_eval
+        try:
+            res_eval._set_pref_expr(expression, operands)
+            # This line would NOT allocate physical RAM on any modern OS:
+            aux = np.empty(res_eval.shape, res_eval.dtype)
+            # Physical allocation happens here (when writing):
+            res_eval[...] = aux
+            res_eval.schunk.remove_prefilter("miniexpr")
+            # if cparams.nthreads > 1:
+            #     res_eval.schunk.cparams.nthreads = prev_nthreads
+            return res_eval
+        except Exception:
+            # print(f"Error setting prefilter expression: {e}")
+            # This expression is not supported; clean up the prefilter and continue
+            # res_eval.schunk.remove_prefilter("miniexpr")  # XXX
+            pass
 
     chunk_operands = {}
     # Check which chunks intersect with _slice
diff --git a/src/blosc2/miniexpr.c b/src/blosc2/miniexpr.c
index 25760364..a116ff5e 100755
--- a/src/blosc2/miniexpr.c
+++ b/src/blosc2/miniexpr.c
@@ -457,6 +457,10 @@ static const me_variable functions[] = {
     /* Format: {name, dtype, address, type, context} */
     {"abs", 0, fabs, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"acos", 0, acos, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"arccos", 0, acos, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"arcsin", 0, asin, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"arctan", 0, atan, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"arctan2", 0, atan2, ME_FUNCTION2 | ME_FLAG_PURE, 0},
     {"asin", 0, asin, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"atan", 0, atan, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"atan2", 0, atan2, ME_FUNCTION2 | ME_FLAG_PURE, 0},

From cd2613399dce97da4473e466c4d6a9cc77841fad Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 18 Dec 2025 08:15:25 +0100
Subject: [PATCH 015/123] Create a new context for each input

---
 bench/ndarray/expr-blocked-eval.py | 2 +-
 src/blosc2/blosc2_ext.pyx          | 9 ++-------
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/bench/ndarray/expr-blocked-eval.py b/bench/ndarray/expr-blocked-eval.py
index 61219b69..e8f56a2b 100644
--- a/bench/ndarray/expr-blocked-eval.py
+++ b/bench/ndarray/expr-blocked-eval.py
@@ -8,7 +8,7 @@
 cparams = blosc2.CParams(codec=blosc2.Codec.BLOSCLZ, clevel=1)
 
 t0 = time()
-#a = blosc2.ones((N, N), dtype=dtype)
+#a = blosc2.ones((N, N), dtype=dtype, cparams=cparams)
 a = blosc2.arange(np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
 # a = blosc2.linspace(0., 1., np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
 print(f"Time to create data: {(time() - t0) * 1000 :.4f} ms")
diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 15387d3e..57ad059d 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -1843,7 +1843,6 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
     cdef void* src
     cdef int32_t chunk_nbytes, chunk_cbytes, block_nbytes
     cdef int start
-    cdef blosc2_context** input_dctxs = <blosc2_context**> calloc(udata.ninputs, sizeof(blosc2_context*))
     cdef blosc2_context* dctx
     for i in range(udata.ninputs):
         ndarr = udata.inputs[i]
@@ -1866,10 +1865,10 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
             else:
                 # This can add a significant overhead, but it is needed for thread safety.
                 # Perhaps one can create a specific (serial) context just for blosc2_getitem_ctx?
-                input_dctxs[i] = blosc2_create_dctx(BLOSC2_DPARAMS_DEFAULTS)
-                dctx = input_dctxs[i]
+                dctx = blosc2_create_dctx(BLOSC2_DPARAMS_DEFAULTS)
             rc = blosc2_getitem_ctx(dctx, src, chunk_cbytes, start, ndarr.blocknitems,
                                     input_buffers[i], block_nbytes)
+            blosc2_free_ctx(dctx)
             if rc < 0:
                 raise ValueError("miniexpr: error decompressing the chunk")
 
@@ -1884,11 +1883,7 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
     # Free resources
     for i in range(udata.ninputs):
         free(input_buffers[i])
-        if input_dctxs[i] != NULL:
-            # When doing profiling (see above code), this can be NULL
-            blosc2_free_ctx(input_dctxs[i])
     free(input_buffers)
-    free(input_dctxs)
 
     return 0
 

From b3742f87623e98953bd32197d6e41477245207d4 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sun, 21 Dec 2025 12:55:53 +0100
Subject: [PATCH 016/123] Updated to latest miniexpr (bools as result of
 comparsions)

---
 src/blosc2/miniexpr.c | 116 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 110 insertions(+), 6 deletions(-)

diff --git a/src/blosc2/miniexpr.c b/src/blosc2/miniexpr.c
index a116ff5e..1b41f6cc 100755
--- a/src/blosc2/miniexpr.c
+++ b/src/blosc2/miniexpr.c
@@ -234,7 +234,7 @@ typedef struct state {
 /* Forward declaration */
 static me_expr *new_expr(const int type, const me_expr *parameters[]);
 
-/* Infer result type from expression tree */
+/* Infer computation type from expression tree (for evaluation) */
 static me_dtype infer_result_type(const me_expr *n) {
     if (!n) return ME_FLOAT64;
 
@@ -261,6 +261,11 @@ static me_dtype infer_result_type(const me_expr *n) {
         case ME_CLOSURE5:
         case ME_CLOSURE6:
         case ME_CLOSURE7: {
+            // For comparisons with ME_BOOL output, we still need to infer the
+            // computation type from operands (e.g., float64 for float inputs).
+            // Don't return ME_BOOL early - let the operand types determine
+            // the computation type.
+
             const int arity = ARITY(n->type);
             me_dtype result = ME_BOOL;
 
@@ -276,6 +281,55 @@ static me_dtype infer_result_type(const me_expr *n) {
     return ME_FLOAT64;
 }
 
+/* Infer logical output type from expression tree (for compilation with ME_AUTO) */
+static me_dtype infer_output_type(const me_expr *n) {
+    if (!n) return ME_FLOAT64;
+
+    switch (TYPE_MASK(n->type)) {
+        case ME_CONSTANT:
+            return n->dtype;
+
+        case ME_VARIABLE:
+            return n->dtype;
+
+        case ME_FUNCTION0:
+        case ME_FUNCTION1:
+        case ME_FUNCTION2:
+        case ME_FUNCTION3:
+        case ME_FUNCTION4:
+        case ME_FUNCTION5:
+        case ME_FUNCTION6:
+        case ME_FUNCTION7:
+        case ME_CLOSURE0:
+        case ME_CLOSURE1:
+        case ME_CLOSURE2:
+        case ME_CLOSURE3:
+        case ME_CLOSURE4:
+        case ME_CLOSURE5:
+        case ME_CLOSURE6:
+        case ME_CLOSURE7: {
+            // If this node is a comparison (dtype == ME_BOOL set during parsing),
+            // the output type is ME_BOOL
+            if (n->dtype == ME_BOOL) {
+                return ME_BOOL;
+            }
+
+            // Otherwise, infer from operands
+            const int arity = ARITY(n->type);
+            me_dtype result = ME_BOOL;
+
+            for (int i = 0; i < arity; i++) {
+                me_dtype param_type = infer_output_type((const me_expr *) n->parameters[i]);
+                result = promote_types(result, param_type);
+            }
+
+            return result;
+        }
+    }
+
+    return ME_FLOAT64;
+}
+
 /* Apply type promotion to a binary operation node */
 static me_expr *create_conversion_node(me_expr *source, me_dtype target_dtype) {
     /* Create a unary conversion node that converts source to target_dtype */
@@ -1694,6 +1748,7 @@ static void vec_convert_##FROM_SUFFIX##_to_##TO_SUFFIX(const FROM_TYPE *in, TO_T
 }
 
 /* Generate all conversion functions */
+/* Conversions FROM bool TO other types */
 DEFINE_VEC_CONVERT(bool, i8, bool, int8_t)
 DEFINE_VEC_CONVERT(bool, i16, bool, int16_t)
 DEFINE_VEC_CONVERT(bool, i32, bool, int32_t)
@@ -1705,6 +1760,18 @@ DEFINE_VEC_CONVERT(bool, u64, bool, uint64_t)
 DEFINE_VEC_CONVERT(bool, f32, bool, float)
 DEFINE_VEC_CONVERT(bool, f64, bool, double)
 
+/* Conversions FROM other types TO bool */
+DEFINE_VEC_CONVERT(i8, bool, int8_t, bool)
+DEFINE_VEC_CONVERT(i16, bool, int16_t, bool)
+DEFINE_VEC_CONVERT(i32, bool, int32_t, bool)
+DEFINE_VEC_CONVERT(i64, bool, int64_t, bool)
+DEFINE_VEC_CONVERT(u8, bool, uint8_t, bool)
+DEFINE_VEC_CONVERT(u16, bool, uint16_t, bool)
+DEFINE_VEC_CONVERT(u32, bool, uint32_t, bool)
+DEFINE_VEC_CONVERT(u64, bool, uint64_t, bool)
+DEFINE_VEC_CONVERT(f32, bool, float, bool)
+DEFINE_VEC_CONVERT(f64, bool, double, bool)
+
 DEFINE_VEC_CONVERT(i8, i16, int8_t, int16_t)
 DEFINE_VEC_CONVERT(i8, i32, int8_t, int32_t)
 DEFINE_VEC_CONVERT(i8, i64, int8_t, int64_t)
@@ -1773,6 +1840,17 @@ static convert_func_t get_convert_func(me_dtype from, me_dtype to) {
     CONV_CASE(ME_BOOL, ME_FLOAT32, bool, f32)
     CONV_CASE(ME_BOOL, ME_FLOAT64, bool, f64)
 
+    CONV_CASE(ME_INT8, ME_BOOL, i8, bool)
+    CONV_CASE(ME_INT16, ME_BOOL, i16, bool)
+    CONV_CASE(ME_INT32, ME_BOOL, i32, bool)
+    CONV_CASE(ME_INT64, ME_BOOL, i64, bool)
+    CONV_CASE(ME_UINT8, ME_BOOL, u8, bool)
+    CONV_CASE(ME_UINT16, ME_BOOL, u16, bool)
+    CONV_CASE(ME_UINT32, ME_BOOL, u32, bool)
+    CONV_CASE(ME_UINT64, ME_BOOL, u64, bool)
+    CONV_CASE(ME_FLOAT32, ME_BOOL, f32, bool)
+    CONV_CASE(ME_FLOAT64, ME_BOOL, f64, bool)
+
     CONV_CASE(ME_INT8, ME_INT16, i8, i16)
     CONV_CASE(ME_INT8, ME_INT32, i8, i32)
     CONV_CASE(ME_INT8, ME_INT64, i8, i64)
@@ -2488,8 +2566,20 @@ static void private_eval(const me_expr *n) {
     // Promote variables
     promote_variables_in_tree((me_expr *) n, result_type, promotions, &promo_count, n->nitems);
 
-    // Update expression type
+    // Check if we need output type conversion (e.g., computation in float64, output in bool)
     me_dtype saved_dtype = n->dtype;
+    void *original_output = n->output;
+    void *temp_output = NULL;
+
+    if (saved_dtype != result_type) {
+        // Allocate temp buffer for computation
+        temp_output = malloc(n->nitems * dtype_size(result_type));
+        if (temp_output) {
+            ((me_expr *) n)->output = temp_output;
+        }
+    }
+
+    // Update expression type for evaluation
     ((me_expr *) n)->dtype = result_type;
 
     // Evaluate with promoted types
@@ -2537,6 +2627,17 @@ static void private_eval(const me_expr *n) {
 #endif
     }
 
+    // If we used a temp buffer, convert to final output type
+    if (temp_output) {
+        convert_func_t conv = get_convert_func(result_type, saved_dtype);
+        if (conv) {
+            conv(temp_output, original_output, n->nitems);
+        }
+        // Restore original output pointer
+        ((me_expr *) n)->output = original_output;
+        free(temp_output);
+    }
+
     // Restore original variable bindings
     int restore_idx = 0;
     restore_variables_in_tree((me_expr *) n, original_bounds, original_types, &restore_idx);
@@ -2904,9 +3005,11 @@ static me_expr *private_compile(const char *expression, const me_variable *varia
                 return NULL;
             }
         } else {
-            // Mode 2: Output dtype is specified, all variables must be ME_AUTO
-            if (specified_count > 0) {
-                fprintf(stderr, "Error: When output dtype is specified, all variable dtypes must be ME_AUTO\n");
+            // Mode 2: Output dtype is specified
+            // Two sub-modes: all ME_AUTO (homogeneous), or all explicit (heterogeneous with conversion)
+            if (auto_count > 0 && specified_count > 0) {
+                // Mixed mode not allowed
+                fprintf(stderr, "Error: Variable dtypes must be all ME_AUTO or all explicitly specified\n");
                 if (error) *error = -1;
                 return NULL;
             }
@@ -2961,8 +3064,9 @@ static me_expr *private_compile(const char *expression, const me_variable *varia
 
         // If dtype is ME_AUTO, infer from expression; otherwise use provided dtype
         if (dtype == ME_AUTO) {
-            root->dtype = infer_result_type(root);
+            root->dtype = infer_output_type(root);
         } else {
+            // User explicitly requested a dtype - use it (will cast if needed)
             root->dtype = dtype;
         }
 

From 36065c344a0b3e7f720f623d7e014075744a66e2 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sun, 21 Dec 2025 14:37:12 +0100
Subject: [PATCH 017/123] Fixed many things with miniexpr backend

---
 src/blosc2/blosc2_ext.pyx      |  6 ++--
 src/blosc2/lazyexpr.py         | 66 +++++++++++++++++++++++++++++-----
 src/blosc2/miniexpr.c          | 17 ++++++---
 tests/ndarray/test_lazyexpr.py | 12 +++----
 4 files changed, 79 insertions(+), 22 deletions(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 57ad059d..386e85e6 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -493,7 +493,7 @@ cdef extern from "b2nd.h":
     int b2nd_free(b2nd_array_t *array)
     int b2nd_get_slice_cbuffer(b2nd_array_t *array,
                                int64_t *start, int64_t *stop,
-                               void *buffer, int64_t *buffershape, int64_t buffersize) nogil
+                               void *buffer, int64_t *buffershape, int64_t buffersize)
     int b2nd_set_slice_cbuffer(void *buffer, int64_t *buffershape, int64_t buffersize,
                                int64_t *start, int64_t *stop, b2nd_array_t *array)
     int b2nd_get_slice(b2nd_context_t *ctx, b2nd_array_t **array, b2nd_array_t *src, const int64_t *start,
@@ -525,7 +525,7 @@ cdef extern from "b2nd.h":
                           const void *src, const int64_t *src_pad_shape,
                           const int64_t *src_start, const int64_t *src_stop,
                           void *dst, const int64_t *dst_pad_shape,
-                          const int64_t *dst_start) nogil
+                          const int64_t *dst_start)
 
 
 # miniexpr C API declarations
@@ -1876,7 +1876,6 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
     if miniexpr_handle == NULL:
         raise ValueError("miniexpr: handle not assigned")
     # Call thread-safe miniexpr C API
-    # XXX Add error checking inside the function?
     me_eval(miniexpr_handle, <const void**>input_buffers, udata.ninputs,
             <void*>params_output, ndarr.blocknitems)
 
@@ -2808,7 +2807,6 @@ cdef class NDArray:
         if variables == NULL:
             raise MemoryError()
         cdef me_variable *var
-        print(f"variables: {inputs.keys()}")
         for i, (k, v) in enumerate(inputs.items()):
             var = &variables[i]
             var_name = k.encode("utf-8") if isinstance(k, str) else k
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index a19a4689..43cdc438 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -12,6 +12,7 @@
 import asyncio
 import builtins
 import concurrent.futures
+import contextlib
 import copy
 import inspect
 import linecache
@@ -92,6 +93,7 @@
     safe_numpy_globals["vecdot"] = npvecdot
 
 # Set this to False if miniexpr should not be tried out
+# Disabled: miniexpr has critical bugs with scalar constants in expressions
 try_miniexpr = True
 
 
@@ -1221,6 +1223,13 @@ def fast_eval(  # noqa: C901
     """
     global try_miniexpr
 
+    # Use a local copy so we don't modify the global
+    use_miniexpr = try_miniexpr
+
+    # Disable miniexpr for UDFs (callable expressions)
+    if callable(expression):
+        use_miniexpr = False
+
     out = kwargs.pop("_output", None)
     ne_args: dict = kwargs.pop("_ne_args", {})
     if ne_args is None:
@@ -1267,11 +1276,49 @@ def fast_eval(  # noqa: C901
         iter_disk = False
 
     # Check whether we can use miniexpr
-    for op in operands.values():
-        if not isinstance(op, blosc2.NDArray):
-            try_miniexpr = False
+    # Miniexpr only supports a subset of functions - disable for unsupported ones
+    unsupported_funcs = [
+        "acosh",
+        "arctan2",  # miniexpr C library works, but Python bindings have issues
+        "arccosh",
+        "arcsinh",
+        "arctanh",
+        "asinh",
+        "atanh",
+        "clip",
+        "conj",
+        "expm1",
+        "imag",
+        "log",  # miniexpr uses log10 by default, but blosc2 expects ln
+        "log1p",
+        "log2",
+        "logaddexp",
+        "maximum",
+        "minimum",
+        "real",
+        "round",
+        "sign",
+        "square",
+        "trunc",
+        "where",
+        "contains",
+    ]
+
+    if isinstance(expression, str) and any(func in expression for func in unsupported_funcs):
+        use_miniexpr = False
 
-    if try_miniexpr:
+    if use_miniexpr:
+        for op in operands.values():
+            # Only NDArray in-memory operands
+            if not (isinstance(op, blosc2.NDArray) and op.urlpath is None and out is None):
+                use_miniexpr = False
+                break
+            # Check that partitions are well-behaved (no padding)
+            if not blosc2.are_partitions_behaved(op.shape, op.chunks, op.blocks):
+                use_miniexpr = False
+                break
+
+    if use_miniexpr:
         cparams = kwargs.pop("cparams", blosc2.CParams())
         # Force single-threaded execution for prefilter evaluation
         # The prefilter callback accesses Python objects which aren't thread-safe
@@ -1279,7 +1326,8 @@ def fast_eval(  # noqa: C901
         # if cparams.nthreads > 1:
         #     prev_nthreads = cparams.nthreads
         #     cparams.nthreads = 1
-        res_eval = blosc2.empty(shape, dtype, cparams=cparams, **kwargs)
+        # Use the same chunks/blocks as the input operands for consistency
+        res_eval = blosc2.empty(shape, dtype, chunks=chunks, blocks=blocks, cparams=cparams, **kwargs)
         # XXX Validate expression before using it
         # numexpr.validate(expression, local_dict=operands)
         try:
@@ -1291,12 +1339,14 @@ def fast_eval(  # noqa: C901
             res_eval.schunk.remove_prefilter("miniexpr")
             # if cparams.nthreads > 1:
             #     res_eval.schunk.cparams.nthreads = prev_nthreads
+            if getitem:
+                return res_eval[:]
             return res_eval
         except Exception:
-            # print(f"Error setting prefilter expression: {e}")
             # This expression is not supported; clean up the prefilter and continue
-            # res_eval.schunk.remove_prefilter("miniexpr")  # XXX
-            pass
+            with contextlib.suppress(Exception):
+                # Prefilter might not have been set yet
+                res_eval.schunk.remove_prefilter("miniexpr")
 
     chunk_operands = {}
     # Check which chunks intersect with _slice
diff --git a/src/blosc2/miniexpr.c b/src/blosc2/miniexpr.c
index 1b41f6cc..d5ebcacc 100755
--- a/src/blosc2/miniexpr.c
+++ b/src/blosc2/miniexpr.c
@@ -42,9 +42,9 @@ For a**b**c = a**(b**c) and -a**b = -(a**b) uncomment the next line.*/
 /* #define ME_POW_FROM_RIGHT */
 
 /* Logarithms
-For log = base 10 log do nothing
-For log = natural log uncomment the next line. */
-/* #define ME_NAT_LOG */
+For log = natural log do nothing (NumPy compatible)
+For log = base 10 log comment the next line. */
+#define ME_NAT_LOG
 
 #include "miniexpr.h"
 #include <stdlib.h>
@@ -3038,7 +3038,16 @@ static me_expr *private_compile(const char *expression, const me_variable *varia
     s.start = s.next = expression;
     s.lookup = vars_copy ? vars_copy : variables;
     s.lookup_len = var_count;
-    s.target_dtype = (dtype != ME_AUTO) ? dtype : ME_FLOAT64; // Set target dtype for constants
+    // When dtype is ME_AUTO, infer target dtype from variables to avoid type mismatch
+    if (dtype != ME_AUTO) {
+        s.target_dtype = dtype;
+    } else if (variables && var_count > 0) {
+        // Use the first variable's dtype as the target for constants
+        // This prevents type promotion issues when mixing float32 vars with float64 constants
+        s.target_dtype = variables[0].dtype;
+    } else {
+        s.target_dtype = ME_FLOAT64; // Fallback to double
+    }
 
     next_token(&s);
     me_expr *root = list(&s);
diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py
index 88136602..1425f2c4 100644
--- a/tests/ndarray/test_lazyexpr.py
+++ b/tests/ndarray/test_lazyexpr.py
@@ -312,9 +312,9 @@ def test_functions(function, dtype_fixture, shape_fixture):
     expr_string = f"{function}(na1)"
     res_numexpr = ne_evaluate(expr_string)
     # Compare the results
-    np.testing.assert_allclose(res_lazyexpr[:], res_numexpr)
-    np.testing.assert_allclose(expr.slice(slice(0, 10, 1)), res_numexpr[:10])  # slice test
-    np.testing.assert_allclose(expr[:10], res_numexpr[:10])  # getitem test
+    np.testing.assert_allclose(res_lazyexpr[:], res_numexpr, rtol=1e-5)
+    np.testing.assert_allclose(expr.slice(slice(0, 10, 1)), res_numexpr[:10], rtol=1e-5)  # slice test
+    np.testing.assert_allclose(expr[:10], res_numexpr[:10], rtol=1e-5)  # getitem test
 
     # For some reason real and imag are not supported by numpy's assert_allclose
     # (TypeError: bad operand type for abs(): 'LazyExpr' and segfaults are observed)
@@ -324,7 +324,7 @@ def test_functions(function, dtype_fixture, shape_fixture):
     # Using numpy functions
     expr = eval(f"np.{function}(a1)", {"a1": a1, "np": np})
     # Compare the results
-    np.testing.assert_allclose(expr[()], res_numexpr)
+    np.testing.assert_allclose(expr[()], res_numexpr, rtol=1e-5)
 
     # In combination with other operands
     na2 = np.linspace(0, 10, nelems, dtype=dtype_fixture).reshape(shape_fixture)
@@ -338,7 +338,7 @@ def test_functions(function, dtype_fixture, shape_fixture):
     expr_string = f"na1 + {function}(na2)"
     res_numexpr = ne_evaluate(expr_string)
     # Compare the results
-    np.testing.assert_allclose(res_lazyexpr[:], res_numexpr)
+    np.testing.assert_allclose(res_lazyexpr[:], res_numexpr, rtol=1e-5)
 
     # Functions of the form np.function(a1 + a2)
     expr = eval(f"np.{function}(a1 + a2)", {"a1": a1, "a2": a2, "np": np})
@@ -346,7 +346,7 @@ def test_functions(function, dtype_fixture, shape_fixture):
     expr_string = f"{function}(na1 + na2)"
     res_numexpr = ne_evaluate(expr_string)
     # Compare the results
-    np.testing.assert_allclose(expr[()], res_numexpr)
+    np.testing.assert_allclose(expr[()], res_numexpr, rtol=1e-5)
 
 
 @pytest.mark.parametrize(

From 232e485e218ee774cd4c10544934f1bfcd01d44e Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sun, 21 Dec 2025 18:49:40 +0100
Subject: [PATCH 018/123] More defensive protection when calling C funcs

---
 src/blosc2/blosc2_ext.pyx | 45 ++++++++++++++++++++++++++-------------
 src/blosc2/lazyexpr.py    | 43 +++++++++++++++++++++++++------------
 2 files changed, 59 insertions(+), 29 deletions(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 386e85e6..a00ffecd 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -1723,7 +1723,9 @@ cdef class SChunk:
         cparams.preparams = preparams
         _check_cparams(cparams)
 
-        blosc2_free_ctx(self.schunk.cctx)
+        if self.schunk.cctx != NULL:
+            # Freeing NULL context can lead to segmentation fault
+            blosc2_free_ctx(self.schunk.cctx)
         self.schunk.cctx = blosc2_create_cctx(dereference(cparams))
         if self.schunk.cctx == NULL:
             raise RuntimeError("Could not create compression context")
@@ -1737,22 +1739,31 @@ cdef class SChunk:
 
         # Clean up the miniexpr handle if this is a miniexpr_prefilter
         if self.schunk.storage.cparams.prefilter == <blosc2_prefilter_fn>miniexpr_prefilter:
-            me_data = <me_udata*>self.schunk.storage.cparams.preparams.user_data
-            free(me_data.inputs)
-            if me_data.miniexpr_handle != NULL:  # XXX do we really need the conditional?
-                me_free(me_data.miniexpr_handle)
-            free(me_data)
-        else:
+            if self.schunk.storage.cparams.preparams != NULL:
+                me_data = <me_udata*>self.schunk.storage.cparams.preparams.user_data
+                if me_data != NULL:
+                    if me_data.inputs != NULL:
+                        free(me_data.inputs)
+                    if me_data.miniexpr_handle != NULL:  # XXX do we really need the conditional?
+                        me_free(me_data.miniexpr_handle)
+                    free(me_data)
+        elif self.schunk.storage.cparams.prefilter != NULL:
             # From Python the preparams->udata with always have the field py_func
-            udata = <user_filters_udata*>self.schunk.storage.cparams.preparams.user_data
-            free(udata.py_func)
-            free(udata)
-
-        free(self.schunk.storage.cparams.preparams)
+            if self.schunk.storage.cparams.preparams != NULL:
+                udata = <user_filters_udata*>self.schunk.storage.cparams.preparams.user_data
+                if udata != NULL:
+                    if udata.py_func != NULL:
+                        free(udata.py_func)
+                    free(udata)
+
+        if self.schunk.storage.cparams.preparams != NULL:
+            free(self.schunk.storage.cparams.preparams)
         self.schunk.storage.cparams.preparams = NULL
         self.schunk.storage.cparams.prefilter = NULL
 
-        blosc2_free_ctx(self.schunk.cctx)
+        if self.schunk.cctx != NULL:
+            # Freeing NULL context can lead to segmentation fault
+            blosc2_free_ctx(self.schunk.cctx)
         if _new_ctx:
             self.schunk.cctx = blosc2_create_cctx(dereference(self.schunk.storage.cparams))
             if self.schunk.cctx == NULL:
@@ -2833,7 +2844,9 @@ cdef class NDArray:
         cparams.preparams = preparams
         _check_cparams(cparams)
 
-        blosc2_free_ctx(self.array.sc.cctx)
+        if self.array.sc.cctx != NULL:
+            # Freeing NULL context can lead to segmentation fault
+            blosc2_free_ctx(self.array.sc.cctx)
         self.array.sc.cctx = blosc2_create_cctx(dereference(cparams))
         if self.array.sc.cctx == NULL:
             raise RuntimeError("Could not create compression context")
@@ -2878,7 +2891,9 @@ cdef class NDArray:
         dparams.postparams = postparams
         _check_dparams(dparams, self.array.sc.storage.cparams)
 
-        blosc2_free_ctx(self.array.sc.dctx)
+        if self.array.sc.dctx != NULL:
+            # Freeing NULL context can lead to segmentation fault
+            blosc2_free_ctx(self.array.sc.dctx)
         self.array.sc.dctx = blosc2_create_dctx(dereference(dparams))
         if self.array.sc.dctx == NULL:
             raise RuntimeError("Could not create decompression context")
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 43cdc438..af412230 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -12,7 +12,6 @@
 import asyncio
 import builtins
 import concurrent.futures
-import contextlib
 import copy
 import inspect
 import linecache
@@ -1302,7 +1301,7 @@ def fast_eval(  # noqa: C901
         "trunc",
         "where",
         "contains",
-    ]
+    ] + reducers  # miniexpr doesn't support reduction functions
 
     if isinstance(expression, str) and any(func in expression for func in unsupported_funcs):
         use_miniexpr = False
@@ -1336,17 +1335,34 @@ def fast_eval(  # noqa: C901
             aux = np.empty(res_eval.shape, res_eval.dtype)
             # Physical allocation happens here (when writing):
             res_eval[...] = aux
+            # Verify if the output has been filled (not uninitialized memory)
+            # This is a bit of a hack, but miniexpr sometimes fails silently.
+            # We check the first element.
+            val = res_eval[0, 0, 0]
+            if np.isnan(val) or val == 0:
+                # If it's 0 or NaN, it might be uninitialized
+                # but here we used np.empty, so it's likely garbage.
+                # The value 4.4467e-319 is very specific garbage.
+                if abs(val) < 1e-300 and val != 0:
+                    use_miniexpr = False
+            elif abs(val) < 1e-300:
+                use_miniexpr = False
+        except Exception:
+            use_miniexpr = False
+        finally:
             res_eval.schunk.remove_prefilter("miniexpr")
-            # if cparams.nthreads > 1:
-            #     res_eval.schunk.cparams.nthreads = prev_nthreads
+            global iter_chunks
+            # Ensure any background reading thread is closed
+            iter_chunks = None
+
+        if not use_miniexpr:
+            # If miniexpr failed, fallback to regular evaluation
+            # (continue to the manual chunked evaluation below)
+            pass
+        else:
             if getitem:
                 return res_eval[:]
             return res_eval
-        except Exception:
-            # This expression is not supported; clean up the prefilter and continue
-            with contextlib.suppress(Exception):
-                # Prefilter might not have been set yet
-                res_eval.schunk.remove_prefilter("miniexpr")
 
     chunk_operands = {}
     # Check which chunks intersect with _slice
@@ -2216,7 +2232,7 @@ def convert_none_out(dtype, reduce_op, reduced_shape):
     return out if isinstance(out, tuple) else (out, None)
 
 
-def chunked_eval(  # noqa: C901
+def chunked_eval(
     expression: str | Callable[[tuple, np.ndarray, tuple[int]], None], operands: dict, item=(), **kwargs
 ):
     """
@@ -2310,10 +2326,9 @@ def chunked_eval(  # noqa: C901
         return slices_eval(expression, operands, getitem=getitem, _slice=item, shape=shape, **kwargs)
 
     finally:
-        # Deactivate cache for NDField instances
-        for op in operands:
-            if isinstance(operands[op], blosc2.NDField):
-                operands[op].ndarr.keep_last_read = False
+        global iter_chunks
+        # Ensure any background reading thread is closed
+        iter_chunks = None
 
 
 def fuse_operands(operands1, operands2):

From 6da99aeef97d172e74cb35b5f8a0aa3bff65bcce Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sun, 21 Dec 2025 19:06:18 +0100
Subject: [PATCH 019/123] Disable parallel mode for numba

---
 tests/ndarray/test_lazyudf.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/ndarray/test_lazyudf.py b/tests/ndarray/test_lazyudf.py
index 29b44e39..b9665c41 100644
--- a/tests/ndarray/test_lazyudf.py
+++ b/tests/ndarray/test_lazyudf.py
@@ -21,7 +21,10 @@ def udf1p(inputs_tuple, output, offset):
 if blosc2._HAS_NUMBA:
     import numba
 
-    @numba.jit(parallel=True)
+    # We should avoid parallel=True here because the fast_eval path in
+    # lazyexpr.py may use background threads for reading chunks, and
+    # having nested parallelism can lead to crashes (e.g. on macOS with Python 3.13)
+    @numba.jit(nopython=True)
     def udf1p_numba(inputs_tuple, output, offset):
         x = inputs_tuple[0]
         output[:] = x + 1

From ac430acbef0a78de41b8812078b8d400c97a0167 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Mon, 22 Dec 2025 08:07:18 +0100
Subject: [PATCH 020/123] Better examplanation on why we disable parallel

---
 tests/ndarray/test_lazyudf.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tests/ndarray/test_lazyudf.py b/tests/ndarray/test_lazyudf.py
index b9665c41..4a66a19d 100644
--- a/tests/ndarray/test_lazyudf.py
+++ b/tests/ndarray/test_lazyudf.py
@@ -21,9 +21,12 @@ def udf1p(inputs_tuple, output, offset):
 if blosc2._HAS_NUMBA:
     import numba
 
-    # We should avoid parallel=True here because the fast_eval path in
-    # lazyexpr.py may use background threads for reading chunks, and
-    # having nested parallelism can lead to crashes (e.g. on macOS with Python 3.13)
+    # We should avoid parallel=True here because makes the complete test suite crash
+    # in test_save_ludf.  I am not sure why, but it might be some interference with
+    # a previous test, leaving the threading state in a bad way.
+    # But all the examples and benchmarks seem to work with parallel=True.
+    # XXX Investigate more.
+    # @numba.jit(parallel=True)
     @numba.jit(nopython=True)
     def udf1p_numba(inputs_tuple, output, offset):
         x = inputs_tuple[0]

From c51f52886ac3f5381f7b1dfac638b84335f88347 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Tue, 23 Dec 2025 06:15:37 +0100
Subject: [PATCH 021/123] Fix many issues with type casting in miniexpr

---
 src/blosc2/blosc2_ext.pyx        |  3 +-
 src/blosc2/lazyexpr.py           | 25 +-----------
 src/blosc2/miniexpr.c            | 70 +++++++++++++++++++++++++++++++-
 tests/ndarray/test_reductions.py |  2 +-
 4 files changed, 72 insertions(+), 28 deletions(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index a00ffecd..be5a5a66 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -2830,7 +2830,8 @@ cdef class NDArray:
 
         cdef int error = 0
         expression = expression.encode("utf-8") if isinstance(expression, str) else expression
-        udata.miniexpr_handle = me_compile(expression, variables, n, ME_AUTO, &error)
+        cdef me_dtype = me_dtype_from_numpy(self.dtype.num)
+        udata.miniexpr_handle = me_compile(expression, variables, n, me_dtype, &error)
         if udata.miniexpr_handle == NULL:
             raise NotImplementedError(f"Cannot compile expression: {expression}")
 
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index af412230..fe4e8b54 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -92,7 +92,6 @@
     safe_numpy_globals["vecdot"] = npvecdot
 
 # Set this to False if miniexpr should not be tried out
-# Disabled: miniexpr has critical bugs with scalar constants in expressions
 try_miniexpr = True
 
 
@@ -1278,7 +1277,6 @@ def fast_eval(  # noqa: C901
     # Miniexpr only supports a subset of functions - disable for unsupported ones
     unsupported_funcs = [
         "acosh",
-        "arctan2",  # miniexpr C library works, but Python bindings have issues
         "arccosh",
         "arcsinh",
         "arctanh",
@@ -1288,7 +1286,6 @@ def fast_eval(  # noqa: C901
         "conj",
         "expm1",
         "imag",
-        "log",  # miniexpr uses log10 by default, but blosc2 expects ln
         "log1p",
         "log2",
         "logaddexp",
@@ -1319,34 +1316,14 @@ def fast_eval(  # noqa: C901
 
     if use_miniexpr:
         cparams = kwargs.pop("cparams", blosc2.CParams())
-        # Force single-threaded execution for prefilter evaluation
-        # The prefilter callback accesses Python objects which aren't thread-safe
-        # across blosc2's C threads. numexpr does its own multi-threading internally.
-        # if cparams.nthreads > 1:
-        #     prev_nthreads = cparams.nthreads
-        #     cparams.nthreads = 1
         # Use the same chunks/blocks as the input operands for consistency
         res_eval = blosc2.empty(shape, dtype, chunks=chunks, blocks=blocks, cparams=cparams, **kwargs)
-        # XXX Validate expression before using it
-        # numexpr.validate(expression, local_dict=operands)
         try:
             res_eval._set_pref_expr(expression, operands)
             # This line would NOT allocate physical RAM on any modern OS:
             aux = np.empty(res_eval.shape, res_eval.dtype)
             # Physical allocation happens here (when writing):
             res_eval[...] = aux
-            # Verify if the output has been filled (not uninitialized memory)
-            # This is a bit of a hack, but miniexpr sometimes fails silently.
-            # We check the first element.
-            val = res_eval[0, 0, 0]
-            if np.isnan(val) or val == 0:
-                # If it's 0 or NaN, it might be uninitialized
-                # but here we used np.empty, so it's likely garbage.
-                # The value 4.4467e-319 is very specific garbage.
-                if abs(val) < 1e-300 and val != 0:
-                    use_miniexpr = False
-            elif abs(val) < 1e-300:
-                use_miniexpr = False
         except Exception:
             use_miniexpr = False
         finally:
@@ -1361,7 +1338,7 @@ def fast_eval(  # noqa: C901
             pass
         else:
             if getitem:
-                return res_eval[:]
+                return res_eval[()]
             return res_eval
 
     chunk_operands = {}
diff --git a/src/blosc2/miniexpr.c b/src/blosc2/miniexpr.c
index d5ebcacc..a9f6205a 100755
--- a/src/blosc2/miniexpr.c
+++ b/src/blosc2/miniexpr.c
@@ -178,6 +178,18 @@ static me_dtype promote_types(me_dtype a, me_dtype b) {
     return ME_FLOAT64; // Fallback for out-of-range types
 }
 
+static bool is_integer_dtype(me_dtype dt) {
+    return dt >= ME_INT8 && dt <= ME_UINT64;
+}
+
+static bool is_float_dtype(me_dtype dt) {
+    return dt == ME_FLOAT32 || dt == ME_FLOAT64;
+}
+
+static bool is_complex_dtype(me_dtype dt) {
+    return dt == ME_COMPLEX64 || dt == ME_COMPLEX128;
+}
+
 /* Get size of a type in bytes */
 static size_t dtype_size(me_dtype dtype) {
     switch (dtype) {
@@ -627,8 +639,40 @@ static void skip_whitespace(state *s) {
 }
 
 static void read_number_token(state *s) {
+    const char *start = s->next;
     s->value = strtod(s->next, (char **) &s->next);
     s->type = TOK_NUMBER;
+
+    // Determine if it is a floating point or integer constant
+    bool is_float = false;
+    for (const char *p = start; p < s->next; p++) {
+        if (*p == '.' || *p == 'e' || *p == 'E') {
+            is_float = true;
+            break;
+        }
+    }
+
+    if (is_float) {
+        // Match NumPy conventions: float constants match target_dtype when it's a float type
+        // This ensures FLOAT32 arrays + float constants -> FLOAT32 (NumPy behavior)
+        if (s->target_dtype == ME_FLOAT32) {
+            s->dtype = ME_FLOAT32;
+        } else {
+            s->dtype = ME_FLOAT64;
+        }
+    } else {
+        // For integers, we use a heuristic
+        if (s->value > INT_MAX || s->value < INT_MIN) {
+            s->dtype = ME_INT64;
+        } else {
+            // Use target_dtype if it's an integer type, otherwise default to INT32
+            if (is_integer_dtype(s->target_dtype)) {
+                s->dtype = s->target_dtype;
+            } else {
+                s->dtype = ME_INT32;
+            }
+        }
+    }
 }
 
 static void read_identifier_token(state *s) {
@@ -818,7 +862,27 @@ static me_expr *base(state *s) {
             CHECK_NULL(ret);
 
             ret->value = s->value;
-            ret->dtype = s->target_dtype; // Use target dtype for constants
+            // Use inferred type for constants (floating point vs integer)
+            if (s->target_dtype == ME_AUTO) {
+                ret->dtype = s->dtype;
+            } else {
+                // If target_dtype is integer but constant is float/complex, we must use float/complex
+                if (is_integer_dtype(s->target_dtype)) {
+                    if (is_float_dtype(s->dtype) || is_complex_dtype(s->dtype)) {
+                        ret->dtype = s->dtype;
+                    } else if (is_integer_dtype(s->dtype) && dtype_size(s->dtype) > dtype_size(s->target_dtype)) {
+                        // Use larger integer type if needed
+                        ret->dtype = s->dtype;
+                    } else {
+                        ret->dtype = s->target_dtype;
+                    }
+                } else {
+                    // For float/complex target types, use target_dtype to match NumPy conventions
+                    // Float constants are typed based on target_dtype (FLOAT32 or FLOAT64)
+                    // This ensures FLOAT32 arrays + float constants -> FLOAT32 (NumPy behavior)
+                    ret->dtype = s->target_dtype;
+                }
+            }
             next_token(s);
             break;
 
@@ -1812,6 +1876,7 @@ DEFINE_VEC_CONVERT(u32, f64, uint32_t, double)
 DEFINE_VEC_CONVERT(u64, f64, uint64_t, double)
 
 DEFINE_VEC_CONVERT(f32, f64, float, double)
+DEFINE_VEC_CONVERT(f64, f32, double, float)
 DEFINE_VEC_CONVERT(f32, c64, float, float complex)
 DEFINE_VEC_CONVERT(f32, c128, float, double complex)
 
@@ -1891,6 +1956,7 @@ static convert_func_t get_convert_func(me_dtype from, me_dtype to) {
     CONV_CASE(ME_UINT64, ME_FLOAT64, u64, f64)
 
     CONV_CASE(ME_FLOAT32, ME_FLOAT64, f32, f64)
+    CONV_CASE(ME_FLOAT64, ME_FLOAT32, f64, f32)
     CONV_CASE(ME_FLOAT32, ME_COMPLEX64, f32, c64)
     CONV_CASE(ME_FLOAT32, ME_COMPLEX128, f32, c128)
 
@@ -3046,7 +3112,7 @@ static me_expr *private_compile(const char *expression, const me_variable *varia
         // This prevents type promotion issues when mixing float32 vars with float64 constants
         s.target_dtype = variables[0].dtype;
     } else {
-        s.target_dtype = ME_FLOAT64; // Fallback to double
+        s.target_dtype = ME_AUTO;
     }
 
     next_token(&s);
diff --git a/tests/ndarray/test_reductions.py b/tests/ndarray/test_reductions.py
index e1bbfb22..054184ee 100644
--- a/tests/ndarray/test_reductions.py
+++ b/tests/ndarray/test_reductions.py
@@ -571,4 +571,4 @@ def test_reduce_string():
     d = blosc2.lazyexpr("sl + c.sum() + a.std()", operands={"a": a, "c": c, "sl": a.slice((1, 1))})
     sum = d.compute()[()]
     npsum = npa[1, 1] + np.sum(npc) + np.std(npa)
-    assert np.allclose(sum, npsum)
+    np.testing.assert_allclose(sum, npsum)

From 4be0758cece13b3d9e64aff767ec67fa8acc6720 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Tue, 23 Dec 2025 06:42:05 +0100
Subject: [PATCH 022/123] Add inverse hyperbolic funcs

---
 src/blosc2/lazyexpr.py |  6 ------
 src/blosc2/miniexpr.c  | 10 ++++++++--
 src/blosc2/miniexpr.h  | 10 ++++++++--
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index fe4e8b54..7e25c6eb 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1276,12 +1276,6 @@ def fast_eval(  # noqa: C901
     # Check whether we can use miniexpr
     # Miniexpr only supports a subset of functions - disable for unsupported ones
     unsupported_funcs = [
-        "acosh",
-        "arccosh",
-        "arcsinh",
-        "arctanh",
-        "asinh",
-        "atanh",
         "clip",
         "conj",
         "expm1",
diff --git a/src/blosc2/miniexpr.c b/src/blosc2/miniexpr.c
index a9f6205a..57fe2f9b 100755
--- a/src/blosc2/miniexpr.c
+++ b/src/blosc2/miniexpr.c
@@ -523,13 +523,19 @@ static const me_variable functions[] = {
     /* Format: {name, dtype, address, type, context} */
     {"abs", 0, fabs, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"acos", 0, acos, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"acosh", 0, acosh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"arccos", 0, acos, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"arccosh", 0, acosh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"arcsin", 0, asin, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"arcsinh", 0, asinh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"arctan", 0, atan, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"arctan2", 0, atan2, ME_FUNCTION2 | ME_FLAG_PURE, 0},
+    {"arctanh", 0, atanh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"asin", 0, asin, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"asinh", 0, asinh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"atan", 0, atan, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"atan2", 0, atan2, ME_FUNCTION2 | ME_FLAG_PURE, 0},
+    {"atanh", 0, atanh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"ceil", 0, ceil, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"cos", 0, cos, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"cosh", 0, cosh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
@@ -1835,6 +1841,7 @@ DEFINE_VEC_CONVERT(u32, bool, uint32_t, bool)
 DEFINE_VEC_CONVERT(u64, bool, uint64_t, bool)
 DEFINE_VEC_CONVERT(f32, bool, float, bool)
 DEFINE_VEC_CONVERT(f64, bool, double, bool)
+DEFINE_VEC_CONVERT(f64, f32, double, float)
 
 DEFINE_VEC_CONVERT(i8, i16, int8_t, int16_t)
 DEFINE_VEC_CONVERT(i8, i32, int8_t, int32_t)
@@ -1876,7 +1883,6 @@ DEFINE_VEC_CONVERT(u32, f64, uint32_t, double)
 DEFINE_VEC_CONVERT(u64, f64, uint64_t, double)
 
 DEFINE_VEC_CONVERT(f32, f64, float, double)
-DEFINE_VEC_CONVERT(f64, f32, double, float)
 DEFINE_VEC_CONVERT(f32, c64, float, float complex)
 DEFINE_VEC_CONVERT(f32, c128, float, double complex)
 
@@ -1956,10 +1962,10 @@ static convert_func_t get_convert_func(me_dtype from, me_dtype to) {
     CONV_CASE(ME_UINT64, ME_FLOAT64, u64, f64)
 
     CONV_CASE(ME_FLOAT32, ME_FLOAT64, f32, f64)
-    CONV_CASE(ME_FLOAT64, ME_FLOAT32, f64, f32)
     CONV_CASE(ME_FLOAT32, ME_COMPLEX64, f32, c64)
     CONV_CASE(ME_FLOAT32, ME_COMPLEX128, f32, c128)
 
+    CONV_CASE(ME_FLOAT64, ME_FLOAT32, f64, f32)
     CONV_CASE(ME_FLOAT64, ME_COMPLEX128, f64, c128)
 
     CONV_CASE(ME_COMPLEX64, ME_COMPLEX128, c64, c128)
diff --git a/src/blosc2/miniexpr.h b/src/blosc2/miniexpr.h
index fd79d6ac..c6b73722 100644
--- a/src/blosc2/miniexpr.h
+++ b/src/blosc2/miniexpr.h
@@ -118,7 +118,8 @@ typedef struct me_variable {
  *   var_count: Number of variables
  *   dtype: Data type handling:
  *          - ME_AUTO: All variables must specify their dtypes, output is inferred
- *          - Specific type: All variables must be ME_AUTO, this type is used for all
+ *          - Specific type: Either all variables are ME_AUTO (homogeneous, all use this type),
+ *            OR all variables have explicit dtypes (heterogeneous, result cast to this type)
  *   error: Optional pointer to receive error position (0 on success, >0 on error)
  *
  * Returns: Compiled expression ready for chunked evaluation, or NULL on error
@@ -127,10 +128,15 @@ typedef struct me_variable {
  *   me_variable vars[] = {{"x"}, {"y"}};  // Both ME_AUTO
  *   me_expr *expr = me_compile("x + y", vars, 2, ME_FLOAT64, &err);
  *
- * Example 2 (mixed types):
+ * Example 2 (mixed types with ME_AUTO):
  *   me_variable vars[] = {{"x", ME_INT32}, {"y", ME_FLOAT64}};
  *   me_expr *expr = me_compile("x + y", vars, 2, ME_AUTO, &err);
  *
+ * Example 3 (mixed types with explicit output):
+ *   me_variable vars[] = {{"x", ME_INT32}, {"y", ME_FLOAT64}};
+ *   me_expr *expr = me_compile("x + y", vars, 2, ME_FLOAT32, &err);
+ *   // Variables keep their types, result is cast to FLOAT32
+ *
  *   // Later, provide data in same order as variable definitions
  *   const void *data[] = {x_array, y_array};  // x first, y second
  *   me_eval(expr, data, 2, output, nitems);

From 0abfb652c43c8b045d848af929fbe9ebe2c8d807 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Tue, 23 Dec 2025 07:14:32 +0100
Subject: [PATCH 023/123] Add conj and imag complex funcs

---
 src/blosc2/lazyexpr.py |   6 --
 src/blosc2/miniexpr.c  | 182 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 168 insertions(+), 20 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 7e25c6eb..21cc8b03 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1277,12 +1277,6 @@ def fast_eval(  # noqa: C901
     # Miniexpr only supports a subset of functions - disable for unsupported ones
     unsupported_funcs = [
         "clip",
-        "conj",
-        "expm1",
-        "imag",
-        "log1p",
-        "log2",
-        "logaddexp",
         "maximum",
         "minimum",
         "real",
diff --git a/src/blosc2/miniexpr.c b/src/blosc2/miniexpr.c
index 57fe2f9b..f7dbc115 100755
--- a/src/blosc2/miniexpr.c
+++ b/src/blosc2/miniexpr.c
@@ -243,8 +243,10 @@ typedef struct state {
 #define NEW_EXPR(type, ...) new_expr((type), (const me_expr*[]){__VA_ARGS__})
 #define CHECK_NULL(ptr, ...) if ((ptr) == NULL) { __VA_ARGS__; return NULL; }
 
-/* Forward declaration */
+/* Forward declarations */
 static me_expr *new_expr(const int type, const me_expr *parameters[]);
+static double conj_wrapper(double x);
+static double imag_wrapper(double x);
 
 /* Infer computation type from expression tree (for evaluation) */
 static me_dtype infer_result_type(const me_expr *n) {
@@ -273,6 +275,18 @@ static me_dtype infer_result_type(const me_expr *n) {
         case ME_CLOSURE5:
         case ME_CLOSURE6:
         case ME_CLOSURE7: {
+            // Special case: imag() returns real type from complex input
+            if (IS_FUNCTION(n->type) && ARITY(n->type) == 1 && n->function == (void*)imag_wrapper) {
+                me_dtype param_type = infer_result_type((const me_expr *) n->parameters[0]);
+                if (param_type == ME_COMPLEX64) {
+                    return ME_FLOAT32;
+                } else if (param_type == ME_COMPLEX128) {
+                    return ME_FLOAT64;
+                }
+                // If input is not complex, return as-is (shouldn't happen, but be safe)
+                return param_type;
+            }
+
             // For comparisons with ME_BOOL output, we still need to infer the
             // computation type from operands (e.g., float64 for float inputs).
             // Don't return ME_BOOL early - let the operand types determine
@@ -320,6 +334,18 @@ static me_dtype infer_output_type(const me_expr *n) {
         case ME_CLOSURE5:
         case ME_CLOSURE6:
         case ME_CLOSURE7: {
+            // Special case: imag() returns real type from complex input
+            if (IS_FUNCTION(n->type) && ARITY(n->type) == 1 && n->function == (void*)imag_wrapper) {
+                me_dtype param_type = infer_output_type((const me_expr *) n->parameters[0]);
+                if (param_type == ME_COMPLEX64) {
+                    return ME_FLOAT32;
+                } else if (param_type == ME_COMPLEX128) {
+                    return ME_FLOAT64;
+                }
+                // If input is not complex, return as-is (shouldn't happen, but be safe)
+                return param_type;
+            }
+
             // If this node is a comparison (dtype == ME_BOOL set during parsing),
             // the output type is ME_BOOL
             if (n->dtype == ME_BOOL) {
@@ -480,6 +506,43 @@ void me_free(me_expr *n) {
 static double pi(void) { return 3.14159265358979323846; }
 static double e(void) { return 2.71828182845904523536; }
 
+/* Wrapper for expm1: exp(x) - 1, more accurate for small x */
+static double expm1_wrapper(double x) { return expm1(x); }
+
+/* Wrapper for log1p: log(1 + x), more accurate for small x */
+static double log1p_wrapper(double x) { return log1p(x); }
+
+/* Wrapper for log2: base-2 logarithm */
+static double log2_wrapper(double x) { return log2(x); }
+
+/* logaddexp: log(exp(a) + exp(b)), numerically stable */
+static double logaddexp(double a, double b) {
+    if (a == b) {
+        return a + log1p(1.0);  // log(2*exp(a)) = a + log(2)
+    }
+    double max_val = (a > b) ? a : b;
+    double min_val = (a > b) ? b : a;
+    return max_val + log1p(exp(min_val - max_val));
+}
+
+/* Forward declarations for complex operations */
+static double conj_wrapper(double x);
+static double imag_wrapper(double x);
+
+/* Wrapper functions for complex operations (for function pointer compatibility) */
+/* These are placeholders - actual implementation is in vector functions */
+static double conj_wrapper(double x) {
+    /* This should never be called for real numbers */
+    (void)x;
+    return NAN;
+}
+
+static double imag_wrapper(double x) {
+    /* This should never be called for real numbers */
+    (void)x;
+    return NAN;
+}
+
 static double fac(double a) {
     /* simplest version of fac */
     if (a < 0.0)
@@ -537,12 +600,15 @@ static const me_variable functions[] = {
     {"atan2", 0, atan2, ME_FUNCTION2 | ME_FLAG_PURE, 0},
     {"atanh", 0, atanh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"ceil", 0, ceil, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"conj", 0, conj_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"cos", 0, cos, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"cosh", 0, cosh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"e", 0, e, ME_FUNCTION0 | ME_FLAG_PURE, 0},
     {"exp", 0, exp, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"expm1", 0, expm1_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"fac", 0, fac, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"floor", 0, floor, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"imag", 0, imag_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"ln", 0, log, ME_FUNCTION1 | ME_FLAG_PURE, 0},
 #ifdef ME_NAT_LOG
     {"log", 0, log, ME_FUNCTION1 | ME_FLAG_PURE, 0},
@@ -550,6 +616,9 @@ static const me_variable functions[] = {
     {"log", 0, log10, ME_FUNCTION1 | ME_FLAG_PURE, 0},
 #endif
     {"log10", 0, log10, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"log1p", 0, log1p_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"log2", 0, log2_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"logaddexp", 0, logaddexp, ME_FUNCTION2 | ME_FLAG_PURE, 0},
     {"ncr", 0, ncr, ME_FUNCTION2 | ME_FLAG_PURE, 0},
     {"npr", 0, npr, ME_FUNCTION2 | ME_FLAG_PURE, 0},
     {"pi", 0, pi, ME_FUNCTION0 | ME_FLAG_PURE, 0},
@@ -1744,6 +1813,18 @@ static void vec_negame_c64(const float complex *a, float complex *out, int n) {
     for (i = 0; i < n; i++) out[i] = -a[i];
 }
 
+static void vec_conj_c64(const float complex *a, float complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = conjf(a[i]);
+}
+
+static void vec_imag_c64(const float complex *a, float *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = cimagf(a[i]);
+}
+
 static void vec_add_c128(const double complex *a, const double complex *b, double complex *out, int n) {
     int i;
 #pragma GCC ivdep
@@ -1804,6 +1885,18 @@ static void vec_negame_c128(const double complex *a, double complex *out, int n)
     for (i = 0; i < n; i++) out[i] = -a[i];
 }
 
+static void vec_conj_c128(const double complex *a, double complex *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = conj(a[i]);
+}
+
+static void vec_imag_c128(const double complex *a, double *out, int n) {
+    int i;
+#pragma GCC ivdep
+    for (i = 0; i < n; i++) out[i] = cimag(a[i]);
+}
+
 /* ============================================================================
  * TYPE CONVERSION FUNCTIONS
  * ============================================================================
@@ -1984,7 +2077,8 @@ typedef float (*me_fun1_f32)(float);
 #define DEFINE_ME_EVAL(SUFFIX, TYPE, VEC_ADD, VEC_SUB, VEC_MUL, VEC_DIV, VEC_POW, \
     VEC_ADD_SCALAR, VEC_MUL_SCALAR, VEC_POW_SCALAR, \
     VEC_SQRT, VEC_SIN, VEC_COS, VEC_NEGATE, \
-    SQRT_FUNC, SIN_FUNC, COS_FUNC, EXP_FUNC, LOG_FUNC, FABS_FUNC, POW_FUNC) \
+    SQRT_FUNC, SIN_FUNC, COS_FUNC, EXP_FUNC, LOG_FUNC, FABS_FUNC, POW_FUNC, \
+    VEC_CONJ) \
 static void me_eval_##SUFFIX(const me_expr *n) { \
     if (!n || !n->output || n->nitems <= 0) return; \
     \
@@ -2105,6 +2199,8 @@ static void me_eval_##SUFFIX(const me_expr *n) { \
                     if (adata) VEC_COS(adata, output, n->nitems); \
                 } else if (func_ptr == (void*)negate) { \
                     if (adata) VEC_NEGATE(adata, output, n->nitems); \
+                } else if (func_ptr == (void*)conj_wrapper) { \
+                    if (adata) VEC_CONJ(adata, output, n->nitems); \
                 } else { \
                     me_fun1 func = (me_fun1)func_ptr; \
                     if (arg->type == ME_CONSTANT) { \
@@ -2292,6 +2388,9 @@ static void me_eval_##SUFFIX(const me_expr *n) { \
 #define vec_pow_scalar_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cpowf((a)[_i], (b)); } while(0)
 #define vec_sqrt_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = csqrtf((a)[_i]); } while(0)
 #define vec_negame_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
+#define vec_conj_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = conjf((a)[_i]); } while(0)
+#define vec_imag_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cimagf((a)[_i]); } while(0)
+#define vec_conj_noop(a, out, n) do { (void)(a); (void)(out); (void)(n); } while(0)
 
 #define vec_add_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
 #define vec_sub_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
@@ -2303,82 +2402,96 @@ static void me_eval_##SUFFIX(const me_expr *n) { \
 #define vec_pow_scalar_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cpow((a)[_i], (b)); } while(0)
 #define vec_sqrt_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = csqrt((a)[_i]); } while(0)
 #define vec_negame_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
+#define vec_conj_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = conj((a)[_i]); } while(0)
+#define vec_imag_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cimag((a)[_i]); } while(0)
 
 /* Generate float32 evaluator */
 DEFINE_ME_EVAL(f32, float,
                vec_add_f32, vec_sub_f32, vec_mul_f32, vec_div_f32, vec_pow_f32,
                vec_add_scalar_f32, vec_mul_scalar_f32, vec_pow_scalar_f32,
                vec_sqrt_f32, vec_sin_f32, vec_cos_f32, vec_negame_f32,
-               sqrtf, sinf, cosf, expf, logf, fabsf, powf)
+               sqrtf, sinf, cosf, expf, logf, fabsf, powf,
+               vec_conj_noop)
 
 /* Generate float64 (double) evaluator */
 DEFINE_ME_EVAL(f64, double,
                vec_add, vec_sub, vec_mul, vec_div, vec_pow,
                vec_add_scalar, vec_mul_scalar, vec_pow_scalar,
                vec_sqrt, vec_sin, vec_cos, vec_negate,
-               sqrt, sin, cos, exp, log, fabs, pow)
+               sqrt, sin, cos, exp, log, fabs, pow,
+               vec_conj_noop)
 
 /* Generate integer evaluators - sin/cos cast to double and back */
 DEFINE_ME_EVAL(i8, int8_t,
                vec_add_i8, vec_sub_i8, vec_mul_i8, vec_div_i8, vec_pow_i8,
                vec_add_scalar_i8, vec_mul_scalar_i8, vec_pow_scalar_i8,
                vec_sqrt_i8, vec_sqrt_i8, vec_sqrt_i8, vec_negame_i8,
-               sqrt, sin, cos, exp, log, fabs, pow)
+               sqrt, sin, cos, exp, log, fabs, pow,
+               vec_conj_noop)
 
 DEFINE_ME_EVAL(i16, int16_t,
                vec_add_i16, vec_sub_i16, vec_mul_i16, vec_div_i16, vec_pow_i16,
                vec_add_scalar_i16, vec_mul_scalar_i16, vec_pow_scalar_i16,
                vec_sqrt_i16, vec_sqrt_i16, vec_sqrt_i16, vec_negame_i16,
-               sqrt, sin, cos, exp, log, fabs, pow)
+               sqrt, sin, cos, exp, log, fabs, pow,
+               vec_conj_noop)
 
 DEFINE_ME_EVAL(i32, int32_t,
                vec_add_i32, vec_sub_i32, vec_mul_i32, vec_div_i32, vec_pow_i32,
                vec_add_scalar_i32, vec_mul_scalar_i32, vec_pow_scalar_i32,
                vec_sqrt_i32, vec_sqrt_i32, vec_sqrt_i32, vec_negame_i32,
-               sqrt, sin, cos, exp, log, fabs, pow)
+               sqrt, sin, cos, exp, log, fabs, pow,
+               vec_conj_noop)
 
 DEFINE_ME_EVAL(i64, int64_t,
                vec_add_i64, vec_sub_i64, vec_mul_i64, vec_div_i64, vec_pow_i64,
                vec_add_scalar_i64, vec_mul_scalar_i64, vec_pow_scalar_i64,
                vec_sqrt_i64, vec_sqrt_i64, vec_sqrt_i64, vec_negame_i64,
-               sqrt, sin, cos, exp, log, fabs, pow)
+               sqrt, sin, cos, exp, log, fabs, pow,
+               vec_conj_noop)
 
 DEFINE_ME_EVAL(u8, uint8_t,
                vec_add_u8, vec_sub_u8, vec_mul_u8, vec_div_u8, vec_pow_u8,
                vec_add_scalar_u8, vec_mul_scalar_u8, vec_pow_scalar_u8,
                vec_sqrt_u8, vec_sqrt_u8, vec_sqrt_u8, vec_negame_u8,
-               sqrt, sin, cos, exp, log, fabs, pow)
+               sqrt, sin, cos, exp, log, fabs, pow,
+               vec_conj_noop)
 
 DEFINE_ME_EVAL(u16, uint16_t,
                vec_add_u16, vec_sub_u16, vec_mul_u16, vec_div_u16, vec_pow_u16,
                vec_add_scalar_u16, vec_mul_scalar_u16, vec_pow_scalar_u16,
                vec_sqrt_u16, vec_sqrt_u16, vec_sqrt_u16, vec_negame_u16,
-               sqrt, sin, cos, exp, log, fabs, pow)
+               sqrt, sin, cos, exp, log, fabs, pow,
+               vec_conj_noop)
 
 DEFINE_ME_EVAL(u32, uint32_t,
                vec_add_u32, vec_sub_u32, vec_mul_u32, vec_div_u32, vec_pow_u32,
                vec_add_scalar_u32, vec_mul_scalar_u32, vec_pow_scalar_u32,
                vec_sqrt_u32, vec_sqrt_u32, vec_sqrt_u32, vec_negame_u32,
-               sqrt, sin, cos, exp, log, fabs, pow)
+               sqrt, sin, cos, exp, log, fabs, pow,
+               vec_conj_noop)
 
 DEFINE_ME_EVAL(u64, uint64_t,
                vec_add_u64, vec_sub_u64, vec_mul_u64, vec_div_u64, vec_pow_u64,
                vec_add_scalar_u64, vec_mul_scalar_u64, vec_pow_scalar_u64,
                vec_sqrt_u64, vec_sqrt_u64, vec_sqrt_u64, vec_negame_u64,
-               sqrt, sin, cos, exp, log, fabs, pow)
+               sqrt, sin, cos, exp, log, fabs, pow,
+               vec_conj_noop)
 
 /* Generate complex evaluators */
 DEFINE_ME_EVAL(c64, float complex,
                vec_add_c64, vec_sub_c64, vec_mul_c64, vec_div_c64, vec_pow_c64,
                vec_add_scalar_c64, vec_mul_scalar_c64, vec_pow_scalar_c64,
                vec_sqrt_c64, vec_sqrt_c64, vec_sqrt_c64, vec_negame_c64,
-               csqrtf, csqrtf, csqrtf, cexpf, clogf, cabsf, cpowf)
+               csqrtf, csqrtf, csqrtf, cexpf, clogf, cabsf, cpowf,
+               vec_conj_c64)
 
 DEFINE_ME_EVAL(c128, double complex,
                vec_add_c128, vec_sub_c128, vec_mul_c128, vec_div_c128, vec_pow_c128,
                vec_add_scalar_c128, vec_mul_scalar_c128, vec_pow_scalar_c128,
                vec_sqrt_c128, vec_sqrt_c128, vec_sqrt_c128, vec_negame_c128,
-               csqrt, csqrt, csqrt, cexp, clog, cabs, cpow)
+               csqrt, csqrt, csqrt, cexp, clog, cabs, cpow,
+               vec_conj_c128)
 
 /* Public API - dispatches to correct type-specific evaluator */
 /* Structure to track promoted variables */
@@ -2570,6 +2683,47 @@ static bool all_variables_match_type(const me_expr *n, me_dtype target_type) {
 static void private_eval(const me_expr *n) {
     if (!n) return;
 
+    // Special case: imag() function returns real from complex input
+    if (IS_FUNCTION(n->type) && ARITY(n->type) == 1 && n->function == (void*)imag_wrapper) {
+        me_expr *arg = (me_expr*)n->parameters[0];
+        me_dtype arg_type = infer_result_type(arg);
+
+        if (arg_type == ME_COMPLEX64) {
+            // Evaluate argument as complex64
+            if (!arg->output) {
+                arg->output = malloc(n->nitems * sizeof(float complex));
+                arg->nitems = n->nitems;
+                ((me_expr*)arg)->dtype = ME_COMPLEX64;
+            }
+            me_eval_c64(arg);
+
+            // Extract imaginary part to float32 output
+            const float complex *cdata = (const float complex*)arg->output;
+            float *output = (float*)n->output;
+            for (int i = 0; i < n->nitems; i++) {
+                output[i] = cimagf(cdata[i]);
+            }
+            return;
+        } else if (arg_type == ME_COMPLEX128) {
+            // Evaluate argument as complex128
+            if (!arg->output) {
+                arg->output = malloc(n->nitems * sizeof(double complex));
+                arg->nitems = n->nitems;
+                ((me_expr*)arg)->dtype = ME_COMPLEX128;
+            }
+            me_eval_c128(arg);
+
+            // Extract imaginary part to float64 output
+            const double complex *cdata = (const double complex*)arg->output;
+            double *output = (double*)n->output;
+            for (int i = 0; i < n->nitems; i++) {
+                output[i] = cimag(cdata[i]);
+            }
+            return;
+        }
+        // If not complex, fall through to normal evaluation
+    }
+
     // Infer the result type from the expression tree
     me_dtype result_type = infer_result_type(n);
 

From d1d922e83a972b9ac405cf26aa577d5d756b4661 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Tue, 23 Dec 2025 08:01:27 +0100
Subject: [PATCH 024/123] Add more functions to miniexpr

---
 src/blosc2/lazyexpr.py |   5 --
 src/blosc2/miniexpr.c  | 184 ++++++++++++++++++++++++++++-------------
 2 files changed, 125 insertions(+), 64 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 21cc8b03..5857f81c 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1279,11 +1279,6 @@ def fast_eval(  # noqa: C901
         "clip",
         "maximum",
         "minimum",
-        "real",
-        "round",
-        "sign",
-        "square",
-        "trunc",
         "where",
         "contains",
     ] + reducers  # miniexpr doesn't support reduction functions
diff --git a/src/blosc2/miniexpr.c b/src/blosc2/miniexpr.c
index f7dbc115..722f492a 100755
--- a/src/blosc2/miniexpr.c
+++ b/src/blosc2/miniexpr.c
@@ -247,6 +247,11 @@ typedef struct state {
 static me_expr *new_expr(const int type, const me_expr *parameters[]);
 static double conj_wrapper(double x);
 static double imag_wrapper(double x);
+static double real_wrapper(double x);
+static double round_wrapper(double x);
+static double sign(double x);
+static double square(double x);
+static double trunc_wrapper(double x);
 
 /* Infer computation type from expression tree (for evaluation) */
 static me_dtype infer_result_type(const me_expr *n) {
@@ -275,16 +280,18 @@ static me_dtype infer_result_type(const me_expr *n) {
         case ME_CLOSURE5:
         case ME_CLOSURE6:
         case ME_CLOSURE7: {
-            // Special case: imag() returns real type from complex input
-            if (IS_FUNCTION(n->type) && ARITY(n->type) == 1 && n->function == (void*)imag_wrapper) {
-                me_dtype param_type = infer_result_type((const me_expr *) n->parameters[0]);
-                if (param_type == ME_COMPLEX64) {
-                    return ME_FLOAT32;
-                } else if (param_type == ME_COMPLEX128) {
-                    return ME_FLOAT64;
+            // Special case: imag() and real() return real type from complex input
+            if (IS_FUNCTION(n->type) && ARITY(n->type) == 1) {
+                if (n->function == (void*)imag_wrapper || n->function == (void*)real_wrapper) {
+                    me_dtype param_type = infer_result_type((const me_expr *) n->parameters[0]);
+                    if (param_type == ME_COMPLEX64) {
+                        return ME_FLOAT32;
+                    } else if (param_type == ME_COMPLEX128) {
+                        return ME_FLOAT64;
+                    }
+                    // If input is not complex, return as-is (shouldn't happen, but be safe)
+                    return param_type;
                 }
-                // If input is not complex, return as-is (shouldn't happen, but be safe)
-                return param_type;
             }
 
             // For comparisons with ME_BOOL output, we still need to infer the
@@ -334,16 +341,18 @@ static me_dtype infer_output_type(const me_expr *n) {
         case ME_CLOSURE5:
         case ME_CLOSURE6:
         case ME_CLOSURE7: {
-            // Special case: imag() returns real type from complex input
-            if (IS_FUNCTION(n->type) && ARITY(n->type) == 1 && n->function == (void*)imag_wrapper) {
-                me_dtype param_type = infer_output_type((const me_expr *) n->parameters[0]);
-                if (param_type == ME_COMPLEX64) {
-                    return ME_FLOAT32;
-                } else if (param_type == ME_COMPLEX128) {
-                    return ME_FLOAT64;
+            // Special case: imag() and real() return real type from complex input
+            if (IS_FUNCTION(n->type) && ARITY(n->type) == 1) {
+                if (n->function == (void*)imag_wrapper || n->function == (void*)real_wrapper) {
+                    me_dtype param_type = infer_output_type((const me_expr *) n->parameters[0]);
+                    if (param_type == ME_COMPLEX64) {
+                        return ME_FLOAT32;
+                    } else if (param_type == ME_COMPLEX128) {
+                        return ME_FLOAT64;
+                    }
+                    // If input is not complex, return as-is (shouldn't happen, but be safe)
+                    return param_type;
                 }
-                // If input is not complex, return as-is (shouldn't happen, but be safe)
-                return param_type;
             }
 
             // If this node is a comparison (dtype == ME_BOOL set during parsing),
@@ -526,8 +535,7 @@ static double logaddexp(double a, double b) {
 }
 
 /* Forward declarations for complex operations */
-static double conj_wrapper(double x);
-static double imag_wrapper(double x);
+/* (Already declared above) */
 
 /* Wrapper functions for complex operations (for function pointer compatibility) */
 /* These are placeholders - actual implementation is in vector functions */
@@ -543,6 +551,28 @@ static double imag_wrapper(double x) {
     return NAN;
 }
 
+/* Wrapper for round: round to nearest integer */
+static double round_wrapper(double x) { return round(x); }
+
+/* sign: returns -1.0, 0.0, or 1.0 based on sign of x */
+static double sign(double x) {
+    if (x > 0.0) return 1.0;
+    if (x < 0.0) return -1.0;
+    return 0.0;
+}
+
+/* square: x * x */
+static double square(double x) { return x * x; }
+
+/* Wrapper for trunc: truncate towards zero */
+static double trunc_wrapper(double x) { return trunc(x); }
+
+static double real_wrapper(double x) {
+    /* This should never be called for real numbers */
+    (void)x;
+    return NAN;
+}
+
 static double fac(double a) {
     /* simplest version of fac */
     if (a < 0.0)
@@ -623,11 +653,16 @@ static const me_variable functions[] = {
     {"npr", 0, npr, ME_FUNCTION2 | ME_FLAG_PURE, 0},
     {"pi", 0, pi, ME_FUNCTION0 | ME_FLAG_PURE, 0},
     {"pow", 0, pow, ME_FUNCTION2 | ME_FLAG_PURE, 0},
+    {"real", 0, real_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"round", 0, round_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"sign", 0, sign, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"sin", 0, sin, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"sinh", 0, sinh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"sqrt", 0, sqrt, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"square", 0, square, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"tan", 0, tan, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"tanh", 0, tanh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"trunc", 0, trunc_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {0, 0, 0, 0, 0}
 };
 
@@ -2199,6 +2234,20 @@ static void me_eval_##SUFFIX(const me_expr *n) { \
                     if (adata) VEC_COS(adata, output, n->nitems); \
                 } else if (func_ptr == (void*)negate) { \
                     if (adata) VEC_NEGATE(adata, output, n->nitems); \
+                } else if (func_ptr == (void*)imag_wrapper) { \
+                    /* NumPy semantics: imag(real) == 0 with same dtype */ \
+                    if (adata) { \
+                        for (i = 0; i < n->nitems; i++) { \
+                            output[i] = (TYPE)0; \
+                        } \
+                    } \
+                } else if (func_ptr == (void*)real_wrapper) { \
+                    /* NumPy semantics: real(real) == real with same dtype */ \
+                    if (adata) { \
+                        for (i = 0; i < n->nitems; i++) { \
+                            output[i] = adata[i]; \
+                        } \
+                    } \
                 } else if (func_ptr == (void*)conj_wrapper) { \
                     if (adata) VEC_CONJ(adata, output, n->nitems); \
                 } else { \
@@ -2276,6 +2325,7 @@ static void me_eval_##SUFFIX(const me_expr *n) { \
 #define vec_sin(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = sin((a)[_i]); } while(0)
 #define vec_cos(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cos((a)[_i]); } while(0)
 #define vec_negate(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
+#define vec_copy(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i]; } while(0)
 
 #define vec_add_f32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
 #define vec_sub_f32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
@@ -2390,6 +2440,7 @@ static void me_eval_##SUFFIX(const me_expr *n) { \
 #define vec_negame_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
 #define vec_conj_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = conjf((a)[_i]); } while(0)
 #define vec_imag_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cimagf((a)[_i]); } while(0)
+#define vec_real_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = crealf((a)[_i]); } while(0)
 #define vec_conj_noop(a, out, n) do { (void)(a); (void)(out); (void)(n); } while(0)
 
 #define vec_add_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
@@ -2404,6 +2455,7 @@ static void me_eval_##SUFFIX(const me_expr *n) { \
 #define vec_negame_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
 #define vec_conj_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = conj((a)[_i]); } while(0)
 #define vec_imag_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cimag((a)[_i]); } while(0)
+#define vec_real_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = creal((a)[_i]); } while(0)
 
 /* Generate float32 evaluator */
 DEFINE_ME_EVAL(f32, float,
@@ -2411,7 +2463,7 @@ DEFINE_ME_EVAL(f32, float,
                vec_add_scalar_f32, vec_mul_scalar_f32, vec_pow_scalar_f32,
                vec_sqrt_f32, vec_sin_f32, vec_cos_f32, vec_negame_f32,
                sqrtf, sinf, cosf, expf, logf, fabsf, powf,
-               vec_conj_noop)
+               vec_copy)
 
 /* Generate float64 (double) evaluator */
 DEFINE_ME_EVAL(f64, double,
@@ -2419,7 +2471,7 @@ DEFINE_ME_EVAL(f64, double,
                vec_add_scalar, vec_mul_scalar, vec_pow_scalar,
                vec_sqrt, vec_sin, vec_cos, vec_negate,
                sqrt, sin, cos, exp, log, fabs, pow,
-               vec_conj_noop)
+               vec_copy)
 
 /* Generate integer evaluators - sin/cos cast to double and back */
 DEFINE_ME_EVAL(i8, int8_t,
@@ -2683,45 +2735,59 @@ static bool all_variables_match_type(const me_expr *n, me_dtype target_type) {
 static void private_eval(const me_expr *n) {
     if (!n) return;
 
-    // Special case: imag() function returns real from complex input
-    if (IS_FUNCTION(n->type) && ARITY(n->type) == 1 && n->function == (void*)imag_wrapper) {
-        me_expr *arg = (me_expr*)n->parameters[0];
-        me_dtype arg_type = infer_result_type(arg);
-
-        if (arg_type == ME_COMPLEX64) {
-            // Evaluate argument as complex64
-            if (!arg->output) {
-                arg->output = malloc(n->nitems * sizeof(float complex));
-                arg->nitems = n->nitems;
-                ((me_expr*)arg)->dtype = ME_COMPLEX64;
-            }
-            me_eval_c64(arg);
-
-            // Extract imaginary part to float32 output
-            const float complex *cdata = (const float complex*)arg->output;
-            float *output = (float*)n->output;
-            for (int i = 0; i < n->nitems; i++) {
-                output[i] = cimagf(cdata[i]);
-            }
-            return;
-        } else if (arg_type == ME_COMPLEX128) {
-            // Evaluate argument as complex128
-            if (!arg->output) {
-                arg->output = malloc(n->nitems * sizeof(double complex));
-                arg->nitems = n->nitems;
-                ((me_expr*)arg)->dtype = ME_COMPLEX128;
-            }
-            me_eval_c128(arg);
-
-            // Extract imaginary part to float64 output
-            const double complex *cdata = (const double complex*)arg->output;
-            double *output = (double*)n->output;
-            for (int i = 0; i < n->nitems; i++) {
-                output[i] = cimag(cdata[i]);
+    // Special case: imag() and real() functions return real from complex input
+    if (IS_FUNCTION(n->type) && ARITY(n->type) == 1) {
+        if (n->function == (void*)imag_wrapper || n->function == (void*)real_wrapper) {
+            me_expr *arg = (me_expr*)n->parameters[0];
+            me_dtype arg_type = infer_result_type(arg);
+
+            if (arg_type == ME_COMPLEX64) {
+                // Evaluate argument as complex64
+                if (!arg->output) {
+                    arg->output = malloc(n->nitems * sizeof(float complex));
+                    arg->nitems = n->nitems;
+                    ((me_expr*)arg)->dtype = ME_COMPLEX64;
+                }
+                me_eval_c64(arg);
+
+                // Extract real/imaginary part to float32 output
+                const float complex *cdata = (const float complex*)arg->output;
+                float *output = (float*)n->output;
+                if (n->function == (void*)imag_wrapper) {
+                    for (int i = 0; i < n->nitems; i++) {
+                        output[i] = cimagf(cdata[i]);
+                    }
+                } else { // real_wrapper
+                    for (int i = 0; i < n->nitems; i++) {
+                        output[i] = crealf(cdata[i]);
+                    }
+                }
+                return;
+            } else if (arg_type == ME_COMPLEX128) {
+                // Evaluate argument as complex128
+                if (!arg->output) {
+                    arg->output = malloc(n->nitems * sizeof(double complex));
+                    arg->nitems = n->nitems;
+                    ((me_expr*)arg)->dtype = ME_COMPLEX128;
+                }
+                me_eval_c128(arg);
+
+                // Extract real/imaginary part to float64 output
+                const double complex *cdata = (const double complex*)arg->output;
+                double *output = (double*)n->output;
+                if (n->function == (void*)imag_wrapper) {
+                    for (int i = 0; i < n->nitems; i++) {
+                        output[i] = cimag(cdata[i]);
+                    }
+                } else { // real_wrapper
+                    for (int i = 0; i < n->nitems; i++) {
+                        output[i] = creal(cdata[i]);
+                    }
+                }
+                return;
             }
-            return;
+            // If not complex, fall through to normal evaluation
         }
-        // If not complex, fall through to normal evaluation
     }
 
     // Infer the result type from the expression tree

From 620c98093c93f79273126be6bf3724d300836aa7 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Tue, 23 Dec 2025 08:16:19 +0100
Subject: [PATCH 025/123] New where() func in miniexpr

---
 src/blosc2/lazyexpr.py |  2 +-
 src/blosc2/miniexpr.c  | 28 ++++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 5857f81c..cd008ab5 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1279,7 +1279,6 @@ def fast_eval(  # noqa: C901
         "clip",
         "maximum",
         "minimum",
-        "where",
         "contains",
     ] + reducers  # miniexpr doesn't support reduction functions
 
@@ -1302,6 +1301,7 @@ def fast_eval(  # noqa: C901
         # Use the same chunks/blocks as the input operands for consistency
         res_eval = blosc2.empty(shape, dtype, chunks=chunks, blocks=blocks, cparams=cparams, **kwargs)
         try:
+            # print("expr->miniexpr:", expression)
             res_eval._set_pref_expr(expression, operands)
             # This line would NOT allocate physical RAM on any modern OS:
             aux = np.empty(res_eval.shape, res_eval.dtype)
diff --git a/src/blosc2/miniexpr.c b/src/blosc2/miniexpr.c
index 722f492a..000f6a8b 100755
--- a/src/blosc2/miniexpr.c
+++ b/src/blosc2/miniexpr.c
@@ -252,6 +252,7 @@ static double round_wrapper(double x);
 static double sign(double x);
 static double square(double x);
 static double trunc_wrapper(double x);
+static double where_scalar(double c, double x, double y);
 
 /* Infer computation type from expression tree (for evaluation) */
 static me_dtype infer_result_type(const me_expr *n) {
@@ -355,6 +356,14 @@ static me_dtype infer_output_type(const me_expr *n) {
                 }
             }
 
+            // Special case: where(cond, x, y) -> promote(x, y), regardless of cond type.
+            if (IS_FUNCTION(n->type) && ARITY(n->type) == 3 &&
+                n->function == (void*)where_scalar) {
+                me_dtype x_type = infer_output_type((const me_expr *) n->parameters[1]);
+                me_dtype y_type = infer_output_type((const me_expr *) n->parameters[2]);
+                return promote_types(x_type, y_type);
+            }
+
             // If this node is a comparison (dtype == ME_BOOL set during parsing),
             // the output type is ME_BOOL
             if (n->dtype == ME_BOOL) {
@@ -567,6 +576,11 @@ static double square(double x) { return x * x; }
 /* Wrapper for trunc: truncate towards zero */
 static double trunc_wrapper(double x) { return trunc(x); }
 
+/* Scalar helper for where(), used only in generic slow path */
+static double where_scalar(double c, double x, double y) {
+    return (c != 0.0) ? x : y;
+}
+
 static double real_wrapper(double x) {
     /* This should never be called for real numbers */
     (void)x;
@@ -663,6 +677,7 @@ static const me_variable functions[] = {
     {"tan", 0, tan, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"tanh", 0, tanh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"trunc", 0, trunc_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"where", 0, where_scalar, ME_FUNCTION3 | ME_FLAG_PURE, 0},
     {0, 0, 0, 0, 0}
 };
 
@@ -2217,6 +2232,19 @@ static void me_eval_##SUFFIX(const me_expr *n) { \
                         output[i] = (TYPE)func(a, b); \
                     } \
                 } \
+            } else if (arity == 3 && IS_FUNCTION(n->type) && n->function == (void*)where_scalar) { \
+                /* where(cond, x, y) – NumPy-like semantics: cond != 0 selects x else y */ \
+                me_expr *cond = (me_expr*)n->parameters[0]; \
+                me_expr *xexpr = (me_expr*)n->parameters[1]; \
+                me_expr *yexpr = (me_expr*)n->parameters[2]; \
+                \
+                const TYPE *cdata = (const TYPE*)((cond->type == ME_VARIABLE) ? cond->bound : cond->output); \
+                const TYPE *xdata = (const TYPE*)((xexpr->type == ME_VARIABLE) ? xexpr->bound : xexpr->output); \
+                const TYPE *ydata = (const TYPE*)((yexpr->type == ME_VARIABLE) ? yexpr->bound : yexpr->output); \
+                \
+                for (i = 0; i < n->nitems; i++) { \
+                    output[i] = (cdata[i] != (TYPE)0) ? xdata[i] : ydata[i]; \
+                } \
             } \
             else if (arity == 1 && IS_FUNCTION(n->type)) { \
                 me_expr *arg = (me_expr*)n->parameters[0]; \

From 5545a81fe76a88c62cee801fbde66d6cb0b015c1 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Tue, 23 Dec 2025 08:43:39 +0100
Subject: [PATCH 026/123] Support for windows

---
 src/blosc2/miniexpr.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/blosc2/miniexpr.c b/src/blosc2/miniexpr.c
index 000f6a8b..ca6a59bd 100755
--- a/src/blosc2/miniexpr.c
+++ b/src/blosc2/miniexpr.c
@@ -46,6 +46,13 @@ For log = natural log do nothing (NumPy compatible)
 For log = base 10 log comment the next line. */
 #define ME_NAT_LOG
 
+#if defined(_MSC_VER)
+// Enable C99 complex support on MSVC
+#ifndef _CRT_USE_C99_COMPLEX
+#define _CRT_USE_C99_COMPLEX
+#endif
+#endif
+
 #include "miniexpr.h"
 #include <stdlib.h>
 #include <math.h>

From d88dcbba8b83bb41dead08c7b0018e4206321b8e Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Tue, 23 Dec 2025 14:48:53 +0100
Subject: [PATCH 027/123] Changing to clang-cl on win

---
 CMakeLists.txt        |   9 +
 src/blosc2/miniexpr.c | 610 ++++++++++++++++++++++++++++++------------
 src/blosc2/miniexpr.h |   2 +-
 3 files changed, 448 insertions(+), 173 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6ac7da37..407f0258 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,15 @@ target_link_libraries(blosc2_ext PRIVATE Python::NumPy)
 # Add include directory for miniexpr.h and others
 target_include_directories(blosc2_ext PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2")
 
+# Compiler-specific settings for clang-cl on Windows
+if(WIN32 AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+    # clang-cl mimics MSVC, so set flags to ensure compatibility
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-command-line-argument -Wno-microsoft-enum-value")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-command-line-argument -Wno-microsoft-enum-value")
+    # Link clang runtime library for miniexpr sources
+    target_link_libraries(blosc2_ext PRIVATE clang_rt.builtins-x86_64.lib)
+endif()
+
 if(DEFINED ENV{USE_SYSTEM_BLOSC2})
     set(USE_SYSTEM_BLOSC2 ON)
 endif()
diff --git a/src/blosc2/miniexpr.c b/src/blosc2/miniexpr.c
index ca6a59bd..81dd5f76 100755
--- a/src/blosc2/miniexpr.c
+++ b/src/blosc2/miniexpr.c
@@ -1,7 +1,7 @@
 /*********************************************************************
   Blosc - Blocked Shuffling and Compression Library
 
-  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
+  Copyright (c) 2025  Blosc Development Team <blosc@blosc.org>
   https://blosc.org
   License: BSD 3-Clause (see LICENSE.txt)
 
@@ -46,13 +46,6 @@ For log = natural log do nothing (NumPy compatible)
 For log = base 10 log comment the next line. */
 #define ME_NAT_LOG
 
-#if defined(_MSC_VER)
-// Enable C99 complex support on MSVC
-#ifndef _CRT_USE_C99_COMPLEX
-#define _CRT_USE_C99_COMPLEX
-#endif
-#endif
-
 #include "miniexpr.h"
 #include <stdlib.h>
 #include <math.h>
@@ -62,7 +55,241 @@ For log = base 10 log comment the next line. */
 #include <limits.h>
 #include <stdint.h>
 #include <stdbool.h>
+#if defined(_MSC_VER) && !defined(__clang__)
+#define IVDEP
+#else
+#define IVDEP _Pragma("GCC ivdep")
+#endif
+
 #include <complex.h>
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#define float_complex _Fcomplex
+#define double_complex _Dcomplex
+// And it doesn't support standard operators for them in C
+static inline _Fcomplex add_c64(_Fcomplex a, _Fcomplex b) { return _FCbuild(crealf(a) + crealf(b), cimagf(a) + cimagf(b)); }
+static inline _Fcomplex sub_c64(_Fcomplex a, _Fcomplex b) { return _FCbuild(crealf(a) - crealf(b), cimagf(a) - cimagf(b)); }
+static inline _Fcomplex neg_c64(_Fcomplex a) { return _FCbuild(-crealf(a), -cimagf(a)); }
+static inline _Fcomplex mul_c64(_Fcomplex a, _Fcomplex b) {
+    return _FCbuild(crealf(a) * crealf(b) - cimagf(a) * cimagf(b), crealf(a) * cimagf(b) + cimagf(a) * crealf(b));
+}
+static inline _Fcomplex div_c64(_Fcomplex a, _Fcomplex b) {
+    float denom = crealf(b) * crealf(b) + cimagf(b) * cimagf(b);
+    return _FCbuild((crealf(a) * crealf(b) + cimagf(a) * cimagf(b)) / denom, (cimagf(a) * crealf(b) - crealf(a) * cimagf(b)) / denom);
+}
+static inline _Dcomplex add_c128(_Dcomplex a, _Dcomplex b) { return _Cbuild(creal(a) + creal(b), cimag(a) + cimag(b)); }
+static inline _Dcomplex sub_c128(_Dcomplex a, _Dcomplex b) { return _Cbuild(creal(a) - creal(b), cimag(a) - cimag(b)); }
+static inline _Dcomplex neg_c128(_Dcomplex a) { return _Cbuild(-creal(a), -cimag(a)); }
+static inline _Dcomplex mul_c128(_Dcomplex a, _Dcomplex b) {
+    return _Cbuild(creal(a) * creal(b) - cimag(a) * cimag(b), creal(a) * cimag(b) + cimag(a) * creal(b));
+}
+static inline _Dcomplex div_c128(_Dcomplex a, _Dcomplex b) {
+    double denom = creal(b) * creal(b) + cimag(b) * cimag(b);
+    return _Cbuild((creal(a) * creal(b) + cimag(a) * cimag(b)) / denom, (cimag(a) * creal(b) - creal(a) * cimag(b)) / denom);
+}
+#else
+#define float_complex float _Complex
+#define double_complex double _Complex
+#define add_c64(a, b) ((a) + (b))
+#define sub_c64(a, b) ((a) - (b))
+#define neg_c64(a) (-(a))
+#define mul_c64(a, b) ((a) * (b))
+#define div_c64(a, b) ((a) / (b))
+#define add_c128(a, b) ((a) + (b))
+#define sub_c128(a, b) ((a) - (b))
+#define neg_c128(a) (-(a))
+#define mul_c128(a, b) ((a) * (b))
+#define div_c128(a, b) ((a) / (b))
+#endif
+
+#if defined(_MSC_VER) && !defined(__clang__)
+/* Wrappers for complex functions to handle MSVC's _Fcomplex/_Dcomplex */
+static inline float _Complex me_cpowf(float _Complex a, float _Complex b) {
+    union { float _Complex c; _Fcomplex m; } ua, ub, ur;
+    ua.c = a; ub.c = b;
+    ur.m = cpowf(ua.m, ub.m);
+    return ur.c;
+}
+static inline double _Complex me_cpow(double _Complex a, double _Complex b) {
+    union { double _Complex c; _Dcomplex m; } ua, ub, ur;
+    ua.c = a; ub.c = b;
+    ur.m = cpow(ua.m, ub.m);
+    return ur.c;
+}
+static inline float _Complex me_csqrtf(float _Complex a) {
+    union { float _Complex c; _Fcomplex m; } ua, ur;
+    ua.c = a;
+    ur.m = csqrtf(ua.m);
+    return ur.c;
+}
+static inline double _Complex me_csqrt(double _Complex a) {
+    union { double _Complex c; _Dcomplex m; } ua, ur;
+    ua.c = a;
+    ur.m = csqrt(ua.m);
+    return ur.c;
+}
+static inline float _Complex me_cexpf(float _Complex a) {
+    union { float _Complex c; _Fcomplex m; } ua, ur;
+    ua.c = a;
+    ur.m = cexpf(ua.m);
+    return ur.c;
+}
+static inline double _Complex me_cexp(double _Complex a) {
+    union { double _Complex c; _Dcomplex m; } ua, ur;
+    ua.c = a;
+    ur.m = cexp(ua.m);
+    return ur.c;
+}
+static inline float _Complex me_clogf(float _Complex a) {
+    union { float _Complex c; _Fcomplex m; } ua, ur;
+    ua.c = a;
+    ur.m = clogf(ua.m);
+    return ur.c;
+}
+static inline double _Complex me_clog(double _Complex a) {
+    union { double _Complex c; _Dcomplex m; } ua, ur;
+    ua.c = a;
+    ur.m = clog(ua.m);
+    return ur.c;
+}
+static inline float me_cabsf(float _Complex a) {
+    union { float _Complex c; _Fcomplex m; } ua;
+    ua.c = a;
+    return cabsf(ua.m);
+}
+static inline double me_cabs(double _Complex a) {
+    union { double _Complex c; _Dcomplex m; } ua;
+    ua.c = a;
+    return cabs(ua.m);
+}
+static inline float me_cimagf(float _Complex a) {
+    union { float _Complex c; _Fcomplex m; } ua;
+    ua.c = a;
+    return cimagf(ua.m);
+}
+static inline double me_cimag(double _Complex a) {
+    union { double _Complex c; _Dcomplex m; } ua;
+    ua.c = a;
+    return cimag(ua.m);
+}
+static inline float me_crealf(float _Complex a) {
+    union { float _Complex c; _Fcomplex m; } ua;
+    ua.c = a;
+    return crealf(ua.m);
+}
+static inline double me_creal(double _Complex a) {
+    union { double _Complex c; _Dcomplex m; } ua;
+    ua.c = a;
+    return creal(ua.m);
+}
+static inline float _Complex me_conjf(float _Complex a) {
+    union { float _Complex c; _Fcomplex m; } ua, ur;
+    ua.c = a;
+    ur.m = conjf(ua.m);
+    return ur.c;
+}
+static inline double _Complex me_conj(double _Complex a) {
+    union { double _Complex c; _Dcomplex m; } ua, ur;
+    ua.c = a;
+    ur.m = conj(ua.m);
+    return ur.c;
+}
+#else
+#if defined(_MSC_VER) && defined(__clang__)
+#define me_cimagf __builtin_cimagf
+#define me_cimag __builtin_cimag
+#define me_crealf __builtin_crealf
+#define me_creal __builtin_creal
+#define me_conjf __builtin_conjf
+#define me_conj __builtin_conj
+#define me_cpowf __builtin_cpowf
+#define me_cpow __builtin_cpow
+#define me_csqrtf __builtin_csqrtf
+#define me_csqrt __builtin_csqrt
+#define me_cexpf __builtin_cexpf
+#define me_cexp __builtin_cexp
+#define me_clogf __builtin_clogf
+#define me_clog __builtin_clog
+#define me_cabsf __builtin_cabsf
+#define me_cabs __builtin_cabs
+#else
+#define me_cpowf cpowf
+#define me_cpow cpow
+#define me_csqrtf csqrtf
+#define me_csqrt csqrt
+#define me_cexpf cexpf
+#define me_cexp cexp
+#define me_clogf clogf
+#define me_clog clog
+#define me_cabsf cabsf
+#define me_cabs cabs
+#define me_cimagf cimagf
+#define me_cimag cimag
+#define me_crealf crealf
+#define me_creal creal
+#define me_conjf conjf
+#define me_conj conj
+#endif
+#endif
+
+/* Type-specific cast and comparison macros to handle MSVC complex structs */
+#define TO_TYPE_bool(x) (bool)(x)
+#define TO_TYPE_i8(x) (int8_t)(x)
+#define TO_TYPE_i16(x) (int16_t)(x)
+#define TO_TYPE_i32(x) (int32_t)(x)
+#define TO_TYPE_i64(x) (int64_t)(x)
+#define TO_TYPE_u8(x) (uint8_t)(x)
+#define TO_TYPE_u16(x) (uint16_t)(x)
+#define TO_TYPE_u32(x) (uint32_t)(x)
+#define TO_TYPE_u64(x) (uint64_t)(x)
+#define TO_TYPE_f32(x) (float)(x)
+#define TO_TYPE_f64(x) (double)(x)
+
+#define FROM_TYPE_bool(x) (double)(x)
+#define FROM_TYPE_i8(x) (double)(x)
+#define FROM_TYPE_i16(x) (double)(x)
+#define FROM_TYPE_i32(x) (double)(x)
+#define FROM_TYPE_i64(x) (double)(x)
+#define FROM_TYPE_u8(x) (double)(x)
+#define FROM_TYPE_u16(x) (double)(x)
+#define FROM_TYPE_u32(x) (double)(x)
+#define FROM_TYPE_u64(x) (double)(x)
+#define FROM_TYPE_f32(x) (double)(x)
+#define FROM_TYPE_f64(x) (double)(x)
+
+#define IS_NONZERO_bool(x) (x)
+#define IS_NONZERO_i8(x) ((x) != 0)
+#define IS_NONZERO_i16(x) ((x) != 0)
+#define IS_NONZERO_i32(x) ((x) != 0)
+#define IS_NONZERO_i64(x) ((x) != 0)
+#define IS_NONZERO_u8(x) ((x) != 0)
+#define IS_NONZERO_u16(x) ((x) != 0)
+#define IS_NONZERO_u32(x) ((x) != 0)
+#define IS_NONZERO_u64(x) ((x) != 0)
+#define IS_NONZERO_f32(x) ((x) != 0.0f)
+#define IS_NONZERO_f64(x) ((x) != 0.0)
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#define TO_TYPE_c64(x) _FCbuild((float)(x), 0.0f)
+#define TO_TYPE_c128(x) _Cbuild((double)(x), 0.0)
+#define FROM_TYPE_c64(x) (double)crealf(x)
+#define FROM_TYPE_c128(x) (double)creal(x)
+#define IS_NONZERO_c64(x) (crealf(x) != 0.0f || cimagf(x) != 0.0f)
+#define IS_NONZERO_c128(x) (creal(x) != 0.0 || cimag(x) != 0.0)
+
+/* Helper macros for complex-to-complex conversions */
+#define CONV_c64_to_c128(x) _Cbuild((double)crealf(x), (double)cimagf(x))
+#define TO_TYPE_c128_from_c64(x) CONV_c64_to_c128(x)
+#else
+#define TO_TYPE_c64(x) (float_complex)(x)
+#define TO_TYPE_c128(x) (double_complex)(x)
+#define FROM_TYPE_c64(x) (double)me_crealf(x)
+#define FROM_TYPE_c128(x) (double)me_creal(x)
+#define IS_NONZERO_c64(x) (me_crealf(x) != 0.0f || me_cimagf(x) != 0.0f)
+#define IS_NONZERO_c128(x) (me_creal(x) != 0.0 || me_cimag(x) != 0.0)
+#define TO_TYPE_c128_from_c64(x) (double_complex)(x)
+#endif
+
 #include <assert.h>
 
 #ifndef NAN
@@ -211,8 +438,8 @@ static size_t dtype_size(me_dtype dtype) {
         case ME_UINT64: return sizeof(uint64_t);
         case ME_FLOAT32: return sizeof(float);
         case ME_FLOAT64: return sizeof(double);
-        case ME_COMPLEX64: return sizeof(float complex);
-        case ME_COMPLEX128: return sizeof(double complex);
+        case ME_COMPLEX64: return sizeof(float _Complex);
+        case ME_COMPLEX128: return sizeof(double _Complex);
         default: return 0;
     }
 }
@@ -555,16 +782,11 @@ static double logaddexp(double a, double b) {
 
 /* Wrapper functions for complex operations (for function pointer compatibility) */
 /* These are placeholders - actual implementation is in vector functions */
-static double conj_wrapper(double x) {
-    /* This should never be called for real numbers */
-    (void)x;
-    return NAN;
-}
+static double conj_wrapper(double x) { return x; }
 
 static double imag_wrapper(double x) {
-    /* This should never be called for real numbers */
     (void)x;
-    return NAN;
+    return 0.0;
 }
 
 /* Wrapper for round: round to nearest integer */
@@ -588,11 +810,7 @@ static double where_scalar(double c, double x, double y) {
     return (c != 0.0) ? x : y;
 }
 
-static double real_wrapper(double x) {
-    /* This should never be called for real numbers */
-    (void)x;
-    return NAN;
-}
+static double real_wrapper(double x) { return x; }
 
 static double fac(double a) {
     /* simplest version of fac */
@@ -1180,7 +1398,7 @@ static me_expr *factor(state *s) {
     CHECK_NULL(ret);
 
     while (s->type == TOK_POW) {
-        me_fun2 t = s->function;
+        me_fun2 t = (me_fun2)s->function;
         next_token(s);
         me_expr *f = power(s);
         CHECK_NULL(f, me_free(ret));
@@ -1189,7 +1407,7 @@ static me_expr *factor(state *s) {
         ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, f);
         CHECK_NULL(ret, me_free(f), me_free(prev));
 
-        ret->function = t;
+        ret->function = (void *)t;
         apply_type_promotion(ret);
     }
 
@@ -1204,7 +1422,7 @@ static me_expr *term(state *s) {
     CHECK_NULL(ret);
 
     while (s->type == TOK_INFIX && (s->function == mul || s->function == divide || s->function == fmod)) {
-        me_fun2 t = s->function;
+        me_fun2 t = (me_fun2)s->function;
         next_token(s);
         me_expr *f = factor(s);
         CHECK_NULL(f, me_free(ret));
@@ -1213,7 +1431,7 @@ static me_expr *term(state *s) {
         ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, f);
         CHECK_NULL(ret, me_free(f), me_free(prev));
 
-        ret->function = t;
+        ret->function = (void *)t;
         apply_type_promotion(ret);
     }
 
@@ -1227,7 +1445,7 @@ static me_expr *expr(state *s) {
     CHECK_NULL(ret);
 
     while (s->type == TOK_INFIX && (s->function == add || s->function == sub)) {
-        me_fun2 t = s->function;
+        me_fun2 t = (me_fun2)s->function;
         next_token(s);
         me_expr *te = term(s);
         CHECK_NULL(te, me_free(ret));
@@ -1236,7 +1454,7 @@ static me_expr *expr(state *s) {
         ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, te);
         CHECK_NULL(ret, me_free(te), me_free(prev));
 
-        ret->function = t;
+        ret->function = (void *)t;
         apply_type_promotion(ret); // Apply type promotion
     }
 
@@ -1250,7 +1468,7 @@ static me_expr *shift_expr(state *s) {
     CHECK_NULL(ret);
 
     while (s->type == TOK_SHIFT) {
-        me_fun2 t = s->function;
+        me_fun2 t = (me_fun2)s->function;
         next_token(s);
         me_expr *e = expr(s);
         CHECK_NULL(e, me_free(ret));
@@ -1259,7 +1477,7 @@ static me_expr *shift_expr(state *s) {
         ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
         CHECK_NULL(ret, me_free(e), me_free(prev));
 
-        ret->function = t;
+        ret->function = (void *)t;
         apply_type_promotion(ret);
     }
 
@@ -1318,7 +1536,7 @@ static me_expr *bitwise_or(state *s) {
     CHECK_NULL(ret);
 
     while (s->type == TOK_BITWISE && (s->function == bit_or)) {
-        me_fun2 t = s->function;
+        me_fun2 t = (me_fun2)s->function;
         next_token(s);
         me_expr *e = bitwise_xor(s);
         CHECK_NULL(e, me_free(ret));
@@ -1327,7 +1545,7 @@ static me_expr *bitwise_or(state *s) {
         ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
         CHECK_NULL(ret, me_free(e), me_free(prev));
 
-        ret->function = t;
+        ret->function = (void *)t;
         apply_type_promotion(ret);
     }
 
@@ -1341,7 +1559,7 @@ static me_expr *comparison(state *s) {
     CHECK_NULL(ret);
 
     while (s->type == TOK_COMPARE) {
-        me_fun2 t = s->function;
+        me_fun2 t = (me_fun2)s->function;
         next_token(s);
         me_expr *e = bitwise_or(s);
         CHECK_NULL(e, me_free(ret));
@@ -1350,7 +1568,7 @@ static me_expr *comparison(state *s) {
         ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
         CHECK_NULL(ret, me_free(e), me_free(prev));
 
-        ret->function = t;
+        ret->function = (void *)t;
         apply_type_promotion(ret);
         /* Comparisons always return bool */
         ret->dtype = ME_BOOL;
@@ -1741,25 +1959,25 @@ DEFINE_INT_VEC_OPS(u64, uint64_t)
 /* Boolean logical operations */
 static void vec_and_bool(const bool *a, const bool *b, bool *out, int n) {
     int i;
-#pragma GCC ivdep
+    IVDEP
     for (i = 0; i < n; i++) out[i] = a[i] && b[i];
 }
 
 static void vec_or_bool(const bool *a, const bool *b, bool *out, int n) {
     int i;
-#pragma GCC ivdep
+    IVDEP
     for (i = 0; i < n; i++) out[i] = a[i] || b[i];
 }
 
 static void vec_xor_bool(const bool *a, const bool *b, bool *out, int n) {
     int i;
-#pragma GCC ivdep
+    IVDEP
     for (i = 0; i < n; i++) out[i] = a[i] != b[i];
 }
 
 static void vec_not_bool(const bool *a, bool *out, int n) {
     int i;
-#pragma GCC ivdep
+    IVDEP
     for (i = 0; i < n; i++) out[i] = !a[i];
 }
 
@@ -1768,32 +1986,32 @@ static void vec_not_bool(const bool *a, bool *out, int n) {
 #define DEFINE_COMPARE_OPS(SUFFIX, TYPE) \
 static void vec_cmp_eq_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
     int i; \
-    _Pragma("GCC ivdep") \
+    IVDEP \
     for (i = 0; i < n; i++) out[i] = (a[i] == b[i]) ? 1 : 0; \
 } \
 static void vec_cmp_ne_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
     int i; \
-    _Pragma("GCC ivdep") \
+    IVDEP \
     for (i = 0; i < n; i++) out[i] = (a[i] != b[i]) ? 1 : 0; \
 } \
 static void vec_cmp_lt_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
     int i; \
-    _Pragma("GCC ivdep") \
+    IVDEP \
     for (i = 0; i < n; i++) out[i] = (a[i] < b[i]) ? 1 : 0; \
 } \
 static void vec_cmp_le_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
     int i; \
-    _Pragma("GCC ivdep") \
+    IVDEP \
     for (i = 0; i < n; i++) out[i] = (a[i] <= b[i]) ? 1 : 0; \
 } \
 static void vec_cmp_gt_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
     int i; \
-    _Pragma("GCC ivdep") \
+    IVDEP \
     for (i = 0; i < n; i++) out[i] = (a[i] > b[i]) ? 1 : 0; \
 } \
 static void vec_cmp_ge_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
     int i; \
-    _Pragma("GCC ivdep") \
+    IVDEP \
     for (i = 0; i < n; i++) out[i] = (a[i] >= b[i]) ? 1 : 0; \
 }
 
@@ -1810,148 +2028,148 @@ DEFINE_COMPARE_OPS(f32, float)
 DEFINE_COMPARE_OPS(f64, double)
 
 /* Complex operations */
-static void vec_add_c64(const float complex *a, const float complex *b, float complex *out, int n) {
+static void vec_add_c64(const float _Complex *a, const float _Complex *b, float _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = a[i] + b[i];
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = add_c64(a[i], b[i]);
 }
 
-static void vec_sub_c64(const float complex *a, const float complex *b, float complex *out, int n) {
+static void vec_sub_c64(const float _Complex *a, const float _Complex *b, float _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = a[i] - b[i];
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = sub_c64(a[i], b[i]);
 }
 
-static void vec_mul_c64(const float complex *a, const float complex *b, float complex *out, int n) {
+static void vec_mul_c64(const float _Complex *a, const float _Complex *b, float _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = a[i] * b[i];
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = mul_c64(a[i], b[i]);
 }
 
-static void vec_div_c64(const float complex *a, const float complex *b, float complex *out, int n) {
+static void vec_div_c64(const float _Complex *a, const float _Complex *b, float _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = a[i] / b[i];
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = div_c64(a[i], b[i]);
 }
 
-static void vec_add_scalar_c64(const float complex *a, float complex b, float complex *out, int n) {
+static void vec_add_scalar_c64(const float _Complex *a, float _Complex b, float _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = a[i] + b;
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = add_c64(a[i], b);
 }
 
-static void vec_mul_scalar_c64(const float complex *a, float complex b, float complex *out, int n) {
+static void vec_mul_scalar_c64(const float _Complex *a, float _Complex b, float _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = a[i] * b;
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = mul_c64(a[i], b);
 }
 
-static void vec_pow_c64(const float complex *a, const float complex *b, float complex *out, int n) {
+static void vec_pow_c64(const float _Complex *a, const float _Complex *b, float _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = cpowf(a[i], b[i]);
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = me_cpowf(a[i], b[i]);
 }
 
-static void vec_pow_scalar_c64(const float complex *a, float complex b, float complex *out, int n) {
+static void vec_pow_scalar_c64(const float _Complex *a, float _Complex b, float _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = cpowf(a[i], b);
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = me_cpowf(a[i], b);
 }
 
-static void vec_sqrt_c64(const float complex *a, float complex *out, int n) {
+static void vec_sqrt_c64(const float _Complex *a, float _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = csqrtf(a[i]);
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = me_csqrtf(a[i]);
 }
 
-static void vec_negame_c64(const float complex *a, float complex *out, int n) {
+static void vec_negame_c64(const float _Complex *a, float _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = -a[i];
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = neg_c64(a[i]);
 }
 
-static void vec_conj_c64(const float complex *a, float complex *out, int n) {
+static void vec_conj_c64(const float _Complex *a, float _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = conjf(a[i]);
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = me_conjf(a[i]);
 }
 
-static void vec_imag_c64(const float complex *a, float *out, int n) {
+static void vec_imag_c64(const float _Complex *a, float *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = cimagf(a[i]);
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = me_cimagf(a[i]);
 }
 
-static void vec_add_c128(const double complex *a, const double complex *b, double complex *out, int n) {
+static void vec_add_c128(const double _Complex *a, const double _Complex *b, double _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = a[i] + b[i];
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = add_c128(a[i], b[i]);
 }
 
-static void vec_sub_c128(const double complex *a, const double complex *b, double complex *out, int n) {
+static void vec_sub_c128(const double _Complex *a, const double _Complex *b, double _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = a[i] - b[i];
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = sub_c128(a[i], b[i]);
 }
 
-static void vec_mul_c128(const double complex *a, const double complex *b, double complex *out, int n) {
+static void vec_mul_c128(const double _Complex *a, const double _Complex *b, double _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = a[i] * b[i];
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = mul_c128(a[i], b[i]);
 }
 
-static void vec_div_c128(const double complex *a, const double complex *b, double complex *out, int n) {
+static void vec_div_c128(const double _Complex *a, const double _Complex *b, double _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = a[i] / b[i];
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = div_c128(a[i], b[i]);
 }
 
-static void vec_add_scalar_c128(const double complex *a, double complex b, double complex *out, int n) {
+static void vec_add_scalar_c128(const double _Complex *a, double _Complex b, double _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = a[i] + b;
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = add_c128(a[i], b);
 }
 
-static void vec_mul_scalar_c128(const double complex *a, double complex b, double complex *out, int n) {
+static void vec_mul_scalar_c128(const double _Complex *a, double _Complex b, double _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = a[i] * b;
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = mul_c128(a[i], b);
 }
 
-static void vec_pow_c128(const double complex *a, const double complex *b, double complex *out, int n) {
+static void vec_pow_c128(const double _Complex *a, const double _Complex *b, double _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = cpow(a[i], b[i]);
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = me_cpow(a[i], b[i]);
 }
 
-static void vec_pow_scalar_c128(const double complex *a, double complex b, double complex *out, int n) {
+static void vec_pow_scalar_c128(const double _Complex *a, double _Complex b, double _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = cpow(a[i], b);
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = me_cpow(a[i], b);
 }
 
-static void vec_sqrt_c128(const double complex *a, double complex *out, int n) {
+static void vec_sqrt_c128(const double _Complex *a, double _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = csqrt(a[i]);
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = me_csqrt(a[i]);
 }
 
-static void vec_negame_c128(const double complex *a, double complex *out, int n) {
+static void vec_negame_c128(const double _Complex *a, double _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = -a[i];
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = neg_c128(a[i]);
 }
 
-static void vec_conj_c128(const double complex *a, double complex *out, int n) {
+static void vec_conj_c128(const double _Complex *a, double _Complex *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = conj(a[i]);
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = me_conj(a[i]);
 }
 
-static void vec_imag_c128(const double complex *a, double *out, int n) {
+static void vec_imag_c128(const double _Complex *a, double *out, int n) {
     int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) out[i] = cimag(a[i]);
+    IVDEP
+    for (i = 0; i < n; i++) out[i] = me_cimag(a[i]);
 }
 
 /* ============================================================================
@@ -1963,10 +2181,11 @@ static void vec_imag_c128(const double complex *a, double *out, int n) {
 #define DEFINE_VEC_CONVERT(FROM_SUFFIX, TO_SUFFIX, FROM_TYPE, TO_TYPE) \
 static void vec_convert_##FROM_SUFFIX##_to_##TO_SUFFIX(const FROM_TYPE *in, TO_TYPE *out, int n) { \
     int i; \
-    _Pragma("GCC ivdep") \
-    for (i = 0; i < n; i++) out[i] = (TO_TYPE)in[i]; \
+    IVDEP \
+    for (i = 0; i < n; i++) out[i] = TO_TYPE_##TO_SUFFIX(in[i]); \
 }
 
+
 /* Generate all conversion functions */
 /* Conversions FROM bool TO other types */
 DEFINE_VEC_CONVERT(bool, i8, bool, int8_t)
@@ -2033,12 +2252,12 @@ DEFINE_VEC_CONVERT(u32, f64, uint32_t, double)
 DEFINE_VEC_CONVERT(u64, f64, uint64_t, double)
 
 DEFINE_VEC_CONVERT(f32, f64, float, double)
-DEFINE_VEC_CONVERT(f32, c64, float, float complex)
-DEFINE_VEC_CONVERT(f32, c128, float, double complex)
+DEFINE_VEC_CONVERT(f32, c64, float, float _Complex)
+DEFINE_VEC_CONVERT(f32, c128, float, double _Complex)
 
-DEFINE_VEC_CONVERT(f64, c128, double, double complex)
+DEFINE_VEC_CONVERT(f64, c128, double, double _Complex)
 
-DEFINE_VEC_CONVERT(c64, c128, float complex, double complex)
+DEFINE_VEC_CONVERT(c64, c128, float _Complex, double _Complex)
 
 /* Function to get conversion function pointer */
 typedef void (*convert_func_t)(const void *, void *, int);
@@ -2146,7 +2365,7 @@ static void me_eval_##SUFFIX(const me_expr *n) { \
     switch(TYPE_MASK(n->type)) { \
         case ME_CONSTANT: \
             { \
-                TYPE val = (TYPE)n->value; \
+                TYPE val = TO_TYPE_##SUFFIX(n->value); \
                 for (i = 0; i < n->nitems; i++) { \
                     output[i] = val; \
                 } \
@@ -2193,9 +2412,9 @@ static void me_eval_##SUFFIX(const me_expr *n) { \
                     if (ldata && rdata) { \
                         VEC_ADD(ldata, rdata, output, n->nitems); \
                     } else if (ldata && right->type == ME_CONSTANT) { \
-                        VEC_ADD_SCALAR(ldata, (TYPE)right->value, output, n->nitems); \
+                        VEC_ADD_SCALAR(ldata, TO_TYPE_##SUFFIX(right->value), output, n->nitems); \
                     } else if (left->type == ME_CONSTANT && rdata) { \
-                        VEC_ADD_SCALAR(rdata, (TYPE)left->value, output, n->nitems); \
+                        VEC_ADD_SCALAR(rdata, TO_TYPE_##SUFFIX(left->value), output, n->nitems); \
                     } else { \
                         goto general_case_binary_##SUFFIX; \
                     } \
@@ -2209,9 +2428,9 @@ static void me_eval_##SUFFIX(const me_expr *n) { \
                     if (ldata && rdata) { \
                         VEC_MUL(ldata, rdata, output, n->nitems); \
                     } else if (ldata && right->type == ME_CONSTANT) { \
-                        VEC_MUL_SCALAR(ldata, (TYPE)right->value, output, n->nitems); \
+                        VEC_MUL_SCALAR(ldata, TO_TYPE_##SUFFIX(right->value), output, n->nitems); \
                     } else if (left->type == ME_CONSTANT && rdata) { \
-                        VEC_MUL_SCALAR(rdata, (TYPE)left->value, output, n->nitems); \
+                        VEC_MUL_SCALAR(rdata, TO_TYPE_##SUFFIX(left->value), output, n->nitems); \
                     } else { \
                         goto general_case_binary_##SUFFIX; \
                     } \
@@ -2225,7 +2444,7 @@ static void me_eval_##SUFFIX(const me_expr *n) { \
                     if (ldata && rdata) { \
                         VEC_POW(ldata, rdata, output, n->nitems); \
                     } else if (ldata && right->type == ME_CONSTANT) { \
-                        VEC_POW_SCALAR(ldata, (TYPE)right->value, output, n->nitems); \
+                        VEC_POW_SCALAR(ldata, TO_TYPE_##SUFFIX(right->value), output, n->nitems); \
                     } else { \
                         goto general_case_binary_##SUFFIX; \
                     } \
@@ -2233,10 +2452,10 @@ static void me_eval_##SUFFIX(const me_expr *n) { \
                     general_case_binary_##SUFFIX: \
                     for (i = 0; i < n->nitems; i++) { \
                         double a = (left->type == ME_CONSTANT) ? left->value : \
-                                  (left->type == ME_VARIABLE) ? (double)ldata[i] : (double)ldata[i]; \
+                                  FROM_TYPE_##SUFFIX(ldata[i]); \
                         double b = (right->type == ME_CONSTANT) ? right->value : \
-                                  (right->type == ME_VARIABLE) ? (double)rdata[i] : (double)rdata[i]; \
-                        output[i] = (TYPE)func(a, b); \
+                                  FROM_TYPE_##SUFFIX(rdata[i]); \
+                        output[i] = TO_TYPE_##SUFFIX(func(a, b)); \
                     } \
                 } \
             } else if (arity == 3 && IS_FUNCTION(n->type) && n->function == (void*)where_scalar) { \
@@ -2250,7 +2469,7 @@ static void me_eval_##SUFFIX(const me_expr *n) { \
                 const TYPE *ydata = (const TYPE*)((yexpr->type == ME_VARIABLE) ? yexpr->bound : yexpr->output); \
                 \
                 for (i = 0; i < n->nitems; i++) { \
-                    output[i] = (cdata[i] != (TYPE)0) ? xdata[i] : ydata[i]; \
+                    output[i] = (IS_NONZERO_##SUFFIX(cdata[i])) ? xdata[i] : ydata[i]; \
                 } \
             } \
             else if (arity == 1 && IS_FUNCTION(n->type)) { \
@@ -2273,7 +2492,7 @@ static void me_eval_##SUFFIX(const me_expr *n) { \
                     /* NumPy semantics: imag(real) == 0 with same dtype */ \
                     if (adata) { \
                         for (i = 0; i < n->nitems; i++) { \
-                            output[i] = (TYPE)0; \
+                            output[i] = TO_TYPE_##SUFFIX(0); \
                         } \
                     } \
                 } else if (func_ptr == (void*)real_wrapper) { \
@@ -2288,13 +2507,13 @@ static void me_eval_##SUFFIX(const me_expr *n) { \
                 } else { \
                     me_fun1 func = (me_fun1)func_ptr; \
                     if (arg->type == ME_CONSTANT) { \
-                        TYPE val = (TYPE)func(arg->value); \
+                        TYPE val = TO_TYPE_##SUFFIX(func(arg->value)); \
                         for (i = 0; i < n->nitems; i++) { \
                             output[i] = val; \
                         } \
                     } else { \
                         for (i = 0; i < n->nitems; i++) { \
-                            output[i] = (TYPE)func((double)adata[i]); \
+                            output[i] = TO_TYPE_##SUFFIX(func(FROM_TYPE_##SUFFIX(adata[i]))); \
                         } \
                     } \
                 } \
@@ -2309,30 +2528,30 @@ static void me_eval_##SUFFIX(const me_expr *n) { \
                         if (param->type == ME_CONSTANT) { \
                             args[j] = param->value; \
                         } else { \
-                            args[j] = (double)pdata[i]; \
+                            args[j] = FROM_TYPE_##SUFFIX(pdata[i]); \
                         } \
                     } \
                     \
                     if (IS_FUNCTION(n->type)) { \
                         switch(arity) { \
-                            case 0: output[i] = (TYPE)((double(*)(void))n->function)(); break; \
-                            case 3: output[i] = (TYPE)((double(*)(double,double,double))n->function)(args[0], args[1], args[2]); break; \
-                            case 4: output[i] = (TYPE)((double(*)(double,double,double,double))n->function)(args[0], args[1], args[2], args[3]); break; \
-                            case 5: output[i] = (TYPE)((double(*)(double,double,double,double,double))n->function)(args[0], args[1], args[2], args[3], args[4]); break; \
-                            case 6: output[i] = (TYPE)((double(*)(double,double,double,double,double,double))n->function)(args[0], args[1], args[2], args[3], args[4], args[5]); break; \
-                            case 7: output[i] = (TYPE)((double(*)(double,double,double,double,double,double,double))n->function)(args[0], args[1], args[2], args[3], args[4], args[5], args[6]); break; \
+                            case 0: output[i] = TO_TYPE_##SUFFIX(((double(*)(void))n->function)()); break; \
+                            case 3: output[i] = TO_TYPE_##SUFFIX(((double(*)(double,double,double))n->function)(args[0], args[1], args[2])); break; \
+                            case 4: output[i] = TO_TYPE_##SUFFIX(((double(*)(double,double,double,double))n->function)(args[0], args[1], args[2], args[3])); break; \
+                            case 5: output[i] = TO_TYPE_##SUFFIX(((double(*)(double,double,double,double,double))n->function)(args[0], args[1], args[2], args[3], args[4])); break; \
+                            case 6: output[i] = TO_TYPE_##SUFFIX(((double(*)(double,double,double,double,double,double))n->function)(args[0], args[1], args[2], args[3], args[4], args[5])); break; \
+                            case 7: output[i] = TO_TYPE_##SUFFIX(((double(*)(double,double,double,double,double,double,double))n->function)(args[0], args[1], args[2], args[3], args[4], args[5], args[6])); break; \
                         } \
                     } else if (IS_CLOSURE(n->type)) { \
                         void *context = n->parameters[arity]; \
                         switch(arity) { \
-                            case 0: output[i] = (TYPE)((double(*)(void*))n->function)(context); break; \
-                            case 1: output[i] = (TYPE)((double(*)(void*,double))n->function)(context, args[0]); break; \
-                            case 2: output[i] = (TYPE)((double(*)(void*,double,double))n->function)(context, args[0], args[1]); break; \
-                            case 3: output[i] = (TYPE)((double(*)(void*,double,double,double))n->function)(context, args[0], args[1], args[2]); break; \
-                            case 4: output[i] = (TYPE)((double(*)(void*,double,double,double,double))n->function)(context, args[0], args[1], args[2], args[3]); break; \
-                            case 5: output[i] = (TYPE)((double(*)(void*,double,double,double,double,double))n->function)(context, args[0], args[1], args[2], args[3], args[4]); break; \
-                            case 6: output[i] = (TYPE)((double(*)(void*,double,double,double,double,double,double))n->function)(context, args[0], args[1], args[2], args[3], args[4], args[5]); break; \
-                            case 7: output[i] = (TYPE)((double(*)(void*,double,double,double,double,double,double,double))n->function)(context, args[0], args[1], args[2], args[3], args[4], args[5], args[6]); break; \
+                            case 0: output[i] = TO_TYPE_##SUFFIX(((double(*)(void*))n->function)(context)); break; \
+                            case 1: output[i] = TO_TYPE_##SUFFIX(((double(*)(void*,double))n->function)(context, args[0])); break; \
+                            case 2: output[i] = TO_TYPE_##SUFFIX(((double(*)(void*,double,double))n->function)(context, args[0], args[1])); break; \
+                            case 3: output[i] = TO_TYPE_##SUFFIX(((double(*)(void*,double,double,double))n->function)(context, args[0], args[1], args[2])); break; \
+                            case 4: output[i] = TO_TYPE_##SUFFIX(((double(*)(void*,double,double,double,double))n->function)(context, args[0], args[1], args[2], args[3])); break; \
+                            case 5: output[i] = TO_TYPE_##SUFFIX(((double(*)(void*,double,double,double,double,double))n->function)(context, args[0], args[1], args[2], args[3], args[4])); break; \
+                            case 6: output[i] = TO_TYPE_##SUFFIX(((double(*)(void*,double,double,double,double,double,double))n->function)(context, args[0], args[1], args[2], args[3], args[4], args[5])); break; \
+                            case 7: output[i] = TO_TYPE_##SUFFIX(((double(*)(void*,double,double,double,double,double,double,double))n->function)(context, args[0], args[1], args[2], args[3], args[4], args[5], args[6])); break; \
                         } \
                     } \
                 } \
@@ -2341,7 +2560,7 @@ static void me_eval_##SUFFIX(const me_expr *n) { \
         \
         default: \
             for (i = 0; i < n->nitems; i++) { \
-                output[i] = (TYPE)NAN; \
+                output[i] = TO_TYPE_##SUFFIX(NAN); \
             } \
             break; \
     } \
@@ -2463,34 +2682,65 @@ static void me_eval_##SUFFIX(const me_expr *n) { \
 #define vec_sqrt_u64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint64_t)sqrt((a)[_i]); } while(0)
 #define vec_negame_u64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
 
+#if defined(_MSC_VER) && !defined(__clang__)
+#define vec_add_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = add_c64((a)[_i], (b)[_i]); } while(0)
+#define vec_sub_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = sub_c64((a)[_i], (b)[_i]); } while(0)
+#define vec_mul_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = mul_c64((a)[_i], (b)[_i]); } while(0)
+#define vec_div_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = div_c64((a)[_i], (b)[_i]); } while(0)
+#define vec_pow_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cpowf((a)[_i], (b)[_i]); } while(0)
+#define vec_add_scalar_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = add_c64((a)[_i], (b)); } while(0)
+#define vec_mul_scalar_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = mul_c64((a)[_i], (b)); } while(0)
+#define vec_pow_scalar_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cpowf((a)[_i], (b)); } while(0)
+#define vec_sqrt_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = csqrtf((a)[_i]); } while(0)
+#define vec_negame_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = neg_c64((a)[_i]); } while(0)
+#define vec_conj_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = conjf((a)[_i]); } while(0)
+#define vec_imag_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_cimagf((a)[_i]); } while(0)
+#define vec_real_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_crealf((a)[_i]); } while(0)
+#define vec_conj_noop(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i]; } while(0)
+
+#define vec_add_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = add_c128((a)[_i], (b)[_i]); } while(0)
+#define vec_sub_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = sub_c128((a)[_i], (b)[_i]); } while(0)
+#define vec_mul_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = mul_c128((a)[_i], (b)[_i]); } while(0)
+#define vec_div_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = div_c128((a)[_i], (b)[_i]); } while(0)
+#define vec_pow_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cpow((a)[_i], (b)[_i]); } while(0)
+#define vec_add_scalar_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = add_c128((a)[_i], (b)); } while(0)
+#define vec_mul_scalar_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = mul_c128((a)[_i], (b)); } while(0)
+#define vec_pow_scalar_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cpow((a)[_i], (b)); } while(0)
+#define vec_sqrt_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = csqrt((a)[_i]); } while(0)
+#define vec_negame_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = neg_c128((a)[_i]); } while(0)
+#define vec_conj_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = conj((a)[_i]); } while(0)
+#define vec_imag_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_cimag((a)[_i]); } while(0)
+#define vec_real_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_creal((a)[_i]); } while(0)
+#else
 #define vec_add_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
 #define vec_sub_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
 #define vec_mul_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
 #define vec_div_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
-#define vec_pow_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cpowf((a)[_i], (b)[_i]); } while(0)
+#define vec_pow_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_cpowf((a)[_i], (b)[_i]); } while(0)
 #define vec_add_scalar_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
 #define vec_mul_scalar_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
-#define vec_pow_scalar_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cpowf((a)[_i], (b)); } while(0)
-#define vec_sqrt_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = csqrtf((a)[_i]); } while(0)
+#define vec_pow_scalar_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_cpowf((a)[_i], (b)); } while(0)
+#define vec_sqrt_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_csqrtf((a)[_i]); } while(0)
 #define vec_negame_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
-#define vec_conj_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = conjf((a)[_i]); } while(0)
-#define vec_imag_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cimagf((a)[_i]); } while(0)
-#define vec_real_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = crealf((a)[_i]); } while(0)
-#define vec_conj_noop(a, out, n) do { (void)(a); (void)(out); (void)(n); } while(0)
+#define vec_conj_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_conjf((a)[_i]); } while(0)
+#define vec_imag_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_cimagf((a)[_i]); } while(0)
+#define vec_real_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_crealf((a)[_i]); } while(0)
+#define vec_conj_noop(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i]; } while(0)
 
 #define vec_add_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
 #define vec_sub_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
 #define vec_mul_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
 #define vec_div_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
-#define vec_pow_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cpow((a)[_i], (b)[_i]); } while(0)
+#define vec_pow_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_cpow((a)[_i], (b)[_i]); } while(0)
 #define vec_add_scalar_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
 #define vec_mul_scalar_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
-#define vec_pow_scalar_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cpow((a)[_i], (b)); } while(0)
-#define vec_sqrt_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = csqrt((a)[_i]); } while(0)
+#define vec_pow_scalar_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_cpow((a)[_i], (b)); } while(0)
+#define vec_sqrt_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_csqrt((a)[_i]); } while(0)
 #define vec_negame_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
-#define vec_conj_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = conj((a)[_i]); } while(0)
-#define vec_imag_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cimag((a)[_i]); } while(0)
-#define vec_real_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = creal((a)[_i]); } while(0)
+#define vec_conj_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_conj((a)[_i]); } while(0)
+#define vec_imag_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_cimag((a)[_i]); } while(0)
+#define vec_real_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_creal((a)[_i]); } while(0)
+#endif
 
 /* Generate float32 evaluator */
 DEFINE_ME_EVAL(f32, float,
@@ -2566,18 +2816,18 @@ DEFINE_ME_EVAL(u64, uint64_t,
                vec_conj_noop)
 
 /* Generate complex evaluators */
-DEFINE_ME_EVAL(c64, float complex,
+DEFINE_ME_EVAL(c64, float _Complex,
                vec_add_c64, vec_sub_c64, vec_mul_c64, vec_div_c64, vec_pow_c64,
                vec_add_scalar_c64, vec_mul_scalar_c64, vec_pow_scalar_c64,
                vec_sqrt_c64, vec_sqrt_c64, vec_sqrt_c64, vec_negame_c64,
-               csqrtf, csqrtf, csqrtf, cexpf, clogf, cabsf, cpowf,
+               me_csqrtf, me_csqrtf, me_csqrtf, me_cexpf, me_clogf, me_cabsf, me_cpowf,
                vec_conj_c64)
 
-DEFINE_ME_EVAL(c128, double complex,
+DEFINE_ME_EVAL(c128, double _Complex,
                vec_add_c128, vec_sub_c128, vec_mul_c128, vec_div_c128, vec_pow_c128,
                vec_add_scalar_c128, vec_mul_scalar_c128, vec_pow_scalar_c128,
                vec_sqrt_c128, vec_sqrt_c128, vec_sqrt_c128, vec_negame_c128,
-               csqrt, csqrt, csqrt, cexp, clog, cabs, cpow,
+               me_csqrt, me_csqrt, me_csqrt, me_cexp, me_clog, me_cabs, me_cpow,
                vec_conj_c128)
 
 /* Public API - dispatches to correct type-specific evaluator */
@@ -2779,44 +3029,60 @@ static void private_eval(const me_expr *n) {
             if (arg_type == ME_COMPLEX64) {
                 // Evaluate argument as complex64
                 if (!arg->output) {
-                    arg->output = malloc(n->nitems * sizeof(float complex));
+                    arg->output = malloc(n->nitems * sizeof(float _Complex));
                     arg->nitems = n->nitems;
                     ((me_expr*)arg)->dtype = ME_COMPLEX64;
                 }
                 me_eval_c64(arg);
 
                 // Extract real/imaginary part to float32 output
-                const float complex *cdata = (const float complex*)arg->output;
+                const float _Complex *cdata = (const float _Complex*)arg->output;
                 float *output = (float*)n->output;
                 if (n->function == (void*)imag_wrapper) {
                     for (int i = 0; i < n->nitems; i++) {
+#if defined(_MSC_VER) && defined(__clang__)
+                        output[i] = __builtin_cimagf(cdata[i]);
+#else
                         output[i] = cimagf(cdata[i]);
+#endif
                     }
                 } else { // real_wrapper
                     for (int i = 0; i < n->nitems; i++) {
+#if defined(_MSC_VER) && defined(__clang__)
+                        output[i] = __builtin_crealf(cdata[i]);
+#else
                         output[i] = crealf(cdata[i]);
+#endif
                     }
                 }
                 return;
             } else if (arg_type == ME_COMPLEX128) {
                 // Evaluate argument as complex128
                 if (!arg->output) {
-                    arg->output = malloc(n->nitems * sizeof(double complex));
+                    arg->output = malloc(n->nitems * sizeof(double _Complex));
                     arg->nitems = n->nitems;
                     ((me_expr*)arg)->dtype = ME_COMPLEX128;
                 }
                 me_eval_c128(arg);
 
                 // Extract real/imaginary part to float64 output
-                const double complex *cdata = (const double complex*)arg->output;
+                const double _Complex *cdata = (const double _Complex*)arg->output;
                 double *output = (double*)n->output;
                 if (n->function == (void*)imag_wrapper) {
                     for (int i = 0; i < n->nitems; i++) {
+#if defined(_MSC_VER) && defined(__clang__)
+                        output[i] = __builtin_cimag(cdata[i]);
+#else
                         output[i] = cimag(cdata[i]);
+#endif
                     }
                 } else { // real_wrapper
                     for (int i = 0; i < n->nitems; i++) {
+#if defined(_MSC_VER) && defined(__clang__)
+                        output[i] = __builtin_creal(cdata[i]);
+#else
                         output[i] = creal(cdata[i]);
+#endif
                     }
                 }
                 return;
diff --git a/src/blosc2/miniexpr.h b/src/blosc2/miniexpr.h
index c6b73722..b7499996 100644
--- a/src/blosc2/miniexpr.h
+++ b/src/blosc2/miniexpr.h
@@ -1,7 +1,7 @@
 /*********************************************************************
   Blosc - Blocked Shuffling and Compression Library
 
-  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
+  Copyright (c) 2025  Blosc Development Team <blosc@blosc.org>
   https://blosc.org
   License: BSD 3-Clause (see LICENSE.txt)
 

From d65f2dce839054cc7c299a604a9949a2c8015246 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 09:20:31 +0100
Subject: [PATCH 028/123] Upgrade version of miniexpr

---
 src/blosc2/miniexpr.c | 174 +++++++++++++++++++++++++++++++++++++-----
 src/blosc2/miniexpr.h |  15 ++++
 2 files changed, 171 insertions(+), 18 deletions(-)

diff --git a/src/blosc2/miniexpr.c b/src/blosc2/miniexpr.c
index 81dd5f76..03c720dc 100755
--- a/src/blosc2/miniexpr.c
+++ b/src/blosc2/miniexpr.c
@@ -3145,13 +3145,13 @@ static void private_eval(const me_expr *n) {
     }
 
     // Slow path: need to promote variables
-    // Allocate tracking structures (max 100 variables)
-    promoted_var_t promotions[100];
+    // Allocate tracking structures (max ME_MAX_VARS variables)
+    promoted_var_t promotions[ME_MAX_VARS];
     int promo_count = 0;
 
     // Save original variable bindings
-    const void *original_bounds[100];
-    me_dtype original_types[100];
+    const void *original_bounds[ME_MAX_VARS];
+    me_dtype original_types[ME_MAX_VARS];
     int save_idx = 0;
 
     save_variable_bindings(n, original_bounds, original_types, &save_idx);
@@ -3354,7 +3354,7 @@ static void free_intermediate_buffers(me_expr *node) {
 }
 
 /* Helper to save original variable bindings with their pointers */
-static void save_variable_pointers(const me_expr *node, const void **var_pointers, int *var_count) {
+static void save_variable_metadata(const me_expr *node, const void **var_pointers, size_t *var_sizes, int *var_count) {
     if (!node) return;
     switch (TYPE_MASK(node->type)) {
         case ME_VARIABLE:
@@ -3363,6 +3363,7 @@ static void save_variable_pointers(const me_expr *node, const void **var_pointer
                 if (var_pointers[i] == node->bound) return; // Already saved
             }
             var_pointers[*var_count] = node->bound;
+            var_sizes[*var_count] = dtype_size(node->input_dtype);
             (*var_count)++;
             break;
         case ME_FUNCTION0:
@@ -3383,7 +3384,84 @@ static void save_variable_pointers(const me_expr *node, const void **var_pointer
         case ME_CLOSURE7: {
             const int arity = ARITY(node->type);
             for (int i = 0; i < arity; i++) {
-                save_variable_pointers((const me_expr *) node->parameters[i], var_pointers, var_count);
+                save_variable_metadata((const me_expr *) node->parameters[i], var_pointers, var_sizes, var_count);
+            }
+            break;
+        }
+    }
+}
+
+static int count_variable_nodes(const me_expr *node) {
+    if (!node) return 0;
+    switch (TYPE_MASK(node->type)) {
+        case ME_VARIABLE:
+            return 1;
+        case ME_FUNCTION0:
+        case ME_FUNCTION1:
+        case ME_FUNCTION2:
+        case ME_FUNCTION3:
+        case ME_FUNCTION4:
+        case ME_FUNCTION5:
+        case ME_FUNCTION6:
+        case ME_FUNCTION7:
+        case ME_CLOSURE0:
+        case ME_CLOSURE1:
+        case ME_CLOSURE2:
+        case ME_CLOSURE3:
+        case ME_CLOSURE4:
+        case ME_CLOSURE5:
+        case ME_CLOSURE6:
+        case ME_CLOSURE7: {
+            int count = 0;
+            const int arity = ARITY(node->type);
+            for (int i = 0; i < arity; i++) {
+                count += count_variable_nodes((const me_expr *) node->parameters[i]);
+            }
+            return count;
+        }
+    }
+    return 0;
+}
+
+static void collect_variable_nodes(me_expr *node, const void **var_pointers, int n_vars,
+                                   me_expr **var_nodes, int *var_indices, int *node_count) {
+    if (!node) return;
+    switch (TYPE_MASK(node->type)) {
+        case ME_VARIABLE: {
+            int idx = -1;
+            for (int i = 0; i < n_vars; i++) {
+                if (node->bound == var_pointers[i]) {
+                    idx = i;
+                    break;
+                }
+            }
+            if (idx >= 0) {
+                var_nodes[*node_count] = node;
+                var_indices[*node_count] = idx;
+                (*node_count)++;
+            }
+            break;
+        }
+        case ME_FUNCTION0:
+        case ME_FUNCTION1:
+        case ME_FUNCTION2:
+        case ME_FUNCTION3:
+        case ME_FUNCTION4:
+        case ME_FUNCTION5:
+        case ME_FUNCTION6:
+        case ME_FUNCTION7:
+        case ME_CLOSURE0:
+        case ME_CLOSURE1:
+        case ME_CLOSURE2:
+        case ME_CLOSURE3:
+        case ME_CLOSURE4:
+        case ME_CLOSURE5:
+        case ME_CLOSURE6:
+        case ME_CLOSURE7: {
+            const int arity = ARITY(node->type);
+            for (int i = 0; i < arity; i++) {
+                collect_variable_nodes((me_expr *) node->parameters[i], var_pointers, n_vars,
+                                       var_nodes, var_indices, node_count);
             }
             break;
         }
@@ -3515,9 +3593,15 @@ void me_eval(const me_expr *expr, const void **vars_chunk,
     if (!expr) return;
 
     // Verify variable count matches
-    const void *original_var_pointers[100];
+    const void *original_var_pointers[ME_MAX_VARS];
+    size_t var_sizes[ME_MAX_VARS];
     int actual_var_count = 0;
-    save_variable_pointers(expr, original_var_pointers, &actual_var_count);
+    save_variable_metadata(expr, original_var_pointers, var_sizes, &actual_var_count);
+    if (actual_var_count > ME_MAX_VARS) {
+        fprintf(stderr, "Error: Expression uses %d variables, exceeds ME_MAX_VARS=%d\n",
+                actual_var_count, ME_MAX_VARS);
+        return;
+    }
 
     if (actual_var_count != n_vars) {
         return;
@@ -3527,18 +3611,72 @@ void me_eval(const me_expr *expr, const void **vars_chunk,
     me_expr *clone = clone_expr(expr);
     if (!clone) return;
 
-    // Update clone's variable bindings
-    update_vars_by_pointer(clone, original_var_pointers, vars_chunk, n_vars);
+    const int block_nitems = ME_EVAL_BLOCK_NITEMS;
+
+    if (!ME_EVAL_ENABLE_BLOCKING || chunk_nitems <= block_nitems) {
+        // Update clone's variable bindings
+        update_vars_by_pointer(clone, original_var_pointers, vars_chunk, n_vars);
+
+        // Update clone's nitems throughout the tree
+        int update_idx = 0;
+        update_variable_bindings(clone, NULL, &update_idx, chunk_nitems);
 
-    // Update clone's nitems throughout the tree
-    int update_idx = 0;
-    update_variable_bindings(clone, NULL, &update_idx, chunk_nitems);
+        // Set output pointer
+        clone->output = output_chunk;
+
+        // Evaluate the clone
+        private_eval(clone);
+    } else {
+        const size_t output_item_size = dtype_size(clone->dtype);
+        const int max_var_nodes = count_variable_nodes(clone);
+        me_expr **var_nodes = NULL;
+        int *var_indices = NULL;
+        int var_node_count = 0;
+
+        if (max_var_nodes > 0) {
+            var_nodes = malloc((size_t)max_var_nodes * sizeof(*var_nodes));
+            var_indices = malloc((size_t)max_var_nodes * sizeof(*var_indices));
+            if (!var_nodes || !var_indices) {
+                free(var_nodes);
+                free(var_indices);
+                me_free(clone);
+                return;
+            }
+            collect_variable_nodes(clone, original_var_pointers, n_vars,
+                                   var_nodes, var_indices, &var_node_count);
+        }
 
-    // Set output pointer
-    clone->output = output_chunk;
+#if defined(__clang__)
+#pragma clang loop unroll_count(4)
+#elif defined(__GNUC__) && !defined(__clang__)
+#pragma GCC unroll 4
+#endif
+        for (int offset = 0; offset < chunk_nitems; offset += block_nitems) {
+            int current = block_nitems;
+            if (offset + current > chunk_nitems) {
+                current = chunk_nitems - offset;
+            }
 
-    // Evaluate the clone
-    private_eval(clone);
+            const void *block_vars[ME_MAX_VARS];
+            for (int i = 0; i < n_vars; i++) {
+                const unsigned char *base = (const unsigned char *)vars_chunk[i];
+                block_vars[i] = base + (size_t)offset * var_sizes[i];
+            }
+
+            for (int i = 0; i < var_node_count; i++) {
+                var_nodes[i]->bound = block_vars[var_indices[i]];
+            }
+
+            int update_idx = 0;
+            update_variable_bindings(clone, NULL, &update_idx, current);
+
+            clone->output = (unsigned char *)output_chunk + (size_t)offset * output_item_size;
+            private_eval(clone);
+        }
+
+        free(var_nodes);
+        free(var_indices);
+    }
 
     // Free the clone (including any intermediate buffers it allocated)
     me_free(clone);
@@ -3678,7 +3816,7 @@ static me_expr *private_compile(const char *expression, const me_variable *varia
 }
 
 // Synthetic addresses for ordinal matching (when user provides NULL addresses)
-static char synthetic_var_addresses[100];
+static char synthetic_var_addresses[ME_MAX_VARS];
 
 me_expr *me_compile(const char *expression, const me_variable *variables,
                     int var_count, me_dtype dtype, int *error) {
diff --git a/src/blosc2/miniexpr.h b/src/blosc2/miniexpr.h
index b7499996..57fc7b65 100644
--- a/src/blosc2/miniexpr.h
+++ b/src/blosc2/miniexpr.h
@@ -44,6 +44,21 @@ extern "C" {
 
 #endif
 
+/* Internal eval block size (elements). Compile-time fixed. */
+#ifndef ME_EVAL_BLOCK_NITEMS
+#define ME_EVAL_BLOCK_NITEMS 1024
+#endif
+
+/* Maximum number of variables supported in a single expression. */
+#ifndef ME_MAX_VARS
+#define ME_MAX_VARS 128
+#endif
+
+/* Enable internal eval blocking for large chunks (1 = on, 0 = off). */
+#ifndef ME_EVAL_ENABLE_BLOCKING
+#define ME_EVAL_ENABLE_BLOCKING 1
+#endif
+
 
 /* Data type enumeration - Full C99 support */
 typedef enum {

From 147cb808842663dfc672cf5f4c3fbcaabf18d458 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 09:31:30 +0100
Subject: [PATCH 029/123] Changing MSVC to clang-cl on win (II)

---
 .github/workflows/cibuildwheels.yml |  5 +++++
 CMakeLists.txt                      | 23 ++++++++++++++---------
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/cibuildwheels.yml b/.github/workflows/cibuildwheels.yml
index f4bb64eb..814a351c 100644
--- a/.github/workflows/cibuildwheels.yml
+++ b/.github/workflows/cibuildwheels.yml
@@ -17,6 +17,7 @@ env:
   # Skip PyPy wheels for now (numexpr needs some adjustments first)
   # musllinux takes too long to build, and it's not worth it for now
   CIBW_SKIP: "pp* *musllinux* *-win32"
+  CIBW_ENVIRONMENT_WINDOWS: "CC=clang-cl CXX=clang-cl CMAKE_GENERATOR=Ninja CMAKE_C_COMPILER=clang-cl CMAKE_CXX_COMPILER=clang-cl"
 
 jobs:
 
@@ -77,6 +78,10 @@ jobs:
         id: ninja
         uses: turtlesec-no/get-ninja@main
 
+      - name: Install LLVM (clang-cl)
+        if: ${{ matrix.os == 'windows-latest' }}
+        run: choco install llvm --yes
+
       - name: Install MSVC amd64
         uses: ilammy/msvc-dev-cmd@v1
         with:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 407f0258..372d709f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,19 @@
 cmake_minimum_required(VERSION 3.15.0)
+
+if(WIN32)
+    if(NOT DEFINED CMAKE_C_COMPILER AND NOT DEFINED ENV{CC})
+        set(CMAKE_C_COMPILER clang-cl CACHE STRING "Use clang-cl for C99 support on Windows." FORCE)
+    endif()
+    if(NOT DEFINED CMAKE_CXX_COMPILER AND NOT DEFINED ENV{CXX})
+        set(CMAKE_CXX_COMPILER clang-cl CACHE STRING "Use clang-cl for C99 support on Windows." FORCE)
+    endif()
+endif()
+
 project(python-blosc2)
+
+if(WIN32 AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
+    message(FATAL_ERROR "Windows builds require clang-cl for C99 support. Set CC/CXX to clang-cl or use the ClangCL toolset.")
+endif()
 # Specifying Python version below is tricky, but if you don't specify the minimum version here,
 # it would not consider python3 when looking for the executable. This is problematic since Fedora
 # does not include a python symbolic link to python3.
@@ -32,15 +46,6 @@ target_link_libraries(blosc2_ext PRIVATE Python::NumPy)
 # Add include directory for miniexpr.h and others
 target_include_directories(blosc2_ext PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2")
 
-# Compiler-specific settings for clang-cl on Windows
-if(WIN32 AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-    # clang-cl mimics MSVC, so set flags to ensure compatibility
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-command-line-argument -Wno-microsoft-enum-value")
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-command-line-argument -Wno-microsoft-enum-value")
-    # Link clang runtime library for miniexpr sources
-    target_link_libraries(blosc2_ext PRIVATE clang_rt.builtins-x86_64.lib)
-endif()
-
 if(DEFINED ENV{USE_SYSTEM_BLOSC2})
     set(USE_SYSTEM_BLOSC2 ON)
 endif()

From 9e8f677d3448b1342286cb3f42f2becab86e6b83 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 09:36:52 +0100
Subject: [PATCH 030/123] Changing MSVC to clang-cl on win (III)

---
 .github/workflows/cibuildwheels.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/cibuildwheels.yml b/.github/workflows/cibuildwheels.yml
index 814a351c..2707d82d 100644
--- a/.github/workflows/cibuildwheels.yml
+++ b/.github/workflows/cibuildwheels.yml
@@ -17,7 +17,8 @@ env:
   # Skip PyPy wheels for now (numexpr needs some adjustments first)
   # musllinux takes too long to build, and it's not worth it for now
   CIBW_SKIP: "pp* *musllinux* *-win32"
-  CIBW_ENVIRONMENT_WINDOWS: "CC=clang-cl CXX=clang-cl CMAKE_GENERATOR=Ninja CMAKE_C_COMPILER=clang-cl CMAKE_CXX_COMPILER=clang-cl"
+  CIBW_ENVIRONMENT_WINDOWS: "PATH=C:\\Program Files\\LLVM\\bin;%PATH% CC=clang-cl CXX=clang-cl CMAKE_GENERATOR=Ninja CMAKE_C_COMPILER=clang-cl CMAKE_CXX_COMPILER=clang-cl"
+  CIBW_CONFIG_SETTINGS_WINDOWS: "cmake.generator=Ninja;cmake.define.CMAKE_C_COMPILER=clang-cl;cmake.define.CMAKE_CXX_COMPILER=clang-cl"
 
 jobs:
 

From 8e5ed3bc399ad88ab689996cd2dfdf620dded616 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 09:48:05 +0100
Subject: [PATCH 031/123] Changing MSVC to clang-cl on win (IV)

---
 .github/workflows/cibuildwheels.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/cibuildwheels.yml b/.github/workflows/cibuildwheels.yml
index 2707d82d..a56f5e4a 100644
--- a/.github/workflows/cibuildwheels.yml
+++ b/.github/workflows/cibuildwheels.yml
@@ -17,7 +17,8 @@ env:
   # Skip PyPy wheels for now (numexpr needs some adjustments first)
   # musllinux takes too long to build, and it's not worth it for now
   CIBW_SKIP: "pp* *musllinux* *-win32"
-  CIBW_ENVIRONMENT_WINDOWS: "PATH=C:\\Program Files\\LLVM\\bin;%PATH% CC=clang-cl CXX=clang-cl CMAKE_GENERATOR=Ninja CMAKE_C_COMPILER=clang-cl CMAKE_CXX_COMPILER=clang-cl"
+  # Keep generator/compiler settings here to avoid quoting pitfalls in CIBW_ENVIRONMENT_WINDOWS.
+  CIBW_ENVIRONMENT_WINDOWS: "PATH=C:\\Program Files\\LLVM\\bin;%PATH% CC=clang-cl CXX=clang-cl"
   CIBW_CONFIG_SETTINGS_WINDOWS: "cmake.generator=Ninja;cmake.define.CMAKE_C_COMPILER=clang-cl;cmake.define.CMAKE_CXX_COMPILER=clang-cl"
 
 jobs:

From 9ac57e9b18b34f52ee9b4a05fa286732bb4af516 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 09:50:39 +0100
Subject: [PATCH 032/123] Changing MSVC to clang-cl on win (V)

---
 .github/workflows/cibuildwheels.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/cibuildwheels.yml b/.github/workflows/cibuildwheels.yml
index a56f5e4a..d9a6668c 100644
--- a/.github/workflows/cibuildwheels.yml
+++ b/.github/workflows/cibuildwheels.yml
@@ -17,9 +17,9 @@ env:
   # Skip PyPy wheels for now (numexpr needs some adjustments first)
   # musllinux takes too long to build, and it's not worth it for now
   CIBW_SKIP: "pp* *musllinux* *-win32"
-  # Keep generator/compiler settings here to avoid quoting pitfalls in CIBW_ENVIRONMENT_WINDOWS.
+  # Keep generator/toolset settings here to avoid quoting pitfalls in CIBW_ENVIRONMENT_WINDOWS.
   CIBW_ENVIRONMENT_WINDOWS: "PATH=C:\\Program Files\\LLVM\\bin;%PATH% CC=clang-cl CXX=clang-cl"
-  CIBW_CONFIG_SETTINGS_WINDOWS: "cmake.generator=Ninja;cmake.define.CMAKE_C_COMPILER=clang-cl;cmake.define.CMAKE_CXX_COMPILER=clang-cl"
+  CIBW_CONFIG_SETTINGS_WINDOWS: "cmake.generator=Visual Studio 17 2022;cmake.define.CMAKE_GENERATOR_TOOLSET=ClangCL"
 
 jobs:
 

From 1a473f5841c22661b3e9488a077a8135b34e6691 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 09:55:08 +0100
Subject: [PATCH 033/123] Changing MSVC to clang-cl on win (VI)

---
 CMakeLists.txt | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 372d709f..46d6bd6c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,11 +1,17 @@
 cmake_minimum_required(VERSION 3.15.0)
 
 if(WIN32)
-    if(NOT DEFINED CMAKE_C_COMPILER AND NOT DEFINED ENV{CC})
-        set(CMAKE_C_COMPILER clang-cl CACHE STRING "Use clang-cl for C99 support on Windows." FORCE)
-    endif()
-    if(NOT DEFINED CMAKE_CXX_COMPILER AND NOT DEFINED ENV{CXX})
-        set(CMAKE_CXX_COMPILER clang-cl CACHE STRING "Use clang-cl for C99 support on Windows." FORCE)
+    if(CMAKE_GENERATOR MATCHES "Visual Studio")
+        if(NOT DEFINED CMAKE_GENERATOR_TOOLSET)
+            set(CMAKE_GENERATOR_TOOLSET "ClangCL" CACHE STRING "Use ClangCL toolset for C99 support on Windows." FORCE)
+        endif()
+    else()
+        if(NOT DEFINED CMAKE_C_COMPILER AND NOT DEFINED ENV{CC})
+            set(CMAKE_C_COMPILER clang-cl CACHE STRING "Use clang-cl for C99 support on Windows." FORCE)
+        endif()
+        if(NOT DEFINED CMAKE_CXX_COMPILER AND NOT DEFINED ENV{CXX})
+            set(CMAKE_CXX_COMPILER clang-cl CACHE STRING "Use clang-cl for C99 support on Windows." FORCE)
+        endif()
     endif()
 endif()
 

From 07cd2d801473219a866de931867122521fbe16ae Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 09:58:27 +0100
Subject: [PATCH 034/123] Changing MSVC to clang-cl on win (VII)

---
 .github/workflows/cibuildwheels.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/cibuildwheels.yml b/.github/workflows/cibuildwheels.yml
index d9a6668c..635b7f0c 100644
--- a/.github/workflows/cibuildwheels.yml
+++ b/.github/workflows/cibuildwheels.yml
@@ -19,7 +19,7 @@ env:
   CIBW_SKIP: "pp* *musllinux* *-win32"
   # Keep generator/toolset settings here to avoid quoting pitfalls in CIBW_ENVIRONMENT_WINDOWS.
   CIBW_ENVIRONMENT_WINDOWS: "PATH=C:\\Program Files\\LLVM\\bin;%PATH% CC=clang-cl CXX=clang-cl"
-  CIBW_CONFIG_SETTINGS_WINDOWS: "cmake.generator=Visual Studio 17 2022;cmake.define.CMAKE_GENERATOR_TOOLSET=ClangCL"
+  CIBW_CONFIG_SETTINGS_WINDOWS: "cmake.generator=Visual Studio 17 2022;cmake.args=-T ClangCL"
 
 jobs:
 

From 3170053224223c19d9858a511d20e4ce73b34a8c Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 10:00:52 +0100
Subject: [PATCH 035/123] Changing MSVC to clang-cl on win (VIII)

---
 .github/workflows/cibuildwheels.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/cibuildwheels.yml b/.github/workflows/cibuildwheels.yml
index 635b7f0c..eba6164f 100644
--- a/.github/workflows/cibuildwheels.yml
+++ b/.github/workflows/cibuildwheels.yml
@@ -17,9 +17,8 @@ env:
   # Skip PyPy wheels for now (numexpr needs some adjustments first)
   # musllinux takes too long to build, and it's not worth it for now
   CIBW_SKIP: "pp* *musllinux* *-win32"
-  # Keep generator/toolset settings here to avoid quoting pitfalls in CIBW_ENVIRONMENT_WINDOWS.
-  CIBW_ENVIRONMENT_WINDOWS: "PATH=C:\\Program Files\\LLVM\\bin;%PATH% CC=clang-cl CXX=clang-cl"
-  CIBW_CONFIG_SETTINGS_WINDOWS: "cmake.generator=Visual Studio 17 2022;cmake.args=-T ClangCL"
+  # Use CMAKE_ARGS so scikit-build-core forwards generator/compiler reliably on Windows.
+  CIBW_ENVIRONMENT_WINDOWS: "PATH=C:\\Program Files\\LLVM\\bin;%PATH% CC=clang-cl CXX=clang-cl CMAKE_ARGS=\"-G Ninja -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl\""
 
 jobs:
 

From d194bd6a91905e0f3159afc4f3304a2a893a2c9b Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 10:06:38 +0100
Subject: [PATCH 036/123] Changing MSVC to clang-cl on win (IX)

---
 .github/workflows/cibuildwheels.yml | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/cibuildwheels.yml b/.github/workflows/cibuildwheels.yml
index eba6164f..03c86124 100644
--- a/.github/workflows/cibuildwheels.yml
+++ b/.github/workflows/cibuildwheels.yml
@@ -17,8 +17,8 @@ env:
   # Skip PyPy wheels for now (numexpr needs some adjustments first)
   # musllinux takes too long to build, and it's not worth it for now
   CIBW_SKIP: "pp* *musllinux* *-win32"
-  # Use CMAKE_ARGS so scikit-build-core forwards generator/compiler reliably on Windows.
-  CIBW_ENVIRONMENT_WINDOWS: "PATH=C:\\Program Files\\LLVM\\bin;%PATH% CC=clang-cl CXX=clang-cl CMAKE_ARGS=\"-G Ninja -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl\""
+  # Use CMAKE_ARGS/SKBUILD_CMAKE_ARGS so scikit-build-core forwards generator/compiler reliably on Windows.
+  CIBW_ENVIRONMENT_WINDOWS: "PATH=C:\\Program Files\\LLVM\\bin;%PATH% CC=clang-cl CXX=clang-cl CMAKE_ARGS=\"-G Ninja -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl\" SKBUILD_CMAKE_ARGS=\"-G Ninja -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl\""
 
 jobs:
 
@@ -90,6 +90,20 @@ jobs:
 
       - name: Build wheels
         uses: pypa/cibuildwheel@v3.3
+        continue-on-error: true
+
+      - name: Dump CMakeInit.txt (Windows)
+        if: ${{ runner.os == 'Windows' }}
+        run: |
+          $files = Get-ChildItem -Path $env:TEMP -Recurse -Filter CMakeInit.txt -ErrorAction SilentlyContinue
+          if ($files.Count -eq 0) {
+            Write-Host "No CMakeInit.txt files found under $env:TEMP"
+            exit 0
+          }
+          foreach ($f in $files) {
+            Write-Host "---- $($f.FullName) ----"
+            Get-Content $f.FullName
+          }
 
       - name: Make sdist
         if: ${{ matrix.os == 'ubuntu-latest' }}

From 426876a80134b4622ae9ca088b3ad06ae80c1bbc Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 10:12:51 +0100
Subject: [PATCH 037/123] Changing MSVC to clang-cl on win (X)

---
 .github/workflows/build.yml | 41 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 4287b6c9..9af80e80 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -33,12 +33,51 @@ jobs:
     - name: Install Ninja
       uses: seanmiddleditch/gha-setup-ninja@master
 
+    - name: Install LLVM (Windows)
+      if: runner.os == 'Windows'
+      run: choco install llvm --yes
+
+    - name: Add LLVM to PATH (Windows)
+      if: runner.os == 'Windows'
+      run: echo "C:\\Program Files\\LLVM\\bin" >> $env:GITHUB_PATH
+
     - name: Install specific numpy version
       if: matrix.numpy-version
       run: pip install "numpy==${{ matrix.numpy-version }}.*"
 
-    - name: Build
+    - name: Build (Windows)
+      if: runner.os == 'Windows'
+      id: build_windows
+      run: pip install -e .[test]
+      env:
+        CMAKE_ARGS: "-G Ninja -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl"
+        SKBUILD_CMAKE_ARGS: "-G Ninja -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl"
+        CC: clang-cl
+        CXX: clang-cl
+      continue-on-error: true
+
+    - name: Build (non-Windows)
+      if: runner.os != 'Windows'
+      id: build_non_windows
       run: pip install -e .[test]
 
+    - name: Dump CMakeInit.txt (Windows)
+      if: runner.os == 'Windows'
+      run: |
+        $files = Get-ChildItem -Path $env:TEMP -Recurse -Filter CMakeInit.txt -ErrorAction SilentlyContinue
+        if ($files.Count -eq 0) {
+          Write-Host "No CMakeInit.txt files found under $env:TEMP"
+          exit 0
+        }
+        foreach ($f in $files) {
+          Write-Host "---- $($f.FullName) ----"
+          Get-Content $f.FullName
+        }
+
+    - name: Fail if Windows build failed
+      if: runner.os == 'Windows' && steps.build_windows.outcome == 'failure'
+      run: exit 1
+
     - name: Test
+      if: runner.os != 'Windows' || steps.build_windows.outcome == 'success'
       run: python -m pytest -m "not heavy and (network or not network)"

From 639d4833915c89495461e48028795c9bfdcf300b Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 10:16:38 +0100
Subject: [PATCH 038/123] Changing MSVC to clang-cl on win (XI)

---
 .github/workflows/build.yml         | 5 +++--
 .github/workflows/cibuildwheels.yml | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 9af80e80..7814a63b 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -50,8 +50,9 @@ jobs:
       id: build_windows
       run: pip install -e .[test]
       env:
-        CMAKE_ARGS: "-G Ninja -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl"
-        SKBUILD_CMAKE_ARGS: "-G Ninja -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl"
+        CMAKE_GENERATOR: Ninja
+        CMAKE_C_COMPILER: clang-cl
+        CMAKE_CXX_COMPILER: clang-cl
         CC: clang-cl
         CXX: clang-cl
       continue-on-error: true
diff --git a/.github/workflows/cibuildwheels.yml b/.github/workflows/cibuildwheels.yml
index 03c86124..2ca9e55d 100644
--- a/.github/workflows/cibuildwheels.yml
+++ b/.github/workflows/cibuildwheels.yml
@@ -17,8 +17,8 @@ env:
   # Skip PyPy wheels for now (numexpr needs some adjustments first)
   # musllinux takes too long to build, and it's not worth it for now
   CIBW_SKIP: "pp* *musllinux* *-win32"
-  # Use CMAKE_ARGS/SKBUILD_CMAKE_ARGS so scikit-build-core forwards generator/compiler reliably on Windows.
-  CIBW_ENVIRONMENT_WINDOWS: "PATH=C:\\Program Files\\LLVM\\bin;%PATH% CC=clang-cl CXX=clang-cl CMAKE_ARGS=\"-G Ninja -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl\" SKBUILD_CMAKE_ARGS=\"-G Ninja -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl\""
+  # Use explicit generator/compiler env vars; CMAKE_ARGS with spaces is not split on Windows.
+  CIBW_ENVIRONMENT_WINDOWS: "PATH=C:\\Program Files\\LLVM\\bin;%PATH% CC=clang-cl CXX=clang-cl CMAKE_GENERATOR=Ninja CMAKE_C_COMPILER=clang-cl CMAKE_CXX_COMPILER=clang-cl"
 
 jobs:
 

From c35159fea0afb9a7386a3aaffbf494cb737d7a0c Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 10:22:36 +0100
Subject: [PATCH 039/123] Clang is detected now! Cleanup logs

---
 .github/workflows/build.yml         | 19 -------------------
 .github/workflows/cibuildwheels.yml | 14 --------------
 2 files changed, 33 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 7814a63b..a87781bb 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -55,30 +55,11 @@ jobs:
         CMAKE_CXX_COMPILER: clang-cl
         CC: clang-cl
         CXX: clang-cl
-      continue-on-error: true
 
     - name: Build (non-Windows)
       if: runner.os != 'Windows'
       id: build_non_windows
       run: pip install -e .[test]
 
-    - name: Dump CMakeInit.txt (Windows)
-      if: runner.os == 'Windows'
-      run: |
-        $files = Get-ChildItem -Path $env:TEMP -Recurse -Filter CMakeInit.txt -ErrorAction SilentlyContinue
-        if ($files.Count -eq 0) {
-          Write-Host "No CMakeInit.txt files found under $env:TEMP"
-          exit 0
-        }
-        foreach ($f in $files) {
-          Write-Host "---- $($f.FullName) ----"
-          Get-Content $f.FullName
-        }
-
-    - name: Fail if Windows build failed
-      if: runner.os == 'Windows' && steps.build_windows.outcome == 'failure'
-      run: exit 1
-
     - name: Test
-      if: runner.os != 'Windows' || steps.build_windows.outcome == 'success'
       run: python -m pytest -m "not heavy and (network or not network)"
diff --git a/.github/workflows/cibuildwheels.yml b/.github/workflows/cibuildwheels.yml
index 2ca9e55d..ae83cfa5 100644
--- a/.github/workflows/cibuildwheels.yml
+++ b/.github/workflows/cibuildwheels.yml
@@ -90,20 +90,6 @@ jobs:
 
       - name: Build wheels
         uses: pypa/cibuildwheel@v3.3
-        continue-on-error: true
-
-      - name: Dump CMakeInit.txt (Windows)
-        if: ${{ runner.os == 'Windows' }}
-        run: |
-          $files = Get-ChildItem -Path $env:TEMP -Recurse -Filter CMakeInit.txt -ErrorAction SilentlyContinue
-          if ($files.Count -eq 0) {
-            Write-Host "No CMakeInit.txt files found under $env:TEMP"
-            exit 0
-          }
-          foreach ($f in $files) {
-            Write-Host "---- $($f.FullName) ----"
-            Get-Content $f.FullName
-          }
 
       - name: Make sdist
         if: ${{ matrix.os == 'ubuntu-latest' }}

From d189614ac002b5341c40e9062cc9588d2971210d Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 10:26:23 +0100
Subject: [PATCH 040/123] Changing MSVC to clang-cl on win (XII)

---
 CMakeLists.txt | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 46d6bd6c..3f135d1e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -51,6 +51,8 @@ target_link_libraries(blosc2_ext PRIVATE Python::NumPy)
 
 # Add include directory for miniexpr.h and others
 target_include_directories(blosc2_ext PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2")
+# Enforce a C standard that miniexpr relies on.
+target_compile_features(blosc2_ext PRIVATE c_std_99)
 
 if(DEFINED ENV{USE_SYSTEM_BLOSC2})
     set(USE_SYSTEM_BLOSC2 ON)
@@ -81,6 +83,16 @@ else()
     FetchContent_MakeAvailable(blosc2)
     include_directories("${blosc2_SOURCE_DIR}/include")
     target_link_libraries(blosc2_ext PRIVATE blosc2_static)
+    if(WIN32 AND CMAKE_C_COMPILER_ID STREQUAL "Clang")
+        set(_blosc2_ssse3_flag "-mssse3")
+        if(CMAKE_C_COMPILER_FRONTEND_VARIANT STREQUAL "MSVC")
+            set(_blosc2_ssse3_flag "/clang:-mssse3")
+        endif()
+        if(TARGET blosc2_static)
+            target_compile_options(blosc2_static PRIVATE "${_blosc2_ssse3_flag}")
+        endif()
+        unset(_blosc2_ssse3_flag)
+    endif()
 endif()
 
 add_custom_command(

From c64af49d2f74aa97939e5363b1491416b9874dc1 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 10:33:44 +0100
Subject: [PATCH 041/123] Changing MSVC to clang-cl on win (XIII)

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3f135d1e..d3dbbd3f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -76,7 +76,7 @@ else()
     include(FetchContent)
     FetchContent_Declare(blosc2
         GIT_REPOSITORY https://github.com/Blosc/c-blosc2
-        GIT_TAG 5a2b0ed9c4d801230c118fbc5811817055b5a3f5  # v2.22.0
+        GIT_TAG openzl
         # in case you want to use a local copy of c-blosc2 for development, uncomment the line below
         # SOURCE_DIR "/Users/faltet/blosc/c-blosc2"
     )

From b7bf1144a48e2d16d8cdb6bdee9800585527100b Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 10:35:28 +0100
Subject: [PATCH 042/123] Changing MSVC to clang-cl on win (XIV)

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d3dbbd3f..9d5a0121 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -76,7 +76,7 @@ else()
     include(FetchContent)
     FetchContent_Declare(blosc2
         GIT_REPOSITORY https://github.com/Blosc/c-blosc2
-        GIT_TAG openzl
+        GIT_TAG 290853dc15dc0e0e887cc72fbaac692cefd75014  # openzl
         # in case you want to use a local copy of c-blosc2 for development, uncomment the line below
         # SOURCE_DIR "/Users/faltet/blosc/c-blosc2"
     )

From 6d0d2b864019f307eb48476a718bddc3828509fc Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 10:37:42 +0100
Subject: [PATCH 043/123] Remove llvm forced installation

---
 .github/workflows/build.yml         | 4 ----
 .github/workflows/cibuildwheels.yml | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index a87781bb..825f23c1 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -33,10 +33,6 @@ jobs:
     - name: Install Ninja
       uses: seanmiddleditch/gha-setup-ninja@master
 
-    - name: Install LLVM (Windows)
-      if: runner.os == 'Windows'
-      run: choco install llvm --yes
-
     - name: Add LLVM to PATH (Windows)
       if: runner.os == 'Windows'
       run: echo "C:\\Program Files\\LLVM\\bin" >> $env:GITHUB_PATH
diff --git a/.github/workflows/cibuildwheels.yml b/.github/workflows/cibuildwheels.yml
index ae83cfa5..76721c44 100644
--- a/.github/workflows/cibuildwheels.yml
+++ b/.github/workflows/cibuildwheels.yml
@@ -79,10 +79,6 @@ jobs:
         id: ninja
         uses: turtlesec-no/get-ninja@main
 
-      - name: Install LLVM (clang-cl)
-        if: ${{ matrix.os == 'windows-latest' }}
-        run: choco install llvm --yes
-
       - name: Install MSVC amd64
         uses: ilammy/msvc-dev-cmd@v1
         with:

From 6d81b0968434f068e031b3d4af6d4d3256a84cc9 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 10:43:23 +0100
Subject: [PATCH 044/123] Link with clang_rt

---
 CMakeLists.txt | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9d5a0121..30b7f490 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -53,6 +53,26 @@ target_link_libraries(blosc2_ext PRIVATE Python::NumPy)
 target_include_directories(blosc2_ext PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2")
 # Enforce a C standard that miniexpr relies on.
 target_compile_features(blosc2_ext PRIVATE c_std_99)
+if(WIN32 AND CMAKE_C_COMPILER_ID STREQUAL "Clang")
+    execute_process(
+        COMMAND "${CMAKE_C_COMPILER}" -print-resource-dir
+        OUTPUT_VARIABLE _clang_resource_dir
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+    )
+    if(_clang_resource_dir)
+        if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+            set(_clang_builtins "${_clang_resource_dir}/lib/windows/clang_rt.builtins-x86_64.lib")
+        else()
+            set(_clang_builtins "${_clang_resource_dir}/lib/windows/clang_rt.builtins-i386.lib")
+        endif()
+        if(EXISTS "${_clang_builtins}")
+            target_link_libraries(blosc2_ext PRIVATE "${_clang_builtins}")
+        endif()
+        unset(_clang_builtins)
+    endif()
+    unset(_clang_resource_dir)
+endif()
 
 if(DEFINED ENV{USE_SYSTEM_BLOSC2})
     set(USE_SYSTEM_BLOSC2 ON)

From a2c955e7ba1955b48325e7e256607a80e222b0fd Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 10:49:47 +0100
Subject: [PATCH 045/123] We are requring C11

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 30b7f490..f7f6d4c6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -51,8 +51,8 @@ target_link_libraries(blosc2_ext PRIVATE Python::NumPy)
 
 # Add include directory for miniexpr.h and others
 target_include_directories(blosc2_ext PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2")
-# Enforce a C standard that miniexpr relies on.
-target_compile_features(blosc2_ext PRIVATE c_std_99)
+# Enforce a C standard that bundled deps rely on (e.g. OpenZL requires C11).
+target_compile_features(blosc2_ext PRIVATE c_std_11)
 if(WIN32 AND CMAKE_C_COMPILER_ID STREQUAL "Clang")
     execute_process(
         COMMAND "${CMAKE_C_COMPILER}" -print-resource-dir

From 167be9ef5ad7392e8adfb2a0e25cce464346f3ad Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 10:53:17 +0100
Subject: [PATCH 046/123] We are requring C11; this time globally.

---
 CMakeLists.txt | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f7f6d4c6..7d4833c1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,9 @@
 cmake_minimum_required(VERSION 3.15.0)
 
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_C_STANDARD_REQUIRED ON)
+set(CMAKE_C_EXTENSIONS OFF)
+
 if(WIN32)
     if(CMAKE_GENERATOR MATCHES "Visual Studio")
         if(NOT DEFINED CMAKE_GENERATOR_TOOLSET)
@@ -51,8 +55,6 @@ target_link_libraries(blosc2_ext PRIVATE Python::NumPy)
 
 # Add include directory for miniexpr.h and others
 target_include_directories(blosc2_ext PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2")
-# Enforce a C standard that bundled deps rely on (e.g. OpenZL requires C11).
-target_compile_features(blosc2_ext PRIVATE c_std_11)
 if(WIN32 AND CMAKE_C_COMPILER_ID STREQUAL "Clang")
     execute_process(
         COMMAND "${CMAKE_C_COMPILER}" -print-resource-dir

From f1ff22ee6bcd906975faf72765d3fa30eb2a563a Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 10:57:11 +0100
Subject: [PATCH 047/123] Debugging a test crash on win

---
 CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7d4833c1..817adc30 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,10 @@
 cmake_minimum_required(VERSION 3.15.0)
 
+if(WIN32)
+    cmake_policy(SET CMP0091 NEW)
+    set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>DLL")
+endif()
+
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED ON)
 set(CMAKE_C_EXTENSIONS OFF)

From 37dae78df3cddc295e59938f001402ec0cae07f1 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 11:01:32 +0100
Subject: [PATCH 048/123] Go back to require C11 in th extension only

---
 CMakeLists.txt | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 817adc30..36b05b77 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,9 +5,6 @@ if(WIN32)
     set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>DLL")
 endif()
 
-set(CMAKE_C_STANDARD 11)
-set(CMAKE_C_STANDARD_REQUIRED ON)
-set(CMAKE_C_EXTENSIONS OFF)
 
 if(WIN32)
     if(CMAKE_GENERATOR MATCHES "Visual Studio")
@@ -60,6 +57,7 @@ target_link_libraries(blosc2_ext PRIVATE Python::NumPy)
 
 # Add include directory for miniexpr.h and others
 target_include_directories(blosc2_ext PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2")
+target_compile_features(blosc2_ext PRIVATE c_std_11)
 if(WIN32 AND CMAKE_C_COMPILER_ID STREQUAL "Clang")
     execute_process(
         COMMAND "${CMAKE_C_COMPILER}" -print-resource-dir

From 7296e488b6fe2b06a197349ffbb4f5ae38115c78 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 11:10:47 +0100
Subject: [PATCH 049/123] Use sscache to accelerate builds in CI

---
 .github/workflows/build.yml | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 825f23c1..e6499906 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -30,6 +30,28 @@ jobs:
       with:
         python-version: ${{ matrix.python-version }}
 
+    - name: Install sccache (Windows)
+      if: runner.os == 'Windows'
+      run: choco install sccache --yes
+
+    - name: Cache sccache (Windows)
+      if: runner.os == 'Windows'
+      uses: actions/cache@v4
+      with:
+        path: C:\Users\runneradmin\AppData\Local\sccache
+        key: sccache-${{ runner.os }}-${{ github.sha }}
+        restore-keys: |
+          sccache-${{ runner.os }}-
+
+    - name: Cache pip (Windows)
+      if: runner.os == 'Windows'
+      uses: actions/cache@v4
+      with:
+        path: C:\Users\runneradmin\AppData\Local\pip\Cache
+        key: pip-${{ runner.os }}-${{ hashFiles('pyproject.toml') }}
+        restore-keys: |
+          pip-${{ runner.os }}-
+
     - name: Install Ninja
       uses: seanmiddleditch/gha-setup-ninja@master
 
@@ -49,8 +71,13 @@ jobs:
         CMAKE_GENERATOR: Ninja
         CMAKE_C_COMPILER: clang-cl
         CMAKE_CXX_COMPILER: clang-cl
+        CMAKE_C_COMPILER_LAUNCHER: sccache
+        CMAKE_CXX_COMPILER_LAUNCHER: sccache
+        SCCACHE_DIR: C:\Users\runneradmin\AppData\Local\sccache
         CC: clang-cl
         CXX: clang-cl
+        CMAKE_BUILD_PARALLEL_LEVEL: 8
+        SKBUILD_PARALLEL_LEVEL: 8
 
     - name: Build (non-Windows)
       if: runner.os != 'Windows'

From 4e1475c261af29bbfadfe291c76e87b5d2e8bc76 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 11:14:26 +0100
Subject: [PATCH 050/123] Add instructions on how to accelerate local builds

---
 README_DEVELOPERS.md | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/README_DEVELOPERS.md b/README_DEVELOPERS.md
index 5c2ee0cc..fef948a8 100644
--- a/README_DEVELOPERS.md
+++ b/README_DEVELOPERS.md
@@ -38,6 +38,33 @@ LD_LIBRARY_PATH=/usr/local/lib pytest
 
 That's it! You can now proceed to the testing section.
 
+### Speeding up local builds (sccache + Ninja)
+
+If you do frequent local rebuilds, sccache can significantly speed up C/C++ rebuilds.
+
+```bash
+brew install sccache ninja
+```
+
+Then run:
+
+```bash
+CMAKE_GENERATOR=Ninja \
+CMAKE_C_COMPILER=clang \
+CMAKE_CXX_COMPILER=clang++ \
+CMAKE_C_COMPILER_LAUNCHER=sccache \
+CMAKE_CXX_COMPILER_LAUNCHER=sccache \
+CMAKE_BUILD_PARALLEL_LEVEL=8 \
+SKBUILD_PARALLEL_LEVEL=8 \
+pip install -e .
+```
+
+Check cache stats with:
+
+```bash
+sccache --show-stats
+```
+
 ## Testing
 
 We are using pytest for testing.  You can run the tests by executing

From 7b3b663265f3aa359e36f56626406178dc722de7 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 11:19:46 +0100
Subject: [PATCH 051/123] Run tests in single thread mode for windows

---
 .github/workflows/build.yml | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index e6499906..ade8a633 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -84,5 +84,14 @@ jobs:
       id: build_non_windows
       run: pip install -e .[test]
 
-    - name: Test
+    - name: Test (Windows)
+      if: runner.os == 'Windows'
+      run: python -m pytest -m "not heavy and (network or not network)"
+      env:
+        BLOSC_NTHREADS: "1"
+        NUMEXPR_NUM_THREADS: "1"
+        OMP_NUM_THREADS: "1"
+
+    - name: Test (non-Windows)
+      if: runner.os != 'Windows'
       run: python -m pytest -m "not heavy and (network or not network)"

From 92f7df81f804330fc4c720dc77a00a57ec5697ac Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 11:37:45 +0100
Subject: [PATCH 052/123] Add more info about accelerated builds

---
 README_DEVELOPERS.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README_DEVELOPERS.md b/README_DEVELOPERS.md
index fef948a8..a37ae5ed 100644
--- a/README_DEVELOPERS.md
+++ b/README_DEVELOPERS.md
@@ -56,9 +56,13 @@ CMAKE_C_COMPILER_LAUNCHER=sccache \
 CMAKE_CXX_COMPILER_LAUNCHER=sccache \
 CMAKE_BUILD_PARALLEL_LEVEL=8 \
 SKBUILD_PARALLEL_LEVEL=8 \
+SKBUILD_BUILD_DIR=build \
 pip install -e .
 ```
 
+Using `SKBUILD_BUILD_DIR` keeps a stable build directory between runs, which
+improves incremental rebuilds and sccache hit rates.
+
 Check cache stats with:
 
 ```bash

From 9c2a902762044f8214053a2dc2e9cfb06cb246e6 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 11:44:02 +0100
Subject: [PATCH 053/123] Debugging on CI windows...

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 36b05b77..a53ad33b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.15.0)
 
 if(WIN32)
     cmake_policy(SET CMP0091 NEW)
-    set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>DLL")
+    set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>DLL" CACHE STRING "" FORCE)
 endif()
 
 

From eb1007a827d0ebc0d4514c85ac8697ba84fb11e8 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 13:08:24 +0100
Subject: [PATCH 054/123] Deactivate miniexpr on windows

---
 .github/workflows/build.yml | 8 ++++----
 src/blosc2/lazyexpr.py      | 4 ++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index ade8a633..ae03ba4a 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -87,10 +87,10 @@ jobs:
     - name: Test (Windows)
       if: runner.os == 'Windows'
       run: python -m pytest -m "not heavy and (network or not network)"
-      env:
-        BLOSC_NTHREADS: "1"
-        NUMEXPR_NUM_THREADS: "1"
-        OMP_NUM_THREADS: "1"
+#      env:
+#        BLOSC_NTHREADS: "1"
+#        NUMEXPR_NUM_THREADS: "1"
+#        OMP_NUM_THREADS: "1"
 
     - name: Test (non-Windows)
       if: runner.os != 'Windows'
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index cd008ab5..3ee73351 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1296,6 +1296,10 @@ def fast_eval(  # noqa: C901
                 use_miniexpr = False
                 break
 
+    if sys.platform == "win32":
+        # Miniexpr has issues on Windows; still investigating
+        use_miniexpr = False
+
     if use_miniexpr:
         cparams = kwargs.pop("cparams", blosc2.CParams())
         # Use the same chunks/blocks as the input operands for consistency

From c04a2990e3bb9543bf0e6cb3348d38c85baf593a Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 13:12:38 +0100
Subject: [PATCH 055/123] Use clang-cl and others bells and wisthels

---
 .github/workflows/cibuildwheels.yml | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/cibuildwheels.yml b/.github/workflows/cibuildwheels.yml
index 76721c44..0eb79113 100644
--- a/.github/workflows/cibuildwheels.yml
+++ b/.github/workflows/cibuildwheels.yml
@@ -18,7 +18,7 @@ env:
   # musllinux takes too long to build, and it's not worth it for now
   CIBW_SKIP: "pp* *musllinux* *-win32"
   # Use explicit generator/compiler env vars; CMAKE_ARGS with spaces is not split on Windows.
-  CIBW_ENVIRONMENT_WINDOWS: "PATH=C:\\Program Files\\LLVM\\bin;%PATH% CC=clang-cl CXX=clang-cl CMAKE_GENERATOR=Ninja CMAKE_C_COMPILER=clang-cl CMAKE_CXX_COMPILER=clang-cl"
+  CIBW_ENVIRONMENT_WINDOWS: "PATH=C:\\Program Files\\LLVM\\bin;%PATH% CC=clang-cl CXX=clang-cl CMAKE_GENERATOR=Ninja CMAKE_C_COMPILER=clang-cl CMAKE_CXX_COMPILER=clang-cl CMAKE_C_COMPILER_LAUNCHER=sccache CMAKE_CXX_COMPILER_LAUNCHER=sccache CMAKE_BUILD_PARALLEL_LEVEL=8 SKBUILD_PARALLEL_LEVEL=8 SCCACHE_DIR=C:\\Users\\runneradmin\\AppData\\Local\\sccache"
 
 jobs:
 
@@ -79,6 +79,32 @@ jobs:
         id: ninja
         uses: turtlesec-no/get-ninja@main
 
+      - name: Add LLVM to PATH (Windows)
+        if: ${{ matrix.os == 'windows-latest' }}
+        run: echo "C:\\Program Files\\LLVM\\bin" >> $env:GITHUB_PATH
+
+      - name: Install sccache (Windows)
+        if: ${{ matrix.os == 'windows-latest' }}
+        run: choco install sccache --yes
+
+      - name: Cache sccache (Windows)
+        if: ${{ matrix.os == 'windows-latest' }}
+        uses: actions/cache@v4
+        with:
+          path: C:\Users\runneradmin\AppData\Local\sccache
+          key: sccache-${{ runner.os }}-${{ github.sha }}
+          restore-keys: |
+            sccache-${{ runner.os }}-
+
+      - name: Cache pip (Windows)
+        if: ${{ matrix.os == 'windows-latest' }}
+        uses: actions/cache@v4
+        with:
+          path: C:\Users\runneradmin\AppData\Local\pip\Cache
+          key: pip-${{ runner.os }}-${{ hashFiles('pyproject.toml') }}
+          restore-keys: |
+            pip-${{ runner.os }}-
+
       - name: Install MSVC amd64
         uses: ilammy/msvc-dev-cmd@v1
         with:

From e2ab03f2842525b9d625bb6cc300d0dff2eb9202 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 14:52:55 +0100
Subject: [PATCH 056/123] Further .yml simplications

---
 .github/workflows/build.yml         |  5 ++---
 .github/workflows/cibuildwheels.yml | 24 +-----------------------
 2 files changed, 3 insertions(+), 26 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index ae03ba4a..6cd96367 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -12,6 +12,8 @@ jobs:
   build_wheels:
     name: Build and test on ${{ matrix.os }}${{ matrix.numpy-version && format(' (numpy {0})', matrix.numpy-version) || '' }}
     runs-on: ${{ matrix.os }}
+    env:
+      CMAKE_GENERATOR: Ninja
     strategy:
       matrix:
         os: [ubuntu-latest, windows-latest, macos-latest]
@@ -68,9 +70,6 @@ jobs:
       id: build_windows
       run: pip install -e .[test]
       env:
-        CMAKE_GENERATOR: Ninja
-        CMAKE_C_COMPILER: clang-cl
-        CMAKE_CXX_COMPILER: clang-cl
         CMAKE_C_COMPILER_LAUNCHER: sccache
         CMAKE_CXX_COMPILER_LAUNCHER: sccache
         SCCACHE_DIR: C:\Users\runneradmin\AppData\Local\sccache
diff --git a/.github/workflows/cibuildwheels.yml b/.github/workflows/cibuildwheels.yml
index 0eb79113..168ec33f 100644
--- a/.github/workflows/cibuildwheels.yml
+++ b/.github/workflows/cibuildwheels.yml
@@ -18,7 +18,7 @@ env:
   # musllinux takes too long to build, and it's not worth it for now
   CIBW_SKIP: "pp* *musllinux* *-win32"
   # Use explicit generator/compiler env vars; CMAKE_ARGS with spaces is not split on Windows.
-  CIBW_ENVIRONMENT_WINDOWS: "PATH=C:\\Program Files\\LLVM\\bin;%PATH% CC=clang-cl CXX=clang-cl CMAKE_GENERATOR=Ninja CMAKE_C_COMPILER=clang-cl CMAKE_CXX_COMPILER=clang-cl CMAKE_C_COMPILER_LAUNCHER=sccache CMAKE_CXX_COMPILER_LAUNCHER=sccache CMAKE_BUILD_PARALLEL_LEVEL=8 SKBUILD_PARALLEL_LEVEL=8 SCCACHE_DIR=C:\\Users\\runneradmin\\AppData\\Local\\sccache"
+  CIBW_ENVIRONMENT_WINDOWS: "PATH=C:\\Program Files\\LLVM\\bin;%PATH% CMAKE_GENERATOR=Ninja CC=clang-cl CXX=clang-cl"
 
 jobs:
 
@@ -83,28 +83,6 @@ jobs:
         if: ${{ matrix.os == 'windows-latest' }}
         run: echo "C:\\Program Files\\LLVM\\bin" >> $env:GITHUB_PATH
 
-      - name: Install sccache (Windows)
-        if: ${{ matrix.os == 'windows-latest' }}
-        run: choco install sccache --yes
-
-      - name: Cache sccache (Windows)
-        if: ${{ matrix.os == 'windows-latest' }}
-        uses: actions/cache@v4
-        with:
-          path: C:\Users\runneradmin\AppData\Local\sccache
-          key: sccache-${{ runner.os }}-${{ github.sha }}
-          restore-keys: |
-            sccache-${{ runner.os }}-
-
-      - name: Cache pip (Windows)
-        if: ${{ matrix.os == 'windows-latest' }}
-        uses: actions/cache@v4
-        with:
-          path: C:\Users\runneradmin\AppData\Local\pip\Cache
-          key: pip-${{ runner.os }}-${{ hashFiles('pyproject.toml') }}
-          restore-keys: |
-            pip-${{ runner.os }}-
-
       - name: Install MSVC amd64
         uses: ilammy/msvc-dev-cmd@v1
         with:

From 340fccd2f855d1bcced0ca3f75e7cff015e85638 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 14:56:19 +0100
Subject: [PATCH 057/123] Simplified cmake config

---
 CMakeLists.txt | 27 ++++-----------------------
 1 file changed, 4 insertions(+), 23 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a53ad33b..648d0b72 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,25 +6,16 @@ if(WIN32)
 endif()
 
 
-if(WIN32)
-    if(CMAKE_GENERATOR MATCHES "Visual Studio")
-        if(NOT DEFINED CMAKE_GENERATOR_TOOLSET)
-            set(CMAKE_GENERATOR_TOOLSET "ClangCL" CACHE STRING "Use ClangCL toolset for C99 support on Windows." FORCE)
-        endif()
-    else()
-        if(NOT DEFINED CMAKE_C_COMPILER AND NOT DEFINED ENV{CC})
-            set(CMAKE_C_COMPILER clang-cl CACHE STRING "Use clang-cl for C99 support on Windows." FORCE)
-        endif()
-        if(NOT DEFINED CMAKE_CXX_COMPILER AND NOT DEFINED ENV{CXX})
-            set(CMAKE_CXX_COMPILER clang-cl CACHE STRING "Use clang-cl for C99 support on Windows." FORCE)
-        endif()
+if(WIN32 AND CMAKE_GENERATOR MATCHES "Visual Studio")
+    if(NOT DEFINED CMAKE_GENERATOR_TOOLSET)
+        set(CMAKE_GENERATOR_TOOLSET "ClangCL" CACHE STRING "Use ClangCL toolset for C99/C11 support on Windows." FORCE)
     endif()
 endif()
 
 project(python-blosc2)
 
 if(WIN32 AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
-    message(FATAL_ERROR "Windows builds require clang-cl for C99 support. Set CC/CXX to clang-cl or use the ClangCL toolset.")
+    message(FATAL_ERROR "Windows builds require clang-cl. Set CC/CXX to clang-cl or configure CMake with -T ClangCL.")
 endif()
 # Specifying Python version below is tricky, but if you don't specify the minimum version here,
 # it would not consider python3 when looking for the executable. This is problematic since Fedora
@@ -108,16 +99,6 @@ else()
     FetchContent_MakeAvailable(blosc2)
     include_directories("${blosc2_SOURCE_DIR}/include")
     target_link_libraries(blosc2_ext PRIVATE blosc2_static)
-    if(WIN32 AND CMAKE_C_COMPILER_ID STREQUAL "Clang")
-        set(_blosc2_ssse3_flag "-mssse3")
-        if(CMAKE_C_COMPILER_FRONTEND_VARIANT STREQUAL "MSVC")
-            set(_blosc2_ssse3_flag "/clang:-mssse3")
-        endif()
-        if(TARGET blosc2_static)
-            target_compile_options(blosc2_static PRIVATE "${_blosc2_ssse3_flag}")
-        endif()
-        unset(_blosc2_ssse3_flag)
-    endif()
 endif()
 
 add_custom_command(

From 8155136270e1e41f32dd109280ca77d7b5fe2ce5 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 24 Dec 2025 15:01:12 +0100
Subject: [PATCH 058/123] Add notes about new clang-cl dependency on windows

---
 README.rst           | 11 +++++++++++
 README_DEVELOPERS.md | 10 ++++++++++
 2 files changed, 21 insertions(+)

diff --git a/README.rst b/README.rst
index 3d442419..7480d64f 100644
--- a/README.rst
+++ b/README.rst
@@ -53,6 +53,17 @@ Conda users can install from conda-forge:
 
     conda install -c conda-forge python-blosc2
 
+Windows note
+============
+
+When building from source on Windows, clang-cl is required (OpenZL depends on C11 support).
+Make sure LLVM is on PATH and use the Ninja generator, for example::
+
+    CMAKE_GENERATOR=Ninja
+    CC=clang-cl
+    CXX=clang-cl
+    pip install -e .
+
 Documentation
 =============
 
diff --git a/README_DEVELOPERS.md b/README_DEVELOPERS.md
index a37ae5ed..acf8493a 100644
--- a/README_DEVELOPERS.md
+++ b/README_DEVELOPERS.md
@@ -22,6 +22,16 @@ You are done!
     pip install .   # add -e for editable mode
 ```
 
+On Windows, clang-cl is required (OpenZL depends on C11 support). Make sure LLVM
+is on PATH and build with Ninja, for example:
+
+```bash
+CMAKE_GENERATOR=Ninja \
+CC=clang-cl \
+CXX=clang-cl \
+pip install -e .
+```
+
 There are situations where you may want to build the C-Blosc2 library separately, for example, when debugging issues in the C library. In that case, let's assume you have the C-Blosc2 library installed in `/usr/local`:
 
 ```bash

From f07a7f59b63f4c101827ab35e4e057e31e7d8fdc Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 31 Dec 2025 12:05:22 +0100
Subject: [PATCH 059/123] Updated to latest miniexpr sources

---
 src/blosc2/blosc2_ext.pyx |   21 +-
 src/blosc2/miniexpr.c     | 4663 ++++++++++++++++++++++++++++---------
 src/blosc2/miniexpr.h     |   50 +-
 3 files changed, 3571 insertions(+), 1163 deletions(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index be5a5a66..0eb684e8 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -567,11 +567,12 @@ cdef extern from "miniexpr.h":
         int ncode
         void *parameters[1]
 
-    me_expr *me_compile(const char *expression, const me_variable *variables,
-                        int var_count, me_dtype dtype, int *error)
+    int me_compile(const char *expression, const me_variable *variables,
+                   int var_count, me_dtype dtype, int *error, me_expr **out)
+
+    int me_eval(const me_expr *expr, const void ** vars_chunk,
+                int n_vars, void *output_chunk, int chunk_nitems) nogil
 
-    void me_eval(const me_expr *expr, const void ** vars_chunk,
-                 int n_vars, void *output_chunk, int chunk_nitems) nogil
     void me_print(const me_expr *n) nogil
     void me_free(me_expr *n) nogil
 
@@ -1887,8 +1888,10 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
     if miniexpr_handle == NULL:
         raise ValueError("miniexpr: handle not assigned")
     # Call thread-safe miniexpr C API
-    me_eval(miniexpr_handle, <const void**>input_buffers, udata.ninputs,
-            <void*>params_output, ndarr.blocknitems)
+    rc = me_eval(miniexpr_handle, <const void**>input_buffers, udata.ninputs,
+                 <void*>params_output, ndarr.blocknitems)
+    if rc != 0:
+        raise RuntimeError(f"miniexpr: issues during evaluation; error code: {rc}")
 
     # Free resources
     for i in range(udata.ninputs):
@@ -2831,9 +2834,11 @@ cdef class NDArray:
         cdef int error = 0
         expression = expression.encode("utf-8") if isinstance(expression, str) else expression
         cdef me_dtype = me_dtype_from_numpy(self.dtype.num)
-        udata.miniexpr_handle = me_compile(expression, variables, n, me_dtype, &error)
-        if udata.miniexpr_handle == NULL:
+        cdef me_expr *out_expr
+        error = me_compile(expression, variables, n, me_dtype, &error, &out_expr)
+        if error != 0:
             raise NotImplementedError(f"Cannot compile expression: {expression}")
+        udata.miniexpr_handle = out_expr
 
         # Free resources
         for i in range(len(inputs)):
diff --git a/src/blosc2/miniexpr.c b/src/blosc2/miniexpr.c
index 03c720dc..8890daba 100755
--- a/src/blosc2/miniexpr.c
+++ b/src/blosc2/miniexpr.c
@@ -55,6 +55,12 @@ For log = base 10 log comment the next line. */
 #include <limits.h>
 #include <stdint.h>
 #include <stdbool.h>
+#if defined(__SSE2__) || defined(__SSE__) || defined(__AVX__) || defined(__AVX2__)
+#include <immintrin.h>
+#endif
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#endif
 #if defined(_MSC_VER) && !defined(__clang__)
 #define IVDEP
 #else
@@ -67,15 +73,20 @@ For log = base 10 log comment the next line. */
 #define float_complex _Fcomplex
 #define double_complex _Dcomplex
 // And it doesn't support standard operators for them in C
-static inline _Fcomplex add_c64(_Fcomplex a, _Fcomplex b) { return _FCbuild(crealf(a) + crealf(b), cimagf(a) + cimagf(b)); }
-static inline _Fcomplex sub_c64(_Fcomplex a, _Fcomplex b) { return _FCbuild(crealf(a) - crealf(b), cimagf(a) - cimagf(b)); }
+static inline _Fcomplex add_c64(_Fcomplex a, _Fcomplex b) {
+    return _FCbuild(crealf(a) + crealf(b), cimagf(a) + cimagf(b));
+}
+static inline _Fcomplex sub_c64(_Fcomplex a, _Fcomplex b) {
+    return _FCbuild(crealf(a) - crealf(b), cimagf(a) - cimagf(b));
+}
 static inline _Fcomplex neg_c64(_Fcomplex a) { return _FCbuild(-crealf(a), -cimagf(a)); }
 static inline _Fcomplex mul_c64(_Fcomplex a, _Fcomplex b) {
     return _FCbuild(crealf(a) * crealf(b) - cimagf(a) * cimagf(b), crealf(a) * cimagf(b) + cimagf(a) * crealf(b));
 }
 static inline _Fcomplex div_c64(_Fcomplex a, _Fcomplex b) {
     float denom = crealf(b) * crealf(b) + cimagf(b) * cimagf(b);
-    return _FCbuild((crealf(a) * crealf(b) + cimagf(a) * cimagf(b)) / denom, (cimagf(a) * crealf(b) - crealf(a) * cimagf(b)) / denom);
+    return _FCbuild((crealf(a) * crealf(b) + cimagf(a) * cimagf(b)) / denom,
+                    (cimagf(a) * crealf(b) - crealf(a) * cimagf(b)) / denom);
 }
 static inline _Dcomplex add_c128(_Dcomplex a, _Dcomplex b) { return _Cbuild(creal(a) + creal(b), cimag(a) + cimag(b)); }
 static inline _Dcomplex sub_c128(_Dcomplex a, _Dcomplex b) { return _Cbuild(creal(a) - creal(b), cimag(a) - cimag(b)); }
@@ -85,7 +96,8 @@ static inline _Dcomplex mul_c128(_Dcomplex a, _Dcomplex b) {
 }
 static inline _Dcomplex div_c128(_Dcomplex a, _Dcomplex b) {
     double denom = creal(b) * creal(b) + cimag(b) * cimag(b);
-    return _Cbuild((creal(a) * creal(b) + cimag(a) * cimag(b)) / denom, (cimag(a) * creal(b) - creal(a) * cimag(b)) / denom);
+    return _Cbuild((creal(a) * creal(b) + cimag(a) * cimag(b)) / denom,
+                   (cimag(a) * creal(b) - creal(a) * cimag(b)) / denom);
 }
 #else
 #define float_complex float _Complex
@@ -105,91 +117,141 @@ static inline _Dcomplex div_c128(_Dcomplex a, _Dcomplex b) {
 #if defined(_MSC_VER) && !defined(__clang__)
 /* Wrappers for complex functions to handle MSVC's _Fcomplex/_Dcomplex */
 static inline float _Complex me_cpowf(float _Complex a, float _Complex b) {
-    union { float _Complex c; _Fcomplex m; } ua, ub, ur;
-    ua.c = a; ub.c = b;
+    union {
+        float _Complex c;
+        _Fcomplex m;
+    } ua, ub, ur;
+    ua.c = a;
+    ub.c = b;
     ur.m = cpowf(ua.m, ub.m);
     return ur.c;
 }
 static inline double _Complex me_cpow(double _Complex a, double _Complex b) {
-    union { double _Complex c; _Dcomplex m; } ua, ub, ur;
-    ua.c = a; ub.c = b;
+    union {
+        double _Complex c;
+        _Dcomplex m;
+    } ua, ub, ur;
+    ua.c = a;
+    ub.c = b;
     ur.m = cpow(ua.m, ub.m);
     return ur.c;
 }
 static inline float _Complex me_csqrtf(float _Complex a) {
-    union { float _Complex c; _Fcomplex m; } ua, ur;
+    union {
+        float _Complex c;
+        _Fcomplex m;
+    } ua, ur;
     ua.c = a;
     ur.m = csqrtf(ua.m);
     return ur.c;
 }
 static inline double _Complex me_csqrt(double _Complex a) {
-    union { double _Complex c; _Dcomplex m; } ua, ur;
+    union {
+        double _Complex c;
+        _Dcomplex m;
+    } ua, ur;
     ua.c = a;
     ur.m = csqrt(ua.m);
     return ur.c;
 }
 static inline float _Complex me_cexpf(float _Complex a) {
-    union { float _Complex c; _Fcomplex m; } ua, ur;
+    union {
+        float _Complex c;
+        _Fcomplex m;
+    } ua, ur;
     ua.c = a;
     ur.m = cexpf(ua.m);
     return ur.c;
 }
 static inline double _Complex me_cexp(double _Complex a) {
-    union { double _Complex c; _Dcomplex m; } ua, ur;
+    union {
+        double _Complex c;
+        _Dcomplex m;
+    } ua, ur;
     ua.c = a;
     ur.m = cexp(ua.m);
     return ur.c;
 }
 static inline float _Complex me_clogf(float _Complex a) {
-    union { float _Complex c; _Fcomplex m; } ua, ur;
+    union {
+        float _Complex c;
+        _Fcomplex m;
+    } ua, ur;
     ua.c = a;
     ur.m = clogf(ua.m);
     return ur.c;
 }
 static inline double _Complex me_clog(double _Complex a) {
-    union { double _Complex c; _Dcomplex m; } ua, ur;
+    union {
+        double _Complex c;
+        _Dcomplex m;
+    } ua, ur;
     ua.c = a;
     ur.m = clog(ua.m);
     return ur.c;
 }
 static inline float me_cabsf(float _Complex a) {
-    union { float _Complex c; _Fcomplex m; } ua;
+    union {
+        float _Complex c;
+        _Fcomplex m;
+    } ua;
     ua.c = a;
     return cabsf(ua.m);
 }
 static inline double me_cabs(double _Complex a) {
-    union { double _Complex c; _Dcomplex m; } ua;
+    union {
+        double _Complex c;
+        _Dcomplex m;
+    } ua;
     ua.c = a;
     return cabs(ua.m);
 }
 static inline float me_cimagf(float _Complex a) {
-    union { float _Complex c; _Fcomplex m; } ua;
+    union {
+        float _Complex c;
+        _Fcomplex m;
+    } ua;
     ua.c = a;
     return cimagf(ua.m);
 }
 static inline double me_cimag(double _Complex a) {
-    union { double _Complex c; _Dcomplex m; } ua;
+    union {
+        double _Complex c;
+        _Dcomplex m;
+    } ua;
     ua.c = a;
     return cimag(ua.m);
 }
 static inline float me_crealf(float _Complex a) {
-    union { float _Complex c; _Fcomplex m; } ua;
+    union {
+        float _Complex c;
+        _Fcomplex m;
+    } ua;
     ua.c = a;
     return crealf(ua.m);
 }
 static inline double me_creal(double _Complex a) {
-    union { double _Complex c; _Dcomplex m; } ua;
+    union {
+        double _Complex c;
+        _Dcomplex m;
+    } ua;
     ua.c = a;
     return creal(ua.m);
 }
 static inline float _Complex me_conjf(float _Complex a) {
-    union { float _Complex c; _Fcomplex m; } ua, ur;
+    union {
+        float _Complex c;
+        _Fcomplex m;
+    } ua, ur;
     ua.c = a;
     ur.m = conjf(ua.m);
     return ur.c;
 }
 static inline double _Complex me_conj(double _Complex a) {
-    union { double _Complex c; _Dcomplex m; } ua, ur;
+    union {
+        double _Complex c;
+        _Dcomplex m;
+    } ua, ur;
     ua.c = a;
     ur.m = conj(ua.m);
     return ur.c;
@@ -303,6 +365,11 @@ static inline double _Complex me_conj(double _Complex a) {
 
 typedef double (*me_fun2)(double, double);
 
+#if defined(_WIN32) || defined(_WIN64)
+static bool has_complex_node(const me_expr* n);
+static bool has_complex_input(const me_expr* n);
+#endif
+
 enum {
     TOK_NULL = ME_CLOSURE7 + 1, TOK_ERROR, TOK_END, TOK_SEP,
     TOK_OPEN, TOK_CLOSE, TOK_NUMBER, TOK_VARIABLE, TOK_INFIX,
@@ -315,19 +382,19 @@ struct me_expr {
 
     union {
         double value;
-        const void *bound;
-        const void *function;
+        const void* bound;
+        const void* function;
     };
 
     /* Vector operation info */
-    void *output; // Generic pointer (can be float* or double*)
+    void* output; // Generic pointer (can be float* or double*)
     int nitems;
     me_dtype dtype; // Data type for this expression (result type after promotion)
     me_dtype input_dtype; // Original input type (for variables/constants)
     /* Bytecode info (for fused evaluation) */
-    void *bytecode; // Pointer to compiled bytecode
+    void* bytecode; // Pointer to compiled bytecode
     int ncode; // Number of instructions
-    void *parameters[1]; // Must be last (flexible array member)
+    void* parameters[1]; // Must be last (flexible array member)
 };
 
 
@@ -424,23 +491,46 @@ static bool is_complex_dtype(me_dtype dt) {
     return dt == ME_COMPLEX64 || dt == ME_COMPLEX128;
 }
 
+static double sum_reduce(double x);
+static double prod_reduce(double x);
+static double any_reduce(double x);
+static double all_reduce(double x);
+
+static me_dtype reduction_output_dtype(me_dtype dt, const void* func) {
+    if (func == (void*)any_reduce || func == (void*)all_reduce) {
+        return ME_BOOL;
+    }
+    if (func == (void*)sum_reduce || func == (void*)prod_reduce) {
+        if (dt == ME_BOOL) {
+            return ME_INT64;
+        }
+        if (dt >= ME_UINT8 && dt <= ME_UINT64) {
+            return ME_UINT64;
+        }
+        if (dt >= ME_INT8 && dt <= ME_INT64) {
+            return ME_INT64;
+        }
+    }
+    return dt;
+}
+
 /* Get size of a type in bytes */
 static size_t dtype_size(me_dtype dtype) {
     switch (dtype) {
-        case ME_BOOL: return sizeof(bool);
-        case ME_INT8: return sizeof(int8_t);
-        case ME_INT16: return sizeof(int16_t);
-        case ME_INT32: return sizeof(int32_t);
-        case ME_INT64: return sizeof(int64_t);
-        case ME_UINT8: return sizeof(uint8_t);
-        case ME_UINT16: return sizeof(uint16_t);
-        case ME_UINT32: return sizeof(uint32_t);
-        case ME_UINT64: return sizeof(uint64_t);
-        case ME_FLOAT32: return sizeof(float);
-        case ME_FLOAT64: return sizeof(double);
-        case ME_COMPLEX64: return sizeof(float _Complex);
-        case ME_COMPLEX128: return sizeof(double _Complex);
-        default: return 0;
+    case ME_BOOL: return sizeof(bool);
+    case ME_INT8: return sizeof(int8_t);
+    case ME_INT16: return sizeof(int16_t);
+    case ME_INT32: return sizeof(int32_t);
+    case ME_INT64: return sizeof(int64_t);
+    case ME_UINT8: return sizeof(uint8_t);
+    case ME_UINT16: return sizeof(uint16_t);
+    case ME_UINT32: return sizeof(uint32_t);
+    case ME_UINT64: return sizeof(uint64_t);
+    case ME_FLOAT32: return sizeof(float);
+    case ME_FLOAT64: return sizeof(double);
+    case ME_COMPLEX64: return sizeof(float _Complex);
+    case ME_COMPLEX128: return sizeof(double _Complex);
+    default: return 0;
     }
 }
 
@@ -449,21 +539,21 @@ enum { ME_CONSTANT = 1 };
 
 
 typedef struct state {
-    const char *start;
-    const char *next;
+    const char* start;
+    const char* next;
     int type;
 
     union {
         double value;
-        const double *bound;
-        const void *function;
+        const double* bound;
+        const void* function;
     };
 
-    void *context;
+    void* context;
     me_dtype dtype; // Type of current token
     me_dtype target_dtype; // Target dtype for the overall expression
 
-    const me_variable *lookup;
+    const me_variable* lookup;
     int lookup_len;
 } state;
 
@@ -478,50 +568,120 @@ typedef struct state {
 #define CHECK_NULL(ptr, ...) if ((ptr) == NULL) { __VA_ARGS__; return NULL; }
 
 /* Forward declarations */
-static me_expr *new_expr(const int type, const me_expr *parameters[]);
+static me_expr* new_expr(const int type, const me_expr* parameters[]);
 static double conj_wrapper(double x);
 static double imag_wrapper(double x);
 static double real_wrapper(double x);
 static double round_wrapper(double x);
+static double sum_reduce(double x);
+static double prod_reduce(double x);
+static double any_reduce(double x);
+static double all_reduce(double x);
+static double min_reduce(double x);
+static double max_reduce(double x);
 static double sign(double x);
 static double square(double x);
 static double trunc_wrapper(double x);
 static double where_scalar(double c, double x, double y);
 
+static bool is_reduction_function(const void* func) {
+    return func == (void*)sum_reduce || func == (void*)prod_reduce ||
+        func == (void*)min_reduce || func == (void*)max_reduce ||
+        func == (void*)any_reduce || func == (void*)all_reduce;
+}
+
+static bool is_reduction_node(const me_expr* n) {
+    return n && IS_FUNCTION(n->type) && ARITY(n->type) == 1 &&
+        is_reduction_function(n->function);
+}
+
+static bool contains_reduction(const me_expr* n) {
+    if (!n) return false;
+    if (is_reduction_node(n)) return true;
+
+    switch (TYPE_MASK(n->type)) {
+    case ME_FUNCTION0:
+    case ME_FUNCTION1:
+    case ME_FUNCTION2:
+    case ME_FUNCTION3:
+    case ME_FUNCTION4:
+    case ME_FUNCTION5:
+    case ME_FUNCTION6:
+    case ME_FUNCTION7:
+    case ME_CLOSURE0:
+    case ME_CLOSURE1:
+    case ME_CLOSURE2:
+    case ME_CLOSURE3:
+    case ME_CLOSURE4:
+    case ME_CLOSURE5:
+    case ME_CLOSURE6:
+    case ME_CLOSURE7:
+        {
+            const int arity = ARITY(n->type);
+            for (int i = 0; i < arity; i++) {
+                if (contains_reduction((const me_expr*)n->parameters[i])) {
+                    return true;
+                }
+            }
+            return false;
+        }
+    default:
+        return false;
+    }
+}
+
+static bool reduction_usage_is_valid(const me_expr* n) {
+    if (!is_reduction_node(n)) return false;
+    me_expr* arg = (me_expr*)n->parameters[0];
+    if (!arg) return false;
+    if (n->function == (void*)min_reduce || n->function == (void*)max_reduce) {
+        if (arg->dtype == ME_COMPLEX64 || arg->dtype == ME_COMPLEX128) {
+            return false;
+        }
+    }
+    return TYPE_MASK(arg->type) == ME_VARIABLE || TYPE_MASK(arg->type) == ME_CONSTANT;
+}
+
 /* Infer computation type from expression tree (for evaluation) */
-static me_dtype infer_result_type(const me_expr *n) {
+static me_dtype infer_result_type(const me_expr* n) {
     if (!n) return ME_FLOAT64;
 
     switch (TYPE_MASK(n->type)) {
-        case ME_CONSTANT:
-            return n->dtype;
-
-        case ME_VARIABLE:
-            return n->dtype;
-
-        case ME_FUNCTION0:
-        case ME_FUNCTION1:
-        case ME_FUNCTION2:
-        case ME_FUNCTION3:
-        case ME_FUNCTION4:
-        case ME_FUNCTION5:
-        case ME_FUNCTION6:
-        case ME_FUNCTION7:
-        case ME_CLOSURE0:
-        case ME_CLOSURE1:
-        case ME_CLOSURE2:
-        case ME_CLOSURE3:
-        case ME_CLOSURE4:
-        case ME_CLOSURE5:
-        case ME_CLOSURE6:
-        case ME_CLOSURE7: {
+    case ME_CONSTANT:
+        return n->dtype;
+
+    case ME_VARIABLE:
+        return n->dtype;
+
+    case ME_FUNCTION0:
+    case ME_FUNCTION1:
+    case ME_FUNCTION2:
+    case ME_FUNCTION3:
+    case ME_FUNCTION4:
+    case ME_FUNCTION5:
+    case ME_FUNCTION6:
+    case ME_FUNCTION7:
+    case ME_CLOSURE0:
+    case ME_CLOSURE1:
+    case ME_CLOSURE2:
+    case ME_CLOSURE3:
+    case ME_CLOSURE4:
+    case ME_CLOSURE5:
+    case ME_CLOSURE6:
+    case ME_CLOSURE7:
+        {
+            if (is_reduction_node(n)) {
+                me_dtype param_type = infer_result_type((const me_expr*)n->parameters[0]);
+                return reduction_output_dtype(param_type, n->function);
+            }
             // Special case: imag() and real() return real type from complex input
             if (IS_FUNCTION(n->type) && ARITY(n->type) == 1) {
                 if (n->function == (void*)imag_wrapper || n->function == (void*)real_wrapper) {
-                    me_dtype param_type = infer_result_type((const me_expr *) n->parameters[0]);
+                    me_dtype param_type = infer_result_type((const me_expr*)n->parameters[0]);
                     if (param_type == ME_COMPLEX64) {
                         return ME_FLOAT32;
-                    } else if (param_type == ME_COMPLEX128) {
+                    }
+                    else if (param_type == ME_COMPLEX128) {
                         return ME_FLOAT64;
                     }
                     // If input is not complex, return as-is (shouldn't happen, but be safe)
@@ -538,7 +698,7 @@ static me_dtype infer_result_type(const me_expr *n) {
             me_dtype result = ME_BOOL;
 
             for (int i = 0; i < arity; i++) {
-                me_dtype param_type = infer_result_type((const me_expr *) n->parameters[i]);
+                me_dtype param_type = infer_result_type((const me_expr*)n->parameters[i]);
                 result = promote_types(result, param_type);
             }
 
@@ -550,39 +710,45 @@ static me_dtype infer_result_type(const me_expr *n) {
 }
 
 /* Infer logical output type from expression tree (for compilation with ME_AUTO) */
-static me_dtype infer_output_type(const me_expr *n) {
+static me_dtype infer_output_type(const me_expr* n) {
     if (!n) return ME_FLOAT64;
 
     switch (TYPE_MASK(n->type)) {
-        case ME_CONSTANT:
-            return n->dtype;
-
-        case ME_VARIABLE:
-            return n->dtype;
-
-        case ME_FUNCTION0:
-        case ME_FUNCTION1:
-        case ME_FUNCTION2:
-        case ME_FUNCTION3:
-        case ME_FUNCTION4:
-        case ME_FUNCTION5:
-        case ME_FUNCTION6:
-        case ME_FUNCTION7:
-        case ME_CLOSURE0:
-        case ME_CLOSURE1:
-        case ME_CLOSURE2:
-        case ME_CLOSURE3:
-        case ME_CLOSURE4:
-        case ME_CLOSURE5:
-        case ME_CLOSURE6:
-        case ME_CLOSURE7: {
+    case ME_CONSTANT:
+        return n->dtype;
+
+    case ME_VARIABLE:
+        return n->dtype;
+
+    case ME_FUNCTION0:
+    case ME_FUNCTION1:
+    case ME_FUNCTION2:
+    case ME_FUNCTION3:
+    case ME_FUNCTION4:
+    case ME_FUNCTION5:
+    case ME_FUNCTION6:
+    case ME_FUNCTION7:
+    case ME_CLOSURE0:
+    case ME_CLOSURE1:
+    case ME_CLOSURE2:
+    case ME_CLOSURE3:
+    case ME_CLOSURE4:
+    case ME_CLOSURE5:
+    case ME_CLOSURE6:
+    case ME_CLOSURE7:
+        {
+            if (is_reduction_node(n)) {
+                me_dtype param_type = infer_output_type((const me_expr*)n->parameters[0]);
+                return reduction_output_dtype(param_type, n->function);
+            }
             // Special case: imag() and real() return real type from complex input
             if (IS_FUNCTION(n->type) && ARITY(n->type) == 1) {
                 if (n->function == (void*)imag_wrapper || n->function == (void*)real_wrapper) {
-                    me_dtype param_type = infer_output_type((const me_expr *) n->parameters[0]);
+                    me_dtype param_type = infer_output_type((const me_expr*)n->parameters[0]);
                     if (param_type == ME_COMPLEX64) {
                         return ME_FLOAT32;
-                    } else if (param_type == ME_COMPLEX128) {
+                    }
+                    else if (param_type == ME_COMPLEX128) {
                         return ME_FLOAT64;
                     }
                     // If input is not complex, return as-is (shouldn't happen, but be safe)
@@ -593,8 +759,8 @@ static me_dtype infer_output_type(const me_expr *n) {
             // Special case: where(cond, x, y) -> promote(x, y), regardless of cond type.
             if (IS_FUNCTION(n->type) && ARITY(n->type) == 3 &&
                 n->function == (void*)where_scalar) {
-                me_dtype x_type = infer_output_type((const me_expr *) n->parameters[1]);
-                me_dtype y_type = infer_output_type((const me_expr *) n->parameters[2]);
+                me_dtype x_type = infer_output_type((const me_expr*)n->parameters[1]);
+                me_dtype y_type = infer_output_type((const me_expr*)n->parameters[2]);
                 return promote_types(x_type, y_type);
             }
 
@@ -609,7 +775,7 @@ static me_dtype infer_output_type(const me_expr *n) {
             me_dtype result = ME_BOOL;
 
             for (int i = 0; i < arity; i++) {
-                me_dtype param_type = infer_output_type((const me_expr *) n->parameters[i]);
+                me_dtype param_type = infer_output_type((const me_expr*)n->parameters[i]);
                 result = promote_types(result, param_type);
             }
 
@@ -621,9 +787,9 @@ static me_dtype infer_output_type(const me_expr *n) {
 }
 
 /* Apply type promotion to a binary operation node */
-static me_expr *create_conversion_node(me_expr *source, me_dtype target_dtype) {
+static me_expr* create_conversion_node(me_expr* source, me_dtype target_dtype) {
     /* Create a unary conversion node that converts source to target_dtype */
-    me_expr *conv = NEW_EXPR(ME_FUNCTION1 | ME_FLAG_PURE, source);
+    me_expr* conv = NEW_EXPR(ME_FUNCTION1 | ME_FLAG_PURE, source);
     if (conv) {
         conv->function = NULL; // Mark as conversion
         conv->dtype = target_dtype;
@@ -632,11 +798,11 @@ static me_expr *create_conversion_node(me_expr *source, me_dtype target_dtype) {
     return conv;
 }
 
-static void apply_type_promotion(me_expr *node) {
+static void apply_type_promotion(me_expr* node) {
     if (!node || ARITY(node->type) < 2) return;
 
-    me_expr *left = (me_expr *) node->parameters[0];
-    me_expr *right = (me_expr *) node->parameters[1];
+    me_expr* left = (me_expr*)node->parameters[0];
+    me_expr* right = (me_expr*)node->parameters[1];
 
     if (left && right) {
         me_dtype left_type = left->dtype;
@@ -667,11 +833,11 @@ static void apply_type_promotion(me_expr *node) {
     }
 }
 
-static me_expr *new_expr(const int type, const me_expr *parameters[]) {
+static me_expr* new_expr(const int type, const me_expr* parameters[]) {
     const int arity = ARITY(type);
-    const int psize = sizeof(void *) * arity;
-    const int size = (sizeof(me_expr) - sizeof(void *)) + psize + (IS_CLOSURE(type) ? sizeof(void *) : 0);
-    me_expr *ret = malloc(size);
+    const int psize = sizeof(void*) * arity;
+    const int size = (sizeof(me_expr) - sizeof(void*)) + psize + (IS_CLOSURE(type) ? sizeof(void*) : 0);
+    me_expr* ret = malloc(size);
     CHECK_NULL(ret);
 
     memset(ret, 0, size);
@@ -689,63 +855,63 @@ static me_expr *new_expr(const int type, const me_expr *parameters[]) {
 }
 
 
-void me_free_parameters(me_expr *n) {
+void me_free_parameters(me_expr* n) {
     if (!n) return;
     switch (TYPE_MASK(n->type)) {
-        case ME_FUNCTION7:
-        case ME_CLOSURE7:
-            if (n->parameters[6] && ((me_expr *) n->parameters[6])->output &&
-                ((me_expr *) n->parameters[6])->output != n->output) {
-                free(((me_expr *) n->parameters[6])->output);
-            }
-            me_free(n->parameters[6]);
-        case ME_FUNCTION6:
-        case ME_CLOSURE6:
-            if (n->parameters[5] && ((me_expr *) n->parameters[5])->output &&
-                ((me_expr *) n->parameters[5])->output != n->output) {
-                free(((me_expr *) n->parameters[5])->output);
-            }
-            me_free(n->parameters[5]);
-        case ME_FUNCTION5:
-        case ME_CLOSURE5:
-            if (n->parameters[4] && ((me_expr *) n->parameters[4])->output &&
-                ((me_expr *) n->parameters[4])->output != n->output) {
-                free(((me_expr *) n->parameters[4])->output);
-            }
-            me_free(n->parameters[4]);
-        case ME_FUNCTION4:
-        case ME_CLOSURE4:
-            if (n->parameters[3] && ((me_expr *) n->parameters[3])->output &&
-                ((me_expr *) n->parameters[3])->output != n->output) {
-                free(((me_expr *) n->parameters[3])->output);
-            }
-            me_free(n->parameters[3]);
-        case ME_FUNCTION3:
-        case ME_CLOSURE3:
-            if (n->parameters[2] && ((me_expr *) n->parameters[2])->output &&
-                ((me_expr *) n->parameters[2])->output != n->output) {
-                free(((me_expr *) n->parameters[2])->output);
-            }
-            me_free(n->parameters[2]);
-        case ME_FUNCTION2:
-        case ME_CLOSURE2:
-            if (n->parameters[1] && ((me_expr *) n->parameters[1])->output &&
-                ((me_expr *) n->parameters[1])->output != n->output) {
-                free(((me_expr *) n->parameters[1])->output);
-            }
-            me_free(n->parameters[1]);
-        case ME_FUNCTION1:
-        case ME_CLOSURE1:
-            if (n->parameters[0] && ((me_expr *) n->parameters[0])->output &&
-                ((me_expr *) n->parameters[0])->output != n->output) {
-                free(((me_expr *) n->parameters[0])->output);
-            }
-            me_free(n->parameters[0]);
-    }
-}
-
-
-void me_free(me_expr *n) {
+    case ME_FUNCTION7:
+    case ME_CLOSURE7:
+        if (n->parameters[6] && ((me_expr*)n->parameters[6])->output &&
+            ((me_expr*)n->parameters[6])->output != n->output) {
+            free(((me_expr*)n->parameters[6])->output);
+        }
+        me_free(n->parameters[6]);
+    case ME_FUNCTION6:
+    case ME_CLOSURE6:
+        if (n->parameters[5] && ((me_expr*)n->parameters[5])->output &&
+            ((me_expr*)n->parameters[5])->output != n->output) {
+            free(((me_expr*)n->parameters[5])->output);
+        }
+        me_free(n->parameters[5]);
+    case ME_FUNCTION5:
+    case ME_CLOSURE5:
+        if (n->parameters[4] && ((me_expr*)n->parameters[4])->output &&
+            ((me_expr*)n->parameters[4])->output != n->output) {
+            free(((me_expr*)n->parameters[4])->output);
+        }
+        me_free(n->parameters[4]);
+    case ME_FUNCTION4:
+    case ME_CLOSURE4:
+        if (n->parameters[3] && ((me_expr*)n->parameters[3])->output &&
+            ((me_expr*)n->parameters[3])->output != n->output) {
+            free(((me_expr*)n->parameters[3])->output);
+        }
+        me_free(n->parameters[3]);
+    case ME_FUNCTION3:
+    case ME_CLOSURE3:
+        if (n->parameters[2] && ((me_expr*)n->parameters[2])->output &&
+            ((me_expr*)n->parameters[2])->output != n->output) {
+            free(((me_expr*)n->parameters[2])->output);
+        }
+        me_free(n->parameters[2]);
+    case ME_FUNCTION2:
+    case ME_CLOSURE2:
+        if (n->parameters[1] && ((me_expr*)n->parameters[1])->output &&
+            ((me_expr*)n->parameters[1])->output != n->output) {
+            free(((me_expr*)n->parameters[1])->output);
+        }
+        me_free(n->parameters[1]);
+    case ME_FUNCTION1:
+    case ME_CLOSURE1:
+        if (n->parameters[0] && ((me_expr*)n->parameters[0])->output &&
+            ((me_expr*)n->parameters[0])->output != n->output) {
+            free(((me_expr*)n->parameters[0])->output);
+        }
+        me_free(n->parameters[0]);
+    }
+}
+
+
+void me_free(me_expr* n) {
     if (!n) return;
     me_free_parameters(n);
     if (n->bytecode) {
@@ -770,7 +936,7 @@ static double log2_wrapper(double x) { return log2(x); }
 /* logaddexp: log(exp(a) + exp(b)), numerically stable */
 static double logaddexp(double a, double b) {
     if (a == b) {
-        return a + log1p(1.0);  // log(2*exp(a)) = a + log(2)
+        return a + log1p(1.0); // log(2*exp(a)) = a + log(2)
     }
     double max_val = (a > b) ? a : b;
     double min_val = (a > b) ? b : a;
@@ -818,20 +984,20 @@ static double fac(double a) {
         return NAN;
     if (a > UINT_MAX)
         return INFINITY;
-    unsigned int ua = (unsigned int) (a);
+    unsigned int ua = (unsigned int)(a);
     unsigned long int result = 1, i;
     for (i = 1; i <= ua; i++) {
         if (i > ULONG_MAX / result)
             return INFINITY;
         result *= i;
     }
-    return (double) result;
+    return (double)result;
 }
 
 static double ncr(double n, double r) {
     if (n < 0.0 || r < 0.0 || n < r) return NAN;
     if (n > UINT_MAX || r > UINT_MAX) return INFINITY;
-    unsigned long int un = (unsigned int) (n), ur = (unsigned int) (r), i;
+    unsigned long int un = (unsigned int)(n), ur = (unsigned int)(r), i;
     unsigned long int result = 1;
     if (ur > un / 2) ur = un - ur;
     for (i = 1; i <= ur; i++) {
@@ -856,6 +1022,8 @@ static const me_variable functions[] = {
     {"abs", 0, fabs, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"acos", 0, acos, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"acosh", 0, acosh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"all", 0, all_reduce, ME_FUNCTION1, 0},
+    {"any", 0, any_reduce, ME_FUNCTION1, 0},
     {"arccos", 0, acos, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"arccosh", 0, acosh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"arcsin", 0, asin, ME_FUNCTION1 | ME_FLAG_PURE, 0},
@@ -888,10 +1056,13 @@ static const me_variable functions[] = {
     {"log1p", 0, log1p_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"log2", 0, log2_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"logaddexp", 0, logaddexp, ME_FUNCTION2 | ME_FLAG_PURE, 0},
+    {"max", 0, max_reduce, ME_FUNCTION1, 0},
+    {"min", 0, min_reduce, ME_FUNCTION1, 0},
     {"ncr", 0, ncr, ME_FUNCTION2 | ME_FLAG_PURE, 0},
     {"npr", 0, npr, ME_FUNCTION2 | ME_FLAG_PURE, 0},
     {"pi", 0, pi, ME_FUNCTION0 | ME_FLAG_PURE, 0},
     {"pow", 0, pow, ME_FUNCTION2 | ME_FLAG_PURE, 0},
+    {"prod", 0, prod_reduce, ME_FUNCTION1, 0},
     {"real", 0, real_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"round", 0, round_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"sign", 0, sign, ME_FUNCTION1 | ME_FLAG_PURE, 0},
@@ -899,6 +1070,7 @@ static const me_variable functions[] = {
     {"sinh", 0, sinh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"sqrt", 0, sqrt, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"square", 0, square, ME_FUNCTION1 | ME_FLAG_PURE, 0},
+    {"sum", 0, sum_reduce, ME_FUNCTION1, 0},
     {"tan", 0, tan, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"tanh", 0, tanh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
     {"trunc", 0, trunc_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
@@ -906,7 +1078,7 @@ static const me_variable functions[] = {
     {0, 0, 0, 0, 0}
 };
 
-static const me_variable *find_builtin(const char *name, int len) {
+static const me_variable* find_builtin(const char* name, int len) {
     int imin = 0;
     int imax = sizeof(functions) / sizeof(me_variable) - 2;
 
@@ -917,9 +1089,11 @@ static const me_variable *find_builtin(const char *name, int len) {
         if (!c) c = '\0' - functions[i].name[len];
         if (c == 0) {
             return functions + i;
-        } else if (c > 0) {
+        }
+        else if (c > 0) {
             imin = i + 1;
-        } else {
+        }
+        else {
             imax = i - 1;
         }
     }
@@ -927,9 +1101,9 @@ static const me_variable *find_builtin(const char *name, int len) {
     return 0;
 }
 
-static const me_variable *find_lookup(const state *s, const char *name, int len) {
+static const me_variable* find_lookup(const state* s, const char* name, int len) {
     int iters;
-    const me_variable *var;
+    const me_variable* var;
     if (!s->lookup) return 0;
 
     for (var = s->lookup, iters = s->lookup_len; iters; ++var, --iters) {
@@ -946,405 +1120,1714 @@ static double sub(double a, double b) { return a - b; }
 static double mul(double a, double b) { return a * b; }
 static double divide(double a, double b) { return a / b; }
 static double negate(double a) { return -a; }
-
-static double comma(double a, double b) {
-    (void) a;
-    return b;
-}
-
-/* Bitwise operators (for integer types) */
-static double bit_and(double a, double b) { return (double) ((int64_t) a & (int64_t) b); }
-static double bit_or(double a, double b) { return (double) ((int64_t) a | (int64_t) b); }
-static double bit_xor(double a, double b) { return (double) ((int64_t) a ^ (int64_t) b); }
-static double bit_not(double a) { return (double) (~(int64_t) a); }
-static double bit_shl(double a, double b) { return (double) ((int64_t) a << (int64_t) b); }
-static double bit_shr(double a, double b) { return (double) ((int64_t) a >> (int64_t) b); }
-
-/* Comparison operators (return 1.0 for true, 0.0 for false) */
-static double cmp_eq(double a, double b) { return a == b ? 1.0 : 0.0; }
-static double cmp_ne(double a, double b) { return a != b ? 1.0 : 0.0; }
-static double cmp_lt(double a, double b) { return a < b ? 1.0 : 0.0; }
-static double cmp_le(double a, double b) { return a <= b ? 1.0 : 0.0; }
-static double cmp_gt(double a, double b) { return a > b ? 1.0 : 0.0; }
-static double cmp_ge(double a, double b) { return a >= b ? 1.0 : 0.0; }
-
-/* Logical operators (for bool type) - short-circuit via OR/AND */
-static double logical_and(double a, double b) { return ((int) a) && ((int) b) ? 1.0 : 0.0; }
-static double logical_or(double a, double b) { return ((int) a) || ((int) b) ? 1.0 : 0.0; }
-static double logical_not(double a) { return !(int) a ? 1.0 : 0.0; }
-static double logical_xor(double a, double b) { return ((int) a) != ((int) b) ? 1.0 : 0.0; }
-
-static bool is_identifier_start(char c) {
-    return isalpha((unsigned char) c) || c == '_';
+static volatile double sum_salt = 0.0;
+static volatile double prod_salt = 1.0;
+static volatile double min_salt = 0.0;
+static volatile double max_salt = 0.0;
+static volatile double any_salt = 0.0;
+static volatile double all_salt = 0.0;
+static double sum_reduce(double x) { return x + sum_salt; }
+static double prod_reduce(double x) { return x * prod_salt; }
+static double any_reduce(double x) { return x + any_salt; }
+static double all_reduce(double x) { return x * (1.0 + all_salt); }
+static double min_reduce(double x) { return x + min_salt; }
+static double max_reduce(double x) { return x - max_salt; }
+
+static float reduce_min_float32_nan_safe(const float* data, int nitems) {
+    if (nitems <= 0) return INFINITY;
+#if defined(__AVX__) || defined(__AVX2__)
+    int i = 0;
+    __m256 vmin = _mm256_set1_ps(INFINITY);
+    __m256 vnan = _mm256_setzero_ps();
+    const int limit = nitems & ~7;
+    for (; i < limit; i += 8) {
+        __m256 v = _mm256_loadu_ps(data + i);
+        vnan = _mm256_or_ps(vnan, _mm256_cmp_ps(v, v, _CMP_UNORD_Q));
+        vmin = _mm256_min_ps(vmin, v);
+    }
+    __m128 low = _mm256_castps256_ps128(vmin);
+    __m128 high = _mm256_extractf128_ps(vmin, 1);
+    __m128 min128 = _mm_min_ps(low, high);
+    __m128 tmp = _mm_min_ps(min128, _mm_movehl_ps(min128, min128));
+    tmp = _mm_min_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1));
+    float acc = _mm_cvtss_f32(tmp);
+    if (_mm256_movemask_ps(vnan)) return NAN;
+    for (; i < nitems; i++) {
+        float v = data[i];
+        if (v != v) return v;
+        if (v < acc) acc = v;
+    }
+    return acc;
+#elif defined(__SSE__)
+    int i = 0;
+    __m128 vmin = _mm_set1_ps(INFINITY);
+    __m128 vnan = _mm_setzero_ps();
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        __m128 v = _mm_loadu_ps(data + i);
+        vnan = _mm_or_ps(vnan, _mm_cmpunord_ps(v, v));
+        vmin = _mm_min_ps(vmin, v);
+    }
+    __m128 tmp = _mm_min_ps(vmin, _mm_movehl_ps(vmin, vmin));
+    tmp = _mm_min_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1));
+    float acc = _mm_cvtss_f32(tmp);
+    if (_mm_movemask_ps(vnan)) return NAN;
+    for (; i < nitems; i++) {
+        float v = data[i];
+        if (v != v) return v;
+        if (v < acc) acc = v;
+    }
+    return acc;
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+    int i = 0;
+    float32x4_t vmin = vdupq_n_f32(INFINITY);
+    uint32x4_t vnan = vdupq_n_u32(0);
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        float32x4_t v = vld1q_f32(data + i);
+        uint32x4_t eq = vceqq_f32(v, v);
+        vnan = vorrq_u32(vnan, vmvnq_u32(eq));
+        vmin = vminq_f32(vmin, v);
+    }
+#if defined(__aarch64__)
+    float acc = vminvq_f32(vmin);
+#else
+    float32x2_t min2 = vmin_f32(vget_low_f32(vmin), vget_high_f32(vmin));
+    min2 = vpmin_f32(min2, min2);
+    float acc = vget_lane_f32(min2, 0);
+#endif
+    uint32x2_t nan2 = vorr_u32(vget_low_u32(vnan), vget_high_u32(vnan));
+    nan2 = vpadd_u32(nan2, nan2);
+    if (vget_lane_u32(nan2, 0)) return NAN;
+    for (; i < nitems; i++) {
+        float v = data[i];
+        if (v != v) return v;
+        if (v < acc) acc = v;
+    }
+    return acc;
+#else
+    float acc = data[0];
+    for (int i = 0; i < nitems; i++) {
+        float v = data[i];
+        if (v != v) return v;
+        if (v < acc) acc = v;
+    }
+    return acc;
+#endif
 }
 
-static bool is_identifier_char(char c) {
-    return isalnum((unsigned char) c) || c == '_';
+static float reduce_max_float32_nan_safe(const float* data, int nitems) {
+    if (nitems <= 0) return -INFINITY;
+#if defined(__AVX__) || defined(__AVX2__)
+    int i = 0;
+    __m256 vmax = _mm256_set1_ps(-INFINITY);
+    __m256 vnan = _mm256_setzero_ps();
+    const int limit = nitems & ~7;
+    for (; i < limit; i += 8) {
+        __m256 v = _mm256_loadu_ps(data + i);
+        vnan = _mm256_or_ps(vnan, _mm256_cmp_ps(v, v, _CMP_UNORD_Q));
+        vmax = _mm256_max_ps(vmax, v);
+    }
+    __m128 low = _mm256_castps256_ps128(vmax);
+    __m128 high = _mm256_extractf128_ps(vmax, 1);
+    __m128 max128 = _mm_max_ps(low, high);
+    __m128 tmp = _mm_max_ps(max128, _mm_movehl_ps(max128, max128));
+    tmp = _mm_max_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1));
+    float acc = _mm_cvtss_f32(tmp);
+    if (_mm256_movemask_ps(vnan)) return NAN;
+    for (; i < nitems; i++) {
+        float v = data[i];
+        if (v != v) return v;
+        if (v > acc) acc = v;
+    }
+    return acc;
+#elif defined(__SSE__)
+    int i = 0;
+    __m128 vmax = _mm_set1_ps(-INFINITY);
+    __m128 vnan = _mm_setzero_ps();
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        __m128 v = _mm_loadu_ps(data + i);
+        vnan = _mm_or_ps(vnan, _mm_cmpunord_ps(v, v));
+        vmax = _mm_max_ps(vmax, v);
+    }
+    __m128 tmp = _mm_max_ps(vmax, _mm_movehl_ps(vmax, vmax));
+    tmp = _mm_max_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1));
+    float acc = _mm_cvtss_f32(tmp);
+    if (_mm_movemask_ps(vnan)) return NAN;
+    for (; i < nitems; i++) {
+        float v = data[i];
+        if (v != v) return v;
+        if (v > acc) acc = v;
+    }
+    return acc;
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+    int i = 0;
+    float32x4_t vmax = vdupq_n_f32(-INFINITY);
+    uint32x4_t vnan = vdupq_n_u32(0);
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        float32x4_t v = vld1q_f32(data + i);
+        uint32x4_t eq = vceqq_f32(v, v);
+        vnan = vorrq_u32(vnan, vmvnq_u32(eq));
+        vmax = vmaxq_f32(vmax, v);
+    }
+#if defined(__aarch64__)
+    float acc = vmaxvq_f32(vmax);
+#else
+    float32x2_t max2 = vmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
+    max2 = vpmax_f32(max2, max2);
+    float acc = vget_lane_f32(max2, 0);
+#endif
+    uint32x2_t nan2 = vorr_u32(vget_low_u32(vnan), vget_high_u32(vnan));
+    nan2 = vpadd_u32(nan2, nan2);
+    if (vget_lane_u32(nan2, 0)) return NAN;
+    for (; i < nitems; i++) {
+        float v = data[i];
+        if (v != v) return v;
+        if (v > acc) acc = v;
+    }
+    return acc;
+#else
+    float acc = data[0];
+    for (int i = 0; i < nitems; i++) {
+        float v = data[i];
+        if (v != v) return v;
+        if (v > acc) acc = v;
+    }
+    return acc;
+#endif
 }
 
-static void skip_whitespace(state *s) {
-    while (*s->next && isspace((unsigned char) *s->next)) {
-        s->next++;
+static double reduce_min_float64_nan_safe(const double* data, int nitems) {
+    if (nitems <= 0) return INFINITY;
+#if defined(__AVX__) || defined(__AVX2__)
+    int i = 0;
+    __m256d vmin = _mm256_set1_pd(INFINITY);
+    __m256d vnan = _mm256_setzero_pd();
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        __m256d v = _mm256_loadu_pd(data + i);
+        vnan = _mm256_or_pd(vnan, _mm256_cmp_pd(v, v, _CMP_UNORD_Q));
+        vmin = _mm256_min_pd(vmin, v);
+    }
+    __m128d low = _mm256_castpd256_pd128(vmin);
+    __m128d high = _mm256_extractf128_pd(vmin, 1);
+    __m128d min128 = _mm_min_pd(low, high);
+    min128 = _mm_min_sd(min128, _mm_unpackhi_pd(min128, min128));
+    double acc = _mm_cvtsd_f64(min128);
+    if (_mm256_movemask_pd(vnan)) return NAN;
+    for (; i < nitems; i++) {
+        double v = data[i];
+        if (v != v) return v;
+        if (v < acc) acc = v;
+    }
+    return acc;
+#elif defined(__SSE2__)
+    int i = 0;
+    __m128d vmin = _mm_set1_pd(INFINITY);
+    __m128d vnan = _mm_setzero_pd();
+    const int limit = nitems & ~1;
+    for (; i < limit; i += 2) {
+        __m128d v = _mm_loadu_pd(data + i);
+        vnan = _mm_or_pd(vnan, _mm_cmpunord_pd(v, v));
+        vmin = _mm_min_pd(vmin, v);
+    }
+    vmin = _mm_min_sd(vmin, _mm_unpackhi_pd(vmin, vmin));
+    double acc = _mm_cvtsd_f64(vmin);
+    if (_mm_movemask_pd(vnan)) return NAN;
+    for (; i < nitems; i++) {
+        double v = data[i];
+        if (v != v) return v;
+        if (v < acc) acc = v;
     }
+    return acc;
+#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__aarch64__)
+    int i = 0;
+    float64x2_t vmin = vdupq_n_f64(INFINITY);
+    uint64x2_t vnan = vdupq_n_u64(0);
+    const int limit = nitems & ~1;
+    for (; i < limit; i += 2) {
+        float64x2_t v = vld1q_f64(data + i);
+        uint64x2_t eq = vceqq_f64(v, v);
+        vnan = vorrq_u64(vnan, veorq_u64(eq, vdupq_n_u64(~0ULL)));
+        vmin = vminq_f64(vmin, v);
+    }
+    double acc = vminvq_f64(vmin);
+    uint64x2_t nan_or = vorrq_u64(vnan, vextq_u64(vnan, vnan, 1));
+    if (vgetq_lane_u64(nan_or, 0)) return NAN;
+    for (; i < nitems; i++) {
+        double v = data[i];
+        if (v != v) return v;
+        if (v < acc) acc = v;
+    }
+    return acc;
+#else
+    double acc = data[0];
+    for (int i = 0; i < nitems; i++) {
+        double v = data[i];
+        if (v != v) return v;
+        if (v < acc) acc = v;
+    }
+    return acc;
+#endif
 }
 
-static void read_number_token(state *s) {
-    const char *start = s->next;
-    s->value = strtod(s->next, (char **) &s->next);
-    s->type = TOK_NUMBER;
-
-    // Determine if it is a floating point or integer constant
-    bool is_float = false;
-    for (const char *p = start; p < s->next; p++) {
-        if (*p == '.' || *p == 'e' || *p == 'E') {
-            is_float = true;
-            break;
-        }
+static double reduce_max_float64_nan_safe(const double* data, int nitems) {
+    if (nitems <= 0) return -INFINITY;
+#if defined(__AVX__) || defined(__AVX2__)
+    int i = 0;
+    __m256d vmax = _mm256_set1_pd(-INFINITY);
+    __m256d vnan = _mm256_setzero_pd();
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        __m256d v = _mm256_loadu_pd(data + i);
+        vnan = _mm256_or_pd(vnan, _mm256_cmp_pd(v, v, _CMP_UNORD_Q));
+        vmax = _mm256_max_pd(vmax, v);
     }
-
-    if (is_float) {
-        // Match NumPy conventions: float constants match target_dtype when it's a float type
-        // This ensures FLOAT32 arrays + float constants -> FLOAT32 (NumPy behavior)
-        if (s->target_dtype == ME_FLOAT32) {
-            s->dtype = ME_FLOAT32;
-        } else {
-            s->dtype = ME_FLOAT64;
-        }
-    } else {
-        // For integers, we use a heuristic
-        if (s->value > INT_MAX || s->value < INT_MIN) {
-            s->dtype = ME_INT64;
-        } else {
-            // Use target_dtype if it's an integer type, otherwise default to INT32
-            if (is_integer_dtype(s->target_dtype)) {
-                s->dtype = s->target_dtype;
-            } else {
-                s->dtype = ME_INT32;
-            }
-        }
+    __m128d low = _mm256_castpd256_pd128(vmax);
+    __m128d high = _mm256_extractf128_pd(vmax, 1);
+    __m128d max128 = _mm_max_pd(low, high);
+    max128 = _mm_max_sd(max128, _mm_unpackhi_pd(max128, max128));
+    double acc = _mm_cvtsd_f64(max128);
+    if (_mm256_movemask_pd(vnan)) return NAN;
+    for (; i < nitems; i++) {
+        double v = data[i];
+        if (v != v) return v;
+        if (v > acc) acc = v;
+    }
+    return acc;
+#elif defined(__SSE2__)
+    int i = 0;
+    __m128d vmax = _mm_set1_pd(-INFINITY);
+    __m128d vnan = _mm_setzero_pd();
+    const int limit = nitems & ~1;
+    for (; i < limit; i += 2) {
+        __m128d v = _mm_loadu_pd(data + i);
+        vnan = _mm_or_pd(vnan, _mm_cmpunord_pd(v, v));
+        vmax = _mm_max_pd(vmax, v);
+    }
+    vmax = _mm_max_sd(vmax, _mm_unpackhi_pd(vmax, vmax));
+    double acc = _mm_cvtsd_f64(vmax);
+    if (_mm_movemask_pd(vnan)) return NAN;
+    for (; i < nitems; i++) {
+        double v = data[i];
+        if (v != v) return v;
+        if (v > acc) acc = v;
     }
+    return acc;
+#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__aarch64__)
+    int i = 0;
+    float64x2_t vmax = vdupq_n_f64(-INFINITY);
+    uint64x2_t vnan = vdupq_n_u64(0);
+    const int limit = nitems & ~1;
+    for (; i < limit; i += 2) {
+        float64x2_t v = vld1q_f64(data + i);
+        uint64x2_t eq = vceqq_f64(v, v);
+        vnan = vorrq_u64(vnan, veorq_u64(eq, vdupq_n_u64(~0ULL)));
+        vmax = vmaxq_f64(vmax, v);
+    }
+    double acc = vmaxvq_f64(vmax);
+    uint64x2_t nan_or = vorrq_u64(vnan, vextq_u64(vnan, vnan, 1));
+    if (vgetq_lane_u64(nan_or, 0)) return NAN;
+    for (; i < nitems; i++) {
+        double v = data[i];
+        if (v != v) return v;
+        if (v > acc) acc = v;
+    }
+    return acc;
+#else
+    double acc = data[0];
+    for (int i = 0; i < nitems; i++) {
+        double v = data[i];
+        if (v != v) return v;
+        if (v > acc) acc = v;
+    }
+    return acc;
+#endif
 }
 
-static void read_identifier_token(state *s) {
-    const char *start = s->next;
-    while (is_identifier_char(*s->next)) {
-        s->next++;
+static int32_t reduce_min_int32(const int32_t* data, int nitems) {
+    if (nitems <= 0) return INT32_MAX;
+#if defined(__AVX2__)
+    int i = 0;
+    __m256i vmin = _mm256_set1_epi32(INT32_MAX);
+    const int limit = nitems & ~7;
+    for (; i < limit; i += 8) {
+        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
+        vmin = _mm256_min_epi32(vmin, v);
     }
-
-    const me_variable *var = find_lookup(s, start, s->next - start);
-    if (!var) {
-        var = find_builtin(start, s->next - start);
+    int32_t tmp[8];
+    _mm256_storeu_si256((__m256i*)tmp, vmin);
+    int32_t acc = tmp[0];
+    for (int j = 1; j < 8; j++) {
+        if (tmp[j] < acc) acc = tmp[j];
     }
-
-    if (!var) {
-        s->type = TOK_ERROR;
-        return;
+    for (; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
     }
-
-    switch (TYPE_MASK(var->type)) {
-        case ME_VARIABLE:
-            s->type = TOK_VARIABLE;
-            s->bound = var->address;
-            s->dtype = var->dtype;
-            break;
-
-        case ME_CLOSURE0:
-        case ME_CLOSURE1:
-        case ME_CLOSURE2:
-        case ME_CLOSURE3:
-        case ME_CLOSURE4:
-        case ME_CLOSURE5:
-        case ME_CLOSURE6:
-        case ME_CLOSURE7:
-            s->context = var->context;
-        /* Falls through. */
-        case ME_FUNCTION0:
-        case ME_FUNCTION1:
-        case ME_FUNCTION2:
-        case ME_FUNCTION3:
-        case ME_FUNCTION4:
-        case ME_FUNCTION5:
-        case ME_FUNCTION6:
-        case ME_FUNCTION7:
-            s->type = var->type;
-            s->function = var->address;
-            break;
+    return acc;
+#elif defined(__SSE4_1__)
+    int i = 0;
+    __m128i vmin = _mm_set1_epi32(INT32_MAX);
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        __m128i v = _mm_loadu_si128((const __m128i*)(data + i));
+        vmin = _mm_min_epi32(vmin, v);
+    }
+    int32_t tmp[4];
+    _mm_storeu_si128((__m128i*)tmp, vmin);
+    int32_t acc = tmp[0];
+    for (int j = 1; j < 4; j++) {
+        if (tmp[j] < acc) acc = tmp[j];
+    }
+    for (; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
+    }
+    return acc;
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+    int i = 0;
+    int32x4_t vmin = vdupq_n_s32(INT32_MAX);
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        int32x4_t v = vld1q_s32(data + i);
+        vmin = vminq_s32(vmin, v);
+    }
+#if defined(__aarch64__)
+    int32_t acc = vminvq_s32(vmin);
+#else
+    int32x2_t min2 = vmin_s32(vget_low_s32(vmin), vget_high_s32(vmin));
+    min2 = vpmin_s32(min2, min2);
+    int32_t acc = vget_lane_s32(min2, 0);
+#endif
+    for (; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
+    }
+    return acc;
+#else
+    int32_t acc = data[0];
+    for (int i = 1; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
     }
+    return acc;
+#endif
 }
 
-typedef struct {
-    const char *literal;
-    int token_type;
-    me_fun2 function;
-} operator_spec;
-
-static bool handle_multi_char_operator(state *s) {
-    static const operator_spec multi_ops[] = {
-        {"**", TOK_POW, pow},
-        {"<<", TOK_SHIFT, bit_shl},
-        {">>", TOK_SHIFT, bit_shr},
-        {"==", TOK_COMPARE, cmp_eq},
-        {"!=", TOK_COMPARE, cmp_ne},
-        {"<=", TOK_COMPARE, cmp_le},
-        {">=", TOK_COMPARE, cmp_ge},
-    };
-
-    for (size_t i = 0; i < sizeof(multi_ops) / sizeof(multi_ops[0]); i++) {
-        const operator_spec *op = &multi_ops[i];
-        size_t len = strlen(op->literal);
-        if (strncmp(s->next, op->literal, len) == 0) {
-            s->type = op->token_type;
-            s->function = op->function;
-            s->next += len;
-            return true;
-        }
+static int32_t reduce_max_int32(const int32_t* data, int nitems) {
+    if (nitems <= 0) return INT32_MIN;
+#if defined(__AVX2__)
+    int i = 0;
+    __m256i vmax = _mm256_set1_epi32(INT32_MIN);
+    const int limit = nitems & ~7;
+    for (; i < limit; i += 8) {
+        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
+        vmax = _mm256_max_epi32(vmax, v);
     }
-    return false;
+    int32_t tmp[8];
+    _mm256_storeu_si256((__m256i*)tmp, vmax);
+    int32_t acc = tmp[0];
+    for (int j = 1; j < 8; j++) {
+        if (tmp[j] > acc) acc = tmp[j];
+    }
+    for (; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+#elif defined(__SSE4_1__)
+    int i = 0;
+    __m128i vmax = _mm_set1_epi32(INT32_MIN);
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        __m128i v = _mm_loadu_si128((const __m128i*)(data + i));
+        vmax = _mm_max_epi32(vmax, v);
+    }
+    int32_t tmp[4];
+    _mm_storeu_si128((__m128i*)tmp, vmax);
+    int32_t acc = tmp[0];
+    for (int j = 1; j < 4; j++) {
+        if (tmp[j] > acc) acc = tmp[j];
+    }
+    for (; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+    int i = 0;
+    int32x4_t vmax = vdupq_n_s32(INT32_MIN);
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        int32x4_t v = vld1q_s32(data + i);
+        vmax = vmaxq_s32(vmax, v);
+    }
+#if defined(__aarch64__)
+    int32_t acc = vmaxvq_s32(vmax);
+#else
+    int32x2_t max2 = vmax_s32(vget_low_s32(vmax), vget_high_s32(vmax));
+    max2 = vpmax_s32(max2, max2);
+    int32_t acc = vget_lane_s32(max2, 0);
+#endif
+    for (; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+#else
+    int32_t acc = data[0];
+    for (int i = 1; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+#endif
 }
 
-static void handle_single_char_operator(state *s, char c) {
-    s->next++;
-    switch (c) {
-        case '+': s->type = TOK_INFIX;
-            s->function = add;
-            break;
-        case '-': s->type = TOK_INFIX;
-            s->function = sub;
-            break;
-        case '*': s->type = TOK_INFIX;
-            s->function = mul;
-            break;
-        case '/': s->type = TOK_INFIX;
-            s->function = divide;
-            break;
-        case '%': s->type = TOK_INFIX;
-            s->function = fmod;
-            break;
-        case '&': s->type = TOK_BITWISE;
-            s->function = bit_and;
-            break;
-        case '|': s->type = TOK_BITWISE;
-            s->function = bit_or;
-            break;
-        case '^': s->type = TOK_BITWISE;
-            s->function = bit_xor;
-            break;
-        case '~': s->type = TOK_BITWISE;
-            s->function = bit_not;
-            break;
-        case '<': s->type = TOK_COMPARE;
-            s->function = cmp_lt;
-            break;
-        case '>': s->type = TOK_COMPARE;
-            s->function = cmp_gt;
-            break;
-        case '(': s->type = TOK_OPEN;
-            break;
-        case ')': s->type = TOK_CLOSE;
-            break;
-        case ',': s->type = TOK_SEP;
-            break;
-        default: s->type = TOK_ERROR;
-            break;
+static int8_t reduce_min_int8(const int8_t* data, int nitems) {
+    if (nitems <= 0) return INT8_MAX;
+#if defined(__AVX2__)
+    int i = 0;
+    __m256i vmin = _mm256_set1_epi8(INT8_MAX);
+    const int limit = nitems & ~31;
+    for (; i < limit; i += 32) {
+        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
+        vmin = _mm256_min_epi8(vmin, v);
+    }
+    int8_t tmp[32];
+    _mm256_storeu_si256((__m256i*)tmp, vmin);
+    int8_t acc = tmp[0];
+    for (int j = 1; j < 32; j++) {
+        if (tmp[j] < acc) acc = tmp[j];
+    }
+    for (; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
+    }
+    return acc;
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+    int i = 0;
+    int8x16_t vmin = vdupq_n_s8(INT8_MAX);
+    const int limit = nitems & ~15;
+    for (; i < limit; i += 16) {
+        int8x16_t v = vld1q_s8(data + i);
+        vmin = vminq_s8(vmin, v);
+    }
+#if defined(__aarch64__)
+    int8_t acc = vminvq_s8(vmin);
+#else
+    int8x8_t min8 = vmin_s8(vget_low_s8(vmin), vget_high_s8(vmin));
+    min8 = vpmin_s8(min8, min8);
+    min8 = vpmin_s8(min8, min8);
+    int8_t acc = vget_lane_s8(min8, 0);
+#endif
+    for (; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
+    }
+    return acc;
+#else
+    int8_t acc = data[0];
+    for (int i = 1; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
     }
+    return acc;
+#endif
 }
 
-static void read_operator_token(state *s) {
-    if (handle_multi_char_operator(s)) {
-        return;
+static int8_t reduce_max_int8(const int8_t* data, int nitems) {
+    if (nitems <= 0) return INT8_MIN;
+#if defined(__AVX2__)
+    int i = 0;
+    __m256i vmax = _mm256_set1_epi8(INT8_MIN);
+    const int limit = nitems & ~31;
+    for (; i < limit; i += 32) {
+        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
+        vmax = _mm256_max_epi8(vmax, v);
     }
-
-    if (!*s->next) {
-        s->type = TOK_END;
-        return;
+    int8_t tmp[32];
+    _mm256_storeu_si256((__m256i*)tmp, vmax);
+    int8_t acc = tmp[0];
+    for (int j = 1; j < 32; j++) {
+        if (tmp[j] > acc) acc = tmp[j];
     }
-
-    handle_single_char_operator(s, *s->next);
+    for (; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+    int i = 0;
+    int8x16_t vmax = vdupq_n_s8(INT8_MIN);
+    const int limit = nitems & ~15;
+    for (; i < limit; i += 16) {
+        int8x16_t v = vld1q_s8(data + i);
+        vmax = vmaxq_s8(vmax, v);
+    }
+#if defined(__aarch64__)
+    int8_t acc = vmaxvq_s8(vmax);
+#else
+    int8x8_t max8 = vmax_s8(vget_low_s8(vmax), vget_high_s8(vmax));
+    max8 = vpmax_s8(max8, max8);
+    max8 = vpmax_s8(max8, max8);
+    int8_t acc = vget_lane_s8(max8, 0);
+#endif
+    for (; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+#else
+    int8_t acc = data[0];
+    for (int i = 1; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+#endif
 }
 
-void next_token(state *s) {
-    s->type = TOK_NULL;
-
-    do {
-        skip_whitespace(s);
-
-        if (!*s->next) {
+static int16_t reduce_min_int16(const int16_t* data, int nitems) {
+    if (nitems <= 0) return INT16_MAX;
+#if defined(__AVX2__)
+    int i = 0;
+    __m256i vmin = _mm256_set1_epi16(INT16_MAX);
+    const int limit = nitems & ~15;
+    for (; i < limit; i += 16) {
+        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
+        vmin = _mm256_min_epi16(vmin, v);
+    }
+    int16_t tmp[16];
+    _mm256_storeu_si256((__m256i*)tmp, vmin);
+    int16_t acc = tmp[0];
+    for (int j = 1; j < 16; j++) {
+        if (tmp[j] < acc) acc = tmp[j];
+    }
+    for (; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
+    }
+    return acc;
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+    int i = 0;
+    int16x8_t vmin = vdupq_n_s16(INT16_MAX);
+    const int limit = nitems & ~7;
+    for (; i < limit; i += 8) {
+        int16x8_t v = vld1q_s16(data + i);
+        vmin = vminq_s16(vmin, v);
+    }
+#if defined(__aarch64__)
+    int16_t acc = vminvq_s16(vmin);
+#else
+    int16x4_t min4 = vmin_s16(vget_low_s16(vmin), vget_high_s16(vmin));
+    min4 = vpmin_s16(min4, min4);
+    min4 = vpmin_s16(min4, min4);
+    int16_t acc = vget_lane_s16(min4, 0);
+#endif
+    for (; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
+    }
+    return acc;
+#else
+    int16_t acc = data[0];
+    for (int i = 1; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
+    }
+    return acc;
+#endif
+}
+
+static int16_t reduce_max_int16(const int16_t* data, int nitems) {
+    if (nitems <= 0) return INT16_MIN;
+#if defined(__AVX2__)
+    int i = 0;
+    __m256i vmax = _mm256_set1_epi16(INT16_MIN);
+    const int limit = nitems & ~15;
+    for (; i < limit; i += 16) {
+        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
+        vmax = _mm256_max_epi16(vmax, v);
+    }
+    int16_t tmp[16];
+    _mm256_storeu_si256((__m256i*)tmp, vmax);
+    int16_t acc = tmp[0];
+    for (int j = 1; j < 16; j++) {
+        if (tmp[j] > acc) acc = tmp[j];
+    }
+    for (; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+    int i = 0;
+    int16x8_t vmax = vdupq_n_s16(INT16_MIN);
+    const int limit = nitems & ~7;
+    for (; i < limit; i += 8) {
+        int16x8_t v = vld1q_s16(data + i);
+        vmax = vmaxq_s16(vmax, v);
+    }
+#if defined(__aarch64__)
+    int16_t acc = vmaxvq_s16(vmax);
+#else
+    int16x4_t max4 = vmax_s16(vget_low_s16(vmax), vget_high_s16(vmax));
+    max4 = vpmax_s16(max4, max4);
+    max4 = vpmax_s16(max4, max4);
+    int16_t acc = vget_lane_s16(max4, 0);
+#endif
+    for (; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+#else
+    int16_t acc = data[0];
+    for (int i = 1; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+#endif
+}
+
+static int64_t reduce_min_int64(const int64_t* data, int nitems) {
+    if (nitems <= 0) return INT64_MAX;
+    int64_t acc = data[0];
+    for (int i = 1; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
+    }
+    return acc;
+}
+
+static int64_t reduce_max_int64(const int64_t* data, int nitems) {
+    if (nitems <= 0) return INT64_MIN;
+    int64_t acc = data[0];
+    for (int i = 1; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+}
+
+static uint8_t reduce_min_uint8(const uint8_t* data, int nitems) {
+    if (nitems <= 0) return UINT8_MAX;
+#if defined(__AVX2__)
+    int i = 0;
+    __m256i vmin = _mm256_set1_epi8((char)UINT8_MAX);
+    const int limit = nitems & ~31;
+    for (; i < limit; i += 32) {
+        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
+        vmin = _mm256_min_epu8(vmin, v);
+    }
+    uint8_t tmp[32];
+    _mm256_storeu_si256((__m256i*)tmp, vmin);
+    uint8_t acc = tmp[0];
+    for (int j = 1; j < 32; j++) {
+        if (tmp[j] < acc) acc = tmp[j];
+    }
+    for (; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
+    }
+    return acc;
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+    int i = 0;
+    uint8x16_t vmin = vdupq_n_u8(UINT8_MAX);
+    const int limit = nitems & ~15;
+    for (; i < limit; i += 16) {
+        uint8x16_t v = vld1q_u8(data + i);
+        vmin = vminq_u8(vmin, v);
+    }
+#if defined(__aarch64__)
+    uint8_t acc = vminvq_u8(vmin);
+#else
+    uint8x8_t min8 = vmin_u8(vget_low_u8(vmin), vget_high_u8(vmin));
+    min8 = vpmin_u8(min8, min8);
+    min8 = vpmin_u8(min8, min8);
+    uint8_t acc = vget_lane_u8(min8, 0);
+#endif
+    for (; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
+    }
+    return acc;
+#else
+    uint8_t acc = data[0];
+    for (int i = 1; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
+    }
+    return acc;
+#endif
+}
+
+static uint8_t reduce_max_uint8(const uint8_t* data, int nitems) {
+    if (nitems <= 0) return 0;
+#if defined(__AVX2__)
+    int i = 0;
+    __m256i vmax = _mm256_setzero_si256();
+    const int limit = nitems & ~31;
+    for (; i < limit; i += 32) {
+        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
+        vmax = _mm256_max_epu8(vmax, v);
+    }
+    uint8_t tmp[32];
+    _mm256_storeu_si256((__m256i*)tmp, vmax);
+    uint8_t acc = tmp[0];
+    for (int j = 1; j < 32; j++) {
+        if (tmp[j] > acc) acc = tmp[j];
+    }
+    for (; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+    int i = 0;
+    uint8x16_t vmax = vdupq_n_u8(0);
+    const int limit = nitems & ~15;
+    for (; i < limit; i += 16) {
+        uint8x16_t v = vld1q_u8(data + i);
+        vmax = vmaxq_u8(vmax, v);
+    }
+#if defined(__aarch64__)
+    uint8_t acc = vmaxvq_u8(vmax);
+#else
+    uint8x8_t max8 = vmax_u8(vget_low_u8(vmax), vget_high_u8(vmax));
+    max8 = vpmax_u8(max8, max8);
+    max8 = vpmax_u8(max8, max8);
+    uint8_t acc = vget_lane_u8(max8, 0);
+#endif
+    for (; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+#else
+    uint8_t acc = data[0];
+    for (int i = 1; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+#endif
+}
+
+static uint16_t reduce_min_uint16(const uint16_t* data, int nitems) {
+    if (nitems <= 0) return UINT16_MAX;
+#if defined(__AVX2__)
+    int i = 0;
+    __m256i vmin = _mm256_set1_epi16((short)UINT16_MAX);
+    const int limit = nitems & ~15;
+    for (; i < limit; i += 16) {
+        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
+        vmin = _mm256_min_epu16(vmin, v);
+    }
+    uint16_t tmp[16];
+    _mm256_storeu_si256((__m256i*)tmp, vmin);
+    uint16_t acc = tmp[0];
+    for (int j = 1; j < 16; j++) {
+        if (tmp[j] < acc) acc = tmp[j];
+    }
+    for (; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
+    }
+    return acc;
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+    int i = 0;
+    uint16x8_t vmin = vdupq_n_u16(UINT16_MAX);
+    const int limit = nitems & ~7;
+    for (; i < limit; i += 8) {
+        uint16x8_t v = vld1q_u16(data + i);
+        vmin = vminq_u16(vmin, v);
+    }
+#if defined(__aarch64__)
+    uint16_t acc = vminvq_u16(vmin);
+#else
+    uint16x4_t min4 = vmin_u16(vget_low_u16(vmin), vget_high_u16(vmin));
+    min4 = vpmin_u16(min4, min4);
+    min4 = vpmin_u16(min4, min4);
+    uint16_t acc = vget_lane_u16(min4, 0);
+#endif
+    for (; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
+    }
+    return acc;
+#else
+    uint16_t acc = data[0];
+    for (int i = 1; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
+    }
+    return acc;
+#endif
+}
+
+static uint16_t reduce_max_uint16(const uint16_t* data, int nitems) {
+    if (nitems <= 0) return 0;
+#if defined(__AVX2__)
+    int i = 0;
+    __m256i vmax = _mm256_setzero_si256();
+    const int limit = nitems & ~15;
+    for (; i < limit; i += 16) {
+        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
+        vmax = _mm256_max_epu16(vmax, v);
+    }
+    uint16_t tmp[16];
+    _mm256_storeu_si256((__m256i*)tmp, vmax);
+    uint16_t acc = tmp[0];
+    for (int j = 1; j < 16; j++) {
+        if (tmp[j] > acc) acc = tmp[j];
+    }
+    for (; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+    int i = 0;
+    uint16x8_t vmax = vdupq_n_u16(0);
+    const int limit = nitems & ~7;
+    for (; i < limit; i += 8) {
+        uint16x8_t v = vld1q_u16(data + i);
+        vmax = vmaxq_u16(vmax, v);
+    }
+#if defined(__aarch64__)
+    uint16_t acc = vmaxvq_u16(vmax);
+#else
+    uint16x4_t max4 = vmax_u16(vget_low_u16(vmax), vget_high_u16(vmax));
+    max4 = vpmax_u16(max4, max4);
+    max4 = vpmax_u16(max4, max4);
+    uint16_t acc = vget_lane_u16(max4, 0);
+#endif
+    for (; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+#else
+    uint16_t acc = data[0];
+    for (int i = 1; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+#endif
+}
+
+static uint32_t reduce_min_uint32(const uint32_t* data, int nitems) {
+    if (nitems <= 0) return UINT32_MAX;
+#if defined(__AVX2__)
+    int i = 0;
+    __m256i vmin = _mm256_set1_epi32((int)UINT32_MAX);
+    const int limit = nitems & ~7;
+    for (; i < limit; i += 8) {
+        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
+        vmin = _mm256_min_epu32(vmin, v);
+    }
+    uint32_t tmp[8];
+    _mm256_storeu_si256((__m256i*)tmp, vmin);
+    uint32_t acc = tmp[0];
+    for (int j = 1; j < 8; j++) {
+        if (tmp[j] < acc) acc = tmp[j];
+    }
+    for (; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
+    }
+    return acc;
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+    int i = 0;
+    uint32x4_t vmin = vdupq_n_u32(UINT32_MAX);
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        uint32x4_t v = vld1q_u32(data + i);
+        vmin = vminq_u32(vmin, v);
+    }
+#if defined(__aarch64__)
+    uint32_t acc = vminvq_u32(vmin);
+#else
+    uint32x2_t min2 = vmin_u32(vget_low_u32(vmin), vget_high_u32(vmin));
+    min2 = vpmin_u32(min2, min2);
+    uint32_t acc = vget_lane_u32(min2, 0);
+#endif
+    for (; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
+    }
+    return acc;
+#else
+    uint32_t acc = data[0];
+    for (int i = 1; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
+    }
+    return acc;
+#endif
+}
+
+static uint32_t reduce_max_uint32(const uint32_t* data, int nitems) {
+    if (nitems <= 0) return 0;
+#if defined(__AVX2__)
+    int i = 0;
+    __m256i vmax = _mm256_setzero_si256();
+    const int limit = nitems & ~7;
+    for (; i < limit; i += 8) {
+        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
+        vmax = _mm256_max_epu32(vmax, v);
+    }
+    uint32_t tmp[8];
+    _mm256_storeu_si256((__m256i*)tmp, vmax);
+    uint32_t acc = tmp[0];
+    for (int j = 1; j < 8; j++) {
+        if (tmp[j] > acc) acc = tmp[j];
+    }
+    for (; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+    int i = 0;
+    uint32x4_t vmax = vdupq_n_u32(0);
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        uint32x4_t v = vld1q_u32(data + i);
+        vmax = vmaxq_u32(vmax, v);
+    }
+#if defined(__aarch64__)
+    uint32_t acc = vmaxvq_u32(vmax);
+#else
+    uint32x2_t max2 = vmax_u32(vget_low_u32(vmax), vget_high_u32(vmax));
+    max2 = vpmax_u32(max2, max2);
+    uint32_t acc = vget_lane_u32(max2, 0);
+#endif
+    for (; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+#else
+    uint32_t acc = data[0];
+    for (int i = 1; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+#endif
+}
+
+static uint64_t reduce_min_uint64(const uint64_t* data, int nitems) {
+    if (nitems <= 0) return UINT64_MAX;
+    uint64_t acc = data[0];
+    for (int i = 1; i < nitems; i++) {
+        if (data[i] < acc) acc = data[i];
+    }
+    return acc;
+}
+
+static uint64_t reduce_max_uint64(const uint64_t* data, int nitems) {
+    if (nitems <= 0) return 0;
+    uint64_t acc = data[0];
+    for (int i = 1; i < nitems; i++) {
+        if (data[i] > acc) acc = data[i];
+    }
+    return acc;
+}
+
+static float reduce_prod_float32_nan_safe(const float* data, int nitems) {
+    if (nitems <= 0) return 1.0f;
+#if defined(__AVX__) || defined(__AVX2__)
+    int i = 0;
+    __m256 vprod = _mm256_set1_ps(1.0f);
+    __m256 vnan = _mm256_setzero_ps();
+    const int limit = nitems & ~7;
+    for (; i < limit; i += 8) {
+        __m256 v = _mm256_loadu_ps(data + i);
+        vnan = _mm256_or_ps(vnan, _mm256_cmp_ps(v, v, _CMP_UNORD_Q));
+        vprod = _mm256_mul_ps(vprod, v);
+    }
+    __m128 low = _mm256_castps256_ps128(vprod);
+    __m128 high = _mm256_extractf128_ps(vprod, 1);
+    __m128 prod128 = _mm_mul_ps(low, high);
+    __m128 tmp = _mm_mul_ps(prod128, _mm_movehl_ps(prod128, prod128));
+    tmp = _mm_mul_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1));
+    float acc = _mm_cvtss_f32(tmp);
+    if (_mm256_movemask_ps(vnan)) return NAN;
+    for (; i < nitems; i++) {
+        float v = data[i];
+        acc *= v;
+        if (v != v) return v;
+    }
+    return acc;
+#elif defined(__SSE__)
+    int i = 0;
+    __m128 vprod = _mm_set1_ps(1.0f);
+    __m128 vnan = _mm_setzero_ps();
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        __m128 v = _mm_loadu_ps(data + i);
+        vnan = _mm_or_ps(vnan, _mm_cmpunord_ps(v, v));
+        vprod = _mm_mul_ps(vprod, v);
+    }
+    __m128 tmp = _mm_mul_ps(vprod, _mm_movehl_ps(vprod, vprod));
+    tmp = _mm_mul_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1));
+    float acc = _mm_cvtss_f32(tmp);
+    if (_mm_movemask_ps(vnan)) return NAN;
+    for (; i < nitems; i++) {
+        float v = data[i];
+        acc *= v;
+        if (v != v) return v;
+    }
+    return acc;
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+    int i = 0;
+    float32x4_t vprod = vdupq_n_f32(1.0f);
+    uint32x4_t vnan = vdupq_n_u32(0);
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        float32x4_t v = vld1q_f32(data + i);
+        uint32x4_t eq = vceqq_f32(v, v);
+        vnan = vorrq_u32(vnan, vmvnq_u32(eq));
+        vprod = vmulq_f32(vprod, v);
+    }
+    float acc =
+        vgetq_lane_f32(vprod, 0) *
+        vgetq_lane_f32(vprod, 1) *
+        vgetq_lane_f32(vprod, 2) *
+        vgetq_lane_f32(vprod, 3);
+    uint32x2_t nan2 = vorr_u32(vget_low_u32(vnan), vget_high_u32(vnan));
+    nan2 = vpadd_u32(nan2, nan2);
+    if (vget_lane_u32(nan2, 0)) return NAN;
+    for (; i < nitems; i++) {
+        float v = data[i];
+        acc *= v;
+        if (v != v) return v;
+    }
+    return acc;
+#else
+    float acc = 1.0f;
+    for (int i = 0; i < nitems; i++) {
+        float v = data[i];
+        acc *= v;
+        if (v != v) return v;
+    }
+    return acc;
+#endif
+}
+
+static double reduce_prod_float64_nan_safe(const double* data, int nitems) {
+    if (nitems <= 0) return 1.0;
+#if defined(__AVX__) || defined(__AVX2__)
+    int i = 0;
+    __m256d vprod = _mm256_set1_pd(1.0);
+    __m256d vnan = _mm256_setzero_pd();
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        __m256d v = _mm256_loadu_pd(data + i);
+        vnan = _mm256_or_pd(vnan, _mm256_cmp_pd(v, v, _CMP_UNORD_Q));
+        vprod = _mm256_mul_pd(vprod, v);
+    }
+    __m128d low = _mm256_castpd256_pd128(vprod);
+    __m128d high = _mm256_extractf128_pd(vprod, 1);
+    __m128d prod128 = _mm_mul_pd(low, high);
+    prod128 = _mm_mul_sd(prod128, _mm_unpackhi_pd(prod128, prod128));
+    double acc = _mm_cvtsd_f64(prod128);
+    if (_mm256_movemask_pd(vnan)) return NAN;
+    for (; i < nitems; i++) {
+        double v = data[i];
+        acc *= v;
+        if (v != v) return v;
+    }
+    return acc;
+#elif defined(__SSE2__)
+    int i = 0;
+    __m128d vprod = _mm_set1_pd(1.0);
+    __m128d vnan = _mm_setzero_pd();
+    const int limit = nitems & ~1;
+    for (; i < limit; i += 2) {
+        __m128d v = _mm_loadu_pd(data + i);
+        vnan = _mm_or_pd(vnan, _mm_cmpunord_pd(v, v));
+        vprod = _mm_mul_pd(vprod, v);
+    }
+    vprod = _mm_mul_sd(vprod, _mm_unpackhi_pd(vprod, vprod));
+    double acc = _mm_cvtsd_f64(vprod);
+    if (_mm_movemask_pd(vnan)) return NAN;
+    for (; i < nitems; i++) {
+        double v = data[i];
+        acc *= v;
+        if (v != v) return v;
+    }
+    return acc;
+#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__aarch64__)
+    int i = 0;
+    float64x2_t vprod = vdupq_n_f64(1.0);
+    uint64x2_t vnan = vdupq_n_u64(0);
+    const int limit = nitems & ~1;
+    for (; i < limit; i += 2) {
+        float64x2_t v = vld1q_f64(data + i);
+        uint64x2_t eq = vceqq_f64(v, v);
+        vnan = vorrq_u64(vnan, veorq_u64(eq, vdupq_n_u64(~0ULL)));
+        vprod = vmulq_f64(vprod, v);
+    }
+    double acc = vgetq_lane_f64(vprod, 0) * vgetq_lane_f64(vprod, 1);
+    uint64x2_t nan_or = vorrq_u64(vnan, vextq_u64(vnan, vnan, 1));
+    if (vgetq_lane_u64(nan_or, 0)) return NAN;
+    for (; i < nitems; i++) {
+        double v = data[i];
+        acc *= v;
+        if (v != v) return v;
+    }
+    return acc;
+#else
+    double acc = 1.0;
+    for (int i = 0; i < nitems; i++) {
+        double v = data[i];
+        acc *= v;
+        if (v != v) return v;
+    }
+    return acc;
+#endif
+}
+
+static float reduce_sum_float32_nan_safe(const float* data, int nitems) {
+    if (nitems <= 0) return 0.0f;
+#if defined(__AVX__) || defined(__AVX2__)
+    int i = 0;
+    __m256 vsum = _mm256_setzero_ps();
+    __m256 vnan = _mm256_setzero_ps();
+    const int limit = nitems & ~7;
+    for (; i < limit; i += 8) {
+        __m256 v = _mm256_loadu_ps(data + i);
+        vnan = _mm256_or_ps(vnan, _mm256_cmp_ps(v, v, _CMP_UNORD_Q));
+        vsum = _mm256_add_ps(vsum, v);
+    }
+    __m128 low = _mm256_castps256_ps128(vsum);
+    __m128 high = _mm256_extractf128_ps(vsum, 1);
+    __m128 sum128 = _mm_add_ps(low, high);
+    __m128 tmp = _mm_add_ps(sum128, _mm_movehl_ps(sum128, sum128));
+    tmp = _mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1));
+    float acc = _mm_cvtss_f32(tmp);
+    if (_mm256_movemask_ps(vnan)) return NAN;
+    for (; i < nitems; i++) {
+        float v = data[i];
+        acc += v;
+        if (v != v) return v;
+    }
+    return acc;
+#elif defined(__SSE__)
+    int i = 0;
+    __m128 vsum = _mm_setzero_ps();
+    __m128 vnan = _mm_setzero_ps();
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        __m128 v = _mm_loadu_ps(data + i);
+        vnan = _mm_or_ps(vnan, _mm_cmpunord_ps(v, v));
+        vsum = _mm_add_ps(vsum, v);
+    }
+    __m128 tmp = _mm_add_ps(vsum, _mm_movehl_ps(vsum, vsum));
+    tmp = _mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1));
+    float acc = _mm_cvtss_f32(tmp);
+    if (_mm_movemask_ps(vnan)) return NAN;
+    for (; i < nitems; i++) {
+        float v = data[i];
+        acc += v;
+        if (v != v) return v;
+    }
+    return acc;
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+    int i = 0;
+    float32x4_t vsum = vdupq_n_f32(0.0f);
+    uint32x4_t vnan = vdupq_n_u32(0);
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        float32x4_t v = vld1q_f32(data + i);
+        uint32x4_t eq = vceqq_f32(v, v);
+        vnan = vorrq_u32(vnan, vmvnq_u32(eq));
+        vsum = vaddq_f32(vsum, v);
+    }
+#if defined(__aarch64__)
+    float acc = vaddvq_f32(vsum);
+#else
+    float32x2_t sum2 = vadd_f32(vget_low_f32(vsum), vget_high_f32(vsum));
+    sum2 = vpadd_f32(sum2, sum2);
+    float acc = vget_lane_f32(sum2, 0);
+#endif
+    uint32x2_t nan2 = vorr_u32(vget_low_u32(vnan), vget_high_u32(vnan));
+    nan2 = vpadd_u32(nan2, nan2);
+    if (vget_lane_u32(nan2, 0)) return NAN;
+    for (; i < nitems; i++) {
+        float v = data[i];
+        acc += v;
+        if (v != v) return v;
+    }
+    return acc;
+#else
+    float acc = 0.0f;
+    for (int i = 0; i < nitems; i++) {
+        float v = data[i];
+        acc += v;
+        if (v != v) return v;
+    }
+    return acc;
+#endif
+}
+
+static double reduce_sum_float64_nan_safe(const double* data, int nitems) {
+    if (nitems <= 0) return 0.0;
+#if defined(__AVX__) || defined(__AVX2__)
+    int i = 0;
+    __m256d vsum = _mm256_setzero_pd();
+    __m256d vnan = _mm256_setzero_pd();
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        __m256d v = _mm256_loadu_pd(data + i);
+        vnan = _mm256_or_pd(vnan, _mm256_cmp_pd(v, v, _CMP_UNORD_Q));
+        vsum = _mm256_add_pd(vsum, v);
+    }
+    __m128d low = _mm256_castpd256_pd128(vsum);
+    __m128d high = _mm256_extractf128_pd(vsum, 1);
+    __m128d sum128 = _mm_add_pd(low, high);
+    sum128 = _mm_add_sd(sum128, _mm_unpackhi_pd(sum128, sum128));
+    double acc = _mm_cvtsd_f64(sum128);
+    if (_mm256_movemask_pd(vnan)) return NAN;
+    for (; i < nitems; i++) {
+        double v = data[i];
+        acc += v;
+        if (v != v) return v;
+    }
+    return acc;
+#elif defined(__SSE2__)
+    int i = 0;
+    __m128d vsum = _mm_setzero_pd();
+    __m128d vnan = _mm_setzero_pd();
+    const int limit = nitems & ~1;
+    for (; i < limit; i += 2) {
+        __m128d v = _mm_loadu_pd(data + i);
+        vnan = _mm_or_pd(vnan, _mm_cmpunord_pd(v, v));
+        vsum = _mm_add_pd(vsum, v);
+    }
+    vsum = _mm_add_sd(vsum, _mm_unpackhi_pd(vsum, vsum));
+    double acc = _mm_cvtsd_f64(vsum);
+    if (_mm_movemask_pd(vnan)) return NAN;
+    for (; i < nitems; i++) {
+        double v = data[i];
+        acc += v;
+        if (v != v) return v;
+    }
+    return acc;
+#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__aarch64__)
+    int i = 0;
+    float64x2_t vsum = vdupq_n_f64(0.0);
+    uint64x2_t vnan = vdupq_n_u64(0);
+    const int limit = nitems & ~1;
+    for (; i < limit; i += 2) {
+        float64x2_t v = vld1q_f64(data + i);
+        uint64x2_t eq = vceqq_f64(v, v);
+        vnan = vorrq_u64(vnan, veorq_u64(eq, vdupq_n_u64(~0ULL)));
+        vsum = vaddq_f64(vsum, v);
+    }
+    double acc = vaddvq_f64(vsum);
+    uint64x2_t nan_or = vorrq_u64(vnan, vextq_u64(vnan, vnan, 1));
+    if (vgetq_lane_u64(nan_or, 0)) return NAN;
+    for (; i < nitems; i++) {
+        double v = data[i];
+        acc += v;
+        if (v != v) return v;
+    }
+    return acc;
+#else
+    double acc = 0.0;
+    for (int i = 0; i < nitems; i++) {
+        double v = data[i];
+        acc += v;
+        if (v != v) return v;
+    }
+    return acc;
+#endif
+}
+
+static double comma(double a, double b) {
+    (void)a;
+    return b;
+}
+
+/* Bitwise operators (for integer types) */
+static double bit_and(double a, double b) { return (double)((int64_t)a & (int64_t)b); }
+static double bit_or(double a, double b) { return (double)((int64_t)a | (int64_t)b); }
+static double bit_xor(double a, double b) { return (double)((int64_t)a ^ (int64_t)b); }
+static double bit_not(double a) { return (double)(~(int64_t)a); }
+static double bit_shl(double a, double b) { return (double)((int64_t)a << (int64_t)b); }
+static double bit_shr(double a, double b) { return (double)((int64_t)a >> (int64_t)b); }
+
+/* Comparison operators (return 1.0 for true, 0.0 for false) */
+static double cmp_eq(double a, double b) { return a == b ? 1.0 : 0.0; }
+static double cmp_ne(double a, double b) { return a != b ? 1.0 : 0.0; }
+static double cmp_lt(double a, double b) { return a < b ? 1.0 : 0.0; }
+static double cmp_le(double a, double b) { return a <= b ? 1.0 : 0.0; }
+static double cmp_gt(double a, double b) { return a > b ? 1.0 : 0.0; }
+static double cmp_ge(double a, double b) { return a >= b ? 1.0 : 0.0; }
+
+/* Logical operators (for bool type) - short-circuit via OR/AND */
+static double logical_and(double a, double b) { return ((int)a) && ((int)b) ? 1.0 : 0.0; }
+static double logical_or(double a, double b) { return ((int)a) || ((int)b) ? 1.0 : 0.0; }
+static double logical_not(double a) { return !(int)a ? 1.0 : 0.0; }
+static double logical_xor(double a, double b) { return ((int)a) != ((int)b) ? 1.0 : 0.0; }
+
+static bool is_identifier_start(char c) {
+    return isalpha((unsigned char)c) || c == '_';
+}
+
+static bool is_identifier_char(char c) {
+    return isalnum((unsigned char)c) || c == '_';
+}
+
+static void skip_whitespace(state* s) {
+    while (*s->next && isspace((unsigned char)*s->next)) {
+        s->next++;
+    }
+}
+
+static void read_number_token(state* s) {
+    const char* start = s->next;
+    s->value = strtod(s->next, (char**)&s->next);
+    s->type = TOK_NUMBER;
+
+    // Determine if it is a floating point or integer constant
+    bool is_float = false;
+    for (const char* p = start; p < s->next; p++) {
+        if (*p == '.' || *p == 'e' || *p == 'E') {
+            is_float = true;
+            break;
+        }
+    }
+
+    if (is_float) {
+        // Match NumPy conventions: float constants match target_dtype when it's a float type
+        // This ensures FLOAT32 arrays + float constants -> FLOAT32 (NumPy behavior)
+        if (s->target_dtype == ME_FLOAT32) {
+            s->dtype = ME_FLOAT32;
+        }
+        else {
+            s->dtype = ME_FLOAT64;
+        }
+    }
+    else {
+        // For integers, we use a heuristic
+        if (s->value > INT_MAX || s->value < INT_MIN) {
+            s->dtype = ME_INT64;
+        }
+        else {
+            // Use target_dtype if it's an integer type, otherwise default to INT32
+            if (is_integer_dtype(s->target_dtype)) {
+                s->dtype = s->target_dtype;
+            }
+            else {
+                s->dtype = ME_INT32;
+            }
+        }
+    }
+}
+
+static void read_identifier_token(state* s) {
+    const char* start = s->next;
+    while (is_identifier_char(*s->next)) {
+        s->next++;
+    }
+
+    const me_variable* var = find_lookup(s, start, s->next - start);
+    if (!var) {
+        var = find_builtin(start, s->next - start);
+    }
+
+    if (!var) {
+        s->type = TOK_ERROR;
+        return;
+    }
+
+    switch (TYPE_MASK(var->type)) {
+    case ME_VARIABLE:
+        s->type = TOK_VARIABLE;
+        s->bound = var->address;
+        s->dtype = var->dtype;
+        break;
+
+    case ME_CLOSURE0:
+    case ME_CLOSURE1:
+    case ME_CLOSURE2:
+    case ME_CLOSURE3:
+    case ME_CLOSURE4:
+    case ME_CLOSURE5:
+    case ME_CLOSURE6:
+    case ME_CLOSURE7:
+        s->context = var->context;
+    /* Falls through. */
+    case ME_FUNCTION0:
+    case ME_FUNCTION1:
+    case ME_FUNCTION2:
+    case ME_FUNCTION3:
+    case ME_FUNCTION4:
+    case ME_FUNCTION5:
+    case ME_FUNCTION6:
+    case ME_FUNCTION7:
+        s->type = var->type;
+        s->function = var->address;
+        break;
+    }
+}
+
+typedef struct {
+    const char* literal;
+    int token_type;
+    me_fun2 function;
+} operator_spec;
+
+static bool handle_multi_char_operator(state* s) {
+    static const operator_spec multi_ops[] = {
+        {"**", TOK_POW, pow},
+        {"<<", TOK_SHIFT, bit_shl},
+        {">>", TOK_SHIFT, bit_shr},
+        {"==", TOK_COMPARE, cmp_eq},
+        {"!=", TOK_COMPARE, cmp_ne},
+        {"<=", TOK_COMPARE, cmp_le},
+        {">=", TOK_COMPARE, cmp_ge},
+    };
+
+    for (size_t i = 0; i < sizeof(multi_ops) / sizeof(multi_ops[0]); i++) {
+        const operator_spec* op = &multi_ops[i];
+        size_t len = strlen(op->literal);
+        if (strncmp(s->next, op->literal, len) == 0) {
+            s->type = op->token_type;
+            s->function = op->function;
+            s->next += len;
+            return true;
+        }
+    }
+    return false;
+}
+
+static void handle_single_char_operator(state* s, char c) {
+    s->next++;
+    switch (c) {
+    case '+': s->type = TOK_INFIX;
+        s->function = add;
+        break;
+    case '-': s->type = TOK_INFIX;
+        s->function = sub;
+        break;
+    case '*': s->type = TOK_INFIX;
+        s->function = mul;
+        break;
+    case '/': s->type = TOK_INFIX;
+        s->function = divide;
+        break;
+    case '%': s->type = TOK_INFIX;
+        s->function = fmod;
+        break;
+    case '&': s->type = TOK_BITWISE;
+        s->function = bit_and;
+        break;
+    case '|': s->type = TOK_BITWISE;
+        s->function = bit_or;
+        break;
+    case '^': s->type = TOK_BITWISE;
+        s->function = bit_xor;
+        break;
+    case '~': s->type = TOK_BITWISE;
+        s->function = bit_not;
+        break;
+    case '<': s->type = TOK_COMPARE;
+        s->function = cmp_lt;
+        break;
+    case '>': s->type = TOK_COMPARE;
+        s->function = cmp_gt;
+        break;
+    case '(': s->type = TOK_OPEN;
+        break;
+    case ')': s->type = TOK_CLOSE;
+        break;
+    case ',': s->type = TOK_SEP;
+        break;
+    default: s->type = TOK_ERROR;
+        break;
+    }
+}
+
+static void read_operator_token(state* s) {
+    if (handle_multi_char_operator(s)) {
+        return;
+    }
+
+    if (!*s->next) {
+        s->type = TOK_END;
+        return;
+    }
+
+    handle_single_char_operator(s, *s->next);
+}
+
+void next_token(state* s) {
+    s->type = TOK_NULL;
+
+    do {
+        skip_whitespace(s);
+
+        if (!*s->next) {
             s->type = TOK_END;
             return;
         }
 
         if ((s->next[0] >= '0' && s->next[0] <= '9') || s->next[0] == '.') {
             read_number_token(s);
-        } else if (is_identifier_start(s->next[0])) {
+        }
+        else if (is_identifier_start(s->next[0])) {
             read_identifier_token(s);
-        } else {
+        }
+        else {
             read_operator_token(s);
         }
-    } while (s->type == TOK_NULL);
+    }
+    while (s->type == TOK_NULL);
 }
 
 
-static me_expr *list(state *s);
+static me_expr* list(state* s);
 
-static me_expr *expr(state *s);
+static me_expr* expr(state* s);
 
-static me_expr *power(state *s);
+static me_expr* power(state* s);
 
-static me_expr *shift_expr(state *s);
+static me_expr* shift_expr(state* s);
 
-static me_expr *bitwise_and(state *s);
+static me_expr* bitwise_and(state* s);
 
-static me_expr *bitwise_xor(state *s);
+static me_expr* bitwise_xor(state* s);
 
-static me_expr *bitwise_or(state *s);
+static me_expr* bitwise_or(state* s);
 
-static me_expr *comparison(state *s);
+static me_expr* comparison(state* s);
 
 
-static me_expr *base(state *s) {
+static me_expr* base(state* s) {
     /* <base>      =    <constant> | <variable> | <function-0> {"(" ")"} | <function-1> <power> | <function-X> "(" <expr> {"," <expr>} ")" | "(" <list> ")" */
-    me_expr *ret;
+    me_expr* ret;
     int arity;
 
     switch (TYPE_MASK(s->type)) {
-        case TOK_NUMBER:
-            ret = new_expr(ME_CONSTANT, 0);
-            CHECK_NULL(ret);
-
-            ret->value = s->value;
-            // Use inferred type for constants (floating point vs integer)
-            if (s->target_dtype == ME_AUTO) {
-                ret->dtype = s->dtype;
-            } else {
-                // If target_dtype is integer but constant is float/complex, we must use float/complex
-                if (is_integer_dtype(s->target_dtype)) {
-                    if (is_float_dtype(s->dtype) || is_complex_dtype(s->dtype)) {
-                        ret->dtype = s->dtype;
-                    } else if (is_integer_dtype(s->dtype) && dtype_size(s->dtype) > dtype_size(s->target_dtype)) {
-                        // Use larger integer type if needed
-                        ret->dtype = s->dtype;
-                    } else {
-                        ret->dtype = s->target_dtype;
-                    }
-                } else {
-                    // For float/complex target types, use target_dtype to match NumPy conventions
-                    // Float constants are typed based on target_dtype (FLOAT32 or FLOAT64)
-                    // This ensures FLOAT32 arrays + float constants -> FLOAT32 (NumPy behavior)
+    case TOK_NUMBER:
+        ret = new_expr(ME_CONSTANT, 0);
+        CHECK_NULL(ret);
+
+        ret->value = s->value;
+        // Use inferred type for constants (floating point vs integer)
+        if (s->target_dtype == ME_AUTO) {
+            ret->dtype = s->dtype;
+        }
+        else {
+            // If target_dtype is integer but constant is float/complex, we must use float/complex
+            if (is_integer_dtype(s->target_dtype)) {
+                if (is_float_dtype(s->dtype) || is_complex_dtype(s->dtype)) {
+                    ret->dtype = s->dtype;
+                }
+                else if (is_integer_dtype(s->dtype) && dtype_size(s->dtype) > dtype_size(s->target_dtype)) {
+                    // Use larger integer type if needed
+                    ret->dtype = s->dtype;
+                }
+                else {
                     ret->dtype = s->target_dtype;
                 }
             }
-            next_token(s);
-            break;
+            else {
+                // For float/complex target types, use target_dtype to match NumPy conventions
+                // Float constants are typed based on target_dtype (FLOAT32 or FLOAT64)
+                // This ensures FLOAT32 arrays + float constants -> FLOAT32 (NumPy behavior)
+                ret->dtype = s->target_dtype;
+            }
+        }
+        next_token(s);
+        break;
 
-        case TOK_VARIABLE:
-            ret = new_expr(ME_VARIABLE, 0);
-            CHECK_NULL(ret);
+    case TOK_VARIABLE:
+        ret = new_expr(ME_VARIABLE, 0);
+        CHECK_NULL(ret);
 
-            ret->bound = s->bound;
-            ret->dtype = s->dtype; // Set the variable's type
-            ret->input_dtype = s->dtype;
-            next_token(s);
-            break;
+        ret->bound = s->bound;
+        ret->dtype = s->dtype; // Set the variable's type
+        ret->input_dtype = s->dtype;
+        next_token(s);
+        break;
 
-        case ME_FUNCTION0:
-        case ME_CLOSURE0:
-            ret = new_expr(s->type, 0);
-            CHECK_NULL(ret);
+    case ME_FUNCTION0:
+    case ME_CLOSURE0:
+        ret = new_expr(s->type, 0);
+        CHECK_NULL(ret);
 
-            ret->function = s->function;
-            if (IS_CLOSURE(s->type)) ret->parameters[0] = s->context;
+        ret->function = s->function;
+        if (IS_CLOSURE(s->type)) ret->parameters[0] = s->context;
+        next_token(s);
+        if (s->type == TOK_OPEN) {
             next_token(s);
-            if (s->type == TOK_OPEN) {
+            if (s->type != TOK_CLOSE) {
+                s->type = TOK_ERROR;
+            }
+            else {
                 next_token(s);
-                if (s->type != TOK_CLOSE) {
-                    s->type = TOK_ERROR;
-                } else {
-                    next_token(s);
-                }
             }
-            break;
-
-        case ME_FUNCTION1:
-        case ME_CLOSURE1:
-            ret = new_expr(s->type, 0);
-            CHECK_NULL(ret);
+        }
+        break;
 
-            ret->function = s->function;
-            if (IS_CLOSURE(s->type)) ret->parameters[1] = s->context;
-            next_token(s);
-            ret->parameters[0] = power(s);
-            CHECK_NULL(ret->parameters[0], me_free(ret));
-            break;
+    case ME_FUNCTION1:
+    case ME_CLOSURE1:
+        ret = new_expr(s->type, 0);
+        CHECK_NULL(ret);
 
-        case ME_FUNCTION2:
-        case ME_FUNCTION3:
-        case ME_FUNCTION4:
-        case ME_FUNCTION5:
-        case ME_FUNCTION6:
-        case ME_FUNCTION7:
-        case ME_CLOSURE2:
-        case ME_CLOSURE3:
-        case ME_CLOSURE4:
-        case ME_CLOSURE5:
-        case ME_CLOSURE6:
-        case ME_CLOSURE7:
-            arity = ARITY(s->type);
-
-            ret = new_expr(s->type, 0);
-            CHECK_NULL(ret);
-
-            ret->function = s->function;
-            if (IS_CLOSURE(s->type)) ret->parameters[arity] = s->context;
-            next_token(s);
+        ret->function = s->function;
+        if (IS_CLOSURE(s->type)) ret->parameters[1] = s->context;
+        next_token(s);
+        ret->parameters[0] = power(s);
+        CHECK_NULL(ret->parameters[0], me_free(ret));
+        break;
+
+    case ME_FUNCTION2:
+    case ME_FUNCTION3:
+    case ME_FUNCTION4:
+    case ME_FUNCTION5:
+    case ME_FUNCTION6:
+    case ME_FUNCTION7:
+    case ME_CLOSURE2:
+    case ME_CLOSURE3:
+    case ME_CLOSURE4:
+    case ME_CLOSURE5:
+    case ME_CLOSURE6:
+    case ME_CLOSURE7:
+        arity = ARITY(s->type);
+
+        ret = new_expr(s->type, 0);
+        CHECK_NULL(ret);
+
+        ret->function = s->function;
+        if (IS_CLOSURE(s->type)) ret->parameters[arity] = s->context;
+        next_token(s);
 
-            if (s->type != TOK_OPEN) {
-                s->type = TOK_ERROR;
-            } else {
-                int i;
-                for (i = 0; i < arity; i++) {
-                    next_token(s);
-                    ret->parameters[i] = expr(s);
-                    CHECK_NULL(ret->parameters[i], me_free(ret));
+        if (s->type != TOK_OPEN) {
+            s->type = TOK_ERROR;
+        }
+        else {
+            int i;
+            for (i = 0; i < arity; i++) {
+                next_token(s);
+                ret->parameters[i] = expr(s);
+                CHECK_NULL(ret->parameters[i], me_free(ret));
 
-                    if (s->type != TOK_SEP) {
-                        break;
-                    }
-                }
-                if (s->type != TOK_CLOSE || i != arity - 1) {
-                    s->type = TOK_ERROR;
-                } else {
-                    next_token(s);
+                if (s->type != TOK_SEP) {
+                    break;
                 }
             }
-
-            break;
-
-        case TOK_OPEN:
-            next_token(s);
-            ret = list(s);
-            CHECK_NULL(ret);
-
-            if (s->type != TOK_CLOSE) {
+            if (s->type != TOK_CLOSE || i != arity - 1) {
                 s->type = TOK_ERROR;
-            } else {
+            }
+            else {
                 next_token(s);
             }
-            break;
+        }
 
-        default:
-            ret = new_expr(0, 0);
-            CHECK_NULL(ret);
+        break;
 
+    case TOK_OPEN:
+        next_token(s);
+        ret = list(s);
+        CHECK_NULL(ret);
+
+        if (s->type != TOK_CLOSE) {
             s->type = TOK_ERROR;
-            ret->value = NAN;
-            break;
+        }
+        else {
+            next_token(s);
+        }
+        break;
+
+    default:
+        ret = new_expr(0, 0);
+        CHECK_NULL(ret);
+
+        s->type = TOK_ERROR;
+        ret->value = NAN;
+        break;
     }
 
     return ret;
 }
 
 
-static me_expr *power(state *s) {
+static me_expr* power(state* s) {
     /* <power>     =    {("-" | "+")} <base> */
     int sign = 1;
     while (s->type == TOK_INFIX && (s->function == add || s->function == sub)) {
@@ -1352,12 +2835,13 @@ static me_expr *power(state *s) {
         next_token(s);
     }
 
-    me_expr *ret;
+    me_expr* ret;
 
     if (sign == 1) {
         ret = base(s);
-    } else {
-        me_expr *b = base(s);
+    }
+    else {
+        me_expr* b = base(s);
         CHECK_NULL(b);
 
         ret = NEW_EXPR(ME_FUNCTION1 | ME_FLAG_PURE, b);
@@ -1370,18 +2854,18 @@ static me_expr *power(state *s) {
 }
 
 #ifdef ME_POW_FROM_RIGHT
-static me_expr *factor(state *s) {
+static me_expr* factor(state* s) {
     /* <factor>    =    <power> {"**" <factor>}  (right associative) */
-    me_expr *ret = power(s);
+    me_expr* ret = power(s);
     CHECK_NULL(ret);
 
     if (s->type == TOK_POW) {
         me_fun2 t = s->function;
         next_token(s);
-        me_expr *f = factor(s); /* Right associative: recurse */
+        me_expr* f = factor(s); /* Right associative: recurse */
         CHECK_NULL(f, me_free(ret));
 
-        me_expr *prev = ret;
+        me_expr* prev = ret;
         ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, f);
         CHECK_NULL(ret, me_free(f), me_free(prev));
 
@@ -1392,22 +2876,22 @@ static me_expr *factor(state *s) {
     return ret;
 }
 #else
-static me_expr *factor(state *s) {
+static me_expr* factor(state* s) {
     /* <factor>    =    <power> {"**" <power>}  (left associative) */
-    me_expr *ret = power(s);
+    me_expr* ret = power(s);
     CHECK_NULL(ret);
 
     while (s->type == TOK_POW) {
         me_fun2 t = (me_fun2)s->function;
         next_token(s);
-        me_expr *f = power(s);
+        me_expr* f = power(s);
         CHECK_NULL(f, me_free(ret));
 
-        me_expr *prev = ret;
+        me_expr* prev = ret;
         ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, f);
         CHECK_NULL(ret, me_free(f), me_free(prev));
 
-        ret->function = (void *)t;
+        ret->function = (void*)t;
         apply_type_promotion(ret);
     }
 
@@ -1416,22 +2900,22 @@ static me_expr *factor(state *s) {
 #endif
 
 
-static me_expr *term(state *s) {
+static me_expr* term(state* s) {
     /* <term>      =    <factor> {("*" | "/" | "%") <factor>} */
-    me_expr *ret = factor(s);
+    me_expr* ret = factor(s);
     CHECK_NULL(ret);
 
     while (s->type == TOK_INFIX && (s->function == mul || s->function == divide || s->function == fmod)) {
         me_fun2 t = (me_fun2)s->function;
         next_token(s);
-        me_expr *f = factor(s);
+        me_expr* f = factor(s);
         CHECK_NULL(f, me_free(ret));
 
-        me_expr *prev = ret;
+        me_expr* prev = ret;
         ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, f);
         CHECK_NULL(ret, me_free(f), me_free(prev));
 
-        ret->function = (void *)t;
+        ret->function = (void*)t;
         apply_type_promotion(ret);
     }
 
@@ -1439,22 +2923,22 @@ static me_expr *term(state *s) {
 }
 
 
-static me_expr *expr(state *s) {
+static me_expr* expr(state* s) {
     /* <expr>      =    <term> {("+" | "-") <term>} */
-    me_expr *ret = term(s);
+    me_expr* ret = term(s);
     CHECK_NULL(ret);
 
     while (s->type == TOK_INFIX && (s->function == add || s->function == sub)) {
         me_fun2 t = (me_fun2)s->function;
         next_token(s);
-        me_expr *te = term(s);
+        me_expr* te = term(s);
         CHECK_NULL(te, me_free(ret));
 
-        me_expr *prev = ret;
+        me_expr* prev = ret;
         ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, te);
         CHECK_NULL(ret, me_free(te), me_free(prev));
 
-        ret->function = (void *)t;
+        ret->function = (void*)t;
         apply_type_promotion(ret); // Apply type promotion
     }
 
@@ -1462,22 +2946,22 @@ static me_expr *expr(state *s) {
 }
 
 
-static me_expr *shift_expr(state *s) {
+static me_expr* shift_expr(state* s) {
     /* <shift_expr> =    <expr> {("<<" | ">>") <expr>} */
-    me_expr *ret = expr(s);
+    me_expr* ret = expr(s);
     CHECK_NULL(ret);
 
     while (s->type == TOK_SHIFT) {
         me_fun2 t = (me_fun2)s->function;
         next_token(s);
-        me_expr *e = expr(s);
+        me_expr* e = expr(s);
         CHECK_NULL(e, me_free(ret));
 
-        me_expr *prev = ret;
+        me_expr* prev = ret;
         ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
         CHECK_NULL(ret, me_free(e), me_free(prev));
 
-        ret->function = (void *)t;
+        ret->function = (void*)t;
         apply_type_promotion(ret);
     }
 
@@ -1485,17 +2969,17 @@ static me_expr *shift_expr(state *s) {
 }
 
 
-static me_expr *bitwise_and(state *s) {
+static me_expr* bitwise_and(state* s) {
     /* <bitwise_and> =    <shift_expr> {"&" <shift_expr>} */
-    me_expr *ret = shift_expr(s);
+    me_expr* ret = shift_expr(s);
     CHECK_NULL(ret);
 
     while (s->type == TOK_BITWISE && s->function == bit_and) {
         next_token(s);
-        me_expr *e = shift_expr(s);
+        me_expr* e = shift_expr(s);
         CHECK_NULL(e, me_free(ret));
 
-        me_expr *prev = ret;
+        me_expr* prev = ret;
         ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
         CHECK_NULL(ret, me_free(e), me_free(prev));
 
@@ -1507,18 +2991,18 @@ static me_expr *bitwise_and(state *s) {
 }
 
 
-static me_expr *bitwise_xor(state *s) {
+static me_expr* bitwise_xor(state* s) {
     /* <bitwise_xor> =    <bitwise_and> {"^" <bitwise_and>} */
     /* Note: ^ is XOR for integers/bools. Use ** for power */
-    me_expr *ret = bitwise_and(s);
+    me_expr* ret = bitwise_and(s);
     CHECK_NULL(ret);
 
     while (s->type == TOK_BITWISE && s->function == bit_xor) {
         next_token(s);
-        me_expr *e = bitwise_and(s);
+        me_expr* e = bitwise_and(s);
         CHECK_NULL(e, me_free(ret));
 
-        me_expr *prev = ret;
+        me_expr* prev = ret;
         ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
         CHECK_NULL(ret, me_free(e), me_free(prev));
 
@@ -1530,22 +3014,22 @@ static me_expr *bitwise_xor(state *s) {
 }
 
 
-static me_expr *bitwise_or(state *s) {
+static me_expr* bitwise_or(state* s) {
     /* <bitwise_or> =    <bitwise_xor> {"|" <bitwise_xor>} */
-    me_expr *ret = bitwise_xor(s);
+    me_expr* ret = bitwise_xor(s);
     CHECK_NULL(ret);
 
     while (s->type == TOK_BITWISE && (s->function == bit_or)) {
         me_fun2 t = (me_fun2)s->function;
         next_token(s);
-        me_expr *e = bitwise_xor(s);
+        me_expr* e = bitwise_xor(s);
         CHECK_NULL(e, me_free(ret));
 
-        me_expr *prev = ret;
+        me_expr* prev = ret;
         ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
         CHECK_NULL(ret, me_free(e), me_free(prev));
 
-        ret->function = (void *)t;
+        ret->function = (void*)t;
         apply_type_promotion(ret);
     }
 
@@ -1553,22 +3037,22 @@ static me_expr *bitwise_or(state *s) {
 }
 
 
-static me_expr *comparison(state *s) {
+static me_expr* comparison(state* s) {
     /* <comparison> =    <bitwise_or> {("<" | ">" | "<=" | ">=" | "==" | "!=") <bitwise_or>} */
-    me_expr *ret = bitwise_or(s);
+    me_expr* ret = bitwise_or(s);
     CHECK_NULL(ret);
 
     while (s->type == TOK_COMPARE) {
         me_fun2 t = (me_fun2)s->function;
         next_token(s);
-        me_expr *e = bitwise_or(s);
+        me_expr* e = bitwise_or(s);
         CHECK_NULL(e, me_free(ret));
 
-        me_expr *prev = ret;
+        me_expr* prev = ret;
         ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
         CHECK_NULL(ret, me_free(e), me_free(prev));
 
-        ret->function = (void *)t;
+        ret->function = (void*)t;
         apply_type_promotion(ret);
         /* Comparisons always return bool */
         ret->dtype = ME_BOOL;
@@ -1578,17 +3062,17 @@ static me_expr *comparison(state *s) {
 }
 
 
-static me_expr *list(state *s) {
+static me_expr* list(state* s) {
     /* <list>      =    <comparison> {"," <comparison>} */
-    me_expr *ret = comparison(s);
+    me_expr* ret = comparison(s);
     CHECK_NULL(ret);
 
     while (s->type == TOK_SEP) {
         next_token(s);
-        me_expr *e = comparison(s);
+        me_expr* e = comparison(s);
         CHECK_NULL(e, me_free(ret));
 
-        me_expr *prev = ret;
+        me_expr* prev = ret;
         ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
         CHECK_NULL(ret, me_free(e), me_free(prev));
 
@@ -1603,59 +3087,59 @@ static me_expr *list(state *s) {
 #define ME_FUN(...) ((double(*)(__VA_ARGS__))n->function)
 #define M(e) me_eval_scalar(n->parameters[e])
 
-static double me_eval_scalar(const me_expr *n) {
+static double me_eval_scalar(const me_expr* n) {
     if (!n) return NAN;
 
     switch (TYPE_MASK(n->type)) {
-        case ME_CONSTANT: return n->value;
-        case ME_VARIABLE: return *(const double *) n->bound;
-
-        case ME_FUNCTION0:
-        case ME_FUNCTION1:
-        case ME_FUNCTION2:
-        case ME_FUNCTION3:
-        case ME_FUNCTION4:
-        case ME_FUNCTION5:
-        case ME_FUNCTION6:
-        case ME_FUNCTION7:
-            switch (ARITY(n->type)) {
-                case 0: return ME_FUN(void)();
-                case 1: return ME_FUN(double)(M(0));
-                case 2: return ME_FUN(double, double)(M(0), M(1));
-                case 3: return ME_FUN(double, double, double)(M(0), M(1), M(2));
-                case 4: return ME_FUN(double, double, double, double)(M(0), M(1), M(2), M(3));
-                case 5: return ME_FUN(double, double, double, double, double)(M(0), M(1), M(2), M(3), M(4));
-                case 6: return ME_FUN(double, double, double, double, double, double)(
-                        M(0), M(1), M(2), M(3), M(4), M(5));
-                case 7: return ME_FUN(double, double, double, double, double, double, double)(
-                        M(0), M(1), M(2), M(3), M(4), M(5), M(6));
-                default: return NAN;
-            }
-
-        case ME_CLOSURE0:
-        case ME_CLOSURE1:
-        case ME_CLOSURE2:
-        case ME_CLOSURE3:
-        case ME_CLOSURE4:
-        case ME_CLOSURE5:
-        case ME_CLOSURE6:
-        case ME_CLOSURE7:
-            switch (ARITY(n->type)) {
-                case 0: return ME_FUN(void*)(n->parameters[0]);
-                case 1: return ME_FUN(void*, double)(n->parameters[1], M(0));
-                case 2: return ME_FUN(void*, double, double)(n->parameters[2], M(0), M(1));
-                case 3: return ME_FUN(void*, double, double, double)(n->parameters[3], M(0), M(1), M(2));
-                case 4: return ME_FUN(void*, double, double, double, double)(n->parameters[4], M(0), M(1), M(2), M(3));
-                case 5: return ME_FUN(void*, double, double, double, double, double)(
-                        n->parameters[5], M(0), M(1), M(2), M(3), M(4));
-                case 6: return ME_FUN(void*, double, double, double, double, double, double)(
-                        n->parameters[6], M(0), M(1), M(2), M(3), M(4), M(5));
-                case 7: return ME_FUN(void*, double, double, double, double, double, double, double)(
-                        n->parameters[7], M(0), M(1), M(2), M(3), M(4), M(5), M(6));
-                default: return NAN;
-            }
+    case ME_CONSTANT: return n->value;
+    case ME_VARIABLE: return *(const double*)n->bound;
+
+    case ME_FUNCTION0:
+    case ME_FUNCTION1:
+    case ME_FUNCTION2:
+    case ME_FUNCTION3:
+    case ME_FUNCTION4:
+    case ME_FUNCTION5:
+    case ME_FUNCTION6:
+    case ME_FUNCTION7:
+        switch (ARITY(n->type)) {
+        case 0: return ME_FUN(void)();
+        case 1: return ME_FUN(double)(M(0));
+        case 2: return ME_FUN(double, double)(M(0), M(1));
+        case 3: return ME_FUN(double, double, double)(M(0), M(1), M(2));
+        case 4: return ME_FUN(double, double, double, double)(M(0), M(1), M(2), M(3));
+        case 5: return ME_FUN(double, double, double, double, double)(M(0), M(1), M(2), M(3), M(4));
+        case 6: return ME_FUN(double, double, double, double, double, double)(
+                M(0), M(1), M(2), M(3), M(4), M(5));
+        case 7: return ME_FUN(double, double, double, double, double, double, double)(
+                M(0), M(1), M(2), M(3), M(4), M(5), M(6));
+        default: return NAN;
+        }
 
+    case ME_CLOSURE0:
+    case ME_CLOSURE1:
+    case ME_CLOSURE2:
+    case ME_CLOSURE3:
+    case ME_CLOSURE4:
+    case ME_CLOSURE5:
+    case ME_CLOSURE6:
+    case ME_CLOSURE7:
+        switch (ARITY(n->type)) {
+        case 0: return ME_FUN(void*)(n->parameters[0]);
+        case 1: return ME_FUN(void*, double)(n->parameters[1], M(0));
+        case 2: return ME_FUN(void*, double, double)(n->parameters[2], M(0), M(1));
+        case 3: return ME_FUN(void*, double, double, double)(n->parameters[3], M(0), M(1), M(2));
+        case 4: return ME_FUN(void*, double, double, double, double)(n->parameters[4], M(0), M(1), M(2), M(3));
+        case 5: return ME_FUN(void*, double, double, double, double, double)(
+                n->parameters[5], M(0), M(1), M(2), M(3), M(4));
+        case 6: return ME_FUN(void*, double, double, double, double, double, double)(
+                n->parameters[6], M(0), M(1), M(2), M(3), M(4), M(5));
+        case 7: return ME_FUN(void*, double, double, double, double, double, double, double)(
+                n->parameters[7], M(0), M(1), M(2), M(3), M(4), M(5), M(6));
         default: return NAN;
+        }
+
+    default: return NAN;
     }
 }
 
@@ -1663,7 +3147,7 @@ static double me_eval_scalar(const me_expr *n) {
 #undef M
 
 /* Specialized vector operations for better performance */
-static void vec_add(const double *a, const double *b, double *out, int n) {
+static void vec_add(const double* a, const double* b, double* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1671,7 +3155,7 @@ static void vec_add(const double *a, const double *b, double *out, int n) {
     }
 }
 
-static void vec_sub(const double *a, const double *b, double *out, int n) {
+static void vec_sub(const double* a, const double* b, double* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1679,7 +3163,7 @@ static void vec_sub(const double *a, const double *b, double *out, int n) {
     }
 }
 
-static void vec_mul(const double *a, const double *b, double *out, int n) {
+static void vec_mul(const double* a, const double* b, double* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1687,7 +3171,7 @@ static void vec_mul(const double *a, const double *b, double *out, int n) {
     }
 }
 
-static void vec_div(const double *a, const double *b, double *out, int n) {
+static void vec_div(const double* a, const double* b, double* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1695,7 +3179,7 @@ static void vec_div(const double *a, const double *b, double *out, int n) {
     }
 }
 
-static void vec_add_scalar(const double *a, double b, double *out, int n) {
+static void vec_add_scalar(const double* a, double b, double* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1703,7 +3187,7 @@ static void vec_add_scalar(const double *a, double b, double *out, int n) {
     }
 }
 
-static void vec_mul_scalar(const double *a, double b, double *out, int n) {
+static void vec_mul_scalar(const double* a, double b, double* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1711,7 +3195,7 @@ static void vec_mul_scalar(const double *a, double b, double *out, int n) {
     }
 }
 
-static void vec_pow(const double *a, const double *b, double *out, int n) {
+static void vec_pow(const double* a, const double* b, double* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1719,7 +3203,7 @@ static void vec_pow(const double *a, const double *b, double *out, int n) {
     }
 }
 
-static void vec_pow_scalar(const double *a, double b, double *out, int n) {
+static void vec_pow_scalar(const double* a, double b, double* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1727,7 +3211,7 @@ static void vec_pow_scalar(const double *a, double b, double *out, int n) {
     }
 }
 
-static void vec_sqrt(const double *a, double *out, int n) {
+static void vec_sqrt(const double* a, double* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1735,7 +3219,7 @@ static void vec_sqrt(const double *a, double *out, int n) {
     }
 }
 
-static void vec_sin(const double *a, double *out, int n) {
+static void vec_sin(const double* a, double* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1743,7 +3227,7 @@ static void vec_sin(const double *a, double *out, int n) {
     }
 }
 
-static void vec_cos(const double *a, double *out, int n) {
+static void vec_cos(const double* a, double* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1751,7 +3235,7 @@ static void vec_cos(const double *a, double *out, int n) {
     }
 }
 
-static void vec_negate(const double *a, double *out, int n) {
+static void vec_negate(const double* a, double* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1763,7 +3247,7 @@ static void vec_negate(const double *a, double *out, int n) {
  * FLOAT32 VECTOR OPERATIONS
  * ============================================================================ */
 
-static void vec_add_f32(const float *a, const float *b, float *out, int n) {
+static void vec_add_f32(const float* a, const float* b, float* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1771,7 +3255,7 @@ static void vec_add_f32(const float *a, const float *b, float *out, int n) {
     }
 }
 
-static void vec_sub_f32(const float *a, const float *b, float *out, int n) {
+static void vec_sub_f32(const float* a, const float* b, float* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1779,7 +3263,7 @@ static void vec_sub_f32(const float *a, const float *b, float *out, int n) {
     }
 }
 
-static void vec_mul_f32(const float *a, const float *b, float *out, int n) {
+static void vec_mul_f32(const float* a, const float* b, float* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1787,7 +3271,7 @@ static void vec_mul_f32(const float *a, const float *b, float *out, int n) {
     }
 }
 
-static void vec_div_f32(const float *a, const float *b, float *out, int n) {
+static void vec_div_f32(const float* a, const float* b, float* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1795,7 +3279,7 @@ static void vec_div_f32(const float *a, const float *b, float *out, int n) {
     }
 }
 
-static void vec_add_scalar_f32(const float *a, float b, float *out, int n) {
+static void vec_add_scalar_f32(const float* a, float b, float* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1803,7 +3287,7 @@ static void vec_add_scalar_f32(const float *a, float b, float *out, int n) {
     }
 }
 
-static void vec_mul_scalar_f32(const float *a, float b, float *out, int n) {
+static void vec_mul_scalar_f32(const float* a, float b, float* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1811,7 +3295,7 @@ static void vec_mul_scalar_f32(const float *a, float b, float *out, int n) {
     }
 }
 
-static void vec_pow_f32(const float *a, const float *b, float *out, int n) {
+static void vec_pow_f32(const float* a, const float* b, float* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1819,7 +3303,7 @@ static void vec_pow_f32(const float *a, const float *b, float *out, int n) {
     }
 }
 
-static void vec_pow_scalar_f32(const float *a, float b, float *out, int n) {
+static void vec_pow_scalar_f32(const float* a, float b, float* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1827,7 +3311,7 @@ static void vec_pow_scalar_f32(const float *a, float b, float *out, int n) {
     }
 }
 
-static void vec_sqrt_f32(const float *a, float *out, int n) {
+static void vec_sqrt_f32(const float* a, float* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1835,7 +3319,7 @@ static void vec_sqrt_f32(const float *a, float *out, int n) {
     }
 }
 
-static void vec_sin_f32(const float *a, float *out, int n) {
+static void vec_sin_f32(const float* a, float* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1843,7 +3327,7 @@ static void vec_sin_f32(const float *a, float *out, int n) {
     }
 }
 
-static void vec_cos_f32(const float *a, float *out, int n) {
+static void vec_cos_f32(const float* a, float* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1851,7 +3335,7 @@ static void vec_cos_f32(const float *a, float *out, int n) {
     }
 }
 
-static void vec_negame_f32(const float *a, float *out, int n) {
+static void vec_negame_f32(const float* a, float* out, int n) {
     int i;
 #pragma GCC ivdep
     for (i = 0; i < n; i++) {
@@ -1957,25 +3441,25 @@ DEFINE_INT_VEC_OPS(u32, uint32_t)
 DEFINE_INT_VEC_OPS(u64, uint64_t)
 
 /* Boolean logical operations */
-static void vec_and_bool(const bool *a, const bool *b, bool *out, int n) {
+static void vec_and_bool(const bool* a, const bool* b, bool* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = a[i] && b[i];
 }
 
-static void vec_or_bool(const bool *a, const bool *b, bool *out, int n) {
+static void vec_or_bool(const bool* a, const bool* b, bool* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = a[i] || b[i];
 }
 
-static void vec_xor_bool(const bool *a, const bool *b, bool *out, int n) {
+static void vec_xor_bool(const bool* a, const bool* b, bool* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = a[i] != b[i];
 }
 
-static void vec_not_bool(const bool *a, bool *out, int n) {
+static void vec_not_bool(const bool* a, bool* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = !a[i];
@@ -2028,145 +3512,145 @@ DEFINE_COMPARE_OPS(f32, float)
 DEFINE_COMPARE_OPS(f64, double)
 
 /* Complex operations */
-static void vec_add_c64(const float _Complex *a, const float _Complex *b, float _Complex *out, int n) {
+static void vec_add_c64(const float _Complex* a, const float _Complex* b, float _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = add_c64(a[i], b[i]);
 }
 
-static void vec_sub_c64(const float _Complex *a, const float _Complex *b, float _Complex *out, int n) {
+static void vec_sub_c64(const float _Complex* a, const float _Complex* b, float _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = sub_c64(a[i], b[i]);
 }
 
-static void vec_mul_c64(const float _Complex *a, const float _Complex *b, float _Complex *out, int n) {
+static void vec_mul_c64(const float _Complex* a, const float _Complex* b, float _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = mul_c64(a[i], b[i]);
 }
 
-static void vec_div_c64(const float _Complex *a, const float _Complex *b, float _Complex *out, int n) {
+static void vec_div_c64(const float _Complex* a, const float _Complex* b, float _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = div_c64(a[i], b[i]);
 }
 
-static void vec_add_scalar_c64(const float _Complex *a, float _Complex b, float _Complex *out, int n) {
+static void vec_add_scalar_c64(const float _Complex* a, float _Complex b, float _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = add_c64(a[i], b);
 }
 
-static void vec_mul_scalar_c64(const float _Complex *a, float _Complex b, float _Complex *out, int n) {
+static void vec_mul_scalar_c64(const float _Complex* a, float _Complex b, float _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = mul_c64(a[i], b);
 }
 
-static void vec_pow_c64(const float _Complex *a, const float _Complex *b, float _Complex *out, int n) {
+static void vec_pow_c64(const float _Complex* a, const float _Complex* b, float _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = me_cpowf(a[i], b[i]);
 }
 
-static void vec_pow_scalar_c64(const float _Complex *a, float _Complex b, float _Complex *out, int n) {
+static void vec_pow_scalar_c64(const float _Complex* a, float _Complex b, float _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = me_cpowf(a[i], b);
 }
 
-static void vec_sqrt_c64(const float _Complex *a, float _Complex *out, int n) {
+static void vec_sqrt_c64(const float _Complex* a, float _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = me_csqrtf(a[i]);
 }
 
-static void vec_negame_c64(const float _Complex *a, float _Complex *out, int n) {
+static void vec_negame_c64(const float _Complex* a, float _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = neg_c64(a[i]);
 }
 
-static void vec_conj_c64(const float _Complex *a, float _Complex *out, int n) {
+static void vec_conj_c64(const float _Complex* a, float _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = me_conjf(a[i]);
 }
 
-static void vec_imag_c64(const float _Complex *a, float *out, int n) {
+static void vec_imag_c64(const float _Complex* a, float* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = me_cimagf(a[i]);
 }
 
-static void vec_add_c128(const double _Complex *a, const double _Complex *b, double _Complex *out, int n) {
+static void vec_add_c128(const double _Complex* a, const double _Complex* b, double _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = add_c128(a[i], b[i]);
 }
 
-static void vec_sub_c128(const double _Complex *a, const double _Complex *b, double _Complex *out, int n) {
+static void vec_sub_c128(const double _Complex* a, const double _Complex* b, double _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = sub_c128(a[i], b[i]);
 }
 
-static void vec_mul_c128(const double _Complex *a, const double _Complex *b, double _Complex *out, int n) {
+static void vec_mul_c128(const double _Complex* a, const double _Complex* b, double _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = mul_c128(a[i], b[i]);
 }
 
-static void vec_div_c128(const double _Complex *a, const double _Complex *b, double _Complex *out, int n) {
+static void vec_div_c128(const double _Complex* a, const double _Complex* b, double _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = div_c128(a[i], b[i]);
 }
 
-static void vec_add_scalar_c128(const double _Complex *a, double _Complex b, double _Complex *out, int n) {
+static void vec_add_scalar_c128(const double _Complex* a, double _Complex b, double _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = add_c128(a[i], b);
 }
 
-static void vec_mul_scalar_c128(const double _Complex *a, double _Complex b, double _Complex *out, int n) {
+static void vec_mul_scalar_c128(const double _Complex* a, double _Complex b, double _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = mul_c128(a[i], b);
 }
 
-static void vec_pow_c128(const double _Complex *a, const double _Complex *b, double _Complex *out, int n) {
+static void vec_pow_c128(const double _Complex* a, const double _Complex* b, double _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = me_cpow(a[i], b[i]);
 }
 
-static void vec_pow_scalar_c128(const double _Complex *a, double _Complex b, double _Complex *out, int n) {
+static void vec_pow_scalar_c128(const double _Complex* a, double _Complex b, double _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = me_cpow(a[i], b);
 }
 
-static void vec_sqrt_c128(const double _Complex *a, double _Complex *out, int n) {
+static void vec_sqrt_c128(const double _Complex* a, double _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = me_csqrt(a[i]);
 }
 
-static void vec_negame_c128(const double _Complex *a, double _Complex *out, int n) {
+static void vec_negame_c128(const double _Complex* a, double _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = neg_c128(a[i]);
 }
 
-static void vec_conj_c128(const double _Complex *a, double _Complex *out, int n) {
+static void vec_conj_c128(const double _Complex* a, double _Complex* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = me_conj(a[i]);
 }
 
-static void vec_imag_c128(const double _Complex *a, double *out, int n) {
+static void vec_imag_c128(const double _Complex* a, double* out, int n) {
     int i;
     IVDEP
     for (i = 0; i < n; i++) out[i] = me_cimag(a[i]);
@@ -2260,7 +3744,7 @@ DEFINE_VEC_CONVERT(f64, c128, double, double _Complex)
 DEFINE_VEC_CONVERT(c64, c128, float _Complex, double _Complex)
 
 /* Function to get conversion function pointer */
-typedef void (*convert_func_t)(const void *, void *, int);
+typedef void (*convert_func_t)(const void*, void*, int);
 
 static convert_func_t get_convert_func(me_dtype from, me_dtype to) {
     /* Return conversion function for a specific type pair */
@@ -2833,197 +4317,989 @@ DEFINE_ME_EVAL(c128, double _Complex,
 /* Public API - dispatches to correct type-specific evaluator */
 /* Structure to track promoted variables */
 typedef struct {
-    void *promoted_data; // Temporary buffer for promoted data
+    void* promoted_data; // Temporary buffer for promoted data
     me_dtype original_type;
     bool needs_free;
 } promoted_var_t;
 
 /* Helper to save original variable bindings */
-static void save_variable_bindings(const me_expr *node,
-                                   const void **original_bounds,
-                                   me_dtype *original_types,
-                                   int *save_idx) {
+static void save_variable_bindings(const me_expr* node,
+                                   const void** original_bounds,
+                                   me_dtype* original_types,
+                                   int* save_idx) {
     if (!node) return;
     switch (TYPE_MASK(node->type)) {
-        case ME_VARIABLE:
-            original_bounds[*save_idx] = node->bound;
-            original_types[*save_idx] = node->dtype;
-            (*save_idx)++;
-            break;
-        case ME_FUNCTION0:
-        case ME_FUNCTION1:
-        case ME_FUNCTION2:
-        case ME_FUNCTION3:
-        case ME_FUNCTION4:
-        case ME_FUNCTION5:
-        case ME_FUNCTION6:
-        case ME_FUNCTION7:
-        case ME_CLOSURE0:
-        case ME_CLOSURE1:
-        case ME_CLOSURE2:
-        case ME_CLOSURE3:
-        case ME_CLOSURE4:
-        case ME_CLOSURE5:
-        case ME_CLOSURE6:
-        case ME_CLOSURE7: {
+    case ME_VARIABLE:
+        original_bounds[*save_idx] = node->bound;
+        original_types[*save_idx] = node->dtype;
+        (*save_idx)++;
+        break;
+    case ME_FUNCTION0:
+    case ME_FUNCTION1:
+    case ME_FUNCTION2:
+    case ME_FUNCTION3:
+    case ME_FUNCTION4:
+    case ME_FUNCTION5:
+    case ME_FUNCTION6:
+    case ME_FUNCTION7:
+    case ME_CLOSURE0:
+    case ME_CLOSURE1:
+    case ME_CLOSURE2:
+    case ME_CLOSURE3:
+    case ME_CLOSURE4:
+    case ME_CLOSURE5:
+    case ME_CLOSURE6:
+    case ME_CLOSURE7:
+        {
             const int arity = ARITY(node->type);
             for (int i = 0; i < arity; i++) {
-                save_variable_bindings((const me_expr *) node->parameters[i],
+                save_variable_bindings((const me_expr*)node->parameters[i],
                                        original_bounds, original_types, save_idx);
             }
-            break;
-        }
-    }
-}
-
-/* Recursively promote variables in expression tree */
-static void promote_variables_in_tree(me_expr *n, me_dtype target_type,
-                                      promoted_var_t *promotions, int *promo_count,
-                                      int nitems) {
-    if (!n) return;
-
-    switch (TYPE_MASK(n->type)) {
-        case ME_CONSTANT:
-            // Constants are promoted on-the-fly during evaluation
-            break;
-
-        case ME_VARIABLE:
-            if (n->dtype != target_type) {
-                // Need to promote this variable
-                void *promoted = malloc(nitems * dtype_size(target_type));
-                if (promoted) {
-                    convert_func_t conv = get_convert_func(n->dtype, target_type);
-                    if (conv) {
-                        conv(n->bound, promoted, nitems);
-
-                        // Track this promotion for later cleanup
-                        promotions[*promo_count].promoted_data = promoted;
-                        promotions[*promo_count].original_type = n->dtype;
-                        promotions[*promo_count].needs_free = true;
-                        (*promo_count)++;
-
-                        // Temporarily replace bound pointer
-                        n->bound = promoted;
-                        n->dtype = target_type;
-                    } else {
-                        free(promoted);
+            break;
+        }
+    }
+}
+
+/* Recursively promote variables in expression tree */
+static void promote_variables_in_tree(me_expr* n, me_dtype target_type,
+                                      promoted_var_t* promotions, int* promo_count,
+                                      int nitems) {
+    if (!n) return;
+
+    switch (TYPE_MASK(n->type)) {
+    case ME_CONSTANT:
+        // Constants are promoted on-the-fly during evaluation
+        break;
+
+    case ME_VARIABLE:
+        if (n->dtype != target_type) {
+            // Need to promote this variable
+            void* promoted = malloc(nitems * dtype_size(target_type));
+            if (promoted) {
+                convert_func_t conv = get_convert_func(n->dtype, target_type);
+                if (conv) {
+                    conv(n->bound, promoted, nitems);
+
+                    // Track this promotion for later cleanup
+                    promotions[*promo_count].promoted_data = promoted;
+                    promotions[*promo_count].original_type = n->dtype;
+                    promotions[*promo_count].needs_free = true;
+                    (*promo_count)++;
+
+                    // Temporarily replace bound pointer
+                    n->bound = promoted;
+                    n->dtype = target_type;
+                }
+                else {
+                    free(promoted);
+                }
+            }
+        }
+        break;
+
+    case ME_FUNCTION0:
+    case ME_FUNCTION1:
+    case ME_FUNCTION2:
+    case ME_FUNCTION3:
+    case ME_FUNCTION4:
+    case ME_FUNCTION5:
+    case ME_FUNCTION6:
+    case ME_FUNCTION7:
+    case ME_CLOSURE0:
+    case ME_CLOSURE1:
+    case ME_CLOSURE2:
+    case ME_CLOSURE3:
+    case ME_CLOSURE4:
+    case ME_CLOSURE5:
+    case ME_CLOSURE6:
+    case ME_CLOSURE7:
+        {
+            const int arity = ARITY(n->type);
+            for (int i = 0; i < arity; i++) {
+                promote_variables_in_tree((me_expr*)n->parameters[i], target_type,
+                                          promotions, promo_count, nitems);
+            }
+            break;
+        }
+    }
+}
+
+/* Restore original variable bindings after promotion */
+static void restore_variables_in_tree(me_expr* n, const void** original_bounds,
+                                      const me_dtype* original_types, int* restore_idx) {
+    if (!n) return;
+
+    switch (TYPE_MASK(n->type)) {
+    case ME_VARIABLE:
+        if (original_bounds[*restore_idx] != NULL) {
+            n->bound = original_bounds[*restore_idx];
+            n->dtype = original_types[*restore_idx];
+            (*restore_idx)++;
+        }
+        break;
+
+    case ME_FUNCTION0:
+    case ME_FUNCTION1:
+    case ME_FUNCTION2:
+    case ME_FUNCTION3:
+    case ME_FUNCTION4:
+    case ME_FUNCTION5:
+    case ME_FUNCTION6:
+    case ME_FUNCTION7:
+    case ME_CLOSURE0:
+    case ME_CLOSURE1:
+    case ME_CLOSURE2:
+    case ME_CLOSURE3:
+    case ME_CLOSURE4:
+    case ME_CLOSURE5:
+    case ME_CLOSURE6:
+    case ME_CLOSURE7:
+        {
+            const int arity = ARITY(n->type);
+            for (int i = 0; i < arity; i++) {
+                restore_variables_in_tree((me_expr*)n->parameters[i], original_bounds, original_types, restore_idx);
+            }
+            break;
+        }
+    }
+}
+
+/* Check if all variables in tree match target type */
+static bool all_variables_match_type(const me_expr* n, me_dtype target_type) {
+    if (!n) return true;
+
+    switch (TYPE_MASK(n->type)) {
+    case ME_CONSTANT:
+        return true; // Constants are always OK
+
+    case ME_VARIABLE:
+        return n->dtype == target_type;
+
+    case ME_FUNCTION0:
+    case ME_FUNCTION1:
+    case ME_FUNCTION2:
+    case ME_FUNCTION3:
+    case ME_FUNCTION4:
+    case ME_FUNCTION5:
+    case ME_FUNCTION6:
+    case ME_FUNCTION7:
+    case ME_CLOSURE0:
+    case ME_CLOSURE1:
+    case ME_CLOSURE2:
+    case ME_CLOSURE3:
+    case ME_CLOSURE4:
+    case ME_CLOSURE5:
+    case ME_CLOSURE6:
+    case ME_CLOSURE7:
+        {
+            const int arity = ARITY(n->type);
+            for (int i = 0; i < arity; i++) {
+                if (!all_variables_match_type((const me_expr*)n->parameters[i], target_type)) {
+                    return false;
+                }
+            }
+            return true;
+        }
+    }
+
+    return true;
+}
+
+static void eval_reduction(const me_expr* n) {
+    if (!n || !n->output || !is_reduction_node(n)) return;
+
+    me_expr* arg = (me_expr*)n->parameters[0];
+    if (!arg) return;
+
+    const int nitems = n->nitems;
+    me_dtype arg_type = arg->dtype;
+    me_dtype result_type = reduction_output_dtype(arg_type, n->function);
+    me_dtype output_type = n->dtype;
+    bool is_prod = n->function == (void*)prod_reduce;
+    bool is_min = n->function == (void*)min_reduce;
+    bool is_max = n->function == (void*)max_reduce;
+    bool is_any = n->function == (void*)any_reduce;
+    bool is_all = n->function == (void*)all_reduce;
+
+    void* write_ptr = n->output;
+    void* temp_output = NULL;
+    if (output_type != result_type) {
+        temp_output = malloc(dtype_size(result_type));
+        if (!temp_output) return;
+        write_ptr = temp_output;
+    }
+
+    if (arg->type == ME_CONSTANT) {
+        double val = arg->value;
+        if (is_any || is_all) {
+            bool acc = is_all;
+            if (nitems == 0) {
+                acc = is_all;
+            }
+            else {
+                switch (arg_type) {
+                case ME_BOOL:
+                    acc = val != 0.0;
+                    break;
+                case ME_INT8:
+                case ME_INT16:
+                case ME_INT32:
+                case ME_INT64:
+                case ME_UINT8:
+                case ME_UINT16:
+                case ME_UINT32:
+                case ME_UINT64:
+                case ME_FLOAT32:
+                case ME_FLOAT64:
+                    acc = val != 0.0;
+                    break;
+                case ME_COMPLEX64:
+                case ME_COMPLEX128:
+                    acc = val != 0.0;
+                    break;
+                default:
+                    acc = false;
+                    break;
+                }
+            }
+            ((bool*)write_ptr)[0] = acc;
+        }
+        else if (is_min || is_max) {
+            switch (arg_type) {
+            case ME_BOOL:
+                {
+                    bool acc = is_min;
+                    if (nitems > 0) {
+                        acc = (bool)val;
+                    }
+                    ((bool*)write_ptr)[0] = acc;
+                    break;
+                }
+            case ME_INT8:
+                {
+                    int8_t acc = (int8_t)(is_min ? INT8_MAX : INT8_MIN);
+                    if (nitems > 0) acc = (int8_t)val;
+                    ((int8_t*)write_ptr)[0] = acc;
+                    break;
+                }
+            case ME_INT16:
+                {
+                    int16_t acc = (int16_t)(is_min ? INT16_MAX : INT16_MIN);
+                    if (nitems > 0) acc = (int16_t)val;
+                    ((int16_t*)write_ptr)[0] = acc;
+                    break;
+                }
+            case ME_INT32:
+                {
+                    int32_t acc = (int32_t)(is_min ? INT32_MAX : INT32_MIN);
+                    if (nitems > 0) acc = (int32_t)val;
+                    ((int32_t*)write_ptr)[0] = acc;
+                    break;
+                }
+            case ME_INT64:
+                {
+                    int64_t acc = is_min ? INT64_MAX : INT64_MIN;
+                    if (nitems > 0) acc = (int64_t)val;
+                    ((int64_t*)write_ptr)[0] = acc;
+                    break;
+                }
+            case ME_UINT8:
+                {
+                    uint8_t acc = is_min ? UINT8_MAX : 0;
+                    if (nitems > 0) acc = (uint8_t)val;
+                    ((uint8_t*)write_ptr)[0] = acc;
+                    break;
+                }
+            case ME_UINT16:
+                {
+                    uint16_t acc = is_min ? UINT16_MAX : 0;
+                    if (nitems > 0) acc = (uint16_t)val;
+                    ((uint16_t*)write_ptr)[0] = acc;
+                    break;
+                }
+            case ME_UINT32:
+                {
+                    uint32_t acc = is_min ? UINT32_MAX : 0;
+                    if (nitems > 0) acc = (uint32_t)val;
+                    ((uint32_t*)write_ptr)[0] = acc;
+                    break;
+                }
+            case ME_UINT64:
+                {
+                    uint64_t acc = is_min ? UINT64_MAX : 0;
+                    if (nitems > 0) acc = (uint64_t)val;
+                    ((uint64_t*)write_ptr)[0] = acc;
+                    break;
+                }
+            case ME_FLOAT32:
+                {
+                    float acc = is_min ? INFINITY : -INFINITY;
+                    if (nitems > 0) acc = (float)val;
+                    ((float*)write_ptr)[0] = acc;
+                    break;
+                }
+            case ME_FLOAT64:
+                {
+                    double acc = is_min ? INFINITY : -INFINITY;
+                    if (nitems > 0) acc = val;
+                    ((double*)write_ptr)[0] = acc;
+                    break;
+                }
+            case ME_COMPLEX64:
+                {
+                    ((float _Complex*)write_ptr)[0] = (float _Complex)0.0f;
+                    break;
+                }
+            case ME_COMPLEX128:
+                {
+                    ((double _Complex*)write_ptr)[0] = (double _Complex)0.0;
+                    break;
+                }
+            default:
+                break;
+            }
+        }
+        else {
+            switch (arg_type) {
+            case ME_BOOL:
+            case ME_INT8:
+            case ME_INT16:
+            case ME_INT32:
+            case ME_INT64:
+                {
+                    int64_t acc = is_prod ? 1 : 0;
+                    if (nitems == 0) {
+                        acc = is_prod ? 1 : 0;
+                    }
+                    else if (is_prod) {
+                        int64_t v = (int64_t)val;
+                        for (int i = 0; i < nitems; i++) acc *= v;
+                    }
+                    else {
+                        acc = (int64_t)val * (int64_t)nitems;
+                    }
+                    ((int64_t*)write_ptr)[0] = acc;
+                    break;
+                }
+            case ME_UINT8:
+            case ME_UINT16:
+            case ME_UINT32:
+            case ME_UINT64:
+                {
+                    uint64_t acc = is_prod ? 1 : 0;
+                    if (nitems == 0) {
+                        acc = is_prod ? 1 : 0;
+                    }
+                    else if (is_prod) {
+                        uint64_t v = (uint64_t)val;
+                        for (int i = 0; i < nitems; i++) acc *= v;
+                    }
+                    else {
+                        acc = (uint64_t)val * (uint64_t)nitems;
+                    }
+                    ((uint64_t*)write_ptr)[0] = acc;
+                    break;
+                }
+            case ME_FLOAT32:
+                {
+                    float acc = is_prod ? 1.0f : 0.0f;
+                    if (nitems == 0) {
+                        acc = is_prod ? 1.0f : 0.0f;
+                    }
+                    else if (is_prod) {
+                        float v = (float)val;
+                        for (int i = 0; i < nitems; i++) acc *= v;
+                    }
+                    else {
+                        acc = (float)val * (float)nitems;
+                    }
+                    ((float*)write_ptr)[0] = acc;
+                    break;
+                }
+            case ME_FLOAT64:
+                {
+                    double acc = is_prod ? 1.0 : 0.0;
+                    if (nitems == 0) {
+                        acc = is_prod ? 1.0 : 0.0;
+                    }
+                    else if (is_prod) {
+                        for (int i = 0; i < nitems; i++) acc *= val;
+                    }
+                    else {
+                        acc = val * (double)nitems;
+                    }
+                    ((double*)write_ptr)[0] = acc;
+                    break;
+                }
+            case ME_COMPLEX64:
+                {
+                    float _Complex acc = is_prod ? (float _Complex)1.0f : (float _Complex)0.0f;
+                    float _Complex v = (float _Complex)val;
+                    if (nitems == 0) {
+                        acc = is_prod ? (float _Complex)1.0f : (float _Complex)0.0f;
+                    }
+                    else if (is_prod) {
+                        for (int i = 0; i < nitems; i++) acc *= v;
+                    }
+                    else {
+                        acc = v * (float)nitems;
+                    }
+                    ((float _Complex*)write_ptr)[0] = acc;
+                    break;
+                }
+            case ME_COMPLEX128:
+                {
+                    double _Complex acc = is_prod ? (double _Complex)1.0 : (double _Complex)0.0;
+                    double _Complex v = (double _Complex)val;
+                    if (nitems == 0) {
+                        acc = is_prod ? (double _Complex)1.0 : (double _Complex)0.0;
+                    }
+                    else if (is_prod) {
+                        for (int i = 0; i < nitems; i++) acc *= v;
+                    }
+                    else {
+                        acc = v * (double)nitems;
+                    }
+                    ((double _Complex*)write_ptr)[0] = acc;
+                    break;
+                }
+            default:
+                break;
+            }
+        }
+    }
+    else if (arg->type == ME_VARIABLE) {
+        switch (arg_type) {
+        case ME_BOOL:
+            {
+                const bool* data = (const bool*)arg->bound;
+                if (is_any || is_all) {
+                    bool acc = is_all;
+                    if (nitems > 0) {
+                        acc = is_all;
+                        for (int i = 0; i < nitems; i++) {
+                            if (is_any) {
+                                if (data[i]) { acc = true; break; }
+                            }
+                            else {
+                                if (!data[i]) { acc = false; break; }
+                            }
+                        }
+                    }
+                    ((bool*)write_ptr)[0] = acc;
+                }
+                else if (is_min || is_max) {
+                    bool acc = is_min;
+                    if (nitems > 0) {
+                        acc = data[0];
+                        for (int i = 1; i < nitems; i++) {
+                            acc = is_min ? (acc && data[i]) : (acc || data[i]);
+                        }
+                    }
+                    ((bool*)write_ptr)[0] = acc;
+                }
+                else {
+                    int64_t acc = is_prod ? 1 : 0;
+                    if (nitems == 0) {
+                        acc = is_prod ? 1 : 0;
+                    }
+                    else if (is_prod) {
+                        for (int i = 0; i < nitems; i++) acc *= data[i] ? 1 : 0;
+                    }
+                    else {
+                        for (int i = 0; i < nitems; i++) acc += data[i] ? 1 : 0;
+                    }
+                    ((int64_t*)write_ptr)[0] = acc;
+                }
+                break;
+            }
+        case ME_INT8:
+            {
+                const int8_t* data = (const int8_t*)arg->bound;
+                if (is_any || is_all) {
+                    bool acc = is_all;
+                    if (nitems > 0) {
+                        acc = is_all;
+                        for (int i = 0; i < nitems; i++) {
+                            if (is_any) {
+                                if (data[i] != 0) { acc = true; break; }
+                            }
+                            else {
+                                if (data[i] == 0) { acc = false; break; }
+                            }
+                        }
+                    }
+                    ((bool*)write_ptr)[0] = acc;
+                }
+                else if (is_min || is_max) {
+                    int8_t acc = is_min ? reduce_min_int8(data, nitems) :
+                        reduce_max_int8(data, nitems);
+                    ((int8_t*)write_ptr)[0] = acc;
+                }
+                else {
+                    int64_t acc = is_prod ? 1 : 0;
+                    if (nitems == 0) {
+                        acc = is_prod ? 1 : 0;
+                    }
+                    else if (is_prod) {
+                        for (int i = 0; i < nitems; i++) acc *= data[i];
+                    }
+                    else {
+                        for (int i = 0; i < nitems; i++) acc += data[i];
+                    }
+                    ((int64_t*)write_ptr)[0] = acc;
+                }
+                break;
+            }
+        case ME_INT16:
+            {
+                const int16_t* data = (const int16_t*)arg->bound;
+                if (is_any || is_all) {
+                    bool acc = is_all;
+                    if (nitems > 0) {
+                        acc = is_all;
+                        for (int i = 0; i < nitems; i++) {
+                            if (is_any) {
+                                if (data[i] != 0) { acc = true; break; }
+                            }
+                            else {
+                                if (data[i] == 0) { acc = false; break; }
+                            }
+                        }
+                    }
+                    ((bool*)write_ptr)[0] = acc;
+                }
+                else if (is_min || is_max) {
+                    int16_t acc = is_min ? reduce_min_int16(data, nitems) :
+                        reduce_max_int16(data, nitems);
+                    ((int16_t*)write_ptr)[0] = acc;
+                }
+                else {
+                    int64_t acc = is_prod ? 1 : 0;
+                    if (nitems == 0) {
+                        acc = is_prod ? 1 : 0;
+                    }
+                    else if (is_prod) {
+                        for (int i = 0; i < nitems; i++) acc *= data[i];
+                    }
+                    else {
+                        for (int i = 0; i < nitems; i++) acc += data[i];
+                    }
+                    ((int64_t*)write_ptr)[0] = acc;
+                }
+                break;
+            }
+        case ME_INT32:
+            {
+                const int32_t* data = (const int32_t*)arg->bound;
+                if (is_any || is_all) {
+                    bool acc = is_all;
+                    if (nitems > 0) {
+                        acc = is_all;
+                        for (int i = 0; i < nitems; i++) {
+                            if (is_any) {
+                                if (data[i] != 0) { acc = true; break; }
+                            }
+                            else {
+                                if (data[i] == 0) { acc = false; break; }
+                            }
+                        }
+                    }
+                    ((bool*)write_ptr)[0] = acc;
+                }
+                else if (is_min || is_max) {
+                    int32_t acc = is_min ? reduce_min_int32(data, nitems) :
+                        reduce_max_int32(data, nitems);
+                    ((int32_t*)write_ptr)[0] = acc;
+                }
+                else {
+                    int64_t acc = is_prod ? 1 : 0;
+                    if (nitems == 0) {
+                        acc = is_prod ? 1 : 0;
+                    }
+                    else if (is_prod) {
+                        for (int i = 0; i < nitems; i++) acc *= data[i];
+                    }
+                    else {
+                        for (int i = 0; i < nitems; i++) acc += data[i];
+                    }
+                    ((int64_t*)write_ptr)[0] = acc;
+                }
+                break;
+            }
+        case ME_INT64:
+            {
+                const int64_t* data = (const int64_t*)arg->bound;
+                if (is_any || is_all) {
+                    bool acc = is_all;
+                    if (nitems > 0) {
+                        acc = is_all;
+                        for (int i = 0; i < nitems; i++) {
+                            if (is_any) {
+                                if (data[i] != 0) { acc = true; break; }
+                            }
+                            else {
+                                if (data[i] == 0) { acc = false; break; }
+                            }
+                        }
+                    }
+                    ((bool*)write_ptr)[0] = acc;
+                }
+                else if (is_min || is_max) {
+                    int64_t acc = is_min ? reduce_min_int64(data, nitems) :
+                        reduce_max_int64(data, nitems);
+                    ((int64_t*)write_ptr)[0] = acc;
+                }
+                else {
+                    int64_t acc = is_prod ? 1 : 0;
+                    if (nitems == 0) {
+                        acc = is_prod ? 1 : 0;
+                    }
+                    else if (is_prod) {
+                        for (int i = 0; i < nitems; i++) acc *= data[i];
+                    }
+                    else {
+                        for (int i = 0; i < nitems; i++) acc += data[i];
+                    }
+                    ((int64_t*)write_ptr)[0] = acc;
+                }
+                break;
+            }
+        case ME_UINT8:
+            {
+                const uint8_t* data = (const uint8_t*)arg->bound;
+                if (is_any || is_all) {
+                    bool acc = is_all;
+                    if (nitems > 0) {
+                        acc = is_all;
+                        for (int i = 0; i < nitems; i++) {
+                            if (is_any) {
+                                if (data[i] != 0) { acc = true; break; }
+                            }
+                            else {
+                                if (data[i] == 0) { acc = false; break; }
+                            }
+                        }
+                    }
+                    ((bool*)write_ptr)[0] = acc;
+                }
+                else if (is_min || is_max) {
+                    uint8_t acc = is_min ? reduce_min_uint8(data, nitems) :
+                        reduce_max_uint8(data, nitems);
+                    ((uint8_t*)write_ptr)[0] = acc;
+                }
+                else {
+                    uint64_t acc = is_prod ? 1 : 0;
+                    if (nitems == 0) {
+                        acc = is_prod ? 1 : 0;
+                    }
+                    else if (is_prod) {
+                        for (int i = 0; i < nitems; i++) acc *= data[i];
+                    }
+                    else {
+                        for (int i = 0; i < nitems; i++) acc += data[i];
+                    }
+                    ((uint64_t*)write_ptr)[0] = acc;
+                }
+                break;
+            }
+        case ME_UINT16:
+            {
+                const uint16_t* data = (const uint16_t*)arg->bound;
+                if (is_any || is_all) {
+                    bool acc = is_all;
+                    if (nitems > 0) {
+                        acc = is_all;
+                        for (int i = 0; i < nitems; i++) {
+                            if (is_any) {
+                                if (data[i] != 0) { acc = true; break; }
+                            }
+                            else {
+                                if (data[i] == 0) { acc = false; break; }
+                            }
+                        }
+                    }
+                    ((bool*)write_ptr)[0] = acc;
+                }
+                else if (is_min || is_max) {
+                    uint16_t acc = is_min ? reduce_min_uint16(data, nitems) :
+                        reduce_max_uint16(data, nitems);
+                    ((uint16_t*)write_ptr)[0] = acc;
+                }
+                else {
+                    uint64_t acc = is_prod ? 1 : 0;
+                    if (nitems == 0) {
+                        acc = is_prod ? 1 : 0;
+                    }
+                    else if (is_prod) {
+                        for (int i = 0; i < nitems; i++) acc *= data[i];
+                    }
+                    else {
+                        for (int i = 0; i < nitems; i++) acc += data[i];
+                    }
+                    ((uint64_t*)write_ptr)[0] = acc;
+                }
+                break;
+            }
+        case ME_UINT32:
+            {
+                const uint32_t* data = (const uint32_t*)arg->bound;
+                if (is_any || is_all) {
+                    bool acc = is_all;
+                    if (nitems > 0) {
+                        acc = is_all;
+                        for (int i = 0; i < nitems; i++) {
+                            if (is_any) {
+                                if (data[i] != 0) { acc = true; break; }
+                            }
+                            else {
+                                if (data[i] == 0) { acc = false; break; }
+                            }
+                        }
+                    }
+                    ((bool*)write_ptr)[0] = acc;
+                }
+                else if (is_min || is_max) {
+                    uint32_t acc = is_min ? reduce_min_uint32(data, nitems) :
+                        reduce_max_uint32(data, nitems);
+                    ((uint32_t*)write_ptr)[0] = acc;
+                }
+                else {
+                    uint64_t acc = is_prod ? 1 : 0;
+                    if (nitems == 0) {
+                        acc = is_prod ? 1 : 0;
+                    }
+                    else if (is_prod) {
+                        for (int i = 0; i < nitems; i++) acc *= data[i];
+                    }
+                    else {
+                        for (int i = 0; i < nitems; i++) acc += data[i];
+                    }
+                    ((uint64_t*)write_ptr)[0] = acc;
+                }
+                break;
+            }
+        case ME_UINT64:
+            {
+                const uint64_t* data = (const uint64_t*)arg->bound;
+                if (is_any || is_all) {
+                    bool acc = is_all;
+                    if (nitems > 0) {
+                        acc = is_all;
+                        for (int i = 0; i < nitems; i++) {
+                            if (is_any) {
+                                if (data[i] != 0) { acc = true; break; }
+                            }
+                            else {
+                                if (data[i] == 0) { acc = false; break; }
+                            }
+                        }
+                    }
+                    ((bool*)write_ptr)[0] = acc;
+                }
+                else if (is_min || is_max) {
+                    uint64_t acc = is_min ? reduce_min_uint64(data, nitems) :
+                        reduce_max_uint64(data, nitems);
+                    ((uint64_t*)write_ptr)[0] = acc;
+                }
+                else {
+                    uint64_t acc = is_prod ? 1 : 0;
+                    if (nitems == 0) {
+                        acc = is_prod ? 1 : 0;
+                    }
+                    else if (is_prod) {
+                        for (int i = 0; i < nitems; i++) acc *= data[i];
+                    }
+                    else {
+                        for (int i = 0; i < nitems; i++) acc += data[i];
                     }
+                    ((uint64_t*)write_ptr)[0] = acc;
                 }
+                break;
             }
-            break;
-
-        case ME_FUNCTION0:
-        case ME_FUNCTION1:
-        case ME_FUNCTION2:
-        case ME_FUNCTION3:
-        case ME_FUNCTION4:
-        case ME_FUNCTION5:
-        case ME_FUNCTION6:
-        case ME_FUNCTION7:
-        case ME_CLOSURE0:
-        case ME_CLOSURE1:
-        case ME_CLOSURE2:
-        case ME_CLOSURE3:
-        case ME_CLOSURE4:
-        case ME_CLOSURE5:
-        case ME_CLOSURE6:
-        case ME_CLOSURE7: {
-            const int arity = ARITY(n->type);
-            for (int i = 0; i < arity; i++) {
-                promote_variables_in_tree((me_expr *) n->parameters[i], target_type,
-                                          promotions, promo_count, nitems);
+        case ME_FLOAT32:
+            {
+                const float* data = (const float*)arg->bound;
+                if (is_any || is_all) {
+                    bool acc = is_all;
+                    if (nitems > 0) {
+                        acc = is_all;
+                        for (int i = 0; i < nitems; i++) {
+                            if (is_any) {
+                                if (data[i] != 0.0f) { acc = true; break; }
+                            }
+                            else {
+                                if (data[i] == 0.0f) { acc = false; break; }
+                            }
+                        }
+                    }
+                    ((bool*)write_ptr)[0] = acc;
+                }
+                else {
+                    float acc = 0.0f;
+                    if (nitems == 0) {
+                        if (is_min) acc = INFINITY;
+                        else if (is_max) acc = -INFINITY;
+                        else acc = is_prod ? 1.0f : 0.0f;
+                    }
+                    else if (is_min) {
+                        acc = reduce_min_float32_nan_safe(data, nitems);
+                    }
+                    else if (is_max) {
+                        acc = reduce_max_float32_nan_safe(data, nitems);
+                    }
+                    else if (is_prod) {
+                        acc = reduce_prod_float32_nan_safe(data, nitems);
+                    }
+                    else {
+                        acc = reduce_sum_float32_nan_safe(data, nitems);
+                    }
+                    ((float*)write_ptr)[0] = acc;
+                }
+                break;
             }
-            break;
-        }
-    }
-}
-
-/* Restore original variable bindings after promotion */
-static void restore_variables_in_tree(me_expr *n, const void **original_bounds,
-                                      const me_dtype *original_types, int *restore_idx) {
-    if (!n) return;
-
-    switch (TYPE_MASK(n->type)) {
-        case ME_VARIABLE:
-            if (original_bounds[*restore_idx] != NULL) {
-                n->bound = original_bounds[*restore_idx];
-                n->dtype = original_types[*restore_idx];
-                (*restore_idx)++;
+        case ME_FLOAT64:
+            {
+                const double* data = (const double*)arg->bound;
+                if (is_any || is_all) {
+                    bool acc = is_all;
+                    if (nitems > 0) {
+                        acc = is_all;
+                        for (int i = 0; i < nitems; i++) {
+                            if (is_any) {
+                                if (data[i] != 0.0) { acc = true; break; }
+                            }
+                            else {
+                                if (data[i] == 0.0) { acc = false; break; }
+                            }
+                        }
+                    }
+                    ((bool*)write_ptr)[0] = acc;
+                }
+                else {
+                    double acc = 0.0;
+                    if (nitems == 0) {
+                        if (is_min) acc = INFINITY;
+                        else if (is_max) acc = -INFINITY;
+                        else acc = is_prod ? 1.0 : 0.0;
+                    }
+                    else if (is_min) {
+                        acc = reduce_min_float64_nan_safe(data, nitems);
+                    }
+                    else if (is_max) {
+                        acc = reduce_max_float64_nan_safe(data, nitems);
+                    }
+                    else if (is_prod) {
+                        acc = reduce_prod_float64_nan_safe(data, nitems);
+                    }
+                    else {
+                        acc = reduce_sum_float64_nan_safe(data, nitems);
+                    }
+                    ((double*)write_ptr)[0] = acc;
+                }
+                break;
             }
-            break;
-
-        case ME_FUNCTION0:
-        case ME_FUNCTION1:
-        case ME_FUNCTION2:
-        case ME_FUNCTION3:
-        case ME_FUNCTION4:
-        case ME_FUNCTION5:
-        case ME_FUNCTION6:
-        case ME_FUNCTION7:
-        case ME_CLOSURE0:
-        case ME_CLOSURE1:
-        case ME_CLOSURE2:
-        case ME_CLOSURE3:
-        case ME_CLOSURE4:
-        case ME_CLOSURE5:
-        case ME_CLOSURE6:
-        case ME_CLOSURE7: {
-            const int arity = ARITY(n->type);
-            for (int i = 0; i < arity; i++) {
-                restore_variables_in_tree((me_expr *) n->parameters[i], original_bounds, original_types, restore_idx);
+        case ME_COMPLEX64:
+            {
+                const float _Complex* data = (const float _Complex*)arg->bound;
+                if (is_any || is_all) {
+                    bool acc = is_all;
+                    if (nitems > 0) {
+                        acc = is_all;
+                        for (int i = 0; i < nitems; i++) {
+                            bool nonzero = IS_NONZERO_c64(data[i]);
+                            if (is_any) {
+                                if (nonzero) { acc = true; break; }
+                            }
+                            else {
+                                if (!nonzero) { acc = false; break; }
+                            }
+                        }
+                    }
+                    ((bool*)write_ptr)[0] = acc;
+                    break;
+                }
+                if (is_min || is_max) {
+                    ((float _Complex*)write_ptr)[0] = (float _Complex)0.0f;
+                    break;
+                }
+                float _Complex acc = is_prod ? (float _Complex)1.0f : (float _Complex)0.0f;
+                if (nitems == 0) {
+                    acc = is_prod ? (float _Complex)1.0f : (float _Complex)0.0f;
+                }
+                else if (is_prod) {
+                    for (int i = 0; i < nitems; i++) acc *= data[i];
+                }
+                else {
+                    for (int i = 0; i < nitems; i++) acc += data[i];
+                }
+                ((float _Complex*)write_ptr)[0] = acc;
+                break;
+            }
+        case ME_COMPLEX128:
+            {
+                const double _Complex* data = (const double _Complex*)arg->bound;
+                if (is_any || is_all) {
+                    bool acc = is_all;
+                    if (nitems > 0) {
+                        acc = is_all;
+                        for (int i = 0; i < nitems; i++) {
+                            bool nonzero = IS_NONZERO_c128(data[i]);
+                            if (is_any) {
+                                if (nonzero) { acc = true; break; }
+                            }
+                            else {
+                                if (!nonzero) { acc = false; break; }
+                            }
+                        }
+                    }
+                    ((bool*)write_ptr)[0] = acc;
+                    break;
+                }
+                if (is_min || is_max) {
+                    ((double _Complex*)write_ptr)[0] = (double _Complex)0.0;
+                    break;
+                }
+                double _Complex acc = is_prod ? (double _Complex)1.0 : (double _Complex)0.0;
+                if (nitems == 0) {
+                    acc = is_prod ? (double _Complex)1.0 : (double _Complex)0.0;
+                }
+                else if (is_prod) {
+                    for (int i = 0; i < nitems; i++) acc *= data[i];
+                }
+                else {
+                    for (int i = 0; i < nitems; i++) acc += data[i];
+                }
+                ((double _Complex*)write_ptr)[0] = acc;
+                break;
             }
+        default:
             break;
         }
     }
-}
-
-/* Check if all variables in tree match target type */
-static bool all_variables_match_type(const me_expr *n, me_dtype target_type) {
-    if (!n) return true;
 
-    switch (TYPE_MASK(n->type)) {
-        case ME_CONSTANT:
-            return true; // Constants are always OK
-
-        case ME_VARIABLE:
-            return n->dtype == target_type;
-
-        case ME_FUNCTION0:
-        case ME_FUNCTION1:
-        case ME_FUNCTION2:
-        case ME_FUNCTION3:
-        case ME_FUNCTION4:
-        case ME_FUNCTION5:
-        case ME_FUNCTION6:
-        case ME_FUNCTION7:
-        case ME_CLOSURE0:
-        case ME_CLOSURE1:
-        case ME_CLOSURE2:
-        case ME_CLOSURE3:
-        case ME_CLOSURE4:
-        case ME_CLOSURE5:
-        case ME_CLOSURE6:
-        case ME_CLOSURE7: {
-            const int arity = ARITY(n->type);
-            for (int i = 0; i < arity; i++) {
-                if (!all_variables_match_type((const me_expr *) n->parameters[i], target_type)) {
-                    return false;
-                }
-            }
-            return true;
+    if (temp_output) {
+        convert_func_t conv = get_convert_func(result_type, output_type);
+        if (conv) {
+            conv(temp_output, n->output, 1);
         }
+        free(temp_output);
     }
-
-    return true;
 }
 
-static void private_eval(const me_expr *n) {
+static void private_eval(const me_expr* n) {
     if (!n) return;
 
+    if (is_reduction_node(n)) {
+        eval_reduction(n);
+        return;
+    }
+
     // Special case: imag() and real() functions return real from complex input
     if (IS_FUNCTION(n->type) && ARITY(n->type) == 1) {
         if (n->function == (void*)imag_wrapper || n->function == (void*)real_wrapper) {
-            me_expr *arg = (me_expr*)n->parameters[0];
+            me_expr* arg = (me_expr*)n->parameters[0];
             me_dtype arg_type = infer_result_type(arg);
 
             if (arg_type == ME_COMPLEX64) {
@@ -3036,8 +5312,8 @@ static void private_eval(const me_expr *n) {
                 me_eval_c64(arg);
 
                 // Extract real/imaginary part to float32 output
-                const float _Complex *cdata = (const float _Complex*)arg->output;
-                float *output = (float*)n->output;
+                const float _Complex* cdata = (const float _Complex*)arg->output;
+                float* output = (float*)n->output;
                 if (n->function == (void*)imag_wrapper) {
                     for (int i = 0; i < n->nitems; i++) {
 #if defined(_MSC_VER) && defined(__clang__)
@@ -3046,7 +5322,8 @@ static void private_eval(const me_expr *n) {
                         output[i] = cimagf(cdata[i]);
 #endif
                     }
-                } else { // real_wrapper
+                }
+                else { // real_wrapper
                     for (int i = 0; i < n->nitems; i++) {
 #if defined(_MSC_VER) && defined(__clang__)
                         output[i] = __builtin_crealf(cdata[i]);
@@ -3056,7 +5333,8 @@ static void private_eval(const me_expr *n) {
                     }
                 }
                 return;
-            } else if (arg_type == ME_COMPLEX128) {
+            }
+            else if (arg_type == ME_COMPLEX128) {
                 // Evaluate argument as complex128
                 if (!arg->output) {
                     arg->output = malloc(n->nitems * sizeof(double _Complex));
@@ -3066,8 +5344,8 @@ static void private_eval(const me_expr *n) {
                 me_eval_c128(arg);
 
                 // Extract real/imaginary part to float64 output
-                const double _Complex *cdata = (const double _Complex*)arg->output;
-                double *output = (double*)n->output;
+                const double _Complex* cdata = (const double _Complex*)arg->output;
+                double* output = (double*)n->output;
                 if (n->function == (void*)imag_wrapper) {
                     for (int i = 0; i < n->nitems; i++) {
 #if defined(_MSC_VER) && defined(__clang__)
@@ -3076,7 +5354,8 @@ static void private_eval(const me_expr *n) {
                         output[i] = cimag(cdata[i]);
 #endif
                     }
-                } else { // real_wrapper
+                }
+                else { // real_wrapper
                     for (int i = 0; i < n->nitems; i++) {
 #if defined(_MSC_VER) && defined(__clang__)
                         output[i] = __builtin_creal(cdata[i]);
@@ -3107,38 +5386,38 @@ static void private_eval(const me_expr *n) {
 #endif
         }
         switch (n->dtype) {
-            case ME_BOOL: me_eval_i8(n);
-                break;
-            case ME_INT8: me_eval_i8(n);
-                break;
-            case ME_INT16: me_eval_i16(n);
-                break;
-            case ME_INT32: me_eval_i32(n);
-                break;
-            case ME_INT64: me_eval_i64(n);
-                break;
-            case ME_UINT8: me_eval_u8(n);
-                break;
-            case ME_UINT16: me_eval_u16(n);
-                break;
-            case ME_UINT32: me_eval_u32(n);
-                break;
-            case ME_UINT64: me_eval_u64(n);
-                break;
-            case ME_FLOAT32: me_eval_f32(n);
-                break;
-            case ME_FLOAT64: me_eval_f64(n);
-                break;
-            case ME_COMPLEX64: me_eval_c64(n);
-                break;
-            case ME_COMPLEX128: me_eval_c128(n);
-                break;
-            default:
-                fprintf(stderr, "FATAL: Invalid dtype %d in evaluation.\n", n->dtype);
+        case ME_BOOL: me_eval_i8(n);
+            break;
+        case ME_INT8: me_eval_i8(n);
+            break;
+        case ME_INT16: me_eval_i16(n);
+            break;
+        case ME_INT32: me_eval_i32(n);
+            break;
+        case ME_INT64: me_eval_i64(n);
+            break;
+        case ME_UINT8: me_eval_u8(n);
+            break;
+        case ME_UINT16: me_eval_u16(n);
+            break;
+        case ME_UINT32: me_eval_u32(n);
+            break;
+        case ME_UINT64: me_eval_u64(n);
+            break;
+        case ME_FLOAT32: me_eval_f32(n);
+            break;
+        case ME_FLOAT64: me_eval_f64(n);
+            break;
+        case ME_COMPLEX64: me_eval_c64(n);
+            break;
+        case ME_COMPLEX128: me_eval_c128(n);
+            break;
+        default:
+            fprintf(stderr, "FATAL: Invalid dtype %d in evaluation.\n", n->dtype);
 #ifdef NDEBUG
-                abort(); // Release build: terminate immediately
+            abort(); // Release build: terminate immediately
 #else
-                assert(0 && "Invalid dtype"); // Debug: trigger debugger
+            assert(0 && "Invalid dtype"); // Debug: trigger debugger
 #endif
         }
         return;
@@ -3150,30 +5429,30 @@ static void private_eval(const me_expr *n) {
     int promo_count = 0;
 
     // Save original variable bindings
-    const void *original_bounds[ME_MAX_VARS];
+    const void* original_bounds[ME_MAX_VARS];
     me_dtype original_types[ME_MAX_VARS];
     int save_idx = 0;
 
     save_variable_bindings(n, original_bounds, original_types, &save_idx);
 
     // Promote variables
-    promote_variables_in_tree((me_expr *) n, result_type, promotions, &promo_count, n->nitems);
+    promote_variables_in_tree((me_expr*)n, result_type, promotions, &promo_count, n->nitems);
 
     // Check if we need output type conversion (e.g., computation in float64, output in bool)
     me_dtype saved_dtype = n->dtype;
-    void *original_output = n->output;
-    void *temp_output = NULL;
+    void* original_output = n->output;
+    void* temp_output = NULL;
 
     if (saved_dtype != result_type) {
         // Allocate temp buffer for computation
         temp_output = malloc(n->nitems * dtype_size(result_type));
         if (temp_output) {
-            ((me_expr *) n)->output = temp_output;
+            ((me_expr*)n)->output = temp_output;
         }
     }
 
     // Update expression type for evaluation
-    ((me_expr *) n)->dtype = result_type;
+    ((me_expr*)n)->dtype = result_type;
 
     // Evaluate with promoted types
     if (result_type == ME_AUTO) {
@@ -3185,38 +5464,38 @@ static void private_eval(const me_expr *n) {
 #endif
     }
     switch (result_type) {
-        case ME_BOOL: me_eval_i8(n);
-            break;
-        case ME_INT8: me_eval_i8(n);
-            break;
-        case ME_INT16: me_eval_i16(n);
-            break;
-        case ME_INT32: me_eval_i32(n);
-            break;
-        case ME_INT64: me_eval_i64(n);
-            break;
-        case ME_UINT8: me_eval_u8(n);
-            break;
-        case ME_UINT16: me_eval_u16(n);
-            break;
-        case ME_UINT32: me_eval_u32(n);
-            break;
-        case ME_UINT64: me_eval_u64(n);
-            break;
-        case ME_FLOAT32: me_eval_f32(n);
-            break;
-        case ME_FLOAT64: me_eval_f64(n);
-            break;
-        case ME_COMPLEX64: me_eval_c64(n);
-            break;
-        case ME_COMPLEX128: me_eval_c128(n);
-            break;
-        default:
-            fprintf(stderr, "FATAL: Invalid result type %d in evaluation.\n", result_type);
+    case ME_BOOL: me_eval_i8(n);
+        break;
+    case ME_INT8: me_eval_i8(n);
+        break;
+    case ME_INT16: me_eval_i16(n);
+        break;
+    case ME_INT32: me_eval_i32(n);
+        break;
+    case ME_INT64: me_eval_i64(n);
+        break;
+    case ME_UINT8: me_eval_u8(n);
+        break;
+    case ME_UINT16: me_eval_u16(n);
+        break;
+    case ME_UINT32: me_eval_u32(n);
+        break;
+    case ME_UINT64: me_eval_u64(n);
+        break;
+    case ME_FLOAT32: me_eval_f32(n);
+        break;
+    case ME_FLOAT64: me_eval_f64(n);
+        break;
+    case ME_COMPLEX64: me_eval_c64(n);
+        break;
+    case ME_COMPLEX128: me_eval_c128(n);
+        break;
+    default:
+        fprintf(stderr, "FATAL: Invalid result type %d in evaluation.\n", result_type);
 #ifdef NDEBUG
-            abort(); // Release build: terminate immediately
+        abort(); // Release build: terminate immediately
 #else
-            assert(0 && "Invalid dtype"); // Debug: trigger debugger
+        assert(0 && "Invalid dtype"); // Debug: trigger debugger
 #endif
     }
 
@@ -3227,16 +5506,16 @@ static void private_eval(const me_expr *n) {
             conv(temp_output, original_output, n->nitems);
         }
         // Restore original output pointer
-        ((me_expr *) n)->output = original_output;
+        ((me_expr*)n)->output = original_output;
         free(temp_output);
     }
 
     // Restore original variable bindings
     int restore_idx = 0;
-    restore_variables_in_tree((me_expr *) n, original_bounds, original_types, &restore_idx);
+    restore_variables_in_tree((me_expr*)n, original_bounds, original_types, &restore_idx);
 
     // Restore expression type
-    ((me_expr *) n)->dtype = saved_dtype;
+    ((me_expr*)n)->dtype = saved_dtype;
 
     // Free promoted buffers
     for (int i = 0; i < promo_count; i++) {
@@ -3247,99 +5526,102 @@ static void private_eval(const me_expr *n) {
 }
 
 /* Helper to update variable bindings and nitems in tree */
-static void save_nitems_in_tree(const me_expr *node, int *nitems_array, int *idx) {
+static void save_nitems_in_tree(const me_expr* node, int* nitems_array, int* idx) {
     if (!node) return;
     nitems_array[(*idx)++] = node->nitems;
 
     switch (TYPE_MASK(node->type)) {
-        case ME_FUNCTION0:
-        case ME_FUNCTION1:
-        case ME_FUNCTION2:
-        case ME_FUNCTION3:
-        case ME_FUNCTION4:
-        case ME_FUNCTION5:
-        case ME_FUNCTION6:
-        case ME_FUNCTION7:
-        case ME_CLOSURE0:
-        case ME_CLOSURE1:
-        case ME_CLOSURE2:
-        case ME_CLOSURE3:
-        case ME_CLOSURE4:
-        case ME_CLOSURE5:
-        case ME_CLOSURE6:
-        case ME_CLOSURE7: {
+    case ME_FUNCTION0:
+    case ME_FUNCTION1:
+    case ME_FUNCTION2:
+    case ME_FUNCTION3:
+    case ME_FUNCTION4:
+    case ME_FUNCTION5:
+    case ME_FUNCTION6:
+    case ME_FUNCTION7:
+    case ME_CLOSURE0:
+    case ME_CLOSURE1:
+    case ME_CLOSURE2:
+    case ME_CLOSURE3:
+    case ME_CLOSURE4:
+    case ME_CLOSURE5:
+    case ME_CLOSURE6:
+    case ME_CLOSURE7:
+        {
             const int arity = ARITY(node->type);
             for (int i = 0; i < arity; i++) {
-                save_nitems_in_tree((const me_expr *) node->parameters[i], nitems_array, idx);
+                save_nitems_in_tree((const me_expr*)node->parameters[i], nitems_array, idx);
             }
             break;
         }
-        default:
-            break;
+    default:
+        break;
     }
 }
 
-static void restore_nitems_in_tree(me_expr *node, const int *nitems_array, int *idx) {
+static void restore_nitems_in_tree(me_expr* node, const int* nitems_array, int* idx) {
     if (!node) return;
     node->nitems = nitems_array[(*idx)++];
 
     switch (TYPE_MASK(node->type)) {
-        case ME_FUNCTION0:
-        case ME_FUNCTION1:
-        case ME_FUNCTION2:
-        case ME_FUNCTION3:
-        case ME_FUNCTION4:
-        case ME_FUNCTION5:
-        case ME_FUNCTION6:
-        case ME_FUNCTION7:
-        case ME_CLOSURE0:
-        case ME_CLOSURE1:
-        case ME_CLOSURE2:
-        case ME_CLOSURE3:
-        case ME_CLOSURE4:
-        case ME_CLOSURE5:
-        case ME_CLOSURE6:
-        case ME_CLOSURE7: {
+    case ME_FUNCTION0:
+    case ME_FUNCTION1:
+    case ME_FUNCTION2:
+    case ME_FUNCTION3:
+    case ME_FUNCTION4:
+    case ME_FUNCTION5:
+    case ME_FUNCTION6:
+    case ME_FUNCTION7:
+    case ME_CLOSURE0:
+    case ME_CLOSURE1:
+    case ME_CLOSURE2:
+    case ME_CLOSURE3:
+    case ME_CLOSURE4:
+    case ME_CLOSURE5:
+    case ME_CLOSURE6:
+    case ME_CLOSURE7:
+        {
             const int arity = ARITY(node->type);
             for (int i = 0; i < arity; i++) {
-                restore_nitems_in_tree((me_expr *) node->parameters[i], nitems_array, idx);
+                restore_nitems_in_tree((me_expr*)node->parameters[i], nitems_array, idx);
             }
             break;
         }
-        default:
-            break;
+    default:
+        break;
     }
 }
 
 /* Helper to free intermediate output buffers */
-static void free_intermediate_buffers(me_expr *node) {
+static void free_intermediate_buffers(me_expr* node) {
     if (!node) return;
 
     switch (TYPE_MASK(node->type)) {
-        case ME_CONSTANT:
-        case ME_VARIABLE:
-            // These don't have intermediate buffers
-            break;
-
-        case ME_FUNCTION0:
-        case ME_FUNCTION1:
-        case ME_FUNCTION2:
-        case ME_FUNCTION3:
-        case ME_FUNCTION4:
-        case ME_FUNCTION5:
-        case ME_FUNCTION6:
-        case ME_FUNCTION7:
-        case ME_CLOSURE0:
-        case ME_CLOSURE1:
-        case ME_CLOSURE2:
-        case ME_CLOSURE3:
-        case ME_CLOSURE4:
-        case ME_CLOSURE5:
-        case ME_CLOSURE6:
-        case ME_CLOSURE7: {
+    case ME_CONSTANT:
+    case ME_VARIABLE:
+        // These don't have intermediate buffers
+        break;
+
+    case ME_FUNCTION0:
+    case ME_FUNCTION1:
+    case ME_FUNCTION2:
+    case ME_FUNCTION3:
+    case ME_FUNCTION4:
+    case ME_FUNCTION5:
+    case ME_FUNCTION6:
+    case ME_FUNCTION7:
+    case ME_CLOSURE0:
+    case ME_CLOSURE1:
+    case ME_CLOSURE2:
+    case ME_CLOSURE3:
+    case ME_CLOSURE4:
+    case ME_CLOSURE5:
+    case ME_CLOSURE6:
+    case ME_CLOSURE7:
+        {
             const int arity = ARITY(node->type);
             for (int i = 0; i < arity; i++) {
-                me_expr *param = (me_expr *) node->parameters[i];
+                me_expr* param = (me_expr*)node->parameters[i];
                 free_intermediate_buffers(param);
 
                 // Free intermediate buffer (but not for root or variables/constants)
@@ -3354,68 +5636,70 @@ static void free_intermediate_buffers(me_expr *node) {
 }
 
 /* Helper to save original variable bindings with their pointers */
-static void save_variable_metadata(const me_expr *node, const void **var_pointers, size_t *var_sizes, int *var_count) {
+static void save_variable_metadata(const me_expr* node, const void** var_pointers, size_t* var_sizes, int* var_count) {
     if (!node) return;
     switch (TYPE_MASK(node->type)) {
-        case ME_VARIABLE:
-            // Check if this pointer is already in the list
-            for (int i = 0; i < *var_count; i++) {
-                if (var_pointers[i] == node->bound) return; // Already saved
-            }
-            var_pointers[*var_count] = node->bound;
-            var_sizes[*var_count] = dtype_size(node->input_dtype);
-            (*var_count)++;
-            break;
-        case ME_FUNCTION0:
-        case ME_FUNCTION1:
-        case ME_FUNCTION2:
-        case ME_FUNCTION3:
-        case ME_FUNCTION4:
-        case ME_FUNCTION5:
-        case ME_FUNCTION6:
-        case ME_FUNCTION7:
-        case ME_CLOSURE0:
-        case ME_CLOSURE1:
-        case ME_CLOSURE2:
-        case ME_CLOSURE3:
-        case ME_CLOSURE4:
-        case ME_CLOSURE5:
-        case ME_CLOSURE6:
-        case ME_CLOSURE7: {
+    case ME_VARIABLE:
+        // Check if this pointer is already in the list
+        for (int i = 0; i < *var_count; i++) {
+            if (var_pointers[i] == node->bound) return; // Already saved
+        }
+        var_pointers[*var_count] = node->bound;
+        var_sizes[*var_count] = dtype_size(node->input_dtype);
+        (*var_count)++;
+        break;
+    case ME_FUNCTION0:
+    case ME_FUNCTION1:
+    case ME_FUNCTION2:
+    case ME_FUNCTION3:
+    case ME_FUNCTION4:
+    case ME_FUNCTION5:
+    case ME_FUNCTION6:
+    case ME_FUNCTION7:
+    case ME_CLOSURE0:
+    case ME_CLOSURE1:
+    case ME_CLOSURE2:
+    case ME_CLOSURE3:
+    case ME_CLOSURE4:
+    case ME_CLOSURE5:
+    case ME_CLOSURE6:
+    case ME_CLOSURE7:
+        {
             const int arity = ARITY(node->type);
             for (int i = 0; i < arity; i++) {
-                save_variable_metadata((const me_expr *) node->parameters[i], var_pointers, var_sizes, var_count);
+                save_variable_metadata((const me_expr*)node->parameters[i], var_pointers, var_sizes, var_count);
             }
             break;
         }
     }
 }
 
-static int count_variable_nodes(const me_expr *node) {
+static int count_variable_nodes(const me_expr* node) {
     if (!node) return 0;
     switch (TYPE_MASK(node->type)) {
-        case ME_VARIABLE:
-            return 1;
-        case ME_FUNCTION0:
-        case ME_FUNCTION1:
-        case ME_FUNCTION2:
-        case ME_FUNCTION3:
-        case ME_FUNCTION4:
-        case ME_FUNCTION5:
-        case ME_FUNCTION6:
-        case ME_FUNCTION7:
-        case ME_CLOSURE0:
-        case ME_CLOSURE1:
-        case ME_CLOSURE2:
-        case ME_CLOSURE3:
-        case ME_CLOSURE4:
-        case ME_CLOSURE5:
-        case ME_CLOSURE6:
-        case ME_CLOSURE7: {
+    case ME_VARIABLE:
+        return 1;
+    case ME_FUNCTION0:
+    case ME_FUNCTION1:
+    case ME_FUNCTION2:
+    case ME_FUNCTION3:
+    case ME_FUNCTION4:
+    case ME_FUNCTION5:
+    case ME_FUNCTION6:
+    case ME_FUNCTION7:
+    case ME_CLOSURE0:
+    case ME_CLOSURE1:
+    case ME_CLOSURE2:
+    case ME_CLOSURE3:
+    case ME_CLOSURE4:
+    case ME_CLOSURE5:
+    case ME_CLOSURE6:
+    case ME_CLOSURE7:
+        {
             int count = 0;
             const int arity = ARITY(node->type);
             for (int i = 0; i < arity; i++) {
-                count += count_variable_nodes((const me_expr *) node->parameters[i]);
+                count += count_variable_nodes((const me_expr*)node->parameters[i]);
             }
             return count;
         }
@@ -3423,11 +5707,12 @@ static int count_variable_nodes(const me_expr *node) {
     return 0;
 }
 
-static void collect_variable_nodes(me_expr *node, const void **var_pointers, int n_vars,
-                                   me_expr **var_nodes, int *var_indices, int *node_count) {
+static void collect_variable_nodes(me_expr* node, const void** var_pointers, int n_vars,
+                                   me_expr** var_nodes, int* var_indices, int* node_count) {
     if (!node) return;
     switch (TYPE_MASK(node->type)) {
-        case ME_VARIABLE: {
+    case ME_VARIABLE:
+        {
             int idx = -1;
             for (int i = 0; i < n_vars; i++) {
                 if (node->bound == var_pointers[i]) {
@@ -3442,25 +5727,26 @@ static void collect_variable_nodes(me_expr *node, const void **var_pointers, int
             }
             break;
         }
-        case ME_FUNCTION0:
-        case ME_FUNCTION1:
-        case ME_FUNCTION2:
-        case ME_FUNCTION3:
-        case ME_FUNCTION4:
-        case ME_FUNCTION5:
-        case ME_FUNCTION6:
-        case ME_FUNCTION7:
-        case ME_CLOSURE0:
-        case ME_CLOSURE1:
-        case ME_CLOSURE2:
-        case ME_CLOSURE3:
-        case ME_CLOSURE4:
-        case ME_CLOSURE5:
-        case ME_CLOSURE6:
-        case ME_CLOSURE7: {
+    case ME_FUNCTION0:
+    case ME_FUNCTION1:
+    case ME_FUNCTION2:
+    case ME_FUNCTION3:
+    case ME_FUNCTION4:
+    case ME_FUNCTION5:
+    case ME_FUNCTION6:
+    case ME_FUNCTION7:
+    case ME_CLOSURE0:
+    case ME_CLOSURE1:
+    case ME_CLOSURE2:
+    case ME_CLOSURE3:
+    case ME_CLOSURE4:
+    case ME_CLOSURE5:
+    case ME_CLOSURE6:
+    case ME_CLOSURE7:
+        {
             const int arity = ARITY(node->type);
             for (int i = 0; i < arity; i++) {
-                collect_variable_nodes((me_expr *) node->parameters[i], var_pointers, n_vars,
+                collect_variable_nodes((me_expr*)node->parameters[i], var_pointers, n_vars,
                                        var_nodes, var_indices, node_count);
             }
             break;
@@ -3469,37 +5755,38 @@ static void collect_variable_nodes(me_expr *node, const void **var_pointers, int
 }
 
 /* Helper to update variable bindings by matching original pointers */
-static void update_vars_by_pointer(me_expr *node, const void **old_pointers, const void **new_pointers, int n_vars) {
+static void update_vars_by_pointer(me_expr* node, const void** old_pointers, const void** new_pointers, int n_vars) {
     if (!node) return;
     switch (TYPE_MASK(node->type)) {
-        case ME_VARIABLE:
-            // Find which variable this is and update to new pointer
-            for (int i = 0; i < n_vars; i++) {
-                if (node->bound == old_pointers[i]) {
-                    node->bound = new_pointers[i];
-                    break;
-                }
+    case ME_VARIABLE:
+        // Find which variable this is and update to new pointer
+        for (int i = 0; i < n_vars; i++) {
+            if (node->bound == old_pointers[i]) {
+                node->bound = new_pointers[i];
+                break;
             }
-            break;
-        case ME_FUNCTION0:
-        case ME_FUNCTION1:
-        case ME_FUNCTION2:
-        case ME_FUNCTION3:
-        case ME_FUNCTION4:
-        case ME_FUNCTION5:
-        case ME_FUNCTION6:
-        case ME_FUNCTION7:
-        case ME_CLOSURE0:
-        case ME_CLOSURE1:
-        case ME_CLOSURE2:
-        case ME_CLOSURE3:
-        case ME_CLOSURE4:
-        case ME_CLOSURE5:
-        case ME_CLOSURE6:
-        case ME_CLOSURE7: {
+        }
+        break;
+    case ME_FUNCTION0:
+    case ME_FUNCTION1:
+    case ME_FUNCTION2:
+    case ME_FUNCTION3:
+    case ME_FUNCTION4:
+    case ME_FUNCTION5:
+    case ME_FUNCTION6:
+    case ME_FUNCTION7:
+    case ME_CLOSURE0:
+    case ME_CLOSURE1:
+    case ME_CLOSURE2:
+    case ME_CLOSURE3:
+    case ME_CLOSURE4:
+    case ME_CLOSURE5:
+    case ME_CLOSURE6:
+    case ME_CLOSURE7:
+        {
             const int arity = ARITY(node->type);
             for (int i = 0; i < arity; i++) {
-                update_vars_by_pointer((me_expr *) node->parameters[i], old_pointers, new_pointers, n_vars);
+                update_vars_by_pointer((me_expr*)node->parameters[i], old_pointers, new_pointers, n_vars);
             }
             break;
         }
@@ -3507,7 +5794,7 @@ static void update_vars_by_pointer(me_expr *node, const void **old_pointers, con
 }
 
 /* Helper to update variable bindings and nitems in tree */
-static void update_variable_bindings(me_expr *node, const void **new_bounds, int *var_idx, int new_nitems) {
+static void update_variable_bindings(me_expr* node, const void** new_bounds, int* var_idx, int new_nitems) {
     if (!node) return;
 
     // Update nitems for all nodes to handle intermediate buffers
@@ -3516,31 +5803,32 @@ static void update_variable_bindings(me_expr *node, const void **new_bounds, int
     }
 
     switch (TYPE_MASK(node->type)) {
-        case ME_VARIABLE:
-            if (new_bounds && *var_idx >= 0) {
-                node->bound = new_bounds[*var_idx];
-                (*var_idx)++;
-            }
-            break;
-        case ME_FUNCTION0:
-        case ME_FUNCTION1:
-        case ME_FUNCTION2:
-        case ME_FUNCTION3:
-        case ME_FUNCTION4:
-        case ME_FUNCTION5:
-        case ME_FUNCTION6:
-        case ME_FUNCTION7:
-        case ME_CLOSURE0:
-        case ME_CLOSURE1:
-        case ME_CLOSURE2:
-        case ME_CLOSURE3:
-        case ME_CLOSURE4:
-        case ME_CLOSURE5:
-        case ME_CLOSURE6:
-        case ME_CLOSURE7: {
+    case ME_VARIABLE:
+        if (new_bounds && *var_idx >= 0) {
+            node->bound = new_bounds[*var_idx];
+            (*var_idx)++;
+        }
+        break;
+    case ME_FUNCTION0:
+    case ME_FUNCTION1:
+    case ME_FUNCTION2:
+    case ME_FUNCTION3:
+    case ME_FUNCTION4:
+    case ME_FUNCTION5:
+    case ME_FUNCTION6:
+    case ME_FUNCTION7:
+    case ME_CLOSURE0:
+    case ME_CLOSURE1:
+    case ME_CLOSURE2:
+    case ME_CLOSURE3:
+    case ME_CLOSURE4:
+    case ME_CLOSURE5:
+    case ME_CLOSURE6:
+    case ME_CLOSURE7:
+        {
             const int arity = ARITY(node->type);
             for (int i = 0; i < arity; i++) {
-                update_variable_bindings((me_expr *) node->parameters[i], new_bounds, var_idx, new_nitems);
+                update_variable_bindings((me_expr*)node->parameters[i], new_bounds, var_idx, new_nitems);
             }
             break;
         }
@@ -3548,13 +5836,13 @@ static void update_variable_bindings(me_expr *node, const void **new_bounds, int
 }
 
 /* Evaluate compiled expression with new variable and output pointers */
-static me_expr *clone_expr(const me_expr *src) {
+static me_expr* clone_expr(const me_expr* src) {
     if (!src) return NULL;
 
     const int arity = ARITY(src->type);
-    const int psize = sizeof(void *) * arity;
-    const int size = (sizeof(me_expr) - sizeof(void *)) + psize + (IS_CLOSURE(src->type) ? sizeof(void *) : 0);
-    me_expr *clone = malloc(size);
+    const int psize = sizeof(void*) * arity;
+    const int size = (sizeof(me_expr) - sizeof(void*)) + psize + (IS_CLOSURE(src->type) ? sizeof(void*) : 0);
+    me_expr* clone = malloc(size);
     if (!clone) return NULL;
 
     // Copy the entire structure
@@ -3563,11 +5851,11 @@ static me_expr *clone_expr(const me_expr *src) {
     // Clone children recursively
     if (arity > 0) {
         for (int i = 0; i < arity; i++) {
-            clone->parameters[i] = clone_expr((const me_expr *) src->parameters[i]);
+            clone->parameters[i] = clone_expr((const me_expr*)src->parameters[i]);
             if (src->parameters[i] && !clone->parameters[i]) {
                 // Clone failed, clean up
                 for (int j = 0; j < i; j++) {
-                    me_free((me_expr *) clone->parameters[j]);
+                    me_free((me_expr*)clone->parameters[j]);
                 }
                 free(clone);
                 return NULL;
@@ -3588,30 +5876,31 @@ static me_expr *clone_expr(const me_expr *src) {
  * This function is safe to call from multiple threads simultaneously,
  * even on the same expression object. Each call creates a temporary
  * clone of the expression tree to avoid race conditions. */
-void me_eval(const me_expr *expr, const void **vars_chunk,
-             int n_vars, void *output_chunk, int chunk_nitems) {
-    if (!expr) return;
+int me_eval(const me_expr* expr, const void** vars_chunk,
+            int n_vars, void* output_chunk, int chunk_nitems) {
+    if (!expr) return ME_EVAL_ERR_NULL_EXPR;
 
     // Verify variable count matches
-    const void *original_var_pointers[ME_MAX_VARS];
+    const void* original_var_pointers[ME_MAX_VARS];
     size_t var_sizes[ME_MAX_VARS];
     int actual_var_count = 0;
     save_variable_metadata(expr, original_var_pointers, var_sizes, &actual_var_count);
     if (actual_var_count > ME_MAX_VARS) {
         fprintf(stderr, "Error: Expression uses %d variables, exceeds ME_MAX_VARS=%d\n",
                 actual_var_count, ME_MAX_VARS);
-        return;
+        return ME_EVAL_ERR_TOO_MANY_VARS;
     }
 
     if (actual_var_count != n_vars) {
-        return;
+        return ME_EVAL_ERR_VAR_MISMATCH;
     }
 
     // Clone the expression tree
-    me_expr *clone = clone_expr(expr);
-    if (!clone) return;
+    me_expr* clone = clone_expr(expr);
+    if (!clone) return ME_EVAL_ERR_OOM;
 
     const int block_nitems = ME_EVAL_BLOCK_NITEMS;
+    int status = ME_EVAL_SUCCESS;
 
     if (!ME_EVAL_ENABLE_BLOCKING || chunk_nitems <= block_nitems) {
         // Update clone's variable bindings
@@ -3626,11 +5915,22 @@ void me_eval(const me_expr *expr, const void **vars_chunk,
 
         // Evaluate the clone
         private_eval(clone);
-    } else {
+    }
+    else if (is_reduction_node(clone)) {
+        // Reductions operate on the full chunk; avoid block processing.
+        update_vars_by_pointer(clone, original_var_pointers, vars_chunk, n_vars);
+
+        int update_idx = 0;
+        update_variable_bindings(clone, NULL, &update_idx, chunk_nitems);
+
+        clone->output = output_chunk;
+        private_eval(clone);
+    }
+    else {
         const size_t output_item_size = dtype_size(clone->dtype);
         const int max_var_nodes = count_variable_nodes(clone);
-        me_expr **var_nodes = NULL;
-        int *var_indices = NULL;
+        me_expr** var_nodes = NULL;
+        int* var_indices = NULL;
         int var_node_count = 0;
 
         if (max_var_nodes > 0) {
@@ -3639,8 +5939,8 @@ void me_eval(const me_expr *expr, const void **vars_chunk,
             if (!var_nodes || !var_indices) {
                 free(var_nodes);
                 free(var_indices);
-                me_free(clone);
-                return;
+                status = ME_EVAL_ERR_OOM;
+                goto cleanup;
             }
             collect_variable_nodes(clone, original_var_pointers, n_vars,
                                    var_nodes, var_indices, &var_node_count);
@@ -3657,9 +5957,9 @@ void me_eval(const me_expr *expr, const void **vars_chunk,
                 current = chunk_nitems - offset;
             }
 
-            const void *block_vars[ME_MAX_VARS];
+            const void* block_vars[ME_MAX_VARS];
             for (int i = 0; i < n_vars; i++) {
-                const unsigned char *base = (const unsigned char *)vars_chunk[i];
+                const unsigned char* base = (const unsigned char*)vars_chunk[i];
                 block_vars[i] = base + (size_t)offset * var_sizes[i];
             }
 
@@ -3670,7 +5970,7 @@ void me_eval(const me_expr *expr, const void **vars_chunk,
             int update_idx = 0;
             update_variable_bindings(clone, NULL, &update_idx, current);
 
-            clone->output = (unsigned char *)output_chunk + (size_t)offset * output_item_size;
+            clone->output = (unsigned char*)output_chunk + (size_t)offset * output_item_size;
             private_eval(clone);
         }
 
@@ -3678,12 +5978,14 @@ void me_eval(const me_expr *expr, const void **vars_chunk,
         free(var_indices);
     }
 
+cleanup:
     // Free the clone (including any intermediate buffers it allocated)
     me_free(clone);
+    return status;
 }
 
 
-static void optimize(me_expr *n) {
+static void optimize(me_expr* n) {
     /* Evaluates as much as possible. */
     if (!n) return;
     if (n->type == ME_CONSTANT) return;
@@ -3696,7 +5998,7 @@ static void optimize(me_expr *n) {
         int i;
         for (i = 0; i < arity; ++i) {
             optimize(n->parameters[i]);
-            if (((me_expr *) (n->parameters[i]))->type != ME_CONSTANT) {
+            if (((me_expr*)(n->parameters[i]))->type != ME_CONSTANT) {
                 known = 0;
             }
         }
@@ -3709,9 +6011,37 @@ static void optimize(me_expr *n) {
     }
 }
 
+#if defined(_WIN32) || defined(_WIN64)
+static bool has_complex_node(const me_expr* n) {
+    if (!n) return false;
+    if (n->dtype == ME_COMPLEX64 || n->dtype == ME_COMPLEX128) return true;
+    const int arity = ARITY(n->type);
+    for (int i = 0; i < arity; i++) {
+        if (has_complex_node((const me_expr*)n->parameters[i])) return true;
+    }
+    return false;
+}
+
+static bool has_complex_input(const me_expr* n) {
+    if (!n) return false;
+    if (n->input_dtype == ME_COMPLEX64 || n->input_dtype == ME_COMPLEX128) return true;
+    const int arity = ARITY(n->type);
+    for (int i = 0; i < arity; i++) {
+        if (has_complex_input((const me_expr*)n->parameters[i])) return true;
+    }
+    return false;
+}
+#endif
+
+
+static int private_compile(const char* expression, const me_variable* variables, int var_count,
+                           void* output, int nitems, me_dtype dtype, int* error, me_expr** out) {
+    if (out) *out = NULL;
+    if (!expression || !out || var_count < 0) {
+        if (error) *error = -1;
+        return ME_COMPILE_ERR_INVALID_ARG;
+    }
 
-static me_expr *private_compile(const char *expression, const me_variable *variables, int var_count,
-                                void *output, int nitems, me_dtype dtype, int *error) {
     // Validate dtype usage: either all vars are ME_AUTO (use dtype), or dtype is ME_AUTO (use var dtypes)
     if (variables && var_count > 0) {
         int auto_count = 0;
@@ -3720,7 +6050,8 @@ static me_expr *private_compile(const char *expression, const me_variable *varia
         for (int i = 0; i < var_count; i++) {
             if (variables[i].dtype == ME_AUTO) {
                 auto_count++;
-            } else {
+            }
+            else {
                 specified_count++;
             }
         }
@@ -3733,27 +6064,28 @@ static me_expr *private_compile(const char *expression, const me_variable *varia
                     stderr,
                     "Error: When output dtype is ME_AUTO, all variable dtypes must be specified (not ME_AUTO)\n");
                 if (error) *error = -1;
-                return NULL;
+                return ME_COMPILE_ERR_VAR_UNSPECIFIED;
             }
-        } else {
+        }
+        else {
             // Mode 2: Output dtype is specified
             // Two sub-modes: all ME_AUTO (homogeneous), or all explicit (heterogeneous with conversion)
             if (auto_count > 0 && specified_count > 0) {
                 // Mixed mode not allowed
                 fprintf(stderr, "Error: Variable dtypes must be all ME_AUTO or all explicitly specified\n");
                 if (error) *error = -1;
-                return NULL;
+                return ME_COMPILE_ERR_VAR_MIXED;
             }
         }
     }
 
     // Create a copy of variables with dtype filled in (if not already set)
-    me_variable *vars_copy = NULL;
+    me_variable* vars_copy = NULL;
     if (variables && var_count > 0) {
         vars_copy = malloc(var_count * sizeof(me_variable));
         if (!vars_copy) {
             if (error) *error = -1;
-            return NULL;
+            return ME_COMPILE_ERR_OOM;
         }
         for (int i = 0; i < var_count; i++) {
             vars_copy[i] = variables[i];
@@ -3772,32 +6104,66 @@ static me_expr *private_compile(const char *expression, const me_variable *varia
     // When dtype is ME_AUTO, infer target dtype from variables to avoid type mismatch
     if (dtype != ME_AUTO) {
         s.target_dtype = dtype;
-    } else if (variables && var_count > 0) {
+    }
+    else if (variables && var_count > 0) {
         // Use the first variable's dtype as the target for constants
         // This prevents type promotion issues when mixing float32 vars with float64 constants
         s.target_dtype = variables[0].dtype;
-    } else {
+    }
+    else {
         s.target_dtype = ME_AUTO;
     }
 
     next_token(&s);
-    me_expr *root = list(&s);
-
-    if (vars_copy) free(vars_copy);
+    me_expr* root = list(&s);
 
     if (root == NULL) {
         if (error) *error = -1;
-        return NULL;
+        if (vars_copy) free(vars_copy);
+        return ME_COMPILE_ERR_OOM;
     }
 
+    if (contains_reduction(root) && !reduction_usage_is_valid(root)) {
+        me_free(root);
+        if (error) *error = -1;
+        if (vars_copy) free(vars_copy);
+        return ME_COMPILE_ERR_REDUCTION_INVALID;
+    }
+
+#if defined(_WIN32) || defined(_WIN64)
+    {
+        const me_variable* vars_check = vars_copy ? vars_copy : variables;
+        bool complex_vars = false;
+        if (vars_check) {
+            for (int i = 0; i < var_count; i++) {
+                if (vars_check[i].dtype == ME_COMPLEX64 || vars_check[i].dtype == ME_COMPLEX128) {
+                    complex_vars = true;
+                    break;
+                }
+            }
+        }
+        if (complex_vars ||
+            dtype == ME_COMPLEX64 || dtype == ME_COMPLEX128 ||
+            has_complex_node(root) || has_complex_input(root)) {
+            fprintf(stderr, "Error: Complex expressions are not supported on Windows (no C99 complex ABI)\n");
+            me_free(root);
+            if (error) *error = -1;
+            if (vars_copy) free(vars_copy);
+            return ME_COMPILE_ERR_COMPLEX_UNSUPPORTED;
+        }
+    }
+#endif
+
     if (s.type != TOK_END) {
         me_free(root);
         if (error) {
             *error = (s.next - s.start);
             if (*error == 0) *error = 1;
         }
-        return 0;
-    } else {
+        if (vars_copy) free(vars_copy);
+        return ME_COMPILE_ERR_PARSE;
+    }
+    else {
         optimize(root);
         root->output = output;
         root->nitems = nitems;
@@ -3805,24 +6171,33 @@ static me_expr *private_compile(const char *expression, const me_variable *varia
         // If dtype is ME_AUTO, infer from expression; otherwise use provided dtype
         if (dtype == ME_AUTO) {
             root->dtype = infer_output_type(root);
-        } else {
+        }
+        else {
             // User explicitly requested a dtype - use it (will cast if needed)
             root->dtype = dtype;
         }
 
         if (error) *error = 0;
-        return root;
+        if (vars_copy) free(vars_copy);
+        *out = root;
+        return ME_COMPILE_SUCCESS;
     }
 }
 
 // Synthetic addresses for ordinal matching (when user provides NULL addresses)
 static char synthetic_var_addresses[ME_MAX_VARS];
 
-me_expr *me_compile(const char *expression, const me_variable *variables,
-                    int var_count, me_dtype dtype, int *error) {
+int me_compile(const char* expression, const me_variable* variables,
+               int var_count, me_dtype dtype, int* error, me_expr** out) {
+    if (out) *out = NULL;
+    if (!out) {
+        if (error) *error = -1;
+        return ME_COMPILE_ERR_INVALID_ARG;
+    }
+
     // For chunked evaluation, we compile without specific output/nitems
     // If variables have NULL addresses, assign synthetic unique addresses for ordinal matching
-    me_variable *vars_copy = NULL;
+    me_variable* vars_copy = NULL;
     int needs_synthetic = 0;
 
     if (variables && var_count > 0) {
@@ -3839,7 +6214,7 @@ me_expr *me_compile(const char *expression, const me_variable *variables,
             vars_copy = malloc(var_count * sizeof(me_variable));
             if (!vars_copy) {
                 if (error) *error = -1;
-                return NULL;
+                return ME_COMPILE_ERR_OOM;
             }
 
             for (int i = 0; i < var_count; i++) {
@@ -3850,17 +6225,17 @@ me_expr *me_compile(const char *expression, const me_variable *variables,
                 }
             }
 
-            me_expr *result = private_compile(expression, vars_copy, var_count, NULL, 0, dtype, error);
+            int status = private_compile(expression, vars_copy, var_count, NULL, 0, dtype, error, out);
             free(vars_copy);
-            return result;
+            return status;
         }
     }
 
     // No NULL addresses, use variables as-is
-    return private_compile(expression, variables, var_count, NULL, 0, dtype, error);
+    return private_compile(expression, variables, var_count, NULL, 0, dtype, error, out);
 }
 
-static void pn(const me_expr *n, int depth) {
+static void pn(const me_expr* n, int depth) {
     int i, arity;
     printf("%*s", depth, "");
 
@@ -3870,44 +6245,44 @@ static void pn(const me_expr *n, int depth) {
     }
 
     switch (TYPE_MASK(n->type)) {
-        case ME_CONSTANT: printf("%f\n", n->value);
-            break;
-        case ME_VARIABLE: printf("bound %p\n", n->bound);
-            break;
-
-        case ME_FUNCTION0:
-        case ME_FUNCTION1:
-        case ME_FUNCTION2:
-        case ME_FUNCTION3:
-        case ME_FUNCTION4:
-        case ME_FUNCTION5:
-        case ME_FUNCTION6:
-        case ME_FUNCTION7:
-        case ME_CLOSURE0:
-        case ME_CLOSURE1:
-        case ME_CLOSURE2:
-        case ME_CLOSURE3:
-        case ME_CLOSURE4:
-        case ME_CLOSURE5:
-        case ME_CLOSURE6:
-        case ME_CLOSURE7:
-            arity = ARITY(n->type);
-            printf("f%d", arity);
-            for (i = 0; i < arity; i++) {
-                printf(" %p", n->parameters[i]);
-            }
-            printf("\n");
-            for (i = 0; i < arity; i++) {
-                pn(n->parameters[i], depth + 1);
-            }
-            break;
+    case ME_CONSTANT: printf("%f\n", n->value);
+        break;
+    case ME_VARIABLE: printf("bound %p\n", n->bound);
+        break;
+
+    case ME_FUNCTION0:
+    case ME_FUNCTION1:
+    case ME_FUNCTION2:
+    case ME_FUNCTION3:
+    case ME_FUNCTION4:
+    case ME_FUNCTION5:
+    case ME_FUNCTION6:
+    case ME_FUNCTION7:
+    case ME_CLOSURE0:
+    case ME_CLOSURE1:
+    case ME_CLOSURE2:
+    case ME_CLOSURE3:
+    case ME_CLOSURE4:
+    case ME_CLOSURE5:
+    case ME_CLOSURE6:
+    case ME_CLOSURE7:
+        arity = ARITY(n->type);
+        printf("f%d", arity);
+        for (i = 0; i < arity; i++) {
+            printf(" %p", n->parameters[i]);
+        }
+        printf("\n");
+        for (i = 0; i < arity; i++) {
+            pn(n->parameters[i], depth + 1);
+        }
+        break;
     }
 }
 
-void me_print(const me_expr *n) {
+void me_print(const me_expr* n) {
     pn(n, 0);
 }
 
-me_dtype me_get_dtype(const me_expr *expr) {
+me_dtype me_get_dtype(const me_expr* expr) {
     return expr ? expr->dtype : ME_AUTO;
 }
diff --git a/src/blosc2/miniexpr.h b/src/blosc2/miniexpr.h
index 57fc7b65..e778e579 100644
--- a/src/blosc2/miniexpr.h
+++ b/src/blosc2/miniexpr.h
@@ -46,7 +46,7 @@ extern "C" {
 
 /* Internal eval block size (elements). Compile-time fixed. */
 #ifndef ME_EVAL_BLOCK_NITEMS
-#define ME_EVAL_BLOCK_NITEMS 1024
+#define ME_EVAL_BLOCK_NITEMS 4096
 #endif
 
 /* Maximum number of variables supported in a single expression. */
@@ -135,29 +135,54 @@ typedef struct me_variable {
  *          - ME_AUTO: All variables must specify their dtypes, output is inferred
  *          - Specific type: Either all variables are ME_AUTO (homogeneous, all use this type),
  *            OR all variables have explicit dtypes (heterogeneous, result cast to this type)
- *   error: Optional pointer to receive error position (0 on success, >0 on error)
+ *   error: Optional pointer to receive error position (0 on success, >0 on parse error)
+ *   out: Output pointer to receive the compiled expression
  *
- * Returns: Compiled expression ready for chunked evaluation, or NULL on error
+ * Returns: ME_COMPILE_SUCCESS (0) on success, or a negative ME_COMPILE_ERR_* code on failure
  *
  * Example 1 (simple - all same type):
  *   me_variable vars[] = {{"x"}, {"y"}};  // Both ME_AUTO
- *   me_expr *expr = me_compile("x + y", vars, 2, ME_FLOAT64, &err);
+ *   me_expr *expr = NULL;
+ *   if (me_compile("x + y", vars, 2, ME_FLOAT64, &err, &expr) != ME_COMPILE_SUCCESS) { return; }
  *
  * Example 2 (mixed types with ME_AUTO):
  *   me_variable vars[] = {{"x", ME_INT32}, {"y", ME_FLOAT64}};
- *   me_expr *expr = me_compile("x + y", vars, 2, ME_AUTO, &err);
+ *   me_expr *expr = NULL;
+ *   if (me_compile("x + y", vars, 2, ME_AUTO, &err, &expr) != ME_COMPILE_SUCCESS) { return; }
  *
  * Example 3 (mixed types with explicit output):
  *   me_variable vars[] = {{"x", ME_INT32}, {"y", ME_FLOAT64}};
- *   me_expr *expr = me_compile("x + y", vars, 2, ME_FLOAT32, &err);
+ *   me_expr *expr = NULL;
+ *   if (me_compile("x + y", vars, 2, ME_FLOAT32, &err, &expr) != ME_COMPILE_SUCCESS) { return; }
  *   // Variables keep their types, result is cast to FLOAT32
  *
  *   // Later, provide data in same order as variable definitions
  *   const void *data[] = {x_array, y_array};  // x first, y second
- *   me_eval(expr, data, 2, output, nitems);
+ *   if (me_eval(expr, data, 2, output, nitems) != ME_EVAL_SUCCESS) { return; }
  */
-me_expr *me_compile(const char *expression, const me_variable *variables,
-                    int var_count, me_dtype dtype, int *error);
+int me_compile(const char *expression, const me_variable *variables,
+               int var_count, me_dtype dtype, int *error, me_expr **out);
+
+/* Status codes for me_compile(). */
+typedef enum {
+    ME_COMPILE_SUCCESS = 0,
+    ME_COMPILE_ERR_OOM = -1,
+    ME_COMPILE_ERR_PARSE = -2,
+    ME_COMPILE_ERR_INVALID_ARG = -3,
+    ME_COMPILE_ERR_COMPLEX_UNSUPPORTED = -4,
+    ME_COMPILE_ERR_REDUCTION_INVALID = -5,
+    ME_COMPILE_ERR_VAR_MIXED = -6,
+    ME_COMPILE_ERR_VAR_UNSPECIFIED = -7
+} me_compile_status;
+
+/* Status codes for me_eval(). */
+typedef enum {
+    ME_EVAL_SUCCESS = 0,
+    ME_EVAL_ERR_OOM = -1,
+    ME_EVAL_ERR_NULL_EXPR = -2,
+    ME_EVAL_ERR_TOO_MANY_VARS = -3,
+    ME_EVAL_ERR_VAR_MISMATCH = -4
+} me_eval_status;
 
 /* Evaluates compiled expression with variable and output pointers.
  * This function can be safely called from multiple threads simultaneously on the
@@ -171,11 +196,14 @@ me_expr *me_compile(const char *expression, const me_variable *variables,
  *   output_chunk: Pointer to output buffer for this chunk
  *   chunk_nitems: Number of elements in this chunk
  *
+ * Returns:
+ *   ME_EVAL_SUCCESS (0) on success, or a negative ME_EVAL_ERR_* code on failure.
+ *
  * Use this function for both serial and parallel evaluation. It is thread-safe
  * and can be used from multiple threads to process different chunks simultaneously.
  */
-void me_eval(const me_expr *expr, const void **vars_chunk,
-             int n_vars, void *output_chunk, int chunk_nitems);
+int me_eval(const me_expr *expr, const void **vars_chunk,
+            int n_vars, void *output_chunk, int chunk_nitems);
 
 /* Prints the expression tree for debugging purposes. */
 void me_print(const me_expr *n);

From 12f0a29fe846ef11338a7265bc3f2a12e3593b5f Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 2 Jan 2026 14:01:39 +0100
Subject: [PATCH 060/123] Testing multithreaded reductions

---
 src/blosc2/blosc2_ext.pyx |  40 +++++----
 src/blosc2/lazyexpr.py    |  69 ++++++++++++++-
 src/blosc2/miniexpr.c     | 175 +++++---------------------------------
 src/blosc2/ndarray.py     |   2 +-
 4 files changed, 113 insertions(+), 173 deletions(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 0eb684e8..40e2197e 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -29,7 +29,7 @@ from cython.operator cimport dereference
 from libc.stdint cimport uintptr_t
 from libc.stdlib cimport free, malloc, realloc, calloc
 from libc.stdlib cimport abs as c_abs
-from libc.string cimport memcpy, strcpy, strdup, strlen
+from libc.string cimport memcpy, memset, strcpy, strdup, strlen
 from libcpp cimport bool as c_bool
 
 from enum import Enum
@@ -54,6 +54,8 @@ cdef extern from "<stdint.h>":
     ctypedef unsigned int   uint32_t
     ctypedef unsigned long long uint64_t
 
+cdef extern from "<stdio.h>":
+    int printf(const char *format, ...) nogil
 
 cdef extern from "blosc2.h":
 
@@ -605,6 +607,7 @@ ctypedef struct me_udata:
     b2nd_array_t** inputs
     int ninputs
     b2nd_array_t *array
+    void* aux_reduc_ptr
     int64_t chunks_in_array[B2ND_MAX_DIM]
     int64_t blocks_in_chunk[B2ND_MAX_DIM]
     me_expr* miniexpr_handle
@@ -1840,14 +1843,6 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
     cdef int64_t stop_ndim[B2ND_MAX_DIM]
     cdef int64_t buffershape[B2ND_MAX_DIM]
 
-    # Get the right slice for each operand
-    blosc2_unidim_to_multidim(udata.array.ndim, udata.chunks_in_array, nchunk, chunk_ndim)
-    blosc2_unidim_to_multidim(udata.array.ndim, udata.blocks_in_chunk, nblock, block_ndim)
-    for i in range(udata.array.ndim):
-        start_ndim[i] = chunk_ndim[i] * udata.array.chunkshape[i] + block_ndim[i] * udata.array.blockshape[i]
-        stop_ndim[i] = start_ndim[i] + udata.array.blockshape[i]
-        buffershape[i] = udata.array.blockshape[i]
-
     cdef b2nd_array_t* ndarr
     cdef int rc
     cdef void** input_buffers = <void**> malloc(udata.ninputs * sizeof(uint8_t*))
@@ -1885,11 +1880,21 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
                 raise ValueError("miniexpr: error decompressing the chunk")
 
     cdef me_expr* miniexpr_handle = udata.miniexpr_handle
+    cdef void* aux_reduc_ptr
+    cdef uintptr_t offset_bytes
+    cdef int nblocks_per_chunk = udata.array.chunknitems // udata.array.blocknitems
     if miniexpr_handle == NULL:
         raise ValueError("miniexpr: handle not assigned")
     # Call thread-safe miniexpr C API
-    rc = me_eval(miniexpr_handle, <const void**>input_buffers, udata.ninputs,
-                 <void*>params_output, ndarr.blocknitems)
+    if udata.aux_reduc_ptr == NULL:
+        rc = me_eval(miniexpr_handle, <const void**>input_buffers, udata.ninputs,
+                     <void*>params_output, ndarr.blocknitems)
+    else:
+        # Reduction operation
+        offset_bytes = <uintptr_t> typesize * (nchunk * nblocks_per_chunk + nblock)
+        aux_reduc_ptr = <void *> (<uintptr_t> udata.aux_reduc_ptr + offset_bytes)
+        rc = me_eval(miniexpr_handle, <const void**>input_buffers, udata.ninputs, aux_reduc_ptr, ndarr.blocknitems)
+        memset(<char *>params_output, 0, udata.array.sc.blocksize)  # clear output buffer
     if rc != 0:
         raise RuntimeError(f"miniexpr: issues during evaluation; error code: {rc}")
 
@@ -2790,7 +2795,7 @@ cdef class NDArray:
 
         return udata
 
-    cdef me_udata *_fill_me_udata(self, inputs):
+    cdef me_udata *_fill_me_udata(self, inputs, aux_reduc):
         cdef me_udata *udata = <me_udata *> malloc(sizeof(me_udata))
         operands = list(inputs.values())
         ninputs = len(operands)
@@ -2800,6 +2805,12 @@ cdef class NDArray:
         udata.inputs = inputs_
         udata.ninputs = ninputs
         udata.array = self.array
+        cdef void* aux_reduc_ptr = NULL
+        if aux_reduc is not None:
+            if not isinstance(aux_reduc, np.ndarray):
+                raise TypeError("aux_reduc must be a NumPy array")
+            aux_reduc_ptr = <void *> np.PyArray_DATA(<np.ndarray> aux_reduc)
+        udata.aux_reduc_ptr = aux_reduc_ptr
         # Save these in udf_udata to avoid computing them for each block
         for i in range(self.array.ndim):
             udata.chunks_in_array[i] = udata.array.extshape[i] // udata.array.chunkshape[i]
@@ -2807,13 +2818,12 @@ cdef class NDArray:
 
         return udata
 
-    def _set_pref_expr(self, expression, inputs):
+    def _set_pref_expr(self, expression, inputs, aux_reduc=None):
         # Set prefilter for miniexpr
         cdef blosc2_cparams* cparams = self.array.sc.storage.cparams
         cparams.prefilter = <blosc2_prefilter_fn> miniexpr_prefilter
 
-        # cdef udf_udata* udata = self._fill_udf_udata(func_id, inputs)
-        cdef me_udata* udata = self._fill_me_udata(inputs)
+        cdef me_udata* udata = self._fill_me_udata(inputs, aux_reduc)
 
         # Get the compiled expression handle for multi-threading
         cdef Py_ssize_t n = len(inputs)
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 3ee73351..11baa3b9 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1297,7 +1297,7 @@ def fast_eval(  # noqa: C901
                 break
 
     if sys.platform == "win32":
-        # Miniexpr has issues on Windows; still investigating
+        # Miniexpr has issues on Windows, but only with complex types; still investigating
         use_miniexpr = False
 
     if use_miniexpr:
@@ -1875,6 +1875,11 @@ def reduce_slices(  # noqa: C901
     :ref:`NDArray` or np.ndarray
         The resulting output array.
     """
+    global try_miniexpr
+
+    # Use a local copy so we don't modify the global
+    use_miniexpr = try_miniexpr  # & False
+
     out = kwargs.pop("_output", None)
     res_out_ = None  # temporary required to store max/min for argmax/argmin
     ne_args: dict = kwargs.pop("_ne_args", {})
@@ -1882,6 +1887,7 @@ def reduce_slices(  # noqa: C901
         ne_args = {}
     where: dict | None = kwargs.pop("_where_args", None)
     reduce_op = reduce_args.pop("op")
+    reduce_op_str = reduce_args.pop("op_str", None)
     axis = reduce_args["axis"]
     keepdims = reduce_args["keepdims"]
     dtype = reduce_args.get("dtype", None)
@@ -1928,7 +1934,9 @@ def reduce_slices(  # noqa: C901
     # Note: we could have expr = blosc2.lazyexpr('numpy_array + 1') (i.e. no choice for chunks)
     blosc2_arrs = tuple(o for o in operands.values() if hasattr(o, "chunks"))
     fast_path = False
+    all_ndarray = False
     chunks = None
+    blocks = None
     if blosc2_arrs:  # fast path only relevant if there are blosc2 arrays
         operand = max(blosc2_arrs, key=lambda x: len(x.shape))
 
@@ -1941,6 +1949,7 @@ def reduce_slices(  # noqa: C901
         aligned, iter_disk = dict.fromkeys(operands.keys(), False), False
         if fast_path:
             chunks = operand.chunks
+            blocks = operand.blocks
             # Check that all operands are NDArray for fast path
             all_ndarray = all(
                 isinstance(value, blosc2.NDArray) and value.shape != () for value in operands.values()
@@ -1974,6 +1983,58 @@ def reduce_slices(  # noqa: C901
         chunks = temp.chunks
         del temp
 
+    if (where is None and fast_path and all_ndarray and expression == "o0") or expression == "(o0)":
+        # Only this case is supported so far
+        if use_miniexpr:
+            for op in operands.values():
+                # Only NDArray in-memory operands
+                if not (isinstance(op, blosc2.NDArray) and op.urlpath is None and out is None):
+                    use_miniexpr = False
+                    break
+                # Check that partitions are well-behaved (no padding)
+                if not blosc2.are_partitions_behaved(op.shape, op.chunks, op.blocks):
+                    use_miniexpr = False
+                    break
+
+        if use_miniexpr:
+            cparams = kwargs.pop("cparams", blosc2.CParams())
+            # Use the same chunks/blocks as the input operands for consistency
+            res_eval = blosc2.empty(shape, dtype, chunks=chunks, blocks=blocks, cparams=cparams, **kwargs)
+            # Compute the number of blocks in the result
+            nblocks = res_eval.nbytes // res_eval.blocksize
+            print("nblocks:", nblocks, dtype)
+            aux_reduc = np.empty(nblocks, dtype=dtype)
+            try:
+                print("expr->miniexpr:", expression, reduce_op)
+                if reduce_op_str is None:
+                    use_miniexpr = False
+                expression = f"{reduce_op_str}({expression})"
+                res_eval._set_pref_expr(expression, operands, aux_reduc)
+                # This line would NOT allocate physical RAM on any modern OS:
+                aux = np.empty(res_eval.shape, res_eval.dtype)
+                # Physical allocation happens here (when writing):
+                res_eval[...] = aux
+            except Exception:
+                use_miniexpr = False
+            finally:
+                res_eval.schunk.remove_prefilter("miniexpr")
+                global iter_chunks
+                # Ensure any background reading thread is closed
+                iter_chunks = None
+
+            if not use_miniexpr:
+                # If miniexpr failed, fallback to regular evaluation
+                # (continue to the manual chunked evaluation below)
+                pass
+            else:
+                from time import time
+
+                t0 = time()
+                result = reduce_op.value.reduce(aux_reduc, **reduce_args)
+                t = time() - t0
+                print(f"reduction of aux_reduc took {t * 1e6:.6f} us")
+                return result
+
     # Iterate over the operands and get the chunks
     chunk_operands = {}
     # Check which chunks intersect with _slice
@@ -2754,6 +2815,7 @@ def where(self, value1=None, value2=None):
     def sum(self, axis=None, dtype=None, keepdims=False, **kwargs):
         reduce_args = {
             "op": ReduceOp.SUM,
+            "op_str": "sum",
             "axis": axis,
             "dtype": dtype,
             "keepdims": keepdims,
@@ -2763,6 +2825,7 @@ def sum(self, axis=None, dtype=None, keepdims=False, **kwargs):
     def prod(self, axis=None, dtype=None, keepdims=False, **kwargs):
         reduce_args = {
             "op": ReduceOp.PROD,
+            "op_str": "prod",
             "axis": axis,
             "dtype": dtype,
             "keepdims": keepdims,
@@ -2853,6 +2916,7 @@ def var(self, axis=None, dtype=None, keepdims=False, ddof=0, **kwargs):
     def min(self, axis=None, keepdims=False, **kwargs):
         reduce_args = {
             "op": ReduceOp.MIN,
+            "op_str": "min",
             "axis": axis,
             "keepdims": keepdims,
         }
@@ -2861,6 +2925,7 @@ def min(self, axis=None, keepdims=False, **kwargs):
     def max(self, axis=None, keepdims=False, **kwargs):
         reduce_args = {
             "op": ReduceOp.MAX,
+            "op_str": "max",
             "axis": axis,
             "keepdims": keepdims,
         }
@@ -2869,6 +2934,7 @@ def max(self, axis=None, keepdims=False, **kwargs):
     def any(self, axis=None, keepdims=False, **kwargs):
         reduce_args = {
             "op": ReduceOp.ANY,
+            "op_str": "any",
             "axis": axis,
             "keepdims": keepdims,
         }
@@ -2877,6 +2943,7 @@ def any(self, axis=None, keepdims=False, **kwargs):
     def all(self, axis=None, keepdims=False, **kwargs):
         reduce_args = {
             "op": ReduceOp.ALL,
+            "op_str": "all",
             "axis": axis,
             "keepdims": keepdims,
         }
diff --git a/src/blosc2/miniexpr.c b/src/blosc2/miniexpr.c
index 8890daba..169c736b 100755
--- a/src/blosc2/miniexpr.c
+++ b/src/blosc2/miniexpr.c
@@ -2103,85 +2103,15 @@ static uint64_t reduce_max_uint64(const uint64_t* data, int nitems) {
     return acc;
 }
 
-static float reduce_prod_float32_nan_safe(const float* data, int nitems) {
-    if (nitems <= 0) return 1.0f;
-#if defined(__AVX__) || defined(__AVX2__)
-    int i = 0;
-    __m256 vprod = _mm256_set1_ps(1.0f);
-    __m256 vnan = _mm256_setzero_ps();
-    const int limit = nitems & ~7;
-    for (; i < limit; i += 8) {
-        __m256 v = _mm256_loadu_ps(data + i);
-        vnan = _mm256_or_ps(vnan, _mm256_cmp_ps(v, v, _CMP_UNORD_Q));
-        vprod = _mm256_mul_ps(vprod, v);
-    }
-    __m128 low = _mm256_castps256_ps128(vprod);
-    __m128 high = _mm256_extractf128_ps(vprod, 1);
-    __m128 prod128 = _mm_mul_ps(low, high);
-    __m128 tmp = _mm_mul_ps(prod128, _mm_movehl_ps(prod128, prod128));
-    tmp = _mm_mul_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1));
-    float acc = _mm_cvtss_f32(tmp);
-    if (_mm256_movemask_ps(vnan)) return NAN;
-    for (; i < nitems; i++) {
-        float v = data[i];
-        acc *= v;
-        if (v != v) return v;
-    }
-    return acc;
-#elif defined(__SSE__)
-    int i = 0;
-    __m128 vprod = _mm_set1_ps(1.0f);
-    __m128 vnan = _mm_setzero_ps();
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        __m128 v = _mm_loadu_ps(data + i);
-        vnan = _mm_or_ps(vnan, _mm_cmpunord_ps(v, v));
-        vprod = _mm_mul_ps(vprod, v);
-    }
-    __m128 tmp = _mm_mul_ps(vprod, _mm_movehl_ps(vprod, vprod));
-    tmp = _mm_mul_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1));
-    float acc = _mm_cvtss_f32(tmp);
-    if (_mm_movemask_ps(vnan)) return NAN;
-    for (; i < nitems; i++) {
-        float v = data[i];
-        acc *= v;
-        if (v != v) return v;
-    }
-    return acc;
-#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
-    int i = 0;
-    float32x4_t vprod = vdupq_n_f32(1.0f);
-    uint32x4_t vnan = vdupq_n_u32(0);
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        float32x4_t v = vld1q_f32(data + i);
-        uint32x4_t eq = vceqq_f32(v, v);
-        vnan = vorrq_u32(vnan, vmvnq_u32(eq));
-        vprod = vmulq_f32(vprod, v);
-    }
-    float acc =
-        vgetq_lane_f32(vprod, 0) *
-        vgetq_lane_f32(vprod, 1) *
-        vgetq_lane_f32(vprod, 2) *
-        vgetq_lane_f32(vprod, 3);
-    uint32x2_t nan2 = vorr_u32(vget_low_u32(vnan), vget_high_u32(vnan));
-    nan2 = vpadd_u32(nan2, nan2);
-    if (vget_lane_u32(nan2, 0)) return NAN;
-    for (; i < nitems; i++) {
-        float v = data[i];
-        acc *= v;
-        if (v != v) return v;
-    }
-    return acc;
-#else
-    float acc = 1.0f;
+static double reduce_prod_float32_nan_safe(const float* data, int nitems) {
+    if (nitems <= 0) return 1.0;
+    double acc = 1.0;
     for (int i = 0; i < nitems; i++) {
-        float v = data[i];
+        double v = (double)data[i];
         acc *= v;
         if (v != v) return v;
     }
     return acc;
-#endif
 }
 
 static double reduce_prod_float64_nan_safe(const double* data, int nitems) {
@@ -2258,87 +2188,15 @@ static double reduce_prod_float64_nan_safe(const double* data, int nitems) {
 #endif
 }
 
-static float reduce_sum_float32_nan_safe(const float* data, int nitems) {
-    if (nitems <= 0) return 0.0f;
-#if defined(__AVX__) || defined(__AVX2__)
-    int i = 0;
-    __m256 vsum = _mm256_setzero_ps();
-    __m256 vnan = _mm256_setzero_ps();
-    const int limit = nitems & ~7;
-    for (; i < limit; i += 8) {
-        __m256 v = _mm256_loadu_ps(data + i);
-        vnan = _mm256_or_ps(vnan, _mm256_cmp_ps(v, v, _CMP_UNORD_Q));
-        vsum = _mm256_add_ps(vsum, v);
-    }
-    __m128 low = _mm256_castps256_ps128(vsum);
-    __m128 high = _mm256_extractf128_ps(vsum, 1);
-    __m128 sum128 = _mm_add_ps(low, high);
-    __m128 tmp = _mm_add_ps(sum128, _mm_movehl_ps(sum128, sum128));
-    tmp = _mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1));
-    float acc = _mm_cvtss_f32(tmp);
-    if (_mm256_movemask_ps(vnan)) return NAN;
-    for (; i < nitems; i++) {
-        float v = data[i];
-        acc += v;
-        if (v != v) return v;
-    }
-    return acc;
-#elif defined(__SSE__)
-    int i = 0;
-    __m128 vsum = _mm_setzero_ps();
-    __m128 vnan = _mm_setzero_ps();
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        __m128 v = _mm_loadu_ps(data + i);
-        vnan = _mm_or_ps(vnan, _mm_cmpunord_ps(v, v));
-        vsum = _mm_add_ps(vsum, v);
-    }
-    __m128 tmp = _mm_add_ps(vsum, _mm_movehl_ps(vsum, vsum));
-    tmp = _mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1));
-    float acc = _mm_cvtss_f32(tmp);
-    if (_mm_movemask_ps(vnan)) return NAN;
-    for (; i < nitems; i++) {
-        float v = data[i];
-        acc += v;
-        if (v != v) return v;
-    }
-    return acc;
-#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
-    int i = 0;
-    float32x4_t vsum = vdupq_n_f32(0.0f);
-    uint32x4_t vnan = vdupq_n_u32(0);
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        float32x4_t v = vld1q_f32(data + i);
-        uint32x4_t eq = vceqq_f32(v, v);
-        vnan = vorrq_u32(vnan, vmvnq_u32(eq));
-        vsum = vaddq_f32(vsum, v);
-    }
-#if defined(__aarch64__)
-    float acc = vaddvq_f32(vsum);
-#else
-    float32x2_t sum2 = vadd_f32(vget_low_f32(vsum), vget_high_f32(vsum));
-    sum2 = vpadd_f32(sum2, sum2);
-    float acc = vget_lane_f32(sum2, 0);
-#endif
-    uint32x2_t nan2 = vorr_u32(vget_low_u32(vnan), vget_high_u32(vnan));
-    nan2 = vpadd_u32(nan2, nan2);
-    if (vget_lane_u32(nan2, 0)) return NAN;
-    for (; i < nitems; i++) {
-        float v = data[i];
-        acc += v;
-        if (v != v) return v;
-    }
-    return acc;
-#else
-    float acc = 0.0f;
+static double reduce_sum_float32_nan_safe(const float* data, int nitems) {
+    if (nitems <= 0) return 0.0;
+    double acc = 0.0;
     for (int i = 0; i < nitems; i++) {
-        float v = data[i];
+        double v = (double)data[i];
         acc += v;
         if (v != v) return v;
     }
     return acc;
-#endif
 }
 
 static double reduce_sum_float64_nan_safe(const double* data, int nitems) {
@@ -5137,25 +4995,30 @@ static void eval_reduction(const me_expr* n) {
                     ((bool*)write_ptr)[0] = acc;
                 }
                 else {
-                    float acc = 0.0f;
                     if (nitems == 0) {
+                        float acc = 0.0f;
                         if (is_min) acc = INFINITY;
                         else if (is_max) acc = -INFINITY;
                         else acc = is_prod ? 1.0f : 0.0f;
+                        ((float*)write_ptr)[0] = acc;
                     }
                     else if (is_min) {
-                        acc = reduce_min_float32_nan_safe(data, nitems);
+                        float acc = reduce_min_float32_nan_safe(data, nitems);
+                        ((float*)write_ptr)[0] = acc;
                     }
                     else if (is_max) {
-                        acc = reduce_max_float32_nan_safe(data, nitems);
+                        float acc = reduce_max_float32_nan_safe(data, nitems);
+                        ((float*)write_ptr)[0] = acc;
                     }
                     else if (is_prod) {
-                        acc = reduce_prod_float32_nan_safe(data, nitems);
+                        /* Accumulate float32 sum/prod in float64 for better precision. */
+                        double acc = reduce_prod_float32_nan_safe(data, nitems);
+                        ((float*)write_ptr)[0] = (float)acc;
                     }
                     else {
-                        acc = reduce_sum_float32_nan_safe(data, nitems);
+                        double acc = reduce_sum_float32_nan_safe(data, nitems);
+                        ((float*)write_ptr)[0] = (float)acc;
                     }
-                    ((float*)write_ptr)[0] = acc;
                 }
                 break;
             }
diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py
index 41367ea6..85d4dcb2 100644
--- a/src/blosc2/ndarray.py
+++ b/src/blosc2/ndarray.py
@@ -3773,7 +3773,7 @@ def ndim(self) -> int:
 
     @property
     def size(self) -> int:
-        """The size (in bytes) for this container."""
+        """The size (in elements) for this container."""
         return super().size
 
     @property

From 068d423a01b601ce8ac15fc557bb5c4256fe1059 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 2 Jan 2026 14:27:13 +0100
Subject: [PATCH 061/123] Some fixes and experiments

---
 src/blosc2/blosc2_ext.pyx | 2 +-
 src/blosc2/lazyexpr.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 40e2197e..ba02a6e3 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -1894,7 +1894,7 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
         offset_bytes = <uintptr_t> typesize * (nchunk * nblocks_per_chunk + nblock)
         aux_reduc_ptr = <void *> (<uintptr_t> udata.aux_reduc_ptr + offset_bytes)
         rc = me_eval(miniexpr_handle, <const void**>input_buffers, udata.ninputs, aux_reduc_ptr, ndarr.blocknitems)
-        memset(<char *>params_output, 0, udata.array.sc.blocksize)  # clear output buffer
+        #memset(<char *>params_output, 0, udata.array.sc.blocksize)  # clear output buffer
     if rc != 0:
         raise RuntimeError(f"miniexpr: issues during evaluation; error code: {rc}")
 
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 11baa3b9..3a4a2f74 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1983,7 +1983,7 @@ def reduce_slices(  # noqa: C901
         chunks = temp.chunks
         del temp
 
-    if (where is None and fast_path and all_ndarray and expression == "o0") or expression == "(o0)":
+    if (where is None and fast_path and all_ndarray) and (expression == "o0" or expression == "(o0)"):
         # Only this case is supported so far
         if use_miniexpr:
             for op in operands.values():

From fb9f2968a3556cd637cd6dc16813bb5111aa8aab Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 2 Jan 2026 14:31:48 +0100
Subject: [PATCH 062/123] Clear output buffer

---
 src/blosc2/blosc2_ext.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index ba02a6e3..bf3b1329 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -1894,7 +1894,7 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
         offset_bytes = <uintptr_t> typesize * (nchunk * nblocks_per_chunk + nblock)
         aux_reduc_ptr = <void *> (<uintptr_t> udata.aux_reduc_ptr + offset_bytes)
         rc = me_eval(miniexpr_handle, <const void**>input_buffers, udata.ninputs, aux_reduc_ptr, ndarr.blocknitems)
-        #memset(<char *>params_output, 0, udata.array.sc.blocksize)  # clear output buffer
+        memset(<void *>params_output, 0, udata.array.sc.blocksize)  # clear output buffer
     if rc != 0:
         raise RuntimeError(f"miniexpr: issues during evaluation; error code: {rc}")
 

From 47cdf2ac39e4decb9ad0162b85ace5ae356fcc2b Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 2 Jan 2026 18:39:17 +0100
Subject: [PATCH 063/123] Add a benchmark for reductions (preliminary)

---
 bench/ndarray/expr-reduction-sum.py | 46 +++++++++++++++++++++++++++++
 src/blosc2/blosc2_ext.pyx           |  3 +-
 src/blosc2/lazyexpr.py              |  2 ++
 3 files changed, 50 insertions(+), 1 deletion(-)
 create mode 100644 bench/ndarray/expr-reduction-sum.py

diff --git a/bench/ndarray/expr-reduction-sum.py b/bench/ndarray/expr-reduction-sum.py
new file mode 100644
index 00000000..c3453698
--- /dev/null
+++ b/bench/ndarray/expr-reduction-sum.py
@@ -0,0 +1,46 @@
+from time import time
+import blosc2
+import numpy as np
+import numexpr as ne
+
+N = 10_000
+dtype= np.float32
+#dtype= np.float64
+#dtype= np.int32
+cparams = blosc2.CParams(codec=blosc2.Codec.BLOSCLZ, clevel=1)
+cparams_out = blosc2.CParams(codec=blosc2.Codec.BLOSCLZ, clevel=0,
+                             blocksize=cparams.blocksize, splitmode=blosc2.SplitMode.NEVER_SPLIT)
+
+t0 = time()
+#a = blosc2.ones((N, N), dtype=dtype, cparams=cparams)
+#a = blosc2.arange(np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
+a = blosc2.linspace(0., 1., np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
+print(f"Time to create data: {(time() - t0) * 1000 :.4f} ms")
+t0 = time()
+b = a.copy()
+c = a.copy()
+print(f"Time to copy data: {(time() - t0) * 1000 :.4f} ms")
+
+t0 = time()
+res = blosc2.sum(a, cparams=cparams)
+t = time() - t0
+print(f"Time to evaluate: {t * 1000 :.4f} ms")
+print(f"Speed (GB/s): {(a.nbytes / 1e9) / t:.2f}")
+print("res:", res)
+
+na = a[:]
+nb = b[:]
+nc = c[:]
+np.testing.assert_allclose(res, np.sum(na), rtol=1e-5)
+
+t0 = time()
+res = ne.evaluate("sum(na)")
+t = time() - t0
+print(f"Time to evaluate with NumExpr: {t * 1000 :.4f} ms")
+print(f"Speed (GB/s): {(na.nbytes / 1e9) / t:.2f}")
+
+t0 = time()
+res = np.sum(na)
+t = time() - t0
+print(f"Time to evaluate with NumPy: {t * 1000 :.4f} ms")
+print(f"Speed (GB/s): {(na.nbytes / 1e9) / t:.2f}")
diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index bf3b1329..3af87c35 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -1894,7 +1894,8 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
         offset_bytes = <uintptr_t> typesize * (nchunk * nblocks_per_chunk + nblock)
         aux_reduc_ptr = <void *> (<uintptr_t> udata.aux_reduc_ptr + offset_bytes)
         rc = me_eval(miniexpr_handle, <const void**>input_buffers, udata.ninputs, aux_reduc_ptr, ndarr.blocknitems)
-        memset(<void *>params_output, 0, udata.array.sc.blocksize)  # clear output buffer
+        # The output buffer is cleared in the prefilter function
+        # memset(<void *>params_output, 0, udata.array.sc.blocksize)  # clear output buffer
     if rc != 0:
         raise RuntimeError(f"miniexpr: issues during evaluation; error code: {rc}")
 
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 3a4a2f74..6f5d9b30 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1998,6 +1998,7 @@ def reduce_slices(  # noqa: C901
 
         if use_miniexpr:
             cparams = kwargs.pop("cparams", blosc2.CParams())
+            # print(f"cparams: {cparams}")
             # Use the same chunks/blocks as the input operands for consistency
             res_eval = blosc2.empty(shape, dtype, chunks=chunks, blocks=blocks, cparams=cparams, **kwargs)
             # Compute the number of blocks in the result
@@ -2033,6 +2034,7 @@ def reduce_slices(  # noqa: C901
                 result = reduce_op.value.reduce(aux_reduc, **reduce_args)
                 t = time() - t0
                 print(f"reduction of aux_reduc took {t * 1e6:.6f} us")
+                # print(f"res_eval.info:", res_eval.info)
                 return result
 
     # Iterate over the operands and get the chunks

From 999a1b63bb41b053cc3bbc6b63cb285ae29a2ffb Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sat, 3 Jan 2026 08:43:14 +0100
Subject: [PATCH 064/123] Upgrade to latest openzl commit in c-blosc2

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 648d0b72..395957af 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -92,7 +92,7 @@ else()
     include(FetchContent)
     FetchContent_Declare(blosc2
         GIT_REPOSITORY https://github.com/Blosc/c-blosc2
-        GIT_TAG 290853dc15dc0e0e887cc72fbaac692cefd75014  # openzl
+        GIT_TAG 31365f4ab555f722b37212ee689d076c8a53b279  # openzl
         # in case you want to use a local copy of c-blosc2 for development, uncomment the line below
         # SOURCE_DIR "/Users/faltet/blosc/c-blosc2"
     )

From 5ceba215e4cd4936b39ff3d44be862d5701feb3f Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sat, 3 Jan 2026 14:52:04 +0100
Subject: [PATCH 065/123] Optimization for dispose outputs in reductions

---
 CMakeLists.txt            |  2 +-
 src/blosc2/blosc2_ext.pyx | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 395957af..7a9fc29b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -92,7 +92,7 @@ else()
     include(FetchContent)
     FetchContent_Declare(blosc2
         GIT_REPOSITORY https://github.com/Blosc/c-blosc2
-        GIT_TAG 31365f4ab555f722b37212ee689d076c8a53b279  # openzl
+        GIT_TAG 9d250c2201f6e385c56a372b08037f7debc6fa1b  # openzl
         # in case you want to use a local copy of c-blosc2 for development, uncomment the line below
         # SOURCE_DIR "/Users/faltet/blosc/c-blosc2"
     )
diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 3af87c35..53d7ebe7 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -209,6 +209,7 @@ cdef extern from "blosc2.h":
         uint8_t* ttmp
         size_t ttmp_nbytes
         blosc2_context* ctx
+        c_bool output_is_disposable
 
     ctypedef struct blosc2_postfilter_params:
         void *user_data
@@ -1682,7 +1683,7 @@ cdef class SChunk:
         cdef blosc2_cparams* cparams = self.schunk.storage.cparams
         cparams.prefilter = <blosc2_prefilter_fn> general_filler
 
-        cdef blosc2_prefilter_params* preparams = <blosc2_prefilter_params *> malloc(sizeof(blosc2_prefilter_params))
+        cdef blosc2_prefilter_params* preparams = <blosc2_prefilter_params *> calloc(1, sizeof(blosc2_prefilter_params))
         cdef filler_udata* fill_udata = <filler_udata *> malloc(sizeof(filler_udata))
         fill_udata.py_func = <char *> malloc(strlen(func_id) + 1)
         strcpy(fill_udata.py_func, func_id)
@@ -1715,7 +1716,7 @@ cdef class SChunk:
 
         cdef blosc2_cparams* cparams = self.schunk.storage.cparams
         cparams.prefilter = <blosc2_prefilter_fn> general_prefilter
-        cdef blosc2_prefilter_params* preparams = <blosc2_prefilter_params *> malloc(sizeof(blosc2_prefilter_params))
+        cdef blosc2_prefilter_params* preparams = <blosc2_prefilter_params *> calloc(1, sizeof(blosc2_prefilter_params))
         cdef user_filters_udata* pref_udata = <user_filters_udata*> malloc(sizeof(user_filters_udata))
         pref_udata.py_func = <char *> malloc(strlen(func_id) + 1)
         strcpy(pref_udata.py_func, func_id)
@@ -2856,8 +2857,9 @@ cdef class NDArray:
             free(variables[i].name)
         free(variables)
 
-        cdef blosc2_prefilter_params* preparams = <blosc2_prefilter_params *> malloc(sizeof(blosc2_prefilter_params))
+        cdef blosc2_prefilter_params* preparams = <blosc2_prefilter_params *> calloc(1, sizeof(blosc2_prefilter_params))
         preparams.user_data = udata
+        preparams.output_is_disposable = False if aux_reduc is None else True
         cparams.preparams = preparams
         _check_cparams(cparams)
 
@@ -2880,7 +2882,7 @@ cdef class NDArray:
         cdef blosc2_cparams* cparams = self.array.sc.storage.cparams
         cparams.prefilter = <blosc2_prefilter_fn> general_udf_prefilter
 
-        cdef blosc2_prefilter_params* preparams = <blosc2_prefilter_params *> malloc(sizeof(blosc2_prefilter_params))
+        cdef blosc2_prefilter_params* preparams = <blosc2_prefilter_params *> calloc(1, sizeof(blosc2_prefilter_params))
         preparams.user_data = self._fill_udf_udata(func_id, inputs_id)
         cparams.preparams = preparams
         _check_cparams(cparams)

From d7b0f21de43aba15996c5d59edce9aa24115f5b7 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sat, 3 Jan 2026 15:07:19 +0100
Subject: [PATCH 066/123] Allow miniexpr to take care of reductions with
 multi-operand expressions

---
 bench/ndarray/expr-reduction-sum-multi.py |  40 +++++
 bench/ndarray/expr-reduction-sum.py       |  10 +-
 src/blosc2/lazyexpr.py                    |   6 +-
 src/blosc2/miniexpr.c                     | 208 ++++++++++++++++++++--
 4 files changed, 245 insertions(+), 19 deletions(-)
 create mode 100644 bench/ndarray/expr-reduction-sum-multi.py

diff --git a/bench/ndarray/expr-reduction-sum-multi.py b/bench/ndarray/expr-reduction-sum-multi.py
new file mode 100644
index 00000000..1acaf1e4
--- /dev/null
+++ b/bench/ndarray/expr-reduction-sum-multi.py
@@ -0,0 +1,40 @@
+from time import time
+import blosc2
+import numpy as np
+import numexpr as ne
+
+N = 10_000
+dtype= np.float32
+cparams = blosc2.CParams(codec=blosc2.Codec.BLOSCLZ, clevel=1)
+
+t0 = time()
+#a = blosc2.ones((N, N), dtype=dtype, cparams=cparams)
+#a = blosc2.arange(np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
+a = blosc2.linspace(0., 1., np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
+#rng = np.random.default_rng(1234)
+#a = rng.integers(0, 2, size=(N, N), dtype=dtype)
+#a = blosc2.asarray(a, cparams=cparams, urlpath="a.b2nd", mode="w")
+print(f"Time to create data: {(time() - t0) * 1000 :.4f} ms")
+#print(a[:])
+t0 = time()
+b = a.copy()
+c = a.copy()
+print(f"Time to copy data: {(time() - t0) * 1000 :.4f} ms")
+
+t0 = time()
+res = blosc2.sum(a + b + c, cparams=cparams)
+print(f"Time to evaluate: {(time() - t0) * 1000 :.4f} ms")
+print("Result:", res, "Mean:", res / (N * N))
+
+na = a[:]
+nb = b[:]
+nc = c[:]
+#np.testing.assert_allclose(res, np.sum(na + nb + nc))
+#
+#t0 = time()
+#res = ne.evaluate("sum(na)")
+#print(f"Time to evaluate with NumExpr: {(time() - t0) * 1000 :.4f} ms")
+
+t0 = time()
+res = np.sum(na + nb + nc)
+print(f"Time to evaluate with NumPy: {(time() - t0) * 1000 :.4f} ms")
diff --git a/bench/ndarray/expr-reduction-sum.py b/bench/ndarray/expr-reduction-sum.py
index c3453698..f6320c30 100644
--- a/bench/ndarray/expr-reduction-sum.py
+++ b/bench/ndarray/expr-reduction-sum.py
@@ -31,16 +31,16 @@
 na = a[:]
 nb = b[:]
 nc = c[:]
-np.testing.assert_allclose(res, np.sum(na), rtol=1e-5)
+# np.testing.assert_allclose(res, np.sum(na), rtol=1e-5)
 
 t0 = time()
-res = ne.evaluate("sum(na)")
+res = np.sum(na)
 t = time() - t0
-print(f"Time to evaluate with NumExpr: {t * 1000 :.4f} ms")
+print(f"Time to evaluate with NumPy: {t * 1000 :.4f} ms")
 print(f"Speed (GB/s): {(na.nbytes / 1e9) / t:.2f}")
 
 t0 = time()
-res = np.sum(na)
+res = ne.evaluate("sum(na)")
 t = time() - t0
-print(f"Time to evaluate with NumPy: {t * 1000 :.4f} ms")
+print(f"Time to evaluate with NumExpr: {t * 1000 :.4f} ms")
 print(f"Speed (GB/s): {(na.nbytes / 1e9) / t:.2f}")
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 6f5d9b30..dea1eabe 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1983,7 +1983,11 @@ def reduce_slices(  # noqa: C901
         chunks = temp.chunks
         del temp
 
-    if (where is None and fast_path and all_ndarray) and (expression == "o0" or expression == "(o0)"):
+    # if (where is None and fast_path and all_ndarray) and (expression == "o0" or expression == "(o0)"):
+    # miniexpr does not shine specially for single operand reductions
+    if (where is None and fast_path and all_ndarray) and not (
+        expression == "o0" or expression == "(o0)"
+    ):  # or 1:  # XXX make tests pass
         # Only this case is supported so far
         if use_miniexpr:
             for op in operands.values():
diff --git a/src/blosc2/miniexpr.c b/src/blosc2/miniexpr.c
index 169c736b..cdf7070e 100755
--- a/src/blosc2/miniexpr.c
+++ b/src/blosc2/miniexpr.c
@@ -569,6 +569,9 @@ typedef struct state {
 
 /* Forward declarations */
 static me_expr* new_expr(const int type, const me_expr* parameters[]);
+static me_dtype infer_output_type(const me_expr* n);
+static void private_eval(const me_expr* n);
+static void eval_reduction(const me_expr* n, int output_nitems);
 static double conj_wrapper(double x);
 static double imag_wrapper(double x);
 static double real_wrapper(double x);
@@ -631,15 +634,49 @@ static bool contains_reduction(const me_expr* n) {
 }
 
 static bool reduction_usage_is_valid(const me_expr* n) {
-    if (!is_reduction_node(n)) return false;
-    me_expr* arg = (me_expr*)n->parameters[0];
-    if (!arg) return false;
-    if (n->function == (void*)min_reduce || n->function == (void*)max_reduce) {
-        if (arg->dtype == ME_COMPLEX64 || arg->dtype == ME_COMPLEX128) {
-            return false;
+    if (!n) return true;
+    if (is_reduction_node(n)) {
+        me_expr* arg = (me_expr*)n->parameters[0];
+        if (!arg) return false;
+        if (contains_reduction(arg)) return false;
+        me_dtype arg_type = infer_output_type(arg);
+        if (n->function == (void*)min_reduce || n->function == (void*)max_reduce) {
+            if (arg_type == ME_COMPLEX64 || arg_type == ME_COMPLEX128) {
+                return false;
+            }
         }
+        return true;
+    }
+
+    switch (TYPE_MASK(n->type)) {
+    case ME_FUNCTION0:
+    case ME_FUNCTION1:
+    case ME_FUNCTION2:
+    case ME_FUNCTION3:
+    case ME_FUNCTION4:
+    case ME_FUNCTION5:
+    case ME_FUNCTION6:
+    case ME_FUNCTION7:
+    case ME_CLOSURE0:
+    case ME_CLOSURE1:
+    case ME_CLOSURE2:
+    case ME_CLOSURE3:
+    case ME_CLOSURE4:
+    case ME_CLOSURE5:
+    case ME_CLOSURE6:
+    case ME_CLOSURE7:
+        {
+            const int arity = ARITY(n->type);
+            for (int i = 0; i < arity; i++) {
+                if (!reduction_usage_is_valid((const me_expr*)n->parameters[i])) {
+                    return false;
+                }
+            }
+            return true;
+        }
+    default:
+        return true;
     }
-    return TYPE_MASK(arg->type) == ME_VARIABLE || TYPE_MASK(arg->type) == ME_CONSTANT;
 }
 
 /* Infer computation type from expression tree (for evaluation) */
@@ -3698,7 +3735,12 @@ typedef float (*me_fun1_f32)(float);
     SQRT_FUNC, SIN_FUNC, COS_FUNC, EXP_FUNC, LOG_FUNC, FABS_FUNC, POW_FUNC, \
     VEC_CONJ) \
 static void me_eval_##SUFFIX(const me_expr *n) { \
-    if (!n || !n->output || n->nitems <= 0) return; \
+    if (!n || !n->output) return; \
+    if (is_reduction_node(n)) { \
+        eval_reduction(n, n->nitems); \
+        return; \
+    } \
+    if (n->nitems <= 0) return; \
     \
     int i, j; \
     const int arity = ARITY(n->type); \
@@ -4364,14 +4406,139 @@ static bool all_variables_match_type(const me_expr* n, me_dtype target_type) {
     return true;
 }
 
-static void eval_reduction(const me_expr* n) {
+static void broadcast_reduction_output(void* output, me_dtype dtype, int output_nitems) {
+    if (!output || output_nitems <= 1) return;
+    switch (dtype) {
+    case ME_BOOL:
+        {
+            bool val = ((bool*)output)[0];
+            for (int i = 1; i < output_nitems; i++) {
+                ((bool*)output)[i] = val;
+            }
+            break;
+        }
+    case ME_INT8:
+        {
+            int8_t val = ((int8_t*)output)[0];
+            for (int i = 1; i < output_nitems; i++) {
+                ((int8_t*)output)[i] = val;
+            }
+            break;
+        }
+    case ME_INT16:
+        {
+            int16_t val = ((int16_t*)output)[0];
+            for (int i = 1; i < output_nitems; i++) {
+                ((int16_t*)output)[i] = val;
+            }
+            break;
+        }
+    case ME_INT32:
+        {
+            int32_t val = ((int32_t*)output)[0];
+            for (int i = 1; i < output_nitems; i++) {
+                ((int32_t*)output)[i] = val;
+            }
+            break;
+        }
+    case ME_INT64:
+        {
+            int64_t val = ((int64_t*)output)[0];
+            for (int i = 1; i < output_nitems; i++) {
+                ((int64_t*)output)[i] = val;
+            }
+            break;
+        }
+    case ME_UINT8:
+        {
+            uint8_t val = ((uint8_t*)output)[0];
+            for (int i = 1; i < output_nitems; i++) {
+                ((uint8_t*)output)[i] = val;
+            }
+            break;
+        }
+    case ME_UINT16:
+        {
+            uint16_t val = ((uint16_t*)output)[0];
+            for (int i = 1; i < output_nitems; i++) {
+                ((uint16_t*)output)[i] = val;
+            }
+            break;
+        }
+    case ME_UINT32:
+        {
+            uint32_t val = ((uint32_t*)output)[0];
+            for (int i = 1; i < output_nitems; i++) {
+                ((uint32_t*)output)[i] = val;
+            }
+            break;
+        }
+    case ME_UINT64:
+        {
+            uint64_t val = ((uint64_t*)output)[0];
+            for (int i = 1; i < output_nitems; i++) {
+                ((uint64_t*)output)[i] = val;
+            }
+            break;
+        }
+    case ME_FLOAT32:
+        {
+            float val = ((float*)output)[0];
+            for (int i = 1; i < output_nitems; i++) {
+                ((float*)output)[i] = val;
+            }
+            break;
+        }
+    case ME_FLOAT64:
+        {
+            double val = ((double*)output)[0];
+            for (int i = 1; i < output_nitems; i++) {
+                ((double*)output)[i] = val;
+            }
+            break;
+        }
+    case ME_COMPLEX64:
+        {
+            float _Complex val = ((float _Complex*)output)[0];
+            for (int i = 1; i < output_nitems; i++) {
+                ((float _Complex*)output)[i] = val;
+            }
+            break;
+        }
+    case ME_COMPLEX128:
+        {
+            double _Complex val = ((double _Complex*)output)[0];
+            for (int i = 1; i < output_nitems; i++) {
+                ((double _Complex*)output)[i] = val;
+            }
+            break;
+        }
+    default:
+        break;
+    }
+}
+
+static void eval_reduction(const me_expr* n, int output_nitems) {
     if (!n || !n->output || !is_reduction_node(n)) return;
+    if (output_nitems <= 0) return;
 
     me_expr* arg = (me_expr*)n->parameters[0];
     if (!arg) return;
 
     const int nitems = n->nitems;
     me_dtype arg_type = arg->dtype;
+    if (arg->type != ME_CONSTANT && arg->type != ME_VARIABLE) {
+        arg_type = infer_output_type(arg);
+        if (nitems > 0) {
+            if (!arg->output) {
+                arg->output = malloc((size_t)nitems * dtype_size(arg_type));
+                if (!arg->output) return;
+            }
+            arg->nitems = nitems;
+            arg->dtype = arg_type;
+            private_eval(arg);
+        }
+    }
     me_dtype result_type = reduction_output_dtype(arg_type, n->function);
     me_dtype output_type = n->dtype;
     bool is_prod = n->function == (void*)prod_reduce;
@@ -4383,7 +4550,7 @@ static void eval_reduction(const me_expr* n) {
     void* write_ptr = n->output;
     void* temp_output = NULL;
     if (output_type != result_type) {
-        temp_output = malloc(dtype_size(result_type));
+        temp_output = malloc((size_t)output_nitems * dtype_size(result_type));
         if (!temp_output) return;
         write_ptr = temp_output;
     }
@@ -4627,7 +4794,13 @@ static void eval_reduction(const me_expr* n) {
             }
         }
     }
-    else if (arg->type == ME_VARIABLE) {
+    else {
+        const void* saved_bound = arg->bound;
+        int saved_type = arg->type;
+        if (arg->type != ME_VARIABLE) {
+            ((me_expr*)arg)->bound = arg->output;
+            ((me_expr*)arg)->type = ME_VARIABLE;
+        }
         switch (arg_type) {
         case ME_BOOL:
             {
@@ -5140,12 +5313,21 @@ static void eval_reduction(const me_expr* n) {
         default:
             break;
         }
+        if (saved_type != ME_VARIABLE) {
+            ((me_expr*)arg)->bound = saved_bound;
+            ((me_expr*)arg)->type = saved_type;
+        }
+    }
+
+    {
+        me_dtype write_type = temp_output ? result_type : output_type;
+        broadcast_reduction_output(write_ptr, write_type, output_nitems);
     }
 
     if (temp_output) {
         convert_func_t conv = get_convert_func(result_type, output_type);
         if (conv) {
-            conv(temp_output, n->output, 1);
+            conv(temp_output, n->output, output_nitems);
         }
         free(temp_output);
     }
@@ -5155,7 +5337,7 @@ static void private_eval(const me_expr* n) {
     if (!n) return;
 
     if (is_reduction_node(n)) {
-        eval_reduction(n);
+        eval_reduction(n, 1);
         return;
     }
 

From 8bce9f42ffe19f969d31e1011d8516b94526d0bd Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sat, 3 Jan 2026 15:21:42 +0100
Subject: [PATCH 067/123] Improvements in reduction benchs

---
 bench/ndarray/expr-reduction-sum-multi.py | 18 ++++++++++++------
 bench/ndarray/expr-reduction-sum.py       |  4 ++--
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/bench/ndarray/expr-reduction-sum-multi.py b/bench/ndarray/expr-reduction-sum-multi.py
index 1acaf1e4..b8a38423 100644
--- a/bench/ndarray/expr-reduction-sum-multi.py
+++ b/bench/ndarray/expr-reduction-sum-multi.py
@@ -23,18 +23,24 @@
 
 t0 = time()
 res = blosc2.sum(a + b + c, cparams=cparams)
-print(f"Time to evaluate: {(time() - t0) * 1000 :.4f} ms")
+t = time() - t0
+print(f"Time to evaluate: {t * 1000 :.4f} ms", end=" ")
+print(f"Speed (GB/s): {(a.nbytes * 3 / 1e9) / t:.2f}")
 print("Result:", res, "Mean:", res / (N * N))
 
 na = a[:]
 nb = b[:]
 nc = c[:]
 #np.testing.assert_allclose(res, np.sum(na + nb + nc))
-#
-#t0 = time()
-#res = ne.evaluate("sum(na)")
-#print(f"Time to evaluate with NumExpr: {(time() - t0) * 1000 :.4f} ms")
 
 t0 = time()
 res = np.sum(na + nb + nc)
-print(f"Time to evaluate with NumPy: {(time() - t0) * 1000 :.4f} ms")
+t = time() - t0
+print(f"Time to evaluate with NumPy: {t * 1000 :.4f} ms", end=" ")
+print(f"Speed (GB/s): {(na.nbytes * 3 / 1e9) / t:.2f}")
+
+t0 = time()
+res = ne.evaluate("sum(na)")
+t = time() - t0
+print(f"Time to evaluate with NumExpr: {t * 1000 :.4f} ms", end=" ")
+print(f"Speed (GB/s): {(na.nbytes / 1e9) / t:.2f}")
diff --git a/bench/ndarray/expr-reduction-sum.py b/bench/ndarray/expr-reduction-sum.py
index f6320c30..45adf529 100644
--- a/bench/ndarray/expr-reduction-sum.py
+++ b/bench/ndarray/expr-reduction-sum.py
@@ -36,11 +36,11 @@
 t0 = time()
 res = np.sum(na)
 t = time() - t0
-print(f"Time to evaluate with NumPy: {t * 1000 :.4f} ms")
+print(f"Time to evaluate with NumPy: {t * 1000 :.4f} ms", end=" ")
 print(f"Speed (GB/s): {(na.nbytes / 1e9) / t:.2f}")
 
 t0 = time()
 res = ne.evaluate("sum(na)")
 t = time() - t0
-print(f"Time to evaluate with NumExpr: {t * 1000 :.4f} ms")
+print(f"Time to evaluate with NumExpr: {t * 1000 :.4f} ms", end=" ")
 print(f"Speed (GB/s): {(na.nbytes / 1e9) / t:.2f}")

From 4834614f65fbd2e2096bf551b9b821577795116b Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sat, 3 Jan 2026 15:26:32 +0100
Subject: [PATCH 068/123] More complex expression in bench

---
 bench/ndarray/expr-reduction-sum-multi.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/bench/ndarray/expr-reduction-sum-multi.py b/bench/ndarray/expr-reduction-sum-multi.py
index b8a38423..f6ad9f80 100644
--- a/bench/ndarray/expr-reduction-sum-multi.py
+++ b/bench/ndarray/expr-reduction-sum-multi.py
@@ -22,7 +22,7 @@
 print(f"Time to copy data: {(time() - t0) * 1000 :.4f} ms")
 
 t0 = time()
-res = blosc2.sum(a + b + c, cparams=cparams)
+res = blosc2.sum(2 * a**2 - 3 * b + c + 1.2, cparams=cparams)
 t = time() - t0
 print(f"Time to evaluate: {t * 1000 :.4f} ms", end=" ")
 print(f"Speed (GB/s): {(a.nbytes * 3 / 1e9) / t:.2f}")
@@ -31,16 +31,16 @@
 na = a[:]
 nb = b[:]
 nc = c[:]
-#np.testing.assert_allclose(res, np.sum(na + nb + nc))
+#np.testing.assert_allclose(res, np.sum(2 * a**2 - 3 * b + c + 1.2))
 
 t0 = time()
-res = np.sum(na + nb + nc)
+res = np.sum(2 * na**2 - 3 * nb + nc + 1.2)
 t = time() - t0
 print(f"Time to evaluate with NumPy: {t * 1000 :.4f} ms", end=" ")
 print(f"Speed (GB/s): {(na.nbytes * 3 / 1e9) / t:.2f}")
 
 t0 = time()
-res = ne.evaluate("sum(na)")
+res = ne.evaluate("sum(2 * na**2 - 3 * nb + nc + 1.2)")
 t = time() - t0
 print(f"Time to evaluate with NumExpr: {t * 1000 :.4f} ms", end=" ")
 print(f"Speed (GB/s): {(na.nbytes / 1e9) / t:.2f}")

From ec65a7d7cff06fe33be654b915507d64438440a7 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sun, 4 Jan 2026 10:16:53 +0100
Subject: [PATCH 069/123] Update to latest miniexpr, and some nicer stats in
 benchs

---
 bench/ndarray/expr-blocked-eval.py  |  26 ++-
 bench/ndarray/expr-reduction-sum.py |  10 +-
 src/blosc2/miniexpr.c               | 252 +++++++++++++++++++++++++++-
 3 files changed, 272 insertions(+), 16 deletions(-)

diff --git a/bench/ndarray/expr-blocked-eval.py b/bench/ndarray/expr-blocked-eval.py
index e8f56a2b..9fb76aac 100644
--- a/bench/ndarray/expr-blocked-eval.py
+++ b/bench/ndarray/expr-blocked-eval.py
@@ -4,12 +4,14 @@
 import numexpr as ne
 
 N = 10_000
-dtype= np.float32
+dtype= np.int32
+#dtype= np.float32
+#dtype= np.float64
 cparams = blosc2.CParams(codec=blosc2.Codec.BLOSCLZ, clevel=1)
 
 t0 = time()
-#a = blosc2.ones((N, N), dtype=dtype, cparams=cparams)
-a = blosc2.arange(np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
+a = blosc2.ones((N, N), dtype=dtype, cparams=cparams)
+#a = blosc2.arange(np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
 # a = blosc2.linspace(0., 1., np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
 print(f"Time to create data: {(time() - t0) * 1000 :.4f} ms")
 t0 = time()
@@ -19,18 +21,24 @@
 
 t0 = time()
 res = ((a + b) * c).compute(cparams=cparams)
-print(f"Time to evaluate: {(time() - t0) * 1000 :.4f} ms")
+t = time() - t0
+print(f"Time to evaluate: {t * 1000 :.4f} ms", end=" ")
+print(f"Speed (GB/s): {(a.nbytes * 4 / 1e9) / t:.2f}")
 # print(res.info)
 
 na = a[:]
 nb = b[:]
 nc = c[:]
-np.testing.assert_allclose(res, (na + nb) * nc)
+#np.testing.assert_allclose(res, (na + nb) * nc)
 
 t0 = time()
-res = ne.evaluate("(na + nb) * nc")
-print(f"Time to evaluate with NumExpr: {(time() - t0) * 1000 :.4f} ms")
+res = (na + nb) * nc
+t = time() - t0
+print(f"Time to evaluate with NumPy: {t * 1000 :.4f} ms", end=" ")
+print(f"Speed (GB/s): {(na.nbytes * 4 / 1e9) / t:.2f}")
 
 t0 = time()
-res = na + nb * nc
-print(f"Time to evaluate with NumPy: {(time() - t0) * 1000 :.4f} ms")
+res = ne.evaluate("(na + nb) * nc")
+t = time() - t0
+print(f"Time to evaluate with NumExpr: {t * 1000 :.4f} ms", end=" ")
+print(f"Speed (GB/s): {(na.nbytes * 4 / 1e9) / t:.2f}")
diff --git a/bench/ndarray/expr-reduction-sum.py b/bench/ndarray/expr-reduction-sum.py
index 45adf529..1998a8da 100644
--- a/bench/ndarray/expr-reduction-sum.py
+++ b/bench/ndarray/expr-reduction-sum.py
@@ -4,17 +4,17 @@
 import numexpr as ne
 
 N = 10_000
-dtype= np.float32
+dtype= np.int32
+#dtype= np.float32
 #dtype= np.float64
-#dtype= np.int32
 cparams = blosc2.CParams(codec=blosc2.Codec.BLOSCLZ, clevel=1)
 cparams_out = blosc2.CParams(codec=blosc2.Codec.BLOSCLZ, clevel=0,
                              blocksize=cparams.blocksize, splitmode=blosc2.SplitMode.NEVER_SPLIT)
 
 t0 = time()
-#a = blosc2.ones((N, N), dtype=dtype, cparams=cparams)
+a = blosc2.ones((N, N), dtype=dtype, cparams=cparams)
 #a = blosc2.arange(np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
-a = blosc2.linspace(0., 1., np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
+#a = blosc2.linspace(0., 1., np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
 print(f"Time to create data: {(time() - t0) * 1000 :.4f} ms")
 t0 = time()
 b = a.copy()
@@ -24,7 +24,7 @@
 t0 = time()
 res = blosc2.sum(a, cparams=cparams)
 t = time() - t0
-print(f"Time to evaluate: {t * 1000 :.4f} ms")
+print(f"Time to evaluate: {t * 1000 :.4f} ms", end=" ")
 print(f"Speed (GB/s): {(a.nbytes / 1e9) / t:.2f}")
 print("res:", res)
 
diff --git a/src/blosc2/miniexpr.c b/src/blosc2/miniexpr.c
index cdf7070e..728bbbc5 100755
--- a/src/blosc2/miniexpr.c
+++ b/src/blosc2/miniexpr.c
@@ -2142,6 +2142,87 @@ static uint64_t reduce_max_uint64(const uint64_t* data, int nitems) {
 
 static double reduce_prod_float32_nan_safe(const float* data, int nitems) {
     if (nitems <= 0) return 1.0;
+#if defined(__AVX__) || defined(__AVX2__)
+    int i = 0;
+    __m256d vprod0 = _mm256_set1_pd(1.0);
+    __m256d vprod1 = _mm256_set1_pd(1.0);
+    int nan_mask = 0;
+    const int limit = nitems & ~7;
+    for (; i < limit; i += 8) {
+        __m256 v = _mm256_loadu_ps(data + i);
+        nan_mask |= _mm256_movemask_ps(_mm256_cmp_ps(v, v, _CMP_UNORD_Q));
+        __m128 vlow = _mm256_castps256_ps128(v);
+        __m128 vhigh = _mm256_extractf128_ps(v, 1);
+        __m256d vlo = _mm256_cvtps_pd(vlow);
+        __m256d vhi = _mm256_cvtps_pd(vhigh);
+        vprod0 = _mm256_mul_pd(vprod0, vlo);
+        vprod1 = _mm256_mul_pd(vprod1, vhi);
+    }
+    __m256d vprod = _mm256_mul_pd(vprod0, vprod1);
+    __m128d low = _mm256_castpd256_pd128(vprod);
+    __m128d high = _mm256_extractf128_pd(vprod, 1);
+    __m128d prod128 = _mm_mul_pd(low, high);
+    prod128 = _mm_mul_sd(prod128, _mm_unpackhi_pd(prod128, prod128));
+    double acc = _mm_cvtsd_f64(prod128);
+    if (nan_mask) return NAN;
+    for (; i < nitems; i++) {
+        double v = (double)data[i];
+        acc *= v;
+        if (v != v) return v;
+    }
+    return acc;
+#elif defined(__SSE2__)
+    int i = 0;
+    __m128d vprod0 = _mm_set1_pd(1.0);
+    __m128d vprod1 = _mm_set1_pd(1.0);
+    int nan_mask = 0;
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        __m128 v = _mm_loadu_ps(data + i);
+        nan_mask |= _mm_movemask_ps(_mm_cmpunord_ps(v, v));
+        __m128 vhigh = _mm_movehl_ps(v, v);
+        __m128d vlo = _mm_cvtps_pd(v);
+        __m128d vhi = _mm_cvtps_pd(vhigh);
+        vprod0 = _mm_mul_pd(vprod0, vlo);
+        vprod1 = _mm_mul_pd(vprod1, vhi);
+    }
+    __m128d prod128 = _mm_mul_pd(vprod0, vprod1);
+    prod128 = _mm_mul_sd(prod128, _mm_unpackhi_pd(prod128, prod128));
+    double acc = _mm_cvtsd_f64(prod128);
+    if (nan_mask) return NAN;
+    for (; i < nitems; i++) {
+        double v = (double)data[i];
+        acc *= v;
+        if (v != v) return v;
+    }
+    return acc;
+#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__aarch64__)
+    int i = 0;
+    float64x2_t vprod0 = vdupq_n_f64(1.0);
+    float64x2_t vprod1 = vdupq_n_f64(1.0);
+    uint32x4_t vnan = vdupq_n_u32(0);
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        float32x4_t v = vld1q_f32(data + i);
+        uint32x4_t eq = vceqq_f32(v, v);
+        vnan = vorrq_u32(vnan, veorq_u32(eq, vdupq_n_u32(~0U)));
+        float64x2_t vlo = vcvt_f64_f32(vget_low_f32(v));
+        float64x2_t vhi = vcvt_f64_f32(vget_high_f32(v));
+        vprod0 = vmulq_f64(vprod0, vlo);
+        vprod1 = vmulq_f64(vprod1, vhi);
+    }
+    float64x2_t vprod = vmulq_f64(vprod0, vprod1);
+    double acc = vgetq_lane_f64(vprod, 0) * vgetq_lane_f64(vprod, 1);
+    uint32x4_t nan_or = vorrq_u32(vnan, vextq_u32(vnan, vnan, 2));
+    nan_or = vorrq_u32(nan_or, vextq_u32(nan_or, nan_or, 1));
+    if (vgetq_lane_u32(nan_or, 0)) return NAN;
+    for (; i < nitems; i++) {
+        double v = (double)data[i];
+        acc *= v;
+        if (v != v) return v;
+    }
+    return acc;
+#else
     double acc = 1.0;
     for (int i = 0; i < nitems; i++) {
         double v = (double)data[i];
@@ -2149,6 +2230,7 @@ static double reduce_prod_float32_nan_safe(const float* data, int nitems) {
         if (v != v) return v;
     }
     return acc;
+#endif
 }
 
 static double reduce_prod_float64_nan_safe(const double* data, int nitems) {
@@ -2227,6 +2309,86 @@ static double reduce_prod_float64_nan_safe(const double* data, int nitems) {
 
 static double reduce_sum_float32_nan_safe(const float* data, int nitems) {
     if (nitems <= 0) return 0.0;
+#if defined(__AVX__) || defined(__AVX2__)
+    int i = 0;
+    __m256d vsum0 = _mm256_setzero_pd();
+    __m256d vsum1 = _mm256_setzero_pd();
+    int nan_mask = 0;
+    const int limit = nitems & ~7;
+    for (; i < limit; i += 8) {
+        __m256 v = _mm256_loadu_ps(data + i);
+        nan_mask |= _mm256_movemask_ps(_mm256_cmp_ps(v, v, _CMP_UNORD_Q));
+        __m128 vlow = _mm256_castps256_ps128(v);
+        __m128 vhigh = _mm256_extractf128_ps(v, 1);
+        __m256d vlo = _mm256_cvtps_pd(vlow);
+        __m256d vhi = _mm256_cvtps_pd(vhigh);
+        vsum0 = _mm256_add_pd(vsum0, vlo);
+        vsum1 = _mm256_add_pd(vsum1, vhi);
+    }
+    __m256d vsum = _mm256_add_pd(vsum0, vsum1);
+    __m128d low = _mm256_castpd256_pd128(vsum);
+    __m128d high = _mm256_extractf128_pd(vsum, 1);
+    __m128d sum128 = _mm_add_pd(low, high);
+    sum128 = _mm_add_sd(sum128, _mm_unpackhi_pd(sum128, sum128));
+    double acc = _mm_cvtsd_f64(sum128);
+    if (nan_mask) return NAN;
+    for (; i < nitems; i++) {
+        double v = (double)data[i];
+        acc += v;
+        if (v != v) return v;
+    }
+    return acc;
+#elif defined(__SSE2__)
+    int i = 0;
+    __m128d vsum0 = _mm_setzero_pd();
+    __m128d vsum1 = _mm_setzero_pd();
+    int nan_mask = 0;
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        __m128 v = _mm_loadu_ps(data + i);
+        nan_mask |= _mm_movemask_ps(_mm_cmpunord_ps(v, v));
+        __m128 vhigh = _mm_movehl_ps(v, v);
+        __m128d vlo = _mm_cvtps_pd(v);
+        __m128d vhi = _mm_cvtps_pd(vhigh);
+        vsum0 = _mm_add_pd(vsum0, vlo);
+        vsum1 = _mm_add_pd(vsum1, vhi);
+    }
+    __m128d sum128 = _mm_add_pd(vsum0, vsum1);
+    sum128 = _mm_add_sd(sum128, _mm_unpackhi_pd(sum128, sum128));
+    double acc = _mm_cvtsd_f64(sum128);
+    if (nan_mask) return NAN;
+    for (; i < nitems; i++) {
+        double v = (double)data[i];
+        acc += v;
+        if (v != v) return v;
+    }
+    return acc;
+#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__aarch64__)
+    int i = 0;
+    float64x2_t vsum0 = vdupq_n_f64(0.0);
+    float64x2_t vsum1 = vdupq_n_f64(0.0);
+    uint32x4_t vnan = vdupq_n_u32(0);
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        float32x4_t v = vld1q_f32(data + i);
+        uint32x4_t eq = vceqq_f32(v, v);
+        vnan = vorrq_u32(vnan, veorq_u32(eq, vdupq_n_u32(~0U)));
+        float64x2_t vlo = vcvt_f64_f32(vget_low_f32(v));
+        float64x2_t vhi = vcvt_f64_f32(vget_high_f32(v));
+        vsum0 = vaddq_f64(vsum0, vlo);
+        vsum1 = vaddq_f64(vsum1, vhi);
+    }
+    double acc = vaddvq_f64(vaddq_f64(vsum0, vsum1));
+    uint32x4_t nan_or = vorrq_u32(vnan, vextq_u32(vnan, vnan, 2));
+    nan_or = vorrq_u32(nan_or, vextq_u32(nan_or, nan_or, 1));
+    if (vgetq_lane_u32(nan_or, 0)) return NAN;
+    for (; i < nitems; i++) {
+        double v = (double)data[i];
+        acc += v;
+        if (v != v) return v;
+    }
+    return acc;
+#else
     double acc = 0.0;
     for (int i = 0; i < nitems; i++) {
         double v = (double)data[i];
@@ -2234,8 +2396,10 @@ static double reduce_sum_float32_nan_safe(const float* data, int nitems) {
         if (v != v) return v;
     }
     return acc;
+#endif
 }
 
+
 static double reduce_sum_float64_nan_safe(const double* data, int nitems) {
     if (nitems <= 0) return 0.0;
 #if defined(__AVX__) || defined(__AVX2__)
@@ -2310,6 +2474,90 @@ static double reduce_sum_float64_nan_safe(const double* data, int nitems) {
 #endif
 }
 
+static int64_t reduce_sum_int32(const int32_t* data, int nitems) {
+    if (nitems <= 0) return 0;
+#if defined(__AVX2__)
+    int i = 0;
+    __m256i acc0 = _mm256_setzero_si256();
+    __m256i acc1 = _mm256_setzero_si256();
+    const int limit = nitems & ~7;
+    for (; i < limit; i += 8) {
+        __m256i v = _mm256_loadu_si256((const __m256i *)(data + i));
+        __m128i vlow = _mm256_castsi256_si128(v);
+        __m128i vhigh = _mm256_extracti128_si256(v, 1);
+        __m256i vlow64 = _mm256_cvtepi32_epi64(vlow);
+        __m256i vhigh64 = _mm256_cvtepi32_epi64(vhigh);
+        acc0 = _mm256_add_epi64(acc0, vlow64);
+        acc1 = _mm256_add_epi64(acc1, vhigh64);
+    }
+    acc0 = _mm256_add_epi64(acc0, acc1);
+    int64_t tmp[4];
+    _mm256_storeu_si256((__m256i *)tmp, acc0);
+    int64_t acc = tmp[0] + tmp[1] + tmp[2] + tmp[3];
+    for (; i < nitems; i++) {
+        acc += data[i];
+    }
+    return acc;
+#else
+    int64_t acc = 0;
+    for (int i = 0; i < nitems; i++) {
+        acc += data[i];
+    }
+    return acc;
+#endif
+}
+
+static uint64_t reduce_sum_uint32(const uint32_t* data, int nitems) {
+    if (nitems <= 0) return 0;
+#if defined(__AVX2__)
+    int i = 0;
+    __m256i acc0 = _mm256_setzero_si256();
+    __m256i acc1 = _mm256_setzero_si256();
+    const int limit = nitems & ~7;
+    for (; i < limit; i += 8) {
+        __m256i v = _mm256_loadu_si256((const __m256i *)(data + i));
+        __m128i vlow = _mm256_castsi256_si128(v);
+        __m128i vhigh = _mm256_extracti128_si256(v, 1);
+        __m256i vlow64 = _mm256_cvtepu32_epi64(vlow);
+        __m256i vhigh64 = _mm256_cvtepu32_epi64(vhigh);
+        acc0 = _mm256_add_epi64(acc0, vlow64);
+        acc1 = _mm256_add_epi64(acc1, vhigh64);
+    }
+    acc0 = _mm256_add_epi64(acc0, acc1);
+    uint64_t tmp[4];
+    _mm256_storeu_si256((__m256i *)tmp, acc0);
+    uint64_t acc = tmp[0] + tmp[1] + tmp[2] + tmp[3];
+    for (; i < nitems; i++) {
+        acc += data[i];
+    }
+    return acc;
+#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__aarch64__)
+    int i = 0;
+    uint64x2_t acc0 = vdupq_n_u64(0);
+    uint64x2_t acc1 = vdupq_n_u64(0);
+    const int limit = nitems & ~3;
+    for (; i < limit; i += 4) {
+        uint32x4_t v = vld1q_u32(data + i);
+        uint64x2_t lo = vmovl_u32(vget_low_u32(v));
+        uint64x2_t hi = vmovl_u32(vget_high_u32(v));
+        acc0 = vaddq_u64(acc0, lo);
+        acc1 = vaddq_u64(acc1, hi);
+    }
+    uint64x2_t accv = vaddq_u64(acc0, acc1);
+    uint64_t acc = vgetq_lane_u64(accv, 0) + vgetq_lane_u64(accv, 1);
+    for (; i < nitems; i++) {
+        acc += data[i];
+    }
+    return acc;
+#else
+    uint64_t acc = 0;
+    for (int i = 0; i < nitems; i++) {
+        acc += data[i];
+    }
+    return acc;
+#endif
+}
+
 static double comma(double a, double b) {
     (void)a;
     return b;
@@ -4953,7 +5201,7 @@ static void eval_reduction(const me_expr* n, int output_nitems) {
                         for (int i = 0; i < nitems; i++) acc *= data[i];
                     }
                     else {
-                        for (int i = 0; i < nitems; i++) acc += data[i];
+                        acc = reduce_sum_int32(data, nitems);
                     }
                     ((int64_t*)write_ptr)[0] = acc;
                 }
@@ -5105,7 +5353,7 @@ static void eval_reduction(const me_expr* n, int output_nitems) {
                         for (int i = 0; i < nitems; i++) acc *= data[i];
                     }
                     else {
-                        for (int i = 0; i < nitems; i++) acc += data[i];
+                        acc = reduce_sum_uint32(data, nitems);
                     }
                     ((uint64_t*)write_ptr)[0] = acc;
                 }

From 54748a317d6d3b70ec09c1b6163232312d11dcbf Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sun, 4 Jan 2026 10:26:59 +0100
Subject: [PATCH 070/123] Minor fix

---
 bench/ndarray/expr-reduction-sum-multi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bench/ndarray/expr-reduction-sum-multi.py b/bench/ndarray/expr-reduction-sum-multi.py
index f6ad9f80..6cbe48a0 100644
--- a/bench/ndarray/expr-reduction-sum-multi.py
+++ b/bench/ndarray/expr-reduction-sum-multi.py
@@ -31,7 +31,7 @@
 na = a[:]
 nb = b[:]
 nc = c[:]
-#np.testing.assert_allclose(res, np.sum(2 * a**2 - 3 * b + c + 1.2))
+#np.testing.assert_allclose(res, np.sum(2 * na**2 - 3 * nb + nc + 1.2))
 
 t0 = time()
 res = np.sum(2 * na**2 - 3 * nb + nc + 1.2)

From e70a31a860fcfc1093e72ede14a867c5796fc616 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sun, 4 Jan 2026 13:07:14 +0100
Subject: [PATCH 071/123] Improved conditions determining when miniexpr can
 enter into action

---
 src/blosc2/lazyexpr.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index dea1eabe..4760d0df 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1983,11 +1983,13 @@ def reduce_slices(  # noqa: C901
         chunks = temp.chunks
         del temp
 
-    # if (where is None and fast_path and all_ndarray) and (expression == "o0" or expression == "(o0)"):
-    # miniexpr does not shine specially for single operand reductions
-    if (where is None and fast_path and all_ndarray) and not (
-        expression == "o0" or expression == "(o0)"
-    ):  # or 1:  # XXX make tests pass
+    # miniexpr reduction path only supported for some cases so far
+    if where is None and fast_path and all_ndarray and reduced_shape == ():
+        if reduce_op in (ReduceOp.ARGMAX, ReduceOp.ARGMIN):
+            use_miniexpr = False  # not supported yet
+        elif len(operands) <= 2:
+            # This is supported, but performance is generally worse than manual chunked evaluation
+            use_miniexpr = False
         # Only this case is supported so far
         if use_miniexpr:
             for op in operands.values():
@@ -2007,7 +2009,6 @@ def reduce_slices(  # noqa: C901
             res_eval = blosc2.empty(shape, dtype, chunks=chunks, blocks=blocks, cparams=cparams, **kwargs)
             # Compute the number of blocks in the result
             nblocks = res_eval.nbytes // res_eval.blocksize
-            print("nblocks:", nblocks, dtype)
             aux_reduc = np.empty(nblocks, dtype=dtype)
             try:
                 print("expr->miniexpr:", expression, reduce_op)
@@ -2032,13 +2033,12 @@ def reduce_slices(  # noqa: C901
                 # (continue to the manual chunked evaluation below)
                 pass
             else:
-                from time import time
-
-                t0 = time()
-                result = reduce_op.value.reduce(aux_reduc, **reduce_args)
-                t = time() - t0
-                print(f"reduction of aux_reduc took {t * 1e6:.6f} us")
-                # print(f"res_eval.info:", res_eval.info)
+                if reduce_op == ReduceOp.ANY:
+                    result = np.any(aux_reduc, **reduce_args)
+                elif reduce_op == ReduceOp.ALL:
+                    result = np.all(aux_reduc, **reduce_args)
+                else:
+                    result = reduce_op.value.reduce(aux_reduc, **reduce_args)
                 return result
 
     # Iterate over the operands and get the chunks

From 58baecc9312501ef5d3e99ba96765d97c79a5d67 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sun, 4 Jan 2026 13:31:45 +0100
Subject: [PATCH 072/123] Benefit Intel architectures for now

---
 src/blosc2/lazyexpr.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 4760d0df..58bc9263 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1987,8 +1987,12 @@ def reduce_slices(  # noqa: C901
     if where is None and fast_path and all_ndarray and reduced_shape == ():
         if reduce_op in (ReduceOp.ARGMAX, ReduceOp.ARGMIN):
             use_miniexpr = False  # not supported yet
-        elif len(operands) <= 2:
+        elif len(operands) < 2:
             # This is supported, but performance is generally worse than manual chunked evaluation
+            # Determining the exact number of operands that gives better performance is tricky;
+            # for example, apple silicon CPUs seem to benefit from miniexpr starting with 3 operands,
+            # whereas Intel CPUs seem to do better with just 2 operands.
+            # TODO: more benchmarks needed
             use_miniexpr = False
         # Only this case is supported so far
         if use_miniexpr:

From eb43bf22da7a422cfd80e19adef4476a0c5d25a5 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Mon, 5 Jan 2026 11:24:52 +0100
Subject: [PATCH 073/123] New (faster) way of evaluating with miniexpr

---
 bench/ndarray/expr-blocked-eval.py            |  44 -------
 bench/ndarray/expr-reduction-sum.py           |  46 -------
 bench/ndarray/miniexpr-eval.py                |  47 +++++++
 ...-multi.py => miniexpr-reduct-sum-multi.py} |  25 ++--
 bench/ndarray/miniexpr-reduct-sum.py          |  42 ++++++
 src/blosc2/blosc2_ext.pyx                     |  18 +++
 src/blosc2/lazyexpr.py                        | 124 +++++++++---------
 7 files changed, 181 insertions(+), 165 deletions(-)
 delete mode 100644 bench/ndarray/expr-blocked-eval.py
 delete mode 100644 bench/ndarray/expr-reduction-sum.py
 create mode 100644 bench/ndarray/miniexpr-eval.py
 rename bench/ndarray/{expr-reduction-sum-multi.py => miniexpr-reduct-sum-multi.py} (59%)
 create mode 100644 bench/ndarray/miniexpr-reduct-sum.py

diff --git a/bench/ndarray/expr-blocked-eval.py b/bench/ndarray/expr-blocked-eval.py
deleted file mode 100644
index 9fb76aac..00000000
--- a/bench/ndarray/expr-blocked-eval.py
+++ /dev/null
@@ -1,44 +0,0 @@
-from time import time
-import blosc2
-import numpy as np
-import numexpr as ne
-
-N = 10_000
-dtype= np.int32
-#dtype= np.float32
-#dtype= np.float64
-cparams = blosc2.CParams(codec=blosc2.Codec.BLOSCLZ, clevel=1)
-
-t0 = time()
-a = blosc2.ones((N, N), dtype=dtype, cparams=cparams)
-#a = blosc2.arange(np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
-# a = blosc2.linspace(0., 1., np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
-print(f"Time to create data: {(time() - t0) * 1000 :.4f} ms")
-t0 = time()
-b = a.copy()
-c = a.copy()
-print(f"Time to copy data: {(time() - t0) * 1000 :.4f} ms")
-
-t0 = time()
-res = ((a + b) * c).compute(cparams=cparams)
-t = time() - t0
-print(f"Time to evaluate: {t * 1000 :.4f} ms", end=" ")
-print(f"Speed (GB/s): {(a.nbytes * 4 / 1e9) / t:.2f}")
-# print(res.info)
-
-na = a[:]
-nb = b[:]
-nc = c[:]
-#np.testing.assert_allclose(res, (na + nb) * nc)
-
-t0 = time()
-res = (na + nb) * nc
-t = time() - t0
-print(f"Time to evaluate with NumPy: {t * 1000 :.4f} ms", end=" ")
-print(f"Speed (GB/s): {(na.nbytes * 4 / 1e9) / t:.2f}")
-
-t0 = time()
-res = ne.evaluate("(na + nb) * nc")
-t = time() - t0
-print(f"Time to evaluate with NumExpr: {t * 1000 :.4f} ms", end=" ")
-print(f"Speed (GB/s): {(na.nbytes * 4 / 1e9) / t:.2f}")
diff --git a/bench/ndarray/expr-reduction-sum.py b/bench/ndarray/expr-reduction-sum.py
deleted file mode 100644
index 1998a8da..00000000
--- a/bench/ndarray/expr-reduction-sum.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from time import time
-import blosc2
-import numpy as np
-import numexpr as ne
-
-N = 10_000
-dtype= np.int32
-#dtype= np.float32
-#dtype= np.float64
-cparams = blosc2.CParams(codec=blosc2.Codec.BLOSCLZ, clevel=1)
-cparams_out = blosc2.CParams(codec=blosc2.Codec.BLOSCLZ, clevel=0,
-                             blocksize=cparams.blocksize, splitmode=blosc2.SplitMode.NEVER_SPLIT)
-
-t0 = time()
-a = blosc2.ones((N, N), dtype=dtype, cparams=cparams)
-#a = blosc2.arange(np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
-#a = blosc2.linspace(0., 1., np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
-print(f"Time to create data: {(time() - t0) * 1000 :.4f} ms")
-t0 = time()
-b = a.copy()
-c = a.copy()
-print(f"Time to copy data: {(time() - t0) * 1000 :.4f} ms")
-
-t0 = time()
-res = blosc2.sum(a, cparams=cparams)
-t = time() - t0
-print(f"Time to evaluate: {t * 1000 :.4f} ms", end=" ")
-print(f"Speed (GB/s): {(a.nbytes / 1e9) / t:.2f}")
-print("res:", res)
-
-na = a[:]
-nb = b[:]
-nc = c[:]
-# np.testing.assert_allclose(res, np.sum(na), rtol=1e-5)
-
-t0 = time()
-res = np.sum(na)
-t = time() - t0
-print(f"Time to evaluate with NumPy: {t * 1000 :.4f} ms", end=" ")
-print(f"Speed (GB/s): {(na.nbytes / 1e9) / t:.2f}")
-
-t0 = time()
-res = ne.evaluate("sum(na)")
-t = time() - t0
-print(f"Time to evaluate with NumExpr: {t * 1000 :.4f} ms", end=" ")
-print(f"Speed (GB/s): {(na.nbytes / 1e9) / t:.2f}")
diff --git a/bench/ndarray/miniexpr-eval.py b/bench/ndarray/miniexpr-eval.py
new file mode 100644
index 00000000..c7b89035
--- /dev/null
+++ b/bench/ndarray/miniexpr-eval.py
@@ -0,0 +1,47 @@
+from time import time
+import blosc2
+import numpy as np
+import numexpr as ne
+
+N = 10_000
+# dtype= np.int32
+dtype= np.float32
+# dtype= np.float64
+cparams = blosc2.CParams(codec=blosc2.Codec.BLOSCLZ, clevel=1)
+
+t0 = time()
+# a = blosc2.ones((N, N), dtype=dtype, cparams=cparams)
+# a = blosc2.arange(np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
+a = blosc2.linspace(0., 1., np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
+print(f"Time to create data: {(time() - t0) * 1000 :.4f} ms")
+t0 = time()
+b = a.copy()
+c = a.copy()
+print(f"Time to copy data: {(time() - t0) * 1000 :.4f} ms")
+
+t0 = time()
+res = (2 * a**2 - 3 * b + c + 1.2).compute(cparams=cparams)
+t = time() - t0
+print(f"Time to evaluate: {t * 1000 :.4f} ms", end=" ")
+print(f"Speed (GB/s): {(a.nbytes * 4 / 1e9) / t:.2f}")
+# print(res.info)
+
+na = a[:]
+nb = b[:]
+nc = c[:]
+
+t0 = time()
+nres = 2 * na**2 - 3 * nb + nc + 1.2
+nt = time() - t0
+print(f"Time to evaluate with NumPy: {nt * 1000 :.4f} ms", end=" ")
+print(f"Speed (GB/s): {(na.nbytes * 4 / 1e9) / nt:.2f}")
+print(f"Speedup Blosc2 vs NumPy: {nt / t:.2f}x")
+np.testing.assert_allclose(res, nres, rtol=1e-5)
+
+t0 = time()
+neres = ne.evaluate("2 * na**2 - 3 * nb + nc + 1.2")
+net = time() - t0
+print(f"Time to evaluate with NumExpr: {net * 1000 :.4f} ms", end=" ")
+print(f"Speed (GB/s): {(na.nbytes * 4 / 1e9) / net:.2f}")
+print(f"Speedup Blosc2 vs NumExpr: {net / t:.2f}x")
+np.testing.assert_allclose(res, neres, rtol=1e-5)
diff --git a/bench/ndarray/expr-reduction-sum-multi.py b/bench/ndarray/miniexpr-reduct-sum-multi.py
similarity index 59%
rename from bench/ndarray/expr-reduction-sum-multi.py
rename to bench/ndarray/miniexpr-reduct-sum-multi.py
index 6cbe48a0..3a734001 100644
--- a/bench/ndarray/expr-reduction-sum-multi.py
+++ b/bench/ndarray/miniexpr-reduct-sum-multi.py
@@ -15,14 +15,13 @@
 #a = rng.integers(0, 2, size=(N, N), dtype=dtype)
 #a = blosc2.asarray(a, cparams=cparams, urlpath="a.b2nd", mode="w")
 print(f"Time to create data: {(time() - t0) * 1000 :.4f} ms")
-#print(a[:])
 t0 = time()
 b = a.copy()
 c = a.copy()
 print(f"Time to copy data: {(time() - t0) * 1000 :.4f} ms")
 
 t0 = time()
-res = blosc2.sum(2 * a**2 - 3 * b + c + 1.2, cparams=cparams)
+res = blosc2.sum(2 * a**2 - 3 * b + c + 1.2)
 t = time() - t0
 print(f"Time to evaluate: {t * 1000 :.4f} ms", end=" ")
 print(f"Speed (GB/s): {(a.nbytes * 3 / 1e9) / t:.2f}")
@@ -31,16 +30,20 @@
 na = a[:]
 nb = b[:]
 nc = c[:]
-#np.testing.assert_allclose(res, np.sum(2 * na**2 - 3 * nb + nc + 1.2))
 
 t0 = time()
-res = np.sum(2 * na**2 - 3 * nb + nc + 1.2)
-t = time() - t0
-print(f"Time to evaluate with NumPy: {t * 1000 :.4f} ms", end=" ")
-print(f"Speed (GB/s): {(na.nbytes * 3 / 1e9) / t:.2f}")
+nres = np.sum(2 * na**2 - 3 * nb + nc + 1.2)
+nt = time() - t0
+print(f"Time to evaluate with NumPy: {nt * 1000 :.4f} ms", end=" ")
+print(f"Speed (GB/s): {(na.nbytes * 3 / 1e9) / nt:.2f}")
+print("Result:", res, "Mean:", res / (N * N))
+print(f"Speedup Blosc2 vs NumPy: {nt / t:.2f}x")
+assert np.allclose(res, nres)
 
 t0 = time()
-res = ne.evaluate("sum(2 * na**2 - 3 * nb + nc + 1.2)")
-t = time() - t0
-print(f"Time to evaluate with NumExpr: {t * 1000 :.4f} ms", end=" ")
-print(f"Speed (GB/s): {(na.nbytes / 1e9) / t:.2f}")
+neres = ne.evaluate("sum(2 * na**2 - 3 * nb + nc + 1.2)")
+net = time() - t0
+print(f"Time to evaluate with NumExpr: {net * 1000 :.4f} ms", end=" ")
+print(f"Speed (GB/s): {(na.nbytes * 3 / 1e9) / net:.2f}")
+print("Result:", res, "Mean:", res / (N * N))
+print(f"Speedup Blosc2 vs NumExpr: {net / t:.2f}x")
diff --git a/bench/ndarray/miniexpr-reduct-sum.py b/bench/ndarray/miniexpr-reduct-sum.py
new file mode 100644
index 00000000..8714dc66
--- /dev/null
+++ b/bench/ndarray/miniexpr-reduct-sum.py
@@ -0,0 +1,42 @@
+from time import time
+import blosc2
+import numpy as np
+import numexpr as ne
+
+N = 10_000
+# dtype= np.int32
+dtype= np.float32
+# dtype= np.float64
+cparams = blosc2.CParams(codec=blosc2.Codec.BLOSCLZ, clevel=1)
+
+t0 = time()
+# a = blosc2.ones((N, N), dtype=dtype, cparams=cparams)
+# a = blosc2.arange(np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
+a = blosc2.linspace(0., 1., np.prod((N, N)), shape=(N, N), dtype=dtype, cparams=cparams)
+print(f"Time to create data: {(time() - t0) * 1000 :.4f} ms")
+
+t0 = time()
+res = blosc2.sum(a)
+t = time() - t0
+print(f"Time to evaluate: {t * 1000 :.4f} ms", end=" ")
+print(f"Speed (GB/s): {(a.nbytes / 1e9) / t:.2f}")
+print("Result:", res, "Mean:", res / (N * N))
+
+na = a[:]
+
+t0 = time()
+nres = np.sum(na)
+nt = time() - t0
+print(f"Time to evaluate with NumPy: {nt * 1000 :.4f} ms", end=" ")
+print(f"Speed (GB/s): {(na.nbytes / 1e9) / nt:.2f}")
+print("Result:", res, "Mean:", res / (N * N))
+print(f"Speedup Blosc2 vs NumPy: {nt / t:.2f}x")
+assert np.allclose(res, nres)
+
+t0 = time()
+neres = ne.evaluate("sum(na)")
+net = time() - t0
+print(f"Time to evaluate with NumExpr: {net * 1000 :.4f} ms", end=" ")
+print(f"Speed (GB/s): {(na.nbytes / 1e9) / net:.2f}")
+print("Result:", res, "Mean:", res / (N * N))
+print(f"Speedup Blosc2 vs NumExpr: {net / t:.2f}x")
diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 53d7ebe7..bbca201d 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -1435,6 +1435,7 @@ cdef class SChunk:
         cdef int size
         cdef int32_t len_chunk = <int32_t> (buf.len + BLOSC2_MAX_OVERHEAD)
         cdef uint8_t* chunk = <uint8_t*> malloc(len_chunk)
+        self.schunk.current_nchunk = nchunk  # prefilter needs this value to be set
         if RELEASEGIL:
             with nogil:
                 # No need to create another cctx
@@ -1473,6 +1474,7 @@ cdef class SChunk:
         cdef int size
         cdef int32_t len_chunk = <int32_t> (buf.len + BLOSC2_MAX_OVERHEAD)
         cdef uint8_t* chunk = <uint8_t*> malloc(len_chunk)
+        self.schunk.current_nchunk = nchunk  # prefilter needs this value to be set
         if RELEASEGIL:
             with nogil:
                 size = blosc2_compress_ctx(self.schunk.cctx, buf.buf, <int32_t> buf.len, chunk, len_chunk)
@@ -1495,6 +1497,22 @@ cdef class SChunk:
             raise RuntimeError("Could not update the desired chunk")
         return rc
 
+    # This is used internally for prefiltering
+    def _prefilter_data(self, nchunk, data, chunk_data):
+        cdef Py_buffer buf
+        PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE)
+        cdef Py_buffer chunk_buf
+        PyObject_GetBuffer(chunk_data, &chunk_buf, PyBUF_SIMPLE)
+        self.schunk.current_nchunk = nchunk  # prefilter needs this value to be set
+        cdef int size = blosc2_compress_ctx(self.schunk.cctx, buf.buf, <int32_t> buf.len, chunk_buf.buf, chunk_buf.len)
+        PyBuffer_Release(&buf)
+        PyBuffer_Release(&chunk_buf)
+        if size < 0:
+            raise RuntimeError("Could not compress the data")
+        elif size == 0:
+            raise RuntimeError("The result could not fit ")
+        return size
+
     def get_slice(self, start=0, stop=None, out=None):
         cdef int64_t nitems = self.schunk.nbytes // self.schunk.typesize
         start, stop, _ = slice(start, stop, 1).indices(nitems)
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 58bc9263..eafa793a 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1302,15 +1302,16 @@ def fast_eval(  # noqa: C901
 
     if use_miniexpr:
         cparams = kwargs.pop("cparams", blosc2.CParams())
-        # Use the same chunks/blocks as the input operands for consistency
-        res_eval = blosc2.empty(shape, dtype, chunks=chunks, blocks=blocks, cparams=cparams, **kwargs)
+        # All values will be overwritten, so we can use an uninitialized array
+        res_eval = blosc2.uninit(shape, dtype, chunks=chunks, blocks=blocks, cparams=cparams, **kwargs)
         try:
             # print("expr->miniexpr:", expression)
             res_eval._set_pref_expr(expression, operands)
-            # This line would NOT allocate physical RAM on any modern OS:
-            aux = np.empty(res_eval.shape, res_eval.dtype)
-            # Physical allocation happens here (when writing):
-            res_eval[...] = aux
+            # Data to compress is fetched from operands, so it can be uninitialized here
+            data = np.empty(res_eval.schunk.chunksize, dtype=np.uint8)
+            # Exercise prefilter for each chunk
+            for nchunk in range(res_eval.schunk.nchunks):
+                res_eval.schunk.update_data(nchunk, data, copy=False)
         except Exception:
             use_miniexpr = False
         finally:
@@ -1935,6 +1936,7 @@ def reduce_slices(  # noqa: C901
     blosc2_arrs = tuple(o for o in operands.values() if hasattr(o, "chunks"))
     fast_path = False
     all_ndarray = False
+    any_persisted = False
     chunks = None
     blocks = None
     if blosc2_arrs:  # fast path only relevant if there are blosc2 arrays
@@ -1946,7 +1948,8 @@ def reduce_slices(  # noqa: C901
         same_chunks = all(operand.chunks == o.chunks for o in operands.values() if hasattr(o, "chunks"))
         same_blocks = all(operand.blocks == o.blocks for o in operands.values() if hasattr(o, "blocks"))
         fast_path = same_shape and same_chunks and same_blocks and (0 not in operand.chunks)
-        aligned, iter_disk = dict.fromkeys(operands.keys(), False), False
+        aligned = dict.fromkeys(operands.keys(), False)
+        iter_disk = False
         if fast_path:
             chunks = operand.chunks
             blocks = operand.blocks
@@ -1984,66 +1987,59 @@ def reduce_slices(  # noqa: C901
         del temp
 
     # miniexpr reduction path only supported for some cases so far
-    if where is None and fast_path and all_ndarray and reduced_shape == ():
-        if reduce_op in (ReduceOp.ARGMAX, ReduceOp.ARGMIN):
-            use_miniexpr = False  # not supported yet
-        elif len(operands) < 2:
-            # This is supported, but performance is generally worse than manual chunked evaluation
-            # Determining the exact number of operands that gives better performance is tricky;
-            # for example, apple silicon CPUs seem to benefit from miniexpr starting with 3 operands,
-            # whereas Intel CPUs seem to do better with just 2 operands.
-            # TODO: more benchmarks needed
-            use_miniexpr = False
-        # Only this case is supported so far
-        if use_miniexpr:
-            for op in operands.values():
-                # Only NDArray in-memory operands
-                if not (isinstance(op, blosc2.NDArray) and op.urlpath is None and out is None):
-                    use_miniexpr = False
-                    break
-                # Check that partitions are well-behaved (no padding)
-                if not blosc2.are_partitions_behaved(op.shape, op.chunks, op.blocks):
-                    use_miniexpr = False
-                    break
+    if not (where is None and fast_path and all_ndarray and not any_persisted and reduced_shape == ()):
+        use_miniexpr = False
 
-        if use_miniexpr:
-            cparams = kwargs.pop("cparams", blosc2.CParams())
-            # print(f"cparams: {cparams}")
-            # Use the same chunks/blocks as the input operands for consistency
-            res_eval = blosc2.empty(shape, dtype, chunks=chunks, blocks=blocks, cparams=cparams, **kwargs)
-            # Compute the number of blocks in the result
-            nblocks = res_eval.nbytes // res_eval.blocksize
-            aux_reduc = np.empty(nblocks, dtype=dtype)
-            try:
-                print("expr->miniexpr:", expression, reduce_op)
-                if reduce_op_str is None:
-                    use_miniexpr = False
-                expression = f"{reduce_op_str}({expression})"
-                res_eval._set_pref_expr(expression, operands, aux_reduc)
-                # This line would NOT allocate physical RAM on any modern OS:
-                aux = np.empty(res_eval.shape, res_eval.dtype)
-                # Physical allocation happens here (when writing):
-                res_eval[...] = aux
-            except Exception:
+    # Some reductions are not supported yet in miniexpr
+    if reduce_op in (ReduceOp.ARGMAX, ReduceOp.ARGMIN):
+        use_miniexpr = False
+
+    # Only behaved partitions are supported in miniexpr reductions
+    if use_miniexpr:
+        for op in operands.values():
+            # Check that partitions are well-behaved (no padding)
+            if not blosc2.are_partitions_behaved(op.shape, op.chunks, op.blocks):
                 use_miniexpr = False
-            finally:
-                res_eval.schunk.remove_prefilter("miniexpr")
-                global iter_chunks
-                # Ensure any background reading thread is closed
-                iter_chunks = None
-
-            if not use_miniexpr:
-                # If miniexpr failed, fallback to regular evaluation
-                # (continue to the manual chunked evaluation below)
-                pass
+                break
+
+    if use_miniexpr:
+        # Experiments say that not splitting is best (at least on Apple Silicon M4 Pro)
+        cparams = kwargs.pop("cparams", blosc2.CParams(splitmode=blosc2.SplitMode.NEVER_SPLIT))
+        # Create a fake NDArray just to drive the miniexpr evaluation (values won't be used)
+        res_eval = blosc2.uninit(shape, dtype, chunks=chunks, blocks=blocks, cparams=cparams, **kwargs)
+        # Compute the number of blocks in the result
+        nblocks = res_eval.nbytes // res_eval.blocksize
+        aux_reduc = np.empty(nblocks, dtype=dtype)
+        try:
+            # print("expr->miniexpr:", expression, reduce_op)
+            expression = f"{reduce_op_str}({expression})"
+            res_eval._set_pref_expr(expression, operands, aux_reduc)
+            # Data won't even try to be compressed, so buffers can be unitialized and reused
+            data = np.empty(res_eval.schunk.chunksize, dtype=np.uint8)
+            chunk_data = np.empty(res_eval.schunk.chunksize + blosc2.MAX_OVERHEAD, dtype=np.uint8)
+            # Exercise prefilter for each chunk
+            for nchunk in range(res_eval.schunk.nchunks):
+                res_eval.schunk._prefilter_data(nchunk, data, chunk_data)
+        except Exception:
+            use_miniexpr = False
+        finally:
+            res_eval.schunk.remove_prefilter("miniexpr")
+            global iter_chunks
+            # Ensure any background reading thread is closed
+            iter_chunks = None
+
+        if not use_miniexpr:
+            # If miniexpr failed, fallback to regular evaluation
+            # (continue to the manual chunked evaluation below)
+            pass
+        else:
+            if reduce_op == ReduceOp.ANY:
+                result = np.any(aux_reduc, **reduce_args)
+            elif reduce_op == ReduceOp.ALL:
+                result = np.all(aux_reduc, **reduce_args)
             else:
-                if reduce_op == ReduceOp.ANY:
-                    result = np.any(aux_reduc, **reduce_args)
-                elif reduce_op == ReduceOp.ALL:
-                    result = np.all(aux_reduc, **reduce_args)
-                else:
-                    result = reduce_op.value.reduce(aux_reduc, **reduce_args)
-                return result
+                result = reduce_op.value.reduce(aux_reduc, **reduce_args)
+            return result
 
     # Iterate over the operands and get the chunks
     chunk_operands = {}

From 07ccba9569b4511e6e5babea01fe3e0ec0e82836 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 8 Jan 2026 13:21:32 +0100
Subject: [PATCH 074/123] Use the new version of miniexpr as a library

---
 CMakeLists.txt              |   19 +-
 src/blosc2/miniexpr.c       | 6581 -----------------------------------
 src/blosc2/miniexpr.h       |  225 --
 src/blosc2/miniexpr_numpy.h |  157 -
 4 files changed, 15 insertions(+), 6967 deletions(-)
 delete mode 100755 src/blosc2/miniexpr.c
 delete mode 100644 src/blosc2/miniexpr.h
 delete mode 100644 src/blosc2/miniexpr_numpy.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7a9fc29b..54d6f537 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,14 +40,25 @@ add_custom_command(
           "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/blosc2_ext.pyx" --output-file blosc2_ext.c
   DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/blosc2_ext.pyx"
   VERBATIM)
+
 # ...and add it to the target
-Python_add_library(blosc2_ext MODULE blosc2_ext.c
-                   "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/miniexpr.c" WITH_SOABI)
+Python_add_library(blosc2_ext MODULE blosc2_ext.c WITH_SOABI)
+
 # We need to link against NumPy
 target_link_libraries(blosc2_ext PRIVATE Python::NumPy)
 
-# Add include directory for miniexpr.h and others
-target_include_directories(blosc2_ext PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2")
+# Fetch and build miniexpr library
+include(FetchContent)
+
+FetchContent_Declare(miniexpr
+    GIT_REPOSITORY https://github.com/Blosc/miniexpr.git
+    GIT_TAG 3e0ad9f2800cfb46729da88553a9228845eaa731  # latest SIMD additions
+)
+FetchContent_MakeAvailable(miniexpr)
+
+# Link against miniexpr static library
+target_link_libraries(blosc2_ext PRIVATE miniexpr_static)
+
 target_compile_features(blosc2_ext PRIVATE c_std_11)
 if(WIN32 AND CMAKE_C_COMPILER_ID STREQUAL "Clang")
     execute_process(
diff --git a/src/blosc2/miniexpr.c b/src/blosc2/miniexpr.c
deleted file mode 100755
index 728bbbc5..00000000
--- a/src/blosc2/miniexpr.c
+++ /dev/null
@@ -1,6581 +0,0 @@
-/*********************************************************************
-  Blosc - Blocked Shuffling and Compression Library
-
-  Copyright (c) 2025  Blosc Development Team <blosc@blosc.org>
-  https://blosc.org
-  License: BSD 3-Clause (see LICENSE.txt)
-
-  See LICENSE.txt for details about copyright and rights to use.
-**********************************************************************/
-
-// Loosely based on https://github.com/CodePlea/tinyexpr. License follows:
-// SPDX-License-Identifier: Zlib
-/*
- * TINYEXPR - Tiny recursive descent parser and evaluation engine in C
- *
- * Copyright (c) 2015-2020 Lewis Van Winkle
- *
- * http://CodePlea.com
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgement in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* COMPILE TIME OPTIONS */
-
-/* Exponentiation associativity:
-For a**b**c = (a**b)**c and -a**b = (-a)**b do nothing.
-For a**b**c = a**(b**c) and -a**b = -(a**b) uncomment the next line.*/
-/* #define ME_POW_FROM_RIGHT */
-
-/* Logarithms
-For log = natural log do nothing (NumPy compatible)
-For log = base 10 log comment the next line. */
-#define ME_NAT_LOG
-
-#include "miniexpr.h"
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include <stdio.h>
-#include <ctype.h>
-#include <limits.h>
-#include <stdint.h>
-#include <stdbool.h>
-#if defined(__SSE2__) || defined(__SSE__) || defined(__AVX__) || defined(__AVX2__)
-#include <immintrin.h>
-#endif
-#if defined(__ARM_NEON) || defined(__ARM_NEON__)
-#include <arm_neon.h>
-#endif
-#if defined(_MSC_VER) && !defined(__clang__)
-#define IVDEP
-#else
-#define IVDEP _Pragma("GCC ivdep")
-#endif
-
-#include <complex.h>
-
-#if defined(_MSC_VER) && !defined(__clang__)
-#define float_complex _Fcomplex
-#define double_complex _Dcomplex
-// And it doesn't support standard operators for them in C
-static inline _Fcomplex add_c64(_Fcomplex a, _Fcomplex b) {
-    return _FCbuild(crealf(a) + crealf(b), cimagf(a) + cimagf(b));
-}
-static inline _Fcomplex sub_c64(_Fcomplex a, _Fcomplex b) {
-    return _FCbuild(crealf(a) - crealf(b), cimagf(a) - cimagf(b));
-}
-static inline _Fcomplex neg_c64(_Fcomplex a) { return _FCbuild(-crealf(a), -cimagf(a)); }
-static inline _Fcomplex mul_c64(_Fcomplex a, _Fcomplex b) {
-    return _FCbuild(crealf(a) * crealf(b) - cimagf(a) * cimagf(b), crealf(a) * cimagf(b) + cimagf(a) * crealf(b));
-}
-static inline _Fcomplex div_c64(_Fcomplex a, _Fcomplex b) {
-    float denom = crealf(b) * crealf(b) + cimagf(b) * cimagf(b);
-    return _FCbuild((crealf(a) * crealf(b) + cimagf(a) * cimagf(b)) / denom,
-                    (cimagf(a) * crealf(b) - crealf(a) * cimagf(b)) / denom);
-}
-static inline _Dcomplex add_c128(_Dcomplex a, _Dcomplex b) { return _Cbuild(creal(a) + creal(b), cimag(a) + cimag(b)); }
-static inline _Dcomplex sub_c128(_Dcomplex a, _Dcomplex b) { return _Cbuild(creal(a) - creal(b), cimag(a) - cimag(b)); }
-static inline _Dcomplex neg_c128(_Dcomplex a) { return _Cbuild(-creal(a), -cimag(a)); }
-static inline _Dcomplex mul_c128(_Dcomplex a, _Dcomplex b) {
-    return _Cbuild(creal(a) * creal(b) - cimag(a) * cimag(b), creal(a) * cimag(b) + cimag(a) * creal(b));
-}
-static inline _Dcomplex div_c128(_Dcomplex a, _Dcomplex b) {
-    double denom = creal(b) * creal(b) + cimag(b) * cimag(b);
-    return _Cbuild((creal(a) * creal(b) + cimag(a) * cimag(b)) / denom,
-                   (cimag(a) * creal(b) - creal(a) * cimag(b)) / denom);
-}
-#else
-#define float_complex float _Complex
-#define double_complex double _Complex
-#define add_c64(a, b) ((a) + (b))
-#define sub_c64(a, b) ((a) - (b))
-#define neg_c64(a) (-(a))
-#define mul_c64(a, b) ((a) * (b))
-#define div_c64(a, b) ((a) / (b))
-#define add_c128(a, b) ((a) + (b))
-#define sub_c128(a, b) ((a) - (b))
-#define neg_c128(a) (-(a))
-#define mul_c128(a, b) ((a) * (b))
-#define div_c128(a, b) ((a) / (b))
-#endif
-
-#if defined(_MSC_VER) && !defined(__clang__)
-/* Wrappers for complex functions to handle MSVC's _Fcomplex/_Dcomplex */
-static inline float _Complex me_cpowf(float _Complex a, float _Complex b) {
-    union {
-        float _Complex c;
-        _Fcomplex m;
-    } ua, ub, ur;
-    ua.c = a;
-    ub.c = b;
-    ur.m = cpowf(ua.m, ub.m);
-    return ur.c;
-}
-static inline double _Complex me_cpow(double _Complex a, double _Complex b) {
-    union {
-        double _Complex c;
-        _Dcomplex m;
-    } ua, ub, ur;
-    ua.c = a;
-    ub.c = b;
-    ur.m = cpow(ua.m, ub.m);
-    return ur.c;
-}
-static inline float _Complex me_csqrtf(float _Complex a) {
-    union {
-        float _Complex c;
-        _Fcomplex m;
-    } ua, ur;
-    ua.c = a;
-    ur.m = csqrtf(ua.m);
-    return ur.c;
-}
-static inline double _Complex me_csqrt(double _Complex a) {
-    union {
-        double _Complex c;
-        _Dcomplex m;
-    } ua, ur;
-    ua.c = a;
-    ur.m = csqrt(ua.m);
-    return ur.c;
-}
-static inline float _Complex me_cexpf(float _Complex a) {
-    union {
-        float _Complex c;
-        _Fcomplex m;
-    } ua, ur;
-    ua.c = a;
-    ur.m = cexpf(ua.m);
-    return ur.c;
-}
-static inline double _Complex me_cexp(double _Complex a) {
-    union {
-        double _Complex c;
-        _Dcomplex m;
-    } ua, ur;
-    ua.c = a;
-    ur.m = cexp(ua.m);
-    return ur.c;
-}
-static inline float _Complex me_clogf(float _Complex a) {
-    union {
-        float _Complex c;
-        _Fcomplex m;
-    } ua, ur;
-    ua.c = a;
-    ur.m = clogf(ua.m);
-    return ur.c;
-}
-static inline double _Complex me_clog(double _Complex a) {
-    union {
-        double _Complex c;
-        _Dcomplex m;
-    } ua, ur;
-    ua.c = a;
-    ur.m = clog(ua.m);
-    return ur.c;
-}
-static inline float me_cabsf(float _Complex a) {
-    union {
-        float _Complex c;
-        _Fcomplex m;
-    } ua;
-    ua.c = a;
-    return cabsf(ua.m);
-}
-static inline double me_cabs(double _Complex a) {
-    union {
-        double _Complex c;
-        _Dcomplex m;
-    } ua;
-    ua.c = a;
-    return cabs(ua.m);
-}
-static inline float me_cimagf(float _Complex a) {
-    union {
-        float _Complex c;
-        _Fcomplex m;
-    } ua;
-    ua.c = a;
-    return cimagf(ua.m);
-}
-static inline double me_cimag(double _Complex a) {
-    union {
-        double _Complex c;
-        _Dcomplex m;
-    } ua;
-    ua.c = a;
-    return cimag(ua.m);
-}
-static inline float me_crealf(float _Complex a) {
-    union {
-        float _Complex c;
-        _Fcomplex m;
-    } ua;
-    ua.c = a;
-    return crealf(ua.m);
-}
-static inline double me_creal(double _Complex a) {
-    union {
-        double _Complex c;
-        _Dcomplex m;
-    } ua;
-    ua.c = a;
-    return creal(ua.m);
-}
-static inline float _Complex me_conjf(float _Complex a) {
-    union {
-        float _Complex c;
-        _Fcomplex m;
-    } ua, ur;
-    ua.c = a;
-    ur.m = conjf(ua.m);
-    return ur.c;
-}
-static inline double _Complex me_conj(double _Complex a) {
-    union {
-        double _Complex c;
-        _Dcomplex m;
-    } ua, ur;
-    ua.c = a;
-    ur.m = conj(ua.m);
-    return ur.c;
-}
-#else
-#if defined(_MSC_VER) && defined(__clang__)
-#define me_cimagf __builtin_cimagf
-#define me_cimag __builtin_cimag
-#define me_crealf __builtin_crealf
-#define me_creal __builtin_creal
-#define me_conjf __builtin_conjf
-#define me_conj __builtin_conj
-#define me_cpowf __builtin_cpowf
-#define me_cpow __builtin_cpow
-#define me_csqrtf __builtin_csqrtf
-#define me_csqrt __builtin_csqrt
-#define me_cexpf __builtin_cexpf
-#define me_cexp __builtin_cexp
-#define me_clogf __builtin_clogf
-#define me_clog __builtin_clog
-#define me_cabsf __builtin_cabsf
-#define me_cabs __builtin_cabs
-#else
-#define me_cpowf cpowf
-#define me_cpow cpow
-#define me_csqrtf csqrtf
-#define me_csqrt csqrt
-#define me_cexpf cexpf
-#define me_cexp cexp
-#define me_clogf clogf
-#define me_clog clog
-#define me_cabsf cabsf
-#define me_cabs cabs
-#define me_cimagf cimagf
-#define me_cimag cimag
-#define me_crealf crealf
-#define me_creal creal
-#define me_conjf conjf
-#define me_conj conj
-#endif
-#endif
-
-/* Type-specific cast and comparison macros to handle MSVC complex structs */
-#define TO_TYPE_bool(x) (bool)(x)
-#define TO_TYPE_i8(x) (int8_t)(x)
-#define TO_TYPE_i16(x) (int16_t)(x)
-#define TO_TYPE_i32(x) (int32_t)(x)
-#define TO_TYPE_i64(x) (int64_t)(x)
-#define TO_TYPE_u8(x) (uint8_t)(x)
-#define TO_TYPE_u16(x) (uint16_t)(x)
-#define TO_TYPE_u32(x) (uint32_t)(x)
-#define TO_TYPE_u64(x) (uint64_t)(x)
-#define TO_TYPE_f32(x) (float)(x)
-#define TO_TYPE_f64(x) (double)(x)
-
-#define FROM_TYPE_bool(x) (double)(x)
-#define FROM_TYPE_i8(x) (double)(x)
-#define FROM_TYPE_i16(x) (double)(x)
-#define FROM_TYPE_i32(x) (double)(x)
-#define FROM_TYPE_i64(x) (double)(x)
-#define FROM_TYPE_u8(x) (double)(x)
-#define FROM_TYPE_u16(x) (double)(x)
-#define FROM_TYPE_u32(x) (double)(x)
-#define FROM_TYPE_u64(x) (double)(x)
-#define FROM_TYPE_f32(x) (double)(x)
-#define FROM_TYPE_f64(x) (double)(x)
-
-#define IS_NONZERO_bool(x) (x)
-#define IS_NONZERO_i8(x) ((x) != 0)
-#define IS_NONZERO_i16(x) ((x) != 0)
-#define IS_NONZERO_i32(x) ((x) != 0)
-#define IS_NONZERO_i64(x) ((x) != 0)
-#define IS_NONZERO_u8(x) ((x) != 0)
-#define IS_NONZERO_u16(x) ((x) != 0)
-#define IS_NONZERO_u32(x) ((x) != 0)
-#define IS_NONZERO_u64(x) ((x) != 0)
-#define IS_NONZERO_f32(x) ((x) != 0.0f)
-#define IS_NONZERO_f64(x) ((x) != 0.0)
-
-#if defined(_MSC_VER) && !defined(__clang__)
-#define TO_TYPE_c64(x) _FCbuild((float)(x), 0.0f)
-#define TO_TYPE_c128(x) _Cbuild((double)(x), 0.0)
-#define FROM_TYPE_c64(x) (double)crealf(x)
-#define FROM_TYPE_c128(x) (double)creal(x)
-#define IS_NONZERO_c64(x) (crealf(x) != 0.0f || cimagf(x) != 0.0f)
-#define IS_NONZERO_c128(x) (creal(x) != 0.0 || cimag(x) != 0.0)
-
-/* Helper macros for complex-to-complex conversions */
-#define CONV_c64_to_c128(x) _Cbuild((double)crealf(x), (double)cimagf(x))
-#define TO_TYPE_c128_from_c64(x) CONV_c64_to_c128(x)
-#else
-#define TO_TYPE_c64(x) (float_complex)(x)
-#define TO_TYPE_c128(x) (double_complex)(x)
-#define FROM_TYPE_c64(x) (double)me_crealf(x)
-#define FROM_TYPE_c128(x) (double)me_creal(x)
-#define IS_NONZERO_c64(x) (me_crealf(x) != 0.0f || me_cimagf(x) != 0.0f)
-#define IS_NONZERO_c128(x) (me_creal(x) != 0.0 || me_cimag(x) != 0.0)
-#define TO_TYPE_c128_from_c64(x) (double_complex)(x)
-#endif
-
-#include <assert.h>
-
-#ifndef NAN
-#define NAN (0.0/0.0)
-#endif
-
-#ifndef INFINITY
-#define INFINITY (1.0/0.0)
-#endif
-
-
-typedef double (*me_fun2)(double, double);
-
-#if defined(_WIN32) || defined(_WIN64)
-static bool has_complex_node(const me_expr* n);
-static bool has_complex_input(const me_expr* n);
-#endif
-
-enum {
-    TOK_NULL = ME_CLOSURE7 + 1, TOK_ERROR, TOK_END, TOK_SEP,
-    TOK_OPEN, TOK_CLOSE, TOK_NUMBER, TOK_VARIABLE, TOK_INFIX,
-    TOK_BITWISE, TOK_SHIFT, TOK_COMPARE, TOK_POW
-};
-
-/* Internal definition of me_expr (opaque to users) */
-struct me_expr {
-    int type;
-
-    union {
-        double value;
-        const void* bound;
-        const void* function;
-    };
-
-    /* Vector operation info */
-    void* output; // Generic pointer (can be float* or double*)
-    int nitems;
-    me_dtype dtype; // Data type for this expression (result type after promotion)
-    me_dtype input_dtype; // Original input type (for variables/constants)
-    /* Bytecode info (for fused evaluation) */
-    void* bytecode; // Pointer to compiled bytecode
-    int ncode; // Number of instructions
-    void* parameters[1]; // Must be last (flexible array member)
-};
-
-
-/* Type promotion table following NumPy rules */
-/* Note: ME_AUTO (0) should never appear in type promotion, so we index from 1 */
-static const me_dtype type_promotion_table[13][13] = {
-    /* Rows: left operand, Columns: right operand */
-    /* BOOL,  INT8,    INT16,   INT32,   INT64,   UINT8,   UINT16,  UINT32,  UINT64,  FLOAT32, FLOAT64, COMPLEX64, COMPLEX128 */
-    {
-        ME_BOOL, ME_INT8, ME_INT16, ME_INT32, ME_INT64, ME_UINT8, ME_UINT16, ME_UINT32, ME_UINT64, ME_FLOAT32,
-        ME_FLOAT64, ME_COMPLEX64, ME_COMPLEX128
-    }, /* BOOL */
-    {
-        ME_INT8, ME_INT8, ME_INT16, ME_INT32, ME_INT64, ME_INT16, ME_INT32, ME_INT64, ME_FLOAT64, ME_FLOAT32,
-        ME_FLOAT64, ME_COMPLEX64, ME_COMPLEX128
-    }, /* INT8 */
-    {
-        ME_INT16, ME_INT16, ME_INT16, ME_INT32, ME_INT64, ME_INT32, ME_INT32, ME_INT64, ME_FLOAT64, ME_FLOAT32,
-        ME_FLOAT64, ME_COMPLEX64, ME_COMPLEX128
-    }, /* INT16 */
-    {
-        ME_INT32, ME_INT32, ME_INT32, ME_INT32, ME_INT64, ME_INT64, ME_INT64, ME_INT64, ME_FLOAT64, ME_FLOAT64,
-        ME_FLOAT64, ME_COMPLEX128, ME_COMPLEX128
-    }, /* INT32 */
-    {
-        ME_INT64, ME_INT64, ME_INT64, ME_INT64, ME_INT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64,
-        ME_FLOAT64, ME_COMPLEX128, ME_COMPLEX128
-    }, /* INT64 */
-    {
-        ME_UINT8, ME_INT16, ME_INT32, ME_INT64, ME_FLOAT64, ME_UINT8, ME_UINT16, ME_UINT32, ME_UINT64, ME_FLOAT32,
-        ME_FLOAT64, ME_COMPLEX64, ME_COMPLEX128
-    }, /* UINT8 */
-    {
-        ME_UINT16, ME_INT32, ME_INT32, ME_INT64, ME_FLOAT64, ME_UINT16, ME_UINT16, ME_UINT32, ME_UINT64, ME_FLOAT32,
-        ME_FLOAT64, ME_COMPLEX64, ME_COMPLEX128
-    }, /* UINT16 */
-    {
-        ME_UINT32, ME_INT64, ME_INT64, ME_INT64, ME_FLOAT64, ME_UINT32, ME_UINT32, ME_UINT32, ME_UINT64, ME_FLOAT64,
-        ME_FLOAT64, ME_COMPLEX128, ME_COMPLEX128
-    }, /* UINT32 */
-    {
-        ME_UINT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64, ME_UINT64, ME_UINT64, ME_UINT64, ME_UINT64,
-        ME_FLOAT64, ME_FLOAT64, ME_COMPLEX128, ME_COMPLEX128
-    }, /* UINT64 */
-    {
-        ME_FLOAT32, ME_FLOAT32, ME_FLOAT32, ME_FLOAT64, ME_FLOAT64, ME_FLOAT32, ME_FLOAT32, ME_FLOAT64, ME_FLOAT64,
-        ME_FLOAT32, ME_FLOAT64, ME_COMPLEX64, ME_COMPLEX128
-    }, /* FLOAT32 */
-    {
-        ME_FLOAT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64, ME_FLOAT64,
-        ME_FLOAT64, ME_FLOAT64, ME_COMPLEX128, ME_COMPLEX128
-    }, /* FLOAT64 */
-    {
-        ME_COMPLEX64, ME_COMPLEX64, ME_COMPLEX64, ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX64, ME_COMPLEX64,
-        ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX64, ME_COMPLEX128, ME_COMPLEX64, ME_COMPLEX128
-    }, /* COMPLEX64 */
-    {
-        ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX128,
-        ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX128, ME_COMPLEX128
-    } /* COMPLEX128 */
-};
-
-/* Promote two types according to NumPy rules */
-static me_dtype promote_types(me_dtype a, me_dtype b) {
-    // ME_AUTO should have been resolved during compilation
-    if (a == ME_AUTO || b == ME_AUTO) {
-        fprintf(stderr, "FATAL: ME_AUTO in type promotion (a=%d, b=%d). This is a bug.\n", a, b);
-#ifdef NDEBUG
-        abort(); // Release build: terminate immediately
-#else
-        assert(0 && "ME_AUTO should be resolved during compilation"); // Debug: trigger debugger
-#endif
-    }
-
-    // Adjust indices since table starts at ME_BOOL (index 1), not ME_AUTO (index 0)
-    int a_idx = a - 1;
-    int b_idx = b - 1;
-    if (a_idx >= 0 && a_idx < 13 && b_idx >= 0 && b_idx < 13) {
-        return type_promotion_table[a_idx][b_idx];
-    }
-    fprintf(stderr, "WARNING: Invalid dtype in type promotion (a=%d, b=%d). Falling back to FLOAT64.\n", a, b);
-    return ME_FLOAT64; // Fallback for out-of-range types
-}
-
-static bool is_integer_dtype(me_dtype dt) {
-    return dt >= ME_INT8 && dt <= ME_UINT64;
-}
-
-static bool is_float_dtype(me_dtype dt) {
-    return dt == ME_FLOAT32 || dt == ME_FLOAT64;
-}
-
-static bool is_complex_dtype(me_dtype dt) {
-    return dt == ME_COMPLEX64 || dt == ME_COMPLEX128;
-}
-
-static double sum_reduce(double x);
-static double prod_reduce(double x);
-static double any_reduce(double x);
-static double all_reduce(double x);
-
-static me_dtype reduction_output_dtype(me_dtype dt, const void* func) {
-    if (func == (void*)any_reduce || func == (void*)all_reduce) {
-        return ME_BOOL;
-    }
-    if (func == (void*)sum_reduce || func == (void*)prod_reduce) {
-        if (dt == ME_BOOL) {
-            return ME_INT64;
-        }
-        if (dt >= ME_UINT8 && dt <= ME_UINT64) {
-            return ME_UINT64;
-        }
-        if (dt >= ME_INT8 && dt <= ME_INT64) {
-            return ME_INT64;
-        }
-    }
-    return dt;
-}
-
-/* Get size of a type in bytes */
-static size_t dtype_size(me_dtype dtype) {
-    switch (dtype) {
-    case ME_BOOL: return sizeof(bool);
-    case ME_INT8: return sizeof(int8_t);
-    case ME_INT16: return sizeof(int16_t);
-    case ME_INT32: return sizeof(int32_t);
-    case ME_INT64: return sizeof(int64_t);
-    case ME_UINT8: return sizeof(uint8_t);
-    case ME_UINT16: return sizeof(uint16_t);
-    case ME_UINT32: return sizeof(uint32_t);
-    case ME_UINT64: return sizeof(uint64_t);
-    case ME_FLOAT32: return sizeof(float);
-    case ME_FLOAT64: return sizeof(double);
-    case ME_COMPLEX64: return sizeof(float _Complex);
-    case ME_COMPLEX128: return sizeof(double _Complex);
-    default: return 0;
-    }
-}
-
-
-enum { ME_CONSTANT = 1 };
-
-
-typedef struct state {
-    const char* start;
-    const char* next;
-    int type;
-
-    union {
-        double value;
-        const double* bound;
-        const void* function;
-    };
-
-    void* context;
-    me_dtype dtype; // Type of current token
-    me_dtype target_dtype; // Target dtype for the overall expression
-
-    const me_variable* lookup;
-    int lookup_len;
-} state;
-
-
-#define TYPE_MASK(TYPE) ((TYPE)&0x0000001F)
-
-#define IS_PURE(TYPE) (((TYPE) & ME_FLAG_PURE) != 0)
-#define IS_FUNCTION(TYPE) (((TYPE) & ME_FUNCTION0) != 0)
-#define IS_CLOSURE(TYPE) (((TYPE) & ME_CLOSURE0) != 0)
-#define ARITY(TYPE) ( ((TYPE) & (ME_FUNCTION0 | ME_CLOSURE0)) ? ((TYPE) & 0x00000007) : 0 )
-#define NEW_EXPR(type, ...) new_expr((type), (const me_expr*[]){__VA_ARGS__})
-#define CHECK_NULL(ptr, ...) if ((ptr) == NULL) { __VA_ARGS__; return NULL; }
-
-/* Forward declarations */
-static me_expr* new_expr(const int type, const me_expr* parameters[]);
-static me_dtype infer_output_type(const me_expr* n);
-static void private_eval(const me_expr* n);
-static void eval_reduction(const me_expr* n, int output_nitems);
-static double conj_wrapper(double x);
-static double imag_wrapper(double x);
-static double real_wrapper(double x);
-static double round_wrapper(double x);
-static double sum_reduce(double x);
-static double prod_reduce(double x);
-static double any_reduce(double x);
-static double all_reduce(double x);
-static double min_reduce(double x);
-static double max_reduce(double x);
-static double sign(double x);
-static double square(double x);
-static double trunc_wrapper(double x);
-static double where_scalar(double c, double x, double y);
-
-static bool is_reduction_function(const void* func) {
-    return func == (void*)sum_reduce || func == (void*)prod_reduce ||
-        func == (void*)min_reduce || func == (void*)max_reduce ||
-        func == (void*)any_reduce || func == (void*)all_reduce;
-}
-
-static bool is_reduction_node(const me_expr* n) {
-    return n && IS_FUNCTION(n->type) && ARITY(n->type) == 1 &&
-        is_reduction_function(n->function);
-}
-
-static bool contains_reduction(const me_expr* n) {
-    if (!n) return false;
-    if (is_reduction_node(n)) return true;
-
-    switch (TYPE_MASK(n->type)) {
-    case ME_FUNCTION0:
-    case ME_FUNCTION1:
-    case ME_FUNCTION2:
-    case ME_FUNCTION3:
-    case ME_FUNCTION4:
-    case ME_FUNCTION5:
-    case ME_FUNCTION6:
-    case ME_FUNCTION7:
-    case ME_CLOSURE0:
-    case ME_CLOSURE1:
-    case ME_CLOSURE2:
-    case ME_CLOSURE3:
-    case ME_CLOSURE4:
-    case ME_CLOSURE5:
-    case ME_CLOSURE6:
-    case ME_CLOSURE7:
-        {
-            const int arity = ARITY(n->type);
-            for (int i = 0; i < arity; i++) {
-                if (contains_reduction((const me_expr*)n->parameters[i])) {
-                    return true;
-                }
-            }
-            return false;
-        }
-    default:
-        return false;
-    }
-}
-
-static bool reduction_usage_is_valid(const me_expr* n) {
-    if (!n) return true;
-    if (is_reduction_node(n)) {
-        me_expr* arg = (me_expr*)n->parameters[0];
-        if (!arg) return false;
-        if (contains_reduction(arg)) return false;
-        me_dtype arg_type = infer_output_type(arg);
-        if (n->function == (void*)min_reduce || n->function == (void*)max_reduce) {
-            if (arg_type == ME_COMPLEX64 || arg_type == ME_COMPLEX128) {
-                return false;
-            }
-        }
-        return true;
-    }
-
-    switch (TYPE_MASK(n->type)) {
-    case ME_FUNCTION0:
-    case ME_FUNCTION1:
-    case ME_FUNCTION2:
-    case ME_FUNCTION3:
-    case ME_FUNCTION4:
-    case ME_FUNCTION5:
-    case ME_FUNCTION6:
-    case ME_FUNCTION7:
-    case ME_CLOSURE0:
-    case ME_CLOSURE1:
-    case ME_CLOSURE2:
-    case ME_CLOSURE3:
-    case ME_CLOSURE4:
-    case ME_CLOSURE5:
-    case ME_CLOSURE6:
-    case ME_CLOSURE7:
-        {
-            const int arity = ARITY(n->type);
-            for (int i = 0; i < arity; i++) {
-                if (!reduction_usage_is_valid((const me_expr*)n->parameters[i])) {
-                    return false;
-                }
-            }
-            return true;
-        }
-    default:
-        return true;
-    }
-}
-
-/* Infer computation type from expression tree (for evaluation) */
-static me_dtype infer_result_type(const me_expr* n) {
-    if (!n) return ME_FLOAT64;
-
-    switch (TYPE_MASK(n->type)) {
-    case ME_CONSTANT:
-        return n->dtype;
-
-    case ME_VARIABLE:
-        return n->dtype;
-
-    case ME_FUNCTION0:
-    case ME_FUNCTION1:
-    case ME_FUNCTION2:
-    case ME_FUNCTION3:
-    case ME_FUNCTION4:
-    case ME_FUNCTION5:
-    case ME_FUNCTION6:
-    case ME_FUNCTION7:
-    case ME_CLOSURE0:
-    case ME_CLOSURE1:
-    case ME_CLOSURE2:
-    case ME_CLOSURE3:
-    case ME_CLOSURE4:
-    case ME_CLOSURE5:
-    case ME_CLOSURE6:
-    case ME_CLOSURE7:
-        {
-            if (is_reduction_node(n)) {
-                me_dtype param_type = infer_result_type((const me_expr*)n->parameters[0]);
-                return reduction_output_dtype(param_type, n->function);
-            }
-            // Special case: imag() and real() return real type from complex input
-            if (IS_FUNCTION(n->type) && ARITY(n->type) == 1) {
-                if (n->function == (void*)imag_wrapper || n->function == (void*)real_wrapper) {
-                    me_dtype param_type = infer_result_type((const me_expr*)n->parameters[0]);
-                    if (param_type == ME_COMPLEX64) {
-                        return ME_FLOAT32;
-                    }
-                    else if (param_type == ME_COMPLEX128) {
-                        return ME_FLOAT64;
-                    }
-                    // If input is not complex, return as-is (shouldn't happen, but be safe)
-                    return param_type;
-                }
-            }
-
-            // For comparisons with ME_BOOL output, we still need to infer the
-            // computation type from operands (e.g., float64 for float inputs).
-            // Don't return ME_BOOL early - let the operand types determine
-            // the computation type.
-
-            const int arity = ARITY(n->type);
-            me_dtype result = ME_BOOL;
-
-            for (int i = 0; i < arity; i++) {
-                me_dtype param_type = infer_result_type((const me_expr*)n->parameters[i]);
-                result = promote_types(result, param_type);
-            }
-
-            return result;
-        }
-    }
-
-    return ME_FLOAT64;
-}
-
-/* Infer logical output type from expression tree (for compilation with ME_AUTO) */
-static me_dtype infer_output_type(const me_expr* n) {
-    if (!n) return ME_FLOAT64;
-
-    switch (TYPE_MASK(n->type)) {
-    case ME_CONSTANT:
-        return n->dtype;
-
-    case ME_VARIABLE:
-        return n->dtype;
-
-    case ME_FUNCTION0:
-    case ME_FUNCTION1:
-    case ME_FUNCTION2:
-    case ME_FUNCTION3:
-    case ME_FUNCTION4:
-    case ME_FUNCTION5:
-    case ME_FUNCTION6:
-    case ME_FUNCTION7:
-    case ME_CLOSURE0:
-    case ME_CLOSURE1:
-    case ME_CLOSURE2:
-    case ME_CLOSURE3:
-    case ME_CLOSURE4:
-    case ME_CLOSURE5:
-    case ME_CLOSURE6:
-    case ME_CLOSURE7:
-        {
-            if (is_reduction_node(n)) {
-                me_dtype param_type = infer_output_type((const me_expr*)n->parameters[0]);
-                return reduction_output_dtype(param_type, n->function);
-            }
-            // Special case: imag() and real() return real type from complex input
-            if (IS_FUNCTION(n->type) && ARITY(n->type) == 1) {
-                if (n->function == (void*)imag_wrapper || n->function == (void*)real_wrapper) {
-                    me_dtype param_type = infer_output_type((const me_expr*)n->parameters[0]);
-                    if (param_type == ME_COMPLEX64) {
-                        return ME_FLOAT32;
-                    }
-                    else if (param_type == ME_COMPLEX128) {
-                        return ME_FLOAT64;
-                    }
-                    // If input is not complex, return as-is (shouldn't happen, but be safe)
-                    return param_type;
-                }
-            }
-
-            // Special case: where(cond, x, y) -> promote(x, y), regardless of cond type.
-            if (IS_FUNCTION(n->type) && ARITY(n->type) == 3 &&
-                n->function == (void*)where_scalar) {
-                me_dtype x_type = infer_output_type((const me_expr*)n->parameters[1]);
-                me_dtype y_type = infer_output_type((const me_expr*)n->parameters[2]);
-                return promote_types(x_type, y_type);
-            }
-
-            // If this node is a comparison (dtype == ME_BOOL set during parsing),
-            // the output type is ME_BOOL
-            if (n->dtype == ME_BOOL) {
-                return ME_BOOL;
-            }
-
-            // Otherwise, infer from operands
-            const int arity = ARITY(n->type);
-            me_dtype result = ME_BOOL;
-
-            for (int i = 0; i < arity; i++) {
-                me_dtype param_type = infer_output_type((const me_expr*)n->parameters[i]);
-                result = promote_types(result, param_type);
-            }
-
-            return result;
-        }
-    }
-
-    return ME_FLOAT64;
-}
-
-/* Apply type promotion to a binary operation node */
-static me_expr* create_conversion_node(me_expr* source, me_dtype target_dtype) {
-    /* Create a unary conversion node that converts source to target_dtype */
-    me_expr* conv = NEW_EXPR(ME_FUNCTION1 | ME_FLAG_PURE, source);
-    if (conv) {
-        conv->function = NULL; // Mark as conversion
-        conv->dtype = target_dtype;
-        conv->input_dtype = source->dtype;
-    }
-    return conv;
-}
-
-static void apply_type_promotion(me_expr* node) {
-    if (!node || ARITY(node->type) < 2) return;
-
-    me_expr* left = (me_expr*)node->parameters[0];
-    me_expr* right = (me_expr*)node->parameters[1];
-
-    if (left && right) {
-        me_dtype left_type = left->dtype;
-        me_dtype right_type = right->dtype;
-        me_dtype promoted = promote_types(left_type, right_type);
-
-        // Store the promoted output type
-        node->dtype = promoted;
-
-        // TODO: Conversion nodes not fully implemented yet
-        // See TYPE_PROMOTION_IMPLEMENTATION.md for details
-        /*
-        // Insert conversion nodes if needed
-        if (left_type != promoted) {
-            me_expr *conv_left = creame_conversion_node(left, promoted);
-            if (conv_left) {
-                node->parameters[0] = conv_left;
-            }
-        }
-
-        if (right_type != promoted) {
-            me_expr *conv_right = creame_conversion_node(right, promoted);
-            if (conv_right) {
-                node->parameters[1] = conv_right;
-            }
-        }
-        */
-    }
-}
-
-static me_expr* new_expr(const int type, const me_expr* parameters[]) {
-    const int arity = ARITY(type);
-    const int psize = sizeof(void*) * arity;
-    const int size = (sizeof(me_expr) - sizeof(void*)) + psize + (IS_CLOSURE(type) ? sizeof(void*) : 0);
-    me_expr* ret = malloc(size);
-    CHECK_NULL(ret);
-
-    memset(ret, 0, size);
-    if (arity && parameters) {
-        memcpy(ret->parameters, parameters, psize);
-    }
-    ret->type = type;
-    ret->bound = 0;
-    ret->output = NULL;
-    ret->nitems = 0;
-    ret->dtype = ME_FLOAT64; // Default to double
-    ret->bytecode = NULL;
-    ret->ncode = 0;
-    return ret;
-}
-
-
-void me_free_parameters(me_expr* n) {
-    if (!n) return;
-    switch (TYPE_MASK(n->type)) {
-    case ME_FUNCTION7:
-    case ME_CLOSURE7:
-        if (n->parameters[6] && ((me_expr*)n->parameters[6])->output &&
-            ((me_expr*)n->parameters[6])->output != n->output) {
-            free(((me_expr*)n->parameters[6])->output);
-        }
-        me_free(n->parameters[6]);
-    case ME_FUNCTION6:
-    case ME_CLOSURE6:
-        if (n->parameters[5] && ((me_expr*)n->parameters[5])->output &&
-            ((me_expr*)n->parameters[5])->output != n->output) {
-            free(((me_expr*)n->parameters[5])->output);
-        }
-        me_free(n->parameters[5]);
-    case ME_FUNCTION5:
-    case ME_CLOSURE5:
-        if (n->parameters[4] && ((me_expr*)n->parameters[4])->output &&
-            ((me_expr*)n->parameters[4])->output != n->output) {
-            free(((me_expr*)n->parameters[4])->output);
-        }
-        me_free(n->parameters[4]);
-    case ME_FUNCTION4:
-    case ME_CLOSURE4:
-        if (n->parameters[3] && ((me_expr*)n->parameters[3])->output &&
-            ((me_expr*)n->parameters[3])->output != n->output) {
-            free(((me_expr*)n->parameters[3])->output);
-        }
-        me_free(n->parameters[3]);
-    case ME_FUNCTION3:
-    case ME_CLOSURE3:
-        if (n->parameters[2] && ((me_expr*)n->parameters[2])->output &&
-            ((me_expr*)n->parameters[2])->output != n->output) {
-            free(((me_expr*)n->parameters[2])->output);
-        }
-        me_free(n->parameters[2]);
-    case ME_FUNCTION2:
-    case ME_CLOSURE2:
-        if (n->parameters[1] && ((me_expr*)n->parameters[1])->output &&
-            ((me_expr*)n->parameters[1])->output != n->output) {
-            free(((me_expr*)n->parameters[1])->output);
-        }
-        me_free(n->parameters[1]);
-    case ME_FUNCTION1:
-    case ME_CLOSURE1:
-        if (n->parameters[0] && ((me_expr*)n->parameters[0])->output &&
-            ((me_expr*)n->parameters[0])->output != n->output) {
-            free(((me_expr*)n->parameters[0])->output);
-        }
-        me_free(n->parameters[0]);
-    }
-}
-
-
-void me_free(me_expr* n) {
-    if (!n) return;
-    me_free_parameters(n);
-    if (n->bytecode) {
-        free(n->bytecode);
-    }
-    free(n);
-}
-
-
-static double pi(void) { return 3.14159265358979323846; }
-static double e(void) { return 2.71828182845904523536; }
-
-/* Wrapper for expm1: exp(x) - 1, more accurate for small x */
-static double expm1_wrapper(double x) { return expm1(x); }
-
-/* Wrapper for log1p: log(1 + x), more accurate for small x */
-static double log1p_wrapper(double x) { return log1p(x); }
-
-/* Wrapper for log2: base-2 logarithm */
-static double log2_wrapper(double x) { return log2(x); }
-
-/* logaddexp: log(exp(a) + exp(b)), numerically stable */
-static double logaddexp(double a, double b) {
-    if (a == b) {
-        return a + log1p(1.0); // log(2*exp(a)) = a + log(2)
-    }
-    double max_val = (a > b) ? a : b;
-    double min_val = (a > b) ? b : a;
-    return max_val + log1p(exp(min_val - max_val));
-}
-
-/* Forward declarations for complex operations */
-/* (Already declared above) */
-
-/* Wrapper functions for complex operations (for function pointer compatibility) */
-/* These are placeholders - actual implementation is in vector functions */
-static double conj_wrapper(double x) { return x; }
-
-static double imag_wrapper(double x) {
-    (void)x;
-    return 0.0;
-}
-
-/* Wrapper for round: round to nearest integer */
-static double round_wrapper(double x) { return round(x); }
-
-/* sign: returns -1.0, 0.0, or 1.0 based on sign of x */
-static double sign(double x) {
-    if (x > 0.0) return 1.0;
-    if (x < 0.0) return -1.0;
-    return 0.0;
-}
-
-/* square: x * x */
-static double square(double x) { return x * x; }
-
-/* Wrapper for trunc: truncate towards zero */
-static double trunc_wrapper(double x) { return trunc(x); }
-
-/* Scalar helper for where(), used only in generic slow path */
-static double where_scalar(double c, double x, double y) {
-    return (c != 0.0) ? x : y;
-}
-
-static double real_wrapper(double x) { return x; }
-
-static double fac(double a) {
-    /* simplest version of fac */
-    if (a < 0.0)
-        return NAN;
-    if (a > UINT_MAX)
-        return INFINITY;
-    unsigned int ua = (unsigned int)(a);
-    unsigned long int result = 1, i;
-    for (i = 1; i <= ua; i++) {
-        if (i > ULONG_MAX / result)
-            return INFINITY;
-        result *= i;
-    }
-    return (double)result;
-}
-
-static double ncr(double n, double r) {
-    if (n < 0.0 || r < 0.0 || n < r) return NAN;
-    if (n > UINT_MAX || r > UINT_MAX) return INFINITY;
-    unsigned long int un = (unsigned int)(n), ur = (unsigned int)(r), i;
-    unsigned long int result = 1;
-    if (ur > un / 2) ur = un - ur;
-    for (i = 1; i <= ur; i++) {
-        if (result > ULONG_MAX / (un - ur + i))
-            return INFINITY;
-        result *= un - ur + i;
-        result /= i;
-    }
-    return result;
-}
-
-static double npr(double n, double r) { return ncr(n, r) * fac(r); }
-
-#ifdef _MSC_VER
-#pragma function (ceil)
-#pragma function (floor)
-#endif
-
-static const me_variable functions[] = {
-    /* must be in alphabetical order */
-    /* Format: {name, dtype, address, type, context} */
-    {"abs", 0, fabs, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"acos", 0, acos, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"acosh", 0, acosh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"all", 0, all_reduce, ME_FUNCTION1, 0},
-    {"any", 0, any_reduce, ME_FUNCTION1, 0},
-    {"arccos", 0, acos, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"arccosh", 0, acosh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"arcsin", 0, asin, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"arcsinh", 0, asinh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"arctan", 0, atan, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"arctan2", 0, atan2, ME_FUNCTION2 | ME_FLAG_PURE, 0},
-    {"arctanh", 0, atanh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"asin", 0, asin, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"asinh", 0, asinh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"atan", 0, atan, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"atan2", 0, atan2, ME_FUNCTION2 | ME_FLAG_PURE, 0},
-    {"atanh", 0, atanh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"ceil", 0, ceil, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"conj", 0, conj_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"cos", 0, cos, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"cosh", 0, cosh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"e", 0, e, ME_FUNCTION0 | ME_FLAG_PURE, 0},
-    {"exp", 0, exp, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"expm1", 0, expm1_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"fac", 0, fac, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"floor", 0, floor, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"imag", 0, imag_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"ln", 0, log, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-#ifdef ME_NAT_LOG
-    {"log", 0, log, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-#else
-    {"log", 0, log10, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-#endif
-    {"log10", 0, log10, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"log1p", 0, log1p_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"log2", 0, log2_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"logaddexp", 0, logaddexp, ME_FUNCTION2 | ME_FLAG_PURE, 0},
-    {"max", 0, max_reduce, ME_FUNCTION1, 0},
-    {"min", 0, min_reduce, ME_FUNCTION1, 0},
-    {"ncr", 0, ncr, ME_FUNCTION2 | ME_FLAG_PURE, 0},
-    {"npr", 0, npr, ME_FUNCTION2 | ME_FLAG_PURE, 0},
-    {"pi", 0, pi, ME_FUNCTION0 | ME_FLAG_PURE, 0},
-    {"pow", 0, pow, ME_FUNCTION2 | ME_FLAG_PURE, 0},
-    {"prod", 0, prod_reduce, ME_FUNCTION1, 0},
-    {"real", 0, real_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"round", 0, round_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"sign", 0, sign, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"sin", 0, sin, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"sinh", 0, sinh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"sqrt", 0, sqrt, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"square", 0, square, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"sum", 0, sum_reduce, ME_FUNCTION1, 0},
-    {"tan", 0, tan, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"tanh", 0, tanh, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"trunc", 0, trunc_wrapper, ME_FUNCTION1 | ME_FLAG_PURE, 0},
-    {"where", 0, where_scalar, ME_FUNCTION3 | ME_FLAG_PURE, 0},
-    {0, 0, 0, 0, 0}
-};
-
-static const me_variable* find_builtin(const char* name, int len) {
-    int imin = 0;
-    int imax = sizeof(functions) / sizeof(me_variable) - 2;
-
-    /*Binary search.*/
-    while (imax >= imin) {
-        const int i = (imin + ((imax - imin) / 2));
-        int c = strncmp(name, functions[i].name, len);
-        if (!c) c = '\0' - functions[i].name[len];
-        if (c == 0) {
-            return functions + i;
-        }
-        else if (c > 0) {
-            imin = i + 1;
-        }
-        else {
-            imax = i - 1;
-        }
-    }
-
-    return 0;
-}
-
-static const me_variable* find_lookup(const state* s, const char* name, int len) {
-    int iters;
-    const me_variable* var;
-    if (!s->lookup) return 0;
-
-    for (var = s->lookup, iters = s->lookup_len; iters; ++var, --iters) {
-        if (strncmp(name, var->name, len) == 0 && var->name[len] == '\0') {
-            return var;
-        }
-    }
-    return 0;
-}
-
-
-static double add(double a, double b) { return a + b; }
-static double sub(double a, double b) { return a - b; }
-static double mul(double a, double b) { return a * b; }
-static double divide(double a, double b) { return a / b; }
-static double negate(double a) { return -a; }
-static volatile double sum_salt = 0.0;
-static volatile double prod_salt = 1.0;
-static volatile double min_salt = 0.0;
-static volatile double max_salt = 0.0;
-static volatile double any_salt = 0.0;
-static volatile double all_salt = 0.0;
-static double sum_reduce(double x) { return x + sum_salt; }
-static double prod_reduce(double x) { return x * prod_salt; }
-static double any_reduce(double x) { return x + any_salt; }
-static double all_reduce(double x) { return x * (1.0 + all_salt); }
-static double min_reduce(double x) { return x + min_salt; }
-static double max_reduce(double x) { return x - max_salt; }
-
-static float reduce_min_float32_nan_safe(const float* data, int nitems) {
-    if (nitems <= 0) return INFINITY;
-#if defined(__AVX__) || defined(__AVX2__)
-    int i = 0;
-    __m256 vmin = _mm256_set1_ps(INFINITY);
-    __m256 vnan = _mm256_setzero_ps();
-    const int limit = nitems & ~7;
-    for (; i < limit; i += 8) {
-        __m256 v = _mm256_loadu_ps(data + i);
-        vnan = _mm256_or_ps(vnan, _mm256_cmp_ps(v, v, _CMP_UNORD_Q));
-        vmin = _mm256_min_ps(vmin, v);
-    }
-    __m128 low = _mm256_castps256_ps128(vmin);
-    __m128 high = _mm256_extractf128_ps(vmin, 1);
-    __m128 min128 = _mm_min_ps(low, high);
-    __m128 tmp = _mm_min_ps(min128, _mm_movehl_ps(min128, min128));
-    tmp = _mm_min_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1));
-    float acc = _mm_cvtss_f32(tmp);
-    if (_mm256_movemask_ps(vnan)) return NAN;
-    for (; i < nitems; i++) {
-        float v = data[i];
-        if (v != v) return v;
-        if (v < acc) acc = v;
-    }
-    return acc;
-#elif defined(__SSE__)
-    int i = 0;
-    __m128 vmin = _mm_set1_ps(INFINITY);
-    __m128 vnan = _mm_setzero_ps();
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        __m128 v = _mm_loadu_ps(data + i);
-        vnan = _mm_or_ps(vnan, _mm_cmpunord_ps(v, v));
-        vmin = _mm_min_ps(vmin, v);
-    }
-    __m128 tmp = _mm_min_ps(vmin, _mm_movehl_ps(vmin, vmin));
-    tmp = _mm_min_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1));
-    float acc = _mm_cvtss_f32(tmp);
-    if (_mm_movemask_ps(vnan)) return NAN;
-    for (; i < nitems; i++) {
-        float v = data[i];
-        if (v != v) return v;
-        if (v < acc) acc = v;
-    }
-    return acc;
-#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
-    int i = 0;
-    float32x4_t vmin = vdupq_n_f32(INFINITY);
-    uint32x4_t vnan = vdupq_n_u32(0);
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        float32x4_t v = vld1q_f32(data + i);
-        uint32x4_t eq = vceqq_f32(v, v);
-        vnan = vorrq_u32(vnan, vmvnq_u32(eq));
-        vmin = vminq_f32(vmin, v);
-    }
-#if defined(__aarch64__)
-    float acc = vminvq_f32(vmin);
-#else
-    float32x2_t min2 = vmin_f32(vget_low_f32(vmin), vget_high_f32(vmin));
-    min2 = vpmin_f32(min2, min2);
-    float acc = vget_lane_f32(min2, 0);
-#endif
-    uint32x2_t nan2 = vorr_u32(vget_low_u32(vnan), vget_high_u32(vnan));
-    nan2 = vpadd_u32(nan2, nan2);
-    if (vget_lane_u32(nan2, 0)) return NAN;
-    for (; i < nitems; i++) {
-        float v = data[i];
-        if (v != v) return v;
-        if (v < acc) acc = v;
-    }
-    return acc;
-#else
-    float acc = data[0];
-    for (int i = 0; i < nitems; i++) {
-        float v = data[i];
-        if (v != v) return v;
-        if (v < acc) acc = v;
-    }
-    return acc;
-#endif
-}
-
-static float reduce_max_float32_nan_safe(const float* data, int nitems) {
-    if (nitems <= 0) return -INFINITY;
-#if defined(__AVX__) || defined(__AVX2__)
-    int i = 0;
-    __m256 vmax = _mm256_set1_ps(-INFINITY);
-    __m256 vnan = _mm256_setzero_ps();
-    const int limit = nitems & ~7;
-    for (; i < limit; i += 8) {
-        __m256 v = _mm256_loadu_ps(data + i);
-        vnan = _mm256_or_ps(vnan, _mm256_cmp_ps(v, v, _CMP_UNORD_Q));
-        vmax = _mm256_max_ps(vmax, v);
-    }
-    __m128 low = _mm256_castps256_ps128(vmax);
-    __m128 high = _mm256_extractf128_ps(vmax, 1);
-    __m128 max128 = _mm_max_ps(low, high);
-    __m128 tmp = _mm_max_ps(max128, _mm_movehl_ps(max128, max128));
-    tmp = _mm_max_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1));
-    float acc = _mm_cvtss_f32(tmp);
-    if (_mm256_movemask_ps(vnan)) return NAN;
-    for (; i < nitems; i++) {
-        float v = data[i];
-        if (v != v) return v;
-        if (v > acc) acc = v;
-    }
-    return acc;
-#elif defined(__SSE__)
-    int i = 0;
-    __m128 vmax = _mm_set1_ps(-INFINITY);
-    __m128 vnan = _mm_setzero_ps();
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        __m128 v = _mm_loadu_ps(data + i);
-        vnan = _mm_or_ps(vnan, _mm_cmpunord_ps(v, v));
-        vmax = _mm_max_ps(vmax, v);
-    }
-    __m128 tmp = _mm_max_ps(vmax, _mm_movehl_ps(vmax, vmax));
-    tmp = _mm_max_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1));
-    float acc = _mm_cvtss_f32(tmp);
-    if (_mm_movemask_ps(vnan)) return NAN;
-    for (; i < nitems; i++) {
-        float v = data[i];
-        if (v != v) return v;
-        if (v > acc) acc = v;
-    }
-    return acc;
-#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
-    int i = 0;
-    float32x4_t vmax = vdupq_n_f32(-INFINITY);
-    uint32x4_t vnan = vdupq_n_u32(0);
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        float32x4_t v = vld1q_f32(data + i);
-        uint32x4_t eq = vceqq_f32(v, v);
-        vnan = vorrq_u32(vnan, vmvnq_u32(eq));
-        vmax = vmaxq_f32(vmax, v);
-    }
-#if defined(__aarch64__)
-    float acc = vmaxvq_f32(vmax);
-#else
-    float32x2_t max2 = vmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
-    max2 = vpmax_f32(max2, max2);
-    float acc = vget_lane_f32(max2, 0);
-#endif
-    uint32x2_t nan2 = vorr_u32(vget_low_u32(vnan), vget_high_u32(vnan));
-    nan2 = vpadd_u32(nan2, nan2);
-    if (vget_lane_u32(nan2, 0)) return NAN;
-    for (; i < nitems; i++) {
-        float v = data[i];
-        if (v != v) return v;
-        if (v > acc) acc = v;
-    }
-    return acc;
-#else
-    float acc = data[0];
-    for (int i = 0; i < nitems; i++) {
-        float v = data[i];
-        if (v != v) return v;
-        if (v > acc) acc = v;
-    }
-    return acc;
-#endif
-}
-
-static double reduce_min_float64_nan_safe(const double* data, int nitems) {
-    if (nitems <= 0) return INFINITY;
-#if defined(__AVX__) || defined(__AVX2__)
-    int i = 0;
-    __m256d vmin = _mm256_set1_pd(INFINITY);
-    __m256d vnan = _mm256_setzero_pd();
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        __m256d v = _mm256_loadu_pd(data + i);
-        vnan = _mm256_or_pd(vnan, _mm256_cmp_pd(v, v, _CMP_UNORD_Q));
-        vmin = _mm256_min_pd(vmin, v);
-    }
-    __m128d low = _mm256_castpd256_pd128(vmin);
-    __m128d high = _mm256_extractf128_pd(vmin, 1);
-    __m128d min128 = _mm_min_pd(low, high);
-    min128 = _mm_min_sd(min128, _mm_unpackhi_pd(min128, min128));
-    double acc = _mm_cvtsd_f64(min128);
-    if (_mm256_movemask_pd(vnan)) return NAN;
-    for (; i < nitems; i++) {
-        double v = data[i];
-        if (v != v) return v;
-        if (v < acc) acc = v;
-    }
-    return acc;
-#elif defined(__SSE2__)
-    int i = 0;
-    __m128d vmin = _mm_set1_pd(INFINITY);
-    __m128d vnan = _mm_setzero_pd();
-    const int limit = nitems & ~1;
-    for (; i < limit; i += 2) {
-        __m128d v = _mm_loadu_pd(data + i);
-        vnan = _mm_or_pd(vnan, _mm_cmpunord_pd(v, v));
-        vmin = _mm_min_pd(vmin, v);
-    }
-    vmin = _mm_min_sd(vmin, _mm_unpackhi_pd(vmin, vmin));
-    double acc = _mm_cvtsd_f64(vmin);
-    if (_mm_movemask_pd(vnan)) return NAN;
-    for (; i < nitems; i++) {
-        double v = data[i];
-        if (v != v) return v;
-        if (v < acc) acc = v;
-    }
-    return acc;
-#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__aarch64__)
-    int i = 0;
-    float64x2_t vmin = vdupq_n_f64(INFINITY);
-    uint64x2_t vnan = vdupq_n_u64(0);
-    const int limit = nitems & ~1;
-    for (; i < limit; i += 2) {
-        float64x2_t v = vld1q_f64(data + i);
-        uint64x2_t eq = vceqq_f64(v, v);
-        vnan = vorrq_u64(vnan, veorq_u64(eq, vdupq_n_u64(~0ULL)));
-        vmin = vminq_f64(vmin, v);
-    }
-    double acc = vminvq_f64(vmin);
-    uint64x2_t nan_or = vorrq_u64(vnan, vextq_u64(vnan, vnan, 1));
-    if (vgetq_lane_u64(nan_or, 0)) return NAN;
-    for (; i < nitems; i++) {
-        double v = data[i];
-        if (v != v) return v;
-        if (v < acc) acc = v;
-    }
-    return acc;
-#else
-    double acc = data[0];
-    for (int i = 0; i < nitems; i++) {
-        double v = data[i];
-        if (v != v) return v;
-        if (v < acc) acc = v;
-    }
-    return acc;
-#endif
-}
-
-static double reduce_max_float64_nan_safe(const double* data, int nitems) {
-    if (nitems <= 0) return -INFINITY;
-#if defined(__AVX__) || defined(__AVX2__)
-    int i = 0;
-    __m256d vmax = _mm256_set1_pd(-INFINITY);
-    __m256d vnan = _mm256_setzero_pd();
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        __m256d v = _mm256_loadu_pd(data + i);
-        vnan = _mm256_or_pd(vnan, _mm256_cmp_pd(v, v, _CMP_UNORD_Q));
-        vmax = _mm256_max_pd(vmax, v);
-    }
-    __m128d low = _mm256_castpd256_pd128(vmax);
-    __m128d high = _mm256_extractf128_pd(vmax, 1);
-    __m128d max128 = _mm_max_pd(low, high);
-    max128 = _mm_max_sd(max128, _mm_unpackhi_pd(max128, max128));
-    double acc = _mm_cvtsd_f64(max128);
-    if (_mm256_movemask_pd(vnan)) return NAN;
-    for (; i < nitems; i++) {
-        double v = data[i];
-        if (v != v) return v;
-        if (v > acc) acc = v;
-    }
-    return acc;
-#elif defined(__SSE2__)
-    int i = 0;
-    __m128d vmax = _mm_set1_pd(-INFINITY);
-    __m128d vnan = _mm_setzero_pd();
-    const int limit = nitems & ~1;
-    for (; i < limit; i += 2) {
-        __m128d v = _mm_loadu_pd(data + i);
-        vnan = _mm_or_pd(vnan, _mm_cmpunord_pd(v, v));
-        vmax = _mm_max_pd(vmax, v);
-    }
-    vmax = _mm_max_sd(vmax, _mm_unpackhi_pd(vmax, vmax));
-    double acc = _mm_cvtsd_f64(vmax);
-    if (_mm_movemask_pd(vnan)) return NAN;
-    for (; i < nitems; i++) {
-        double v = data[i];
-        if (v != v) return v;
-        if (v > acc) acc = v;
-    }
-    return acc;
-#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__aarch64__)
-    int i = 0;
-    float64x2_t vmax = vdupq_n_f64(-INFINITY);
-    uint64x2_t vnan = vdupq_n_u64(0);
-    const int limit = nitems & ~1;
-    for (; i < limit; i += 2) {
-        float64x2_t v = vld1q_f64(data + i);
-        uint64x2_t eq = vceqq_f64(v, v);
-        vnan = vorrq_u64(vnan, veorq_u64(eq, vdupq_n_u64(~0ULL)));
-        vmax = vmaxq_f64(vmax, v);
-    }
-    double acc = vmaxvq_f64(vmax);
-    uint64x2_t nan_or = vorrq_u64(vnan, vextq_u64(vnan, vnan, 1));
-    if (vgetq_lane_u64(nan_or, 0)) return NAN;
-    for (; i < nitems; i++) {
-        double v = data[i];
-        if (v != v) return v;
-        if (v > acc) acc = v;
-    }
-    return acc;
-#else
-    double acc = data[0];
-    for (int i = 0; i < nitems; i++) {
-        double v = data[i];
-        if (v != v) return v;
-        if (v > acc) acc = v;
-    }
-    return acc;
-#endif
-}
-
-static int32_t reduce_min_int32(const int32_t* data, int nitems) {
-    if (nitems <= 0) return INT32_MAX;
-#if defined(__AVX2__)
-    int i = 0;
-    __m256i vmin = _mm256_set1_epi32(INT32_MAX);
-    const int limit = nitems & ~7;
-    for (; i < limit; i += 8) {
-        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
-        vmin = _mm256_min_epi32(vmin, v);
-    }
-    int32_t tmp[8];
-    _mm256_storeu_si256((__m256i*)tmp, vmin);
-    int32_t acc = tmp[0];
-    for (int j = 1; j < 8; j++) {
-        if (tmp[j] < acc) acc = tmp[j];
-    }
-    for (; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-#elif defined(__SSE4_1__)
-    int i = 0;
-    __m128i vmin = _mm_set1_epi32(INT32_MAX);
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        __m128i v = _mm_loadu_si128((const __m128i*)(data + i));
-        vmin = _mm_min_epi32(vmin, v);
-    }
-    int32_t tmp[4];
-    _mm_storeu_si128((__m128i*)tmp, vmin);
-    int32_t acc = tmp[0];
-    for (int j = 1; j < 4; j++) {
-        if (tmp[j] < acc) acc = tmp[j];
-    }
-    for (; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
-    int i = 0;
-    int32x4_t vmin = vdupq_n_s32(INT32_MAX);
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        int32x4_t v = vld1q_s32(data + i);
-        vmin = vminq_s32(vmin, v);
-    }
-#if defined(__aarch64__)
-    int32_t acc = vminvq_s32(vmin);
-#else
-    int32x2_t min2 = vmin_s32(vget_low_s32(vmin), vget_high_s32(vmin));
-    min2 = vpmin_s32(min2, min2);
-    int32_t acc = vget_lane_s32(min2, 0);
-#endif
-    for (; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-#else
-    int32_t acc = data[0];
-    for (int i = 1; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-#endif
-}
-
-static int32_t reduce_max_int32(const int32_t* data, int nitems) {
-    if (nitems <= 0) return INT32_MIN;
-#if defined(__AVX2__)
-    int i = 0;
-    __m256i vmax = _mm256_set1_epi32(INT32_MIN);
-    const int limit = nitems & ~7;
-    for (; i < limit; i += 8) {
-        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
-        vmax = _mm256_max_epi32(vmax, v);
-    }
-    int32_t tmp[8];
-    _mm256_storeu_si256((__m256i*)tmp, vmax);
-    int32_t acc = tmp[0];
-    for (int j = 1; j < 8; j++) {
-        if (tmp[j] > acc) acc = tmp[j];
-    }
-    for (; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-#elif defined(__SSE4_1__)
-    int i = 0;
-    __m128i vmax = _mm_set1_epi32(INT32_MIN);
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        __m128i v = _mm_loadu_si128((const __m128i*)(data + i));
-        vmax = _mm_max_epi32(vmax, v);
-    }
-    int32_t tmp[4];
-    _mm_storeu_si128((__m128i*)tmp, vmax);
-    int32_t acc = tmp[0];
-    for (int j = 1; j < 4; j++) {
-        if (tmp[j] > acc) acc = tmp[j];
-    }
-    for (; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
-    int i = 0;
-    int32x4_t vmax = vdupq_n_s32(INT32_MIN);
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        int32x4_t v = vld1q_s32(data + i);
-        vmax = vmaxq_s32(vmax, v);
-    }
-#if defined(__aarch64__)
-    int32_t acc = vmaxvq_s32(vmax);
-#else
-    int32x2_t max2 = vmax_s32(vget_low_s32(vmax), vget_high_s32(vmax));
-    max2 = vpmax_s32(max2, max2);
-    int32_t acc = vget_lane_s32(max2, 0);
-#endif
-    for (; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-#else
-    int32_t acc = data[0];
-    for (int i = 1; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-#endif
-}
-
-static int8_t reduce_min_int8(const int8_t* data, int nitems) {
-    if (nitems <= 0) return INT8_MAX;
-#if defined(__AVX2__)
-    int i = 0;
-    __m256i vmin = _mm256_set1_epi8(INT8_MAX);
-    const int limit = nitems & ~31;
-    for (; i < limit; i += 32) {
-        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
-        vmin = _mm256_min_epi8(vmin, v);
-    }
-    int8_t tmp[32];
-    _mm256_storeu_si256((__m256i*)tmp, vmin);
-    int8_t acc = tmp[0];
-    for (int j = 1; j < 32; j++) {
-        if (tmp[j] < acc) acc = tmp[j];
-    }
-    for (; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
-    int i = 0;
-    int8x16_t vmin = vdupq_n_s8(INT8_MAX);
-    const int limit = nitems & ~15;
-    for (; i < limit; i += 16) {
-        int8x16_t v = vld1q_s8(data + i);
-        vmin = vminq_s8(vmin, v);
-    }
-#if defined(__aarch64__)
-    int8_t acc = vminvq_s8(vmin);
-#else
-    int8x8_t min8 = vmin_s8(vget_low_s8(vmin), vget_high_s8(vmin));
-    min8 = vpmin_s8(min8, min8);
-    min8 = vpmin_s8(min8, min8);
-    int8_t acc = vget_lane_s8(min8, 0);
-#endif
-    for (; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-#else
-    int8_t acc = data[0];
-    for (int i = 1; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-#endif
-}
-
-static int8_t reduce_max_int8(const int8_t* data, int nitems) {
-    if (nitems <= 0) return INT8_MIN;
-#if defined(__AVX2__)
-    int i = 0;
-    __m256i vmax = _mm256_set1_epi8(INT8_MIN);
-    const int limit = nitems & ~31;
-    for (; i < limit; i += 32) {
-        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
-        vmax = _mm256_max_epi8(vmax, v);
-    }
-    int8_t tmp[32];
-    _mm256_storeu_si256((__m256i*)tmp, vmax);
-    int8_t acc = tmp[0];
-    for (int j = 1; j < 32; j++) {
-        if (tmp[j] > acc) acc = tmp[j];
-    }
-    for (; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
-    int i = 0;
-    int8x16_t vmax = vdupq_n_s8(INT8_MIN);
-    const int limit = nitems & ~15;
-    for (; i < limit; i += 16) {
-        int8x16_t v = vld1q_s8(data + i);
-        vmax = vmaxq_s8(vmax, v);
-    }
-#if defined(__aarch64__)
-    int8_t acc = vmaxvq_s8(vmax);
-#else
-    int8x8_t max8 = vmax_s8(vget_low_s8(vmax), vget_high_s8(vmax));
-    max8 = vpmax_s8(max8, max8);
-    max8 = vpmax_s8(max8, max8);
-    int8_t acc = vget_lane_s8(max8, 0);
-#endif
-    for (; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-#else
-    int8_t acc = data[0];
-    for (int i = 1; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-#endif
-}
-
-static int16_t reduce_min_int16(const int16_t* data, int nitems) {
-    if (nitems <= 0) return INT16_MAX;
-#if defined(__AVX2__)
-    int i = 0;
-    __m256i vmin = _mm256_set1_epi16(INT16_MAX);
-    const int limit = nitems & ~15;
-    for (; i < limit; i += 16) {
-        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
-        vmin = _mm256_min_epi16(vmin, v);
-    }
-    int16_t tmp[16];
-    _mm256_storeu_si256((__m256i*)tmp, vmin);
-    int16_t acc = tmp[0];
-    for (int j = 1; j < 16; j++) {
-        if (tmp[j] < acc) acc = tmp[j];
-    }
-    for (; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
-    int i = 0;
-    int16x8_t vmin = vdupq_n_s16(INT16_MAX);
-    const int limit = nitems & ~7;
-    for (; i < limit; i += 8) {
-        int16x8_t v = vld1q_s16(data + i);
-        vmin = vminq_s16(vmin, v);
-    }
-#if defined(__aarch64__)
-    int16_t acc = vminvq_s16(vmin);
-#else
-    int16x4_t min4 = vmin_s16(vget_low_s16(vmin), vget_high_s16(vmin));
-    min4 = vpmin_s16(min4, min4);
-    min4 = vpmin_s16(min4, min4);
-    int16_t acc = vget_lane_s16(min4, 0);
-#endif
-    for (; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-#else
-    int16_t acc = data[0];
-    for (int i = 1; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-#endif
-}
-
-static int16_t reduce_max_int16(const int16_t* data, int nitems) {
-    if (nitems <= 0) return INT16_MIN;
-#if defined(__AVX2__)
-    int i = 0;
-    __m256i vmax = _mm256_set1_epi16(INT16_MIN);
-    const int limit = nitems & ~15;
-    for (; i < limit; i += 16) {
-        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
-        vmax = _mm256_max_epi16(vmax, v);
-    }
-    int16_t tmp[16];
-    _mm256_storeu_si256((__m256i*)tmp, vmax);
-    int16_t acc = tmp[0];
-    for (int j = 1; j < 16; j++) {
-        if (tmp[j] > acc) acc = tmp[j];
-    }
-    for (; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
-    int i = 0;
-    int16x8_t vmax = vdupq_n_s16(INT16_MIN);
-    const int limit = nitems & ~7;
-    for (; i < limit; i += 8) {
-        int16x8_t v = vld1q_s16(data + i);
-        vmax = vmaxq_s16(vmax, v);
-    }
-#if defined(__aarch64__)
-    int16_t acc = vmaxvq_s16(vmax);
-#else
-    int16x4_t max4 = vmax_s16(vget_low_s16(vmax), vget_high_s16(vmax));
-    max4 = vpmax_s16(max4, max4);
-    max4 = vpmax_s16(max4, max4);
-    int16_t acc = vget_lane_s16(max4, 0);
-#endif
-    for (; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-#else
-    int16_t acc = data[0];
-    for (int i = 1; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-#endif
-}
-
-static int64_t reduce_min_int64(const int64_t* data, int nitems) {
-    if (nitems <= 0) return INT64_MAX;
-    int64_t acc = data[0];
-    for (int i = 1; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-}
-
-static int64_t reduce_max_int64(const int64_t* data, int nitems) {
-    if (nitems <= 0) return INT64_MIN;
-    int64_t acc = data[0];
-    for (int i = 1; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-}
-
-static uint8_t reduce_min_uint8(const uint8_t* data, int nitems) {
-    if (nitems <= 0) return UINT8_MAX;
-#if defined(__AVX2__)
-    int i = 0;
-    __m256i vmin = _mm256_set1_epi8((char)UINT8_MAX);
-    const int limit = nitems & ~31;
-    for (; i < limit; i += 32) {
-        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
-        vmin = _mm256_min_epu8(vmin, v);
-    }
-    uint8_t tmp[32];
-    _mm256_storeu_si256((__m256i*)tmp, vmin);
-    uint8_t acc = tmp[0];
-    for (int j = 1; j < 32; j++) {
-        if (tmp[j] < acc) acc = tmp[j];
-    }
-    for (; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
-    int i = 0;
-    uint8x16_t vmin = vdupq_n_u8(UINT8_MAX);
-    const int limit = nitems & ~15;
-    for (; i < limit; i += 16) {
-        uint8x16_t v = vld1q_u8(data + i);
-        vmin = vminq_u8(vmin, v);
-    }
-#if defined(__aarch64__)
-    uint8_t acc = vminvq_u8(vmin);
-#else
-    uint8x8_t min8 = vmin_u8(vget_low_u8(vmin), vget_high_u8(vmin));
-    min8 = vpmin_u8(min8, min8);
-    min8 = vpmin_u8(min8, min8);
-    uint8_t acc = vget_lane_u8(min8, 0);
-#endif
-    for (; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-#else
-    uint8_t acc = data[0];
-    for (int i = 1; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-#endif
-}
-
-static uint8_t reduce_max_uint8(const uint8_t* data, int nitems) {
-    if (nitems <= 0) return 0;
-#if defined(__AVX2__)
-    int i = 0;
-    __m256i vmax = _mm256_setzero_si256();
-    const int limit = nitems & ~31;
-    for (; i < limit; i += 32) {
-        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
-        vmax = _mm256_max_epu8(vmax, v);
-    }
-    uint8_t tmp[32];
-    _mm256_storeu_si256((__m256i*)tmp, vmax);
-    uint8_t acc = tmp[0];
-    for (int j = 1; j < 32; j++) {
-        if (tmp[j] > acc) acc = tmp[j];
-    }
-    for (; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
-    int i = 0;
-    uint8x16_t vmax = vdupq_n_u8(0);
-    const int limit = nitems & ~15;
-    for (; i < limit; i += 16) {
-        uint8x16_t v = vld1q_u8(data + i);
-        vmax = vmaxq_u8(vmax, v);
-    }
-#if defined(__aarch64__)
-    uint8_t acc = vmaxvq_u8(vmax);
-#else
-    uint8x8_t max8 = vmax_u8(vget_low_u8(vmax), vget_high_u8(vmax));
-    max8 = vpmax_u8(max8, max8);
-    max8 = vpmax_u8(max8, max8);
-    uint8_t acc = vget_lane_u8(max8, 0);
-#endif
-    for (; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-#else
-    uint8_t acc = data[0];
-    for (int i = 1; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-#endif
-}
-
-static uint16_t reduce_min_uint16(const uint16_t* data, int nitems) {
-    if (nitems <= 0) return UINT16_MAX;
-#if defined(__AVX2__)
-    int i = 0;
-    __m256i vmin = _mm256_set1_epi16((short)UINT16_MAX);
-    const int limit = nitems & ~15;
-    for (; i < limit; i += 16) {
-        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
-        vmin = _mm256_min_epu16(vmin, v);
-    }
-    uint16_t tmp[16];
-    _mm256_storeu_si256((__m256i*)tmp, vmin);
-    uint16_t acc = tmp[0];
-    for (int j = 1; j < 16; j++) {
-        if (tmp[j] < acc) acc = tmp[j];
-    }
-    for (; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
-    int i = 0;
-    uint16x8_t vmin = vdupq_n_u16(UINT16_MAX);
-    const int limit = nitems & ~7;
-    for (; i < limit; i += 8) {
-        uint16x8_t v = vld1q_u16(data + i);
-        vmin = vminq_u16(vmin, v);
-    }
-#if defined(__aarch64__)
-    uint16_t acc = vminvq_u16(vmin);
-#else
-    uint16x4_t min4 = vmin_u16(vget_low_u16(vmin), vget_high_u16(vmin));
-    min4 = vpmin_u16(min4, min4);
-    min4 = vpmin_u16(min4, min4);
-    uint16_t acc = vget_lane_u16(min4, 0);
-#endif
-    for (; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-#else
-    uint16_t acc = data[0];
-    for (int i = 1; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-#endif
-}
-
-static uint16_t reduce_max_uint16(const uint16_t* data, int nitems) {
-    if (nitems <= 0) return 0;
-#if defined(__AVX2__)
-    int i = 0;
-    __m256i vmax = _mm256_setzero_si256();
-    const int limit = nitems & ~15;
-    for (; i < limit; i += 16) {
-        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
-        vmax = _mm256_max_epu16(vmax, v);
-    }
-    uint16_t tmp[16];
-    _mm256_storeu_si256((__m256i*)tmp, vmax);
-    uint16_t acc = tmp[0];
-    for (int j = 1; j < 16; j++) {
-        if (tmp[j] > acc) acc = tmp[j];
-    }
-    for (; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
-    int i = 0;
-    uint16x8_t vmax = vdupq_n_u16(0);
-    const int limit = nitems & ~7;
-    for (; i < limit; i += 8) {
-        uint16x8_t v = vld1q_u16(data + i);
-        vmax = vmaxq_u16(vmax, v);
-    }
-#if defined(__aarch64__)
-    uint16_t acc = vmaxvq_u16(vmax);
-#else
-    uint16x4_t max4 = vmax_u16(vget_low_u16(vmax), vget_high_u16(vmax));
-    max4 = vpmax_u16(max4, max4);
-    max4 = vpmax_u16(max4, max4);
-    uint16_t acc = vget_lane_u16(max4, 0);
-#endif
-    for (; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-#else
-    uint16_t acc = data[0];
-    for (int i = 1; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-#endif
-}
-
-static uint32_t reduce_min_uint32(const uint32_t* data, int nitems) {
-    if (nitems <= 0) return UINT32_MAX;
-#if defined(__AVX2__)
-    int i = 0;
-    __m256i vmin = _mm256_set1_epi32((int)UINT32_MAX);
-    const int limit = nitems & ~7;
-    for (; i < limit; i += 8) {
-        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
-        vmin = _mm256_min_epu32(vmin, v);
-    }
-    uint32_t tmp[8];
-    _mm256_storeu_si256((__m256i*)tmp, vmin);
-    uint32_t acc = tmp[0];
-    for (int j = 1; j < 8; j++) {
-        if (tmp[j] < acc) acc = tmp[j];
-    }
-    for (; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
-    int i = 0;
-    uint32x4_t vmin = vdupq_n_u32(UINT32_MAX);
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        uint32x4_t v = vld1q_u32(data + i);
-        vmin = vminq_u32(vmin, v);
-    }
-#if defined(__aarch64__)
-    uint32_t acc = vminvq_u32(vmin);
-#else
-    uint32x2_t min2 = vmin_u32(vget_low_u32(vmin), vget_high_u32(vmin));
-    min2 = vpmin_u32(min2, min2);
-    uint32_t acc = vget_lane_u32(min2, 0);
-#endif
-    for (; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-#else
-    uint32_t acc = data[0];
-    for (int i = 1; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-#endif
-}
-
-static uint32_t reduce_max_uint32(const uint32_t* data, int nitems) {
-    if (nitems <= 0) return 0;
-#if defined(__AVX2__)
-    int i = 0;
-    __m256i vmax = _mm256_setzero_si256();
-    const int limit = nitems & ~7;
-    for (; i < limit; i += 8) {
-        __m256i v = _mm256_loadu_si256((const __m256i*)(data + i));
-        vmax = _mm256_max_epu32(vmax, v);
-    }
-    uint32_t tmp[8];
-    _mm256_storeu_si256((__m256i*)tmp, vmax);
-    uint32_t acc = tmp[0];
-    for (int j = 1; j < 8; j++) {
-        if (tmp[j] > acc) acc = tmp[j];
-    }
-    for (; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
-    int i = 0;
-    uint32x4_t vmax = vdupq_n_u32(0);
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        uint32x4_t v = vld1q_u32(data + i);
-        vmax = vmaxq_u32(vmax, v);
-    }
-#if defined(__aarch64__)
-    uint32_t acc = vmaxvq_u32(vmax);
-#else
-    uint32x2_t max2 = vmax_u32(vget_low_u32(vmax), vget_high_u32(vmax));
-    max2 = vpmax_u32(max2, max2);
-    uint32_t acc = vget_lane_u32(max2, 0);
-#endif
-    for (; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-#else
-    uint32_t acc = data[0];
-    for (int i = 1; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-#endif
-}
-
-static uint64_t reduce_min_uint64(const uint64_t* data, int nitems) {
-    if (nitems <= 0) return UINT64_MAX;
-    uint64_t acc = data[0];
-    for (int i = 1; i < nitems; i++) {
-        if (data[i] < acc) acc = data[i];
-    }
-    return acc;
-}
-
-static uint64_t reduce_max_uint64(const uint64_t* data, int nitems) {
-    if (nitems <= 0) return 0;
-    uint64_t acc = data[0];
-    for (int i = 1; i < nitems; i++) {
-        if (data[i] > acc) acc = data[i];
-    }
-    return acc;
-}
-
-static double reduce_prod_float32_nan_safe(const float* data, int nitems) {
-    if (nitems <= 0) return 1.0;
-#if defined(__AVX__) || defined(__AVX2__)
-    int i = 0;
-    __m256d vprod0 = _mm256_set1_pd(1.0);
-    __m256d vprod1 = _mm256_set1_pd(1.0);
-    int nan_mask = 0;
-    const int limit = nitems & ~7;
-    for (; i < limit; i += 8) {
-        __m256 v = _mm256_loadu_ps(data + i);
-        nan_mask |= _mm256_movemask_ps(_mm256_cmp_ps(v, v, _CMP_UNORD_Q));
-        __m128 vlow = _mm256_castps256_ps128(v);
-        __m128 vhigh = _mm256_extractf128_ps(v, 1);
-        __m256d vlo = _mm256_cvtps_pd(vlow);
-        __m256d vhi = _mm256_cvtps_pd(vhigh);
-        vprod0 = _mm256_mul_pd(vprod0, vlo);
-        vprod1 = _mm256_mul_pd(vprod1, vhi);
-    }
-    __m256d vprod = _mm256_mul_pd(vprod0, vprod1);
-    __m128d low = _mm256_castpd256_pd128(vprod);
-    __m128d high = _mm256_extractf128_pd(vprod, 1);
-    __m128d prod128 = _mm_mul_pd(low, high);
-    prod128 = _mm_mul_sd(prod128, _mm_unpackhi_pd(prod128, prod128));
-    double acc = _mm_cvtsd_f64(prod128);
-    if (nan_mask) return NAN;
-    for (; i < nitems; i++) {
-        double v = (double)data[i];
-        acc *= v;
-        if (v != v) return v;
-    }
-    return acc;
-#elif defined(__SSE2__)
-    int i = 0;
-    __m128d vprod0 = _mm_set1_pd(1.0);
-    __m128d vprod1 = _mm_set1_pd(1.0);
-    int nan_mask = 0;
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        __m128 v = _mm_loadu_ps(data + i);
-        nan_mask |= _mm_movemask_ps(_mm_cmpunord_ps(v, v));
-        __m128 vhigh = _mm_movehl_ps(v, v);
-        __m128d vlo = _mm_cvtps_pd(v);
-        __m128d vhi = _mm_cvtps_pd(vhigh);
-        vprod0 = _mm_mul_pd(vprod0, vlo);
-        vprod1 = _mm_mul_pd(vprod1, vhi);
-    }
-    __m128d prod128 = _mm_mul_pd(vprod0, vprod1);
-    prod128 = _mm_mul_sd(prod128, _mm_unpackhi_pd(prod128, prod128));
-    double acc = _mm_cvtsd_f64(prod128);
-    if (nan_mask) return NAN;
-    for (; i < nitems; i++) {
-        double v = (double)data[i];
-        acc *= v;
-        if (v != v) return v;
-    }
-    return acc;
-#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__aarch64__)
-    int i = 0;
-    float64x2_t vprod0 = vdupq_n_f64(1.0);
-    float64x2_t vprod1 = vdupq_n_f64(1.0);
-    uint32x4_t vnan = vdupq_n_u32(0);
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        float32x4_t v = vld1q_f32(data + i);
-        uint32x4_t eq = vceqq_f32(v, v);
-        vnan = vorrq_u32(vnan, veorq_u32(eq, vdupq_n_u32(~0U)));
-        float64x2_t vlo = vcvt_f64_f32(vget_low_f32(v));
-        float64x2_t vhi = vcvt_f64_f32(vget_high_f32(v));
-        vprod0 = vmulq_f64(vprod0, vlo);
-        vprod1 = vmulq_f64(vprod1, vhi);
-    }
-    float64x2_t vprod = vmulq_f64(vprod0, vprod1);
-    double acc = vgetq_lane_f64(vprod, 0) * vgetq_lane_f64(vprod, 1);
-    uint32x4_t nan_or = vorrq_u32(vnan, vextq_u32(vnan, vnan, 2));
-    nan_or = vorrq_u32(nan_or, vextq_u32(nan_or, nan_or, 1));
-    if (vgetq_lane_u32(nan_or, 0)) return NAN;
-    for (; i < nitems; i++) {
-        double v = (double)data[i];
-        acc *= v;
-        if (v != v) return v;
-    }
-    return acc;
-#else
-    double acc = 1.0;
-    for (int i = 0; i < nitems; i++) {
-        double v = (double)data[i];
-        acc *= v;
-        if (v != v) return v;
-    }
-    return acc;
-#endif
-}
-
-static double reduce_prod_float64_nan_safe(const double* data, int nitems) {
-    if (nitems <= 0) return 1.0;
-#if defined(__AVX__) || defined(__AVX2__)
-    int i = 0;
-    __m256d vprod = _mm256_set1_pd(1.0);
-    __m256d vnan = _mm256_setzero_pd();
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        __m256d v = _mm256_loadu_pd(data + i);
-        vnan = _mm256_or_pd(vnan, _mm256_cmp_pd(v, v, _CMP_UNORD_Q));
-        vprod = _mm256_mul_pd(vprod, v);
-    }
-    __m128d low = _mm256_castpd256_pd128(vprod);
-    __m128d high = _mm256_extractf128_pd(vprod, 1);
-    __m128d prod128 = _mm_mul_pd(low, high);
-    prod128 = _mm_mul_sd(prod128, _mm_unpackhi_pd(prod128, prod128));
-    double acc = _mm_cvtsd_f64(prod128);
-    if (_mm256_movemask_pd(vnan)) return NAN;
-    for (; i < nitems; i++) {
-        double v = data[i];
-        acc *= v;
-        if (v != v) return v;
-    }
-    return acc;
-#elif defined(__SSE2__)
-    int i = 0;
-    __m128d vprod = _mm_set1_pd(1.0);
-    __m128d vnan = _mm_setzero_pd();
-    const int limit = nitems & ~1;
-    for (; i < limit; i += 2) {
-        __m128d v = _mm_loadu_pd(data + i);
-        vnan = _mm_or_pd(vnan, _mm_cmpunord_pd(v, v));
-        vprod = _mm_mul_pd(vprod, v);
-    }
-    vprod = _mm_mul_sd(vprod, _mm_unpackhi_pd(vprod, vprod));
-    double acc = _mm_cvtsd_f64(vprod);
-    if (_mm_movemask_pd(vnan)) return NAN;
-    for (; i < nitems; i++) {
-        double v = data[i];
-        acc *= v;
-        if (v != v) return v;
-    }
-    return acc;
-#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__aarch64__)
-    int i = 0;
-    float64x2_t vprod = vdupq_n_f64(1.0);
-    uint64x2_t vnan = vdupq_n_u64(0);
-    const int limit = nitems & ~1;
-    for (; i < limit; i += 2) {
-        float64x2_t v = vld1q_f64(data + i);
-        uint64x2_t eq = vceqq_f64(v, v);
-        vnan = vorrq_u64(vnan, veorq_u64(eq, vdupq_n_u64(~0ULL)));
-        vprod = vmulq_f64(vprod, v);
-    }
-    double acc = vgetq_lane_f64(vprod, 0) * vgetq_lane_f64(vprod, 1);
-    uint64x2_t nan_or = vorrq_u64(vnan, vextq_u64(vnan, vnan, 1));
-    if (vgetq_lane_u64(nan_or, 0)) return NAN;
-    for (; i < nitems; i++) {
-        double v = data[i];
-        acc *= v;
-        if (v != v) return v;
-    }
-    return acc;
-#else
-    double acc = 1.0;
-    for (int i = 0; i < nitems; i++) {
-        double v = data[i];
-        acc *= v;
-        if (v != v) return v;
-    }
-    return acc;
-#endif
-}
-
-static double reduce_sum_float32_nan_safe(const float* data, int nitems) {
-    if (nitems <= 0) return 0.0;
-#if defined(__AVX__) || defined(__AVX2__)
-    int i = 0;
-    __m256d vsum0 = _mm256_setzero_pd();
-    __m256d vsum1 = _mm256_setzero_pd();
-    int nan_mask = 0;
-    const int limit = nitems & ~7;
-    for (; i < limit; i += 8) {
-        __m256 v = _mm256_loadu_ps(data + i);
-        nan_mask |= _mm256_movemask_ps(_mm256_cmp_ps(v, v, _CMP_UNORD_Q));
-        __m128 vlow = _mm256_castps256_ps128(v);
-        __m128 vhigh = _mm256_extractf128_ps(v, 1);
-        __m256d vlo = _mm256_cvtps_pd(vlow);
-        __m256d vhi = _mm256_cvtps_pd(vhigh);
-        vsum0 = _mm256_add_pd(vsum0, vlo);
-        vsum1 = _mm256_add_pd(vsum1, vhi);
-    }
-    __m256d vsum = _mm256_add_pd(vsum0, vsum1);
-    __m128d low = _mm256_castpd256_pd128(vsum);
-    __m128d high = _mm256_extractf128_pd(vsum, 1);
-    __m128d sum128 = _mm_add_pd(low, high);
-    sum128 = _mm_add_sd(sum128, _mm_unpackhi_pd(sum128, sum128));
-    double acc = _mm_cvtsd_f64(sum128);
-    if (nan_mask) return NAN;
-    for (; i < nitems; i++) {
-        double v = (double)data[i];
-        acc += v;
-        if (v != v) return v;
-    }
-    return acc;
-#elif defined(__SSE2__)
-    int i = 0;
-    __m128d vsum0 = _mm_setzero_pd();
-    __m128d vsum1 = _mm_setzero_pd();
-    int nan_mask = 0;
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        __m128 v = _mm_loadu_ps(data + i);
-        nan_mask |= _mm_movemask_ps(_mm_cmpunord_ps(v, v));
-        __m128 vhigh = _mm_movehl_ps(v, v);
-        __m128d vlo = _mm_cvtps_pd(v);
-        __m128d vhi = _mm_cvtps_pd(vhigh);
-        vsum0 = _mm_add_pd(vsum0, vlo);
-        vsum1 = _mm_add_pd(vsum1, vhi);
-    }
-    __m128d sum128 = _mm_add_pd(vsum0, vsum1);
-    sum128 = _mm_add_sd(sum128, _mm_unpackhi_pd(sum128, sum128));
-    double acc = _mm_cvtsd_f64(sum128);
-    if (nan_mask) return NAN;
-    for (; i < nitems; i++) {
-        double v = (double)data[i];
-        acc += v;
-        if (v != v) return v;
-    }
-    return acc;
-#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__aarch64__)
-    int i = 0;
-    float64x2_t vsum0 = vdupq_n_f64(0.0);
-    float64x2_t vsum1 = vdupq_n_f64(0.0);
-    uint32x4_t vnan = vdupq_n_u32(0);
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        float32x4_t v = vld1q_f32(data + i);
-        uint32x4_t eq = vceqq_f32(v, v);
-        vnan = vorrq_u32(vnan, veorq_u32(eq, vdupq_n_u32(~0U)));
-        float64x2_t vlo = vcvt_f64_f32(vget_low_f32(v));
-        float64x2_t vhi = vcvt_f64_f32(vget_high_f32(v));
-        vsum0 = vaddq_f64(vsum0, vlo);
-        vsum1 = vaddq_f64(vsum1, vhi);
-    }
-    double acc = vaddvq_f64(vaddq_f64(vsum0, vsum1));
-    uint32x4_t nan_or = vorrq_u32(vnan, vextq_u32(vnan, vnan, 2));
-    nan_or = vorrq_u32(nan_or, vextq_u32(nan_or, nan_or, 1));
-    if (vgetq_lane_u32(nan_or, 0)) return NAN;
-    for (; i < nitems; i++) {
-        double v = (double)data[i];
-        acc += v;
-        if (v != v) return v;
-    }
-    return acc;
-#else
-    double acc = 0.0;
-    for (int i = 0; i < nitems; i++) {
-        double v = (double)data[i];
-        acc += v;
-        if (v != v) return v;
-    }
-    return acc;
-#endif
-}
-
-
-static double reduce_sum_float64_nan_safe(const double* data, int nitems) {
-    if (nitems <= 0) return 0.0;
-#if defined(__AVX__) || defined(__AVX2__)
-    int i = 0;
-    __m256d vsum = _mm256_setzero_pd();
-    __m256d vnan = _mm256_setzero_pd();
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        __m256d v = _mm256_loadu_pd(data + i);
-        vnan = _mm256_or_pd(vnan, _mm256_cmp_pd(v, v, _CMP_UNORD_Q));
-        vsum = _mm256_add_pd(vsum, v);
-    }
-    __m128d low = _mm256_castpd256_pd128(vsum);
-    __m128d high = _mm256_extractf128_pd(vsum, 1);
-    __m128d sum128 = _mm_add_pd(low, high);
-    sum128 = _mm_add_sd(sum128, _mm_unpackhi_pd(sum128, sum128));
-    double acc = _mm_cvtsd_f64(sum128);
-    if (_mm256_movemask_pd(vnan)) return NAN;
-    for (; i < nitems; i++) {
-        double v = data[i];
-        acc += v;
-        if (v != v) return v;
-    }
-    return acc;
-#elif defined(__SSE2__)
-    int i = 0;
-    __m128d vsum = _mm_setzero_pd();
-    __m128d vnan = _mm_setzero_pd();
-    const int limit = nitems & ~1;
-    for (; i < limit; i += 2) {
-        __m128d v = _mm_loadu_pd(data + i);
-        vnan = _mm_or_pd(vnan, _mm_cmpunord_pd(v, v));
-        vsum = _mm_add_pd(vsum, v);
-    }
-    vsum = _mm_add_sd(vsum, _mm_unpackhi_pd(vsum, vsum));
-    double acc = _mm_cvtsd_f64(vsum);
-    if (_mm_movemask_pd(vnan)) return NAN;
-    for (; i < nitems; i++) {
-        double v = data[i];
-        acc += v;
-        if (v != v) return v;
-    }
-    return acc;
-#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__aarch64__)
-    int i = 0;
-    float64x2_t vsum = vdupq_n_f64(0.0);
-    uint64x2_t vnan = vdupq_n_u64(0);
-    const int limit = nitems & ~1;
-    for (; i < limit; i += 2) {
-        float64x2_t v = vld1q_f64(data + i);
-        uint64x2_t eq = vceqq_f64(v, v);
-        vnan = vorrq_u64(vnan, veorq_u64(eq, vdupq_n_u64(~0ULL)));
-        vsum = vaddq_f64(vsum, v);
-    }
-    double acc = vaddvq_f64(vsum);
-    uint64x2_t nan_or = vorrq_u64(vnan, vextq_u64(vnan, vnan, 1));
-    if (vgetq_lane_u64(nan_or, 0)) return NAN;
-    for (; i < nitems; i++) {
-        double v = data[i];
-        acc += v;
-        if (v != v) return v;
-    }
-    return acc;
-#else
-    double acc = 0.0;
-    for (int i = 0; i < nitems; i++) {
-        double v = data[i];
-        acc += v;
-        if (v != v) return v;
-    }
-    return acc;
-#endif
-}
-
-static int64_t reduce_sum_int32(const int32_t* data, int nitems) {
-    if (nitems <= 0) return 0;
-#if defined(__AVX2__)
-    int i = 0;
-    __m256i acc0 = _mm256_setzero_si256();
-    __m256i acc1 = _mm256_setzero_si256();
-    const int limit = nitems & ~7;
-    for (; i < limit; i += 8) {
-        __m256i v = _mm256_loadu_si256((const __m256i *)(data + i));
-        __m128i vlow = _mm256_castsi256_si128(v);
-        __m128i vhigh = _mm256_extracti128_si256(v, 1);
-        __m256i vlow64 = _mm256_cvtepi32_epi64(vlow);
-        __m256i vhigh64 = _mm256_cvtepi32_epi64(vhigh);
-        acc0 = _mm256_add_epi64(acc0, vlow64);
-        acc1 = _mm256_add_epi64(acc1, vhigh64);
-    }
-    acc0 = _mm256_add_epi64(acc0, acc1);
-    int64_t tmp[4];
-    _mm256_storeu_si256((__m256i *)tmp, acc0);
-    int64_t acc = tmp[0] + tmp[1] + tmp[2] + tmp[3];
-    for (; i < nitems; i++) {
-        acc += data[i];
-    }
-    return acc;
-#else
-    int64_t acc = 0;
-    for (int i = 0; i < nitems; i++) {
-        acc += data[i];
-    }
-    return acc;
-#endif
-}
-
-static uint64_t reduce_sum_uint32(const uint32_t* data, int nitems) {
-    if (nitems <= 0) return 0;
-#if defined(__AVX2__)
-    int i = 0;
-    __m256i acc0 = _mm256_setzero_si256();
-    __m256i acc1 = _mm256_setzero_si256();
-    const int limit = nitems & ~7;
-    for (; i < limit; i += 8) {
-        __m256i v = _mm256_loadu_si256((const __m256i *)(data + i));
-        __m128i vlow = _mm256_castsi256_si128(v);
-        __m128i vhigh = _mm256_extracti128_si256(v, 1);
-        __m256i vlow64 = _mm256_cvtepu32_epi64(vlow);
-        __m256i vhigh64 = _mm256_cvtepu32_epi64(vhigh);
-        acc0 = _mm256_add_epi64(acc0, vlow64);
-        acc1 = _mm256_add_epi64(acc1, vhigh64);
-    }
-    acc0 = _mm256_add_epi64(acc0, acc1);
-    uint64_t tmp[4];
-    _mm256_storeu_si256((__m256i *)tmp, acc0);
-    uint64_t acc = tmp[0] + tmp[1] + tmp[2] + tmp[3];
-    for (; i < nitems; i++) {
-        acc += data[i];
-    }
-    return acc;
-#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__aarch64__)
-    int i = 0;
-    uint64x2_t acc0 = vdupq_n_u64(0);
-    uint64x2_t acc1 = vdupq_n_u64(0);
-    const int limit = nitems & ~3;
-    for (; i < limit; i += 4) {
-        uint32x4_t v = vld1q_u32(data + i);
-        uint64x2_t lo = vmovl_u32(vget_low_u32(v));
-        uint64x2_t hi = vmovl_u32(vget_high_u32(v));
-        acc0 = vaddq_u64(acc0, lo);
-        acc1 = vaddq_u64(acc1, hi);
-    }
-    uint64x2_t accv = vaddq_u64(acc0, acc1);
-    uint64_t acc = vgetq_lane_u64(accv, 0) + vgetq_lane_u64(accv, 1);
-    for (; i < nitems; i++) {
-        acc += data[i];
-    }
-    return acc;
-#else
-    uint64_t acc = 0;
-    for (int i = 0; i < nitems; i++) {
-        acc += data[i];
-    }
-    return acc;
-#endif
-}
-
-static double comma(double a, double b) {
-    (void)a;
-    return b;
-}
-
-/* Bitwise operators (for integer types) */
-static double bit_and(double a, double b) { return (double)((int64_t)a & (int64_t)b); }
-static double bit_or(double a, double b) { return (double)((int64_t)a | (int64_t)b); }
-static double bit_xor(double a, double b) { return (double)((int64_t)a ^ (int64_t)b); }
-static double bit_not(double a) { return (double)(~(int64_t)a); }
-static double bit_shl(double a, double b) { return (double)((int64_t)a << (int64_t)b); }
-static double bit_shr(double a, double b) { return (double)((int64_t)a >> (int64_t)b); }
-
-/* Comparison operators (return 1.0 for true, 0.0 for false) */
-static double cmp_eq(double a, double b) { return a == b ? 1.0 : 0.0; }
-static double cmp_ne(double a, double b) { return a != b ? 1.0 : 0.0; }
-static double cmp_lt(double a, double b) { return a < b ? 1.0 : 0.0; }
-static double cmp_le(double a, double b) { return a <= b ? 1.0 : 0.0; }
-static double cmp_gt(double a, double b) { return a > b ? 1.0 : 0.0; }
-static double cmp_ge(double a, double b) { return a >= b ? 1.0 : 0.0; }
-
-/* Logical operators (for bool type) - short-circuit via OR/AND */
-static double logical_and(double a, double b) { return ((int)a) && ((int)b) ? 1.0 : 0.0; }
-static double logical_or(double a, double b) { return ((int)a) || ((int)b) ? 1.0 : 0.0; }
-static double logical_not(double a) { return !(int)a ? 1.0 : 0.0; }
-static double logical_xor(double a, double b) { return ((int)a) != ((int)b) ? 1.0 : 0.0; }
-
-static bool is_identifier_start(char c) {
-    return isalpha((unsigned char)c) || c == '_';
-}
-
-static bool is_identifier_char(char c) {
-    return isalnum((unsigned char)c) || c == '_';
-}
-
-static void skip_whitespace(state* s) {
-    while (*s->next && isspace((unsigned char)*s->next)) {
-        s->next++;
-    }
-}
-
-static void read_number_token(state* s) {
-    const char* start = s->next;
-    s->value = strtod(s->next, (char**)&s->next);
-    s->type = TOK_NUMBER;
-
-    // Determine if it is a floating point or integer constant
-    bool is_float = false;
-    for (const char* p = start; p < s->next; p++) {
-        if (*p == '.' || *p == 'e' || *p == 'E') {
-            is_float = true;
-            break;
-        }
-    }
-
-    if (is_float) {
-        // Match NumPy conventions: float constants match target_dtype when it's a float type
-        // This ensures FLOAT32 arrays + float constants -> FLOAT32 (NumPy behavior)
-        if (s->target_dtype == ME_FLOAT32) {
-            s->dtype = ME_FLOAT32;
-        }
-        else {
-            s->dtype = ME_FLOAT64;
-        }
-    }
-    else {
-        // For integers, we use a heuristic
-        if (s->value > INT_MAX || s->value < INT_MIN) {
-            s->dtype = ME_INT64;
-        }
-        else {
-            // Use target_dtype if it's an integer type, otherwise default to INT32
-            if (is_integer_dtype(s->target_dtype)) {
-                s->dtype = s->target_dtype;
-            }
-            else {
-                s->dtype = ME_INT32;
-            }
-        }
-    }
-}
-
-static void read_identifier_token(state* s) {
-    const char* start = s->next;
-    while (is_identifier_char(*s->next)) {
-        s->next++;
-    }
-
-    const me_variable* var = find_lookup(s, start, s->next - start);
-    if (!var) {
-        var = find_builtin(start, s->next - start);
-    }
-
-    if (!var) {
-        s->type = TOK_ERROR;
-        return;
-    }
-
-    switch (TYPE_MASK(var->type)) {
-    case ME_VARIABLE:
-        s->type = TOK_VARIABLE;
-        s->bound = var->address;
-        s->dtype = var->dtype;
-        break;
-
-    case ME_CLOSURE0:
-    case ME_CLOSURE1:
-    case ME_CLOSURE2:
-    case ME_CLOSURE3:
-    case ME_CLOSURE4:
-    case ME_CLOSURE5:
-    case ME_CLOSURE6:
-    case ME_CLOSURE7:
-        s->context = var->context;
-    /* Falls through. */
-    case ME_FUNCTION0:
-    case ME_FUNCTION1:
-    case ME_FUNCTION2:
-    case ME_FUNCTION3:
-    case ME_FUNCTION4:
-    case ME_FUNCTION5:
-    case ME_FUNCTION6:
-    case ME_FUNCTION7:
-        s->type = var->type;
-        s->function = var->address;
-        break;
-    }
-}
-
-typedef struct {
-    const char* literal;
-    int token_type;
-    me_fun2 function;
-} operator_spec;
-
-static bool handle_multi_char_operator(state* s) {
-    static const operator_spec multi_ops[] = {
-        {"**", TOK_POW, pow},
-        {"<<", TOK_SHIFT, bit_shl},
-        {">>", TOK_SHIFT, bit_shr},
-        {"==", TOK_COMPARE, cmp_eq},
-        {"!=", TOK_COMPARE, cmp_ne},
-        {"<=", TOK_COMPARE, cmp_le},
-        {">=", TOK_COMPARE, cmp_ge},
-    };
-
-    for (size_t i = 0; i < sizeof(multi_ops) / sizeof(multi_ops[0]); i++) {
-        const operator_spec* op = &multi_ops[i];
-        size_t len = strlen(op->literal);
-        if (strncmp(s->next, op->literal, len) == 0) {
-            s->type = op->token_type;
-            s->function = op->function;
-            s->next += len;
-            return true;
-        }
-    }
-    return false;
-}
-
-static void handle_single_char_operator(state* s, char c) {
-    s->next++;
-    switch (c) {
-    case '+': s->type = TOK_INFIX;
-        s->function = add;
-        break;
-    case '-': s->type = TOK_INFIX;
-        s->function = sub;
-        break;
-    case '*': s->type = TOK_INFIX;
-        s->function = mul;
-        break;
-    case '/': s->type = TOK_INFIX;
-        s->function = divide;
-        break;
-    case '%': s->type = TOK_INFIX;
-        s->function = fmod;
-        break;
-    case '&': s->type = TOK_BITWISE;
-        s->function = bit_and;
-        break;
-    case '|': s->type = TOK_BITWISE;
-        s->function = bit_or;
-        break;
-    case '^': s->type = TOK_BITWISE;
-        s->function = bit_xor;
-        break;
-    case '~': s->type = TOK_BITWISE;
-        s->function = bit_not;
-        break;
-    case '<': s->type = TOK_COMPARE;
-        s->function = cmp_lt;
-        break;
-    case '>': s->type = TOK_COMPARE;
-        s->function = cmp_gt;
-        break;
-    case '(': s->type = TOK_OPEN;
-        break;
-    case ')': s->type = TOK_CLOSE;
-        break;
-    case ',': s->type = TOK_SEP;
-        break;
-    default: s->type = TOK_ERROR;
-        break;
-    }
-}
-
-static void read_operator_token(state* s) {
-    if (handle_multi_char_operator(s)) {
-        return;
-    }
-
-    if (!*s->next) {
-        s->type = TOK_END;
-        return;
-    }
-
-    handle_single_char_operator(s, *s->next);
-}
-
-void next_token(state* s) {
-    s->type = TOK_NULL;
-
-    do {
-        skip_whitespace(s);
-
-        if (!*s->next) {
-            s->type = TOK_END;
-            return;
-        }
-
-        if ((s->next[0] >= '0' && s->next[0] <= '9') || s->next[0] == '.') {
-            read_number_token(s);
-        }
-        else if (is_identifier_start(s->next[0])) {
-            read_identifier_token(s);
-        }
-        else {
-            read_operator_token(s);
-        }
-    }
-    while (s->type == TOK_NULL);
-}
-
-
-static me_expr* list(state* s);
-
-static me_expr* expr(state* s);
-
-static me_expr* power(state* s);
-
-static me_expr* shift_expr(state* s);
-
-static me_expr* bitwise_and(state* s);
-
-static me_expr* bitwise_xor(state* s);
-
-static me_expr* bitwise_or(state* s);
-
-static me_expr* comparison(state* s);
-
-
-static me_expr* base(state* s) {
-    /* <base>      =    <constant> | <variable> | <function-0> {"(" ")"} | <function-1> <power> | <function-X> "(" <expr> {"," <expr>} ")" | "(" <list> ")" */
-    me_expr* ret;
-    int arity;
-
-    switch (TYPE_MASK(s->type)) {
-    case TOK_NUMBER:
-        ret = new_expr(ME_CONSTANT, 0);
-        CHECK_NULL(ret);
-
-        ret->value = s->value;
-        // Use inferred type for constants (floating point vs integer)
-        if (s->target_dtype == ME_AUTO) {
-            ret->dtype = s->dtype;
-        }
-        else {
-            // If target_dtype is integer but constant is float/complex, we must use float/complex
-            if (is_integer_dtype(s->target_dtype)) {
-                if (is_float_dtype(s->dtype) || is_complex_dtype(s->dtype)) {
-                    ret->dtype = s->dtype;
-                }
-                else if (is_integer_dtype(s->dtype) && dtype_size(s->dtype) > dtype_size(s->target_dtype)) {
-                    // Use larger integer type if needed
-                    ret->dtype = s->dtype;
-                }
-                else {
-                    ret->dtype = s->target_dtype;
-                }
-            }
-            else {
-                // For float/complex target types, use target_dtype to match NumPy conventions
-                // Float constants are typed based on target_dtype (FLOAT32 or FLOAT64)
-                // This ensures FLOAT32 arrays + float constants -> FLOAT32 (NumPy behavior)
-                ret->dtype = s->target_dtype;
-            }
-        }
-        next_token(s);
-        break;
-
-    case TOK_VARIABLE:
-        ret = new_expr(ME_VARIABLE, 0);
-        CHECK_NULL(ret);
-
-        ret->bound = s->bound;
-        ret->dtype = s->dtype; // Set the variable's type
-        ret->input_dtype = s->dtype;
-        next_token(s);
-        break;
-
-    case ME_FUNCTION0:
-    case ME_CLOSURE0:
-        ret = new_expr(s->type, 0);
-        CHECK_NULL(ret);
-
-        ret->function = s->function;
-        if (IS_CLOSURE(s->type)) ret->parameters[0] = s->context;
-        next_token(s);
-        if (s->type == TOK_OPEN) {
-            next_token(s);
-            if (s->type != TOK_CLOSE) {
-                s->type = TOK_ERROR;
-            }
-            else {
-                next_token(s);
-            }
-        }
-        break;
-
-    case ME_FUNCTION1:
-    case ME_CLOSURE1:
-        ret = new_expr(s->type, 0);
-        CHECK_NULL(ret);
-
-        ret->function = s->function;
-        if (IS_CLOSURE(s->type)) ret->parameters[1] = s->context;
-        next_token(s);
-        ret->parameters[0] = power(s);
-        CHECK_NULL(ret->parameters[0], me_free(ret));
-        break;
-
-    case ME_FUNCTION2:
-    case ME_FUNCTION3:
-    case ME_FUNCTION4:
-    case ME_FUNCTION5:
-    case ME_FUNCTION6:
-    case ME_FUNCTION7:
-    case ME_CLOSURE2:
-    case ME_CLOSURE3:
-    case ME_CLOSURE4:
-    case ME_CLOSURE5:
-    case ME_CLOSURE6:
-    case ME_CLOSURE7:
-        arity = ARITY(s->type);
-
-        ret = new_expr(s->type, 0);
-        CHECK_NULL(ret);
-
-        ret->function = s->function;
-        if (IS_CLOSURE(s->type)) ret->parameters[arity] = s->context;
-        next_token(s);
-
-        if (s->type != TOK_OPEN) {
-            s->type = TOK_ERROR;
-        }
-        else {
-            int i;
-            for (i = 0; i < arity; i++) {
-                next_token(s);
-                ret->parameters[i] = expr(s);
-                CHECK_NULL(ret->parameters[i], me_free(ret));
-
-                if (s->type != TOK_SEP) {
-                    break;
-                }
-            }
-            if (s->type != TOK_CLOSE || i != arity - 1) {
-                s->type = TOK_ERROR;
-            }
-            else {
-                next_token(s);
-            }
-        }
-
-        break;
-
-    case TOK_OPEN:
-        next_token(s);
-        ret = list(s);
-        CHECK_NULL(ret);
-
-        if (s->type != TOK_CLOSE) {
-            s->type = TOK_ERROR;
-        }
-        else {
-            next_token(s);
-        }
-        break;
-
-    default:
-        ret = new_expr(0, 0);
-        CHECK_NULL(ret);
-
-        s->type = TOK_ERROR;
-        ret->value = NAN;
-        break;
-    }
-
-    return ret;
-}
-
-
-static me_expr* power(state* s) {
-    /* <power>     =    {("-" | "+")} <base> */
-    int sign = 1;
-    while (s->type == TOK_INFIX && (s->function == add || s->function == sub)) {
-        if (s->function == sub) sign = -sign;
-        next_token(s);
-    }
-
-    me_expr* ret;
-
-    if (sign == 1) {
-        ret = base(s);
-    }
-    else {
-        me_expr* b = base(s);
-        CHECK_NULL(b);
-
-        ret = NEW_EXPR(ME_FUNCTION1 | ME_FLAG_PURE, b);
-        CHECK_NULL(ret, me_free(b));
-
-        ret->function = negate;
-    }
-
-    return ret;
-}
-
-#ifdef ME_POW_FROM_RIGHT
-static me_expr* factor(state* s) {
-    /* <factor>    =    <power> {"**" <factor>}  (right associative) */
-    me_expr* ret = power(s);
-    CHECK_NULL(ret);
-
-    if (s->type == TOK_POW) {
-        me_fun2 t = s->function;
-        next_token(s);
-        me_expr* f = factor(s); /* Right associative: recurse */
-        CHECK_NULL(f, me_free(ret));
-
-        me_expr* prev = ret;
-        ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, f);
-        CHECK_NULL(ret, me_free(f), me_free(prev));
-
-        ret->function = t;
-        apply_type_promotion(ret);
-    }
-
-    return ret;
-}
-#else
-static me_expr* factor(state* s) {
-    /* <factor>    =    <power> {"**" <power>}  (left associative) */
-    me_expr* ret = power(s);
-    CHECK_NULL(ret);
-
-    while (s->type == TOK_POW) {
-        me_fun2 t = (me_fun2)s->function;
-        next_token(s);
-        me_expr* f = power(s);
-        CHECK_NULL(f, me_free(ret));
-
-        me_expr* prev = ret;
-        ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, f);
-        CHECK_NULL(ret, me_free(f), me_free(prev));
-
-        ret->function = (void*)t;
-        apply_type_promotion(ret);
-    }
-
-    return ret;
-}
-#endif
-
-
-static me_expr* term(state* s) {
-    /* <term>      =    <factor> {("*" | "/" | "%") <factor>} */
-    me_expr* ret = factor(s);
-    CHECK_NULL(ret);
-
-    while (s->type == TOK_INFIX && (s->function == mul || s->function == divide || s->function == fmod)) {
-        me_fun2 t = (me_fun2)s->function;
-        next_token(s);
-        me_expr* f = factor(s);
-        CHECK_NULL(f, me_free(ret));
-
-        me_expr* prev = ret;
-        ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, f);
-        CHECK_NULL(ret, me_free(f), me_free(prev));
-
-        ret->function = (void*)t;
-        apply_type_promotion(ret);
-    }
-
-    return ret;
-}
-
-
-static me_expr* expr(state* s) {
-    /* <expr>      =    <term> {("+" | "-") <term>} */
-    me_expr* ret = term(s);
-    CHECK_NULL(ret);
-
-    while (s->type == TOK_INFIX && (s->function == add || s->function == sub)) {
-        me_fun2 t = (me_fun2)s->function;
-        next_token(s);
-        me_expr* te = term(s);
-        CHECK_NULL(te, me_free(ret));
-
-        me_expr* prev = ret;
-        ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, te);
-        CHECK_NULL(ret, me_free(te), me_free(prev));
-
-        ret->function = (void*)t;
-        apply_type_promotion(ret); // Apply type promotion
-    }
-
-    return ret;
-}
-
-
-static me_expr* shift_expr(state* s) {
-    /* <shift_expr> =    <expr> {("<<" | ">>") <expr>} */
-    me_expr* ret = expr(s);
-    CHECK_NULL(ret);
-
-    while (s->type == TOK_SHIFT) {
-        me_fun2 t = (me_fun2)s->function;
-        next_token(s);
-        me_expr* e = expr(s);
-        CHECK_NULL(e, me_free(ret));
-
-        me_expr* prev = ret;
-        ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
-        CHECK_NULL(ret, me_free(e), me_free(prev));
-
-        ret->function = (void*)t;
-        apply_type_promotion(ret);
-    }
-
-    return ret;
-}
-
-
-static me_expr* bitwise_and(state* s) {
-    /* <bitwise_and> =    <shift_expr> {"&" <shift_expr>} */
-    me_expr* ret = shift_expr(s);
-    CHECK_NULL(ret);
-
-    while (s->type == TOK_BITWISE && s->function == bit_and) {
-        next_token(s);
-        me_expr* e = shift_expr(s);
-        CHECK_NULL(e, me_free(ret));
-
-        me_expr* prev = ret;
-        ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
-        CHECK_NULL(ret, me_free(e), me_free(prev));
-
-        ret->function = bit_and;
-        apply_type_promotion(ret);
-    }
-
-    return ret;
-}
-
-
-static me_expr* bitwise_xor(state* s) {
-    /* <bitwise_xor> =    <bitwise_and> {"^" <bitwise_and>} */
-    /* Note: ^ is XOR for integers/bools. Use ** for power */
-    me_expr* ret = bitwise_and(s);
-    CHECK_NULL(ret);
-
-    while (s->type == TOK_BITWISE && s->function == bit_xor) {
-        next_token(s);
-        me_expr* e = bitwise_and(s);
-        CHECK_NULL(e, me_free(ret));
-
-        me_expr* prev = ret;
-        ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
-        CHECK_NULL(ret, me_free(e), me_free(prev));
-
-        ret->function = bit_xor;
-        apply_type_promotion(ret);
-    }
-
-    return ret;
-}
-
-
-static me_expr* bitwise_or(state* s) {
-    /* <bitwise_or> =    <bitwise_xor> {"|" <bitwise_xor>} */
-    me_expr* ret = bitwise_xor(s);
-    CHECK_NULL(ret);
-
-    while (s->type == TOK_BITWISE && (s->function == bit_or)) {
-        me_fun2 t = (me_fun2)s->function;
-        next_token(s);
-        me_expr* e = bitwise_xor(s);
-        CHECK_NULL(e, me_free(ret));
-
-        me_expr* prev = ret;
-        ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
-        CHECK_NULL(ret, me_free(e), me_free(prev));
-
-        ret->function = (void*)t;
-        apply_type_promotion(ret);
-    }
-
-    return ret;
-}
-
-
-static me_expr* comparison(state* s) {
-    /* <comparison> =    <bitwise_or> {("<" | ">" | "<=" | ">=" | "==" | "!=") <bitwise_or>} */
-    me_expr* ret = bitwise_or(s);
-    CHECK_NULL(ret);
-
-    while (s->type == TOK_COMPARE) {
-        me_fun2 t = (me_fun2)s->function;
-        next_token(s);
-        me_expr* e = bitwise_or(s);
-        CHECK_NULL(e, me_free(ret));
-
-        me_expr* prev = ret;
-        ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
-        CHECK_NULL(ret, me_free(e), me_free(prev));
-
-        ret->function = (void*)t;
-        apply_type_promotion(ret);
-        /* Comparisons always return bool */
-        ret->dtype = ME_BOOL;
-    }
-
-    return ret;
-}
-
-
-static me_expr* list(state* s) {
-    /* <list>      =    <comparison> {"," <comparison>} */
-    me_expr* ret = comparison(s);
-    CHECK_NULL(ret);
-
-    while (s->type == TOK_SEP) {
-        next_token(s);
-        me_expr* e = comparison(s);
-        CHECK_NULL(e, me_free(ret));
-
-        me_expr* prev = ret;
-        ret = NEW_EXPR(ME_FUNCTION2 | ME_FLAG_PURE, ret, e);
-        CHECK_NULL(ret, me_free(e), me_free(prev));
-
-        ret->function = comma;
-        apply_type_promotion(ret);
-    }
-
-    return ret;
-}
-
-
-#define ME_FUN(...) ((double(*)(__VA_ARGS__))n->function)
-#define M(e) me_eval_scalar(n->parameters[e])
-
-static double me_eval_scalar(const me_expr* n) {
-    if (!n) return NAN;
-
-    switch (TYPE_MASK(n->type)) {
-    case ME_CONSTANT: return n->value;
-    case ME_VARIABLE: return *(const double*)n->bound;
-
-    case ME_FUNCTION0:
-    case ME_FUNCTION1:
-    case ME_FUNCTION2:
-    case ME_FUNCTION3:
-    case ME_FUNCTION4:
-    case ME_FUNCTION5:
-    case ME_FUNCTION6:
-    case ME_FUNCTION7:
-        switch (ARITY(n->type)) {
-        case 0: return ME_FUN(void)();
-        case 1: return ME_FUN(double)(M(0));
-        case 2: return ME_FUN(double, double)(M(0), M(1));
-        case 3: return ME_FUN(double, double, double)(M(0), M(1), M(2));
-        case 4: return ME_FUN(double, double, double, double)(M(0), M(1), M(2), M(3));
-        case 5: return ME_FUN(double, double, double, double, double)(M(0), M(1), M(2), M(3), M(4));
-        case 6: return ME_FUN(double, double, double, double, double, double)(
-                M(0), M(1), M(2), M(3), M(4), M(5));
-        case 7: return ME_FUN(double, double, double, double, double, double, double)(
-                M(0), M(1), M(2), M(3), M(4), M(5), M(6));
-        default: return NAN;
-        }
-
-    case ME_CLOSURE0:
-    case ME_CLOSURE1:
-    case ME_CLOSURE2:
-    case ME_CLOSURE3:
-    case ME_CLOSURE4:
-    case ME_CLOSURE5:
-    case ME_CLOSURE6:
-    case ME_CLOSURE7:
-        switch (ARITY(n->type)) {
-        case 0: return ME_FUN(void*)(n->parameters[0]);
-        case 1: return ME_FUN(void*, double)(n->parameters[1], M(0));
-        case 2: return ME_FUN(void*, double, double)(n->parameters[2], M(0), M(1));
-        case 3: return ME_FUN(void*, double, double, double)(n->parameters[3], M(0), M(1), M(2));
-        case 4: return ME_FUN(void*, double, double, double, double)(n->parameters[4], M(0), M(1), M(2), M(3));
-        case 5: return ME_FUN(void*, double, double, double, double, double)(
-                n->parameters[5], M(0), M(1), M(2), M(3), M(4));
-        case 6: return ME_FUN(void*, double, double, double, double, double, double)(
-                n->parameters[6], M(0), M(1), M(2), M(3), M(4), M(5));
-        case 7: return ME_FUN(void*, double, double, double, double, double, double, double)(
-                n->parameters[7], M(0), M(1), M(2), M(3), M(4), M(5), M(6));
-        default: return NAN;
-        }
-
-    default: return NAN;
-    }
-}
-
-#undef ME_FUN
-#undef M
-
-/* Specialized vector operations for better performance */
-static void vec_add(const double* a, const double* b, double* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = a[i] + b[i];
-    }
-}
-
-static void vec_sub(const double* a, const double* b, double* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = a[i] - b[i];
-    }
-}
-
-static void vec_mul(const double* a, const double* b, double* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = a[i] * b[i];
-    }
-}
-
-static void vec_div(const double* a, const double* b, double* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = a[i] / b[i];
-    }
-}
-
-static void vec_add_scalar(const double* a, double b, double* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = a[i] + b;
-    }
-}
-
-static void vec_mul_scalar(const double* a, double b, double* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = a[i] * b;
-    }
-}
-
-static void vec_pow(const double* a, const double* b, double* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = pow(a[i], b[i]);
-    }
-}
-
-static void vec_pow_scalar(const double* a, double b, double* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = pow(a[i], b);
-    }
-}
-
-static void vec_sqrt(const double* a, double* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = sqrt(a[i]);
-    }
-}
-
-static void vec_sin(const double* a, double* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = sin(a[i]);
-    }
-}
-
-static void vec_cos(const double* a, double* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = cos(a[i]);
-    }
-}
-
-static void vec_negate(const double* a, double* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = -a[i];
-    }
-}
-
-/* ============================================================================
- * FLOAT32 VECTOR OPERATIONS
- * ============================================================================ */
-
-static void vec_add_f32(const float* a, const float* b, float* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = a[i] + b[i];
-    }
-}
-
-static void vec_sub_f32(const float* a, const float* b, float* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = a[i] - b[i];
-    }
-}
-
-static void vec_mul_f32(const float* a, const float* b, float* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = a[i] * b[i];
-    }
-}
-
-static void vec_div_f32(const float* a, const float* b, float* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = a[i] / b[i];
-    }
-}
-
-static void vec_add_scalar_f32(const float* a, float b, float* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = a[i] + b;
-    }
-}
-
-static void vec_mul_scalar_f32(const float* a, float b, float* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = a[i] * b;
-    }
-}
-
-static void vec_pow_f32(const float* a, const float* b, float* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = powf(a[i], b[i]);
-    }
-}
-
-static void vec_pow_scalar_f32(const float* a, float b, float* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = powf(a[i], b);
-    }
-}
-
-static void vec_sqrt_f32(const float* a, float* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = sqrtf(a[i]);
-    }
-}
-
-static void vec_sin_f32(const float* a, float* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = sinf(a[i]);
-    }
-}
-
-static void vec_cos_f32(const float* a, float* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = cosf(a[i]);
-    }
-}
-
-static void vec_negame_f32(const float* a, float* out, int n) {
-    int i;
-#pragma GCC ivdep
-    for (i = 0; i < n; i++) {
-        out[i] = -a[i];
-    }
-}
-
-/* ============================================================================
- * INTEGER VECTOR OPERATIONS (int8_t through uint64_t)
- * ============================================================================ */
-
-/* Macros to generate integer vector operations */
-#define DEFINE_INT_VEC_OPS(SUFFIX, TYPE) \
-static void vec_add_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
-    int i; \
-    _Pragma("GCC ivdep") \
-    for (i = 0; i < n; i++) out[i] = a[i] + b[i]; \
-} \
-static void vec_sub_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
-    int i; \
-    _Pragma("GCC ivdep") \
-    for (i = 0; i < n; i++) out[i] = a[i] - b[i]; \
-} \
-static void vec_mul_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
-    int i; \
-    _Pragma("GCC ivdep") \
-    for (i = 0; i < n; i++) out[i] = a[i] * b[i]; \
-} \
-static void vec_div_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
-    int i; \
-    _Pragma("GCC ivdep") \
-    for (i = 0; i < n; i++) out[i] = (b[i] != 0) ? (a[i] / b[i]) : 0; \
-} \
-static void vec_add_scalar_##SUFFIX(const TYPE *a, TYPE b, TYPE *out, int n) { \
-    int i; \
-    _Pragma("GCC ivdep") \
-    for (i = 0; i < n; i++) out[i] = a[i] + b; \
-} \
-static void vec_mul_scalar_##SUFFIX(const TYPE *a, TYPE b, TYPE *out, int n) { \
-    int i; \
-    _Pragma("GCC ivdep") \
-    for (i = 0; i < n; i++) out[i] = a[i] * b; \
-} \
-static void vec_pow_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
-    int i; \
-    _Pragma("GCC ivdep") \
-    for (i = 0; i < n; i++) out[i] = (TYPE)pow((double)a[i], (double)b[i]); \
-} \
-static void vec_pow_scalar_##SUFFIX(const TYPE *a, TYPE b, TYPE *out, int n) { \
-    int i; \
-    _Pragma("GCC ivdep") \
-    for (i = 0; i < n; i++) out[i] = (TYPE)pow((double)a[i], (double)b); \
-} \
-static void vec_sqrt_##SUFFIX(const TYPE *a, TYPE *out, int n) { \
-    int i; \
-    _Pragma("GCC ivdep") \
-    for (i = 0; i < n; i++) out[i] = (TYPE)sqrt((double)a[i]); \
-} \
-static void vec_negame_##SUFFIX(const TYPE *a, TYPE *out, int n) { \
-    int i; \
-    _Pragma("GCC ivdep") \
-    for (i = 0; i < n; i++) out[i] = -a[i]; \
-} \
-static void vec_and_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
-    int i; \
-    _Pragma("GCC ivdep") \
-    for (i = 0; i < n; i++) out[i] = a[i] & b[i]; \
-} \
-static void vec_or_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
-    int i; \
-    _Pragma("GCC ivdep") \
-    for (i = 0; i < n; i++) out[i] = a[i] | b[i]; \
-} \
-static void vec_xor_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
-    int i; \
-    _Pragma("GCC ivdep") \
-    for (i = 0; i < n; i++) out[i] = a[i] ^ b[i]; \
-} \
-static void vec_not_##SUFFIX(const TYPE *a, TYPE *out, int n) { \
-    int i; \
-    _Pragma("GCC ivdep") \
-    for (i = 0; i < n; i++) out[i] = ~a[i]; \
-} \
-static void vec_shl_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
-    int i; \
-    _Pragma("GCC ivdep") \
-    for (i = 0; i < n; i++) out[i] = a[i] << b[i]; \
-} \
-static void vec_shr_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
-    int i; \
-    _Pragma("GCC ivdep") \
-    for (i = 0; i < n; i++) out[i] = a[i] >> b[i]; \
-}
-
-/* Generate ops for all integer types */
-DEFINE_INT_VEC_OPS(i8, int8_t)
-DEFINE_INT_VEC_OPS(i16, int16_t)
-DEFINE_INT_VEC_OPS(i32, int32_t)
-DEFINE_INT_VEC_OPS(i64, int64_t)
-DEFINE_INT_VEC_OPS(u8, uint8_t)
-DEFINE_INT_VEC_OPS(u16, uint16_t)
-DEFINE_INT_VEC_OPS(u32, uint32_t)
-DEFINE_INT_VEC_OPS(u64, uint64_t)
-
-/* Boolean logical operations */
-static void vec_and_bool(const bool* a, const bool* b, bool* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = a[i] && b[i];
-}
-
-static void vec_or_bool(const bool* a, const bool* b, bool* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = a[i] || b[i];
-}
-
-static void vec_xor_bool(const bool* a, const bool* b, bool* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = a[i] != b[i];
-}
-
-static void vec_not_bool(const bool* a, bool* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = !a[i];
-}
-
-/* Comparison operations - generate for all numeric types */
-/* Note: These return bool arrays, but we'll store them as the same type for simplicity */
-#define DEFINE_COMPARE_OPS(SUFFIX, TYPE) \
-static void vec_cmp_eq_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
-    int i; \
-    IVDEP \
-    for (i = 0; i < n; i++) out[i] = (a[i] == b[i]) ? 1 : 0; \
-} \
-static void vec_cmp_ne_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
-    int i; \
-    IVDEP \
-    for (i = 0; i < n; i++) out[i] = (a[i] != b[i]) ? 1 : 0; \
-} \
-static void vec_cmp_lt_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
-    int i; \
-    IVDEP \
-    for (i = 0; i < n; i++) out[i] = (a[i] < b[i]) ? 1 : 0; \
-} \
-static void vec_cmp_le_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
-    int i; \
-    IVDEP \
-    for (i = 0; i < n; i++) out[i] = (a[i] <= b[i]) ? 1 : 0; \
-} \
-static void vec_cmp_gt_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
-    int i; \
-    IVDEP \
-    for (i = 0; i < n; i++) out[i] = (a[i] > b[i]) ? 1 : 0; \
-} \
-static void vec_cmp_ge_##SUFFIX(const TYPE *a, const TYPE *b, TYPE *out, int n) { \
-    int i; \
-    IVDEP \
-    for (i = 0; i < n; i++) out[i] = (a[i] >= b[i]) ? 1 : 0; \
-}
-
-/* Generate comparison ops for all types */
-DEFINE_COMPARE_OPS(i8, int8_t)
-DEFINE_COMPARE_OPS(i16, int16_t)
-DEFINE_COMPARE_OPS(i32, int32_t)
-DEFINE_COMPARE_OPS(i64, int64_t)
-DEFINE_COMPARE_OPS(u8, uint8_t)
-DEFINE_COMPARE_OPS(u16, uint16_t)
-DEFINE_COMPARE_OPS(u32, uint32_t)
-DEFINE_COMPARE_OPS(u64, uint64_t)
-DEFINE_COMPARE_OPS(f32, float)
-DEFINE_COMPARE_OPS(f64, double)
-
-/* Complex operations */
-static void vec_add_c64(const float _Complex* a, const float _Complex* b, float _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = add_c64(a[i], b[i]);
-}
-
-static void vec_sub_c64(const float _Complex* a, const float _Complex* b, float _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = sub_c64(a[i], b[i]);
-}
-
-static void vec_mul_c64(const float _Complex* a, const float _Complex* b, float _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = mul_c64(a[i], b[i]);
-}
-
-static void vec_div_c64(const float _Complex* a, const float _Complex* b, float _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = div_c64(a[i], b[i]);
-}
-
-static void vec_add_scalar_c64(const float _Complex* a, float _Complex b, float _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = add_c64(a[i], b);
-}
-
-static void vec_mul_scalar_c64(const float _Complex* a, float _Complex b, float _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = mul_c64(a[i], b);
-}
-
-static void vec_pow_c64(const float _Complex* a, const float _Complex* b, float _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = me_cpowf(a[i], b[i]);
-}
-
-static void vec_pow_scalar_c64(const float _Complex* a, float _Complex b, float _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = me_cpowf(a[i], b);
-}
-
-static void vec_sqrt_c64(const float _Complex* a, float _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = me_csqrtf(a[i]);
-}
-
-static void vec_negame_c64(const float _Complex* a, float _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = neg_c64(a[i]);
-}
-
-static void vec_conj_c64(const float _Complex* a, float _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = me_conjf(a[i]);
-}
-
-static void vec_imag_c64(const float _Complex* a, float* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = me_cimagf(a[i]);
-}
-
-static void vec_add_c128(const double _Complex* a, const double _Complex* b, double _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = add_c128(a[i], b[i]);
-}
-
-static void vec_sub_c128(const double _Complex* a, const double _Complex* b, double _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = sub_c128(a[i], b[i]);
-}
-
-static void vec_mul_c128(const double _Complex* a, const double _Complex* b, double _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = mul_c128(a[i], b[i]);
-}
-
-static void vec_div_c128(const double _Complex* a, const double _Complex* b, double _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = div_c128(a[i], b[i]);
-}
-
-static void vec_add_scalar_c128(const double _Complex* a, double _Complex b, double _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = add_c128(a[i], b);
-}
-
-static void vec_mul_scalar_c128(const double _Complex* a, double _Complex b, double _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = mul_c128(a[i], b);
-}
-
-static void vec_pow_c128(const double _Complex* a, const double _Complex* b, double _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = me_cpow(a[i], b[i]);
-}
-
-static void vec_pow_scalar_c128(const double _Complex* a, double _Complex b, double _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = me_cpow(a[i], b);
-}
-
-static void vec_sqrt_c128(const double _Complex* a, double _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = me_csqrt(a[i]);
-}
-
-static void vec_negame_c128(const double _Complex* a, double _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = neg_c128(a[i]);
-}
-
-static void vec_conj_c128(const double _Complex* a, double _Complex* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = me_conj(a[i]);
-}
-
-static void vec_imag_c128(const double _Complex* a, double* out, int n) {
-    int i;
-    IVDEP
-    for (i = 0; i < n; i++) out[i] = me_cimag(a[i]);
-}
-
-/* ============================================================================
- * TYPE CONVERSION FUNCTIONS
- * ============================================================================
- * These functions convert between different data types for mixed-type expressions.
- */
-
-#define DEFINE_VEC_CONVERT(FROM_SUFFIX, TO_SUFFIX, FROM_TYPE, TO_TYPE) \
-static void vec_convert_##FROM_SUFFIX##_to_##TO_SUFFIX(const FROM_TYPE *in, TO_TYPE *out, int n) { \
-    int i; \
-    IVDEP \
-    for (i = 0; i < n; i++) out[i] = TO_TYPE_##TO_SUFFIX(in[i]); \
-}
-
-
-/* Generate all conversion functions */
-/* Conversions FROM bool TO other types */
-DEFINE_VEC_CONVERT(bool, i8, bool, int8_t)
-DEFINE_VEC_CONVERT(bool, i16, bool, int16_t)
-DEFINE_VEC_CONVERT(bool, i32, bool, int32_t)
-DEFINE_VEC_CONVERT(bool, i64, bool, int64_t)
-DEFINE_VEC_CONVERT(bool, u8, bool, uint8_t)
-DEFINE_VEC_CONVERT(bool, u16, bool, uint16_t)
-DEFINE_VEC_CONVERT(bool, u32, bool, uint32_t)
-DEFINE_VEC_CONVERT(bool, u64, bool, uint64_t)
-DEFINE_VEC_CONVERT(bool, f32, bool, float)
-DEFINE_VEC_CONVERT(bool, f64, bool, double)
-
-/* Conversions FROM other types TO bool */
-DEFINE_VEC_CONVERT(i8, bool, int8_t, bool)
-DEFINE_VEC_CONVERT(i16, bool, int16_t, bool)
-DEFINE_VEC_CONVERT(i32, bool, int32_t, bool)
-DEFINE_VEC_CONVERT(i64, bool, int64_t, bool)
-DEFINE_VEC_CONVERT(u8, bool, uint8_t, bool)
-DEFINE_VEC_CONVERT(u16, bool, uint16_t, bool)
-DEFINE_VEC_CONVERT(u32, bool, uint32_t, bool)
-DEFINE_VEC_CONVERT(u64, bool, uint64_t, bool)
-DEFINE_VEC_CONVERT(f32, bool, float, bool)
-DEFINE_VEC_CONVERT(f64, bool, double, bool)
-DEFINE_VEC_CONVERT(f64, f32, double, float)
-
-DEFINE_VEC_CONVERT(i8, i16, int8_t, int16_t)
-DEFINE_VEC_CONVERT(i8, i32, int8_t, int32_t)
-DEFINE_VEC_CONVERT(i8, i64, int8_t, int64_t)
-DEFINE_VEC_CONVERT(i8, f32, int8_t, float)
-DEFINE_VEC_CONVERT(i8, f64, int8_t, double)
-
-DEFINE_VEC_CONVERT(i16, i32, int16_t, int32_t)
-DEFINE_VEC_CONVERT(i16, i64, int16_t, int64_t)
-DEFINE_VEC_CONVERT(i16, f32, int16_t, float)
-DEFINE_VEC_CONVERT(i16, f64, int16_t, double)
-
-DEFINE_VEC_CONVERT(i32, i64, int32_t, int64_t)
-DEFINE_VEC_CONVERT(i32, f32, int32_t, float)
-DEFINE_VEC_CONVERT(i32, f64, int32_t, double)
-
-DEFINE_VEC_CONVERT(i64, f64, int64_t, double)
-
-DEFINE_VEC_CONVERT(u8, u16, uint8_t, uint16_t)
-DEFINE_VEC_CONVERT(u8, u32, uint8_t, uint32_t)
-DEFINE_VEC_CONVERT(u8, u64, uint8_t, uint64_t)
-DEFINE_VEC_CONVERT(u8, i16, uint8_t, int16_t)
-DEFINE_VEC_CONVERT(u8, i32, uint8_t, int32_t)
-DEFINE_VEC_CONVERT(u8, i64, uint8_t, int64_t)
-DEFINE_VEC_CONVERT(u8, f32, uint8_t, float)
-DEFINE_VEC_CONVERT(u8, f64, uint8_t, double)
-
-DEFINE_VEC_CONVERT(u16, u32, uint16_t, uint32_t)
-DEFINE_VEC_CONVERT(u16, u64, uint16_t, uint64_t)
-DEFINE_VEC_CONVERT(u16, i32, uint16_t, int32_t)
-DEFINE_VEC_CONVERT(u16, i64, uint16_t, int64_t)
-DEFINE_VEC_CONVERT(u16, f32, uint16_t, float)
-DEFINE_VEC_CONVERT(u16, f64, uint16_t, double)
-
-DEFINE_VEC_CONVERT(u32, u64, uint32_t, uint64_t)
-DEFINE_VEC_CONVERT(u32, i64, uint32_t, int64_t)
-DEFINE_VEC_CONVERT(u32, f64, uint32_t, double)
-
-DEFINE_VEC_CONVERT(u64, f64, uint64_t, double)
-
-DEFINE_VEC_CONVERT(f32, f64, float, double)
-DEFINE_VEC_CONVERT(f32, c64, float, float _Complex)
-DEFINE_VEC_CONVERT(f32, c128, float, double _Complex)
-
-DEFINE_VEC_CONVERT(f64, c128, double, double _Complex)
-
-DEFINE_VEC_CONVERT(c64, c128, float _Complex, double _Complex)
-
-/* Function to get conversion function pointer */
-typedef void (*convert_func_t)(const void*, void*, int);
-
-static convert_func_t get_convert_func(me_dtype from, me_dtype to) {
-    /* Return conversion function for a specific type pair */
-    if (from == to) return NULL; // No conversion needed
-
-#define CONV_CASE(FROM, TO, FROM_S, TO_S) \
-        if (from == FROM && to == TO) return (convert_func_t)vec_convert_##FROM_S##_to_##TO_S;
-
-    CONV_CASE(ME_BOOL, ME_INT8, bool, i8)
-    CONV_CASE(ME_BOOL, ME_INT16, bool, i16)
-    CONV_CASE(ME_BOOL, ME_INT32, bool, i32)
-    CONV_CASE(ME_BOOL, ME_INT64, bool, i64)
-    CONV_CASE(ME_BOOL, ME_UINT8, bool, u8)
-    CONV_CASE(ME_BOOL, ME_UINT16, bool, u16)
-    CONV_CASE(ME_BOOL, ME_UINT32, bool, u32)
-    CONV_CASE(ME_BOOL, ME_UINT64, bool, u64)
-    CONV_CASE(ME_BOOL, ME_FLOAT32, bool, f32)
-    CONV_CASE(ME_BOOL, ME_FLOAT64, bool, f64)
-
-    CONV_CASE(ME_INT8, ME_BOOL, i8, bool)
-    CONV_CASE(ME_INT16, ME_BOOL, i16, bool)
-    CONV_CASE(ME_INT32, ME_BOOL, i32, bool)
-    CONV_CASE(ME_INT64, ME_BOOL, i64, bool)
-    CONV_CASE(ME_UINT8, ME_BOOL, u8, bool)
-    CONV_CASE(ME_UINT16, ME_BOOL, u16, bool)
-    CONV_CASE(ME_UINT32, ME_BOOL, u32, bool)
-    CONV_CASE(ME_UINT64, ME_BOOL, u64, bool)
-    CONV_CASE(ME_FLOAT32, ME_BOOL, f32, bool)
-    CONV_CASE(ME_FLOAT64, ME_BOOL, f64, bool)
-
-    CONV_CASE(ME_INT8, ME_INT16, i8, i16)
-    CONV_CASE(ME_INT8, ME_INT32, i8, i32)
-    CONV_CASE(ME_INT8, ME_INT64, i8, i64)
-    CONV_CASE(ME_INT8, ME_FLOAT32, i8, f32)
-    CONV_CASE(ME_INT8, ME_FLOAT64, i8, f64)
-
-    CONV_CASE(ME_INT16, ME_INT32, i16, i32)
-    CONV_CASE(ME_INT16, ME_INT64, i16, i64)
-    CONV_CASE(ME_INT16, ME_FLOAT32, i16, f32)
-    CONV_CASE(ME_INT16, ME_FLOAT64, i16, f64)
-
-    CONV_CASE(ME_INT32, ME_INT64, i32, i64)
-    CONV_CASE(ME_INT32, ME_FLOAT32, i32, f32)
-    CONV_CASE(ME_INT32, ME_FLOAT64, i32, f64)
-
-    CONV_CASE(ME_INT64, ME_FLOAT64, i64, f64)
-
-    CONV_CASE(ME_UINT8, ME_UINT16, u8, u16)
-    CONV_CASE(ME_UINT8, ME_UINT32, u8, u32)
-    CONV_CASE(ME_UINT8, ME_UINT64, u8, u64)
-    CONV_CASE(ME_UINT8, ME_INT16, u8, i16)
-    CONV_CASE(ME_UINT8, ME_INT32, u8, i32)
-    CONV_CASE(ME_UINT8, ME_INT64, u8, i64)
-    CONV_CASE(ME_UINT8, ME_FLOAT32, u8, f32)
-    CONV_CASE(ME_UINT8, ME_FLOAT64, u8, f64)
-
-    CONV_CASE(ME_UINT16, ME_UINT32, u16, u32)
-    CONV_CASE(ME_UINT16, ME_UINT64, u16, u64)
-    CONV_CASE(ME_UINT16, ME_INT32, u16, i32)
-    CONV_CASE(ME_UINT16, ME_INT64, u16, i64)
-    CONV_CASE(ME_UINT16, ME_FLOAT32, u16, f32)
-    CONV_CASE(ME_UINT16, ME_FLOAT64, u16, f64)
-
-    CONV_CASE(ME_UINT32, ME_UINT64, u32, u64)
-    CONV_CASE(ME_UINT32, ME_INT64, u32, i64)
-    CONV_CASE(ME_UINT32, ME_FLOAT64, u32, f64)
-
-    CONV_CASE(ME_UINT64, ME_FLOAT64, u64, f64)
-
-    CONV_CASE(ME_FLOAT32, ME_FLOAT64, f32, f64)
-    CONV_CASE(ME_FLOAT32, ME_COMPLEX64, f32, c64)
-    CONV_CASE(ME_FLOAT32, ME_COMPLEX128, f32, c128)
-
-    CONV_CASE(ME_FLOAT64, ME_FLOAT32, f64, f32)
-    CONV_CASE(ME_FLOAT64, ME_COMPLEX128, f64, c128)
-
-    CONV_CASE(ME_COMPLEX64, ME_COMPLEX128, c64, c128)
-
-#undef CONV_CASE
-
-    return NULL; // Unsupported conversion
-}
-
-
-typedef double (*me_fun1)(double);
-
-typedef float (*me_fun1_f32)(float);
-
-/* Template for type-specific evaluator */
-#define DEFINE_ME_EVAL(SUFFIX, TYPE, VEC_ADD, VEC_SUB, VEC_MUL, VEC_DIV, VEC_POW, \
-    VEC_ADD_SCALAR, VEC_MUL_SCALAR, VEC_POW_SCALAR, \
-    VEC_SQRT, VEC_SIN, VEC_COS, VEC_NEGATE, \
-    SQRT_FUNC, SIN_FUNC, COS_FUNC, EXP_FUNC, LOG_FUNC, FABS_FUNC, POW_FUNC, \
-    VEC_CONJ) \
-static void me_eval_##SUFFIX(const me_expr *n) { \
-    if (!n || !n->output) return; \
-    if (is_reduction_node(n)) { \
-        eval_reduction(n, n->nitems); \
-        return; \
-    } \
-    if (n->nitems <= 0) return; \
-    \
-    int i, j; \
-    const int arity = ARITY(n->type); \
-    TYPE *output = (TYPE*)n->output; \
-    \
-    switch(TYPE_MASK(n->type)) { \
-        case ME_CONSTANT: \
-            { \
-                TYPE val = TO_TYPE_##SUFFIX(n->value); \
-                for (i = 0; i < n->nitems; i++) { \
-                    output[i] = val; \
-                } \
-            } \
-            break; \
-            \
-        case ME_VARIABLE: \
-            { \
-                const TYPE *src = (const TYPE*)n->bound; \
-                for (i = 0; i < n->nitems; i++) { \
-                    output[i] = src[i]; \
-                } \
-            } \
-            break; \
-        \
-        case ME_FUNCTION0: case ME_FUNCTION1: case ME_FUNCTION2: case ME_FUNCTION3: \
-        case ME_FUNCTION4: case ME_FUNCTION5: case ME_FUNCTION6: case ME_FUNCTION7: \
-        case ME_CLOSURE0: case ME_CLOSURE1: case ME_CLOSURE2: case ME_CLOSURE3: \
-        case ME_CLOSURE4: case ME_CLOSURE5: case ME_CLOSURE6: case ME_CLOSURE7: \
-            for (j = 0; j < arity; j++) { \
-                me_expr *param = (me_expr*)n->parameters[j]; \
-                if (param->type != ME_CONSTANT && param->type != ME_VARIABLE) { \
-                    if (!param->output) { \
-                        param->output = malloc(n->nitems * sizeof(TYPE)); \
-                        param->nitems = n->nitems; \
-                        param->dtype = n->dtype; \
-                    } \
-                    me_eval_##SUFFIX(param); \
-                } \
-            } \
-            \
-            if (arity == 2 && IS_FUNCTION(n->type)) { \
-                me_expr *left = (me_expr*)n->parameters[0]; \
-                me_expr *right = (me_expr*)n->parameters[1]; \
-                \
-                const TYPE *ldata = (left->type == ME_CONSTANT) ? NULL : \
-                                   (left->type == ME_VARIABLE) ? (const TYPE*)left->bound : (const TYPE*)left->output; \
-                const TYPE *rdata = (right->type == ME_CONSTANT) ? NULL : \
-                                    (right->type == ME_VARIABLE) ? (const TYPE*)right->bound : (const TYPE*)right->output; \
-                \
-                me_fun2 func = (me_fun2)n->function; \
-                \
-                if (func == add) { \
-                    if (ldata && rdata) { \
-                        VEC_ADD(ldata, rdata, output, n->nitems); \
-                    } else if (ldata && right->type == ME_CONSTANT) { \
-                        VEC_ADD_SCALAR(ldata, TO_TYPE_##SUFFIX(right->value), output, n->nitems); \
-                    } else if (left->type == ME_CONSTANT && rdata) { \
-                        VEC_ADD_SCALAR(rdata, TO_TYPE_##SUFFIX(left->value), output, n->nitems); \
-                    } else { \
-                        goto general_case_binary_##SUFFIX; \
-                    } \
-                } else if (func == sub) { \
-                    if (ldata && rdata) { \
-                        VEC_SUB(ldata, rdata, output, n->nitems); \
-                    } else { \
-                        goto general_case_binary_##SUFFIX; \
-                    } \
-                } else if (func == mul) { \
-                    if (ldata && rdata) { \
-                        VEC_MUL(ldata, rdata, output, n->nitems); \
-                    } else if (ldata && right->type == ME_CONSTANT) { \
-                        VEC_MUL_SCALAR(ldata, TO_TYPE_##SUFFIX(right->value), output, n->nitems); \
-                    } else if (left->type == ME_CONSTANT && rdata) { \
-                        VEC_MUL_SCALAR(rdata, TO_TYPE_##SUFFIX(left->value), output, n->nitems); \
-                    } else { \
-                        goto general_case_binary_##SUFFIX; \
-                    } \
-                } else if (func == divide) { \
-                    if (ldata && rdata) { \
-                        VEC_DIV(ldata, rdata, output, n->nitems); \
-                    } else { \
-                        goto general_case_binary_##SUFFIX; \
-                    } \
-                } else if (func == (me_fun2)pow) { \
-                    if (ldata && rdata) { \
-                        VEC_POW(ldata, rdata, output, n->nitems); \
-                    } else if (ldata && right->type == ME_CONSTANT) { \
-                        VEC_POW_SCALAR(ldata, TO_TYPE_##SUFFIX(right->value), output, n->nitems); \
-                    } else { \
-                        goto general_case_binary_##SUFFIX; \
-                    } \
-                } else { \
-                    general_case_binary_##SUFFIX: \
-                    for (i = 0; i < n->nitems; i++) { \
-                        double a = (left->type == ME_CONSTANT) ? left->value : \
-                                  FROM_TYPE_##SUFFIX(ldata[i]); \
-                        double b = (right->type == ME_CONSTANT) ? right->value : \
-                                  FROM_TYPE_##SUFFIX(rdata[i]); \
-                        output[i] = TO_TYPE_##SUFFIX(func(a, b)); \
-                    } \
-                } \
-            } else if (arity == 3 && IS_FUNCTION(n->type) && n->function == (void*)where_scalar) { \
-                /* where(cond, x, y) – NumPy-like semantics: cond != 0 selects x else y */ \
-                me_expr *cond = (me_expr*)n->parameters[0]; \
-                me_expr *xexpr = (me_expr*)n->parameters[1]; \
-                me_expr *yexpr = (me_expr*)n->parameters[2]; \
-                \
-                const TYPE *cdata = (const TYPE*)((cond->type == ME_VARIABLE) ? cond->bound : cond->output); \
-                const TYPE *xdata = (const TYPE*)((xexpr->type == ME_VARIABLE) ? xexpr->bound : xexpr->output); \
-                const TYPE *ydata = (const TYPE*)((yexpr->type == ME_VARIABLE) ? yexpr->bound : yexpr->output); \
-                \
-                for (i = 0; i < n->nitems; i++) { \
-                    output[i] = (IS_NONZERO_##SUFFIX(cdata[i])) ? xdata[i] : ydata[i]; \
-                } \
-            } \
-            else if (arity == 1 && IS_FUNCTION(n->type)) { \
-                me_expr *arg = (me_expr*)n->parameters[0]; \
-                \
-                const TYPE *adata = (arg->type == ME_CONSTANT) ? NULL : \
-                                   (arg->type == ME_VARIABLE) ? (const TYPE*)arg->bound : (const TYPE*)arg->output; \
-                \
-                const void *func_ptr = n->function; \
-                \
-                if (func_ptr == (void*)sqrt) { \
-                    if (adata) VEC_SQRT(adata, output, n->nitems); \
-                } else if (func_ptr == (void*)sin) { \
-                    if (adata) VEC_SIN(adata, output, n->nitems); \
-                } else if (func_ptr == (void*)cos) { \
-                    if (adata) VEC_COS(adata, output, n->nitems); \
-                } else if (func_ptr == (void*)negate) { \
-                    if (adata) VEC_NEGATE(adata, output, n->nitems); \
-                } else if (func_ptr == (void*)imag_wrapper) { \
-                    /* NumPy semantics: imag(real) == 0 with same dtype */ \
-                    if (adata) { \
-                        for (i = 0; i < n->nitems; i++) { \
-                            output[i] = TO_TYPE_##SUFFIX(0); \
-                        } \
-                    } \
-                } else if (func_ptr == (void*)real_wrapper) { \
-                    /* NumPy semantics: real(real) == real with same dtype */ \
-                    if (adata) { \
-                        for (i = 0; i < n->nitems; i++) { \
-                            output[i] = adata[i]; \
-                        } \
-                    } \
-                } else if (func_ptr == (void*)conj_wrapper) { \
-                    if (adata) VEC_CONJ(adata, output, n->nitems); \
-                } else { \
-                    me_fun1 func = (me_fun1)func_ptr; \
-                    if (arg->type == ME_CONSTANT) { \
-                        TYPE val = TO_TYPE_##SUFFIX(func(arg->value)); \
-                        for (i = 0; i < n->nitems; i++) { \
-                            output[i] = val; \
-                        } \
-                    } else { \
-                        for (i = 0; i < n->nitems; i++) { \
-                            output[i] = TO_TYPE_##SUFFIX(func(FROM_TYPE_##SUFFIX(adata[i]))); \
-                        } \
-                    } \
-                } \
-            } \
-            else { \
-                for (i = 0; i < n->nitems; i++) { \
-                    double args[7]; \
-                    \
-                    for (j = 0; j < arity; j++) { \
-                        me_expr *param = (me_expr*)n->parameters[j]; \
-                        const TYPE *pdata = (const TYPE*)((param->type == ME_VARIABLE) ? param->bound : param->output); \
-                        if (param->type == ME_CONSTANT) { \
-                            args[j] = param->value; \
-                        } else { \
-                            args[j] = FROM_TYPE_##SUFFIX(pdata[i]); \
-                        } \
-                    } \
-                    \
-                    if (IS_FUNCTION(n->type)) { \
-                        switch(arity) { \
-                            case 0: output[i] = TO_TYPE_##SUFFIX(((double(*)(void))n->function)()); break; \
-                            case 3: output[i] = TO_TYPE_##SUFFIX(((double(*)(double,double,double))n->function)(args[0], args[1], args[2])); break; \
-                            case 4: output[i] = TO_TYPE_##SUFFIX(((double(*)(double,double,double,double))n->function)(args[0], args[1], args[2], args[3])); break; \
-                            case 5: output[i] = TO_TYPE_##SUFFIX(((double(*)(double,double,double,double,double))n->function)(args[0], args[1], args[2], args[3], args[4])); break; \
-                            case 6: output[i] = TO_TYPE_##SUFFIX(((double(*)(double,double,double,double,double,double))n->function)(args[0], args[1], args[2], args[3], args[4], args[5])); break; \
-                            case 7: output[i] = TO_TYPE_##SUFFIX(((double(*)(double,double,double,double,double,double,double))n->function)(args[0], args[1], args[2], args[3], args[4], args[5], args[6])); break; \
-                        } \
-                    } else if (IS_CLOSURE(n->type)) { \
-                        void *context = n->parameters[arity]; \
-                        switch(arity) { \
-                            case 0: output[i] = TO_TYPE_##SUFFIX(((double(*)(void*))n->function)(context)); break; \
-                            case 1: output[i] = TO_TYPE_##SUFFIX(((double(*)(void*,double))n->function)(context, args[0])); break; \
-                            case 2: output[i] = TO_TYPE_##SUFFIX(((double(*)(void*,double,double))n->function)(context, args[0], args[1])); break; \
-                            case 3: output[i] = TO_TYPE_##SUFFIX(((double(*)(void*,double,double,double))n->function)(context, args[0], args[1], args[2])); break; \
-                            case 4: output[i] = TO_TYPE_##SUFFIX(((double(*)(void*,double,double,double,double))n->function)(context, args[0], args[1], args[2], args[3])); break; \
-                            case 5: output[i] = TO_TYPE_##SUFFIX(((double(*)(void*,double,double,double,double,double))n->function)(context, args[0], args[1], args[2], args[3], args[4])); break; \
-                            case 6: output[i] = TO_TYPE_##SUFFIX(((double(*)(void*,double,double,double,double,double,double))n->function)(context, args[0], args[1], args[2], args[3], args[4], args[5])); break; \
-                            case 7: output[i] = TO_TYPE_##SUFFIX(((double(*)(void*,double,double,double,double,double,double,double))n->function)(context, args[0], args[1], args[2], args[3], args[4], args[5], args[6])); break; \
-                        } \
-                    } \
-                } \
-            } \
-            break; \
-        \
-        default: \
-            for (i = 0; i < n->nitems; i++) { \
-                output[i] = TO_TYPE_##SUFFIX(NAN); \
-            } \
-            break; \
-    } \
-}
-
-/* Vector operation macros - expand to inline loops */
-#define vec_add(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
-#define vec_sub(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
-#define vec_mul(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
-#define vec_div(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
-#define vec_pow(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = pow((a)[_i], (b)[_i]); } while(0)
-#define vec_add_scalar(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
-#define vec_mul_scalar(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
-#define vec_pow_scalar(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = pow((a)[_i], (b)); } while(0)
-#define vec_sqrt(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = sqrt((a)[_i]); } while(0)
-#define vec_sin(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = sin((a)[_i]); } while(0)
-#define vec_cos(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cos((a)[_i]); } while(0)
-#define vec_negate(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
-#define vec_copy(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i]; } while(0)
-
-#define vec_add_f32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
-#define vec_sub_f32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
-#define vec_mul_f32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
-#define vec_div_f32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
-#define vec_pow_f32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = powf((a)[_i], (b)[_i]); } while(0)
-#define vec_add_scalar_f32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
-#define vec_mul_scalar_f32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
-#define vec_pow_scalar_f32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = powf((a)[_i], (b)); } while(0)
-#define vec_sqrt_f32(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = sqrtf((a)[_i]); } while(0)
-#define vec_sin_f32(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = sinf((a)[_i]); } while(0)
-#define vec_cos_f32(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cosf((a)[_i]); } while(0)
-#define vec_negame_f32(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
-
-#define vec_add_i8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
-#define vec_sub_i8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
-#define vec_mul_i8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
-#define vec_div_i8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
-#define vec_pow_i8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int8_t)pow((a)[_i], (b)[_i]); } while(0)
-#define vec_add_scalar_i8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
-#define vec_mul_scalar_i8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
-#define vec_pow_scalar_i8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int8_t)pow((a)[_i], (b)); } while(0)
-#define vec_sqrt_i8(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int8_t)sqrt((a)[_i]); } while(0)
-#define vec_negame_i8(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
-
-#define vec_add_i16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
-#define vec_sub_i16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
-#define vec_mul_i16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
-#define vec_div_i16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
-#define vec_pow_i16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int16_t)pow((a)[_i], (b)[_i]); } while(0)
-#define vec_add_scalar_i16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
-#define vec_mul_scalar_i16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
-#define vec_pow_scalar_i16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int16_t)pow((a)[_i], (b)); } while(0)
-#define vec_sqrt_i16(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int16_t)sqrt((a)[_i]); } while(0)
-#define vec_negame_i16(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
-
-#define vec_add_i32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
-#define vec_sub_i32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
-#define vec_mul_i32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
-#define vec_div_i32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
-#define vec_pow_i32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int32_t)pow((a)[_i], (b)[_i]); } while(0)
-#define vec_add_scalar_i32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
-#define vec_mul_scalar_i32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
-#define vec_pow_scalar_i32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int32_t)pow((a)[_i], (b)); } while(0)
-#define vec_sqrt_i32(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int32_t)sqrt((a)[_i]); } while(0)
-#define vec_negame_i32(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
-
-#define vec_add_i64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
-#define vec_sub_i64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
-#define vec_mul_i64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
-#define vec_div_i64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
-#define vec_pow_i64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int64_t)pow((a)[_i], (b)[_i]); } while(0)
-#define vec_add_scalar_i64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
-#define vec_mul_scalar_i64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
-#define vec_pow_scalar_i64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int64_t)pow((a)[_i], (b)); } while(0)
-#define vec_sqrt_i64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (int64_t)sqrt((a)[_i]); } while(0)
-#define vec_negame_i64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
-
-#define vec_add_u8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
-#define vec_sub_u8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
-#define vec_mul_u8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
-#define vec_div_u8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
-#define vec_pow_u8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint8_t)pow((a)[_i], (b)[_i]); } while(0)
-#define vec_add_scalar_u8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
-#define vec_mul_scalar_u8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
-#define vec_pow_scalar_u8(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint8_t)pow((a)[_i], (b)); } while(0)
-#define vec_sqrt_u8(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint8_t)sqrt((a)[_i]); } while(0)
-#define vec_negame_u8(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
-
-#define vec_add_u16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
-#define vec_sub_u16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
-#define vec_mul_u16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
-#define vec_div_u16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
-#define vec_pow_u16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint16_t)pow((a)[_i], (b)[_i]); } while(0)
-#define vec_add_scalar_u16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
-#define vec_mul_scalar_u16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
-#define vec_pow_scalar_u16(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint16_t)pow((a)[_i], (b)); } while(0)
-#define vec_sqrt_u16(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint16_t)sqrt((a)[_i]); } while(0)
-#define vec_negame_u16(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
-
-#define vec_add_u32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
-#define vec_sub_u32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
-#define vec_mul_u32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
-#define vec_div_u32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
-#define vec_pow_u32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint32_t)pow((a)[_i], (b)[_i]); } while(0)
-#define vec_add_scalar_u32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
-#define vec_mul_scalar_u32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
-#define vec_pow_scalar_u32(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint32_t)pow((a)[_i], (b)); } while(0)
-#define vec_sqrt_u32(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint32_t)sqrt((a)[_i]); } while(0)
-#define vec_negame_u32(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
-
-#define vec_add_u64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
-#define vec_sub_u64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
-#define vec_mul_u64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
-#define vec_div_u64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
-#define vec_pow_u64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint64_t)pow((a)[_i], (b)[_i]); } while(0)
-#define vec_add_scalar_u64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
-#define vec_mul_scalar_u64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
-#define vec_pow_scalar_u64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint64_t)pow((a)[_i], (b)); } while(0)
-#define vec_sqrt_u64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (uint64_t)sqrt((a)[_i]); } while(0)
-#define vec_negame_u64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
-
-#if defined(_MSC_VER) && !defined(__clang__)
-#define vec_add_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = add_c64((a)[_i], (b)[_i]); } while(0)
-#define vec_sub_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = sub_c64((a)[_i], (b)[_i]); } while(0)
-#define vec_mul_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = mul_c64((a)[_i], (b)[_i]); } while(0)
-#define vec_div_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = div_c64((a)[_i], (b)[_i]); } while(0)
-#define vec_pow_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cpowf((a)[_i], (b)[_i]); } while(0)
-#define vec_add_scalar_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = add_c64((a)[_i], (b)); } while(0)
-#define vec_mul_scalar_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = mul_c64((a)[_i], (b)); } while(0)
-#define vec_pow_scalar_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cpowf((a)[_i], (b)); } while(0)
-#define vec_sqrt_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = csqrtf((a)[_i]); } while(0)
-#define vec_negame_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = neg_c64((a)[_i]); } while(0)
-#define vec_conj_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = conjf((a)[_i]); } while(0)
-#define vec_imag_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_cimagf((a)[_i]); } while(0)
-#define vec_real_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_crealf((a)[_i]); } while(0)
-#define vec_conj_noop(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i]; } while(0)
-
-#define vec_add_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = add_c128((a)[_i], (b)[_i]); } while(0)
-#define vec_sub_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = sub_c128((a)[_i], (b)[_i]); } while(0)
-#define vec_mul_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = mul_c128((a)[_i], (b)[_i]); } while(0)
-#define vec_div_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = div_c128((a)[_i], (b)[_i]); } while(0)
-#define vec_pow_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cpow((a)[_i], (b)[_i]); } while(0)
-#define vec_add_scalar_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = add_c128((a)[_i], (b)); } while(0)
-#define vec_mul_scalar_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = mul_c128((a)[_i], (b)); } while(0)
-#define vec_pow_scalar_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = cpow((a)[_i], (b)); } while(0)
-#define vec_sqrt_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = csqrt((a)[_i]); } while(0)
-#define vec_negame_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = neg_c128((a)[_i]); } while(0)
-#define vec_conj_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = conj((a)[_i]); } while(0)
-#define vec_imag_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_cimag((a)[_i]); } while(0)
-#define vec_real_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_creal((a)[_i]); } while(0)
-#else
-#define vec_add_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
-#define vec_sub_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
-#define vec_mul_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
-#define vec_div_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
-#define vec_pow_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_cpowf((a)[_i], (b)[_i]); } while(0)
-#define vec_add_scalar_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
-#define vec_mul_scalar_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
-#define vec_pow_scalar_c64(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_cpowf((a)[_i], (b)); } while(0)
-#define vec_sqrt_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_csqrtf((a)[_i]); } while(0)
-#define vec_negame_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
-#define vec_conj_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_conjf((a)[_i]); } while(0)
-#define vec_imag_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_cimagf((a)[_i]); } while(0)
-#define vec_real_c64(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_crealf((a)[_i]); } while(0)
-#define vec_conj_noop(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i]; } while(0)
-
-#define vec_add_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b)[_i]; } while(0)
-#define vec_sub_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] - (b)[_i]; } while(0)
-#define vec_mul_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b)[_i]; } while(0)
-#define vec_div_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] / (b)[_i]; } while(0)
-#define vec_pow_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_cpow((a)[_i], (b)[_i]); } while(0)
-#define vec_add_scalar_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] + (b); } while(0)
-#define vec_mul_scalar_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = (a)[_i] * (b); } while(0)
-#define vec_pow_scalar_c128(a, b, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_cpow((a)[_i], (b)); } while(0)
-#define vec_sqrt_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_csqrt((a)[_i]); } while(0)
-#define vec_negame_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = -(a)[_i]; } while(0)
-#define vec_conj_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_conj((a)[_i]); } while(0)
-#define vec_imag_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_cimag((a)[_i]); } while(0)
-#define vec_real_c128(a, out, n) do { for (int _i = 0; _i < (n); _i++) (out)[_i] = me_creal((a)[_i]); } while(0)
-#endif
-
-/* Generate float32 evaluator */
-DEFINE_ME_EVAL(f32, float,
-               vec_add_f32, vec_sub_f32, vec_mul_f32, vec_div_f32, vec_pow_f32,
-               vec_add_scalar_f32, vec_mul_scalar_f32, vec_pow_scalar_f32,
-               vec_sqrt_f32, vec_sin_f32, vec_cos_f32, vec_negame_f32,
-               sqrtf, sinf, cosf, expf, logf, fabsf, powf,
-               vec_copy)
-
-/* Generate float64 (double) evaluator */
-DEFINE_ME_EVAL(f64, double,
-               vec_add, vec_sub, vec_mul, vec_div, vec_pow,
-               vec_add_scalar, vec_mul_scalar, vec_pow_scalar,
-               vec_sqrt, vec_sin, vec_cos, vec_negate,
-               sqrt, sin, cos, exp, log, fabs, pow,
-               vec_copy)
-
-/* Generate integer evaluators - sin/cos cast to double and back */
-DEFINE_ME_EVAL(i8, int8_t,
-               vec_add_i8, vec_sub_i8, vec_mul_i8, vec_div_i8, vec_pow_i8,
-               vec_add_scalar_i8, vec_mul_scalar_i8, vec_pow_scalar_i8,
-               vec_sqrt_i8, vec_sqrt_i8, vec_sqrt_i8, vec_negame_i8,
-               sqrt, sin, cos, exp, log, fabs, pow,
-               vec_conj_noop)
-
-DEFINE_ME_EVAL(i16, int16_t,
-               vec_add_i16, vec_sub_i16, vec_mul_i16, vec_div_i16, vec_pow_i16,
-               vec_add_scalar_i16, vec_mul_scalar_i16, vec_pow_scalar_i16,
-               vec_sqrt_i16, vec_sqrt_i16, vec_sqrt_i16, vec_negame_i16,
-               sqrt, sin, cos, exp, log, fabs, pow,
-               vec_conj_noop)
-
-DEFINE_ME_EVAL(i32, int32_t,
-               vec_add_i32, vec_sub_i32, vec_mul_i32, vec_div_i32, vec_pow_i32,
-               vec_add_scalar_i32, vec_mul_scalar_i32, vec_pow_scalar_i32,
-               vec_sqrt_i32, vec_sqrt_i32, vec_sqrt_i32, vec_negame_i32,
-               sqrt, sin, cos, exp, log, fabs, pow,
-               vec_conj_noop)
-
-DEFINE_ME_EVAL(i64, int64_t,
-               vec_add_i64, vec_sub_i64, vec_mul_i64, vec_div_i64, vec_pow_i64,
-               vec_add_scalar_i64, vec_mul_scalar_i64, vec_pow_scalar_i64,
-               vec_sqrt_i64, vec_sqrt_i64, vec_sqrt_i64, vec_negame_i64,
-               sqrt, sin, cos, exp, log, fabs, pow,
-               vec_conj_noop)
-
-DEFINE_ME_EVAL(u8, uint8_t,
-               vec_add_u8, vec_sub_u8, vec_mul_u8, vec_div_u8, vec_pow_u8,
-               vec_add_scalar_u8, vec_mul_scalar_u8, vec_pow_scalar_u8,
-               vec_sqrt_u8, vec_sqrt_u8, vec_sqrt_u8, vec_negame_u8,
-               sqrt, sin, cos, exp, log, fabs, pow,
-               vec_conj_noop)
-
-DEFINE_ME_EVAL(u16, uint16_t,
-               vec_add_u16, vec_sub_u16, vec_mul_u16, vec_div_u16, vec_pow_u16,
-               vec_add_scalar_u16, vec_mul_scalar_u16, vec_pow_scalar_u16,
-               vec_sqrt_u16, vec_sqrt_u16, vec_sqrt_u16, vec_negame_u16,
-               sqrt, sin, cos, exp, log, fabs, pow,
-               vec_conj_noop)
-
-DEFINE_ME_EVAL(u32, uint32_t,
-               vec_add_u32, vec_sub_u32, vec_mul_u32, vec_div_u32, vec_pow_u32,
-               vec_add_scalar_u32, vec_mul_scalar_u32, vec_pow_scalar_u32,
-               vec_sqrt_u32, vec_sqrt_u32, vec_sqrt_u32, vec_negame_u32,
-               sqrt, sin, cos, exp, log, fabs, pow,
-               vec_conj_noop)
-
-DEFINE_ME_EVAL(u64, uint64_t,
-               vec_add_u64, vec_sub_u64, vec_mul_u64, vec_div_u64, vec_pow_u64,
-               vec_add_scalar_u64, vec_mul_scalar_u64, vec_pow_scalar_u64,
-               vec_sqrt_u64, vec_sqrt_u64, vec_sqrt_u64, vec_negame_u64,
-               sqrt, sin, cos, exp, log, fabs, pow,
-               vec_conj_noop)
-
-/* Generate complex evaluators */
-DEFINE_ME_EVAL(c64, float _Complex,
-               vec_add_c64, vec_sub_c64, vec_mul_c64, vec_div_c64, vec_pow_c64,
-               vec_add_scalar_c64, vec_mul_scalar_c64, vec_pow_scalar_c64,
-               vec_sqrt_c64, vec_sqrt_c64, vec_sqrt_c64, vec_negame_c64,
-               me_csqrtf, me_csqrtf, me_csqrtf, me_cexpf, me_clogf, me_cabsf, me_cpowf,
-               vec_conj_c64)
-
-DEFINE_ME_EVAL(c128, double _Complex,
-               vec_add_c128, vec_sub_c128, vec_mul_c128, vec_div_c128, vec_pow_c128,
-               vec_add_scalar_c128, vec_mul_scalar_c128, vec_pow_scalar_c128,
-               vec_sqrt_c128, vec_sqrt_c128, vec_sqrt_c128, vec_negame_c128,
-               me_csqrt, me_csqrt, me_csqrt, me_cexp, me_clog, me_cabs, me_cpow,
-               vec_conj_c128)
-
-/* Public API - dispatches to correct type-specific evaluator */
-/* Structure to track promoted variables */
-typedef struct {
-    void* promoted_data; // Temporary buffer for promoted data
-    me_dtype original_type;
-    bool needs_free;
-} promoted_var_t;
-
-/* Helper to save original variable bindings */
-static void save_variable_bindings(const me_expr* node,
-                                   const void** original_bounds,
-                                   me_dtype* original_types,
-                                   int* save_idx) {
-    if (!node) return;
-    switch (TYPE_MASK(node->type)) {
-    case ME_VARIABLE:
-        original_bounds[*save_idx] = node->bound;
-        original_types[*save_idx] = node->dtype;
-        (*save_idx)++;
-        break;
-    case ME_FUNCTION0:
-    case ME_FUNCTION1:
-    case ME_FUNCTION2:
-    case ME_FUNCTION3:
-    case ME_FUNCTION4:
-    case ME_FUNCTION5:
-    case ME_FUNCTION6:
-    case ME_FUNCTION7:
-    case ME_CLOSURE0:
-    case ME_CLOSURE1:
-    case ME_CLOSURE2:
-    case ME_CLOSURE3:
-    case ME_CLOSURE4:
-    case ME_CLOSURE5:
-    case ME_CLOSURE6:
-    case ME_CLOSURE7:
-        {
-            const int arity = ARITY(node->type);
-            for (int i = 0; i < arity; i++) {
-                save_variable_bindings((const me_expr*)node->parameters[i],
-                                       original_bounds, original_types, save_idx);
-            }
-            break;
-        }
-    }
-}
-
-/* Recursively promote variables in expression tree */
-static void promote_variables_in_tree(me_expr* n, me_dtype target_type,
-                                      promoted_var_t* promotions, int* promo_count,
-                                      int nitems) {
-    if (!n) return;
-
-    switch (TYPE_MASK(n->type)) {
-    case ME_CONSTANT:
-        // Constants are promoted on-the-fly during evaluation
-        break;
-
-    case ME_VARIABLE:
-        if (n->dtype != target_type) {
-            // Need to promote this variable
-            void* promoted = malloc(nitems * dtype_size(target_type));
-            if (promoted) {
-                convert_func_t conv = get_convert_func(n->dtype, target_type);
-                if (conv) {
-                    conv(n->bound, promoted, nitems);
-
-                    // Track this promotion for later cleanup
-                    promotions[*promo_count].promoted_data = promoted;
-                    promotions[*promo_count].original_type = n->dtype;
-                    promotions[*promo_count].needs_free = true;
-                    (*promo_count)++;
-
-                    // Temporarily replace bound pointer
-                    n->bound = promoted;
-                    n->dtype = target_type;
-                }
-                else {
-                    free(promoted);
-                }
-            }
-        }
-        break;
-
-    case ME_FUNCTION0:
-    case ME_FUNCTION1:
-    case ME_FUNCTION2:
-    case ME_FUNCTION3:
-    case ME_FUNCTION4:
-    case ME_FUNCTION5:
-    case ME_FUNCTION6:
-    case ME_FUNCTION7:
-    case ME_CLOSURE0:
-    case ME_CLOSURE1:
-    case ME_CLOSURE2:
-    case ME_CLOSURE3:
-    case ME_CLOSURE4:
-    case ME_CLOSURE5:
-    case ME_CLOSURE6:
-    case ME_CLOSURE7:
-        {
-            const int arity = ARITY(n->type);
-            for (int i = 0; i < arity; i++) {
-                promote_variables_in_tree((me_expr*)n->parameters[i], target_type,
-                                          promotions, promo_count, nitems);
-            }
-            break;
-        }
-    }
-}
-
-/* Restore original variable bindings after promotion */
-static void restore_variables_in_tree(me_expr* n, const void** original_bounds,
-                                      const me_dtype* original_types, int* restore_idx) {
-    if (!n) return;
-
-    switch (TYPE_MASK(n->type)) {
-    case ME_VARIABLE:
-        if (original_bounds[*restore_idx] != NULL) {
-            n->bound = original_bounds[*restore_idx];
-            n->dtype = original_types[*restore_idx];
-            (*restore_idx)++;
-        }
-        break;
-
-    case ME_FUNCTION0:
-    case ME_FUNCTION1:
-    case ME_FUNCTION2:
-    case ME_FUNCTION3:
-    case ME_FUNCTION4:
-    case ME_FUNCTION5:
-    case ME_FUNCTION6:
-    case ME_FUNCTION7:
-    case ME_CLOSURE0:
-    case ME_CLOSURE1:
-    case ME_CLOSURE2:
-    case ME_CLOSURE3:
-    case ME_CLOSURE4:
-    case ME_CLOSURE5:
-    case ME_CLOSURE6:
-    case ME_CLOSURE7:
-        {
-            const int arity = ARITY(n->type);
-            for (int i = 0; i < arity; i++) {
-                restore_variables_in_tree((me_expr*)n->parameters[i], original_bounds, original_types, restore_idx);
-            }
-            break;
-        }
-    }
-}
-
-/* Check if all variables in tree match target type */
-static bool all_variables_match_type(const me_expr* n, me_dtype target_type) {
-    if (!n) return true;
-
-    switch (TYPE_MASK(n->type)) {
-    case ME_CONSTANT:
-        return true; // Constants are always OK
-
-    case ME_VARIABLE:
-        return n->dtype == target_type;
-
-    case ME_FUNCTION0:
-    case ME_FUNCTION1:
-    case ME_FUNCTION2:
-    case ME_FUNCTION3:
-    case ME_FUNCTION4:
-    case ME_FUNCTION5:
-    case ME_FUNCTION6:
-    case ME_FUNCTION7:
-    case ME_CLOSURE0:
-    case ME_CLOSURE1:
-    case ME_CLOSURE2:
-    case ME_CLOSURE3:
-    case ME_CLOSURE4:
-    case ME_CLOSURE5:
-    case ME_CLOSURE6:
-    case ME_CLOSURE7:
-        {
-            const int arity = ARITY(n->type);
-            for (int i = 0; i < arity; i++) {
-                if (!all_variables_match_type((const me_expr*)n->parameters[i], target_type)) {
-                    return false;
-                }
-            }
-            return true;
-        }
-    }
-
-    return true;
-}
-
-static void broadcast_reduction_output(void* output, me_dtype dtype, int output_nitems) {
-    if (!output || output_nitems <= 1) return;
-    switch (dtype) {
-    case ME_BOOL:
-        {
-            bool val = ((bool*)output)[0];
-            for (int i = 1; i < output_nitems; i++) {
-                ((bool*)output)[i] = val;
-            }
-            break;
-        }
-    case ME_INT8:
-        {
-            int8_t val = ((int8_t*)output)[0];
-            for (int i = 1; i < output_nitems; i++) {
-                ((int8_t*)output)[i] = val;
-            }
-            break;
-        }
-    case ME_INT16:
-        {
-            int16_t val = ((int16_t*)output)[0];
-            for (int i = 1; i < output_nitems; i++) {
-                ((int16_t*)output)[i] = val;
-            }
-            break;
-        }
-    case ME_INT32:
-        {
-            int32_t val = ((int32_t*)output)[0];
-            for (int i = 1; i < output_nitems; i++) {
-                ((int32_t*)output)[i] = val;
-            }
-            break;
-        }
-    case ME_INT64:
-        {
-            int64_t val = ((int64_t*)output)[0];
-            for (int i = 1; i < output_nitems; i++) {
-                ((int64_t*)output)[i] = val;
-            }
-            break;
-        }
-    case ME_UINT8:
-        {
-            uint8_t val = ((uint8_t*)output)[0];
-            for (int i = 1; i < output_nitems; i++) {
-                ((uint8_t*)output)[i] = val;
-            }
-            break;
-        }
-    case ME_UINT16:
-        {
-            uint16_t val = ((uint16_t*)output)[0];
-            for (int i = 1; i < output_nitems; i++) {
-                ((uint16_t*)output)[i] = val;
-            }
-            break;
-        }
-    case ME_UINT32:
-        {
-            uint32_t val = ((uint32_t*)output)[0];
-            for (int i = 1; i < output_nitems; i++) {
-                ((uint32_t*)output)[i] = val;
-            }
-            break;
-        }
-    case ME_UINT64:
-        {
-            uint64_t val = ((uint64_t*)output)[0];
-            for (int i = 1; i < output_nitems; i++) {
-                ((uint64_t*)output)[i] = val;
-            }
-            break;
-        }
-    case ME_FLOAT32:
-        {
-            float val = ((float*)output)[0];
-            for (int i = 1; i < output_nitems; i++) {
-                ((float*)output)[i] = val;
-            }
-            break;
-        }
-    case ME_FLOAT64:
-        {
-            double val = ((double*)output)[0];
-            for (int i = 1; i < output_nitems; i++) {
-                ((double*)output)[i] = val;
-            }
-            break;
-        }
-    case ME_COMPLEX64:
-        {
-            float _Complex val = ((float _Complex*)output)[0];
-            for (int i = 1; i < output_nitems; i++) {
-                ((float _Complex*)output)[i] = val;
-            }
-            break;
-        }
-    case ME_COMPLEX128:
-        {
-            double _Complex val = ((double _Complex*)output)[0];
-            for (int i = 1; i < output_nitems; i++) {
-                ((double _Complex*)output)[i] = val;
-            }
-            break;
-        }
-    default:
-        break;
-    }
-}
-
-static void eval_reduction(const me_expr* n, int output_nitems) {
-    if (!n || !n->output || !is_reduction_node(n)) return;
-    if (output_nitems <= 0) return;
-
-    me_expr* arg = (me_expr*)n->parameters[0];
-    if (!arg) return;
-
-    const int nitems = n->nitems;
-    me_dtype arg_type = arg->dtype;
-    if (arg->type != ME_CONSTANT && arg->type != ME_VARIABLE) {
-        arg_type = infer_output_type(arg);
-        if (nitems > 0) {
-            if (!arg->output) {
-                arg->output = malloc((size_t)nitems * dtype_size(arg_type));
-                if (!arg->output) return;
-            }
-            arg->nitems = nitems;
-            arg->dtype = arg_type;
-            private_eval(arg);
-        }
-    }
-    me_dtype result_type = reduction_output_dtype(arg_type, n->function);
-    me_dtype output_type = n->dtype;
-    bool is_prod = n->function == (void*)prod_reduce;
-    bool is_min = n->function == (void*)min_reduce;
-    bool is_max = n->function == (void*)max_reduce;
-    bool is_any = n->function == (void*)any_reduce;
-    bool is_all = n->function == (void*)all_reduce;
-
-    void* write_ptr = n->output;
-    void* temp_output = NULL;
-    if (output_type != result_type) {
-        temp_output = malloc((size_t)output_nitems * dtype_size(result_type));
-        if (!temp_output) return;
-        write_ptr = temp_output;
-    }
-
-    if (arg->type == ME_CONSTANT) {
-        double val = arg->value;
-        if (is_any || is_all) {
-            bool acc = is_all;
-            if (nitems == 0) {
-                acc = is_all;
-            }
-            else {
-                switch (arg_type) {
-                case ME_BOOL:
-                    acc = val != 0.0;
-                    break;
-                case ME_INT8:
-                case ME_INT16:
-                case ME_INT32:
-                case ME_INT64:
-                case ME_UINT8:
-                case ME_UINT16:
-                case ME_UINT32:
-                case ME_UINT64:
-                case ME_FLOAT32:
-                case ME_FLOAT64:
-                    acc = val != 0.0;
-                    break;
-                case ME_COMPLEX64:
-                case ME_COMPLEX128:
-                    acc = val != 0.0;
-                    break;
-                default:
-                    acc = false;
-                    break;
-                }
-            }
-            ((bool*)write_ptr)[0] = acc;
-        }
-        else if (is_min || is_max) {
-            switch (arg_type) {
-            case ME_BOOL:
-                {
-                    bool acc = is_min;
-                    if (nitems > 0) {
-                        acc = (bool)val;
-                    }
-                    ((bool*)write_ptr)[0] = acc;
-                    break;
-                }
-            case ME_INT8:
-                {
-                    int8_t acc = (int8_t)(is_min ? INT8_MAX : INT8_MIN);
-                    if (nitems > 0) acc = (int8_t)val;
-                    ((int8_t*)write_ptr)[0] = acc;
-                    break;
-                }
-            case ME_INT16:
-                {
-                    int16_t acc = (int16_t)(is_min ? INT16_MAX : INT16_MIN);
-                    if (nitems > 0) acc = (int16_t)val;
-                    ((int16_t*)write_ptr)[0] = acc;
-                    break;
-                }
-            case ME_INT32:
-                {
-                    int32_t acc = (int32_t)(is_min ? INT32_MAX : INT32_MIN);
-                    if (nitems > 0) acc = (int32_t)val;
-                    ((int32_t*)write_ptr)[0] = acc;
-                    break;
-                }
-            case ME_INT64:
-                {
-                    int64_t acc = is_min ? INT64_MAX : INT64_MIN;
-                    if (nitems > 0) acc = (int64_t)val;
-                    ((int64_t*)write_ptr)[0] = acc;
-                    break;
-                }
-            case ME_UINT8:
-                {
-                    uint8_t acc = is_min ? UINT8_MAX : 0;
-                    if (nitems > 0) acc = (uint8_t)val;
-                    ((uint8_t*)write_ptr)[0] = acc;
-                    break;
-                }
-            case ME_UINT16:
-                {
-                    uint16_t acc = is_min ? UINT16_MAX : 0;
-                    if (nitems > 0) acc = (uint16_t)val;
-                    ((uint16_t*)write_ptr)[0] = acc;
-                    break;
-                }
-            case ME_UINT32:
-                {
-                    uint32_t acc = is_min ? UINT32_MAX : 0;
-                    if (nitems > 0) acc = (uint32_t)val;
-                    ((uint32_t*)write_ptr)[0] = acc;
-                    break;
-                }
-            case ME_UINT64:
-                {
-                    uint64_t acc = is_min ? UINT64_MAX : 0;
-                    if (nitems > 0) acc = (uint64_t)val;
-                    ((uint64_t*)write_ptr)[0] = acc;
-                    break;
-                }
-            case ME_FLOAT32:
-                {
-                    float acc = is_min ? INFINITY : -INFINITY;
-                    if (nitems > 0) acc = (float)val;
-                    ((float*)write_ptr)[0] = acc;
-                    break;
-                }
-            case ME_FLOAT64:
-                {
-                    double acc = is_min ? INFINITY : -INFINITY;
-                    if (nitems > 0) acc = val;
-                    ((double*)write_ptr)[0] = acc;
-                    break;
-                }
-            case ME_COMPLEX64:
-                {
-                    ((float _Complex*)write_ptr)[0] = (float _Complex)0.0f;
-                    break;
-                }
-            case ME_COMPLEX128:
-                {
-                    ((double _Complex*)write_ptr)[0] = (double _Complex)0.0;
-                    break;
-                }
-            default:
-                break;
-            }
-        }
-        else {
-            switch (arg_type) {
-            case ME_BOOL:
-            case ME_INT8:
-            case ME_INT16:
-            case ME_INT32:
-            case ME_INT64:
-                {
-                    int64_t acc = is_prod ? 1 : 0;
-                    if (nitems == 0) {
-                        acc = is_prod ? 1 : 0;
-                    }
-                    else if (is_prod) {
-                        int64_t v = (int64_t)val;
-                        for (int i = 0; i < nitems; i++) acc *= v;
-                    }
-                    else {
-                        acc = (int64_t)val * (int64_t)nitems;
-                    }
-                    ((int64_t*)write_ptr)[0] = acc;
-                    break;
-                }
-            case ME_UINT8:
-            case ME_UINT16:
-            case ME_UINT32:
-            case ME_UINT64:
-                {
-                    uint64_t acc = is_prod ? 1 : 0;
-                    if (nitems == 0) {
-                        acc = is_prod ? 1 : 0;
-                    }
-                    else if (is_prod) {
-                        uint64_t v = (uint64_t)val;
-                        for (int i = 0; i < nitems; i++) acc *= v;
-                    }
-                    else {
-                        acc = (uint64_t)val * (uint64_t)nitems;
-                    }
-                    ((uint64_t*)write_ptr)[0] = acc;
-                    break;
-                }
-            case ME_FLOAT32:
-                {
-                    float acc = is_prod ? 1.0f : 0.0f;
-                    if (nitems == 0) {
-                        acc = is_prod ? 1.0f : 0.0f;
-                    }
-                    else if (is_prod) {
-                        float v = (float)val;
-                        for (int i = 0; i < nitems; i++) acc *= v;
-                    }
-                    else {
-                        acc = (float)val * (float)nitems;
-                    }
-                    ((float*)write_ptr)[0] = acc;
-                    break;
-                }
-            case ME_FLOAT64:
-                {
-                    double acc = is_prod ? 1.0 : 0.0;
-                    if (nitems == 0) {
-                        acc = is_prod ? 1.0 : 0.0;
-                    }
-                    else if (is_prod) {
-                        for (int i = 0; i < nitems; i++) acc *= val;
-                    }
-                    else {
-                        acc = val * (double)nitems;
-                    }
-                    ((double*)write_ptr)[0] = acc;
-                    break;
-                }
-            case ME_COMPLEX64:
-                {
-                    float _Complex acc = is_prod ? (float _Complex)1.0f : (float _Complex)0.0f;
-                    float _Complex v = (float _Complex)val;
-                    if (nitems == 0) {
-                        acc = is_prod ? (float _Complex)1.0f : (float _Complex)0.0f;
-                    }
-                    else if (is_prod) {
-                        for (int i = 0; i < nitems; i++) acc *= v;
-                    }
-                    else {
-                        acc = v * (float)nitems;
-                    }
-                    ((float _Complex*)write_ptr)[0] = acc;
-                    break;
-                }
-            case ME_COMPLEX128:
-                {
-                    double _Complex acc = is_prod ? (double _Complex)1.0 : (double _Complex)0.0;
-                    double _Complex v = (double _Complex)val;
-                    if (nitems == 0) {
-                        acc = is_prod ? (double _Complex)1.0 : (double _Complex)0.0;
-                    }
-                    else if (is_prod) {
-                        for (int i = 0; i < nitems; i++) acc *= v;
-                    }
-                    else {
-                        acc = v * (double)nitems;
-                    }
-                    ((double _Complex*)write_ptr)[0] = acc;
-                    break;
-                }
-            default:
-                break;
-            }
-        }
-    }
-    else {
-        const void* saved_bound = arg->bound;
-        int saved_type = arg->type;
-        if (arg->type != ME_VARIABLE) {
-            ((me_expr*)arg)->bound = arg->output;
-            ((me_expr*)arg)->type = ME_VARIABLE;
-        }
-        switch (arg_type) {
-        case ME_BOOL:
-            {
-                const bool* data = (const bool*)arg->bound;
-                if (is_any || is_all) {
-                    bool acc = is_all;
-                    if (nitems > 0) {
-                        acc = is_all;
-                        for (int i = 0; i < nitems; i++) {
-                            if (is_any) {
-                                if (data[i]) { acc = true; break; }
-                            }
-                            else {
-                                if (!data[i]) { acc = false; break; }
-                            }
-                        }
-                    }
-                    ((bool*)write_ptr)[0] = acc;
-                }
-                else if (is_min || is_max) {
-                    bool acc = is_min;
-                    if (nitems > 0) {
-                        acc = data[0];
-                        for (int i = 1; i < nitems; i++) {
-                            acc = is_min ? (acc && data[i]) : (acc || data[i]);
-                        }
-                    }
-                    ((bool*)write_ptr)[0] = acc;
-                }
-                else {
-                    int64_t acc = is_prod ? 1 : 0;
-                    if (nitems == 0) {
-                        acc = is_prod ? 1 : 0;
-                    }
-                    else if (is_prod) {
-                        for (int i = 0; i < nitems; i++) acc *= data[i] ? 1 : 0;
-                    }
-                    else {
-                        for (int i = 0; i < nitems; i++) acc += data[i] ? 1 : 0;
-                    }
-                    ((int64_t*)write_ptr)[0] = acc;
-                }
-                break;
-            }
-        case ME_INT8:
-            {
-                const int8_t* data = (const int8_t*)arg->bound;
-                if (is_any || is_all) {
-                    bool acc = is_all;
-                    if (nitems > 0) {
-                        acc = is_all;
-                        for (int i = 0; i < nitems; i++) {
-                            if (is_any) {
-                                if (data[i] != 0) { acc = true; break; }
-                            }
-                            else {
-                                if (data[i] == 0) { acc = false; break; }
-                            }
-                        }
-                    }
-                    ((bool*)write_ptr)[0] = acc;
-                }
-                else if (is_min || is_max) {
-                    int8_t acc = is_min ? reduce_min_int8(data, nitems) :
-                        reduce_max_int8(data, nitems);
-                    ((int8_t*)write_ptr)[0] = acc;
-                }
-                else {
-                    int64_t acc = is_prod ? 1 : 0;
-                    if (nitems == 0) {
-                        acc = is_prod ? 1 : 0;
-                    }
-                    else if (is_prod) {
-                        for (int i = 0; i < nitems; i++) acc *= data[i];
-                    }
-                    else {
-                        for (int i = 0; i < nitems; i++) acc += data[i];
-                    }
-                    ((int64_t*)write_ptr)[0] = acc;
-                }
-                break;
-            }
-        case ME_INT16:
-            {
-                const int16_t* data = (const int16_t*)arg->bound;
-                if (is_any || is_all) {
-                    bool acc = is_all;
-                    if (nitems > 0) {
-                        acc = is_all;
-                        for (int i = 0; i < nitems; i++) {
-                            if (is_any) {
-                                if (data[i] != 0) { acc = true; break; }
-                            }
-                            else {
-                                if (data[i] == 0) { acc = false; break; }
-                            }
-                        }
-                    }
-                    ((bool*)write_ptr)[0] = acc;
-                }
-                else if (is_min || is_max) {
-                    int16_t acc = is_min ? reduce_min_int16(data, nitems) :
-                        reduce_max_int16(data, nitems);
-                    ((int16_t*)write_ptr)[0] = acc;
-                }
-                else {
-                    int64_t acc = is_prod ? 1 : 0;
-                    if (nitems == 0) {
-                        acc = is_prod ? 1 : 0;
-                    }
-                    else if (is_prod) {
-                        for (int i = 0; i < nitems; i++) acc *= data[i];
-                    }
-                    else {
-                        for (int i = 0; i < nitems; i++) acc += data[i];
-                    }
-                    ((int64_t*)write_ptr)[0] = acc;
-                }
-                break;
-            }
-        case ME_INT32:
-            {
-                const int32_t* data = (const int32_t*)arg->bound;
-                if (is_any || is_all) {
-                    bool acc = is_all;
-                    if (nitems > 0) {
-                        acc = is_all;
-                        for (int i = 0; i < nitems; i++) {
-                            if (is_any) {
-                                if (data[i] != 0) { acc = true; break; }
-                            }
-                            else {
-                                if (data[i] == 0) { acc = false; break; }
-                            }
-                        }
-                    }
-                    ((bool*)write_ptr)[0] = acc;
-                }
-                else if (is_min || is_max) {
-                    int32_t acc = is_min ? reduce_min_int32(data, nitems) :
-                        reduce_max_int32(data, nitems);
-                    ((int32_t*)write_ptr)[0] = acc;
-                }
-                else {
-                    int64_t acc = is_prod ? 1 : 0;
-                    if (nitems == 0) {
-                        acc = is_prod ? 1 : 0;
-                    }
-                    else if (is_prod) {
-                        for (int i = 0; i < nitems; i++) acc *= data[i];
-                    }
-                    else {
-                        acc = reduce_sum_int32(data, nitems);
-                    }
-                    ((int64_t*)write_ptr)[0] = acc;
-                }
-                break;
-            }
-        case ME_INT64:
-            {
-                const int64_t* data = (const int64_t*)arg->bound;
-                if (is_any || is_all) {
-                    bool acc = is_all;
-                    if (nitems > 0) {
-                        acc = is_all;
-                        for (int i = 0; i < nitems; i++) {
-                            if (is_any) {
-                                if (data[i] != 0) { acc = true; break; }
-                            }
-                            else {
-                                if (data[i] == 0) { acc = false; break; }
-                            }
-                        }
-                    }
-                    ((bool*)write_ptr)[0] = acc;
-                }
-                else if (is_min || is_max) {
-                    int64_t acc = is_min ? reduce_min_int64(data, nitems) :
-                        reduce_max_int64(data, nitems);
-                    ((int64_t*)write_ptr)[0] = acc;
-                }
-                else {
-                    int64_t acc = is_prod ? 1 : 0;
-                    if (nitems == 0) {
-                        acc = is_prod ? 1 : 0;
-                    }
-                    else if (is_prod) {
-                        for (int i = 0; i < nitems; i++) acc *= data[i];
-                    }
-                    else {
-                        for (int i = 0; i < nitems; i++) acc += data[i];
-                    }
-                    ((int64_t*)write_ptr)[0] = acc;
-                }
-                break;
-            }
-        case ME_UINT8:
-            {
-                const uint8_t* data = (const uint8_t*)arg->bound;
-                if (is_any || is_all) {
-                    bool acc = is_all;
-                    if (nitems > 0) {
-                        acc = is_all;
-                        for (int i = 0; i < nitems; i++) {
-                            if (is_any) {
-                                if (data[i] != 0) { acc = true; break; }
-                            }
-                            else {
-                                if (data[i] == 0) { acc = false; break; }
-                            }
-                        }
-                    }
-                    ((bool*)write_ptr)[0] = acc;
-                }
-                else if (is_min || is_max) {
-                    uint8_t acc = is_min ? reduce_min_uint8(data, nitems) :
-                        reduce_max_uint8(data, nitems);
-                    ((uint8_t*)write_ptr)[0] = acc;
-                }
-                else {
-                    uint64_t acc = is_prod ? 1 : 0;
-                    if (nitems == 0) {
-                        acc = is_prod ? 1 : 0;
-                    }
-                    else if (is_prod) {
-                        for (int i = 0; i < nitems; i++) acc *= data[i];
-                    }
-                    else {
-                        for (int i = 0; i < nitems; i++) acc += data[i];
-                    }
-                    ((uint64_t*)write_ptr)[0] = acc;
-                }
-                break;
-            }
-        case ME_UINT16:
-            {
-                const uint16_t* data = (const uint16_t*)arg->bound;
-                if (is_any || is_all) {
-                    bool acc = is_all;
-                    if (nitems > 0) {
-                        acc = is_all;
-                        for (int i = 0; i < nitems; i++) {
-                            if (is_any) {
-                                if (data[i] != 0) { acc = true; break; }
-                            }
-                            else {
-                                if (data[i] == 0) { acc = false; break; }
-                            }
-                        }
-                    }
-                    ((bool*)write_ptr)[0] = acc;
-                }
-                else if (is_min || is_max) {
-                    uint16_t acc = is_min ? reduce_min_uint16(data, nitems) :
-                        reduce_max_uint16(data, nitems);
-                    ((uint16_t*)write_ptr)[0] = acc;
-                }
-                else {
-                    uint64_t acc = is_prod ? 1 : 0;
-                    if (nitems == 0) {
-                        acc = is_prod ? 1 : 0;
-                    }
-                    else if (is_prod) {
-                        for (int i = 0; i < nitems; i++) acc *= data[i];
-                    }
-                    else {
-                        for (int i = 0; i < nitems; i++) acc += data[i];
-                    }
-                    ((uint64_t*)write_ptr)[0] = acc;
-                }
-                break;
-            }
-        case ME_UINT32:
-            {
-                const uint32_t* data = (const uint32_t*)arg->bound;
-                if (is_any || is_all) {
-                    bool acc = is_all;
-                    if (nitems > 0) {
-                        acc = is_all;
-                        for (int i = 0; i < nitems; i++) {
-                            if (is_any) {
-                                if (data[i] != 0) { acc = true; break; }
-                            }
-                            else {
-                                if (data[i] == 0) { acc = false; break; }
-                            }
-                        }
-                    }
-                    ((bool*)write_ptr)[0] = acc;
-                }
-                else if (is_min || is_max) {
-                    uint32_t acc = is_min ? reduce_min_uint32(data, nitems) :
-                        reduce_max_uint32(data, nitems);
-                    ((uint32_t*)write_ptr)[0] = acc;
-                }
-                else {
-                    uint64_t acc = is_prod ? 1 : 0;
-                    if (nitems == 0) {
-                        acc = is_prod ? 1 : 0;
-                    }
-                    else if (is_prod) {
-                        for (int i = 0; i < nitems; i++) acc *= data[i];
-                    }
-                    else {
-                        acc = reduce_sum_uint32(data, nitems);
-                    }
-                    ((uint64_t*)write_ptr)[0] = acc;
-                }
-                break;
-            }
-        case ME_UINT64:
-            {
-                const uint64_t* data = (const uint64_t*)arg->bound;
-                if (is_any || is_all) {
-                    bool acc = is_all;
-                    if (nitems > 0) {
-                        acc = is_all;
-                        for (int i = 0; i < nitems; i++) {
-                            if (is_any) {
-                                if (data[i] != 0) { acc = true; break; }
-                            }
-                            else {
-                                if (data[i] == 0) { acc = false; break; }
-                            }
-                        }
-                    }
-                    ((bool*)write_ptr)[0] = acc;
-                }
-                else if (is_min || is_max) {
-                    uint64_t acc = is_min ? reduce_min_uint64(data, nitems) :
-                        reduce_max_uint64(data, nitems);
-                    ((uint64_t*)write_ptr)[0] = acc;
-                }
-                else {
-                    uint64_t acc = is_prod ? 1 : 0;
-                    if (nitems == 0) {
-                        acc = is_prod ? 1 : 0;
-                    }
-                    else if (is_prod) {
-                        for (int i = 0; i < nitems; i++) acc *= data[i];
-                    }
-                    else {
-                        for (int i = 0; i < nitems; i++) acc += data[i];
-                    }
-                    ((uint64_t*)write_ptr)[0] = acc;
-                }
-                break;
-            }
-        case ME_FLOAT32:
-            {
-                const float* data = (const float*)arg->bound;
-                if (is_any || is_all) {
-                    bool acc = is_all;
-                    if (nitems > 0) {
-                        acc = is_all;
-                        for (int i = 0; i < nitems; i++) {
-                            if (is_any) {
-                                if (data[i] != 0.0f) { acc = true; break; }
-                            }
-                            else {
-                                if (data[i] == 0.0f) { acc = false; break; }
-                            }
-                        }
-                    }
-                    ((bool*)write_ptr)[0] = acc;
-                }
-                else {
-                    if (nitems == 0) {
-                        float acc = 0.0f;
-                        if (is_min) acc = INFINITY;
-                        else if (is_max) acc = -INFINITY;
-                        else acc = is_prod ? 1.0f : 0.0f;
-                        ((float*)write_ptr)[0] = acc;
-                    }
-                    else if (is_min) {
-                        float acc = reduce_min_float32_nan_safe(data, nitems);
-                        ((float*)write_ptr)[0] = acc;
-                    }
-                    else if (is_max) {
-                        float acc = reduce_max_float32_nan_safe(data, nitems);
-                        ((float*)write_ptr)[0] = acc;
-                    }
-                    else if (is_prod) {
-                        /* Accumulate float32 sum/prod in float64 for better precision. */
-                        double acc = reduce_prod_float32_nan_safe(data, nitems);
-                        ((float*)write_ptr)[0] = (float)acc;
-                    }
-                    else {
-                        double acc = reduce_sum_float32_nan_safe(data, nitems);
-                        ((float*)write_ptr)[0] = (float)acc;
-                    }
-                }
-                break;
-            }
-        case ME_FLOAT64:
-            {
-                const double* data = (const double*)arg->bound;
-                if (is_any || is_all) {
-                    bool acc = is_all;
-                    if (nitems > 0) {
-                        acc = is_all;
-                        for (int i = 0; i < nitems; i++) {
-                            if (is_any) {
-                                if (data[i] != 0.0) { acc = true; break; }
-                            }
-                            else {
-                                if (data[i] == 0.0) { acc = false; break; }
-                            }
-                        }
-                    }
-                    ((bool*)write_ptr)[0] = acc;
-                }
-                else {
-                    double acc = 0.0;
-                    if (nitems == 0) {
-                        if (is_min) acc = INFINITY;
-                        else if (is_max) acc = -INFINITY;
-                        else acc = is_prod ? 1.0 : 0.0;
-                    }
-                    else if (is_min) {
-                        acc = reduce_min_float64_nan_safe(data, nitems);
-                    }
-                    else if (is_max) {
-                        acc = reduce_max_float64_nan_safe(data, nitems);
-                    }
-                    else if (is_prod) {
-                        acc = reduce_prod_float64_nan_safe(data, nitems);
-                    }
-                    else {
-                        acc = reduce_sum_float64_nan_safe(data, nitems);
-                    }
-                    ((double*)write_ptr)[0] = acc;
-                }
-                break;
-            }
-        case ME_COMPLEX64:
-            {
-                const float _Complex* data = (const float _Complex*)arg->bound;
-                if (is_any || is_all) {
-                    bool acc = is_all;
-                    if (nitems > 0) {
-                        acc = is_all;
-                        for (int i = 0; i < nitems; i++) {
-                            bool nonzero = IS_NONZERO_c64(data[i]);
-                            if (is_any) {
-                                if (nonzero) { acc = true; break; }
-                            }
-                            else {
-                                if (!nonzero) { acc = false; break; }
-                            }
-                        }
-                    }
-                    ((bool*)write_ptr)[0] = acc;
-                    break;
-                }
-                if (is_min || is_max) {
-                    ((float _Complex*)write_ptr)[0] = (float _Complex)0.0f;
-                    break;
-                }
-                float _Complex acc = is_prod ? (float _Complex)1.0f : (float _Complex)0.0f;
-                if (nitems == 0) {
-                    acc = is_prod ? (float _Complex)1.0f : (float _Complex)0.0f;
-                }
-                else if (is_prod) {
-                    for (int i = 0; i < nitems; i++) acc *= data[i];
-                }
-                else {
-                    for (int i = 0; i < nitems; i++) acc += data[i];
-                }
-                ((float _Complex*)write_ptr)[0] = acc;
-                break;
-            }
-        case ME_COMPLEX128:
-            {
-                const double _Complex* data = (const double _Complex*)arg->bound;
-                if (is_any || is_all) {
-                    bool acc = is_all;
-                    if (nitems > 0) {
-                        acc = is_all;
-                        for (int i = 0; i < nitems; i++) {
-                            bool nonzero = IS_NONZERO_c128(data[i]);
-                            if (is_any) {
-                                if (nonzero) { acc = true; break; }
-                            }
-                            else {
-                                if (!nonzero) { acc = false; break; }
-                            }
-                        }
-                    }
-                    ((bool*)write_ptr)[0] = acc;
-                    break;
-                }
-                if (is_min || is_max) {
-                    ((double _Complex*)write_ptr)[0] = (double _Complex)0.0;
-                    break;
-                }
-                double _Complex acc = is_prod ? (double _Complex)1.0 : (double _Complex)0.0;
-                if (nitems == 0) {
-                    acc = is_prod ? (double _Complex)1.0 : (double _Complex)0.0;
-                }
-                else if (is_prod) {
-                    for (int i = 0; i < nitems; i++) acc *= data[i];
-                }
-                else {
-                    for (int i = 0; i < nitems; i++) acc += data[i];
-                }
-                ((double _Complex*)write_ptr)[0] = acc;
-                break;
-            }
-        default:
-            break;
-        }
-        if (saved_type != ME_VARIABLE) {
-            ((me_expr*)arg)->bound = saved_bound;
-            ((me_expr*)arg)->type = saved_type;
-        }
-    }
-
-    {
-        me_dtype write_type = temp_output ? result_type : output_type;
-        broadcast_reduction_output(write_ptr, write_type, output_nitems);
-    }
-
-    if (temp_output) {
-        convert_func_t conv = get_convert_func(result_type, output_type);
-        if (conv) {
-            conv(temp_output, n->output, output_nitems);
-        }
-        free(temp_output);
-    }
-}
-
-static void private_eval(const me_expr* n) {
-    if (!n) return;
-
-    if (is_reduction_node(n)) {
-        eval_reduction(n, 1);
-        return;
-    }
-
-    // Special case: imag() and real() functions return real from complex input
-    if (IS_FUNCTION(n->type) && ARITY(n->type) == 1) {
-        if (n->function == (void*)imag_wrapper || n->function == (void*)real_wrapper) {
-            me_expr* arg = (me_expr*)n->parameters[0];
-            me_dtype arg_type = infer_result_type(arg);
-
-            if (arg_type == ME_COMPLEX64) {
-                // Evaluate argument as complex64
-                if (!arg->output) {
-                    arg->output = malloc(n->nitems * sizeof(float _Complex));
-                    arg->nitems = n->nitems;
-                    ((me_expr*)arg)->dtype = ME_COMPLEX64;
-                }
-                me_eval_c64(arg);
-
-                // Extract real/imaginary part to float32 output
-                const float _Complex* cdata = (const float _Complex*)arg->output;
-                float* output = (float*)n->output;
-                if (n->function == (void*)imag_wrapper) {
-                    for (int i = 0; i < n->nitems; i++) {
-#if defined(_MSC_VER) && defined(__clang__)
-                        output[i] = __builtin_cimagf(cdata[i]);
-#else
-                        output[i] = cimagf(cdata[i]);
-#endif
-                    }
-                }
-                else { // real_wrapper
-                    for (int i = 0; i < n->nitems; i++) {
-#if defined(_MSC_VER) && defined(__clang__)
-                        output[i] = __builtin_crealf(cdata[i]);
-#else
-                        output[i] = crealf(cdata[i]);
-#endif
-                    }
-                }
-                return;
-            }
-            else if (arg_type == ME_COMPLEX128) {
-                // Evaluate argument as complex128
-                if (!arg->output) {
-                    arg->output = malloc(n->nitems * sizeof(double _Complex));
-                    arg->nitems = n->nitems;
-                    ((me_expr*)arg)->dtype = ME_COMPLEX128;
-                }
-                me_eval_c128(arg);
-
-                // Extract real/imaginary part to float64 output
-                const double _Complex* cdata = (const double _Complex*)arg->output;
-                double* output = (double*)n->output;
-                if (n->function == (void*)imag_wrapper) {
-                    for (int i = 0; i < n->nitems; i++) {
-#if defined(_MSC_VER) && defined(__clang__)
-                        output[i] = __builtin_cimag(cdata[i]);
-#else
-                        output[i] = cimag(cdata[i]);
-#endif
-                    }
-                }
-                else { // real_wrapper
-                    for (int i = 0; i < n->nitems; i++) {
-#if defined(_MSC_VER) && defined(__clang__)
-                        output[i] = __builtin_creal(cdata[i]);
-#else
-                        output[i] = creal(cdata[i]);
-#endif
-                    }
-                }
-                return;
-            }
-            // If not complex, fall through to normal evaluation
-        }
-    }
-
-    // Infer the result type from the expression tree
-    me_dtype result_type = infer_result_type(n);
-
-    // If all variables already match result type, use fast path
-    bool all_match = all_variables_match_type(n, result_type);
-    if (result_type == n->dtype && all_match) {
-        // Fast path: no promotion needed
-        if (n->dtype == ME_AUTO) {
-            fprintf(stderr, "FATAL: ME_AUTO dtype in evaluation. This is a bug.\n");
-#ifdef NDEBUG
-            abort(); // Release build: terminate immediately
-#else
-            assert(0 && "ME_AUTO should be resolved during compilation"); // Debug: trigger debugger
-#endif
-        }
-        switch (n->dtype) {
-        case ME_BOOL: me_eval_i8(n);
-            break;
-        case ME_INT8: me_eval_i8(n);
-            break;
-        case ME_INT16: me_eval_i16(n);
-            break;
-        case ME_INT32: me_eval_i32(n);
-            break;
-        case ME_INT64: me_eval_i64(n);
-            break;
-        case ME_UINT8: me_eval_u8(n);
-            break;
-        case ME_UINT16: me_eval_u16(n);
-            break;
-        case ME_UINT32: me_eval_u32(n);
-            break;
-        case ME_UINT64: me_eval_u64(n);
-            break;
-        case ME_FLOAT32: me_eval_f32(n);
-            break;
-        case ME_FLOAT64: me_eval_f64(n);
-            break;
-        case ME_COMPLEX64: me_eval_c64(n);
-            break;
-        case ME_COMPLEX128: me_eval_c128(n);
-            break;
-        default:
-            fprintf(stderr, "FATAL: Invalid dtype %d in evaluation.\n", n->dtype);
-#ifdef NDEBUG
-            abort(); // Release build: terminate immediately
-#else
-            assert(0 && "Invalid dtype"); // Debug: trigger debugger
-#endif
-        }
-        return;
-    }
-
-    // Slow path: need to promote variables
-    // Allocate tracking structures (max ME_MAX_VARS variables)
-    promoted_var_t promotions[ME_MAX_VARS];
-    int promo_count = 0;
-
-    // Save original variable bindings
-    const void* original_bounds[ME_MAX_VARS];
-    me_dtype original_types[ME_MAX_VARS];
-    int save_idx = 0;
-
-    save_variable_bindings(n, original_bounds, original_types, &save_idx);
-
-    // Promote variables
-    promote_variables_in_tree((me_expr*)n, result_type, promotions, &promo_count, n->nitems);
-
-    // Check if we need output type conversion (e.g., computation in float64, output in bool)
-    me_dtype saved_dtype = n->dtype;
-    void* original_output = n->output;
-    void* temp_output = NULL;
-
-    if (saved_dtype != result_type) {
-        // Allocate temp buffer for computation
-        temp_output = malloc(n->nitems * dtype_size(result_type));
-        if (temp_output) {
-            ((me_expr*)n)->output = temp_output;
-        }
-    }
-
-    // Update expression type for evaluation
-    ((me_expr*)n)->dtype = result_type;
-
-    // Evaluate with promoted types
-    if (result_type == ME_AUTO) {
-        fprintf(stderr, "FATAL: ME_AUTO result type in evaluation. This is a bug.\n");
-#ifdef NDEBUG
-        abort(); // Release build: terminate immediately
-#else
-        assert(0 && "ME_AUTO should be resolved during compilation"); // Debug: trigger debugger
-#endif
-    }
-    switch (result_type) {
-    case ME_BOOL: me_eval_i8(n);
-        break;
-    case ME_INT8: me_eval_i8(n);
-        break;
-    case ME_INT16: me_eval_i16(n);
-        break;
-    case ME_INT32: me_eval_i32(n);
-        break;
-    case ME_INT64: me_eval_i64(n);
-        break;
-    case ME_UINT8: me_eval_u8(n);
-        break;
-    case ME_UINT16: me_eval_u16(n);
-        break;
-    case ME_UINT32: me_eval_u32(n);
-        break;
-    case ME_UINT64: me_eval_u64(n);
-        break;
-    case ME_FLOAT32: me_eval_f32(n);
-        break;
-    case ME_FLOAT64: me_eval_f64(n);
-        break;
-    case ME_COMPLEX64: me_eval_c64(n);
-        break;
-    case ME_COMPLEX128: me_eval_c128(n);
-        break;
-    default:
-        fprintf(stderr, "FATAL: Invalid result type %d in evaluation.\n", result_type);
-#ifdef NDEBUG
-        abort(); // Release build: terminate immediately
-#else
-        assert(0 && "Invalid dtype"); // Debug: trigger debugger
-#endif
-    }
-
-    // If we used a temp buffer, convert to final output type
-    if (temp_output) {
-        convert_func_t conv = get_convert_func(result_type, saved_dtype);
-        if (conv) {
-            conv(temp_output, original_output, n->nitems);
-        }
-        // Restore original output pointer
-        ((me_expr*)n)->output = original_output;
-        free(temp_output);
-    }
-
-    // Restore original variable bindings
-    int restore_idx = 0;
-    restore_variables_in_tree((me_expr*)n, original_bounds, original_types, &restore_idx);
-
-    // Restore expression type
-    ((me_expr*)n)->dtype = saved_dtype;
-
-    // Free promoted buffers
-    for (int i = 0; i < promo_count; i++) {
-        if (promotions[i].needs_free) {
-            free(promotions[i].promoted_data);
-        }
-    }
-}
-
-/* Helper to update variable bindings and nitems in tree */
-static void save_nitems_in_tree(const me_expr* node, int* nitems_array, int* idx) {
-    if (!node) return;
-    nitems_array[(*idx)++] = node->nitems;
-
-    switch (TYPE_MASK(node->type)) {
-    case ME_FUNCTION0:
-    case ME_FUNCTION1:
-    case ME_FUNCTION2:
-    case ME_FUNCTION3:
-    case ME_FUNCTION4:
-    case ME_FUNCTION5:
-    case ME_FUNCTION6:
-    case ME_FUNCTION7:
-    case ME_CLOSURE0:
-    case ME_CLOSURE1:
-    case ME_CLOSURE2:
-    case ME_CLOSURE3:
-    case ME_CLOSURE4:
-    case ME_CLOSURE5:
-    case ME_CLOSURE6:
-    case ME_CLOSURE7:
-        {
-            const int arity = ARITY(node->type);
-            for (int i = 0; i < arity; i++) {
-                save_nitems_in_tree((const me_expr*)node->parameters[i], nitems_array, idx);
-            }
-            break;
-        }
-    default:
-        break;
-    }
-}
-
-static void restore_nitems_in_tree(me_expr* node, const int* nitems_array, int* idx) {
-    if (!node) return;
-    node->nitems = nitems_array[(*idx)++];
-
-    switch (TYPE_MASK(node->type)) {
-    case ME_FUNCTION0:
-    case ME_FUNCTION1:
-    case ME_FUNCTION2:
-    case ME_FUNCTION3:
-    case ME_FUNCTION4:
-    case ME_FUNCTION5:
-    case ME_FUNCTION6:
-    case ME_FUNCTION7:
-    case ME_CLOSURE0:
-    case ME_CLOSURE1:
-    case ME_CLOSURE2:
-    case ME_CLOSURE3:
-    case ME_CLOSURE4:
-    case ME_CLOSURE5:
-    case ME_CLOSURE6:
-    case ME_CLOSURE7:
-        {
-            const int arity = ARITY(node->type);
-            for (int i = 0; i < arity; i++) {
-                restore_nitems_in_tree((me_expr*)node->parameters[i], nitems_array, idx);
-            }
-            break;
-        }
-    default:
-        break;
-    }
-}
-
-/* Helper to free intermediate output buffers */
-static void free_intermediate_buffers(me_expr* node) {
-    if (!node) return;
-
-    switch (TYPE_MASK(node->type)) {
-    case ME_CONSTANT:
-    case ME_VARIABLE:
-        // These don't have intermediate buffers
-        break;
-
-    case ME_FUNCTION0:
-    case ME_FUNCTION1:
-    case ME_FUNCTION2:
-    case ME_FUNCTION3:
-    case ME_FUNCTION4:
-    case ME_FUNCTION5:
-    case ME_FUNCTION6:
-    case ME_FUNCTION7:
-    case ME_CLOSURE0:
-    case ME_CLOSURE1:
-    case ME_CLOSURE2:
-    case ME_CLOSURE3:
-    case ME_CLOSURE4:
-    case ME_CLOSURE5:
-    case ME_CLOSURE6:
-    case ME_CLOSURE7:
-        {
-            const int arity = ARITY(node->type);
-            for (int i = 0; i < arity; i++) {
-                me_expr* param = (me_expr*)node->parameters[i];
-                free_intermediate_buffers(param);
-
-                // Free intermediate buffer (but not for root or variables/constants)
-                if (param->type != ME_CONSTANT && param->type != ME_VARIABLE && param->output) {
-                    free(param->output);
-                    param->output = NULL;
-                }
-            }
-            break;
-        }
-    }
-}
-
-/* Helper to save original variable bindings with their pointers */
-static void save_variable_metadata(const me_expr* node, const void** var_pointers, size_t* var_sizes, int* var_count) {
-    if (!node) return;
-    switch (TYPE_MASK(node->type)) {
-    case ME_VARIABLE:
-        // Check if this pointer is already in the list
-        for (int i = 0; i < *var_count; i++) {
-            if (var_pointers[i] == node->bound) return; // Already saved
-        }
-        var_pointers[*var_count] = node->bound;
-        var_sizes[*var_count] = dtype_size(node->input_dtype);
-        (*var_count)++;
-        break;
-    case ME_FUNCTION0:
-    case ME_FUNCTION1:
-    case ME_FUNCTION2:
-    case ME_FUNCTION3:
-    case ME_FUNCTION4:
-    case ME_FUNCTION5:
-    case ME_FUNCTION6:
-    case ME_FUNCTION7:
-    case ME_CLOSURE0:
-    case ME_CLOSURE1:
-    case ME_CLOSURE2:
-    case ME_CLOSURE3:
-    case ME_CLOSURE4:
-    case ME_CLOSURE5:
-    case ME_CLOSURE6:
-    case ME_CLOSURE7:
-        {
-            const int arity = ARITY(node->type);
-            for (int i = 0; i < arity; i++) {
-                save_variable_metadata((const me_expr*)node->parameters[i], var_pointers, var_sizes, var_count);
-            }
-            break;
-        }
-    }
-}
-
-static int count_variable_nodes(const me_expr* node) {
-    if (!node) return 0;
-    switch (TYPE_MASK(node->type)) {
-    case ME_VARIABLE:
-        return 1;
-    case ME_FUNCTION0:
-    case ME_FUNCTION1:
-    case ME_FUNCTION2:
-    case ME_FUNCTION3:
-    case ME_FUNCTION4:
-    case ME_FUNCTION5:
-    case ME_FUNCTION6:
-    case ME_FUNCTION7:
-    case ME_CLOSURE0:
-    case ME_CLOSURE1:
-    case ME_CLOSURE2:
-    case ME_CLOSURE3:
-    case ME_CLOSURE4:
-    case ME_CLOSURE5:
-    case ME_CLOSURE6:
-    case ME_CLOSURE7:
-        {
-            int count = 0;
-            const int arity = ARITY(node->type);
-            for (int i = 0; i < arity; i++) {
-                count += count_variable_nodes((const me_expr*)node->parameters[i]);
-            }
-            return count;
-        }
-    }
-    return 0;
-}
-
-static void collect_variable_nodes(me_expr* node, const void** var_pointers, int n_vars,
-                                   me_expr** var_nodes, int* var_indices, int* node_count) {
-    if (!node) return;
-    switch (TYPE_MASK(node->type)) {
-    case ME_VARIABLE:
-        {
-            int idx = -1;
-            for (int i = 0; i < n_vars; i++) {
-                if (node->bound == var_pointers[i]) {
-                    idx = i;
-                    break;
-                }
-            }
-            if (idx >= 0) {
-                var_nodes[*node_count] = node;
-                var_indices[*node_count] = idx;
-                (*node_count)++;
-            }
-            break;
-        }
-    case ME_FUNCTION0:
-    case ME_FUNCTION1:
-    case ME_FUNCTION2:
-    case ME_FUNCTION3:
-    case ME_FUNCTION4:
-    case ME_FUNCTION5:
-    case ME_FUNCTION6:
-    case ME_FUNCTION7:
-    case ME_CLOSURE0:
-    case ME_CLOSURE1:
-    case ME_CLOSURE2:
-    case ME_CLOSURE3:
-    case ME_CLOSURE4:
-    case ME_CLOSURE5:
-    case ME_CLOSURE6:
-    case ME_CLOSURE7:
-        {
-            const int arity = ARITY(node->type);
-            for (int i = 0; i < arity; i++) {
-                collect_variable_nodes((me_expr*)node->parameters[i], var_pointers, n_vars,
-                                       var_nodes, var_indices, node_count);
-            }
-            break;
-        }
-    }
-}
-
-/* Helper to update variable bindings by matching original pointers */
-static void update_vars_by_pointer(me_expr* node, const void** old_pointers, const void** new_pointers, int n_vars) {
-    if (!node) return;
-    switch (TYPE_MASK(node->type)) {
-    case ME_VARIABLE:
-        // Find which variable this is and update to new pointer
-        for (int i = 0; i < n_vars; i++) {
-            if (node->bound == old_pointers[i]) {
-                node->bound = new_pointers[i];
-                break;
-            }
-        }
-        break;
-    case ME_FUNCTION0:
-    case ME_FUNCTION1:
-    case ME_FUNCTION2:
-    case ME_FUNCTION3:
-    case ME_FUNCTION4:
-    case ME_FUNCTION5:
-    case ME_FUNCTION6:
-    case ME_FUNCTION7:
-    case ME_CLOSURE0:
-    case ME_CLOSURE1:
-    case ME_CLOSURE2:
-    case ME_CLOSURE3:
-    case ME_CLOSURE4:
-    case ME_CLOSURE5:
-    case ME_CLOSURE6:
-    case ME_CLOSURE7:
-        {
-            const int arity = ARITY(node->type);
-            for (int i = 0; i < arity; i++) {
-                update_vars_by_pointer((me_expr*)node->parameters[i], old_pointers, new_pointers, n_vars);
-            }
-            break;
-        }
-    }
-}
-
-/* Helper to update variable bindings and nitems in tree */
-static void update_variable_bindings(me_expr* node, const void** new_bounds, int* var_idx, int new_nitems) {
-    if (!node) return;
-
-    // Update nitems for all nodes to handle intermediate buffers
-    if (new_nitems > 0) {
-        node->nitems = new_nitems;
-    }
-
-    switch (TYPE_MASK(node->type)) {
-    case ME_VARIABLE:
-        if (new_bounds && *var_idx >= 0) {
-            node->bound = new_bounds[*var_idx];
-            (*var_idx)++;
-        }
-        break;
-    case ME_FUNCTION0:
-    case ME_FUNCTION1:
-    case ME_FUNCTION2:
-    case ME_FUNCTION3:
-    case ME_FUNCTION4:
-    case ME_FUNCTION5:
-    case ME_FUNCTION6:
-    case ME_FUNCTION7:
-    case ME_CLOSURE0:
-    case ME_CLOSURE1:
-    case ME_CLOSURE2:
-    case ME_CLOSURE3:
-    case ME_CLOSURE4:
-    case ME_CLOSURE5:
-    case ME_CLOSURE6:
-    case ME_CLOSURE7:
-        {
-            const int arity = ARITY(node->type);
-            for (int i = 0; i < arity; i++) {
-                update_variable_bindings((me_expr*)node->parameters[i], new_bounds, var_idx, new_nitems);
-            }
-            break;
-        }
-    }
-}
-
-/* Evaluate compiled expression with new variable and output pointers */
-static me_expr* clone_expr(const me_expr* src) {
-    if (!src) return NULL;
-
-    const int arity = ARITY(src->type);
-    const int psize = sizeof(void*) * arity;
-    const int size = (sizeof(me_expr) - sizeof(void*)) + psize + (IS_CLOSURE(src->type) ? sizeof(void*) : 0);
-    me_expr* clone = malloc(size);
-    if (!clone) return NULL;
-
-    // Copy the entire structure
-    memcpy(clone, src, size);
-
-    // Clone children recursively
-    if (arity > 0) {
-        for (int i = 0; i < arity; i++) {
-            clone->parameters[i] = clone_expr((const me_expr*)src->parameters[i]);
-            if (src->parameters[i] && !clone->parameters[i]) {
-                // Clone failed, clean up
-                for (int j = 0; j < i; j++) {
-                    me_free((me_expr*)clone->parameters[j]);
-                }
-                free(clone);
-                return NULL;
-            }
-        }
-    }
-
-    // Don't clone output buffer - it will be set by caller
-    // Don't clone bytecode - not needed for clones
-    clone->output = NULL;
-    clone->bytecode = NULL;
-    clone->ncode = 0;
-
-    return clone;
-}
-
-/* Thread-safe chunked evaluation using expression cloning.
- * This function is safe to call from multiple threads simultaneously,
- * even on the same expression object. Each call creates a temporary
- * clone of the expression tree to avoid race conditions. */
-int me_eval(const me_expr* expr, const void** vars_chunk,
-            int n_vars, void* output_chunk, int chunk_nitems) {
-    if (!expr) return ME_EVAL_ERR_NULL_EXPR;
-
-    // Verify variable count matches
-    const void* original_var_pointers[ME_MAX_VARS];
-    size_t var_sizes[ME_MAX_VARS];
-    int actual_var_count = 0;
-    save_variable_metadata(expr, original_var_pointers, var_sizes, &actual_var_count);
-    if (actual_var_count > ME_MAX_VARS) {
-        fprintf(stderr, "Error: Expression uses %d variables, exceeds ME_MAX_VARS=%d\n",
-                actual_var_count, ME_MAX_VARS);
-        return ME_EVAL_ERR_TOO_MANY_VARS;
-    }
-
-    if (actual_var_count != n_vars) {
-        return ME_EVAL_ERR_VAR_MISMATCH;
-    }
-
-    // Clone the expression tree
-    me_expr* clone = clone_expr(expr);
-    if (!clone) return ME_EVAL_ERR_OOM;
-
-    const int block_nitems = ME_EVAL_BLOCK_NITEMS;
-    int status = ME_EVAL_SUCCESS;
-
-    if (!ME_EVAL_ENABLE_BLOCKING || chunk_nitems <= block_nitems) {
-        // Update clone's variable bindings
-        update_vars_by_pointer(clone, original_var_pointers, vars_chunk, n_vars);
-
-        // Update clone's nitems throughout the tree
-        int update_idx = 0;
-        update_variable_bindings(clone, NULL, &update_idx, chunk_nitems);
-
-        // Set output pointer
-        clone->output = output_chunk;
-
-        // Evaluate the clone
-        private_eval(clone);
-    }
-    else if (is_reduction_node(clone)) {
-        // Reductions operate on the full chunk; avoid block processing.
-        update_vars_by_pointer(clone, original_var_pointers, vars_chunk, n_vars);
-
-        int update_idx = 0;
-        update_variable_bindings(clone, NULL, &update_idx, chunk_nitems);
-
-        clone->output = output_chunk;
-        private_eval(clone);
-    }
-    else {
-        const size_t output_item_size = dtype_size(clone->dtype);
-        const int max_var_nodes = count_variable_nodes(clone);
-        me_expr** var_nodes = NULL;
-        int* var_indices = NULL;
-        int var_node_count = 0;
-
-        if (max_var_nodes > 0) {
-            var_nodes = malloc((size_t)max_var_nodes * sizeof(*var_nodes));
-            var_indices = malloc((size_t)max_var_nodes * sizeof(*var_indices));
-            if (!var_nodes || !var_indices) {
-                free(var_nodes);
-                free(var_indices);
-                status = ME_EVAL_ERR_OOM;
-                goto cleanup;
-            }
-            collect_variable_nodes(clone, original_var_pointers, n_vars,
-                                   var_nodes, var_indices, &var_node_count);
-        }
-
-#if defined(__clang__)
-#pragma clang loop unroll_count(4)
-#elif defined(__GNUC__) && !defined(__clang__)
-#pragma GCC unroll 4
-#endif
-        for (int offset = 0; offset < chunk_nitems; offset += block_nitems) {
-            int current = block_nitems;
-            if (offset + current > chunk_nitems) {
-                current = chunk_nitems - offset;
-            }
-
-            const void* block_vars[ME_MAX_VARS];
-            for (int i = 0; i < n_vars; i++) {
-                const unsigned char* base = (const unsigned char*)vars_chunk[i];
-                block_vars[i] = base + (size_t)offset * var_sizes[i];
-            }
-
-            for (int i = 0; i < var_node_count; i++) {
-                var_nodes[i]->bound = block_vars[var_indices[i]];
-            }
-
-            int update_idx = 0;
-            update_variable_bindings(clone, NULL, &update_idx, current);
-
-            clone->output = (unsigned char*)output_chunk + (size_t)offset * output_item_size;
-            private_eval(clone);
-        }
-
-        free(var_nodes);
-        free(var_indices);
-    }
-
-cleanup:
-    // Free the clone (including any intermediate buffers it allocated)
-    me_free(clone);
-    return status;
-}
-
-
-static void optimize(me_expr* n) {
-    /* Evaluates as much as possible. */
-    if (!n) return;
-    if (n->type == ME_CONSTANT) return;
-    if (n->type == ME_VARIABLE) return;
-
-    /* Only optimize out functions flagged as pure. */
-    if (IS_PURE(n->type)) {
-        const int arity = ARITY(n->type);
-        int known = 1;
-        int i;
-        for (i = 0; i < arity; ++i) {
-            optimize(n->parameters[i]);
-            if (((me_expr*)(n->parameters[i]))->type != ME_CONSTANT) {
-                known = 0;
-            }
-        }
-        if (known) {
-            const double value = me_eval_scalar(n);
-            me_free_parameters(n);
-            n->type = ME_CONSTANT;
-            n->value = value;
-        }
-    }
-}
-
-#if defined(_WIN32) || defined(_WIN64)
-static bool has_complex_node(const me_expr* n) {
-    if (!n) return false;
-    if (n->dtype == ME_COMPLEX64 || n->dtype == ME_COMPLEX128) return true;
-    const int arity = ARITY(n->type);
-    for (int i = 0; i < arity; i++) {
-        if (has_complex_node((const me_expr*)n->parameters[i])) return true;
-    }
-    return false;
-}
-
-static bool has_complex_input(const me_expr* n) {
-    if (!n) return false;
-    if (n->input_dtype == ME_COMPLEX64 || n->input_dtype == ME_COMPLEX128) return true;
-    const int arity = ARITY(n->type);
-    for (int i = 0; i < arity; i++) {
-        if (has_complex_input((const me_expr*)n->parameters[i])) return true;
-    }
-    return false;
-}
-#endif
-
-
-static int private_compile(const char* expression, const me_variable* variables, int var_count,
-                           void* output, int nitems, me_dtype dtype, int* error, me_expr** out) {
-    if (out) *out = NULL;
-    if (!expression || !out || var_count < 0) {
-        if (error) *error = -1;
-        return ME_COMPILE_ERR_INVALID_ARG;
-    }
-
-    // Validate dtype usage: either all vars are ME_AUTO (use dtype), or dtype is ME_AUTO (use var dtypes)
-    if (variables && var_count > 0) {
-        int auto_count = 0;
-        int specified_count = 0;
-
-        for (int i = 0; i < var_count; i++) {
-            if (variables[i].dtype == ME_AUTO) {
-                auto_count++;
-            }
-            else {
-                specified_count++;
-            }
-        }
-
-        // Check the two valid modes
-        if (dtype == ME_AUTO) {
-            // Mode 1: Output dtype is ME_AUTO, all variables must have explicit dtypes
-            if (auto_count > 0) {
-                fprintf(
-                    stderr,
-                    "Error: When output dtype is ME_AUTO, all variable dtypes must be specified (not ME_AUTO)\n");
-                if (error) *error = -1;
-                return ME_COMPILE_ERR_VAR_UNSPECIFIED;
-            }
-        }
-        else {
-            // Mode 2: Output dtype is specified
-            // Two sub-modes: all ME_AUTO (homogeneous), or all explicit (heterogeneous with conversion)
-            if (auto_count > 0 && specified_count > 0) {
-                // Mixed mode not allowed
-                fprintf(stderr, "Error: Variable dtypes must be all ME_AUTO or all explicitly specified\n");
-                if (error) *error = -1;
-                return ME_COMPILE_ERR_VAR_MIXED;
-            }
-        }
-    }
-
-    // Create a copy of variables with dtype filled in (if not already set)
-    me_variable* vars_copy = NULL;
-    if (variables && var_count > 0) {
-        vars_copy = malloc(var_count * sizeof(me_variable));
-        if (!vars_copy) {
-            if (error) *error = -1;
-            return ME_COMPILE_ERR_OOM;
-        }
-        for (int i = 0; i < var_count; i++) {
-            vars_copy[i] = variables[i];
-            // If dtype not set (ME_AUTO), use the provided dtype
-            if (vars_copy[i].dtype == ME_AUTO && vars_copy[i].type == 0) {
-                vars_copy[i].dtype = dtype;
-                vars_copy[i].type = ME_VARIABLE;
-            }
-        }
-    }
-
-    state s;
-    s.start = s.next = expression;
-    s.lookup = vars_copy ? vars_copy : variables;
-    s.lookup_len = var_count;
-    // When dtype is ME_AUTO, infer target dtype from variables to avoid type mismatch
-    if (dtype != ME_AUTO) {
-        s.target_dtype = dtype;
-    }
-    else if (variables && var_count > 0) {
-        // Use the first variable's dtype as the target for constants
-        // This prevents type promotion issues when mixing float32 vars with float64 constants
-        s.target_dtype = variables[0].dtype;
-    }
-    else {
-        s.target_dtype = ME_AUTO;
-    }
-
-    next_token(&s);
-    me_expr* root = list(&s);
-
-    if (root == NULL) {
-        if (error) *error = -1;
-        if (vars_copy) free(vars_copy);
-        return ME_COMPILE_ERR_OOM;
-    }
-
-    if (contains_reduction(root) && !reduction_usage_is_valid(root)) {
-        me_free(root);
-        if (error) *error = -1;
-        if (vars_copy) free(vars_copy);
-        return ME_COMPILE_ERR_REDUCTION_INVALID;
-    }
-
-#if defined(_WIN32) || defined(_WIN64)
-    {
-        const me_variable* vars_check = vars_copy ? vars_copy : variables;
-        bool complex_vars = false;
-        if (vars_check) {
-            for (int i = 0; i < var_count; i++) {
-                if (vars_check[i].dtype == ME_COMPLEX64 || vars_check[i].dtype == ME_COMPLEX128) {
-                    complex_vars = true;
-                    break;
-                }
-            }
-        }
-        if (complex_vars ||
-            dtype == ME_COMPLEX64 || dtype == ME_COMPLEX128 ||
-            has_complex_node(root) || has_complex_input(root)) {
-            fprintf(stderr, "Error: Complex expressions are not supported on Windows (no C99 complex ABI)\n");
-            me_free(root);
-            if (error) *error = -1;
-            if (vars_copy) free(vars_copy);
-            return ME_COMPILE_ERR_COMPLEX_UNSUPPORTED;
-        }
-    }
-#endif
-
-    if (s.type != TOK_END) {
-        me_free(root);
-        if (error) {
-            *error = (s.next - s.start);
-            if (*error == 0) *error = 1;
-        }
-        if (vars_copy) free(vars_copy);
-        return ME_COMPILE_ERR_PARSE;
-    }
-    else {
-        optimize(root);
-        root->output = output;
-        root->nitems = nitems;
-
-        // If dtype is ME_AUTO, infer from expression; otherwise use provided dtype
-        if (dtype == ME_AUTO) {
-            root->dtype = infer_output_type(root);
-        }
-        else {
-            // User explicitly requested a dtype - use it (will cast if needed)
-            root->dtype = dtype;
-        }
-
-        if (error) *error = 0;
-        if (vars_copy) free(vars_copy);
-        *out = root;
-        return ME_COMPILE_SUCCESS;
-    }
-}
-
-// Synthetic addresses for ordinal matching (when user provides NULL addresses)
-static char synthetic_var_addresses[ME_MAX_VARS];
-
-int me_compile(const char* expression, const me_variable* variables,
-               int var_count, me_dtype dtype, int* error, me_expr** out) {
-    if (out) *out = NULL;
-    if (!out) {
-        if (error) *error = -1;
-        return ME_COMPILE_ERR_INVALID_ARG;
-    }
-
-    // For chunked evaluation, we compile without specific output/nitems
-    // If variables have NULL addresses, assign synthetic unique addresses for ordinal matching
-    me_variable* vars_copy = NULL;
-    int needs_synthetic = 0;
-
-    if (variables && var_count > 0) {
-        // Check if any variables have NULL addresses
-        for (int i = 0; i < var_count; i++) {
-            if (variables[i].address == NULL) {
-                needs_synthetic = 1;
-                break;
-            }
-        }
-
-        if (needs_synthetic) {
-            // Create copy with synthetic addresses
-            vars_copy = malloc(var_count * sizeof(me_variable));
-            if (!vars_copy) {
-                if (error) *error = -1;
-                return ME_COMPILE_ERR_OOM;
-            }
-
-            for (int i = 0; i < var_count; i++) {
-                vars_copy[i] = variables[i];
-                if (vars_copy[i].address == NULL) {
-                    // Use address in synthetic array (each index is unique)
-                    vars_copy[i].address = &synthetic_var_addresses[i];
-                }
-            }
-
-            int status = private_compile(expression, vars_copy, var_count, NULL, 0, dtype, error, out);
-            free(vars_copy);
-            return status;
-        }
-    }
-
-    // No NULL addresses, use variables as-is
-    return private_compile(expression, variables, var_count, NULL, 0, dtype, error, out);
-}
-
-static void pn(const me_expr* n, int depth) {
-    int i, arity;
-    printf("%*s", depth, "");
-
-    if (!n) {
-        printf("NULL\n");
-        return;
-    }
-
-    switch (TYPE_MASK(n->type)) {
-    case ME_CONSTANT: printf("%f\n", n->value);
-        break;
-    case ME_VARIABLE: printf("bound %p\n", n->bound);
-        break;
-
-    case ME_FUNCTION0:
-    case ME_FUNCTION1:
-    case ME_FUNCTION2:
-    case ME_FUNCTION3:
-    case ME_FUNCTION4:
-    case ME_FUNCTION5:
-    case ME_FUNCTION6:
-    case ME_FUNCTION7:
-    case ME_CLOSURE0:
-    case ME_CLOSURE1:
-    case ME_CLOSURE2:
-    case ME_CLOSURE3:
-    case ME_CLOSURE4:
-    case ME_CLOSURE5:
-    case ME_CLOSURE6:
-    case ME_CLOSURE7:
-        arity = ARITY(n->type);
-        printf("f%d", arity);
-        for (i = 0; i < arity; i++) {
-            printf(" %p", n->parameters[i]);
-        }
-        printf("\n");
-        for (i = 0; i < arity; i++) {
-            pn(n->parameters[i], depth + 1);
-        }
-        break;
-    }
-}
-
-void me_print(const me_expr* n) {
-    pn(n, 0);
-}
-
-me_dtype me_get_dtype(const me_expr* expr) {
-    return expr ? expr->dtype : ME_AUTO;
-}
diff --git a/src/blosc2/miniexpr.h b/src/blosc2/miniexpr.h
deleted file mode 100644
index e778e579..00000000
--- a/src/blosc2/miniexpr.h
+++ /dev/null
@@ -1,225 +0,0 @@
-/*********************************************************************
-  Blosc - Blocked Shuffling and Compression Library
-
-  Copyright (c) 2025  Blosc Development Team <blosc@blosc.org>
-  https://blosc.org
-  License: BSD 3-Clause (see LICENSE.txt)
-
-  See LICENSE.txt for details about copyright and rights to use.
-**********************************************************************/
-
-// Loosely based on https://github.com/CodePlea/tinyexpr. License follows:
-// SPDX-License-Identifier: Zlib
-/*
- * TINYEXPR - Tiny recursive descent parser and evaluation engine in C
- *
- * Copyright (c) 2015-2020 Lewis Van Winkle
- *
- * http://CodePlea.com
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgement in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#ifndef MINIEXPR_H
-#define MINIEXPR_H
-
-
-#ifdef __cplusplus
-extern "C" {
-
-
-#endif
-
-/* Internal eval block size (elements). Compile-time fixed. */
-#ifndef ME_EVAL_BLOCK_NITEMS
-#define ME_EVAL_BLOCK_NITEMS 4096
-#endif
-
-/* Maximum number of variables supported in a single expression. */
-#ifndef ME_MAX_VARS
-#define ME_MAX_VARS 128
-#endif
-
-/* Enable internal eval blocking for large chunks (1 = on, 0 = off). */
-#ifndef ME_EVAL_ENABLE_BLOCKING
-#define ME_EVAL_ENABLE_BLOCKING 1
-#endif
-
-
-/* Data type enumeration - Full C99 support */
-typedef enum {
-    /* Automatic type inference */
-    ME_AUTO,
-
-    /* Boolean */
-    ME_BOOL,
-
-    /* Signed integers */
-    ME_INT8,
-    ME_INT16,
-    ME_INT32,
-    ME_INT64,
-
-    /* Unsigned integers */
-    ME_UINT8,
-    ME_UINT16,
-    ME_UINT32,
-    ME_UINT64,
-
-    /* Floating point */
-    ME_FLOAT32,
-    ME_FLOAT64,
-
-    /* Complex (C99) */
-    ME_COMPLEX64, /* float complex */
-    ME_COMPLEX128 /* double complex */
-} me_dtype;
-
-/* Opaque type for compiled expressions */
-typedef struct me_expr me_expr;
-
-
-enum {
-    ME_VARIABLE = 0,
-
-    ME_FUNCTION0 = 8, ME_FUNCTION1, ME_FUNCTION2, ME_FUNCTION3,
-    ME_FUNCTION4, ME_FUNCTION5, ME_FUNCTION6, ME_FUNCTION7,
-
-    ME_CLOSURE0 = 16, ME_CLOSURE1, ME_CLOSURE2, ME_CLOSURE3,
-    ME_CLOSURE4, ME_CLOSURE5, ME_CLOSURE6, ME_CLOSURE7,
-
-    ME_FLAG_PURE = 32
-};
-
-typedef struct me_variable {
-    const char *name;
-    me_dtype dtype; // Data type of this variable (ME_AUTO = use output dtype)
-    const void *address; // Pointer to data (NULL for me_compile)
-    int type; // ME_VARIABLE for user variables (0 = auto-set to ME_VARIABLE)
-    void *context; // For closures/functions (NULL for normal variables)
-} me_variable;
-
-/* Note: When initializing variables, only name/dtype/address are typically needed.
- * Unspecified fields default to 0/NULL, which is correct for normal use:
- *   {"varname"}                          → defaults all fields
- *   {"varname", ME_FLOAT64}              → for me_compile with mixed types
- *   {"varname", ME_FLOAT64, var_array}   → for me_compile with address
- * Advanced users can specify type for closures/functions if needed.
- */
-
-
-/* Compile expression for chunked evaluation.
- * This function is optimized for use with me_eval(),
- * where variable and output pointers are provided later during evaluation.
- *
- * Parameters:
- *   expression: The expression string to compile
- *   variables: Array of variable definitions. Only the 'name' field is required.
- *              Variables will be matched by position (ordinal order) during me_eval().
- *   var_count: Number of variables
- *   dtype: Data type handling:
- *          - ME_AUTO: All variables must specify their dtypes, output is inferred
- *          - Specific type: Either all variables are ME_AUTO (homogeneous, all use this type),
- *            OR all variables have explicit dtypes (heterogeneous, result cast to this type)
- *   error: Optional pointer to receive error position (0 on success, >0 on parse error)
- *   out: Output pointer to receive the compiled expression
- *
- * Returns: ME_COMPILE_SUCCESS (0) on success, or a negative ME_COMPILE_ERR_* code on failure
- *
- * Example 1 (simple - all same type):
- *   me_variable vars[] = {{"x"}, {"y"}};  // Both ME_AUTO
- *   me_expr *expr = NULL;
- *   if (me_compile("x + y", vars, 2, ME_FLOAT64, &err, &expr) != ME_COMPILE_SUCCESS) { return; }
- *
- * Example 2 (mixed types with ME_AUTO):
- *   me_variable vars[] = {{"x", ME_INT32}, {"y", ME_FLOAT64}};
- *   me_expr *expr = NULL;
- *   if (me_compile("x + y", vars, 2, ME_AUTO, &err, &expr) != ME_COMPILE_SUCCESS) { return; }
- *
- * Example 3 (mixed types with explicit output):
- *   me_variable vars[] = {{"x", ME_INT32}, {"y", ME_FLOAT64}};
- *   me_expr *expr = NULL;
- *   if (me_compile("x + y", vars, 2, ME_FLOAT32, &err, &expr) != ME_COMPILE_SUCCESS) { return; }
- *   // Variables keep their types, result is cast to FLOAT32
- *
- *   // Later, provide data in same order as variable definitions
- *   const void *data[] = {x_array, y_array};  // x first, y second
- *   if (me_eval(expr, data, 2, output, nitems) != ME_EVAL_SUCCESS) { return; }
- */
-int me_compile(const char *expression, const me_variable *variables,
-               int var_count, me_dtype dtype, int *error, me_expr **out);
-
-/* Status codes for me_compile(). */
-typedef enum {
-    ME_COMPILE_SUCCESS = 0,
-    ME_COMPILE_ERR_OOM = -1,
-    ME_COMPILE_ERR_PARSE = -2,
-    ME_COMPILE_ERR_INVALID_ARG = -3,
-    ME_COMPILE_ERR_COMPLEX_UNSUPPORTED = -4,
-    ME_COMPILE_ERR_REDUCTION_INVALID = -5,
-    ME_COMPILE_ERR_VAR_MIXED = -6,
-    ME_COMPILE_ERR_VAR_UNSPECIFIED = -7
-} me_compile_status;
-
-/* Status codes for me_eval(). */
-typedef enum {
-    ME_EVAL_SUCCESS = 0,
-    ME_EVAL_ERR_OOM = -1,
-    ME_EVAL_ERR_NULL_EXPR = -2,
-    ME_EVAL_ERR_TOO_MANY_VARS = -3,
-    ME_EVAL_ERR_VAR_MISMATCH = -4
-} me_eval_status;
-
-/* Evaluates compiled expression with variable and output pointers.
- * This function can be safely called from multiple threads simultaneously on the
- * same compiled expression. It creates a temporary clone of the expression tree
- * for each call, eliminating race conditions at the cost of some memory allocation.
- *
- * Parameters:
- *   expr: Compiled expression (from me_compile)
- *   vars_chunk: Array of pointers to variable data chunks (same order as in me_compile)
- *   n_vars: Number of variables (must match the number used in me_compile)
- *   output_chunk: Pointer to output buffer for this chunk
- *   chunk_nitems: Number of elements in this chunk
- *
- * Returns:
- *   ME_EVAL_SUCCESS (0) on success, or a negative ME_EVAL_ERR_* code on failure.
- *
- * Use this function for both serial and parallel evaluation. It is thread-safe
- * and can be used from multiple threads to process different chunks simultaneously.
- */
-int me_eval(const me_expr *expr, const void **vars_chunk,
-            int n_vars, void *output_chunk, int chunk_nitems);
-
-/* Prints the expression tree for debugging purposes. */
-void me_print(const me_expr *n);
-
-/* Frees the expression. */
-/* This is safe to call on NULL pointers. */
-void me_free(me_expr *n);
-
-/* Get the result data type of a compiled expression.
- * Returns the dtype that will be used for the output of me_eval().
- */
-me_dtype me_get_dtype(const me_expr *expr);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /*MINIEXPR_H*/
diff --git a/src/blosc2/miniexpr_numpy.h b/src/blosc2/miniexpr_numpy.h
deleted file mode 100644
index 8250c812..00000000
--- a/src/blosc2/miniexpr_numpy.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/*********************************************************************
-  Blosc - Blocked Shuffling and Compression Library
-
-  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
-  https://blosc.org
-  License: BSD 3-Clause (see LICENSE.txt)
-
-  NumPy Integration Utilities for MiniExpr
-
-  This file provides conversion functions between miniexpr dtypes
-  and NumPy type numbers for Python bindings.
-**********************************************************************/
-
-#ifndef MINIEXPR_NUMPY_H
-#define MINIEXPR_NUMPY_H
-
-#include <stdio.h>
-#include "miniexpr.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Convert miniexpr dtype to NumPy type number
- *
- * Returns the NumPy dtype.num value corresponding to a miniexpr dtype.
- * Returns -1 for ME_AUTO (which has no NumPy equivalent).
- *
- * Example:
- *   int numpy_num = me_dtype_to_numpy(ME_INT64);  // Returns 7
- */
-static inline int me_dtype_to_numpy(me_dtype dtype) {
-    static const int numpy_type_nums[] = {
-        -1,  // ME_AUTO (0) -> No NumPy equivalent
-        0,   // ME_BOOL (1) -> NPY_BOOL
-        1,   // ME_INT8 (2) -> NPY_BYTE
-        3,   // ME_INT16 (3) -> NPY_SHORT
-        5,   // ME_INT32 (4) -> NPY_INT
-        7,   // ME_INT64 (5) -> NPY_LONGLONG
-        2,   // ME_UINT8 (6) -> NPY_UBYTE
-        4,   // ME_UINT16 (7) -> NPY_USHORT
-        6,   // ME_UINT32 (8) -> NPY_UINT
-        8,   // ME_UINT64 (9) -> NPY_ULONGLONG
-        11,  // ME_FLOAT32 (10) -> NPY_FLOAT
-        12,  // ME_FLOAT64 (11) -> NPY_DOUBLE
-        14,  // ME_COMPLEX64 (12) -> NPY_CFLOAT
-        15   // ME_COMPLEX128 (13) -> NPY_CDOUBLE
-    };
-
-    if (dtype >= 0 && dtype <= ME_COMPLEX128) {
-        return numpy_type_nums[dtype];
-    }
-    return -1;  // Invalid dtype
-}
-
-/* Get a string name for a NumPy type number (for error messages)
- *
- * Returns a human-readable name for common NumPy types.
- * Returns "unknown" for unsupported types.
- */
-static inline const char* me_numpy_type_name(int numpy_type_num) {
-    switch (numpy_type_num) {
-        case 0:  return "bool";
-        case 1:  return "int8";
-        case 2:  return "uint8";
-        case 3:  return "int16";
-        case 4:  return "uint16";
-        case 5:  return "int32";
-        case 6:  return "uint32";
-        case 7:  return "int64";
-        case 8:  return "uint64";
-        case 9:  return "float16";      // Not supported
-        case 10: return "longdouble";   // Not supported
-        case 11: return "float32";
-        case 12: return "float64";
-        case 13: return "clongdouble";  // Not supported
-        case 14: return "complex64";
-        case 15: return "complex128";
-        default: return "unknown";
-    }
-}
-
-/* Convert NumPy type number to miniexpr dtype
- *
- * Returns the miniexpr dtype corresponding to a NumPy dtype.num value.
- * Returns -1 and prints an error message for unsupported NumPy types.
- *
- * Example:
- *   me_dtype dtype = me_dtype_from_numpy(7);  // Returns ME_INT64
- *   if (dtype < 0) {
- *       // Unsupported type, error already printed
- *       return NULL;
- *   }
- *
- * Note: This function only supports the subset of NumPy types that
- * miniexpr implements. Other types (float16, longdouble, etc.) will
- * return -1 and print an error message to stderr.
- */
-static inline me_dtype me_dtype_from_numpy(int numpy_type_num) {
-    switch (numpy_type_num) {
-        case 0:  return ME_BOOL;
-        case 1:  return ME_INT8;
-        case 2:  return ME_UINT8;
-        case 3:  return ME_INT16;
-        case 4:  return ME_UINT16;
-        case 5:  return ME_INT32;
-        case 6:  return ME_UINT32;
-        case 7:  return ME_INT64;
-        case 8:  return ME_UINT64;
-        case 11: return ME_FLOAT32;
-        case 12: return ME_FLOAT64;
-        case 14: return ME_COMPLEX64;
-        case 15: return ME_COMPLEX128;
-        default:
-            fprintf(stderr, "Error: Unsupported NumPy dtype.num = %d (%s)\n",
-                    numpy_type_num, me_numpy_type_name(numpy_type_num));
-            return -1;  // Return -1 to indicate error
-    }
-}
-
-/* Check if a NumPy type is supported by miniexpr
- *
- * Returns 1 if the NumPy type number is supported, 0 otherwise.
- * This function does not print error messages.
- *
- * Example:
- *   if (me_numpy_type_supported(numpy_dtype_num)) {
- *       // Can use this type with miniexpr
- *   }
- */
-static inline int me_numpy_type_supported(int numpy_type_num) {
-    // Check directly without calling me_dtype_from_numpy to avoid error messages
-    switch (numpy_type_num) {
-        case 0:   // bool
-        case 1:   // int8
-        case 2:   // uint8
-        case 3:   // int16
-        case 4:   // uint16
-        case 5:   // int32
-        case 6:   // uint32
-        case 7:   // int64
-        case 8:   // uint64
-        case 11:  // float32
-        case 12:  // float64
-        case 14:  // complex64
-        case 15:  // complex128
-            return 1;
-        default:
-            return 0;
-    }
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* MINIEXPR_NUMPY_H */

From a179ea148450dae83ed97b361c494bf513654787 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 8 Jan 2026 13:30:33 +0100
Subject: [PATCH 075/123] Relax precision checking for tan in miniexpr

---
 tests/ndarray/test_lazyexpr.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py
index 1425f2c4..3fa54f9f 100644
--- a/tests/ndarray/test_lazyexpr.py
+++ b/tests/ndarray/test_lazyexpr.py
@@ -338,7 +338,11 @@ def test_functions(function, dtype_fixture, shape_fixture):
     expr_string = f"na1 + {function}(na2)"
     res_numexpr = ne_evaluate(expr_string)
     # Compare the results
-    np.testing.assert_allclose(res_lazyexpr[:], res_numexpr, rtol=1e-5)
+    if function == "tan":
+        # tan in miniexpr has not a lot of precision for values that are close to 0
+        np.testing.assert_allclose(res_lazyexpr[:], res_numexpr, rtol=5e-4)
+    else:
+        np.testing.assert_allclose(res_lazyexpr[:], res_numexpr, rtol=1e-5)
 
     # Functions of the form np.function(a1 + a2)
     expr = eval(f"np.{function}(a1 + a2)", {"a1": a1, "a2": a2, "np": np})

From 4e65f5922307df8e60ebbeb93370b506a50125ac Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 8 Jan 2026 13:38:46 +0100
Subject: [PATCH 076/123] Only attempt to compile the static version of
 miniexpr

---
 CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 54d6f537..54f799c2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -50,6 +50,11 @@ target_link_libraries(blosc2_ext PRIVATE Python::NumPy)
 # Fetch and build miniexpr library
 include(FetchContent)
 
+set(MINIEXPR_BUILD_SHARED OFF CACHE BOOL "Build miniexpr shared library" FORCE)
+set(MINIEXPR_BUILD_TESTS OFF CACHE BOOL "Build miniexpr tests" FORCE)
+set(MINIEXPR_BUILD_EXAMPLES OFF CACHE BOOL "Build miniexpr examples" FORCE)
+set(MINIEXPR_BUILD_BENCH OFF CACHE BOOL "Build miniexpr benchmarks" FORCE)
+
 FetchContent_Declare(miniexpr
     GIT_REPOSITORY https://github.com/Blosc/miniexpr.git
     GIT_TAG 3e0ad9f2800cfb46729da88553a9228845eaa731  # latest SIMD additions

From cda72e25616a2ea1c62a69997bb4416e08a2dfd1 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 8 Jan 2026 13:45:06 +0100
Subject: [PATCH 077/123] Set Position Independent Code when compiling miniexpr

---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 54f799c2..accbd439 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -50,6 +50,7 @@ target_link_libraries(blosc2_ext PRIVATE Python::NumPy)
 # Fetch and build miniexpr library
 include(FetchContent)
 
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(MINIEXPR_BUILD_SHARED OFF CACHE BOOL "Build miniexpr shared library" FORCE)
 set(MINIEXPR_BUILD_TESTS OFF CACHE BOOL "Build miniexpr tests" FORCE)
 set(MINIEXPR_BUILD_EXAMPLES OFF CACHE BOOL "Build miniexpr examples" FORCE)

From a604af21ff998690f5fecbf6496a1bb8dbec9661 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 8 Jan 2026 13:49:58 +0100
Subject: [PATCH 078/123] Remove torch as a test requirement (too heavy
 dependency)

---
 pyproject.toml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 93073710..86cf1999 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,7 +65,8 @@ dev = [
 test = [
     "pytest",
     "psutil; platform_machine != 'wasm32'",
-    "torch; platform_machine != 'wasm32'",
+    # torch is optional because it is quite large (but will still be used if found)
+    # "torch; platform_machine != 'wasm32'",
 ]
 doc = [
     "sphinx>=8",

From ac019cb5dbbc2698c005879542d0c8da2a9ae2ff Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 8 Jan 2026 14:00:09 +0100
Subject: [PATCH 079/123] Deactivate openzl on wasm platforms (requires too
 advanced C11)

---
 .github/workflows/wasm.yml | 1 +
 CMakeLists.txt             | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/.github/workflows/wasm.yml b/.github/workflows/wasm.yml
index ab008eea..8f98cbe3 100644
--- a/.github/workflows/wasm.yml
+++ b/.github/workflows/wasm.yml
@@ -21,6 +21,7 @@ jobs:
     env:
       CIBW_BUILD: ${{ matrix.cibw_build }}
       CMAKE_ARGS: "-DWITH_OPTIM=OFF"
+      DEACTIVATE_OPENZL: "1"
       CIBW_TEST_COMMAND: "pytest {project}/tests/ndarray/test_reductions.py"
     strategy:
       matrix:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index accbd439..d9ad560b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,6 +103,9 @@ else()
     set(BUILD_EXAMPLES OFF CACHE BOOL "Build C-Blosc2 examples")
     set(BUILD_BENCHMARKS OFF CACHE BOOL "Build C-Blosc2 benchmarks")
     set(BUILD_FUZZERS OFF CACHE BOOL "Build C-Blosc2 fuzzers")
+    if(DEFINED ENV{DEACTIVATE_OPENZL})
+        set(DEACTIVATE_OPENZL ON CACHE BOOL "Do not include support for the OpenZL library.")
+    endif()
     set(CMAKE_POSITION_INDEPENDENT_CODE ON)
     # we want the binaries of the C-Blosc2 library to go into the wheels
     set(BLOSC_INSTALL ON)

From d521d4b6fbc8387d1a034d22b969ad09c5c3cbee Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 8 Jan 2026 14:02:54 +0100
Subject: [PATCH 080/123] More fixing issues on windows

---
 .github/workflows/cibuildwheels.yml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/cibuildwheels.yml b/.github/workflows/cibuildwheels.yml
index 168ec33f..80385592 100644
--- a/.github/workflows/cibuildwheels.yml
+++ b/.github/workflows/cibuildwheels.yml
@@ -18,7 +18,11 @@ env:
   # musllinux takes too long to build, and it's not worth it for now
   CIBW_SKIP: "pp* *musllinux* *-win32"
   # Use explicit generator/compiler env vars; CMAKE_ARGS with spaces is not split on Windows.
-  CIBW_ENVIRONMENT_WINDOWS: "PATH=C:\\Program Files\\LLVM\\bin;%PATH% CMAKE_GENERATOR=Ninja CC=clang-cl CXX=clang-cl"
+  CIBW_ENVIRONMENT_WINDOWS: >-
+    PATH=C:\\Program Files\\LLVM\\bin;%PATH%
+    CMAKE_GENERATOR=Ninja
+    CC=clang-cl
+    CXX=clang-cl
 
 jobs:
 

From 677cfe626fe8c8f083a2481a30016d85927e879b Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 8 Jan 2026 14:10:23 +0100
Subject: [PATCH 081/123] Protect tests that depend on torch

---
 tests/ndarray/test_elementwise_funcs.py | 3 ++-
 tests/ndarray/test_lazyexpr.py          | 3 ++-
 tests/ndarray/test_linalg.py            | 3 ++-
 tests/ndarray/test_setitem.py           | 3 ++-
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/ndarray/test_elementwise_funcs.py b/tests/ndarray/test_elementwise_funcs.py
index 0a03a4a2..4c13572d 100644
--- a/tests/ndarray/test_elementwise_funcs.py
+++ b/tests/ndarray/test_elementwise_funcs.py
@@ -3,10 +3,11 @@
 
 import numpy as np
 import pytest
-import torch
 
 import blosc2
 
+torch = pytest.importorskip("torch", reason="torch not available")
+
 warnings.simplefilter("always")
 
 # Functions to test (add more as needed)
diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py
index 3fa54f9f..a06c7d63 100644
--- a/tests/ndarray/test_lazyexpr.py
+++ b/tests/ndarray/test_lazyexpr.py
@@ -10,12 +10,13 @@
 
 import numpy as np
 import pytest
-import torch
 
 import blosc2
 from blosc2.lazyexpr import ne_evaluate
 from blosc2.utils import get_chunks_idx, npvecdot
 
+torch = pytest.importorskip("torch", reason="torch not available")
+
 NITEMS_SMALL = 100
 NITEMS = 1000
 
diff --git a/tests/ndarray/test_linalg.py b/tests/ndarray/test_linalg.py
index 9c1a110d..fb310049 100644
--- a/tests/ndarray/test_linalg.py
+++ b/tests/ndarray/test_linalg.py
@@ -3,12 +3,13 @@
 
 import numpy as np
 import pytest
-import torch
 
 import blosc2
 from blosc2.lazyexpr import linalg_funcs
 from blosc2.utils import npvecdot
 
+torch = pytest.importorskip("torch", reason="torch not available")
+
 
 @pytest.mark.parametrize(
     ("ashape", "achunks", "ablocks"),
diff --git a/tests/ndarray/test_setitem.py b/tests/ndarray/test_setitem.py
index a2145b90..9e9751f9 100644
--- a/tests/ndarray/test_setitem.py
+++ b/tests/ndarray/test_setitem.py
@@ -8,10 +8,11 @@
 
 import numpy as np
 import pytest
-import torch
 
 import blosc2
 
+torch = pytest.importorskip("torch", reason="torch not available")
+
 argnames = "shape, chunks, blocks, slices, dtype"
 argvalues = [
     ([456], [258], [73], slice(0, 1), np.int32),

From e55bf994277d2e02cddd56d69b73553c9afe8156 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 8 Jan 2026 14:23:23 +0100
Subject: [PATCH 082/123] More granularity in protecting tests requiring torch

---
 tests/ndarray/test_elementwise_funcs.py | 16 +++++++--------
 tests/ndarray/test_lazyexpr.py          | 11 +++++++++--
 tests/ndarray/test_linalg.py            | 11 +++++++++--
 tests/ndarray/test_setitem.py           | 26 +++++++++++++++----------
 4 files changed, 42 insertions(+), 22 deletions(-)

diff --git a/tests/ndarray/test_elementwise_funcs.py b/tests/ndarray/test_elementwise_funcs.py
index 4c13572d..fdce9239 100644
--- a/tests/ndarray/test_elementwise_funcs.py
+++ b/tests/ndarray/test_elementwise_funcs.py
@@ -6,8 +6,6 @@
 
 import blosc2
 
-torch = pytest.importorskip("torch", reason="torch not available")
-
 warnings.simplefilter("always")
 
 # Functions to test (add more as needed)
@@ -312,9 +310,10 @@ def test_unary_funcs(np_func, blosc_func, dtype, shape, chunkshape):
 @pytest.mark.parametrize(("np_func", "blosc_func"), UNARY_FUNC_PAIRS)
 @pytest.mark.parametrize("dtype", STR_DTYPES)
 @pytest.mark.parametrize("shape", [(10,), (20, 20)])
-@pytest.mark.parametrize("xp", [torch])
-def test_unfuncs_proxy(np_func, blosc_func, dtype, shape, xp):
-    _test_unary_func_proxy(np_func, blosc_func, dtype, shape, xp)
+def test_unary_funcs_torch_proxy(np_func, blosc_func, dtype, shape):
+    """Test unary functions with torch tensors as input (via proxy)."""
+    torch = pytest.importorskip("torch")
+    _test_unary_func_proxy(np_func, blosc_func, dtype, shape, torch)
 
 
 @pytest.mark.heavy
@@ -335,9 +334,10 @@ def test_binary_funcs(np_func, blosc_func, dtype, shape, chunkshape):
 @pytest.mark.parametrize(("np_func", "blosc_func"), BINARY_FUNC_PAIRS)
 @pytest.mark.parametrize("dtype", STR_DTYPES)
 @pytest.mark.parametrize(("shape", "chunkshape"), SHAPES_CHUNKS)
-@pytest.mark.parametrize("xp", [torch])
-def test_binfuncs_proxy(np_func, blosc_func, dtype, shape, chunkshape, xp):
-    _test_binary_func_proxy(np_func, blosc_func, dtype, shape, chunkshape, xp)
+def test_binary_funcs_torch_proxy(np_func, blosc_func, dtype, shape, chunkshape):
+    """Test binary functions with torch tensors as input (via proxy)."""
+    torch = pytest.importorskip("torch")
+    _test_binary_func_proxy(np_func, blosc_func, dtype, shape, chunkshape, torch)
 
 
 @pytest.mark.heavy
diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py
index a06c7d63..f1a9e01c 100644
--- a/tests/ndarray/test_lazyexpr.py
+++ b/tests/ndarray/test_lazyexpr.py
@@ -15,7 +15,14 @@
 from blosc2.lazyexpr import ne_evaluate
 from blosc2.utils import get_chunks_idx, npvecdot
 
-torch = pytest.importorskip("torch", reason="torch not available")
+# Conditionally import torch for proxy tests
+try:
+    import torch
+
+    PROXY_TEST_XP = [torch, np]
+except ImportError:
+    torch = None
+    PROXY_TEST_XP = [np]
 
 NITEMS_SMALL = 100
 NITEMS = 1000
@@ -1848,7 +1855,7 @@ def test_lazyexpr_2args():
 
 @pytest.mark.parametrize(
     "xp",
-    [torch, np],
+    PROXY_TEST_XP,
 )
 @pytest.mark.parametrize(
     "dtype",
diff --git a/tests/ndarray/test_linalg.py b/tests/ndarray/test_linalg.py
index fb310049..33ff45d6 100644
--- a/tests/ndarray/test_linalg.py
+++ b/tests/ndarray/test_linalg.py
@@ -8,7 +8,14 @@
 from blosc2.lazyexpr import linalg_funcs
 from blosc2.utils import npvecdot
 
-torch = pytest.importorskip("torch", reason="torch not available")
+# Conditionally import torch for proxy tests
+try:
+    import torch
+
+    PROXY_TEST_XP = [torch, np]
+except ImportError:
+    torch = None
+    PROXY_TEST_XP = [np]
 
 
 @pytest.mark.parametrize(
@@ -827,7 +834,7 @@ def test_diagonal(shape, chunkshape, offset):
 
 @pytest.mark.parametrize(
     "xp",
-    [torch, np],
+    PROXY_TEST_XP,
 )
 @pytest.mark.parametrize(
     "dtype",
diff --git a/tests/ndarray/test_setitem.py b/tests/ndarray/test_setitem.py
index 9e9751f9..71137779 100644
--- a/tests/ndarray/test_setitem.py
+++ b/tests/ndarray/test_setitem.py
@@ -11,8 +11,6 @@
 
 import blosc2
 
-torch = pytest.importorskip("torch", reason="torch not available")
-
 argnames = "shape, chunks, blocks, slices, dtype"
 argvalues = [
     ([456], [258], [73], slice(0, 1), np.int32),
@@ -46,14 +44,6 @@ def test_setitem(shape, chunks, blocks, slices, dtype):
     nparray[slices] = val
     np.testing.assert_almost_equal(a[...], nparray)
 
-    # Object called via SimpleProxy
-    slice_shape = a[slices].shape
-    dtype_ = {np.float32: torch.float32, np.int32: torch.int32, np.float64: torch.float64}[dtype]
-    val = torch.ones(slice_shape, dtype=dtype_)
-    a[slices] = val
-    nparray[slices] = val
-    np.testing.assert_almost_equal(a[...], nparray)
-
     # blosc2.NDArray
     if np.prod(slice_shape) == 1 or len(slice_shape) != len(blocks):
         chunks = None
@@ -65,6 +55,22 @@ def test_setitem(shape, chunks, blocks, slices, dtype):
     np.testing.assert_almost_equal(a[...], nparray)
 
 
+@pytest.mark.parametrize(argnames, argvalues)
+def test_setitem_torch_proxy(shape, chunks, blocks, slices, dtype):
+    torch = pytest.importorskip("torch")
+    size = int(np.prod(shape))
+    nparray = np.arange(size, dtype=dtype).reshape(shape)
+    a = blosc2.frombuffer(bytes(nparray), nparray.shape, dtype=dtype, chunks=chunks, blocks=blocks)
+
+    # Object called via SimpleProxy (torch tensor)
+    slice_shape = a[slices].shape
+    dtype_ = {np.float32: torch.float32, np.int32: torch.int32, np.float64: torch.float64}[dtype]
+    val = torch.ones(slice_shape, dtype=dtype_)
+    a[slices] = val
+    nparray[slices] = val
+    np.testing.assert_almost_equal(a[...], nparray)
+
+
 @pytest.mark.parametrize(
     ("shape", "slices"),
     [

From 65d273811785f9e7cfe765da06f5fcaff8f9ca1f Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 8 Jan 2026 17:06:35 +0100
Subject: [PATCH 083/123] Fix result checks

---
 bench/ndarray/miniexpr-reduct-sum-multi.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bench/ndarray/miniexpr-reduct-sum-multi.py b/bench/ndarray/miniexpr-reduct-sum-multi.py
index 3a734001..badd9647 100644
--- a/bench/ndarray/miniexpr-reduct-sum-multi.py
+++ b/bench/ndarray/miniexpr-reduct-sum-multi.py
@@ -36,7 +36,7 @@
 nt = time() - t0
 print(f"Time to evaluate with NumPy: {nt * 1000 :.4f} ms", end=" ")
 print(f"Speed (GB/s): {(na.nbytes * 3 / 1e9) / nt:.2f}")
-print("Result:", res, "Mean:", res / (N * N))
+print("Result:", nres, "Mean:", nres / (N * N))
 print(f"Speedup Blosc2 vs NumPy: {nt / t:.2f}x")
 assert np.allclose(res, nres)
 
@@ -45,5 +45,5 @@
 net = time() - t0
 print(f"Time to evaluate with NumExpr: {net * 1000 :.4f} ms", end=" ")
 print(f"Speed (GB/s): {(na.nbytes * 3 / 1e9) / net:.2f}")
-print("Result:", res, "Mean:", res / (N * N))
+print("Result:", neres, "Mean:", neres / (N * N))
 print(f"Speedup Blosc2 vs NumExpr: {net / t:.2f}x")

From 891c9ae1844d087360287f592c3f5dca632ffdcc Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 8 Jan 2026 17:06:59 +0100
Subject: [PATCH 084/123] Fix result checks

---
 bench/ndarray/miniexpr-reduct-sum.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bench/ndarray/miniexpr-reduct-sum.py b/bench/ndarray/miniexpr-reduct-sum.py
index 8714dc66..e8a76119 100644
--- a/bench/ndarray/miniexpr-reduct-sum.py
+++ b/bench/ndarray/miniexpr-reduct-sum.py
@@ -29,7 +29,7 @@
 nt = time() - t0
 print(f"Time to evaluate with NumPy: {nt * 1000 :.4f} ms", end=" ")
 print(f"Speed (GB/s): {(na.nbytes / 1e9) / nt:.2f}")
-print("Result:", res, "Mean:", res / (N * N))
+print("Result:", nres, "Mean:", nres / (N * N))
 print(f"Speedup Blosc2 vs NumPy: {nt / t:.2f}x")
 assert np.allclose(res, nres)
 
@@ -38,5 +38,5 @@
 net = time() - t0
 print(f"Time to evaluate with NumExpr: {net * 1000 :.4f} ms", end=" ")
 print(f"Speed (GB/s): {(na.nbytes / 1e9) / net:.2f}")
-print("Result:", res, "Mean:", res / (N * N))
+print("Result:", neres, "Mean:", neres / (N * N))
 print(f"Speedup Blosc2 vs NumExpr: {net / t:.2f}x")

From a05ed3c9fef3486276fe834ba3142910a465c870 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 8 Jan 2026 17:07:27 +0100
Subject: [PATCH 085/123] Fetch the latest commit only

---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d9ad560b..80d08da2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -113,6 +113,7 @@ else()
     FetchContent_Declare(blosc2
         GIT_REPOSITORY https://github.com/Blosc/c-blosc2
         GIT_TAG 9d250c2201f6e385c56a372b08037f7debc6fa1b  # openzl
+        GIT_SHALLOW TRUE  # fetch only the latest commit
         # in case you want to use a local copy of c-blosc2 for development, uncomment the line below
         # SOURCE_DIR "/Users/faltet/blosc/c-blosc2"
     )

From 55d5e8f21d63bf9743bf82b35c7de9a2afded165 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 8 Jan 2026 17:17:23 +0100
Subject: [PATCH 086/123] Fix PATH parsing issues on win

---
 .github/workflows/cibuildwheels.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/cibuildwheels.yml b/.github/workflows/cibuildwheels.yml
index 80385592..3444946a 100644
--- a/.github/workflows/cibuildwheels.yml
+++ b/.github/workflows/cibuildwheels.yml
@@ -19,7 +19,6 @@ env:
   CIBW_SKIP: "pp* *musllinux* *-win32"
   # Use explicit generator/compiler env vars; CMAKE_ARGS with spaces is not split on Windows.
   CIBW_ENVIRONMENT_WINDOWS: >-
-    PATH=C:\\Program Files\\LLVM\\bin;%PATH%
     CMAKE_GENERATOR=Ninja
     CC=clang-cl
     CXX=clang-cl

From e099898f705a5eed02c80bfd9394b87fa2433d94 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 8 Jan 2026 17:42:10 +0100
Subject: [PATCH 087/123] miniexpr is not designed for WASM (it does not
 support multithreading)

---
 src/blosc2/lazyexpr.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index eafa793a..a8ae17f8 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -93,6 +93,8 @@
 
 # Set this to False if miniexpr should not be tried out
 try_miniexpr = True
+if blosc2.IS_WASM:
+    try_miniexpr = False
 
 
 def ne_evaluate(expression, local_dict=None, **kwargs):

From c4367ef2e116e86274d2de659835525c338ab2d9 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 8 Jan 2026 17:53:34 +0100
Subject: [PATCH 088/123] Disabling miniexpr for windows, as it has some issues
 yet

---
 src/blosc2/lazyexpr.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index a8ae17f8..5cfbc6e1 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -95,6 +95,8 @@
 try_miniexpr = True
 if blosc2.IS_WASM:
     try_miniexpr = False
+if sys.platform == "win32":
+    try_miniexpr = False
 
 
 def ne_evaluate(expression, local_dict=None, **kwargs):
@@ -1298,10 +1300,6 @@ def fast_eval(  # noqa: C901
                 use_miniexpr = False
                 break
 
-    if sys.platform == "win32":
-        # Miniexpr has issues on Windows, but only with complex types; still investigating
-        use_miniexpr = False
-
     if use_miniexpr:
         cparams = kwargs.pop("cparams", blosc2.CParams())
         # All values will be overwritten, so we can use an uninitialized array

From 57de90703f57e1b8f252cc036039e3a10e22ee39 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 8 Jan 2026 18:14:39 +0100
Subject: [PATCH 089/123] Hook for ignoring transient request failures for
 tests requiring network

---
 tests/conftest.py | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 35768f59..e4a505dd 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -9,6 +9,7 @@
 import sys
 
 import pytest
+import requests
 
 import blosc2
 
@@ -36,15 +37,11 @@ def cat2_context():
         yield c2params
 
 
-# This is to avoid sporadic failures in the CI when reaching network,
-# but this makes the tests to stuck in local.  Perhaps move this to
-# every test module that needs it?
-# def pytest_runtest_call(item):
-#     try:
-#         item.runtest()
-#     except requests.ConnectTimeout:
-#         pytest.skip("Skipping test due to sporadic requests.ConnectTimeout")
-#     except requests.ReadTimeout:
-#         pytest.skip("Skipping test due to sporadic requests.ReadTimeout")
-#     except requests.Timeout:
-#         pytest.skip("Skipping test due to sporadic requests.Timeout")
+def pytest_runtest_call(item):
+    # Skip network-marked tests on transient request failures to keep CI stable.
+    if item.get_closest_marker("network") is None:
+        return
+    try:
+        item.runtest()
+    except requests.exceptions.RequestException as exc:
+        pytest.skip(f"Skipping network test due to request failure: {exc}")

From 651e6aa2ce084952c12d35ab0fdf61a6b8265d51 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 9 Jan 2026 06:36:21 +0100
Subject: [PATCH 090/123] Switch to openzl branch in c-blosc2

---
 CMakeLists.txt | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 80d08da2..d4a0c8dc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -112,8 +112,9 @@ else()
     include(FetchContent)
     FetchContent_Declare(blosc2
         GIT_REPOSITORY https://github.com/Blosc/c-blosc2
-        GIT_TAG 9d250c2201f6e385c56a372b08037f7debc6fa1b  # openzl
-        GIT_SHALLOW TRUE  # fetch only the latest commit
+        # GIT_TAG 9d250c2201f6e385c56a372b08037f7debc6fa1b  # openzl (disposable output)
+        GIT_TAG openzl
+        GIT_SHALLOW TRUE  # fetch only the latest commit (only works with a branch in GIT_TAG)
         # in case you want to use a local copy of c-blosc2 for development, uncomment the line below
         # SOURCE_DIR "/Users/faltet/blosc/c-blosc2"
     )

From 5b7afed4f3e1c320d5785c41740b5cd95fcdc793 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 9 Jan 2026 06:39:05 +0100
Subject: [PATCH 091/123] Switch to *add_openzl* branch in c-blosc2

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d4a0c8dc..ce4018b4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -113,7 +113,7 @@ else()
     FetchContent_Declare(blosc2
         GIT_REPOSITORY https://github.com/Blosc/c-blosc2
         # GIT_TAG 9d250c2201f6e385c56a372b08037f7debc6fa1b  # openzl (disposable output)
-        GIT_TAG openzl
+        GIT_TAG add_openzl
         GIT_SHALLOW TRUE  # fetch only the latest commit (only works with a branch in GIT_TAG)
         # in case you want to use a local copy of c-blosc2 for development, uncomment the line below
         # SOURCE_DIR "/Users/faltet/blosc/c-blosc2"

From eb9d40a43b37d26c3ad15ba5fc10d9442a74affa Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 9 Jan 2026 19:52:21 +0100
Subject: [PATCH 092/123] Improvements in miniexpr prefilter for more broader
 use cases

---
 src/blosc2/blosc2_ext.pyx | 23 ++++++++++++++++++-----
 src/blosc2/lazyexpr.py    | 12 +++++++++---
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index bbca201d..47f23283 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -1868,8 +1868,10 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
     cdef float *buf
     cdef void* src
     cdef int32_t chunk_nbytes, chunk_cbytes, block_nbytes
-    cdef int start
+    cdef int start, blocknitems, expected_blocknitems
+    cdef int32_t input_typesize
     cdef blosc2_context* dctx
+    expected_blocknitems = -1
     for i in range(udata.ninputs):
         ndarr = udata.inputs[i]
         input_buffers[i] = malloc(ndarr.sc.blocksize)
@@ -1883,7 +1885,13 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
             rc = blosc2_cbuffer_sizes(src, &chunk_nbytes, &chunk_cbytes, &block_nbytes)
             if rc < 0:
                 raise ValueError("miniexpr: error getting cbuffer sizes")
-            start = nblock * ndarr.blocknitems
+            input_typesize = ndarr.sc.typesize
+            blocknitems = block_nbytes // input_typesize
+            if expected_blocknitems == -1:
+                expected_blocknitems = blocknitems
+            elif blocknitems != expected_blocknitems:
+                raise ValueError("miniexpr: inconsistent block element counts across inputs")
+            start = nblock * blocknitems
             # A way to check for top speed
             if False:
                 # Unsafe, but it works for special arrays (e.g. blosc2.ones), and can be fast
@@ -1892,7 +1900,12 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
                 # This can add a significant overhead, but it is needed for thread safety.
                 # Perhaps one can create a specific (serial) context just for blosc2_getitem_ctx?
                 dctx = blosc2_create_dctx(BLOSC2_DPARAMS_DEFAULTS)
-            rc = blosc2_getitem_ctx(dctx, src, chunk_cbytes, start, ndarr.blocknitems,
+            if nchunk * ndarr.chunknitems + start + blocknitems > ndarr.nitems:
+                blocknitems = ndarr.nitems - (nchunk * ndarr.chunknitems + start)
+                if blocknitems <= 0:
+                    # Should never happen, but anyway
+                    continue
+            rc = blosc2_getitem_ctx(dctx, src, chunk_cbytes, start, blocknitems,
                                     input_buffers[i], block_nbytes)
             blosc2_free_ctx(dctx)
             if rc < 0:
@@ -1907,12 +1920,12 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
     # Call thread-safe miniexpr C API
     if udata.aux_reduc_ptr == NULL:
         rc = me_eval(miniexpr_handle, <const void**>input_buffers, udata.ninputs,
-                     <void*>params_output, ndarr.blocknitems)
+                     <void*>params_output, blocknitems)
     else:
         # Reduction operation
         offset_bytes = <uintptr_t> typesize * (nchunk * nblocks_per_chunk + nblock)
         aux_reduc_ptr = <void *> (<uintptr_t> udata.aux_reduc_ptr + offset_bytes)
-        rc = me_eval(miniexpr_handle, <const void**>input_buffers, udata.ninputs, aux_reduc_ptr, ndarr.blocknitems)
+        rc = me_eval(miniexpr_handle, <const void**>input_buffers, udata.ninputs, aux_reduc_ptr, blocknitems)
         # The output buffer is cleared in the prefilter function
         # memset(<void *>params_output, 0, udata.array.sc.blocksize)  # clear output buffer
     if rc != 0:
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 5cfbc6e1..6e9b920f 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1299,6 +1299,11 @@ def fast_eval(  # noqa: C901
             if not blosc2.are_partitions_behaved(op.shape, op.chunks, op.blocks):
                 use_miniexpr = False
                 break
+            # Ensure blocks fit exactly in chunks
+            blocks_fit = builtins.all(c % b == 0 for c, b in zip(op.chunks, op.blocks, strict=True))
+            if not blocks_fit:
+                use_miniexpr = False
+                break
 
     if use_miniexpr:
         cparams = kwargs.pop("cparams", blosc2.CParams())
@@ -1997,8 +2002,9 @@ def reduce_slices(  # noqa: C901
     # Only behaved partitions are supported in miniexpr reductions
     if use_miniexpr:
         for op in operands.values():
-            # Check that partitions are well-behaved (no padding)
-            if not blosc2.are_partitions_behaved(op.shape, op.chunks, op.blocks):
+            # Check that chunksize is multiple of blocksize and blocks fit exactly in chunks
+            blocks_fit = builtins.all(c % b == 0 for c, b in zip(op.chunks, op.blocks, strict=True))
+            if not blocks_fit:
                 use_miniexpr = False
                 break
 
@@ -2011,7 +2017,7 @@ def reduce_slices(  # noqa: C901
         nblocks = res_eval.nbytes // res_eval.blocksize
         aux_reduc = np.empty(nblocks, dtype=dtype)
         try:
-            # print("expr->miniexpr:", expression, reduce_op)
+            print("expr->miniexpr:", expression, reduce_op)
             expression = f"{reduce_op_str}({expression})"
             res_eval._set_pref_expr(expression, operands, aux_reduc)
             # Data won't even try to be compressed, so buffers can be unitialized and reused

From 3be7455872b37ae29330be55f298ef48b1d7f708 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sat, 10 Jan 2026 10:23:18 +0100
Subject: [PATCH 093/123] Allow to run miniexpr is more cases (specially 1-dim)

---
 CMakeLists.txt                 |  4 +++-
 src/blosc2/blosc2_ext.pyx      | 17 +++++++++++++++--
 src/blosc2/lazyexpr.py         | 25 ++++++++++++++++++++-----
 tests/ndarray/test_lazyexpr.py | 22 +++++++++++++++++-----
 4 files changed, 55 insertions(+), 13 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ce4018b4..74b8b067 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,7 +58,9 @@ set(MINIEXPR_BUILD_BENCH OFF CACHE BOOL "Build miniexpr benchmarks" FORCE)
 
 FetchContent_Declare(miniexpr
     GIT_REPOSITORY https://github.com/Blosc/miniexpr.git
-    GIT_TAG 3e0ad9f2800cfb46729da88553a9228845eaa731  # latest SIMD additions
+    GIT_TAG sleef  # latest SIMD additions
+    # In case you want to use a local copy of miniexpr for development, uncomment the line below
+    # SOURCE_DIR "/Users/faltet/blosc/miniexpr"
 )
 FetchContent_MakeAvailable(miniexpr)
 
diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 47f23283..b8f1fa18 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -573,6 +573,17 @@ cdef extern from "miniexpr.h":
     int me_compile(const char *expression, const me_variable *variables,
                    int var_count, me_dtype dtype, int *error, me_expr **out)
 
+    cdef enum me_compile_status:
+        ME_COMPILE_SUCCESS
+        ME_COMPILE_ERR_OOM
+        ME_COMPILE_ERR_PARSE
+        ME_COMPILE_ERR_INVALID_ARG
+        ME_COMPILE_ERR_COMPLEX_UNSUPPORTED
+        ME_COMPILE_ERR_REDUCTION_INVALID
+        ME_COMPILE_ERR_VAR_MIXED
+        ME_COMPILE_ERR_VAR_UNSPECIFIED
+        ME_COMPILE_ERR_INVALID_ARG_TYPE
+
     int me_eval(const me_expr *expr, const void ** vars_chunk,
                 int n_vars, void *output_chunk, int chunk_nitems) nogil
 
@@ -2878,8 +2889,10 @@ cdef class NDArray:
         expression = expression.encode("utf-8") if isinstance(expression, str) else expression
         cdef me_dtype = me_dtype_from_numpy(self.dtype.num)
         cdef me_expr *out_expr
-        error = me_compile(expression, variables, n, me_dtype, &error, &out_expr)
-        if error != 0:
+        cdef int rc = me_compile(expression, variables, n, me_dtype, &error, &out_expr)
+        if rc == ME_COMPILE_ERR_INVALID_ARG_TYPE:
+            raise TypeError(f"miniexpr does not support operand or output dtype: {expression}")
+        if rc != ME_COMPILE_SUCCESS:
             raise NotImplementedError(f"Cannot compile expression: {expression}")
         udata.miniexpr_handle = out_expr
 
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 6e9b920f..88b16b06 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1238,6 +1238,9 @@ def fast_eval(  # noqa: C901
         ne_args = {}
     dtype = kwargs.pop("dtype", None)
     where: dict | None = kwargs.pop("_where_args", None)
+    if where is not None:
+        # miniexpr does not support where(); use the regular path.
+        use_miniexpr = False
     if isinstance(out, blosc2.NDArray):
         # If 'out' has been passed, and is a NDArray, use it as the base array
         basearr = out
@@ -1290,15 +1293,17 @@ def fast_eval(  # noqa: C901
         use_miniexpr = False
 
     if use_miniexpr:
+        op_dtypes = {op.dtype for op in operands.values() if isinstance(op, blosc2.NDArray)}
+        if len(op_dtypes) > 1:
+            use_miniexpr = False
+        # Avoid padding issues except for 1D arrays (contiguous along the only axis).
+        if len(shape) != 1 and builtins.any(s % c != 0 for s, c in zip(shape, chunks, strict=True)):
+            use_miniexpr = False
         for op in operands.values():
             # Only NDArray in-memory operands
             if not (isinstance(op, blosc2.NDArray) and op.urlpath is None and out is None):
                 use_miniexpr = False
                 break
-            # Check that partitions are well-behaved (no padding)
-            if not blosc2.are_partitions_behaved(op.shape, op.chunks, op.blocks):
-                use_miniexpr = False
-                break
             # Ensure blocks fit exactly in chunks
             blocks_fit = builtins.all(c % b == 0 for c, b in zip(op.chunks, op.blocks, strict=True))
             if not blocks_fit:
@@ -1310,7 +1315,7 @@ def fast_eval(  # noqa: C901
         # All values will be overwritten, so we can use an uninitialized array
         res_eval = blosc2.uninit(shape, dtype, chunks=chunks, blocks=blocks, cparams=cparams, **kwargs)
         try:
-            # print("expr->miniexpr:", expression)
+            print("expr->miniexpr:", expression)
             res_eval._set_pref_expr(expression, operands)
             # Data to compress is fetched from operands, so it can be uninitialized here
             data = np.empty(res_eval.schunk.chunksize, dtype=np.uint8)
@@ -2001,6 +2006,16 @@ def reduce_slices(  # noqa: C901
 
     # Only behaved partitions are supported in miniexpr reductions
     if use_miniexpr:
+        # Avoid padding issues except for 1D arrays (contiguous along the only axis).
+        if len(shape) != 1 and builtins.any(s % c != 0 for s, c in zip(shape, chunks, strict=True)):
+            use_miniexpr = False
+        if use_miniexpr and isinstance(expression, str):
+            has_complex = any(
+                isinstance(op, blosc2.NDArray) and blosc2.isdtype(op.dtype, "complex floating")
+                for op in operands.values()
+            )
+            if has_complex and any(tok in expression for tok in ("!=", "==", "<=", ">=", "<", ">")):
+                use_miniexpr = False
         for op in operands.values():
             # Check that chunksize is multiple of blocksize and blocks fit exactly in chunks
             blocks_fit = builtins.all(c % b == 0 for c, b in zip(op.chunks, op.blocks, strict=True))
diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py
index f1a9e01c..f0f6a29a 100644
--- a/tests/ndarray/test_lazyexpr.py
+++ b/tests/ndarray/test_lazyexpr.py
@@ -180,7 +180,10 @@ def test_simple_expression(array_fixture):
     expr = a1 + a2 - a3 * a4
     nres = ne_evaluate("na1 + na2 - na3 * na4")
     res = expr.compute(cparams=blosc2.CParams())
-    np.testing.assert_allclose(res[:], nres)
+    if na1.dtype == np.float32:
+        np.testing.assert_allclose(res[:], nres, rtol=1e-6, atol=1e-6)
+    else:
+        np.testing.assert_allclose(res[:], nres)
 
 
 # Mix Proxy and NDArray operands
@@ -205,10 +208,16 @@ def test_iXXX(array_fixture):
         expr **= 2.3  # __ipow__
     res = expr.compute()
     if not blosc2.IS_WASM:
-        nres = ne_evaluate("(((((na1 ** 3 + na2 ** 2 + na3 ** 3 - na4 + 3) + 5) - 15) * 2) / 7) ** 2.3")
+        expr_str = "(((((na1 ** 3 + na2 ** 2 + na3 ** 3 - na4 + 3) + 5) - 15) * 2) / 7) ** 2.3"
     else:
-        nres = ne_evaluate("(((((na1 ** 3 + na2 ** 2 + na3 ** 3 - na4 + 3) + 5) - 15) * 2) / 7)")
-    np.testing.assert_allclose(res[:], nres)
+        expr_str = "(((((na1 ** 3 + na2 ** 2 + na3 ** 3 - na4 + 3) + 5) - 15) * 2) / 7)"
+    if na1.dtype == np.float32:
+        with np.errstate(invalid="ignore"):
+            nres = eval(expr_str, {"np": np}, {"na1": na1, "na2": na2, "na3": na3, "na4": na4})
+        np.testing.assert_allclose(res[:], nres, rtol=1e-5, atol=1e-6)
+    else:
+        nres = ne_evaluate(expr_str)
+        np.testing.assert_allclose(res[:], nres)
 
 
 def test_complex_evaluate(array_fixture):
@@ -253,7 +262,10 @@ def test_expression_with_constants(array_fixture):
     # Test with operands with same chunks and blocks
     expr = a1 + 2 - a3 * 3.14
     nres = ne_evaluate("na1 + 2 - na3 * 3.14")
-    np.testing.assert_allclose(expr[:], nres)
+    if na1.dtype == np.float32:
+        np.testing.assert_allclose(expr[:], nres, rtol=1e-6)
+    else:
+        np.testing.assert_allclose(expr[:], nres)
 
 
 @pytest.mark.parametrize("compare_expressions", [True, False])

From 4f5a96989554248d03020395bbfadac2678e747e Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sat, 10 Jan 2026 11:42:01 +0100
Subject: [PATCH 094/123] Added a note about same dtypes in expressions

---
 src/blosc2/lazyexpr.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 88b16b06..60b80976 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1294,6 +1294,9 @@ def fast_eval(  # noqa: C901
 
     if use_miniexpr:
         op_dtypes = {op.dtype for op in operands.values() if isinstance(op, blosc2.NDArray)}
+        # This is for avoiding type casting issues in miniexpr, like in:
+        # tests/ndarray/test_lazyexpr_fields.py::test_where_fusion6[dtype_fixture0-shape_fixture0-chunks_blocks_fixture0]
+        # TODO: remove this restriction when miniexpr supports type casting better
         if len(op_dtypes) > 1:
             use_miniexpr = False
         # Avoid padding issues except for 1D arrays (contiguous along the only axis).

From ff5384966b9f07e82aae8cd9799c0ebffab926f3 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Mon, 12 Jan 2026 08:00:45 +0100
Subject: [PATCH 095/123] Miniexpr now correctly detects that it does not
 support nested mixed-types

---
 src/blosc2/lazyexpr.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 60b80976..98d07143 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1293,12 +1293,6 @@ def fast_eval(  # noqa: C901
         use_miniexpr = False
 
     if use_miniexpr:
-        op_dtypes = {op.dtype for op in operands.values() if isinstance(op, blosc2.NDArray)}
-        # This is for avoiding type casting issues in miniexpr, like in:
-        # tests/ndarray/test_lazyexpr_fields.py::test_where_fusion6[dtype_fixture0-shape_fixture0-chunks_blocks_fixture0]
-        # TODO: remove this restriction when miniexpr supports type casting better
-        if len(op_dtypes) > 1:
-            use_miniexpr = False
         # Avoid padding issues except for 1D arrays (contiguous along the only axis).
         if len(shape) != 1 and builtins.any(s % c != 0 for s, c in zip(shape, chunks, strict=True)):
             use_miniexpr = False

From 3c639ed407e6381a44d91514c968bf3d221777de Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Tue, 13 Jan 2026 08:11:45 +0100
Subject: [PATCH 096/123] Return int64 for reductions of bools

---
 src/blosc2/lazyexpr.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 98d07143..c1ec31c6 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1819,6 +1819,8 @@ def infer_reduction_dtype(dtype, operation):
         dtype, np.float32 if dtype in (np.float32, np.complex64) else blosc2.DEFAULT_FLOAT
     )
     if operation in {ReduceOp.SUM, ReduceOp.PROD}:
+        if np.issubdtype(dtype, np.bool_):
+            return np.int64
         if np.issubdtype(dtype, np.unsignedinteger):
             return np.result_type(dtype, np.uint64)
         return np.result_type(dtype, np.int64 if np.issubdtype(dtype, np.integer) else my_float)

From f9c03246fe08887048ebf79bf37e931970c29980 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Tue, 13 Jan 2026 14:09:55 +0100
Subject: [PATCH 097/123] Broaden the use cases for miniexpr to all 1-dim cases

---
 src/blosc2/lazyexpr.py         |  8 ++++----
 tests/ndarray/test_lazyexpr.py | 22 ++++++++++++++++------
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index c1ec31c6..dab8b39b 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1301,9 +1301,9 @@ def fast_eval(  # noqa: C901
             if not (isinstance(op, blosc2.NDArray) and op.urlpath is None and out is None):
                 use_miniexpr = False
                 break
-            # Ensure blocks fit exactly in chunks
+            # Ensure blocks fit exactly in chunks for the n-dim case
             blocks_fit = builtins.all(c % b == 0 for c, b in zip(op.chunks, op.blocks, strict=True))
-            if not blocks_fit:
+            if len(op.shape) != 1 and not blocks_fit:
                 use_miniexpr = False
                 break
 
@@ -2016,9 +2016,9 @@ def reduce_slices(  # noqa: C901
             if has_complex and any(tok in expression for tok in ("!=", "==", "<=", ">=", "<", ">")):
                 use_miniexpr = False
         for op in operands.values():
-            # Check that chunksize is multiple of blocksize and blocks fit exactly in chunks
+            # Ensure blocks fit exactly in chunks for the n-dim case
             blocks_fit = builtins.all(c % b == 0 for c, b in zip(op.chunks, op.blocks, strict=True))
-            if not blocks_fit:
+            if len(op.shape) != 1 and not blocks_fit:
                 use_miniexpr = False
                 break
 
diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py
index f0f6a29a..2f441497 100644
--- a/tests/ndarray/test_lazyexpr.py
+++ b/tests/ndarray/test_lazyexpr.py
@@ -226,7 +226,10 @@ def test_complex_evaluate(array_fixture):
     expr += 2
     nres = ne_evaluate("tan(na1) * (sin(na2) * sin(na2) + cos(na3)) + (sqrt(na4) * 2) + 2")
     res = expr.compute()
-    np.testing.assert_allclose(res[:], nres)
+    if na1.dtype == np.float32:
+        np.testing.assert_allclose(res[:], nres, rtol=1e-5)
+    else:
+        np.testing.assert_allclose(res[:], nres)
 
 
 def test_complex_getitem(array_fixture):
@@ -235,7 +238,10 @@ def test_complex_getitem(array_fixture):
     expr += 2
     nres = ne_evaluate("tan(na1) * (sin(na2) * sin(na2) + cos(na3)) + (sqrt(na4) * 2) + 2")
     res = expr[:]
-    np.testing.assert_allclose(res, nres)
+    if na1.dtype == np.float32:
+        np.testing.assert_allclose(res[:], nres, rtol=1e-5)
+    else:
+        np.testing.assert_allclose(res[:], nres)
 
 
 def test_complex_getitem_slice(array_fixture):
@@ -253,8 +259,11 @@ def test_func_expression(array_fixture):
     expr = (a1 + a2) * a3 - a4
     expr = blosc2.sin(expr) + blosc2.cos(expr)
     nres = ne_evaluate("sin((na1 + na2) * na3 - na4) + cos((na1 + na2) * na3 - na4)")
-    res = expr.compute(storage={})
-    np.testing.assert_allclose(res[:], nres)
+    res = expr.compute()
+    if na1.dtype == np.float32:
+        np.testing.assert_allclose(res[:], nres, rtol=1e-5)
+    else:
+        np.testing.assert_allclose(res[:], nres)
 
 
 def test_expression_with_constants(array_fixture):
@@ -262,10 +271,11 @@ def test_expression_with_constants(array_fixture):
     # Test with operands with same chunks and blocks
     expr = a1 + 2 - a3 * 3.14
     nres = ne_evaluate("na1 + 2 - na3 * 3.14")
+    res = expr.compute()
     if na1.dtype == np.float32:
-        np.testing.assert_allclose(expr[:], nres, rtol=1e-6)
+        np.testing.assert_allclose(res[:], nres, rtol=1e-5)
     else:
-        np.testing.assert_allclose(expr[:], nres)
+        np.testing.assert_allclose(res[:], nres)
 
 
 @pytest.mark.parametrize("compare_expressions", [True, False])

From f07b60b59bfd37f3e1d09b41437bd653573766da Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Tue, 13 Jan 2026 18:38:32 +0100
Subject: [PATCH 098/123] Better offset calculation for incomplete chunks at
 the end

---
 src/blosc2/blosc2_ext.pyx | 21 ++++++++++++++++-----
 src/blosc2/lazyexpr.py    |  3 ++-
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index b8f1fa18..c36181e6 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -1924,21 +1924,32 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
 
     cdef me_expr* miniexpr_handle = udata.miniexpr_handle
     cdef void* aux_reduc_ptr
-    cdef uintptr_t offset_bytes
-    cdef int nblocks_per_chunk = udata.array.chunknitems // udata.array.blocknitems
+    # Calculate blocks per chunk using CEILING division (chunks are padded to fit whole blocks)
+    cdef int nblocks_per_chunk = (udata.array.chunknitems + udata.array.blocknitems - 1) // udata.array.blocknitems
+    # Calculate the global linear block index: nchunk * blocks_per_chunk + nblock
+    # This works because blocks never span chunks (chunks are padded to block boundaries)
+    cdef int64_t linear_block_index = nchunk * nblocks_per_chunk + nblock
+    cdef uintptr_t offset_bytes = typesize * linear_block_index
+
     if miniexpr_handle == NULL:
         raise ValueError("miniexpr: handle not assigned")
+
+    # Skip evaluation if blocknitems is invalid (can happen for padding blocks beyond data)
+    if blocknitems <= 0:
+        # Free resources
+        for i in range(udata.ninputs):
+            free(input_buffers[i])
+        free(input_buffers)
+        return 0
+
     # Call thread-safe miniexpr C API
     if udata.aux_reduc_ptr == NULL:
         rc = me_eval(miniexpr_handle, <const void**>input_buffers, udata.ninputs,
                      <void*>params_output, blocknitems)
     else:
         # Reduction operation
-        offset_bytes = <uintptr_t> typesize * (nchunk * nblocks_per_chunk + nblock)
         aux_reduc_ptr = <void *> (<uintptr_t> udata.aux_reduc_ptr + offset_bytes)
         rc = me_eval(miniexpr_handle, <const void**>input_buffers, udata.ninputs, aux_reduc_ptr, blocknitems)
-        # The output buffer is cleared in the prefilter function
-        # memset(<void *>params_output, 0, udata.array.sc.blocksize)  # clear output buffer
     if rc != 0:
         raise RuntimeError(f"miniexpr: issues during evaluation; error code: {rc}")
 
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index dab8b39b..2247995a 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -2029,7 +2029,8 @@ def reduce_slices(  # noqa: C901
         res_eval = blosc2.uninit(shape, dtype, chunks=chunks, blocks=blocks, cparams=cparams, **kwargs)
         # Compute the number of blocks in the result
         nblocks = res_eval.nbytes // res_eval.blocksize
-        aux_reduc = np.empty(nblocks, dtype=dtype)
+        # Initialize to zeros since some blocks may be padding and won't be written
+        aux_reduc = np.zeros(nblocks, dtype=dtype)
         try:
             print("expr->miniexpr:", expression, reduce_op)
             expression = f"{reduce_op_str}({expression})"

From 36e743ca4a75a3acdf31493bd41e06aa9e137b4a Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 14 Jan 2026 11:16:32 +0100
Subject: [PATCH 099/123] Did benchmarks for blosc2_getitem_ctx 'supposed'
 overhead

---
 src/blosc2/blosc2_ext.pyx | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index c36181e6..621e2867 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -1908,8 +1908,9 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
                 # Unsafe, but it works for special arrays (e.g. blosc2.ones), and can be fast
                 dctx = ndarr.sc.dctx
             else:
-                # This can add a significant overhead, but it is needed for thread safety.
-                # Perhaps one can create a specific (serial) context just for blosc2_getitem_ctx?
+                # This is needed for thread safety, but adds a pretty low overhead (< 400ns on a modern CPU)
+                # In the future, perhaps one can create a specific (serial) context just for
+                # blosc2_getitem_ctx, but this is probably never going to be necessary.
                 dctx = blosc2_create_dctx(BLOSC2_DPARAMS_DEFAULTS)
             if nchunk * ndarr.chunknitems + start + blocknitems > ndarr.nitems:
                 blocknitems = ndarr.nitems - (nchunk * ndarr.chunknitems + start)

From 2b1327d722b98ceb82408c1e05ccb38a1df9415c Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 14 Jan 2026 11:18:27 +0100
Subject: [PATCH 100/123] Reductions can handle padding in the first dimension
 now

---
 src/blosc2/lazyexpr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 2247995a..6c3c73d0 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -2006,7 +2006,7 @@ def reduce_slices(  # noqa: C901
     # Only behaved partitions are supported in miniexpr reductions
     if use_miniexpr:
         # Avoid padding issues except for 1D arrays (contiguous along the only axis).
-        if len(shape) != 1 and builtins.any(s % c != 0 for s, c in zip(shape, chunks, strict=True)):
+        if len(shape) != 1 and builtins.any(s % c != 0 for s, c in zip(shape[1:], chunks[1:], strict=True)):
             use_miniexpr = False
         if use_miniexpr and isinstance(expression, str):
             has_complex = any(

From f285872116d3ba0106c5593fc32402443caf3d8f Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 14 Jan 2026 11:26:23 +0100
Subject: [PATCH 101/123] Attemp to use miniexpr on windows too

---
 src/blosc2/lazyexpr.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 6c3c73d0..26be0a75 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -95,8 +95,8 @@
 try_miniexpr = True
 if blosc2.IS_WASM:
     try_miniexpr = False
-if sys.platform == "win32":
-    try_miniexpr = False
+# if sys.platform == "win32":
+#     try_miniexpr = False
 
 
 def ne_evaluate(expression, local_dict=None, **kwargs):

From b86994e8494bf949a6cdff1fdca614503e9380cf Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 14 Jan 2026 11:36:25 +0100
Subject: [PATCH 102/123] Miniexpr can handle padding evals in the first
 dimension now

---
 src/blosc2/lazyexpr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 26be0a75..390ab5ab 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1294,7 +1294,7 @@ def fast_eval(  # noqa: C901
 
     if use_miniexpr:
         # Avoid padding issues except for 1D arrays (contiguous along the only axis).
-        if len(shape) != 1 and builtins.any(s % c != 0 for s, c in zip(shape, chunks, strict=True)):
+        if len(shape) != 1 and builtins.any(s % c != 0 for s, c in zip(shape[1:], chunks[1:], strict=True)):
             use_miniexpr = False
         for op in operands.values():
             # Only NDArray in-memory operands

From fb0c1ba7980f6ec33870e6cc9527c0ae6dad130f Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 14 Jan 2026 12:35:56 +0100
Subject: [PATCH 103/123] Do not use miniexpr on windows yet

---
 src/blosc2/lazyexpr.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 390ab5ab..ffaeb096 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -95,8 +95,10 @@
 try_miniexpr = True
 if blosc2.IS_WASM:
     try_miniexpr = False
-# if sys.platform == "win32":
-#     try_miniexpr = False
+if sys.platform == "win32":
+    # Although miniexpr has support for windows, the integration with Blosc2
+    # still has some rough edges.
+    try_miniexpr = False
 
 
 def ne_evaluate(expression, local_dict=None, **kwargs):

From e9b1c8013734a8b435dc2cddba699c4b191b3dee Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 14 Jan 2026 14:25:36 +0100
Subject: [PATCH 104/123] Relax the condition in which blocks should 'fit' in
 chunks

---
 src/blosc2/lazyexpr.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index ffaeb096..7650ea5e 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1303,8 +1303,8 @@ def fast_eval(  # noqa: C901
             if not (isinstance(op, blosc2.NDArray) and op.urlpath is None and out is None):
                 use_miniexpr = False
                 break
-            # Ensure blocks fit exactly in chunks for the n-dim case
-            blocks_fit = builtins.all(c % b == 0 for c, b in zip(op.chunks, op.blocks, strict=True))
+            # Ensure blocks fit exactly in chunks for the n-dim case, except for the first dimension
+            blocks_fit = builtins.all(c % b == 0 for c, b in zip(op.chunks[1:], op.blocks[1:], strict=True))
             if len(op.shape) != 1 and not blocks_fit:
                 use_miniexpr = False
                 break
@@ -2018,8 +2018,8 @@ def reduce_slices(  # noqa: C901
             if has_complex and any(tok in expression for tok in ("!=", "==", "<=", ">=", "<", ">")):
                 use_miniexpr = False
         for op in operands.values():
-            # Ensure blocks fit exactly in chunks for the n-dim case
-            blocks_fit = builtins.all(c % b == 0 for c, b in zip(op.chunks, op.blocks, strict=True))
+            # Ensure blocks fit exactly in chunks for the n-dim case, except for the first dimension
+            blocks_fit = builtins.all(c % b == 0 for c, b in zip(op.chunks[1:], op.blocks[1:], strict=True))
             if len(op.shape) != 1 and not blocks_fit:
                 use_miniexpr = False
                 break

From 2fd4c7165ff9ce144892ae0d0c9d168809e5f6f8 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 14 Jan 2026 17:20:01 +0100
Subject: [PATCH 105/123] Fix for reductions in miniexpr (depends on reduc
 type)

---
 src/blosc2/blosc2_ext.pyx |  7 +++++--
 src/blosc2/lazyexpr.py    | 21 +++++++++++++++++++--
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 621e2867..6d5ea3ed 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -1925,8 +1925,11 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
 
     cdef me_expr* miniexpr_handle = udata.miniexpr_handle
     cdef void* aux_reduc_ptr
-    # Calculate blocks per chunk using CEILING division (chunks are padded to fit whole blocks)
-    cdef int nblocks_per_chunk = (udata.array.chunknitems + udata.array.blocknitems - 1) // udata.array.blocknitems
+    # For reduction operations, we need to track which block we're processing
+    # The linear_block_index should be based on the INPUT array structure, not the output array
+    # Get the first input array's chunk and block structure
+    cdef b2nd_array_t* first_input = udata.inputs[0]
+    cdef int nblocks_per_chunk = (first_input.chunknitems + first_input.blocknitems - 1) // first_input.blocknitems
     # Calculate the global linear block index: nchunk * blocks_per_chunk + nblock
     # This works because blocks never span chunks (chunks are padded to block boundaries)
     cdef int64_t linear_block_index = nchunk * nblocks_per_chunk + nblock
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 7650ea5e..fdd6c973 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -2031,8 +2031,25 @@ def reduce_slices(  # noqa: C901
         res_eval = blosc2.uninit(shape, dtype, chunks=chunks, blocks=blocks, cparams=cparams, **kwargs)
         # Compute the number of blocks in the result
         nblocks = res_eval.nbytes // res_eval.blocksize
-        # Initialize to zeros since some blocks may be padding and won't be written
-        aux_reduc = np.zeros(nblocks, dtype=dtype)
+        # Initialize aux_reduc based on the reduction operation
+        # Padding blocks won't be written, so initial values matter for the final reduction
+        if reduce_op == ReduceOp.SUM or reduce_op == ReduceOp.ANY:
+            aux_reduc = np.zeros(nblocks, dtype=dtype)
+        elif reduce_op == ReduceOp.PROD or reduce_op == ReduceOp.ALL:
+            aux_reduc = np.ones(nblocks, dtype=dtype)
+        elif reduce_op == ReduceOp.MIN:
+            if np.issubdtype(dtype, np.integer):
+                aux_reduc = np.full(nblocks, np.iinfo(dtype).max, dtype=dtype)
+            else:
+                aux_reduc = np.full(nblocks, np.inf, dtype=dtype)
+        elif reduce_op == ReduceOp.MAX:
+            if np.issubdtype(dtype, np.integer):
+                aux_reduc = np.full(nblocks, np.iinfo(dtype).min, dtype=dtype)
+            else:
+                aux_reduc = np.full(nblocks, -np.inf, dtype=dtype)
+        else:
+            # For other operations, zeros should be safe
+            aux_reduc = np.zeros(nblocks, dtype=dtype)
         try:
             print("expr->miniexpr:", expression, reduce_op)
             expression = f"{reduce_op_str}({expression})"

From 7d68e55a4c3e9ce9711a1cba27e34fb842e86801 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 14 Jan 2026 17:41:00 +0100
Subject: [PATCH 106/123] Allow broader miniexpr use by getting rid of
 unnecessary guards

---
 src/blosc2/lazyexpr.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index fdd6c973..7f85aeb8 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1303,11 +1303,6 @@ def fast_eval(  # noqa: C901
             if not (isinstance(op, blosc2.NDArray) and op.urlpath is None and out is None):
                 use_miniexpr = False
                 break
-            # Ensure blocks fit exactly in chunks for the n-dim case, except for the first dimension
-            blocks_fit = builtins.all(c % b == 0 for c, b in zip(op.chunks[1:], op.blocks[1:], strict=True))
-            if len(op.shape) != 1 and not blocks_fit:
-                use_miniexpr = False
-                break
 
     if use_miniexpr:
         cparams = kwargs.pop("cparams", blosc2.CParams())

From ac757251df1764844f2c96a9e998c5856b597214 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 14 Jan 2026 17:41:20 +0100
Subject: [PATCH 107/123] Allow broader miniexpr use by getting rid of
 unnecessary guards (II)

---
 src/blosc2/lazyexpr.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 7f85aeb8..b2056c62 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1283,17 +1283,6 @@ def fast_eval(  # noqa: C901
         iter_disk = False
 
     # Check whether we can use miniexpr
-    # Miniexpr only supports a subset of functions - disable for unsupported ones
-    unsupported_funcs = [
-        "clip",
-        "maximum",
-        "minimum",
-        "contains",
-    ] + reducers  # miniexpr doesn't support reduction functions
-
-    if isinstance(expression, str) and any(func in expression for func in unsupported_funcs):
-        use_miniexpr = False
-
     if use_miniexpr:
         # Avoid padding issues except for 1D arrays (contiguous along the only axis).
         if len(shape) != 1 and builtins.any(s % c != 0 for s, c in zip(shape[1:], chunks[1:], strict=True)):

From 9f4a1d56f43bd0fdb24987be7267744b738b4a6d Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Sat, 17 Jan 2026 10:07:25 +0100
Subject: [PATCH 108/123] Use new me_compile_nd/me_eval_nd for broadening
 scenarios for miniexpr

---
 CMakeLists.txt                 |  2 +-
 README_DEVELOPERS.md           |  8 +--
 src/blosc2/blosc2_ext.pyx      | 94 +++++++++++++++++++++++-----------
 src/blosc2/lazyexpr.py         | 28 +++++-----
 tests/ndarray/test_lazyexpr.py |  2 +-
 5 files changed, 78 insertions(+), 56 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 74b8b067..87123238 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,7 +58,7 @@ set(MINIEXPR_BUILD_BENCH OFF CACHE BOOL "Build miniexpr benchmarks" FORCE)
 
 FetchContent_Declare(miniexpr
     GIT_REPOSITORY https://github.com/Blosc/miniexpr.git
-    GIT_TAG sleef  # latest SIMD additions
+    GIT_TAG ndim  # latest me_compile_nd()/me_eval_nd() APIs
     # In case you want to use a local copy of miniexpr for development, uncomment the line below
     # SOURCE_DIR "/Users/faltet/blosc/miniexpr"
 )
diff --git a/README_DEVELOPERS.md b/README_DEVELOPERS.md
index acf8493a..f28a8eca 100644
--- a/README_DEVELOPERS.md
+++ b/README_DEVELOPERS.md
@@ -59,15 +59,9 @@ brew install sccache ninja
 Then run:
 
 ```bash
-CMAKE_GENERATOR=Ninja \
-CMAKE_C_COMPILER=clang \
-CMAKE_CXX_COMPILER=clang++ \
 CMAKE_C_COMPILER_LAUNCHER=sccache \
-CMAKE_CXX_COMPILER_LAUNCHER=sccache \
-CMAKE_BUILD_PARALLEL_LEVEL=8 \
-SKBUILD_PARALLEL_LEVEL=8 \
 SKBUILD_BUILD_DIR=build \
-pip install -e .
+pip install -e . --no-build-isolation
 ```
 
 Using `SKBUILD_BUILD_DIR` keeps a stable build directory between runs, which
diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 6d5ea3ed..85296a2a 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -573,6 +573,11 @@ cdef extern from "miniexpr.h":
     int me_compile(const char *expression, const me_variable *variables,
                    int var_count, me_dtype dtype, int *error, me_expr **out)
 
+    int me_compile_nd(const char *expression, const me_variable *variables,
+                      int var_count, me_dtype dtype, int ndims,
+                      const int64_t *shape, const int32_t *chunkshape,
+                      const int32_t *blockshape, int *error, me_expr **out)
+
     cdef enum me_compile_status:
         ME_COMPILE_SUCCESS
         ME_COMPILE_ERR_OOM
@@ -583,9 +588,26 @@ cdef extern from "miniexpr.h":
         ME_COMPILE_ERR_VAR_MIXED
         ME_COMPILE_ERR_VAR_UNSPECIFIED
         ME_COMPILE_ERR_INVALID_ARG_TYPE
+        ME_COMPILE_ERR_MIXED_TYPE_NESTED
+
+    cdef enum me_simd_ulp_mode:
+        ME_SIMD_ULP_DEFAULT
+        ME_SIMD_ULP_1
+        ME_SIMD_ULP_3_5
+
+    ctypedef struct me_eval_params:
+        c_bool disable_simd
+        me_simd_ulp_mode simd_ulp_mode
+
+    int me_eval(const me_expr *expr, const void **vars_block,
+                int n_vars, void *output_block, int chunk_nitems,
+                const me_eval_params *params) nogil
 
-    int me_eval(const me_expr *expr, const void ** vars_chunk,
-                int n_vars, void *output_chunk, int chunk_nitems) nogil
+    int me_eval_nd(const me_expr *expr, const void **vars_block,
+                   int n_vars, void *output_block, int block_nitems,
+                   int64_t nchunk, int64_t nblock, const me_eval_params *params) nogil
+
+    int me_nd_valid_nitems(const me_expr *expr, int64_t nchunk, int64_t nblock, int64_t *valid_nitems) nogil
 
     void me_print(const me_expr *n) nogil
     void me_free(me_expr *n) nogil
@@ -1860,10 +1882,8 @@ cdef int general_filler(blosc2_prefilter_params *params):
     return 0
 
 
-# Auxiliary function for just miniexpr as a prefilter
-# Only meant for (input and output) arrays that:
-# 1) Are blosc2.NDArray objects
-# 2) Do not have padding
+# Auxiliary function for miniexpr as a prefilter
+# Only meant for (input and output) arrays that are blosc2.NDArray objects.
 cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
                       c_bool is_postfilter, uint8_t *params_output, int32_t typesize) nogil:
     # Declare all C variables at the beginning
@@ -1880,9 +1900,29 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
     cdef void* src
     cdef int32_t chunk_nbytes, chunk_cbytes, block_nbytes
     cdef int start, blocknitems, expected_blocknitems
+    cdef int64_t valid_nitems
     cdef int32_t input_typesize
     cdef blosc2_context* dctx
     expected_blocknitems = -1
+    valid_nitems = 0
+
+    cdef me_expr* miniexpr_handle = udata.miniexpr_handle
+    cdef void* aux_reduc_ptr
+
+    if miniexpr_handle == NULL:
+        raise ValueError("miniexpr: handle not assigned")
+
+    # Query valid (unpadded) items for this block
+    rc = me_nd_valid_nitems(miniexpr_handle, nchunk, nblock, &valid_nitems)
+    if rc != 0:
+        raise RuntimeError(f"miniexpr: invalid block; error code: {rc}")
+    if valid_nitems <= 0:
+        # Nothing to compute for this block.
+        # For reductions, keep aux_reduc neutral values untouched.
+        if udata.aux_reduc_ptr == NULL:
+            memset(params_output, 0, udata.array.blocknitems * typesize)
+        free(input_buffers)
+        return 0
     for i in range(udata.ninputs):
         ndarr = udata.inputs[i]
         input_buffers[i] = malloc(ndarr.sc.blocksize)
@@ -1912,48 +1952,35 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
                 # In the future, perhaps one can create a specific (serial) context just for
                 # blosc2_getitem_ctx, but this is probably never going to be necessary.
                 dctx = blosc2_create_dctx(BLOSC2_DPARAMS_DEFAULTS)
-            if nchunk * ndarr.chunknitems + start + blocknitems > ndarr.nitems:
-                blocknitems = ndarr.nitems - (nchunk * ndarr.chunknitems + start)
-                if blocknitems <= 0:
-                    # Should never happen, but anyway
-                    continue
+            if valid_nitems > blocknitems:
+                raise ValueError("miniexpr: valid items exceed padded block size")
             rc = blosc2_getitem_ctx(dctx, src, chunk_cbytes, start, blocknitems,
                                     input_buffers[i], block_nbytes)
             blosc2_free_ctx(dctx)
             if rc < 0:
                 raise ValueError("miniexpr: error decompressing the chunk")
-
-    cdef me_expr* miniexpr_handle = udata.miniexpr_handle
-    cdef void* aux_reduc_ptr
     # For reduction operations, we need to track which block we're processing
     # The linear_block_index should be based on the INPUT array structure, not the output array
     # Get the first input array's chunk and block structure
     cdef b2nd_array_t* first_input = udata.inputs[0]
-    cdef int nblocks_per_chunk = (first_input.chunknitems + first_input.blocknitems - 1) // first_input.blocknitems
+    cdef int nblocks_per_chunk = 1
+    for i in range(first_input.ndim):
+        nblocks_per_chunk *= <int>udata.blocks_in_chunk[i]
     # Calculate the global linear block index: nchunk * blocks_per_chunk + nblock
     # This works because blocks never span chunks (chunks are padded to block boundaries)
     cdef int64_t linear_block_index = nchunk * nblocks_per_chunk + nblock
     cdef uintptr_t offset_bytes = typesize * linear_block_index
 
-    if miniexpr_handle == NULL:
-        raise ValueError("miniexpr: handle not assigned")
-
-    # Skip evaluation if blocknitems is invalid (can happen for padding blocks beyond data)
-    if blocknitems <= 0:
-        # Free resources
-        for i in range(udata.ninputs):
-            free(input_buffers[i])
-        free(input_buffers)
-        return 0
-
     # Call thread-safe miniexpr C API
     if udata.aux_reduc_ptr == NULL:
-        rc = me_eval(miniexpr_handle, <const void**>input_buffers, udata.ninputs,
-                     <void*>params_output, blocknitems)
+        rc = me_eval_nd(miniexpr_handle, <const void**>input_buffers, udata.ninputs,
+                        <void*>params_output, blocknitems, nchunk, nblock, NULL)
     else:
-        # Reduction operation
+        # Reduction operation: evaluate only valid items into a single output element.
+        # NOTE: miniexpr handles scalar outputs in me_eval_nd without touching tail bytes.
         aux_reduc_ptr = <void *> (<uintptr_t> udata.aux_reduc_ptr + offset_bytes)
-        rc = me_eval(miniexpr_handle, <const void**>input_buffers, udata.ninputs, aux_reduc_ptr, blocknitems)
+        rc = me_eval_nd(miniexpr_handle, <const void**>input_buffers, udata.ninputs,
+                        aux_reduc_ptr, blocknitems, nchunk, nblock, NULL)
     if rc != 0:
         raise RuntimeError(f"miniexpr: issues during evaluation; error code: {rc}")
 
@@ -2904,7 +2931,12 @@ cdef class NDArray:
         expression = expression.encode("utf-8") if isinstance(expression, str) else expression
         cdef me_dtype = me_dtype_from_numpy(self.dtype.num)
         cdef me_expr *out_expr
-        cdef int rc = me_compile(expression, variables, n, me_dtype, &error, &out_expr)
+        cdef int ndims = self.array.ndim
+        cdef int64_t* shape = &self.array.shape[0]
+        cdef int32_t* chunkshape = &self.array.chunkshape[0]
+        cdef int32_t* blockshape = &self.array.blockshape[0]
+        cdef int rc = me_compile_nd(expression, variables, n, me_dtype, ndims,
+                                    shape, chunkshape, blockshape, &error, &out_expr)
         if rc == ME_COMPILE_ERR_INVALID_ARG_TYPE:
             raise TypeError(f"miniexpr does not support operand or output dtype: {expression}")
         if rc != ME_COMPILE_SUCCESS:
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index b2056c62..5fb32f96 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1284,14 +1284,14 @@ def fast_eval(  # noqa: C901
 
     # Check whether we can use miniexpr
     if use_miniexpr:
-        # Avoid padding issues except for 1D arrays (contiguous along the only axis).
-        if len(shape) != 1 and builtins.any(s % c != 0 for s, c in zip(shape[1:], chunks[1:], strict=True)):
+        # Require aligned NDArray operands with identical chunk/block grid.
+        same_shape = all(hasattr(op, "shape") and op.shape == shape for op in operands.values())
+        same_chunks = all(hasattr(op, "chunks") and op.chunks == chunks for op in operands.values())
+        same_blocks = all(hasattr(op, "blocks") and op.blocks == blocks for op in operands.values())
+        if not (same_shape and same_chunks and same_blocks):
+            use_miniexpr = False
+        if not (all_ndarray and not any_persisted and out is None):
             use_miniexpr = False
-        for op in operands.values():
-            # Only NDArray in-memory operands
-            if not (isinstance(op, blosc2.NDArray) and op.urlpath is None and out is None):
-                use_miniexpr = False
-                break
 
     if use_miniexpr:
         cparams = kwargs.pop("cparams", blosc2.CParams())
@@ -1989,10 +1989,12 @@ def reduce_slices(  # noqa: C901
     if reduce_op in (ReduceOp.ARGMAX, ReduceOp.ARGMIN):
         use_miniexpr = False
 
-    # Only behaved partitions are supported in miniexpr reductions
+    # Check whether we can use miniexpr
     if use_miniexpr:
-        # Avoid padding issues except for 1D arrays (contiguous along the only axis).
-        if len(shape) != 1 and builtins.any(s % c != 0 for s, c in zip(shape[1:], chunks[1:], strict=True)):
+        same_shape = all(hasattr(op, "shape") and op.shape == shape for op in operands.values())
+        same_chunks = all(hasattr(op, "chunks") and op.chunks == chunks for op in operands.values())
+        same_blocks = all(hasattr(op, "blocks") and op.blocks == blocks for op in operands.values())
+        if not (same_shape and same_chunks and same_blocks):
             use_miniexpr = False
         if use_miniexpr and isinstance(expression, str):
             has_complex = any(
@@ -2001,12 +2003,6 @@ def reduce_slices(  # noqa: C901
             )
             if has_complex and any(tok in expression for tok in ("!=", "==", "<=", ">=", "<", ">")):
                 use_miniexpr = False
-        for op in operands.values():
-            # Ensure blocks fit exactly in chunks for the n-dim case, except for the first dimension
-            blocks_fit = builtins.all(c % b == 0 for c, b in zip(op.chunks[1:], op.blocks[1:], strict=True))
-            if len(op.shape) != 1 and not blocks_fit:
-                use_miniexpr = False
-                break
 
     if use_miniexpr:
         # Experiments say that not splitting is best (at least on Apple Silicon M4 Pro)
diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py
index 2f441497..01cc8dbb 100644
--- a/tests/ndarray/test_lazyexpr.py
+++ b/tests/ndarray/test_lazyexpr.py
@@ -270,7 +270,7 @@ def test_expression_with_constants(array_fixture):
     a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture
     # Test with operands with same chunks and blocks
     expr = a1 + 2 - a3 * 3.14
-    nres = ne_evaluate("na1 + 2 - na3 * 3.14")
+    nres = na1 + 2 - na3 * 3.14
     res = expr.compute()
     if na1.dtype == np.float32:
         np.testing.assert_allclose(res[:], nres, rtol=1e-5)

From 69579f494e99008aa79298d5658c90056d7b1df1 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Mon, 19 Jan 2026 07:40:59 +0100
Subject: [PATCH 109/123] Enabled miniexpr for on-disk operands too

---
 src/blosc2/blosc2_ext.pyx      | 54 +++++++++++++++++++++++++++++++---
 src/blosc2/lazyexpr.py         |  4 +--
 tests/ndarray/test_lazyexpr.py |  7 +++--
 3 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 85296a2a..ed3ac0cd 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -378,9 +378,9 @@ cdef extern from "blosc2.h":
     int blosc2_schunk_decompress_chunk(blosc2_schunk *schunk, int64_t nchunk, void *dest, int32_t nbytes)
 
     int blosc2_schunk_get_chunk(blosc2_schunk *schunk, int64_t nchunk, uint8_t ** chunk,
-                                c_bool *needs_free)
+                                c_bool *needs_free) nogil
     int blosc2_schunk_get_lazychunk(blosc2_schunk *schunk, int64_t nchunk, uint8_t ** chunk,
-                                    c_bool *needs_free)
+                                    c_bool *needs_free) nogil
     int blosc2_schunk_get_slice_buffer(blosc2_schunk *schunk, int64_t start, int64_t stop, void *buffer)
     int blosc2_schunk_set_slice_buffer(blosc2_schunk *schunk, int64_t start, int64_t stop, void *buffer)
     int blosc2_schunk_get_cparams(blosc2_schunk *schunk, blosc2_cparams** cparams)
@@ -616,6 +616,13 @@ cdef extern from "miniexpr.h":
 cdef extern from "miniexpr_numpy.h":
     me_dtype me_dtype_from_numpy(int numpy_type_num)
 
+cdef extern from "pythread.h":
+    ctypedef void* PyThread_type_lock
+    PyThread_type_lock PyThread_allocate_lock() nogil
+    int PyThread_acquire_lock(PyThread_type_lock lock, int waitflag) nogil
+    void PyThread_release_lock(PyThread_type_lock lock) nogil
+    void PyThread_free_lock(PyThread_type_lock lock) nogil
+
 
 ctypedef struct user_filters_udata:
     char* py_func
@@ -666,9 +673,14 @@ cdef _check_comp_length(comp_name, comp_len):
 
 
 blosc2_init()
+cdef PyThread_type_lock chunk_cache_lock = PyThread_allocate_lock()
+if chunk_cache_lock == NULL:
+    raise MemoryError("Could not allocate chunk cache lock")
 
 @atexit.register
 def destroy():
+    if chunk_cache_lock != NULL:
+        PyThread_free_lock(chunk_cache_lock)
     blosc2_destroy()
 
 
@@ -1799,6 +1811,11 @@ cdef class SChunk:
                 me_data = <me_udata*>self.schunk.storage.cparams.preparams.user_data
                 if me_data != NULL:
                     if me_data.inputs != NULL:
+                        for i in range(me_data.ninputs):
+                            if me_data.inputs[i].chunk_cache.data != NULL:
+                                free(me_data.inputs[i].chunk_cache.data)
+                                me_data.inputs[i].chunk_cache.data = NULL
+                                me_data.inputs[i].chunk_cache.nchunk = -1
                         free(me_data.inputs)
                     if me_data.miniexpr_handle != NULL:  # XXX do we really need the conditional?
                         me_free(me_data.miniexpr_handle)
@@ -1897,7 +1914,9 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
     cdef int rc
     cdef void** input_buffers = <void**> malloc(udata.ninputs * sizeof(uint8_t*))
     cdef float *buf
-    cdef void* src
+    cdef uint8_t* src
+    cdef uint8_t* chunk
+    cdef c_bool needs_free
     cdef int32_t chunk_nbytes, chunk_cbytes, block_nbytes
     cdef int start, blocknitems, expected_blocknitems
     cdef int64_t valid_nitems
@@ -1932,7 +1951,32 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
             for j in range(ndarr.blocknitems):
                 buf[j] = 1.
         else:
-            src = ndarr.sc.data[nchunk]
+            if ndarr.sc.storage.urlpath == NULL:
+                src = ndarr.sc.data[nchunk]
+            else:
+                # We need to get the chunk from disk/network
+                if ndarr.chunk_cache.nchunk != nchunk:
+                    PyThread_acquire_lock(chunk_cache_lock, 1)
+                    if ndarr.chunk_cache.nchunk != nchunk:
+                        if ndarr.chunk_cache.data != NULL:
+                            free(ndarr.chunk_cache.data)
+                            ndarr.chunk_cache.data = NULL
+                        rc = blosc2_schunk_get_chunk(ndarr.sc, nchunk, &chunk, &needs_free)
+                        if rc < 0:
+                            PyThread_release_lock(chunk_cache_lock)
+                            raise ValueError("miniexpr: error getting chunk")
+                        if not needs_free:
+                            src = <uint8_t*> malloc(rc)
+                            if src == NULL:
+                                PyThread_release_lock(chunk_cache_lock)
+                                raise MemoryError("miniexpr: cannot allocate chunk copy")
+                            memcpy(src, chunk, rc)
+                        else:
+                            src = chunk
+                        ndarr.chunk_cache.data = src
+                        ndarr.chunk_cache.nchunk = nchunk
+                    PyThread_release_lock(chunk_cache_lock)
+                src = ndarr.chunk_cache.data
             rc = blosc2_cbuffer_sizes(src, &chunk_nbytes, &chunk_cbytes, &block_nbytes)
             if rc < 0:
                 raise ValueError("miniexpr: error getting cbuffer sizes")
@@ -2888,6 +2932,8 @@ cdef class NDArray:
         cdef b2nd_array_t** inputs_ = <b2nd_array_t**> malloc(ninputs * sizeof(b2nd_array_t*))
         for i, operand in enumerate(operands):
             inputs_[i] = <b2nd_array_t*><uintptr_t>operand.c_array
+            inputs_[i].chunk_cache.nchunk = -1
+            inputs_[i].chunk_cache.data = NULL
         udata.inputs = inputs_
         udata.ninputs = ninputs
         udata.array = self.array
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 5fb32f96..a89f7f1c 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1290,7 +1290,7 @@ def fast_eval(  # noqa: C901
         same_blocks = all(hasattr(op, "blocks") and op.blocks == blocks for op in operands.values())
         if not (same_shape and same_chunks and same_blocks):
             use_miniexpr = False
-        if not (all_ndarray and not any_persisted and out is None):
+        if not (all_ndarray and out is None):
             use_miniexpr = False
 
     if use_miniexpr:
@@ -1982,7 +1982,7 @@ def reduce_slices(  # noqa: C901
         del temp
 
     # miniexpr reduction path only supported for some cases so far
-    if not (where is None and fast_path and all_ndarray and not any_persisted and reduced_shape == ()):
+    if not (where is None and fast_path and all_ndarray and reduced_shape == ()):
         use_miniexpr = False
 
     # Some reductions are not supported yet in miniexpr
diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py
index 01cc8dbb..d85856b3 100644
--- a/tests/ndarray/test_lazyexpr.py
+++ b/tests/ndarray/test_lazyexpr.py
@@ -703,17 +703,18 @@ def test_save_functions(function, dtype_fixture, shape_fixture):
     expr_string = f"{function}(na1)"
     res_numexpr = ne_evaluate(expr_string)
     # Compare the results
-    np.testing.assert_allclose(res_lazyexpr[:], res_numexpr)
+    rtol = 1e-6 if dtype_fixture == np.float32 else 1e-15
+    np.testing.assert_allclose(res_lazyexpr[:], res_numexpr, rtol=rtol)
 
     expr_string = f"blosc2.{function}(a1)"
     expr = eval(expr_string, {"a1": a1, "blosc2": blosc2})
     expr.save(urlpath=urlpath_save)
     res_lazyexpr = expr.compute()
-    np.testing.assert_allclose(res_lazyexpr[:], res_numexpr)
+    np.testing.assert_allclose(res_lazyexpr[:], res_numexpr, rtol=rtol)
 
     expr = blosc2.open(urlpath_save)
     res_lazyexpr = expr.compute()
-    np.testing.assert_allclose(res_lazyexpr[:], res_numexpr)
+    np.testing.assert_allclose(res_lazyexpr[:], res_numexpr, rtol=rtol)
 
     for urlpath in [urlpath_op, urlpath_save]:
         blosc2.remove_urlpath(urlpath)

From a03c8d9e4f612702bb0aebf1f50f98325e138a30 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Mon, 19 Jan 2026 08:39:16 +0100
Subject: [PATCH 110/123] Trying to fix windows issues

---
 CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 87123238..6dd94242 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,7 +58,8 @@ set(MINIEXPR_BUILD_BENCH OFF CACHE BOOL "Build miniexpr benchmarks" FORCE)
 
 FetchContent_Declare(miniexpr
     GIT_REPOSITORY https://github.com/Blosc/miniexpr.git
-    GIT_TAG ndim  # latest me_compile_nd()/me_eval_nd() APIs
+    #GIT_TAG ndim  # latest me_compile_nd()/me_eval_nd() APIs
+    GIT_TAG 8c50850094e156ce568186edef667fabecbd00ff  # latest commit in ndim
     # In case you want to use a local copy of miniexpr for development, uncomment the line below
     # SOURCE_DIR "/Users/faltet/blosc/miniexpr"
 )

From 140b3912e8e5a396b07eeaa4ced74ab6b97489fa Mon Sep 17 00:00:00 2001
From: lshaw8317 <lshaw8317@gmail.com>
Date: Tue, 20 Jan 2026 16:33:06 +0100
Subject: [PATCH 111/123] Change from numpy to numexpr calculation

---
 tests/ndarray/test_lazyexpr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py
index d85856b3..a275b07c 100644
--- a/tests/ndarray/test_lazyexpr.py
+++ b/tests/ndarray/test_lazyexpr.py
@@ -270,7 +270,7 @@ def test_expression_with_constants(array_fixture):
     a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture
     # Test with operands with same chunks and blocks
     expr = a1 + 2 - a3 * 3.14
-    nres = na1 + 2 - na3 * 3.14
+    nres = ne_evaluate("na1 + 2 - na3 * 3.14")
     res = expr.compute()
     if na1.dtype == np.float32:
         np.testing.assert_allclose(res[:], nres, rtol=1e-5)

From 5912f04d096c3c4b235cbaea89498fc49233b91b Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Tue, 20 Jan 2026 17:29:11 +0100
Subject: [PATCH 112/123] Fix issues with precision

---
 tests/ndarray/test_lazyexpr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py
index a275b07c..762a77cc 100644
--- a/tests/ndarray/test_lazyexpr.py
+++ b/tests/ndarray/test_lazyexpr.py
@@ -273,7 +273,7 @@ def test_expression_with_constants(array_fixture):
     nres = ne_evaluate("na1 + 2 - na3 * 3.14")
     res = expr.compute()
     if na1.dtype == np.float32:
-        np.testing.assert_allclose(res[:], nres, rtol=1e-5)
+        np.testing.assert_allclose(res[:], nres, rtol=1e-5, atol=1e-6)
     else:
         np.testing.assert_allclose(res[:], nres)
 

From 423a448648acb3409c3751048a538dd257239a84 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Tue, 20 Jan 2026 17:29:57 +0100
Subject: [PATCH 113/123] Use latest c-blosc2 with miniexpr and openzl_plugin
 improvements

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6dd94242..83629c1f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -115,8 +115,8 @@ else()
     include(FetchContent)
     FetchContent_Declare(blosc2
         GIT_REPOSITORY https://github.com/Blosc/c-blosc2
-        # GIT_TAG 9d250c2201f6e385c56a372b08037f7debc6fa1b  # openzl (disposable output)
-        GIT_TAG add_openzl
+        GIT_TAG 011c9e537f28299c536294d842e1a3d0e41db24f  # openzl_plugin + miniexpr
+        # GIT_TAG main
         GIT_SHALLOW TRUE  # fetch only the latest commit (only works with a branch in GIT_TAG)
         # in case you want to use a local copy of c-blosc2 for development, uncomment the line below
         # SOURCE_DIR "/Users/faltet/blosc/c-blosc2"

From 61ef8f4d3b9c319e7340c41eb455a2e73749f164 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Tue, 20 Jan 2026 18:47:15 +0100
Subject: [PATCH 114/123] Add a test for reductions with where

---
 tests/ndarray/test_reductions.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tests/ndarray/test_reductions.py b/tests/ndarray/test_reductions.py
index 054184ee..e146bfb7 100644
--- a/tests/ndarray/test_reductions.py
+++ b/tests/ndarray/test_reductions.py
@@ -65,6 +65,19 @@ def test_reduce_bool(array_fixture, reduce_op):
     np.testing.assert_allclose(res, nres, atol=tol, rtol=tol)
 
 
+def test_reduce_where(array_fixture):
+    a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture
+    # The next works
+    # res = blosc2.where(a1 < a2, a2, 0).sum()
+    # nres = ne_evaluate("sum(where(na1 < na2, na2, 0))")
+    # This does not work yet (it currently hangs)
+    res = blosc2.where(a1 < a2, a2, a1).sum()
+    nres = ne_evaluate("sum(where(na1 < na2, na2, na1))")
+    print("res:", res, nres)
+    tol = 1e-15 if a1.dtype == "float64" else 1e-6
+    np.testing.assert_allclose(res, nres, atol=tol, rtol=tol)
+
+
 @pytest.mark.parametrize(
     "reduce_op", ["sum", "prod", "mean", "std", "var", "min", "max", "any", "all", "argmax", "argmin"]
 )

From 9f992593bb6ae42bc3b9c07030ad67d1413f15c0 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 21 Jan 2026 11:02:38 +0100
Subject: [PATCH 115/123] Remove check for top speed (simplify code)

---
 src/blosc2/blosc2_ext.pyx | 104 ++++++++++++++++++--------------------
 1 file changed, 49 insertions(+), 55 deletions(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index ed3ac0cd..38dd31df 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -1945,64 +1945,58 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
     for i in range(udata.ninputs):
         ndarr = udata.inputs[i]
         input_buffers[i] = malloc(ndarr.sc.blocksize)
-        # A way to check for top speed
-        if False:
-            buf = <float *>input_buffers[i]
-            for j in range(ndarr.blocknitems):
-                buf[j] = 1.
+        if ndarr.sc.storage.urlpath == NULL:
+            src = ndarr.sc.data[nchunk]
         else:
-            if ndarr.sc.storage.urlpath == NULL:
-                src = ndarr.sc.data[nchunk]
-            else:
-                # We need to get the chunk from disk/network
+            # We need to get the chunk from disk/network
+            if ndarr.chunk_cache.nchunk != nchunk:
+                PyThread_acquire_lock(chunk_cache_lock, 1)
                 if ndarr.chunk_cache.nchunk != nchunk:
-                    PyThread_acquire_lock(chunk_cache_lock, 1)
-                    if ndarr.chunk_cache.nchunk != nchunk:
-                        if ndarr.chunk_cache.data != NULL:
-                            free(ndarr.chunk_cache.data)
-                            ndarr.chunk_cache.data = NULL
-                        rc = blosc2_schunk_get_chunk(ndarr.sc, nchunk, &chunk, &needs_free)
-                        if rc < 0:
+                    if ndarr.chunk_cache.data != NULL:
+                        free(ndarr.chunk_cache.data)
+                        ndarr.chunk_cache.data = NULL
+                    rc = blosc2_schunk_get_chunk(ndarr.sc, nchunk, &chunk, &needs_free)
+                    if rc < 0:
+                        PyThread_release_lock(chunk_cache_lock)
+                        raise ValueError("miniexpr: error getting chunk")
+                    if not needs_free:
+                        src = <uint8_t*> malloc(rc)
+                        if src == NULL:
                             PyThread_release_lock(chunk_cache_lock)
-                            raise ValueError("miniexpr: error getting chunk")
-                        if not needs_free:
-                            src = <uint8_t*> malloc(rc)
-                            if src == NULL:
-                                PyThread_release_lock(chunk_cache_lock)
-                                raise MemoryError("miniexpr: cannot allocate chunk copy")
-                            memcpy(src, chunk, rc)
-                        else:
-                            src = chunk
-                        ndarr.chunk_cache.data = src
-                        ndarr.chunk_cache.nchunk = nchunk
-                    PyThread_release_lock(chunk_cache_lock)
-                src = ndarr.chunk_cache.data
-            rc = blosc2_cbuffer_sizes(src, &chunk_nbytes, &chunk_cbytes, &block_nbytes)
-            if rc < 0:
-                raise ValueError("miniexpr: error getting cbuffer sizes")
-            input_typesize = ndarr.sc.typesize
-            blocknitems = block_nbytes // input_typesize
-            if expected_blocknitems == -1:
-                expected_blocknitems = blocknitems
-            elif blocknitems != expected_blocknitems:
-                raise ValueError("miniexpr: inconsistent block element counts across inputs")
-            start = nblock * blocknitems
-            # A way to check for top speed
-            if False:
-                # Unsafe, but it works for special arrays (e.g. blosc2.ones), and can be fast
-                dctx = ndarr.sc.dctx
-            else:
-                # This is needed for thread safety, but adds a pretty low overhead (< 400ns on a modern CPU)
-                # In the future, perhaps one can create a specific (serial) context just for
-                # blosc2_getitem_ctx, but this is probably never going to be necessary.
-                dctx = blosc2_create_dctx(BLOSC2_DPARAMS_DEFAULTS)
-            if valid_nitems > blocknitems:
-                raise ValueError("miniexpr: valid items exceed padded block size")
-            rc = blosc2_getitem_ctx(dctx, src, chunk_cbytes, start, blocknitems,
-                                    input_buffers[i], block_nbytes)
-            blosc2_free_ctx(dctx)
-            if rc < 0:
-                raise ValueError("miniexpr: error decompressing the chunk")
+                            raise MemoryError("miniexpr: cannot allocate chunk copy")
+                        memcpy(src, chunk, rc)
+                    else:
+                        src = chunk
+                    ndarr.chunk_cache.data = src
+                    ndarr.chunk_cache.nchunk = nchunk
+                PyThread_release_lock(chunk_cache_lock)
+            src = ndarr.chunk_cache.data
+        rc = blosc2_cbuffer_sizes(src, &chunk_nbytes, &chunk_cbytes, &block_nbytes)
+        if rc < 0:
+            raise ValueError("miniexpr: error getting cbuffer sizes")
+        input_typesize = ndarr.sc.typesize
+        blocknitems = block_nbytes // input_typesize
+        if expected_blocknitems == -1:
+            expected_blocknitems = blocknitems
+        elif blocknitems != expected_blocknitems:
+            raise ValueError("miniexpr: inconsistent block element counts across inputs")
+        start = nblock * blocknitems
+        # A way to check for top speed
+        if False:
+            # Unsafe, but it works for special arrays (e.g. blosc2.ones), and can be fast
+            dctx = ndarr.sc.dctx
+        else:
+            # This is needed for thread safety, but adds a pretty low overhead (< 400ns on a modern CPU)
+            # In the future, perhaps one can create a specific (serial) context just for
+            # blosc2_getitem_ctx, but this is probably never going to be necessary.
+            dctx = blosc2_create_dctx(BLOSC2_DPARAMS_DEFAULTS)
+        if valid_nitems > blocknitems:
+            raise ValueError("miniexpr: valid items exceed padded block size")
+        rc = blosc2_getitem_ctx(dctx, src, chunk_cbytes, start, blocknitems,
+                                input_buffers[i], block_nbytes)
+        blosc2_free_ctx(dctx)
+        if rc < 0:
+            raise ValueError("miniexpr: error decompressing the chunk")
     # For reduction operations, we need to track which block we're processing
     # The linear_block_index should be based on the INPUT array structure, not the output array
     # Get the first input array's chunk and block structure

From e30575a50f83a5ae4a615788ecca3ddba02b719b Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 21 Jan 2026 11:20:36 +0100
Subject: [PATCH 116/123] Clarify why we need to check the same condition twice

---
 src/blosc2/blosc2_ext.pyx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 38dd31df..9fa133cc 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -1951,6 +1951,7 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
             # We need to get the chunk from disk/network
             if ndarr.chunk_cache.nchunk != nchunk:
                 PyThread_acquire_lock(chunk_cache_lock, 1)
+                # We need to check again, as another thread may have updated the cache already
                 if ndarr.chunk_cache.nchunk != nchunk:
                     if ndarr.chunk_cache.data != NULL:
                         free(ndarr.chunk_cache.data)

From e27b135cf69d741d3388c1ce3418ffa008bd5168 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 21 Jan 2026 11:27:29 +0100
Subject: [PATCH 117/123] Convert unused branch into a comment

---
 src/blosc2/blosc2_ext.pyx | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 9fa133cc..fa9cd8a1 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -1982,15 +1982,12 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
         elif blocknitems != expected_blocknitems:
             raise ValueError("miniexpr: inconsistent block element counts across inputs")
         start = nblock * blocknitems
-        # A way to check for top speed
-        if False:
-            # Unsafe, but it works for special arrays (e.g. blosc2.ones), and can be fast
-            dctx = ndarr.sc.dctx
-        else:
-            # This is needed for thread safety, but adds a pretty low overhead (< 400ns on a modern CPU)
-            # In the future, perhaps one can create a specific (serial) context just for
-            # blosc2_getitem_ctx, but this is probably never going to be necessary.
-            dctx = blosc2_create_dctx(BLOSC2_DPARAMS_DEFAULTS)
+        # This is needed for thread safety, but adds a pretty low overhead (< 400ns on a modern CPU)
+        # In the future, perhaps one can create a specific (serial) context just for
+        # blosc2_getitem_ctx, but this is probably never going to be necessary.
+        dctx = blosc2_create_dctx(BLOSC2_DPARAMS_DEFAULTS)
+        # Unsafe, but it works for special arrays (e.g. blosc2.ones), and can be used for profiling
+        # dctx = ndarr.sc.dctx
         if valid_nitems > blocknitems:
             raise ValueError("miniexpr: valid items exceed padded block size")
         rc = blosc2_getitem_ctx(dctx, src, chunk_cbytes, start, blocknitems,

From bd6fb839ae38b7a0f106635283093836f5c12f20 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 21 Jan 2026 12:29:39 +0100
Subject: [PATCH 118/123] Remove already tested guard for unaligned operands

---
 src/blosc2/lazyexpr.py | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index a89f7f1c..89b58afe 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1990,19 +1990,13 @@ def reduce_slices(  # noqa: C901
         use_miniexpr = False
 
     # Check whether we can use miniexpr
-    if use_miniexpr:
-        same_shape = all(hasattr(op, "shape") and op.shape == shape for op in operands.values())
-        same_chunks = all(hasattr(op, "chunks") and op.chunks == chunks for op in operands.values())
-        same_blocks = all(hasattr(op, "blocks") and op.blocks == blocks for op in operands.values())
-        if not (same_shape and same_chunks and same_blocks):
+    if use_miniexpr and isinstance(expression, str):
+        has_complex = any(
+            isinstance(op, blosc2.NDArray) and blosc2.isdtype(op.dtype, "complex floating")
+            for op in operands.values()
+        )
+        if has_complex and any(tok in expression for tok in ("!=", "==", "<=", ">=", "<", ">")):
             use_miniexpr = False
-        if use_miniexpr and isinstance(expression, str):
-            has_complex = any(
-                isinstance(op, blosc2.NDArray) and blosc2.isdtype(op.dtype, "complex floating")
-                for op in operands.values()
-            )
-            if has_complex and any(tok in expression for tok in ("!=", "==", "<=", ">=", "<", ">")):
-                use_miniexpr = False
 
     if use_miniexpr:
         # Experiments say that not splitting is best (at least on Apple Silicon M4 Pro)

From d1fd8f6dd901b5dc8849b823d2f6920aac5e399f Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 21 Jan 2026 17:02:31 +0100
Subject: [PATCH 119/123] Code simplification

---
 src/blosc2/blosc2_ext.pyx | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index fa9cd8a1..666a479a 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -2009,14 +2009,13 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
 
     # Call thread-safe miniexpr C API
     if udata.aux_reduc_ptr == NULL:
-        rc = me_eval_nd(miniexpr_handle, <const void**>input_buffers, udata.ninputs,
-                        <void*>params_output, blocknitems, nchunk, nblock, NULL)
+        aux_reduc_ptr = <void *> params_output
     else:
         # Reduction operation: evaluate only valid items into a single output element.
         # NOTE: miniexpr handles scalar outputs in me_eval_nd without touching tail bytes.
         aux_reduc_ptr = <void *> (<uintptr_t> udata.aux_reduc_ptr + offset_bytes)
-        rc = me_eval_nd(miniexpr_handle, <const void**>input_buffers, udata.ninputs,
-                        aux_reduc_ptr, blocknitems, nchunk, nblock, NULL)
+    rc = me_eval_nd(miniexpr_handle, <const void**> input_buffers, udata.ninputs,
+                    aux_reduc_ptr, blocknitems, nchunk, nblock, NULL)
     if rc != 0:
         raise RuntimeError(f"miniexpr: issues during evaluation; error code: {rc}")
 

From 90565a87c9b9768b17c308907cc47129e5271a9e Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Wed, 21 Jan 2026 19:03:01 +0100
Subject: [PATCH 120/123] Add new fp_accuracy param for LazyArray.compute()

---
 doc/reference/classes.rst      |  1 +
 doc/reference/misc.rst         |  1 +
 src/blosc2/__init__.py         | 15 +++++++++++++++
 src/blosc2/blosc2_ext.pyx      | 21 ++++++++++++++-------
 src/blosc2/lazyexpr.py         | 24 +++++++++++++++++++-----
 tests/ndarray/test_lazyexpr.py | 14 ++++++++++++++
 6 files changed, 64 insertions(+), 12 deletions(-)

diff --git a/doc/reference/classes.rst b/doc/reference/classes.rst
index 1d9de69f..cca8c7e0 100644
--- a/doc/reference/classes.rst
+++ b/doc/reference/classes.rst
@@ -54,3 +54,4 @@ Other Classes
     Storage
     Tuner
     URLPath
+    FPAccuracy
diff --git a/doc/reference/misc.rst b/doc/reference/misc.rst
index 8e1d34f4..279ed79a 100644
--- a/doc/reference/misc.rst
+++ b/doc/reference/misc.rst
@@ -57,6 +57,7 @@ This page documents the miscellaneous members of the ``blosc2`` module that do n
         SpecialValue,
         SplitMode,
         Tuner,
+        FPAccuracy,
         compute_chunks_blocks,
         get_slice_nchunks,
         remove_urlpath,
diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py
index 12a2a908..95cf3fc0 100644
--- a/src/blosc2/__init__.py
+++ b/src/blosc2/__init__.py
@@ -113,6 +113,21 @@ class Tuner(Enum):
     BTUNE = 32
 
 
+class FPAccuracy(Enum):
+    """
+    Floating point accuracy modes for Blosc2 computing with lazy expressions.
+
+    This is only relevant when using floating point dtypes with miniexpr.
+    """
+
+    #: Use 1.0 ULPs (Units in the Last Place) for floating point functions
+    HIGH = 1
+    #: Use 3.5 ULPs (Units in the Last Place) for floating point functions
+    LOW = 2
+    #: Use default accuracy. This is LOW, which is enough for most applications.
+    DEFAULT = LOW
+
+
 from .blosc2_ext import (
     DEFINED_CODECS_STOP,
     EXTENDED_HEADER_LENGTH,
diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 666a479a..4c1512ff 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -578,7 +578,7 @@ cdef extern from "miniexpr.h":
                       const int64_t *shape, const int32_t *chunkshape,
                       const int32_t *blockshape, int *error, me_expr **out)
 
-    cdef enum me_compile_status:
+    ctypedef enum me_compile_status:
         ME_COMPILE_SUCCESS
         ME_COMPILE_ERR_OOM
         ME_COMPILE_ERR_PARSE
@@ -590,7 +590,7 @@ cdef extern from "miniexpr.h":
         ME_COMPILE_ERR_INVALID_ARG_TYPE
         ME_COMPILE_ERR_MIXED_TYPE_NESTED
 
-    cdef enum me_simd_ulp_mode:
+    ctypedef enum me_simd_ulp_mode:
         ME_SIMD_ULP_DEFAULT
         ME_SIMD_ULP_1
         ME_SIMD_ULP_3_5
@@ -647,7 +647,8 @@ ctypedef struct udf_udata:
 ctypedef struct me_udata:
     b2nd_array_t** inputs
     int ninputs
-    b2nd_array_t *array
+    me_eval_params* eval_params
+    b2nd_array_t* array
     void* aux_reduc_ptr
     int64_t chunks_in_array[B2ND_MAX_DIM]
     int64_t blocks_in_chunk[B2ND_MAX_DIM]
@@ -1819,6 +1820,8 @@ cdef class SChunk:
                         free(me_data.inputs)
                     if me_data.miniexpr_handle != NULL:  # XXX do we really need the conditional?
                         me_free(me_data.miniexpr_handle)
+                    if me_data.eval_params != NULL:
+                        free(me_data.eval_params)
                     free(me_data)
         elif self.schunk.storage.cparams.prefilter != NULL:
             # From Python the preparams->udata with always have the field py_func
@@ -2015,7 +2018,7 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
         # NOTE: miniexpr handles scalar outputs in me_eval_nd without touching tail bytes.
         aux_reduc_ptr = <void *> (<uintptr_t> udata.aux_reduc_ptr + offset_bytes)
     rc = me_eval_nd(miniexpr_handle, <const void**> input_buffers, udata.ninputs,
-                    aux_reduc_ptr, blocknitems, nchunk, nblock, NULL)
+                    aux_reduc_ptr, blocknitems, nchunk, nblock, udata.eval_params)
     if rc != 0:
         raise RuntimeError(f"miniexpr: issues during evaluation; error code: {rc}")
 
@@ -2916,7 +2919,7 @@ cdef class NDArray:
 
         return udata
 
-    cdef me_udata *_fill_me_udata(self, inputs, aux_reduc):
+    cdef me_udata *_fill_me_udata(self, inputs, fp_accuracy, aux_reduc):
         cdef me_udata *udata = <me_udata *> malloc(sizeof(me_udata))
         operands = list(inputs.values())
         ninputs = len(operands)
@@ -2927,6 +2930,10 @@ cdef class NDArray:
             inputs_[i].chunk_cache.data = NULL
         udata.inputs = inputs_
         udata.ninputs = ninputs
+        cdef me_eval_params* eval_params = <me_eval_params*> malloc(sizeof(me_eval_params))
+        eval_params.disable_simd = False
+        eval_params.simd_ulp_mode = ME_SIMD_ULP_3_5 if fp_accuracy == blosc2.FPAccuracy.LOW else ME_SIMD_ULP_1
+        udata.eval_params = eval_params
         udata.array = self.array
         cdef void* aux_reduc_ptr = NULL
         if aux_reduc is not None:
@@ -2941,12 +2948,12 @@ cdef class NDArray:
 
         return udata
 
-    def _set_pref_expr(self, expression, inputs, aux_reduc=None):
+    def _set_pref_expr(self, expression, inputs, fp_accuracy, aux_reduc=None):
         # Set prefilter for miniexpr
         cdef blosc2_cparams* cparams = self.array.sc.storage.cparams
         cparams.prefilter = <blosc2_prefilter_fn> miniexpr_prefilter
 
-        cdef me_udata* udata = self._fill_me_udata(inputs, aux_reduc)
+        cdef me_udata* udata = self._fill_me_udata(inputs, fp_accuracy, aux_reduc)
 
         # Get the compiled expression handle for multi-threading
         cdef Py_ssize_t n = len(inputs)
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 89b58afe..4c39e426 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -302,7 +302,12 @@ def sort(self, order: str | list[str] | None = None) -> blosc2.LazyArray:
         pass
 
     @abstractmethod
-    def compute(self, item: slice | list[slice] | None = None, **kwargs: Any) -> blosc2.NDArray:
+    def compute(
+        self,
+        item: slice | list[slice] | None = None,
+        fp_accuracy: blosc2.FPAccuracy = blosc2.FPAccuracy.DEFAULT,
+        **kwargs: Any,
+    ) -> blosc2.NDArray:
         """
         Return a :ref:`NDArray` containing the evaluation of the :ref:`LazyArray`.
 
@@ -313,9 +318,14 @@ def compute(self, item: slice | list[slice] | None = None, **kwargs: Any) -> blo
             the evaluated result. This difference between slicing operands and slicing the final expression
             is important when reductions or a where clause are used in the expression.
 
+        fp_accuracy: :ref:`blosc2.FPAccuracy`, optional
+            Specifies the floating-point accuracy to be used during computation.
+            By default, :ref:`blosc2.FPAccuracy.DEFAULT` is used.
+
         kwargs: Any, optional
             Keyword arguments that are supported by the :func:`empty` constructor.
             These arguments will be set in the resulting :ref:`NDArray`.
+            Additionally, the following special kwargs are supported:
 
         Returns
         -------
@@ -1296,10 +1306,11 @@ def fast_eval(  # noqa: C901
     if use_miniexpr:
         cparams = kwargs.pop("cparams", blosc2.CParams())
         # All values will be overwritten, so we can use an uninitialized array
+        fp_accuracy = kwargs.pop("fp_accuracy", blosc2.FPAccuracy.DEFAULT)
         res_eval = blosc2.uninit(shape, dtype, chunks=chunks, blocks=blocks, cparams=cparams, **kwargs)
         try:
             print("expr->miniexpr:", expression)
-            res_eval._set_pref_expr(expression, operands)
+            res_eval._set_pref_expr(expression, operands, fp_accuracy=fp_accuracy)
             # Data to compress is fetched from operands, so it can be uninitialized here
             data = np.empty(res_eval.schunk.chunksize, dtype=np.uint8)
             # Exercise prefilter for each chunk
@@ -2001,6 +2012,7 @@ def reduce_slices(  # noqa: C901
     if use_miniexpr:
         # Experiments say that not splitting is best (at least on Apple Silicon M4 Pro)
         cparams = kwargs.pop("cparams", blosc2.CParams(splitmode=blosc2.SplitMode.NEVER_SPLIT))
+        fp_accuracy = kwargs.pop("fp_accuracy", blosc2.FPAccuracy.DEFAULT)
         # Create a fake NDArray just to drive the miniexpr evaluation (values won't be used)
         res_eval = blosc2.uninit(shape, dtype, chunks=chunks, blocks=blocks, cparams=cparams, **kwargs)
         # Compute the number of blocks in the result
@@ -2027,7 +2039,7 @@ def reduce_slices(  # noqa: C901
         try:
             print("expr->miniexpr:", expression, reduce_op)
             expression = f"{reduce_op_str}({expression})"
-            res_eval._set_pref_expr(expression, operands, aux_reduc)
+            res_eval._set_pref_expr(expression, operands, fp_accuracy, aux_reduc)
             # Data won't even try to be compressed, so buffers can be unitialized and reused
             data = np.empty(res_eval.schunk.chunksize, dtype=np.uint8)
             chunk_data = np.empty(res_eval.schunk.chunksize + blosc2.MAX_OVERHEAD, dtype=np.uint8)
@@ -3142,7 +3154,9 @@ def sort(self, order: str | list[str] | None = None) -> blosc2.LazyArray:
             lazy_expr._order = order
         return lazy_expr
 
-    def compute(self, item=(), **kwargs) -> blosc2.NDArray:
+    def compute(
+        self, item=(), fp_accuracy: blosc2.FPAccuracy = blosc2.FPAccuracy.DEFAULT, **kwargs
+    ) -> blosc2.NDArray:
         # When NumPy ufuncs are called, the user may add an `out` parameter to kwargs
         if "out" in kwargs:  # use provided out preferentially
             kwargs["_output"] = kwargs.pop("out")
@@ -3452,7 +3466,7 @@ def sort(self, order: str | list[str] | None = None) -> blosc2.LazyArray:
             lazy_expr._order = order
         return lazy_expr
 
-    def compute(self, item=(), **kwargs):
+    def compute(self, item=(), fp_accuracy: blosc2.FPAccuracy = blosc2.FPAccuracy.DEFAULT, **kwargs):
         # Get kwargs
         if kwargs is None:
             kwargs = {}
diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py
index 762a77cc..f2d0c663 100644
--- a/tests/ndarray/test_lazyexpr.py
+++ b/tests/ndarray/test_lazyexpr.py
@@ -278,6 +278,20 @@ def test_expression_with_constants(array_fixture):
         np.testing.assert_allclose(res[:], nres)
 
 
+@pytest.mark.parametrize("accuracy", [blosc2.FPAccuracy.LOW, blosc2.FPAccuracy.HIGH])
+def test_fp_precision(array_fixture, accuracy):
+    a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture
+    # Test with operands with same chunks and blocks
+    expr = blosc2.sin(a1) ** 2 - blosc2.cos(a2) ** 2 + blosc2.sqrt(a3)
+    # All precisions in miniexpr should be quite good for this expression
+    res = expr.compute(fp_accuracy=accuracy)
+    nres = ne_evaluate("sin(na1) ** 2 - cos(na2) ** 2 + sqrt(na3)")
+    if na1.dtype == np.float32:
+        np.testing.assert_allclose(res[:], nres, rtol=1e-6, atol=1e-6)
+    else:
+        np.testing.assert_allclose(res[:], nres)
+
+
 @pytest.mark.parametrize("compare_expressions", [True, False])
 @pytest.mark.parametrize("comparison_operator", ["==", "!=", ">=", ">", "<=", "<"])
 def test_comparison_operators(dtype_fixture, compare_expressions, comparison_operator):

From a8bc280e495add60329732b40bbe3695e663352d Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 22 Jan 2026 06:07:41 +0100
Subject: [PATCH 121/123] Support miniexpr for where inside reductions, e.g
 'sum(where(a < b, b, a))'

---
 CMakeLists.txt                   |  3 +--
 src/blosc2/lazyexpr.py           | 13 +++++++++----
 tests/ndarray/test_reductions.py | 25 +++++++++++++++++--------
 3 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 83629c1f..b57452f3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,8 +58,7 @@ set(MINIEXPR_BUILD_BENCH OFF CACHE BOOL "Build miniexpr benchmarks" FORCE)
 
 FetchContent_Declare(miniexpr
     GIT_REPOSITORY https://github.com/Blosc/miniexpr.git
-    #GIT_TAG ndim  # latest me_compile_nd()/me_eval_nd() APIs
-    GIT_TAG 8c50850094e156ce568186edef667fabecbd00ff  # latest commit in ndim
+    GIT_TAG ndim  # latest me_compile_nd()/me_eval_nd() APIs
     # In case you want to use a local copy of miniexpr for development, uncomment the line below
     # SOURCE_DIR "/Users/faltet/blosc/miniexpr"
 )
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 4c39e426..1f601a08 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1309,8 +1309,8 @@ def fast_eval(  # noqa: C901
         fp_accuracy = kwargs.pop("fp_accuracy", blosc2.FPAccuracy.DEFAULT)
         res_eval = blosc2.uninit(shape, dtype, chunks=chunks, blocks=blocks, cparams=cparams, **kwargs)
         try:
-            print("expr->miniexpr:", expression)
             res_eval._set_pref_expr(expression, operands, fp_accuracy=fp_accuracy)
+            print("expr->miniexpr:", expression)
             # Data to compress is fetched from operands, so it can be uninitialized here
             data = np.empty(res_eval.schunk.chunksize, dtype=np.uint8)
             # Exercise prefilter for each chunk
@@ -1993,7 +1993,7 @@ def reduce_slices(  # noqa: C901
         del temp
 
     # miniexpr reduction path only supported for some cases so far
-    if not (where is None and fast_path and all_ndarray and reduced_shape == ()):
+    if not (fast_path and all_ndarray and reduced_shape == ()):
         use_miniexpr = False
 
     # Some reductions are not supported yet in miniexpr
@@ -2008,6 +2008,8 @@ def reduce_slices(  # noqa: C901
         )
         if has_complex and any(tok in expression for tok in ("!=", "==", "<=", ">=", "<", ">")):
             use_miniexpr = False
+        if where is not None and len(where) != 2:
+            use_miniexpr = False
 
     if use_miniexpr:
         # Experiments say that not splitting is best (at least on Apple Silicon M4 Pro)
@@ -2037,9 +2039,12 @@ def reduce_slices(  # noqa: C901
             # For other operations, zeros should be safe
             aux_reduc = np.zeros(nblocks, dtype=dtype)
         try:
+            if where is not None:
+                expression_miniexpr = f"{reduce_op_str}(where({expression}, _where_x, _where_y))"
+            else:
+                expression_miniexpr = f"{reduce_op_str}({expression})"
+            res_eval._set_pref_expr(expression_miniexpr, operands, fp_accuracy, aux_reduc)
             print("expr->miniexpr:", expression, reduce_op)
-            expression = f"{reduce_op_str}({expression})"
-            res_eval._set_pref_expr(expression, operands, fp_accuracy, aux_reduc)
             # Data won't even try to be compressed, so buffers can be unitialized and reused
             data = np.empty(res_eval.schunk.chunksize, dtype=np.uint8)
             chunk_data = np.empty(res_eval.schunk.chunksize + blosc2.MAX_OVERHEAD, dtype=np.uint8)
diff --git a/tests/ndarray/test_reductions.py b/tests/ndarray/test_reductions.py
index e146bfb7..9c4f30cf 100644
--- a/tests/ndarray/test_reductions.py
+++ b/tests/ndarray/test_reductions.py
@@ -65,15 +65,24 @@ def test_reduce_bool(array_fixture, reduce_op):
     np.testing.assert_allclose(res, nres, atol=tol, rtol=tol)
 
 
-def test_reduce_where(array_fixture):
+# @pytest.mark.parametrize("reduce_op", ["sum"])
+@pytest.mark.parametrize("reduce_op", ["sum", "prod", "min", "max", "any", "all", "argmax", "argmin"])
+def test_reduce_where(array_fixture, reduce_op):
     a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture
-    # The next works
-    # res = blosc2.where(a1 < a2, a2, 0).sum()
-    # nres = ne_evaluate("sum(where(na1 < na2, na2, 0))")
-    # This does not work yet (it currently hangs)
-    res = blosc2.where(a1 < a2, a2, a1).sum()
-    nres = ne_evaluate("sum(where(na1 < na2, na2, na1))")
-    print("res:", res, nres)
+    if reduce_op == "prod":
+        # To avoid overflow, create a1 and a2 with small values
+        na1 = np.linspace(0, 0.1, np.prod(a1.shape), dtype=np.float32).reshape(a1.shape)
+        a1 = blosc2.asarray(na1)
+        na2 = np.linspace(0, 0.5, np.prod(a1.shape), dtype=np.float32).reshape(a1.shape)
+        a2 = blosc2.asarray(na2)
+        expr = a1 + a2 - 0.2
+        nres = eval("na1 + na2 - .2")
+    else:
+        expr = blosc2.where(a1 < a2, a2, a1)
+        nres = eval("np.where(na1 < na2, na2, na1)")
+    res = getattr(expr, reduce_op)()
+    nres = getattr(nres, reduce_op)()
+    # print("res:", res, nres, type(res), type(nres))
     tol = 1e-15 if a1.dtype == "float64" else 1e-6
     np.testing.assert_allclose(res, nres, atol=tol, rtol=tol)
 

From df449a256690b1a8bcacc137a22131d1174e2541 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 22 Jan 2026 07:11:07 +0100
Subject: [PATCH 122/123] Add new fp_accuracy param for LazyArray reductions
 (sum, prod et al)

---
 doc/reference/reduction_functions.rst |   2 +-
 src/blosc2/lazyexpr.py                | 156 +++++++++++++++++++++-----
 src/blosc2/ndarray.py                 |  15 +++
 tests/ndarray/test_lazyexpr.py        |  21 ++--
 tests/ndarray/test_reductions.py      |  17 +++
 5 files changed, 170 insertions(+), 41 deletions(-)

diff --git a/doc/reference/reduction_functions.rst b/doc/reference/reduction_functions.rst
index d9b61834..45f27abe 100644
--- a/doc/reference/reduction_functions.rst
+++ b/doc/reference/reduction_functions.rst
@@ -3,7 +3,7 @@ Reduction Functions
 
 Contrarily to lazy functions, reduction functions are evaluated eagerly, and the result is always a NumPy array (although this can be converted internally into an :ref:`NDArray <NDArray>` if you pass any :func:`blosc2.empty` arguments in ``kwargs``).
 
-Reduction operations can be used with any of :ref:`NDArray <NDArray>`, :ref:`C2Array <C2Array>`, :ref:`NDField <NDField>` and :ref:`LazyExpr <LazyExpr>`. Again, although these can be part of a :ref:`LazyExpr <LazyExpr>`, you must be aware that they are not lazy, but will be evaluated eagerly during the construction of a LazyExpr instance (this might change in the future).
+Reduction operations can be used with any of :ref:`NDArray <NDArray>`, :ref:`C2Array <C2Array>`, :ref:`NDField <NDField>` and :ref:`LazyExpr <LazyExpr>`. Again, although these can be part of a :ref:`LazyExpr <LazyExpr>`, you must be aware that they are not lazy, but will be evaluated eagerly during the construction of a LazyExpr instance (this might change in the future). When the input is a :ref:`LazyExpr`, reductions accept ``fp_accuracy`` to control floating-point accuracy, and it is forwarded to :func:`LazyExpr.compute`.
 
 .. currentmodule:: blosc2
 
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 1f601a08..c68ad3b6 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1248,6 +1248,7 @@ def fast_eval(  # noqa: C901
     ne_args: dict = kwargs.pop("_ne_args", {})
     if ne_args is None:
         ne_args = {}
+    fp_accuracy = kwargs.pop("fp_accuracy", blosc2.FPAccuracy.DEFAULT)
     dtype = kwargs.pop("dtype", None)
     where: dict | None = kwargs.pop("_where_args", None)
     if where is not None:
@@ -1306,11 +1307,10 @@ def fast_eval(  # noqa: C901
     if use_miniexpr:
         cparams = kwargs.pop("cparams", blosc2.CParams())
         # All values will be overwritten, so we can use an uninitialized array
-        fp_accuracy = kwargs.pop("fp_accuracy", blosc2.FPAccuracy.DEFAULT)
         res_eval = blosc2.uninit(shape, dtype, chunks=chunks, blocks=blocks, cparams=cparams, **kwargs)
         try:
             res_eval._set_pref_expr(expression, operands, fp_accuracy=fp_accuracy)
-            print("expr->miniexpr:", expression)
+            print("expr->miniexpr:", expression, fp_accuracy)
             # Data to compress is fetched from operands, so it can be uninitialized here
             data = np.empty(res_eval.schunk.chunksize, dtype=np.uint8)
             # Exercise prefilter for each chunk
@@ -1522,7 +1522,10 @@ def slices_eval(  # noqa: C901
             # Typically, we enter here when using UDFs, and out is a NumPy array.
             # Use operands to get the shape and chunks
             # operand will be a 'fake' NDArray just to get the necessary chunking information
+            fp_accuracy = kwargs.pop("fp_accuracy", None)
             temp = blosc2.empty(shape, dtype=dtype)
+            if fp_accuracy is not None:
+                kwargs["fp_accuracy"] = fp_accuracy
             chunks = temp.chunks
             del temp
 
@@ -1607,7 +1610,10 @@ def slices_eval(  # noqa: C901
                 if "chunks" in kwargs and (where is not None and len(where) < 2 and len(shape_) > 1):
                     # Remove the chunks argument if the where condition is not a tuple with two elements
                     kwargs.pop("chunks")
+                fp_accuracy = kwargs.pop("fp_accuracy", None)
                 out = blosc2.empty(shape_, dtype=dtype_, **kwargs)
+                if fp_accuracy is not None:
+                    kwargs["fp_accuracy"] = fp_accuracy
                 # Check if the in out partitions are well-behaved (i.e. no padding)
                 behaved = blosc2.are_partitions_behaved(out.shape, out.chunks, out.blocks)
         # Evaluate the expression using chunks of operands
@@ -1892,6 +1898,7 @@ def reduce_slices(  # noqa: C901
     ne_args: dict = kwargs.pop("_ne_args", {})
     if ne_args is None:
         ne_args = {}
+    fp_accuracy = kwargs.pop("fp_accuracy", blosc2.FPAccuracy.DEFAULT)
     where: dict | None = kwargs.pop("_where_args", None)
     reduce_op = reduce_args.pop("op")
     reduce_op_str = reduce_args.pop("op_str", None)
@@ -2014,7 +2021,6 @@ def reduce_slices(  # noqa: C901
     if use_miniexpr:
         # Experiments say that not splitting is best (at least on Apple Silicon M4 Pro)
         cparams = kwargs.pop("cparams", blosc2.CParams(splitmode=blosc2.SplitMode.NEVER_SPLIT))
-        fp_accuracy = kwargs.pop("fp_accuracy", blosc2.FPAccuracy.DEFAULT)
         # Create a fake NDArray just to drive the miniexpr evaluation (values won't be used)
         res_eval = blosc2.uninit(shape, dtype, chunks=chunks, blocks=blocks, cparams=cparams, **kwargs)
         # Compute the number of blocks in the result
@@ -2044,7 +2050,7 @@ def reduce_slices(  # noqa: C901
             else:
                 expression_miniexpr = f"{reduce_op_str}({expression})"
             res_eval._set_pref_expr(expression_miniexpr, operands, fp_accuracy, aux_reduc)
-            print("expr->miniexpr:", expression, reduce_op)
+            print("expr->miniexpr:", expression, reduce_op, fp_accuracy)
             # Data won't even try to be compressed, so buffers can be unitialized and reused
             data = np.empty(res_eval.schunk.chunksize, dtype=np.uint8)
             chunk_data = np.empty(res_eval.schunk.chunksize + blosc2.MAX_OVERHEAD, dtype=np.uint8)
@@ -2849,7 +2855,14 @@ def where(self, value1=None, value2=None):
         new_expr._dtype = dtype
         return new_expr
 
-    def sum(self, axis=None, dtype=None, keepdims=False, **kwargs):
+    def sum(
+        self,
+        axis=None,
+        dtype=None,
+        keepdims=False,
+        fp_accuracy: blosc2.FPAccuracy = blosc2.FPAccuracy.DEFAULT,
+        **kwargs,
+    ):
         reduce_args = {
             "op": ReduceOp.SUM,
             "op_str": "sum",
@@ -2857,9 +2870,16 @@ def sum(self, axis=None, dtype=None, keepdims=False, **kwargs):
             "dtype": dtype,
             "keepdims": keepdims,
         }
-        return self.compute(_reduce_args=reduce_args, **kwargs)
+        return self.compute(_reduce_args=reduce_args, fp_accuracy=fp_accuracy, **kwargs)
 
-    def prod(self, axis=None, dtype=None, keepdims=False, **kwargs):
+    def prod(
+        self,
+        axis=None,
+        dtype=None,
+        keepdims=False,
+        fp_accuracy: blosc2.FPAccuracy = blosc2.FPAccuracy.DEFAULT,
+        **kwargs,
+    ):
         reduce_args = {
             "op": ReduceOp.PROD,
             "op_str": "prod",
@@ -2867,7 +2887,7 @@ def prod(self, axis=None, dtype=None, keepdims=False, **kwargs):
             "dtype": dtype,
             "keepdims": keepdims,
         }
-        return self.compute(_reduce_args=reduce_args, **kwargs)
+        return self.compute(_reduce_args=reduce_args, fp_accuracy=fp_accuracy, **kwargs)
 
     def get_num_elements(self, axis, item):
         if hasattr(self, "_where_args") and len(self._where_args) == 1:
@@ -2889,9 +2909,22 @@ def get_num_elements(self, axis, item):
         axis = tuple(a if a >= 0 else a + len(shape) for a in axis)  # handle negative indexing
         return math.prod([shape[i] for i in axis])
 
-    def mean(self, axis=None, dtype=None, keepdims=False, **kwargs):
+    def mean(
+        self,
+        axis=None,
+        dtype=None,
+        keepdims=False,
+        fp_accuracy: blosc2.FPAccuracy = blosc2.FPAccuracy.DEFAULT,
+        **kwargs,
+    ):
         item = kwargs.pop("item", ())
-        total_sum = self.sum(axis=axis, dtype=dtype, keepdims=keepdims, item=item)
+        total_sum = self.sum(
+            axis=axis,
+            dtype=dtype,
+            keepdims=keepdims,
+            item=item,
+            fp_accuracy=fp_accuracy,
+        )
         num_elements = self.get_num_elements(axis, item)
         if num_elements == 0:
             raise ValueError("mean of an empty array is not defined")
@@ -2904,17 +2937,25 @@ def mean(self, axis=None, dtype=None, keepdims=False, **kwargs):
             out = blosc2.asarray(out, **kwargs)
         return out
 
-    def std(self, axis=None, dtype=None, keepdims=False, ddof=0, **kwargs):
+    def std(
+        self,
+        axis=None,
+        dtype=None,
+        keepdims=False,
+        ddof=0,
+        fp_accuracy: blosc2.FPAccuracy = blosc2.FPAccuracy.DEFAULT,
+        **kwargs,
+    ):
         item = kwargs.pop("item", ())
         if item == ():  # fast path
-            mean_value = self.mean(axis=axis, dtype=dtype, keepdims=True)
+            mean_value = self.mean(axis=axis, dtype=dtype, keepdims=True, fp_accuracy=fp_accuracy)
             expr = (self - mean_value) ** 2
         else:
-            mean_value = self.mean(axis=axis, dtype=dtype, keepdims=True, item=item)
+            mean_value = self.mean(axis=axis, dtype=dtype, keepdims=True, item=item, fp_accuracy=fp_accuracy)
             # TODO: Not optimal because we load the whole slice in memory. Would have to write
             #  a bespoke std function that executed within slice_eval to avoid this probably.
             expr = (self.slice(item) - mean_value) ** 2
-        out = expr.mean(axis=axis, dtype=dtype, keepdims=keepdims)
+        out = expr.mean(axis=axis, dtype=dtype, keepdims=keepdims, fp_accuracy=fp_accuracy)
         if ddof != 0:
             num_elements = self.get_num_elements(axis, item)
             out = np.sqrt(out * num_elements / (num_elements - ddof))
@@ -2928,17 +2969,25 @@ def std(self, axis=None, dtype=None, keepdims=False, ddof=0, **kwargs):
             out = blosc2.asarray(out, **kwargs)
         return out
 
-    def var(self, axis=None, dtype=None, keepdims=False, ddof=0, **kwargs):
+    def var(
+        self,
+        axis=None,
+        dtype=None,
+        keepdims=False,
+        ddof=0,
+        fp_accuracy: blosc2.FPAccuracy = blosc2.FPAccuracy.DEFAULT,
+        **kwargs,
+    ):
         item = kwargs.pop("item", ())
         if item == ():  # fast path
-            mean_value = self.mean(axis=axis, dtype=dtype, keepdims=True)
+            mean_value = self.mean(axis=axis, dtype=dtype, keepdims=True, fp_accuracy=fp_accuracy)
             expr = (self - mean_value) ** 2
         else:
-            mean_value = self.mean(axis=axis, dtype=dtype, keepdims=True, item=item)
+            mean_value = self.mean(axis=axis, dtype=dtype, keepdims=True, item=item, fp_accuracy=fp_accuracy)
             # TODO: Not optimal because we load the whole slice in memory. Would have to write
             #  a bespoke var function that executed within slice_eval to avoid this probably.
             expr = (self.slice(item) - mean_value) ** 2
-        out = expr.mean(axis=axis, dtype=dtype, keepdims=keepdims)
+        out = expr.mean(axis=axis, dtype=dtype, keepdims=keepdims, fp_accuracy=fp_accuracy)
         if ddof != 0:
             num_elements = self.get_num_elements(axis, item)
             out = out * num_elements / (num_elements - ddof)
@@ -2950,57 +2999,93 @@ def var(self, axis=None, dtype=None, keepdims=False, ddof=0, **kwargs):
             out = blosc2.asarray(out, **kwargs)
         return out
 
-    def min(self, axis=None, keepdims=False, **kwargs):
+    def min(
+        self,
+        axis=None,
+        keepdims=False,
+        fp_accuracy: blosc2.FPAccuracy = blosc2.FPAccuracy.DEFAULT,
+        **kwargs,
+    ):
         reduce_args = {
             "op": ReduceOp.MIN,
             "op_str": "min",
             "axis": axis,
             "keepdims": keepdims,
         }
-        return self.compute(_reduce_args=reduce_args, **kwargs)
+        return self.compute(_reduce_args=reduce_args, fp_accuracy=fp_accuracy, **kwargs)
 
-    def max(self, axis=None, keepdims=False, **kwargs):
+    def max(
+        self,
+        axis=None,
+        keepdims=False,
+        fp_accuracy: blosc2.FPAccuracy = blosc2.FPAccuracy.DEFAULT,
+        **kwargs,
+    ):
         reduce_args = {
             "op": ReduceOp.MAX,
             "op_str": "max",
             "axis": axis,
             "keepdims": keepdims,
         }
-        return self.compute(_reduce_args=reduce_args, **kwargs)
+        return self.compute(_reduce_args=reduce_args, fp_accuracy=fp_accuracy, **kwargs)
 
-    def any(self, axis=None, keepdims=False, **kwargs):
+    def any(
+        self,
+        axis=None,
+        keepdims=False,
+        fp_accuracy: blosc2.FPAccuracy = blosc2.FPAccuracy.DEFAULT,
+        **kwargs,
+    ):
         reduce_args = {
             "op": ReduceOp.ANY,
             "op_str": "any",
             "axis": axis,
             "keepdims": keepdims,
         }
-        return self.compute(_reduce_args=reduce_args, **kwargs)
+        return self.compute(_reduce_args=reduce_args, fp_accuracy=fp_accuracy, **kwargs)
 
-    def all(self, axis=None, keepdims=False, **kwargs):
+    def all(
+        self,
+        axis=None,
+        keepdims=False,
+        fp_accuracy: blosc2.FPAccuracy = blosc2.FPAccuracy.DEFAULT,
+        **kwargs,
+    ):
         reduce_args = {
             "op": ReduceOp.ALL,
             "op_str": "all",
             "axis": axis,
             "keepdims": keepdims,
         }
-        return self.compute(_reduce_args=reduce_args, **kwargs)
+        return self.compute(_reduce_args=reduce_args, fp_accuracy=fp_accuracy, **kwargs)
 
-    def argmax(self, axis=None, keepdims=False, **kwargs):
+    def argmax(
+        self,
+        axis=None,
+        keepdims=False,
+        fp_accuracy: blosc2.FPAccuracy = blosc2.FPAccuracy.DEFAULT,
+        **kwargs,
+    ):
         reduce_args = {
             "op": ReduceOp.ARGMAX,
             "axis": axis,
             "keepdims": keepdims,
         }
-        return self.compute(_reduce_args=reduce_args, **kwargs)
+        return self.compute(_reduce_args=reduce_args, fp_accuracy=fp_accuracy, **kwargs)
 
-    def argmin(self, axis=None, keepdims=False, **kwargs):
+    def argmin(
+        self,
+        axis=None,
+        keepdims=False,
+        fp_accuracy: blosc2.FPAccuracy = blosc2.FPAccuracy.DEFAULT,
+        **kwargs,
+    ):
         reduce_args = {
             "op": ReduceOp.ARGMIN,
             "axis": axis,
             "keepdims": keepdims,
         }
-        return self.compute(_reduce_args=reduce_args, **kwargs)
+        return self.compute(_reduce_args=reduce_args, fp_accuracy=fp_accuracy, **kwargs)
 
     def _eval_constructor(self, expression, constructor, operands):
         """Evaluate a constructor function inside a string expression."""
@@ -3174,6 +3259,7 @@ def compute(
             kwargs["_ne_args"] = self._ne_args
         if hasattr(self, "_where_args"):
             kwargs["_where_args"] = self._where_args
+        kwargs.setdefault("fp_accuracy", fp_accuracy)
         kwargs["dtype"] = self.dtype
         kwargs["shape"] = self.shape
         if hasattr(self, "_indices"):
@@ -3192,7 +3278,15 @@ def compute(
             and not isinstance(result, blosc2.NDArray)
         ):
             # Get rid of all the extra kwargs that are not accepted by blosc2.asarray
-            kwargs_not_accepted = {"_where_args", "_indices", "_order", "_ne_args", "dtype", "shape"}
+            kwargs_not_accepted = {
+                "_where_args",
+                "_indices",
+                "_order",
+                "_ne_args",
+                "dtype",
+                "shape",
+                "fp_accuracy",
+            }
             kwargs = {key: value for key, value in kwargs.items() if key not in kwargs_not_accepted}
             result = blosc2.asarray(result, **kwargs)
         return result
diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py
index 85d4dcb2..c6c0c10f 100644
--- a/src/blosc2/ndarray.py
+++ b/src/blosc2/ndarray.py
@@ -505,6 +505,9 @@ def sum(
         If set to True, the reduced axes are left in the result
         as dimensions with size one. With this option, the result will broadcast
         correctly against the input array.
+    fp_accuracy: :ref:`blosc2.FPAccuracy`, optional
+        Specifies the floating-point accuracy for reductions on :ref:`LazyExpr`.
+        Passed to :func:`LazyExpr.compute` when :paramref:`ndarr` is a LazyExpr.
     kwargs: dict, optional
         Additional keyword arguments supported by the :func:`empty` constructor.
 
@@ -600,6 +603,9 @@ def std(
         If set to True, the reduced axes are left in the result as
         dimensions with size one. This ensures that the result will broadcast correctly
         against the input array.
+    fp_accuracy: :ref:`blosc2.FPAccuracy`, optional
+        Specifies the floating-point accuracy for reductions on :ref:`LazyExpr`.
+        Passed to :func:`LazyExpr.compute` when :paramref:`ndarr` is a LazyExpr.
     kwargs: dict, optional
         Additional keyword arguments that are supported by the :func:`empty` constructor.
 
@@ -732,6 +738,9 @@ def min(
         If set to True, the axes which are reduced are left in the result as
         dimensions with size one. With this option, the result will broadcast correctly
         against the input array.
+    fp_accuracy: :ref:`blosc2.FPAccuracy`, optional
+        Specifies the floating-point accuracy for reductions on :ref:`LazyExpr`.
+        Passed to :func:`LazyExpr.compute` when :paramref:`ndarr` is a LazyExpr.
     kwargs: dict, optional
         Keyword arguments that are supported by the :func:`empty` constructor.
 
@@ -863,6 +872,9 @@ def argmin(
 
     keepdims: bool
         If True, reduced axis included in the result as singleton dimension. Otherwise, axis not included in the result. Default: False.
+    fp_accuracy: :ref:`blosc2.FPAccuracy`, optional
+        Specifies the floating-point accuracy for reductions on :ref:`LazyExpr`.
+        Passed to :func:`LazyExpr.compute` when :paramref:`ndarr` is a LazyExpr.
 
     Returns
     -------
@@ -890,6 +902,9 @@ def argmax(
 
     keepdims: bool
         If True, reduced axis included in the result as singleton dimension. Otherwise, axis not included in the result. Default: False.
+    fp_accuracy: :ref:`blosc2.FPAccuracy`, optional
+        Specifies the floating-point accuracy for reductions on :ref:`LazyExpr`.
+        Passed to :func:`LazyExpr.compute` when :paramref:`ndarr` is a LazyExpr.
 
     Returns
     -------
diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py
index f2d0c663..223d981c 100644
--- a/tests/ndarray/test_lazyexpr.py
+++ b/tests/ndarray/test_lazyexpr.py
@@ -278,18 +278,21 @@ def test_expression_with_constants(array_fixture):
         np.testing.assert_allclose(res[:], nres)
 
 
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
 @pytest.mark.parametrize("accuracy", [blosc2.FPAccuracy.LOW, blosc2.FPAccuracy.HIGH])
-def test_fp_precision(array_fixture, accuracy):
-    a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture
-    # Test with operands with same chunks and blocks
+def test_fp_accuracy(accuracy, dtype):
+    a1 = blosc2.linspace(0, 10, NITEMS, dtype=dtype, chunks=(1000,), blocks=(500,))
+    a2 = blosc2.linspace(0, 10, NITEMS, dtype=dtype, chunks=(1000,), blocks=(500,))
+    a3 = blosc2.linspace(0, 10, NITEMS, dtype=dtype, chunks=(1000,), blocks=(500,))
     expr = blosc2.sin(a1) ** 2 - blosc2.cos(a2) ** 2 + blosc2.sqrt(a3)
-    # All precisions in miniexpr should be quite good for this expression
     res = expr.compute(fp_accuracy=accuracy)
-    nres = ne_evaluate("sin(na1) ** 2 - cos(na2) ** 2 + sqrt(na3)")
-    if na1.dtype == np.float32:
-        np.testing.assert_allclose(res[:], nres, rtol=1e-6, atol=1e-6)
-    else:
-        np.testing.assert_allclose(res[:], nres)
+    na1 = a1[:]
+    na2 = a2[:]
+    na3 = a3[:]
+    nres = eval("np.sin(na1) ** 2 - np.cos(na2) ** 2 + np.sqrt(na3)")
+    # print("res dtypes:", res.dtype, nres.dtype)
+    tol = 1e-6 if a1.dtype == "float32" else 1e-15
+    np.testing.assert_allclose(res, nres, atol=tol, rtol=tol)
 
 
 @pytest.mark.parametrize("compare_expressions", [True, False])
diff --git a/tests/ndarray/test_reductions.py b/tests/ndarray/test_reductions.py
index 9c4f30cf..f2443b95 100644
--- a/tests/ndarray/test_reductions.py
+++ b/tests/ndarray/test_reductions.py
@@ -87,6 +87,23 @@ def test_reduce_where(array_fixture, reduce_op):
     np.testing.assert_allclose(res, nres, atol=tol, rtol=tol)
 
 
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+@pytest.mark.parametrize("accuracy", [blosc2.FPAccuracy.LOW, blosc2.FPAccuracy.HIGH])
+def test_fp_accuracy(accuracy, dtype):
+    a1 = blosc2.linspace(0, 10, NITEMS, dtype=dtype, chunks=(1000,), blocks=(500,))
+    a2 = blosc2.linspace(0, 10, NITEMS, dtype=dtype, chunks=(1000,), blocks=(500,))
+    a3 = blosc2.linspace(0, 10, NITEMS, dtype=dtype, chunks=(1000,), blocks=(500,))
+    expr = blosc2.sin(a1) ** 2 - blosc2.cos(a2) ** 2 + blosc2.sqrt(a3)
+    res = expr.sum(fp_accuracy=accuracy)
+    na1 = a1[:]
+    na2 = a2[:]
+    na3 = a3[:]
+    nres = eval("np.sin(na1) ** 2 - np.cos(na2) ** 2 + np.sqrt(na3)").sum()
+    # print("res:", res, nres, type(res), type(nres))
+    tol = 1e-6 if a1.dtype == "float32" else 1e-15
+    np.testing.assert_allclose(res, nres, atol=tol, rtol=tol)
+
+
 @pytest.mark.parametrize(
     "reduce_op", ["sum", "prod", "mean", "std", "var", "min", "max", "any", "all", "argmax", "argmin"]
 )

From 302124a7876d2742d79905d3c1a06a5c3b79cef0 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 22 Jan 2026 07:25:00 +0100
Subject: [PATCH 123/123] FPAccuracy.LOW -> FPAccuracy.MEDIUM

---
 src/blosc2/__init__.py           | 6 +++---
 src/blosc2/blosc2_ext.pyx        | 2 +-
 tests/ndarray/test_lazyexpr.py   | 2 +-
 tests/ndarray/test_reductions.py | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py
index 95cf3fc0..9a5488de 100644
--- a/src/blosc2/__init__.py
+++ b/src/blosc2/__init__.py
@@ -123,9 +123,9 @@ class FPAccuracy(Enum):
     #: Use 1.0 ULPs (Units in the Last Place) for floating point functions
     HIGH = 1
     #: Use 3.5 ULPs (Units in the Last Place) for floating point functions
-    LOW = 2
-    #: Use default accuracy. This is LOW, which is enough for most applications.
-    DEFAULT = LOW
+    MEDIUM = 2
+    #: Use default accuracy. This is MEDIUM, which should be enough for most applications.
+    DEFAULT = MEDIUM
 
 
 from .blosc2_ext import (
diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index 4c1512ff..cd4e5697 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -2932,7 +2932,7 @@ cdef class NDArray:
         udata.ninputs = ninputs
         cdef me_eval_params* eval_params = <me_eval_params*> malloc(sizeof(me_eval_params))
         eval_params.disable_simd = False
-        eval_params.simd_ulp_mode = ME_SIMD_ULP_3_5 if fp_accuracy == blosc2.FPAccuracy.LOW else ME_SIMD_ULP_1
+        eval_params.simd_ulp_mode = ME_SIMD_ULP_3_5 if fp_accuracy == blosc2.FPAccuracy.MEDIUM else ME_SIMD_ULP_1
         udata.eval_params = eval_params
         udata.array = self.array
         cdef void* aux_reduc_ptr = NULL
diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py
index 223d981c..c92d53be 100644
--- a/tests/ndarray/test_lazyexpr.py
+++ b/tests/ndarray/test_lazyexpr.py
@@ -279,7 +279,7 @@ def test_expression_with_constants(array_fixture):
 
 
 @pytest.mark.parametrize("dtype", [np.float32, np.float64])
-@pytest.mark.parametrize("accuracy", [blosc2.FPAccuracy.LOW, blosc2.FPAccuracy.HIGH])
+@pytest.mark.parametrize("accuracy", [blosc2.FPAccuracy.MEDIUM, blosc2.FPAccuracy.HIGH])
 def test_fp_accuracy(accuracy, dtype):
     a1 = blosc2.linspace(0, 10, NITEMS, dtype=dtype, chunks=(1000,), blocks=(500,))
     a2 = blosc2.linspace(0, 10, NITEMS, dtype=dtype, chunks=(1000,), blocks=(500,))
diff --git a/tests/ndarray/test_reductions.py b/tests/ndarray/test_reductions.py
index f2443b95..d1031093 100644
--- a/tests/ndarray/test_reductions.py
+++ b/tests/ndarray/test_reductions.py
@@ -88,7 +88,7 @@ def test_reduce_where(array_fixture, reduce_op):
 
 
 @pytest.mark.parametrize("dtype", [np.float32, np.float64])
-@pytest.mark.parametrize("accuracy", [blosc2.FPAccuracy.LOW, blosc2.FPAccuracy.HIGH])
+@pytest.mark.parametrize("accuracy", [blosc2.FPAccuracy.MEDIUM, blosc2.FPAccuracy.HIGH])
 def test_fp_accuracy(accuracy, dtype):
     a1 = blosc2.linspace(0, 10, NITEMS, dtype=dtype, chunks=(1000,), blocks=(500,))
     a2 = blosc2.linspace(0, 10, NITEMS, dtype=dtype, chunks=(1000,), blocks=(500,))