From 1ced1f241439315115528266d18b5f22349b8040 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 21 Jun 2026 00:14:33 +0000 Subject: [PATCH] Vectorize small-offset overlapping copies on AArch64 with NEON tbl For small match offsets (overlap period < 16 bytes), `ZSTD_wildcopy` falls back to an 8-byte-per-iteration `COPY8` loop. On AArch64 this dominates decompression of integer and low-cardinality columns, whose matches frequently have offsets of 4 or 8 bytes (e.g. repeated fixed-width values). Build the repeating pattern once in a NEON register and store 16 bytes per iteration, advancing the pattern with a single `vqtbl1q_u8` table lookup and no load inside the loop. The scalar `COPY8` path is kept for other targets. On Graviton 4 (Neoverse-V2), ZSTD level 1 decompression of ClickBench `hits` columns speeds up: UserID 2.30x, AdvEngineID 1.46x, ClientIP 1.42x, RegionID 1.27x; string and incompressible columns are unchanged. Verified byte-exact and clean under ASan and UBSan. Co-Authored-By: Claude Opus 4.8 (1M context) --- lib/common/zstd_internal.h | 52 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h index c1647689b94..8df2df23c2c 100644 --- a/lib/common/zstd_internal.h +++ b/lib/common/zstd_internal.h @@ -222,9 +222,61 @@ void ZSTD_wildcopy(void* dst, const void* src, size_t length, ZSTD_overlap_e con if (ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) { /* Handle short offset copies. */ +#if defined(ZSTD_ARCH_ARM_NEON) + /* The source and destination overlap with period `diff` (1 <= diff < 16). + * Build one 16-byte register holding the repeating pattern, then store 16 + * bytes per iteration (vs 8 with COPY8), advancing the pattern with a single + * table lookup. There is no load inside the loop. + * init[diff][j] = j % diff : build the pattern from a 16-byte load + * adv[diff][j] = (16 + j) % diff : shift the pattern forward by 16 bytes */ + static const uint8_t init[16][16] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}, + { 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0}, + { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}, + { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0}, + { 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3}, + { 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1}, + { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6}, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5}, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4}, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3}, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2}, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1}, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0}, + }; + static const uint8_t adv[16][16] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}, + { 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1}, + { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}, + { 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1}, + { 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}, + { 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3}, + { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}, + { 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4}, + { 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1}, + { 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + { 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7}, + { 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5}, + { 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3}, + { 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1}, + }; + uint8x16_t pattern = vqtbl1q_u8(vld1q_u8((const uint8_t*)ip), vld1q_u8(init[diff])); + uint8x16_t const advance = vld1q_u8(adv[diff]); + do { + vst1q_u8((uint8_t*)op, pattern); + pattern = vqtbl1q_u8(pattern, advance); + op += 16; + } while (op < oend); +#else do { COPY8(op, ip); } while (op < oend); +#endif } else { assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN); /* Separate out the first COPY16() call because the copy length is