diff --git a/.github/workflows/ios-packaging.yml b/.github/workflows/ios-packaging.yml index afe8051c82..92005bda20 100644 --- a/.github/workflows/ios-packaging.yml +++ b/.github/workflows/ios-packaging.yml @@ -6,6 +6,7 @@ on: - '.github/workflows/ios-packaging.yml' - 'maven/codenameone-maven-plugin/**' - 'vm/ByteCodeTranslator/**' + - 'Ports/iOSPort/**' - 'scripts/build-ios-app.sh' - 'scripts/run-ios-ui-tests.sh' - 'scripts/run-ios-native-tests.sh' @@ -18,6 +19,7 @@ on: - '.github/workflows/ios-packaging.yml' - 'maven/codenameone-maven-plugin/**' - 'vm/ByteCodeTranslator/**' + - 'Ports/iOSPort/**' - 'scripts/build-ios-app.sh' - 'scripts/run-ios-ui-tests.sh' - 'scripts/run-ios-native-tests.sh' @@ -68,7 +70,9 @@ jobs: id: setup_hash run: | set -euo pipefail - echo "hash=$(shasum -a 256 scripts/setup-workspace.sh | awk '{print $1}')" >> "$GITHUB_OUTPUT" + SETUP_HASH=$(shasum -a 256 scripts/setup-workspace.sh | awk '{print $1}') + IOS_PORT_HASH=$(find Ports/iOSPort/src -type f -name '*.java' | sort | xargs shasum -a 256 | shasum -a 256 | awk '{print $1}') + echo "hash=${SETUP_HASH}-${IOS_PORT_HASH}" >> "$GITHUB_OUTPUT" - name: Set TMPDIR run: echo "TMPDIR=${{ runner.temp }}" >> $GITHUB_ENV diff --git a/CodenameOne/src/com/codename1/impl/CodenameOneImplementation.java b/CodenameOne/src/com/codename1/impl/CodenameOneImplementation.java index 4777cde002..1190843222 100644 --- a/CodenameOne/src/com/codename1/impl/CodenameOneImplementation.java +++ b/CodenameOne/src/com/codename1/impl/CodenameOneImplementation.java @@ -85,6 +85,7 @@ import com.codename1.ui.util.ImageIO; import com.codename1.util.AsyncResource; import com.codename1.util.FailureCallback; +import com.codename1.util.Simd; import com.codename1.util.StringUtil; import com.codename1.util.SuccessCallback; @@ -8397,6 +8398,12 @@ public ImageIO getImageIO() { return null; } + /// Creates the SIMD implementation for this platform. + /// Ports may override this to provide accelerated SIMD behavior. 
+ public Simd createSimd() { + return new Simd(); + } + /// Workaround for XMLVM bug public boolean instanceofObjArray(Object o) { return o instanceof Object[]; diff --git a/CodenameOne/src/com/codename1/ui/CN.java b/CodenameOne/src/com/codename1/ui/CN.java index 9bcce255ea..62470ab8da 100644 --- a/CodenameOne/src/com/codename1/ui/CN.java +++ b/CodenameOne/src/com/codename1/ui/CN.java @@ -36,6 +36,7 @@ import com.codename1.ui.events.WindowEvent; import com.codename1.ui.geom.Dimension; import com.codename1.ui.geom.Rectangle; +import com.codename1.util.Simd; import com.codename1.util.RunnableWithResultSync; import java.io.IOException; @@ -1032,6 +1033,11 @@ public static String getPlatformName() { return Display.impl.getPlatformName(); } + /// Returns the SIMD API for the current platform. + public static Simd getSimd() { + return Display.getInstance().getSimd(); + } + /// Opens the device Dialer application with the given phone number /// diff --git a/CodenameOne/src/com/codename1/ui/Display.java b/CodenameOne/src/com/codename1/ui/Display.java index c56aaf780d..34bf4c4034 100644 --- a/CodenameOne/src/com/codename1/ui/Display.java +++ b/CodenameOne/src/com/codename1/ui/Display.java @@ -60,6 +60,7 @@ import com.codename1.ui.util.EventDispatcher; import com.codename1.ui.util.ImageIO; import com.codename1.util.AsyncResource; +import com.codename1.util.Simd; import com.codename1.util.RunnableWithResultSync; import com.codename1.util.SuccessCallback; @@ -216,6 +217,7 @@ public final class Display extends CN1Constants { long time; private int transitionDelay = -1; private String selectedVirtualKeyboard = null; + private Simd simd; private CrashReport crashReporter; private EventDispatcher errorHandler; private boolean inNativeUI; @@ -343,6 +345,7 @@ public static void init(Object m) { commandBehaviour = impl.getCommandBehavior(); } impl = (CodenameOneImplementation) ImplementationFactory.getInstance().createImplementation(); + INSTANCE.simd = null; 
impl.setDisplayLock(lock); impl.initImpl(m); @@ -493,6 +496,18 @@ CodenameOneImplementation getImplementation() { return impl; } + /// Returns the SIMD API instance bound to the current implementation. + public Simd getSimd() { + if (simd == null) { + Simd created = impl.createSimd(); + if (created == null) { + created = new Simd(); + } + simd = created; + } + return simd; + } + /// Indicates the maximum frames the API will try to draw every second /// by default this is set to 10. The advantage of limiting /// framerate is to allow the CPU to perform other tasks besides drawing. diff --git a/CodenameOne/src/com/codename1/util/Base64.java b/CodenameOne/src/com/codename1/util/Base64.java index e4e6b6b740..5945ebe424 100644 --- a/CodenameOne/src/com/codename1/util/Base64.java +++ b/CodenameOne/src/com/codename1/util/Base64.java @@ -39,6 +39,7 @@ public abstract class Base64 { private static final byte[] decodeMap = new byte[256]; private static final int[] decodeMapInt = new int[256]; + private static final int SIMD_SCRATCH_INTS = 192; static { for (int i = 0; i < decodeMap.length; i++) { @@ -79,7 +80,7 @@ public static byte[] decode(byte[] in, int len) { return new byte[0]; } int maxOutputLength = (len / 4) * 3 + 3; - byte[] out = new byte[maxOutputLength]; + byte[] out = allocByteMaybeSimd(maxOutputLength); int outputLength = decode(in, len, out); if (outputLength < 0) { return null; @@ -87,7 +88,7 @@ public static byte[] decode(byte[] in, int len) { if (outputLength == out.length) { return out; } - byte[] trimmed = new byte[outputLength]; + byte[] trimmed = allocByteMaybeSimd(outputLength); System.arraycopy(out, 0, trimmed, 0, outputLength); return trimmed; } @@ -229,8 +230,9 @@ private static int decodeNoWhitespace(byte[] in, int len, byte[] out) { int outIndex = 0; int fullLen = len - (pad > 0 ? 
4 : 0); int[] decodeMapLocal = decodeMapInt; + int simdFullLen = 0; - for (int i = 0; i < fullLen; i += 4) { + for (int i = simdFullLen; i < fullLen; i += 4) { int c0 = in[i] & 0xff; int c1 = in[i + 1] & 0xff; int c2 = in[i + 2] & 0xff; @@ -342,7 +344,7 @@ public static String encodeNoNewline(byte[] in) { return ""; } int outputLength = ((inputLength + 2) / 3) * 4; - byte[] out = new byte[outputLength]; + byte[] out = allocByteMaybeSimd(outputLength); encodeNoNewline(in, out); return com.codename1.util.StringUtil.newString(out, 0, outputLength); } @@ -442,4 +444,359 @@ public static int encodeNoNewline(byte[] in, byte[] out) { } return outIndex; } + + // ---- SIMD constant tables (lazily initialized) ---- + private static int[] simdEncConst; + + // Encode constant offsets (each sub-array is 64 ints) + private static final int ENC_K26 = 0; // threshold 26 + private static final int ENC_K52 = 64; // threshold 52 + private static final int ENC_K62 = 128; // threshold 62 + private static final int ENC_OFF_AZ = 192; // +65 for A-Z + private static final int ENC_OFF_az = 256; // +71 for a-z + private static final int ENC_OFF_09 = 320; // -4 for 0-9 + private static final int ENC_OFF_PLUS = 384; // -19 for + + private static final int ENC_OFF_SLASH = 448; // -16 for / + // masks (16 ints each at offset 512) + private static final int ENC_M03 = 512; + private static final int ENC_M0F = 528; + private static final int ENC_M3F = 544; + private static final int ENC_CONST_SIZE = 560; + + private static byte[] simdMask; + + private static int[] getSimdEncConst(Simd simd) { + int[] c = simdEncConst; + if (c != null) { + return c; + } + c = simd.allocInt(ENC_CONST_SIZE); + fillRange(c, ENC_K26, 64, 26); + fillRange(c, ENC_K52, 64, 52); + fillRange(c, ENC_K62, 64, 62); + fillRange(c, ENC_OFF_AZ, 64, 65); + fillRange(c, ENC_OFF_az, 64, 71); + fillRange(c, ENC_OFF_09, 64, -4); + fillRange(c, ENC_OFF_PLUS, 64, -19); + fillRange(c, ENC_OFF_SLASH, 64, -16); + fillRange(c, ENC_M03, 16, 
/// SIMD-optimized Base64 encoding with explicit offsets and caller scratch.
/// Uses generic Simd int-domain operations to extract 6-bit indices and
/// map them to ASCII via branchless compare/select.
///
/// The input is consumed in three phases:
/// 1. batches of 16 triplets (48 input bytes -> 64 output bytes) through `Simd` calls,
/// 2. the remaining complete triplets through the scalar `map` lookup table,
/// 3. a trailing 1- or 2-byte remainder, padded with `=`.
///
/// Scratch layout: a single SIMD-allocated `int[]` buffer of at least 192 ints.
/// Working regions within scratch:
/// - [0..47] : input bytes unpacked to ints (3 stripes of 16)
/// - [48..111] : output indices / ASCII values (4 stripes of 16)
/// - [112..175] : temporaries
///
/// Parameters:
/// - `in` / `inOffset` / `inLength`: source bytes and the range to encode
/// - `out` / `outOffset`: destination buffer and the index of the first byte written
/// - `scratch`: working buffer, validated by `requireScratch` (non-null, at least 192 ints)
///
/// Returns the number of bytes written, `((inLength + 2) / 3) * 4`, or `0` when
/// `inLength` is `0` (in which case `scratch` is not validated).
///
/// Throws `IllegalArgumentException` (from `requireScratch`) when `scratch` is
/// `null` or too short and `inLength` is not `0`.
///
/// NOTE(review): this method does not check that `out` can hold the result; the
/// `@DisableNullChecksAndArrayBoundsChecks` annotation presumably removes the
/// runtime bounds checks on some targets - confirm callers size `out` correctly.
///
/// NOTE(review): the compare mask comes from `getSimdMask`, which hands out one
/// statically cached buffer that every call writes to. Concurrent calls would
/// share it - confirm callers are serialized or that this is acceptable.
@DisableDebugInfo
@DisableNullChecksAndArrayBoundsChecks
public static int encodeNoNewlineSimd(byte[] in, int inOffset, int inLength, byte[] out, int outOffset, int[] scratch) {
    int outputLength = ((inLength + 2) / 3) * 4;
    if (inLength == 0) {
        return 0;
    }
    requireScratch(scratch);
    Simd simd = Simd.get();
    // ec: broadcast constant table (thresholds, ASCII offsets, bit masks); mask: compare result buffer
    int[] ec = getSimdEncConst(simd);
    byte[] mask = getSimdMask(simd);

    // end3 is the absolute input index just past the last complete 3-byte group
    int end3 = inOffset + inLength - (inLength % 3);
    int si = inOffset;
    int di = outOffset;

    // Process 16 triplets (48 input bytes -> 64 output bytes) per iteration
    // (si < simdEnd is equivalent to si + 48 <= end3)
    int simdEnd = end3 - 48 + 1;
    while (si < simdEnd) {
        // 1. Scatter input bytes into 3 int stripes (b0, b1, b2)
        for (int j = 0; j < 16; j++) {
            scratch[j] = in[si + j * 3] & 0xff;
            scratch[16 + j] = in[si + j * 3 + 1] & 0xff;
            scratch[32 + j] = in[si + j * 3 + 2] & 0xff;
        }

        // 2. Extract 4 six-bit index stripes using SIMD int ops
        //    Results land in [48..63], [64..79], [80..95], [96..111];
        //    [112..127] and [128..143] hold the two partial terms of each OR.
        // idx0 = b0 >> 2
        simd.shrLogical(scratch, 0, 2, scratch, 48, 16);

        // idx1 = ((b0 & 0x03) << 4) | (b1 >> 4)
        simd.and(scratch, 0, ec, ENC_M03, scratch, 112, 16);
        simd.shl(scratch, 112, 4, scratch, 112, 16);
        simd.shrLogical(scratch, 16, 4, scratch, 128, 16);
        simd.or(scratch, 112, scratch, 128, scratch, 64, 16);

        // idx2 = ((b1 & 0x0f) << 2) | (b2 >> 6)
        simd.and(scratch, 16, ec, ENC_M0F, scratch, 112, 16);
        simd.shl(scratch, 112, 2, scratch, 112, 16);
        simd.shrLogical(scratch, 32, 6, scratch, 128, 16);
        simd.or(scratch, 112, scratch, 128, scratch, 80, 16);

        // idx3 = b2 & 0x3f
        simd.and(scratch, 32, ec, ENC_M3F, scratch, 96, 16);

        // 3. Map all 64 indices to ASCII in batch
        //    Thresholds are applied from the widest range to the narrowest, so each
        //    later select overrides the offset chosen by the earlier ones.
        // Initialize offset accumulator [112..175] with '/' offset (-16)
        System.arraycopy(ec, ENC_OFF_SLASH, scratch, 112, 64);

        // eq62 -> use '+' offset
        simd.cmpEq(scratch, 48, ec, ENC_K62, mask, 0, 64);
        simd.select(mask, 0, ec, ENC_OFF_PLUS, scratch, 112, scratch, 112, 64);

        // lt62 -> use '0'-'9' offset
        simd.cmpLt(scratch, 48, ec, ENC_K62, mask, 0, 64);
        simd.select(mask, 0, ec, ENC_OFF_09, scratch, 112, scratch, 112, 64);

        // lt52 -> use 'a'-'z' offset
        simd.cmpLt(scratch, 48, ec, ENC_K52, mask, 0, 64);
        simd.select(mask, 0, ec, ENC_OFF_az, scratch, 112, scratch, 112, 64);

        // lt26 -> use 'A'-'Z' offset
        simd.cmpLt(scratch, 48, ec, ENC_K26, mask, 0, 64);
        simd.select(mask, 0, ec, ENC_OFF_AZ, scratch, 112, scratch, 112, 64);

        // ascii = indices + offset
        simd.add(scratch, 48, scratch, 112, scratch, 48, 64);

        // 4. Interleave 4 output stripes into output bytes
        for (int j = 0; j < 16; j++) {
            out[di + j * 4] = (byte) scratch[48 + j];
            out[di + j * 4 + 1] = (byte) scratch[64 + j];
            out[di + j * 4 + 2] = (byte) scratch[80 + j];
            out[di + j * 4 + 3] = (byte) scratch[96 + j];
        }

        si += 48;
        di += 64;
    }

    // Scalar tail for remaining complete triplets
    // (map is the class-level lookup table - presumably the Base64 alphabet; defined outside this method)
    byte[] mapLocal = map;
    while (si < end3) {
        int b0 = in[si] & 0xff;
        int b1 = in[si + 1] & 0xff;
        int b2 = in[si + 2] & 0xff;
        out[di] = mapLocal[b0 >> 2];
        out[di + 1] = mapLocal[((b0 & 0x03) << 4) | (b1 >> 4)];
        out[di + 2] = mapLocal[((b1 & 0x0f) << 2) | (b2 >> 6)];
        out[di + 3] = mapLocal[b2 & 0x3f];
        si += 3;
        di += 4;
    }

    // Handle 1- or 2-byte remainder with padding
    // (the switch value is the count of input bytes left after the last full triplet)
    switch (inOffset + inLength - end3) {
        case 1: {
            int b0 = in[si] & 0xff;
            out[di] = mapLocal[b0 >> 2];
            out[di + 1] = mapLocal[(b0 & 0x03) << 4];
            out[di + 2] = '=';
            out[di + 3] = '=';
            break;
        }
        case 2: {
            int b0 = in[si] & 0xff;
            int b1 = in[si + 1] & 0xff;
            out[di] = mapLocal[b0 >> 2];
            out[di + 1] = mapLocal[((b0 & 0x03) << 4) | (b1 >> 4)];
            out[di + 2] = mapLocal[(b1 & 0x0f) << 2];
            out[di + 3] = '=';
            break;
        }
        default:
            break;
    }
    return outputLength;
}
/// SIMD-optimized Base64 decoding for no-whitespace input.
/// The ASCII-to-6-bit mapping is done with the scalar `decodeMapInt` table while
/// the input is split into stripes; `Simd` int-domain shift/or operations then
/// combine the 6-bit values into output bytes.
///
/// The input is consumed in three phases:
/// 1. batches of 16 quads (64 input bytes -> 48 output bytes) through `Simd` calls,
/// 2. the remaining complete, unpadded quads through scalar code,
/// 3. the final quad when it carries `=` padding.
///
/// Scratch layout: a single SIMD-allocated `int[]` buffer of at least 192 ints.
/// Working regions:
/// - [0..63] : decoded 6-bit values (4 stripes of 16)
/// - [64..111] : output byte values (3 stripes of 16)
/// - [112..127] : temporary for the right-shifted term of each OR
///
/// Parameters:
/// - `in` / `inOffset` / `inLength`: source characters and the range to decode
/// - `out` / `outOffset`: destination buffer and the index of the first byte written
/// - `scratch`: working buffer, validated by `requireScratch` (non-null, at least 192 ints)
///
/// Returns decoded bytes written, `(inLength / 4) * 3 - pad`; `0` when `inLength`
/// is `0`; or `-1` for invalid input (length not a multiple of 4, a character whose
/// `decodeMapInt` entry is negative, or inconsistent padding in the last quad).
/// Bytes may already have been written to `out` when `-1` is returned.
///
/// Throws `IllegalArgumentException` (from `requireScratch`) when `scratch` is
/// `null` or too short. The check runs only after the empty-input and
/// length-multiple-of-4 tests, so those two early returns do not validate `scratch`.
///
/// NOTE(review): this method does not check that `out` can hold the result; the
/// `@DisableNullChecksAndArrayBoundsChecks` annotation presumably removes the
/// runtime bounds checks on some targets - confirm callers size `out` correctly.
@DisableDebugInfo
@DisableNullChecksAndArrayBoundsChecks
public static int decodeNoWhitespaceSimd(byte[] in, int inOffset, int inLength, byte[] out, int outOffset, int[] scratch) {
    if (inLength == 0) {
        return 0;
    }
    if ((inLength & 0x3) != 0) {
        return -1;
    }
    requireScratch(scratch);

    // Count trailing '=' characters (at most the last two positions are inspected)
    int pad = 0;
    if (in[inOffset + inLength - 1] == '=') {
        pad++;
        if (inLength > 1 && in[inOffset + inLength - 2] == '=') {
            pad++;
        }
    }
    if (pad > 2) {
        return -1;
    }
    int outLength = (inLength / 4) * 3 - pad;
    if (outLength <= 0) {
        return 0;
    }

    Simd simd = Simd.get();
    int[] decodeMapLocal = decodeMapInt;

    // fullLen excludes the last quad when it is padded; that quad is handled separately below
    int fullLen = inLength - (pad > 0 ? 4 : 0);
    int fullEnd = inOffset + fullLen;
    int si = inOffset;
    int di = outOffset;

    // Process 16 quads (64 input bytes -> 48 output bytes) per iteration
    // (si < simdEnd is equivalent to si + 64 <= fullEnd)
    int simdEnd = fullEnd - 64 + 1;
    while (si < simdEnd) {
        // 1. De-interleave and decode: scatter 64 input bytes into 4 stripes,
        //    converting ASCII to 6-bit values using the scalar decode table
        boolean invalid = false;
        for (int j = 0; j < 16; j++) {
            int v0 = decodeMapLocal[in[si + j * 4] & 0xff];
            int v1 = decodeMapLocal[in[si + j * 4 + 1] & 0xff];
            int v2 = decodeMapLocal[in[si + j * 4 + 2] & 0xff];
            int v3 = decodeMapLocal[in[si + j * 4 + 3] & 0xff];
            scratch[j] = v0;
            scratch[16 + j] = v1;
            scratch[32 + j] = v2;
            scratch[48 + j] = v3;
            // OR-ing the four values is negative exactly when at least one of them is negative
            if ((v0 | v1 | v2 | v3) < 0) {
                invalid = true;
            }
        }
        if (invalid) {
            return -1;
        }

        // 2. Combine 4 six-bit values into 3 bytes using SIMD int ops
        //    Bits above the low 8 are left in the ints and dropped by the (byte) cast in step 3.
        // o0 = (d0 << 2) | (d1 >> 4)
        simd.shl(scratch, 0, 2, scratch, 64, 16);
        simd.shrLogical(scratch, 16, 4, scratch, 112, 16);
        simd.or(scratch, 64, scratch, 112, scratch, 64, 16);

        // o1 = (d1 << 4) | (d2 >> 2)
        simd.shl(scratch, 16, 4, scratch, 80, 16);
        simd.shrLogical(scratch, 32, 2, scratch, 112, 16);
        simd.or(scratch, 80, scratch, 112, scratch, 80, 16);

        // o2 = (d2 << 6) | d3
        simd.shl(scratch, 32, 6, scratch, 96, 16);
        simd.or(scratch, 96, scratch, 48, scratch, 96, 16);

        // 3. Interleave 3 output stripes into output bytes
        for (int j = 0; j < 16; j++) {
            out[di + j * 3] = (byte) scratch[64 + j];
            out[di + j * 3 + 1] = (byte) scratch[80 + j];
            out[di + j * 3 + 2] = (byte) scratch[96 + j];
        }

        si += 64;
        di += 48;
    }

    // Scalar tail for remaining complete quads
    while (si < fullEnd) {
        int c0 = in[si] & 0xff;
        int c1 = in[si + 1] & 0xff;
        int c2 = in[si + 2] & 0xff;
        int c3 = in[si + 3] & 0xff;
        int b0 = decodeMapLocal[c0];
        int b1 = decodeMapLocal[c1];
        int b2 = decodeMapLocal[c2];
        int b3 = decodeMapLocal[c3];
        if ((b0 | b1 | b2 | b3) < 0) {
            return -1;
        }
        // Pack the four 6-bit values into one 24-bit quantum, then emit its three bytes
        int quantum = (b0 << 18) | (b1 << 12) | (b2 << 6) | b3;
        out[di++] = (byte) ((quantum >> 16) & 0xff);
        out[di++] = (byte) ((quantum >> 8) & 0xff);
        out[di++] = (byte) (quantum & 0xff);
        si += 4;
    }

    // Handle last quad with padding
    if (pad > 0) {
        int i = inOffset + inLength - 4;
        int c0 = in[i] & 0xff;
        int c1 = in[i + 1] & 0xff;
        int b0 = decodeMapLocal[c0];
        int b1 = decodeMapLocal[c1];
        if ((b0 | b1) < 0) {
            return -1;
        }
        // First output byte is always present in a padded quad
        out[di++] = (byte) ((b0 << 2) | (b1 >> 4));
        if (pad == 2) {
            // "xx==": one output byte only
            return (in[i + 2] == '=' && in[i + 3] == '=') ? outLength : -1;
        }
        // "xxx=": a second output byte built from the third character
        if (in[i + 3] != '=') {
            return -1;
        }
        int b2 = decodeMapLocal[in[i + 2] & 0xff];
        if (b2 < 0) {
            return -1;
        }
        out[di] = (byte) ((b1 << 4) | (b2 >> 2));
    }

    return outLength;
}
+ */ +package com.codename1.util; + +import com.codename1.annotations.Concrete; +import com.codename1.ui.CN; + +/// Portable SIMD API with Java fallback implementations. +@Concrete(name = "com.codename1.impl.ios.IOSSimd") +public class Simd { + + /// Returns the singleton instance of the Simd class. Equivalent to `CN.getSimd();` + public static Simd get() { + return CN.getSimd(); + } + + /// Returns true if SIMD instructions are natively supported + /// if this returns false the APIs in this class would still work + /// using fallback loop code + public boolean isSupported() { + return false; + } + + /// Allocates an aligned memory block for efficient SIMD + /// operations. All operations MUST be performed on aligned + /// arrays and shouldn't use arrays created with `new`. Operations + /// on unaligned arrays might produce undefined results. + public byte[] allocByte(int size) { + if (size < 16) { + throw new IllegalArgumentException("size must be >= 16"); + } + return new byte[size]; + } + + /// Allocates an aligned memory block for efficient SIMD + /// operations. All operations MUST be performed on aligned + /// arrays and shouldn't use arrays created with `new`. Operations + /// on unaligned arrays might produce undefined results. + public int[] allocInt(int size) { + if (size < 16) { + throw new IllegalArgumentException("size must be >= 16"); + } + return new int[size]; + } + + /// Allocates an aligned memory block for efficient SIMD + /// operations. All operations MUST be performed on aligned + /// arrays and shouldn't use arrays created with `new`. Operations + /// on unaligned arrays might produce undefined results. 
+ public float[] allocFloat(int size) { + if (size < 16) { + throw new IllegalArgumentException("size must be >= 16"); + } + return new float[size]; + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void add(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = clampByte(srcA[i] + srcB[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void sub(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = clampByte(srcA[i] - srcB[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void mul(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = clampByte(srcA[i] * srcB[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void min(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] < srcB[i] ? srcA[i] : srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void max(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] > srcB[i] ? 
srcA[i] : srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void abs(byte[] src, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + int v = src[i]; + if (v == Byte.MIN_VALUE) { + dst[i] = Byte.MAX_VALUE; + } else { + dst[i] = (byte)Math.abs(v); + } + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void clamp(byte[] src, byte[] dst, byte minValue, byte maxValue, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + int v = src[i]; + if (v < minValue) { + dst[i] = minValue; + } else if (v > maxValue) { + dst[i] = maxValue; + } else { + dst[i] = (byte)v; + } + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void and(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)(srcA[i] & srcB[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void or(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)(srcA[i] | srcB[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void xor(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)(srcA[i] ^ srcB[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void not(byte[] src, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)(~src[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void cmpEq(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + 
dstMask[i] = srcA[i] == srcB[i] ? (byte)-1 : (byte)0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void cmpLt(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dstMask[i] = srcA[i] < srcB[i] ? (byte)-1 : (byte)0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void cmpGt(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dstMask[i] = srcA[i] > srcB[i] ? (byte)-1 : (byte)0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void cmpRange(byte[] src, byte minValue, byte maxValue, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + int v = src[i]; + dstMask[i] = v >= minValue && v <= maxValue ? (byte)-1 : (byte)0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void select(byte[] mask, byte[] trueValues, byte[] falseValues, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = mask[i] != 0 ? 
trueValues[i] : falseValues[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void shl(byte[] src, int bits, byte[] dst, int offset, int length) { + int shift = bits & 7; + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)((src[i] & 0xff) << shift); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void shrLogical(byte[] src, int bits, byte[] dst, int offset, int length) { + int shift = bits & 7; + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)((src[i] & 0xff) >>> shift); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void addWrapping(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)(srcA[i] + srcB[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void subWrapping(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)(srcA[i] - srcB[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void unpackUnsignedByteToInt(byte[] src, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = src[i] & 0xff; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void unpackUnsignedByteToInt(byte[] src, int srcOffset, int[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = src[srcOffset + i] & 0xff; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void packIntToByteSaturating(int[] src, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = clampByte(src[i]); + } + } + + /// Exposes SIMD APIs directly **all 
arrays MUST be aligned arrays** + public void packIntToByteTruncate(int[] src, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)src[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void packIntToByteTruncate(int[] src, int srcOffset, byte[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = (byte)src[srcOffset + i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void permuteBytes(byte[] src, byte[] indices, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + int idx = indices[i]; + dst[i] = idx >= 0 && idx < src.length ? src[idx] : 0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void add(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] + srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void add(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = srcA[srcAOffset + i] + srcB[srcBOffset + i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void sub(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] - srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void mul(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] * srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void min(int[] srcA, int[] srcB, int[] dst, int offset, int 
length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] < srcB[i] ? srcA[i] : srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void max(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] > srcB[i] ? srcA[i] : srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void abs(int[] src, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + int v = src[i]; + dst[i] = v == Integer.MIN_VALUE ? Integer.MAX_VALUE : Math.abs(v); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void clamp(int[] src, int[] dst, int minValue, int maxValue, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + int v = src[i]; + if (v < minValue) { + dst[i] = minValue; + } else if (v > maxValue) { + dst[i] = maxValue; + } else { + dst[i] = v; + } + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void and(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] & srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void and(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = srcA[srcAOffset + i] & srcB[srcBOffset + i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void or(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] | srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void or(int[] srcA, int srcAOffset, 
int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = srcA[srcAOffset + i] | srcB[srcBOffset + i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void xor(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] ^ srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void not(int[] src, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = ~src[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void shl(int[] src, int bits, int[] dst, int offset, int length) { + int shift = bits & 31; + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = src[i] << shift; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void shl(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, int length) { + int shift = bits & 31; + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = src[srcOffset + i] << shift; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void shrLogical(int[] src, int bits, int[] dst, int offset, int length) { + int shift = bits & 31; + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = src[i] >>> shift; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void shrLogical(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, int length) { + int shift = bits & 31; + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = src[srcOffset + i] >>> shift; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void shrArithmetic(int[] src, int bits, int[] dst, int offset, int length) { + int shift = bits & 31; + for (int i = 
offset, end = offset + length; i < end; i++) { + dst[i] = src[i] >> shift; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void cmpEq(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dstMask[i] = srcA[i] == srcB[i] ? (byte)-1 : (byte)0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void cmpEq(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, byte[] dstMask, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dstMask[dstOffset + i] = srcA[srcAOffset + i] == srcB[srcBOffset + i] ? (byte)-1 : (byte)0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void cmpLt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dstMask[i] = srcA[i] < srcB[i] ? (byte)-1 : (byte)0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void cmpLt(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, byte[] dstMask, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dstMask[dstOffset + i] = srcA[srcAOffset + i] < srcB[srcBOffset + i] ? (byte)-1 : (byte)0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void cmpGt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dstMask[i] = srcA[i] > srcB[i] ? (byte)-1 : (byte)0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void select(byte[] mask, int[] trueValues, int[] falseValues, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = mask[i] != 0 ? 
trueValues[i] : falseValues[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void select(byte[] mask, int maskOffset, int[] trueValues, int trueOffset, int[] falseValues, int falseOffset, int[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = mask[maskOffset + i] != 0 ? trueValues[trueOffset + i] : falseValues[falseOffset + i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public int sum(int[] src, int offset, int length) { + int out = 0; + for (int i = offset, end = offset + length; i < end; i++) { + out += src[i]; + } + return out; + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public int dot(int[] srcA, int[] srcB, int offset, int length) { + int out = 0; + for (int i = offset, end = offset + length; i < end; i++) { + out += srcA[i] * srcB[i]; + } + return out; + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void add(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] + srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void sub(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] - srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void mul(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] * srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void min(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = Math.min(srcA[i], srcB[i]); + } + } + + /// Exposes SIMD 
APIs directly **all arrays MUST be aligned arrays** + public void max(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = Math.max(srcA[i], srcB[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void abs(float[] src, float[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = Math.abs(src[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void clamp(float[] src, float[] dst, float minValue, float maxValue, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + float v = src[i]; + if (v < minValue) { + dst[i] = minValue; + } else if (v > maxValue) { + dst[i] = maxValue; + } else { + dst[i] = v; + } + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public float sum(float[] src, int offset, int length) { + float out = 0f; + for (int i = offset, end = offset + length; i < end; i++) { + out += src[i]; + } + return out; + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public float dot(float[] srcA, float[] srcB, int offset, int length) { + float out = 0f; + for (int i = offset, end = offset + length; i < end; i++) { + out += srcA[i] * srcB[i]; + } + return out; + } + + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. 
+    protected final void validateBinaryByte(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) {
+        // Null checks run first and follow parameter order, so the first missing array is the one reported.
+        validateNotNull(srcA, "srcA");
+        validateNotNull(srcB, "srcB");
+        validateNotNull(dst, "dst");
+        // All three arrays are addressed through the same offset/length window.
+        final int sizeA = srcA.length;
+        final int sizeB = srcB.length;
+        final int sizeDst = dst.length;
+        validateRange(sizeA, offset, length, "srcA");
+        validateRange(sizeB, offset, length, "srcB");
+        validateRange(sizeDst, offset, length, "dst");
+    }
+
+    /// Argument check for byte comparisons that write a byte mask: `srcA`, `srcB` and
+    /// `dstMask` must be non-null and the `offset`/`length` window must fit inside each of them.
+    /// Throws `NullPointerException` or `ArrayIndexOutOfBoundsException` via the shared helpers.
+    /// Used internally for validation in the simulator; no validation occurs on the devices.
+    protected final void validateMaskBinaryByte(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) {
+        validateNotNull(srcA, "srcA");
+        validateNotNull(srcB, "srcB");
+        validateNotNull(dstMask, "dstMask");
+        final int sizeA = srcA.length;
+        final int sizeB = srcB.length;
+        final int sizeMask = dstMask.length;
+        validateRange(sizeA, offset, length, "srcA");
+        validateRange(sizeB, offset, length, "srcB");
+        validateRange(sizeMask, offset, length, "dstMask");
+    }
+
+    /// Argument check for a single byte source that produces a byte mask: `src` and `dstMask`
+    /// must be non-null and the `offset`/`length` window must fit inside both.
+    /// Used internally for validation in the simulator; no validation occurs on the devices.
+    protected final void validateRangeMaskByte(byte[] src, byte[] dstMask, int offset, int length) {
+        validateNotNull(src, "src");
+        validateNotNull(dstMask, "dstMask");
+        final int sizeSrc = src.length;
+        final int sizeMask = dstMask.length;
+        validateRange(sizeSrc, offset, length, "src");
+        validateRange(sizeMask, offset, length, "dstMask");
+    }
+
+    /// Argument check for a byte select: the mask, both candidate arrays and the destination
+    /// must be non-null and the `offset`/`length` window must fit inside each of them.
+    /// Used internally for validation in the simulator; no validation occurs on the devices.
+    protected final void validateSelectByte(byte[] mask, byte[] trueValues, byte[] falseValues, byte[] dst, int offset, int length) {
+        // Null checks run first and follow parameter order, so the first missing array is the one reported.
+        validateNotNull(mask, "mask");
+        validateNotNull(trueValues, "trueValues");
+        validateNotNull(falseValues, "falseValues");
+        validateNotNull(dst, "dst");
+        // All four arrays are addressed through the same offset/length window.
+        final int sizeMask = mask.length;
+        final int sizeTrue = trueValues.length;
+        final int sizeFalse = falseValues.length;
+        final int sizeDst = dst.length;
+        validateRange(sizeMask, offset, length, "mask");
+        validateRange(sizeTrue, offset, length, "trueValues");
+        validateRange(sizeFalse, offset, length, "falseValues");
+        validateRange(sizeDst, offset, length, "dst");
+    }
+
+    /// Argument check for byte-to-int conversions: `src` and `dst` must be non-null and the
+    /// `offset`/`length` window must fit inside both arrays.
+    /// Used internally for validation in the simulator; no validation occurs on the devices.
+    protected final void validateByteToInt(byte[] src, int[] dst, int offset, int length) {
+        validateNotNull(src, "src");
+        validateNotNull(dst, "dst");
+        final int sizeSrc = src.length;
+        final int sizeDst = dst.length;
+        validateRange(sizeSrc, offset, length, "src");
+        validateRange(sizeDst, offset, length, "dst");
+    }
+
+    /// Argument check for int-to-byte conversions: `src` and `dst` must be non-null and the
+    /// `offset`/`length` window must fit inside both arrays.
+    /// Used internally for validation in the simulator; no validation occurs on the devices.
+    protected final void validateIntToByte(int[] src, byte[] dst, int offset, int length) {
+        validateNotNull(src, "src");
+        validateNotNull(dst, "dst");
+        final int sizeSrc = src.length;
+        final int sizeDst = dst.length;
+        validateRange(sizeSrc, offset, length, "src");
+        validateRange(sizeDst, offset, length, "dst");
+    }
+
+    /// Argument check for byte permutation: `src`, `indices` and `dst` must be non-null, and the
+    /// `offset`/`length` window must fit inside `indices` and `dst`.
+    /// `src` is only checked for null; its length is not compared against the window.
+    /// NOTE(review): presumably `src` is addressed through the index values rather than the
+    /// window - confirm against `permuteBytes`.
+    /// Used internally for validation in the simulator; no validation occurs on the devices.
+    protected final void validatePermuteByte(byte[] src, byte[] indices, byte[] dst, int offset, int length) {
+        validateNotNull(src, "src");
+        validateNotNull(indices, "indices");
+        validateNotNull(dst, "dst");
+        final int sizeIndices = indices.length;
+        final int sizeDst = dst.length;
+        validateRange(sizeIndices, offset, length, "indices");
+        validateRange(sizeDst, offset, length, "dst");
+    }
+
+    /// Argument check for unary byte operations: `src` and `dst` must be non-null and the
+    /// `offset`/`length` window must fit inside both arrays.
+    /// Used internally for validation in the simulator; no validation occurs on the devices.
+    protected final void validateUnaryByte(byte[] src, byte[] dst, int offset, int length) {
+        // Null checks run before any length is read, source first.
+        validateNotNull(src, "src");
+        validateNotNull(dst, "dst");
+        // Both arrays are addressed through the same offset/length window.
+        final int sizeSrc = src.length;
+        final int sizeDst = dst.length;
+        validateRange(sizeSrc, offset, length, "src");
+        validateRange(sizeDst, offset, length, "dst");
+    }
+
+    /// Argument check for binary int operations: `srcA`, `srcB` and `dst` must be non-null and
+    /// the `offset`/`length` window must fit inside each of them.
+    /// Throws `NullPointerException` or `ArrayIndexOutOfBoundsException` via the shared helpers.
+    /// Used internally for validation in the simulator; no validation occurs on the devices.
+    protected final void validateBinaryInt(int[] srcA, int[] srcB, int[] dst, int offset, int length) {
+        validateNotNull(srcA, "srcA");
+        validateNotNull(srcB, "srcB");
+        validateNotNull(dst, "dst");
+        final int sizeA = srcA.length;
+        final int sizeB = srcB.length;
+        final int sizeDst = dst.length;
+        validateRange(sizeA, offset, length, "srcA");
+        validateRange(sizeB, offset, length, "srcB");
+        validateRange(sizeDst, offset, length, "dst");
+    }
+
+    /// Argument check for unary int operations: `src` and `dst` must be non-null and the
+    /// `offset`/`length` window must fit inside both arrays.
+    /// Used internally for validation in the simulator; no validation occurs on the devices.
+    protected final void validateUnaryInt(int[] src, int[] dst, int offset, int length) {
+        validateNotNull(src, "src");
+        validateNotNull(dst, "dst");
+        final int sizeSrc = src.length;
+        final int sizeDst = dst.length;
+        validateRange(sizeSrc, offset, length, "src");
+        validateRange(sizeDst, offset, length, "dst");
+    }
+
+    /// Argument check for int comparisons that write a byte mask: `srcA`, `srcB` and `dstMask`
+    /// must be non-null and the `offset`/`length` window must fit inside each of them.
+    /// Used internally for validation in the simulator; no validation occurs on the devices.
+    protected final void validateMaskBinaryInt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) {
+        validateNotNull(srcA, "srcA");
+        validateNotNull(srcB, "srcB");
+        validateNotNull(dstMask, "dstMask");
+        final int sizeA = srcA.length;
+        final int sizeB = srcB.length;
+        final int sizeMask = dstMask.length;
+        validateRange(sizeA, offset, length, "srcA");
+        validateRange(sizeB, offset, length, "srcB");
+        validateRange(sizeMask, offset, length, "dstMask");
+    }
+
+    /// Argument check for an int select driven by a byte mask: the mask, both candidate arrays
+    /// and the destination must be non-null and the `offset`/`length` window must fit inside
+    /// each of them.
+    /// Used internally for validation in the simulator; no validation occurs on the devices.
+    protected final void validateSelectInt(byte[] mask, int[] trueValues, int[] falseValues, int[] dst, int offset, int length) {
+        // Null checks run first and follow parameter order, so the first missing array is the one reported.
+        validateNotNull(mask, "mask");
+        validateNotNull(trueValues, "trueValues");
+        validateNotNull(falseValues, "falseValues");
+        validateNotNull(dst, "dst");
+        // All four arrays are addressed through the same offset/length window.
+        final int sizeMask = mask.length;
+        final int sizeTrue = trueValues.length;
+        final int sizeFalse = falseValues.length;
+        final int sizeDst = dst.length;
+        validateRange(sizeMask, offset, length, "mask");
+        validateRange(sizeTrue, offset, length, "trueValues");
+        validateRange(sizeFalse, offset, length, "falseValues");
+        validateRange(sizeDst, offset, length, "dst");
+    }
+
+    /// Argument check for int reductions over one array: `src` must be non-null and the
+    /// `offset`/`length` window must fit inside it.
+    /// Used internally for validation in the simulator; no validation occurs on the devices.
+    protected final void validateReductionInt(int[] src, int offset, int length) {
+        validateNotNull(src, "src");
+        final int sizeSrc = src.length;
+        validateRange(sizeSrc, offset, length, "src");
+    }
+
+    /// Argument check for an int dot product: `srcA` and `srcB` must be non-null and the
+    /// `offset`/`length` window must fit inside both arrays.
+    /// Used internally for validation in the simulator; no validation occurs on the devices.
+    protected final void validateDotInt(int[] srcA, int[] srcB, int offset, int length) {
+        validateNotNull(srcA, "srcA");
+        validateNotNull(srcB, "srcB");
+        final int sizeA = srcA.length;
+        final int sizeB = srcB.length;
+        validateRange(sizeA, offset, length, "srcA");
+        validateRange(sizeB, offset, length, "srcB");
+    }
+
+    /// Argument check for binary float operations: `srcA`, `srcB` and `dst` must be non-null and
+    /// the `offset`/`length` window must fit inside each of them.
+    /// Throws `NullPointerException` or `ArrayIndexOutOfBoundsException` via the shared helpers.
+    /// Used internally for validation in the simulator; no validation occurs on the devices.
+    protected final void validateBinaryFloat(float[] srcA, float[] srcB, float[] dst, int offset, int length) {
+        validateNotNull(srcA, "srcA");
+        validateNotNull(srcB, "srcB");
+        validateNotNull(dst, "dst");
+        final int sizeA = srcA.length;
+        final int sizeB = srcB.length;
+        final int sizeDst = dst.length;
+        validateRange(sizeA, offset, length, "srcA");
+        validateRange(sizeB, offset, length, "srcB");
+        validateRange(sizeDst, offset, length, "dst");
+    }
+
+    /// Argument check for unary float operations: `src` and `dst` must be non-null and the
+    /// `offset`/`length` window must fit inside both arrays.
+    /// Used internally for validation in the simulator; no validation occurs on the devices.
+    protected final void validateUnaryFloat(float[] src, float[] dst, int offset, int length) {
+        // Null checks run before any length is read, source first.
+        validateNotNull(src, "src");
+        validateNotNull(dst, "dst");
+        // Both arrays are addressed through the same offset/length window.
+        final int sizeSrc = src.length;
+        final int sizeDst = dst.length;
+        validateRange(sizeSrc, offset, length, "src");
+        validateRange(sizeDst, offset, length, "dst");
+    }
+
+    /// Argument check for float reductions over one array: `src` must be non-null and the
+    /// `offset`/`length` window must fit inside it.
+    /// Used internally for validation in the simulator; no validation occurs on the devices.
+    protected final void validateReductionFloat(float[] src, int offset, int length) {
+        validateNotNull(src, "src");
+        final int sizeSrc = src.length;
+        validateRange(sizeSrc, offset, length, "src");
+    }
+
+    /// Argument check for a float dot product: `srcA` and `srcB` must be non-null and the
+    /// `offset`/`length` window must fit inside both arrays.
+    /// Used internally for validation in the simulator; no validation occurs on the devices.
+    protected final void validateDotFloat(float[] srcA, float[] srcB, int offset, int length) {
+        validateNotNull(srcA, "srcA");
+        validateNotNull(srcB, "srcB");
+        final int sizeA = srcA.length;
+        final int sizeB = srcB.length;
+        validateRange(sizeA, offset, length, "srcA");
+        validateRange(sizeB, offset, length, "srcB");
+    }
+
+    /// Throws a `NullPointerException` with the message `<name> is null` when `o` is null;
+    /// returns normally otherwise. `name` is the argument name to report.
+    /// Used internally for validation in the simulator; no validation occurs on the devices.
+    protected final void validateNotNull(Object o, String name) {
+        if (o != null) {
+            return;
+        }
+        throw new NullPointerException(name + " is null");
+    }
+
+    /// Checks that `length` elements starting at `offset` fit inside an array of `arrayLength`
+    /// elements. A negative `offset` or `length`, or a window that runs past the end of the
+    /// array, raises an `ArrayIndexOutOfBoundsException` whose message includes `name`.
+    /// Used internally for validation in the simulator; no validation occurs on the devices.
+ protected final void validateRange(int arrayLength, int offset, int length, String name) { + if (offset < 0 || length < 0 || offset > arrayLength || arrayLength - offset < length) { + throw new ArrayIndexOutOfBoundsException(name + " invalid range offset=" + offset + " length=" + length + " size=" + arrayLength); + } + } + + private byte clampByte(int value) { + if (value > Byte.MAX_VALUE) { + return Byte.MAX_VALUE; + } + if (value < Byte.MIN_VALUE) { + return Byte.MIN_VALUE; + } + return (byte)value; + } +} diff --git a/Ports/JavaSE/src/com/codename1/impl/javase/JavaSEPort.java b/Ports/JavaSE/src/com/codename1/impl/javase/JavaSEPort.java index bcf4c861e1..148699a232 100644 --- a/Ports/JavaSE/src/com/codename1/impl/javase/JavaSEPort.java +++ b/Ports/JavaSE/src/com/codename1/impl/javase/JavaSEPort.java @@ -118,6 +118,7 @@ import com.codename1.ui.util.UITimer; import com.codename1.util.AsyncResource; import com.codename1.util.Callback; +import com.codename1.util.Simd; import com.jhlabs.image.GaussianFilter; import java.awt.*; import java.awt.datatransfer.Clipboard; @@ -10753,6 +10754,11 @@ public String getPlatformName() { return platformName; } + @Override + public Simd createSimd() { + return new JavaSESimd(); + } + /** * @inheritDoc */ diff --git a/Ports/JavaSE/src/com/codename1/impl/javase/JavaSESimd.java b/Ports/JavaSE/src/com/codename1/impl/javase/JavaSESimd.java new file mode 100644 index 0000000000..d5cddbdd8b --- /dev/null +++ b/Ports/JavaSE/src/com/codename1/impl/javase/JavaSESimd.java @@ -0,0 +1,551 @@ +/* + * Copyright (c) 2026, Codename One and/or its affiliates. All rights reserved. + */ +package com.codename1.impl.javase; + +import com.codename1.util.Simd; + +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +/** + * JavaSE SIMD implementation used for simulator validation and fallback execution. 
+ */ +public class JavaSESimd extends Simd { + private final Set allocatedIds = Collections.synchronizedSet(new HashSet()); + + @Override + public boolean isSupported() { + return true; + } + + @Override + public byte[] allocByte(int size) { + byte[] out = super.allocByte(size); + allocatedIds.add(Integer.valueOf(System.identityHashCode(out))); + return out; + } + + @Override + public int[] allocInt(int size) { + int[] out = super.allocInt(size); + allocatedIds.add(Integer.valueOf(System.identityHashCode(out))); + return out; + } + + @Override + public float[] allocFloat(int size) { + float[] out = super.allocFloat(size); + allocatedIds.add(Integer.valueOf(System.identityHashCode(out))); + return out; + } + + @Override + public void add(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.add(srcA, srcB, dst, offset, length); + } + + @Override + public void sub(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.sub(srcA, srcB, dst, offset, length); + } + + @Override + public void mul(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.mul(srcA, srcB, dst, offset, length); + } + + @Override + public void min(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.min(srcA, srcB, dst, offset, length); + } + + @Override + public void max(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.max(srcA, srcB, dst, offset, length); + } + + @Override + public void abs(byte[] src, byte[] dst, int offset, int length) { + 
validateUnaryByte(src, dst, offset, length); + validateRegistered(src, dst); + super.abs(src, dst, offset, length); + } + + @Override + public void clamp(byte[] src, byte[] dst, byte minValue, byte maxValue, int offset, int length) { + if (minValue > maxValue) { + throw new IllegalArgumentException("minValue > maxValue"); + } + validateUnaryByte(src, dst, offset, length); + validateRegistered(src, dst); + super.clamp(src, dst, minValue, maxValue, offset, length); + } + + @Override + public void and(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.and(srcA, srcB, dst, offset, length); + } + + @Override + public void or(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.or(srcA, srcB, dst, offset, length); + } + + @Override + public void xor(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.xor(srcA, srcB, dst, offset, length); + } + + @Override + public void not(byte[] src, byte[] dst, int offset, int length) { + validateUnaryByte(src, dst, offset, length); + validateRegistered(src, dst); + super.not(src, dst, offset, length); + } + + @Override + public void cmpEq(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { + validateMaskBinaryByte(srcA, srcB, dstMask, offset, length); + validateRegistered(srcA, srcB, dstMask); + super.cmpEq(srcA, srcB, dstMask, offset, length); + } + + @Override + public void cmpLt(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { + validateMaskBinaryByte(srcA, srcB, dstMask, offset, length); + validateRegistered(srcA, srcB, dstMask); + super.cmpLt(srcA, srcB, dstMask, offset, length); + } + + @Override + public void cmpGt(byte[] srcA, byte[] srcB, byte[] dstMask, 
int offset, int length) { + validateMaskBinaryByte(srcA, srcB, dstMask, offset, length); + validateRegistered(srcA, srcB, dstMask); + super.cmpGt(srcA, srcB, dstMask, offset, length); + } + + @Override + public void cmpRange(byte[] src, byte minValue, byte maxValue, byte[] dstMask, int offset, int length) { + if (minValue > maxValue) { + throw new IllegalArgumentException("minValue > maxValue"); + } + validateRangeMaskByte(src, dstMask, offset, length); + validateRegistered(src, dstMask); + super.cmpRange(src, minValue, maxValue, dstMask, offset, length); + } + + @Override + public void select(byte[] mask, byte[] trueValues, byte[] falseValues, byte[] dst, int offset, int length) { + validateSelectByte(mask, trueValues, falseValues, dst, offset, length); + validateRegistered(mask, trueValues, falseValues, dst); + super.select(mask, trueValues, falseValues, dst, offset, length); + } + + @Override + public void unpackUnsignedByteToInt(byte[] src, int[] dst, int offset, int length) { + validateByteToInt(src, dst, offset, length); + validateRegistered(src, dst); + super.unpackUnsignedByteToInt(src, dst, offset, length); + } + + @Override + public void packIntToByteSaturating(int[] src, byte[] dst, int offset, int length) { + validateIntToByte(src, dst, offset, length); + validateRegistered(src, dst); + super.packIntToByteSaturating(src, dst, offset, length); + } + + @Override + public void packIntToByteTruncate(int[] src, byte[] dst, int offset, int length) { + validateIntToByte(src, dst, offset, length); + validateRegistered(src, dst); + super.packIntToByteTruncate(src, dst, offset, length); + } + + @Override + public void packIntToByteTruncate(int[] src, int srcOffset, byte[] dst, int dstOffset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, srcOffset, length, "src"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(src, dst); + super.packIntToByteTruncate(src, srcOffset, dst, 
dstOffset, length); + } + + @Override + public void permuteBytes(byte[] src, byte[] indices, byte[] dst, int offset, int length) { + validatePermuteByte(src, indices, dst, offset, length); + validateRegistered(src, indices, dst); + super.permuteBytes(src, indices, dst, offset, length); + } + + @Override + public void add(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.add(srcA, srcB, dst, offset, length); + } + + @Override + public void sub(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.sub(srcA, srcB, dst, offset, length); + } + + @Override + public void mul(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.mul(srcA, srcB, dst, offset, length); + } + + @Override + public void min(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.min(srcA, srcB, dst, offset, length); + } + + @Override + public void max(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.max(srcA, srcB, dst, offset, length); + } + + @Override + public void abs(int[] src, int[] dst, int offset, int length) { + validateUnaryInt(src, dst, offset, length); + validateRegistered(src, dst); + super.abs(src, dst, offset, length); + } + + @Override + public void clamp(int[] src, int[] dst, int minValue, int maxValue, int offset, int length) { + if (minValue > maxValue) { + throw new IllegalArgumentException("minValue > maxValue"); + } + validateUnaryInt(src, dst, offset, length); + validateRegistered(src, dst); + super.clamp(src, dst, minValue, maxValue, 
offset, length); + } + + @Override + public void and(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.and(srcA, srcB, dst, offset, length); + } + + @Override + public void and(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dst, "dst"); + validateRange(srcA.length, srcAOffset, length, "srcA"); + validateRange(srcB.length, srcBOffset, length, "srcB"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(srcA, srcB, dst); + super.and(srcA, srcAOffset, srcB, srcBOffset, dst, dstOffset, length); + } + + @Override + public void or(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.or(srcA, srcB, dst, offset, length); + } + + @Override + public void or(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dst, "dst"); + validateRange(srcA.length, srcAOffset, length, "srcA"); + validateRange(srcB.length, srcBOffset, length, "srcB"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(srcA, srcB, dst); + super.or(srcA, srcAOffset, srcB, srcBOffset, dst, dstOffset, length); + } + + @Override + public void xor(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.xor(srcA, srcB, dst, offset, length); + } + + @Override + public void not(int[] src, int[] dst, int offset, int length) { + validateUnaryInt(src, dst, offset, length); + validateRegistered(src, dst); + super.not(src, dst, offset, length); + } + + @Override + public void shl(int[] src, int 
bits, int[] dst, int offset, int length) { + validateUnaryInt(src, dst, offset, length); + validateRegistered(src, dst); + super.shl(src, bits, dst, offset, length); + } + + @Override + public void shl(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, srcOffset, length, "src"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(src, dst); + super.shl(src, srcOffset, bits, dst, dstOffset, length); + } + + @Override + public void shrLogical(int[] src, int bits, int[] dst, int offset, int length) { + validateUnaryInt(src, dst, offset, length); + validateRegistered(src, dst); + super.shrLogical(src, bits, dst, offset, length); + } + + @Override + public void shrLogical(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, srcOffset, length, "src"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(src, dst); + super.shrLogical(src, srcOffset, bits, dst, dstOffset, length); + } + + @Override + public void shrArithmetic(int[] src, int bits, int[] dst, int offset, int length) { + validateUnaryInt(src, dst, offset, length); + validateRegistered(src, dst); + super.shrArithmetic(src, bits, dst, offset, length); + } + + @Override + public void cmpEq(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { + validateMaskBinaryInt(srcA, srcB, dstMask, offset, length); + validateRegistered(srcA, srcB, dstMask); + super.cmpEq(srcA, srcB, dstMask, offset, length); + } + + @Override + public void cmpLt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { + validateMaskBinaryInt(srcA, srcB, dstMask, offset, length); + validateRegistered(srcA, srcB, dstMask); + super.cmpLt(srcA, srcB, dstMask, offset, length); + } + + @Override + public void cmpGt(int[] srcA, int[] srcB, byte[] dstMask, 
int offset, int length) { + validateMaskBinaryInt(srcA, srcB, dstMask, offset, length); + validateRegistered(srcA, srcB, dstMask); + super.cmpGt(srcA, srcB, dstMask, offset, length); + } + + @Override + public void select(byte[] mask, int[] trueValues, int[] falseValues, int[] dst, int offset, int length) { + validateSelectInt(mask, trueValues, falseValues, dst, offset, length); + validateRegistered(mask, trueValues, falseValues, dst); + super.select(mask, trueValues, falseValues, dst, offset, length); + } + + @Override + public int sum(int[] src, int offset, int length) { + validateReductionInt(src, offset, length); + validateRegistered(src); + return super.sum(src, offset, length); + } + + @Override + public int dot(int[] srcA, int[] srcB, int offset, int length) { + validateDotInt(srcA, srcB, offset, length); + validateRegistered(srcA, srcB); + return super.dot(srcA, srcB, offset, length); + } + + @Override + public void add(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + validateBinaryFloat(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.add(srcA, srcB, dst, offset, length); + } + + @Override + public void sub(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + validateBinaryFloat(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.sub(srcA, srcB, dst, offset, length); + } + + @Override + public void mul(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + validateBinaryFloat(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.mul(srcA, srcB, dst, offset, length); + } + + @Override + public void min(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + validateBinaryFloat(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.min(srcA, srcB, dst, offset, length); + } + + @Override + public void max(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + 
validateBinaryFloat(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.max(srcA, srcB, dst, offset, length); + } + + @Override + public void abs(float[] src, float[] dst, int offset, int length) { + validateUnaryFloat(src, dst, offset, length); + validateRegistered(src, dst); + super.abs(src, dst, offset, length); + } + + @Override + public void clamp(float[] src, float[] dst, float minValue, float maxValue, int offset, int length) { + if (minValue > maxValue) { + throw new IllegalArgumentException("minValue > maxValue"); + } + validateUnaryFloat(src, dst, offset, length); + validateRegistered(src, dst); + super.clamp(src, dst, minValue, maxValue, offset, length); + } + + @Override + public float sum(float[] src, int offset, int length) { + validateReductionFloat(src, offset, length); + validateRegistered(src); + return super.sum(src, offset, length); + } + + @Override + public float dot(float[] srcA, float[] srcB, int offset, int length) { + validateDotFloat(srcA, srcB, offset, length); + validateRegistered(srcA, srcB); + return super.dot(srcA, srcB, offset, length); + } + + @Override + public void shl(byte[] src, int bits, byte[] dst, int offset, int length) { + validateUnaryByte(src, dst, offset, length); + validateRegistered(src, dst); + super.shl(src, bits, dst, offset, length); + } + + @Override + public void shrLogical(byte[] src, int bits, byte[] dst, int offset, int length) { + validateUnaryByte(src, dst, offset, length); + validateRegistered(src, dst); + super.shrLogical(src, bits, dst, offset, length); + } + + @Override + public void addWrapping(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.addWrapping(srcA, srcB, dst, offset, length); + } + + @Override + public void subWrapping(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + 
validateRegistered(srcA, srcB, dst); + super.subWrapping(srcA, srcB, dst, offset, length); + } + + @Override + public void unpackUnsignedByteToInt(byte[] src, int srcOffset, int[] dst, int dstOffset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, srcOffset, length, "src"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(src, dst); + super.unpackUnsignedByteToInt(src, srcOffset, dst, dstOffset, length); + } + + @Override + public void add(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dst, "dst"); + validateRange(srcA.length, srcAOffset, length, "srcA"); + validateRange(srcB.length, srcBOffset, length, "srcB"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(srcA, srcB, dst); + super.add(srcA, srcAOffset, srcB, srcBOffset, dst, dstOffset, length); + } + + @Override + public void cmpEq(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, byte[] dstMask, int dstOffset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dstMask, "dstMask"); + validateRange(srcA.length, srcAOffset, length, "srcA"); + validateRange(srcB.length, srcBOffset, length, "srcB"); + validateRange(dstMask.length, dstOffset, length, "dstMask"); + validateRegistered(srcA, srcB, dstMask); + super.cmpEq(srcA, srcAOffset, srcB, srcBOffset, dstMask, dstOffset, length); + } + + @Override + public void cmpLt(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, byte[] dstMask, int dstOffset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dstMask, "dstMask"); + validateRange(srcA.length, srcAOffset, length, "srcA"); + validateRange(srcB.length, srcBOffset, length, "srcB"); + validateRange(dstMask.length, dstOffset, length, "dstMask"); + validateRegistered(srcA, 
srcB, dstMask); + super.cmpLt(srcA, srcAOffset, srcB, srcBOffset, dstMask, dstOffset, length); + } + + @Override + public void select(byte[] mask, int maskOffset, int[] trueValues, int trueOffset, int[] falseValues, int falseOffset, int[] dst, int dstOffset, int length) { + validateNotNull(mask, "mask"); + validateNotNull(trueValues, "trueValues"); + validateNotNull(falseValues, "falseValues"); + validateNotNull(dst, "dst"); + validateRange(mask.length, maskOffset, length, "mask"); + validateRange(trueValues.length, trueOffset, length, "trueValues"); + validateRange(falseValues.length, falseOffset, length, "falseValues"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(mask, trueValues, falseValues, dst); + super.select(mask, maskOffset, trueValues, trueOffset, falseValues, falseOffset, dst, dstOffset, length); + } + + private void validateRegistered(Object... arrays) { + for (int i = 0; i < arrays.length; i++) { + Object arr = arrays[i]; + Integer id = Integer.valueOf(System.identityHashCode(arr)); + if (!allocatedIds.contains(id)) { + throw new IllegalArgumentException( + "SIMD array argument was not allocated using Simd.alloc*(). 
objectId=" + id.intValue()); + } + } + } +} diff --git a/Ports/iOSPort/nativeSources/IOSSimd.m b/Ports/iOSPort/nativeSources/IOSSimd.m new file mode 100644 index 0000000000..27040050fa --- /dev/null +++ b/Ports/iOSPort/nativeSources/IOSSimd.m @@ -0,0 +1,1049 @@ +#include "xmlvm.h" +#include <arm_neon.h> +#include <math.h> +#include <stdlib.h> + +static JAVA_ARRAY_BYTE cn1_saturating_byte(int value) { + if (value > 127) { + return 127; + } + if (value < -128) { + return -128; + } + return (JAVA_ARRAY_BYTE)value; +} + +JAVA_OBJECT com_codename1_impl_ios_IOSSimd_allocByteNative___int_R_byte_1ARRAY(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_INT size) { + return allocArrayAligned(threadStateData, size, &class_array1__JAVA_BYTE, sizeof(JAVA_ARRAY_BYTE), 1, 16); +} + +JAVA_OBJECT com_codename1_impl_ios_IOSSimd_allocIntNative___int_R_int_1ARRAY(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_INT size) { + return allocArrayAligned(threadStateData, size, &class_array1__JAVA_INT, sizeof(JAVA_ARRAY_INT), 1, 16); +} + +JAVA_OBJECT com_codename1_impl_ios_IOSSimd_allocFloatNative___int_R_float_1ARRAY(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_INT size) { + return allocArrayAligned(threadStateData, size, &class_array1__JAVA_FLOAT, sizeof(JAVA_ARRAY_FLOAT), 1, 16); +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_add___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + int8x16_t vc = vqaddq_s8(va, vb); + vst1q_s8((int8_t*)(d + i), vc); + } + for (; i < end; i++) { + 
d[i] = cn1_saturating_byte((int)a[i] + (int)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_sub___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + int8x16_t vc = vqsubq_s8(va, vb); + vst1q_s8((int8_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = cn1_saturating_byte((int)a[i] - (int)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_mul___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + int16x8_t low = vmull_s8(vget_low_s8(va), vget_low_s8(vb)); + int16x8_t high = vmull_s8(vget_high_s8(va), vget_high_s8(vb)); + int8x8_t low8 = vqmovn_s16(low); + int8x8_t high8 = vqmovn_s16(high); + int8x16_t out = vcombine_s8(low8, high8); + vst1q_s8((int8_t*)(d + i), out); + } + for (; i < end; i++) { + d[i] = cn1_saturating_byte((int)a[i] * (int)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_min___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, 
JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + int8x16_t vc = vminq_s8(va, vb); + vst1q_s8((int8_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = a[i] < b[i] ? a[i] : b[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_max___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + int8x16_t vc = vmaxq_s8(va, vb); + vst1q_s8((int8_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = a[i] > b[i] ? a[i] : b[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_abs___byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t vs = vld1q_s8((int8_t*)(s + i)); + int8x16_t vd = vqabsq_s8(vs); + vst1q_s8((int8_t*)(d + i), vd); + } + for (; i < end; i++) { + int v = s[i]; + d[i] = v == -128 ? 
127 : (JAVA_ARRAY_BYTE)abs(v); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_clamp___byte_1ARRAY_byte_1ARRAY_byte_byte_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_BYTE minValue, JAVA_BYTE maxValue, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + int8x16_t vminv = vdupq_n_s8((int8_t)minValue); + int8x16_t vmaxv = vdupq_n_s8((int8_t)maxValue); + for (; i <= end - 16; i += 16) { + int8x16_t vs = vld1q_s8((int8_t*)(s + i)); + int8x16_t vc = vmaxq_s8(vminv, vminq_s8(vs, vmaxv)); + vst1q_s8((int8_t*)(d + i), vc); + } + for (; i < end; i++) { + int v = s[i]; + if (v < minValue) { + d[i] = minValue; + } else if (v > maxValue) { + d[i] = maxValue; + } else { + d[i] = (JAVA_ARRAY_BYTE)v; + } + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_add___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + int32x4_t va = vld1q_s32((int32_t*)(a + i)); + int32x4_t vb = vld1q_s32((int32_t*)(b + i)); + int32x4_t vc = vaddq_s32(va, vb); + vst1q_s32((int32_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)((int32_t)a[i] + (int32_t)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_sub___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; 
+ JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + int32x4_t va = vld1q_s32((int32_t*)(a + i)); + int32x4_t vb = vld1q_s32((int32_t*)(b + i)); + int32x4_t vc = vsubq_s32(va, vb); + vst1q_s32((int32_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)((int32_t)a[i] - (int32_t)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_mul___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + int32x4_t va = vld1q_s32((int32_t*)(a + i)); + int32x4_t vb = vld1q_s32((int32_t*)(b + i)); + int32x4_t vc = vmulq_s32(va, vb); + vst1q_s32((int32_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)((int32_t)a[i] * (int32_t)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_min___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + int32x4_t va = vld1q_s32((int32_t*)(a + i)); + int32x4_t vb = vld1q_s32((int32_t*)(b + i)); + int32x4_t vc = vminq_s32(va, vb); + vst1q_s32((int32_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = a[i] < b[i] ? 
a[i] : b[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_max___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + int32x4_t va = vld1q_s32((int32_t*)(a + i)); + int32x4_t vb = vld1q_s32((int32_t*)(b + i)); + int32x4_t vc = vmaxq_s32(va, vb); + vst1q_s32((int32_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = a[i] > b[i] ? a[i] : b[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_abs___int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + int32x4_t vs = vld1q_s32((int32_t*)(s + i)); + int32x4_t vd = vqabsq_s32(vs); + vst1q_s32((int32_t*)(d + i), vd); + } + for (; i < end; i++) { + int32_t v = (int32_t)s[i]; + d[i] = (JAVA_ARRAY_INT)(v == INT32_MIN ? INT32_MAX : (v < 0 ? 
-v : v)); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_clamp___int_1ARRAY_int_1ARRAY_int_int_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT minValue, JAVA_INT maxValue, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + int32x4_t vminv = vdupq_n_s32((int32_t)minValue); + int32x4_t vmaxv = vdupq_n_s32((int32_t)maxValue); + for (; i <= end - 4; i += 4) { + int32x4_t vs = vld1q_s32((int32_t*)(s + i)); + int32x4_t vc = vmaxq_s32(vminv, vminq_s32(vs, vmaxv)); + vst1q_s32((int32_t*)(d + i), vc); + } + for (; i < end; i++) { + int v = s[i]; + if (v < minValue) { + d[i] = minValue; + } else if (v > maxValue) { + d[i] = maxValue; + } else { + d[i] = (JAVA_ARRAY_INT)v; + } + } +} + +JAVA_INT com_codename1_impl_ios_IOSSimd_sum___int_1ARRAY_int_int_R_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + int i = offset; + int end = offset + length; + int64_t total = 0; + int32x4_t vacc = vdupq_n_s32(0); + for (; i <= end - 4; i += 4) { + int32x4_t vs = vld1q_s32((int32_t*)(s + i)); + vacc = vaddq_s32(vacc, vs); + } + int32_t partial[4]; + vst1q_s32(partial, vacc); + total += (int64_t)partial[0] + (int64_t)partial[1] + (int64_t)partial[2] + (int64_t)partial[3]; + for (; i < end; i++) { + total += (int64_t)((int32_t)s[i]); + } + return (JAVA_INT)((int32_t)total); +} + +JAVA_INT com_codename1_impl_ios_IOSSimd_dot___int_1ARRAY_int_1ARRAY_int_int_R_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + int i = offset; + int end = 
offset + length; + int64_t total = 0; + int32x4_t vacc = vdupq_n_s32(0); + for (; i <= end - 4; i += 4) { + int32x4_t va = vld1q_s32((int32_t*)(a + i)); + int32x4_t vb = vld1q_s32((int32_t*)(b + i)); + vacc = vaddq_s32(vacc, vmulq_s32(va, vb)); + } + int32_t partial[4]; + vst1q_s32(partial, vacc); + total += (int64_t)partial[0] + (int64_t)partial[1] + (int64_t)partial[2] + (int64_t)partial[3]; + for (; i < end; i++) { + total += (int64_t)((int32_t)a[i]) * (int64_t)((int32_t)b[i]); + } + return (JAVA_INT)((int32_t)total); +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_add___float_1ARRAY_float_1ARRAY_float_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_FLOAT* a = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_FLOAT* b = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + float32x4_t va = vld1q_f32((float*)(a + i)); + float32x4_t vb = vld1q_f32((float*)(b + i)); + float32x4_t vc = vaddq_f32(va, vb); + vst1q_f32((float*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = a[i] + b[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_sub___float_1ARRAY_float_1ARRAY_float_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_FLOAT* a = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_FLOAT* b = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + float32x4_t va = vld1q_f32((float*)(a + i)); + float32x4_t vb = vld1q_f32((float*)(b + i)); + float32x4_t vc = vsubq_f32(va, vb); + vst1q_f32((float*)(d + i), vc); + } + for (; i < end; 
i++) { + d[i] = a[i] - b[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_mul___float_1ARRAY_float_1ARRAY_float_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_FLOAT* a = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_FLOAT* b = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + float32x4_t va = vld1q_f32((float*)(a + i)); + float32x4_t vb = vld1q_f32((float*)(b + i)); + float32x4_t vc = vmulq_f32(va, vb); + vst1q_f32((float*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = a[i] * b[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_min___float_1ARRAY_float_1ARRAY_float_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_FLOAT* a = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_FLOAT* b = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + float32x4_t va = vld1q_f32((float*)(a + i)); + float32x4_t vb = vld1q_f32((float*)(b + i)); + float32x4_t vc = vminq_f32(va, vb); + vst1q_f32((float*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = (isnan(a[i]) || isnan(b[i])) ? NAN : fminf(a[i], b[i]); /* fminf drops NaN; propagate it so the scalar tail matches the vminq_f32 lanes above */ + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_max___float_1ARRAY_float_1ARRAY_float_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_FLOAT* a = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_FLOAT* b = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data; + int i 
= offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + float32x4_t va = vld1q_f32((float*)(a + i)); + float32x4_t vb = vld1q_f32((float*)(b + i)); + float32x4_t vc = vmaxq_f32(va, vb); + vst1q_f32((float*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = (isnan(a[i]) || isnan(b[i])) ? NAN : fmaxf(a[i], b[i]); /* fmaxf drops NaN; propagate it so the scalar tail matches the vmaxq_f32 lanes above */ + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_abs___float_1ARRAY_float_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_FLOAT* s = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + float32x4_t vs = vld1q_f32((float*)(s + i)); + float32x4_t vd = vabsq_f32(vs); + vst1q_f32((float*)(d + i), vd); + } + for (; i < end; i++) { + d[i] = fabsf(s[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_clamp___float_1ARRAY_float_1ARRAY_float_float_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_FLOAT minValue, JAVA_FLOAT maxValue, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_FLOAT* s = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + float32x4_t vminv = vdupq_n_f32((float)minValue); + float32x4_t vmaxv = vdupq_n_f32((float)maxValue); + for (; i <= end - 4; i += 4) { + float32x4_t vs = vld1q_f32((float*)(s + i)); + float32x4_t vc = vmaxq_f32(vminv, vminq_f32(vs, vmaxv)); + vst1q_f32((float*)(d + i), vc); + } + for (; i < end; i++) { + float v = s[i]; + if (v < minValue) { + d[i] = minValue; + } else if (v > maxValue) { + d[i] = maxValue; + } else { + d[i] = v; + } + } +} + +JAVA_FLOAT com_codename1_impl_ios_IOSSimd_sum___float_1ARRAY_int_int_R_float(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT offset, JAVA_INT length) { + 
JAVA_ARRAY_FLOAT* s = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)src)->data; + int i = offset; + int end = offset + length; + float total = 0.f; + float32x4_t vacc = vdupq_n_f32(0.f); + for (; i <= end - 4; i += 4) { + float32x4_t vs = vld1q_f32((float*)(s + i)); + vacc = vaddq_f32(vacc, vs); + } + float partial[4]; + vst1q_f32(partial, vacc); + total += partial[0] + partial[1] + partial[2] + partial[3]; + for (; i < end; i++) { + total += s[i]; + } + return (JAVA_FLOAT)total; +} + +JAVA_FLOAT com_codename1_impl_ios_IOSSimd_dot___float_1ARRAY_float_1ARRAY_int_int_R_float(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_FLOAT* a = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_FLOAT* b = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcB)->data; + int i = offset; + int end = offset + length; + float total = 0.f; + float32x4_t vacc = vdupq_n_f32(0.f); + for (; i <= end - 4; i += 4) { + float32x4_t va = vld1q_f32((float*)(a + i)); + float32x4_t vb = vld1q_f32((float*)(b + i)); + vacc = vaddq_f32(vacc, vmulq_f32(va, vb)); + } + float partial[4]; + vst1q_f32(partial, vacc); + total += partial[0] + partial[1] + partial[2] + partial[3]; + for (; i < end; i++) { + total += a[i] * b[i]; + } + return (JAVA_FLOAT)total; +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_and___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + vst1q_s8((int8_t*)(d + i), vandq_s8(va, vb)); + } + for (; i < end; i++) { + 
d[i] = (JAVA_ARRAY_BYTE)(a[i] & b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_or___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + vst1q_s8((int8_t*)(d + i), vorrq_s8(va, vb)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)(a[i] | b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_xor___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + vst1q_s8((int8_t*)(d + i), veorq_s8(va, vb)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)(a[i] ^ b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_not___byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t vs = vld1q_s8((int8_t*)(s + i)); + 
vst1q_s8((int8_t*)(d + i), vmvnq_s8(vs)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)(~s[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpEq___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + uint8x16_t cmp = vceqq_s8(va, vb); + vst1q_u8((uint8_t*)(m + i), cmp); + } + for (; i < end; i++) { + m[i] = a[i] == b[i] ? (JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpLt___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + uint8x16_t cmp = vcltq_s8(va, vb); + vst1q_u8((uint8_t*)(m + i), cmp); + } + for (; i < end; i++) { + m[i] = a[i] < b[i] ? 
(JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpGt___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + uint8x16_t cmp = vcgtq_s8(va, vb); + vst1q_u8((uint8_t*)(m + i), cmp); + } + for (; i < end; i++) { + m[i] = a[i] > b[i] ? (JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpRange___byte_1ARRAY_byte_byte_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_BYTE minValue, JAVA_BYTE maxValue, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int i = offset; + int end = offset + length; + int8x16_t vminv = vdupq_n_s8((int8_t)minValue); + int8x16_t vmaxv = vdupq_n_s8((int8_t)maxValue); + for (; i <= end - 16; i += 16) { + int8x16_t vs = vld1q_s8((int8_t*)(s + i)); + uint8x16_t ge = vcgeq_s8(vs, vminv); + uint8x16_t le = vcleq_s8(vs, vmaxv); + vst1q_u8((uint8_t*)(m + i), vandq_u8(ge, le)); + } + for (; i < end; i++) { + int v = s[i]; + m[i] = v >= minValue && v <= maxValue ? 
(JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_select___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT mask, JAVA_OBJECT trueValues, JAVA_OBJECT falseValues, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)mask)->data; + JAVA_ARRAY_BYTE* t = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)trueValues)->data; + JAVA_ARRAY_BYTE* f = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)falseValues)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + int8x16_t zero = vdupq_n_s8(0); + for (; i <= end - 16; i += 16) { + int8x16_t vm = vld1q_s8((int8_t*)(m + i)); + int8x16_t vt = vld1q_s8((int8_t*)(t + i)); + int8x16_t vf = vld1q_s8((int8_t*)(f + i)); + uint8x16_t isZero = vceqq_s8(vm, zero); + uint8x16_t out = vbslq_u8(isZero, vreinterpretq_u8_s8(vf), vreinterpretq_u8_s8(vt)); + vst1q_s8((int8_t*)(d + i), vreinterpretq_s8_u8(out)); + } + for (; i < end; i++) { + d[i] = m[i] != 0 ? 
t[i] : f[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_unpackUnsignedByteToInt___byte_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + uint8x16_t v = vld1q_u8((uint8_t*)(s + i)); + uint16x8_t lo16 = vmovl_u8(vget_low_u8(v)); + uint16x8_t hi16 = vmovl_u8(vget_high_u8(v)); + uint32x4_t x0 = vmovl_u16(vget_low_u16(lo16)); + uint32x4_t x1 = vmovl_u16(vget_high_u16(lo16)); + uint32x4_t x2 = vmovl_u16(vget_low_u16(hi16)); + uint32x4_t x3 = vmovl_u16(vget_high_u16(hi16)); + vst1q_s32((int32_t*)(d + i), vreinterpretq_s32_u32(x0)); + vst1q_s32((int32_t*)(d + i + 4), vreinterpretq_s32_u32(x1)); + vst1q_s32((int32_t*)(d + i + 8), vreinterpretq_s32_u32(x2)); + vst1q_s32((int32_t*)(d + i + 12), vreinterpretq_s32_u32(x3)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(s[i] & 0xff); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_packIntToByteSaturating___int_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int end = offset + length; + for (int i = offset; i < end; i++) { + int v = s[i]; + if (v > 127) { + d[i] = 127; + } else if (v < -128) { + d[i] = -128; + } else { + d[i] = (JAVA_ARRAY_BYTE)v; + } + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_packIntToByteTruncate___int_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = 
(JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int end = offset + length; + for (int i = offset; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)s[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_packIntToByteTruncate___int_1ARRAY_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_BYTE)s[srcOffset + i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_permuteBytes___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT indices, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + int srcLen = ((JAVA_ARRAY)src)->length; + JAVA_ARRAY_BYTE* idx = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)indices)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int end = offset + length; + for (int i = offset; i < end; i++) { + int pos = idx[i]; + d[i] = (pos >= 0 && pos < srcLen) ? 
s[pos] : 0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_and___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + uint32x4_t va = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(a + i))); + uint32x4_t vb = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(b + i))); + vst1q_s32((int32_t*)(d + i), vreinterpretq_s32_u32(vandq_u32(va, vb))); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(a[i] & b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_and___int_1ARRAY_int_int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_INT srcAOffset, JAVA_OBJECT srcB, JAVA_INT srcBOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 4; i += 4) { + uint32x4_t va = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(a + srcAOffset + i))); + uint32x4_t vb = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(b + srcBOffset + i))); + vst1q_s32((int32_t*)(d + dstOffset + i), vreinterpretq_s32_u32(vandq_u32(va, vb))); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_INT)(a[srcAOffset + i] & b[srcBOffset + i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_or___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = 
(JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + uint32x4_t va = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(a + i))); + uint32x4_t vb = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(b + i))); + vst1q_s32((int32_t*)(d + i), vreinterpretq_s32_u32(vorrq_u32(va, vb))); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(a[i] | b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_or___int_1ARRAY_int_int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_INT srcAOffset, JAVA_OBJECT srcB, JAVA_INT srcBOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 4; i += 4) { + uint32x4_t va = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(a + srcAOffset + i))); + uint32x4_t vb = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(b + srcBOffset + i))); + vst1q_s32((int32_t*)(d + dstOffset + i), vreinterpretq_s32_u32(vorrq_u32(va, vb))); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_INT)(a[srcAOffset + i] | b[srcBOffset + i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_xor___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + uint32x4_t va = 
vreinterpretq_u32_s32(vld1q_s32((int32_t*)(a + i))); + uint32x4_t vb = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(b + i))); + vst1q_s32((int32_t*)(d + i), vreinterpretq_s32_u32(veorq_u32(va, vb))); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(a[i] ^ b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_not___int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + uint32x4_t vs = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(s + i))); + vst1q_s32((int32_t*)(d + i), vreinterpretq_s32_u32(vmvnq_u32(vs))); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(~s[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shl___int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int shift = bits & 31; + int i = offset; + int end = offset + length; + int32x4_t vshift = vdupq_n_s32(shift); + for (; i <= end - 4; i += 4) { + int32x4_t vs = vld1q_s32((int32_t*)(s + i)); + vst1q_s32((int32_t*)(d + i), vshlq_s32(vs, vshift)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(s[i] << shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shl___int_1ARRAY_int_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int shift = bits & 31; + int i = 0; + 
int32x4_t vshift = vdupq_n_s32(shift); + for (; i <= length - 4; i += 4) { + int32x4_t vs = vld1q_s32((int32_t*)(s + srcOffset + i)); + vst1q_s32((int32_t*)(d + dstOffset + i), vshlq_s32(vs, vshift)); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_INT)(s[srcOffset + i] << shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shrLogical___int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int shift = bits & 31; + int i = offset; + int end = offset + length; + int32x4_t vshift = vdupq_n_s32(-shift); + for (; i <= end - 4; i += 4) { + uint32x4_t vs = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(s + i))); + vst1q_s32((int32_t*)(d + i), vreinterpretq_s32_u32(vshlq_u32(vs, vshift))); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(((uint32_t)s[i]) >> shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shrLogical___int_1ARRAY_int_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int shift = bits & 31; + int i = 0; + int32x4_t vshift = vdupq_n_s32(-shift); + for (; i <= length - 4; i += 4) { + uint32x4_t vs = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(s + srcOffset + i))); + vst1q_s32((int32_t*)(d + dstOffset + i), vreinterpretq_s32_u32(vshlq_u32(vs, vshift))); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_INT)(((uint32_t)s[srcOffset + i]) >> shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shrArithmetic___int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, 
JAVA_OBJECT src, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int shift = bits & 31; + int i = offset; + int end = offset + length; + int32x4_t vshift = vdupq_n_s32(-shift); + for (; i <= end - 4; i += 4) { + int32x4_t vs = vld1q_s32((int32_t*)(s + i)); + vst1q_s32((int32_t*)(d + i), vshlq_s32(vs, vshift)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(s[i] >> shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpEq___int_1ARRAY_int_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int end = offset + length; + for (int i = offset; i < end; i++) { + m[i] = a[i] == b[i] ? (JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpLt___int_1ARRAY_int_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int end = offset + length; + for (int i = offset; i < end; i++) { + m[i] = a[i] < b[i] ? 
(JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpGt___int_1ARRAY_int_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int end = offset + length; + for (int i = offset; i < end; i++) { + m[i] = a[i] > b[i] ? (JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_select___byte_1ARRAY_int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT mask, JAVA_OBJECT trueValues, JAVA_OBJECT falseValues, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)mask)->data; + JAVA_ARRAY_INT* t = (JAVA_ARRAY_INT*)((JAVA_ARRAY)trueValues)->data; + JAVA_ARRAY_INT* f = (JAVA_ARRAY_INT*)((JAVA_ARRAY)falseValues)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int end = offset + length; + for (int i = offset; i < end; i++) { + d[i] = m[i] != 0 ? 
t[i] : f[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shl___byte_1ARRAY_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int shift = bits & 7; + int i = offset; + int end = offset + length; + int8x16_t vshift = vdupq_n_s8((int8_t)shift); + for (; i <= end - 16; i += 16) { + uint8x16_t vs = vld1q_u8((uint8_t*)(s + i)); + vst1q_u8((uint8_t*)(d + i), vshlq_u8(vs, vshift)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)(((uint8_t)s[i]) << shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shrLogical___byte_1ARRAY_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int shift = bits & 7; + int i = offset; + int end = offset + length; + int8x16_t vneg = vdupq_n_s8((int8_t)(-shift)); + for (; i <= end - 16; i += 16) { + uint8x16_t vs = vld1q_u8((uint8_t*)(s + i)); + vst1q_u8((uint8_t*)(d + i), vshlq_u8(vs, vneg)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)(((uint8_t)s[i]) >> shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_addWrapping___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + uint8x16_t va = vld1q_u8((uint8_t*)(a + 
i)); + uint8x16_t vb = vld1q_u8((uint8_t*)(b + i)); + vst1q_u8((uint8_t*)(d + i), vaddq_u8(va, vb)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)((uint8_t)a[i] + (uint8_t)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_subWrapping___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + uint8x16_t va = vld1q_u8((uint8_t*)(a + i)); + uint8x16_t vb = vld1q_u8((uint8_t*)(b + i)); + vst1q_u8((uint8_t*)(d + i), vsubq_u8(va, vb)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)((uint8_t)a[i] - (uint8_t)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_unpackUnsignedByteToInt___byte_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 16; i += 16) { + uint8x16_t v = vld1q_u8((uint8_t*)(s + srcOffset + i)); + uint16x8_t lo16 = vmovl_u8(vget_low_u8(v)); + uint16x8_t hi16 = vmovl_u8(vget_high_u8(v)); + uint32x4_t x0 = vmovl_u16(vget_low_u16(lo16)); + uint32x4_t x1 = vmovl_u16(vget_high_u16(lo16)); + uint32x4_t x2 = vmovl_u16(vget_low_u16(hi16)); + uint32x4_t x3 = vmovl_u16(vget_high_u16(hi16)); + vst1q_s32((int32_t*)(d + dstOffset + i), vreinterpretq_s32_u32(x0)); + vst1q_s32((int32_t*)(d + dstOffset + i + 4), vreinterpretq_s32_u32(x1)); + vst1q_s32((int32_t*)(d + dstOffset + i + 8), vreinterpretq_s32_u32(x2)); + vst1q_s32((int32_t*)(d + 
dstOffset + i + 12), vreinterpretq_s32_u32(x3)); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_INT)(s[srcOffset + i] & 0xff); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_add___int_1ARRAY_int_int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_INT srcAOffset, JAVA_OBJECT srcB, JAVA_INT srcBOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 4; i += 4) { + int32x4_t va = vld1q_s32((int32_t*)(a + srcAOffset + i)); + int32x4_t vb = vld1q_s32((int32_t*)(b + srcBOffset + i)); + vst1q_s32((int32_t*)(d + dstOffset + i), vaddq_s32(va, vb)); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_INT)((int32_t)a[srcAOffset + i] + (int32_t)b[srcBOffset + i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpEq___int_1ARRAY_int_int_1ARRAY_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_INT srcAOffset, JAVA_OBJECT srcB, JAVA_INT srcBOffset, JAVA_OBJECT dstMask, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int end = length; + for (int i = 0; i < end; i++) { + m[dstOffset + i] = a[srcAOffset + i] == b[srcBOffset + i] ? 
(JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpLt___int_1ARRAY_int_int_1ARRAY_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_INT srcAOffset, JAVA_OBJECT srcB, JAVA_INT srcBOffset, JAVA_OBJECT dstMask, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int end = length; + for (int i = 0; i < end; i++) { + m[dstOffset + i] = a[srcAOffset + i] < b[srcBOffset + i] ? (JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_select___byte_1ARRAY_int_int_1ARRAY_int_int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT mask, JAVA_INT maskOffset, JAVA_OBJECT trueValues, JAVA_INT trueOffset, JAVA_OBJECT falseValues, JAVA_INT falseOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)mask)->data; + JAVA_ARRAY_INT* t = (JAVA_ARRAY_INT*)((JAVA_ARRAY)trueValues)->data; + JAVA_ARRAY_INT* f = (JAVA_ARRAY_INT*)((JAVA_ARRAY)falseValues)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int end = length; + for (int i = 0; i < end; i++) { + d[dstOffset + i] = m[maskOffset + i] != 0 ? 
t[trueOffset + i] : f[falseOffset + i]; + } +} diff --git a/Ports/iOSPort/src/com/codename1/impl/ios/IOSImplementation.java b/Ports/iOSPort/src/com/codename1/impl/ios/IOSImplementation.java index 3029ccfa61..d94049d5ed 100644 --- a/Ports/iOSPort/src/com/codename1/impl/ios/IOSImplementation.java +++ b/Ports/iOSPort/src/com/codename1/impl/ios/IOSImplementation.java @@ -110,6 +110,7 @@ import com.codename1.util.Callback; import com.codename1.util.StringUtil; import com.codename1.util.SuccessCallback; +import com.codename1.util.Simd; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; @@ -8091,6 +8092,11 @@ public String getPlatformName() { return "ios"; } + @Override + public Simd createSimd() { + return new IOSSimd(); + } + /** * @inheritDoc */ diff --git a/Ports/iOSPort/src/com/codename1/impl/ios/IOSSimd.java b/Ports/iOSPort/src/com/codename1/impl/ios/IOSSimd.java new file mode 100644 index 0000000000..3e1cf604dc --- /dev/null +++ b/Ports/iOSPort/src/com/codename1/impl/ios/IOSSimd.java @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2026, Codename One and/or its affiliates. All rights reserved. + */ +package com.codename1.impl.ios; + +import com.codename1.util.Simd; + +/** + * iOS SIMD implementation backed by NEON wrappers. 
+ */ +public class IOSSimd extends Simd { + @Override + public boolean isSupported() { + return true; + } + + @Override + public byte[] allocByte(int size) { + if (size < 16) { + throw new IllegalArgumentException("size must be >= 16"); + } + return allocByteNative(size); + } + + @Override + public int[] allocInt(int size) { + if (size < 16) { + throw new IllegalArgumentException("size must be >= 16"); + } + return allocIntNative(size); + } + + @Override + public float[] allocFloat(int size) { + if (size < 16) { + throw new IllegalArgumentException("size must be >= 16"); + } + return allocFloatNative(size); + } + + @Override + public native void add(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void sub(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void mul(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void min(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void max(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void abs(byte[] src, byte[] dst, int offset, int length); + + @Override + public native void clamp(byte[] src, byte[] dst, byte minValue, byte maxValue, int offset, int length); + + @Override + public native void and(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void or(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void xor(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void not(byte[] src, byte[] dst, int offset, int length); + + @Override + public native void cmpEq(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length); + + @Override + public native void cmpLt(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length); + + @Override + public native void cmpGt(byte[] srcA, 
byte[] srcB, byte[] dstMask, int offset, int length); + + @Override + public native void cmpRange(byte[] src, byte minValue, byte maxValue, byte[] dstMask, int offset, int length); + + @Override + public native void select(byte[] mask, byte[] trueValues, byte[] falseValues, byte[] dst, int offset, int length); + + @Override + public native void unpackUnsignedByteToInt(byte[] src, int[] dst, int offset, int length); + + @Override + public native void packIntToByteSaturating(int[] src, byte[] dst, int offset, int length); + + @Override + public native void packIntToByteTruncate(int[] src, byte[] dst, int offset, int length); + + @Override + public native void packIntToByteTruncate(int[] src, int srcOffset, byte[] dst, int dstOffset, int length); + + @Override + public native void permuteBytes(byte[] src, byte[] indices, byte[] dst, int offset, int length); + + @Override + public native void add(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void sub(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void mul(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void min(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void max(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void abs(int[] src, int[] dst, int offset, int length); + + @Override + public native void clamp(int[] src, int[] dst, int minValue, int maxValue, int offset, int length); + + @Override + public native void and(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void and(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length); + + @Override + public native void or(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void or(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, 
int dstOffset, int length); + + @Override + public native void xor(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void not(int[] src, int[] dst, int offset, int length); + + @Override + public native void shl(int[] src, int bits, int[] dst, int offset, int length); + + @Override + public native void shl(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, int length); + + @Override + public native void shrLogical(int[] src, int bits, int[] dst, int offset, int length); + + @Override + public native void shrLogical(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, int length); + + @Override + public native void shrArithmetic(int[] src, int bits, int[] dst, int offset, int length); + + @Override + public native void cmpEq(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length); + + @Override + public native void cmpLt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length); + + @Override + public native void cmpGt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length); + + @Override + public native void select(byte[] mask, int[] trueValues, int[] falseValues, int[] dst, int offset, int length); + + @Override + public native int sum(int[] src, int offset, int length); + + @Override + public native int dot(int[] srcA, int[] srcB, int offset, int length); + + @Override + public native void add(float[] srcA, float[] srcB, float[] dst, int offset, int length); + + @Override + public native void sub(float[] srcA, float[] srcB, float[] dst, int offset, int length); + + @Override + public native void mul(float[] srcA, float[] srcB, float[] dst, int offset, int length); + + @Override + public native void min(float[] srcA, float[] srcB, float[] dst, int offset, int length); + + @Override + public native void max(float[] srcA, float[] srcB, float[] dst, int offset, int length); + + @Override + public native void abs(float[] src, float[] dst, int offset, int length); + + @Override + public 
native void clamp(float[] src, float[] dst, float minValue, float maxValue, int offset, int length); + + @Override + public native float sum(float[] src, int offset, int length); + + @Override + public native float dot(float[] srcA, float[] srcB, int offset, int length); + + @Override + public native void shl(byte[] src, int bits, byte[] dst, int offset, int length); + + @Override + public native void shrLogical(byte[] src, int bits, byte[] dst, int offset, int length); + + @Override + public native void addWrapping(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void subWrapping(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void unpackUnsignedByteToInt(byte[] src, int srcOffset, int[] dst, int dstOffset, int length); + + @Override + public native void add(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length); + + @Override + public native void cmpEq(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, byte[] dstMask, int dstOffset, int length); + + @Override + public native void cmpLt(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, byte[] dstMask, int dstOffset, int length); + + @Override + public native void select(byte[] mask, int maskOffset, int[] trueValues, int trueOffset, int[] falseValues, int falseOffset, int[] dst, int dstOffset, int length); + + private native byte[] allocByteNative(int size); + private native int[] allocIntNative(int size); + private native float[] allocFloatNative(int size); +} diff --git a/maven/core-unittests/src/test/java/com/codename1/util/SimdTest.java b/maven/core-unittests/src/test/java/com/codename1/util/SimdTest.java new file mode 100644 index 0000000000..d983e9708e --- /dev/null +++ b/maven/core-unittests/src/test/java/com/codename1/util/SimdTest.java @@ -0,0 +1,246 @@ +package com.codename1.util; + +import com.codename1.junit.FormTest; +import com.codename1.junit.UITestBase; +import 
com.codename1.ui.CN; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class SimdTest extends UITestBase { + + @FormTest + void baseFallbackOpsWork() { + Simd simd = new Simd(); + + int[] a = new int[]{1, 2, 3, 4}; + int[] b = new int[]{4, 3, 2, 1}; + int[] out = new int[4]; + simd.add(a, b, out, 0, 4); + assertEquals(5, out[0]); + assertEquals(5, out[3]); + + float[] fa = new float[]{1f, -2f, 3f}; + float[] fb = new float[]{4f, 5f, -6f}; + float[] fo = new float[3]; + simd.mul(fa, fb, fo, 0, 3); + assertEquals(4f, fo[0], 0.0001f); + assertEquals(-18f, fo[2], 0.0001f); + + byte[] ba = new byte[]{120, 100, -128}; + byte[] bb = new byte[]{20, 100, -1}; + byte[] bo = new byte[3]; + simd.add(ba, bb, bo, 0, 3); + assertEquals(127, bo[0]); + assertEquals(127, bo[1]); + assertEquals(-128, bo[2]); + } + + @FormTest + void javaseRegistryGuardInSimulator() { + Simd simd = Simd.get(); + if (!simd.isSupported()) { + return; + } + + int[] regA = simd.allocInt(16); + int[] regB = simd.allocInt(16); + int[] regO = simd.allocInt(16); + simd.add(regA, regB, regO, 0, 16); + + if (CN.isSimulator()) { + int[] plainA = new int[16]; + int[] plainB = new int[16]; + int[] plainO = new int[16]; + Throwable t = assertThrows(IllegalArgumentException.class, () -> simd.add(plainA, plainB, plainO, 0, 16)); + assertTrue(t.getMessage().indexOf("Simd.alloc") >= 0); + } + } + + @FormTest + void genericBitwiseShiftCompareSelectOpsWork() { + Simd simd = new Simd(); + + byte[] a = new byte[]{1, 2, 3, 4}; + byte[] b = new byte[]{3, 2, 1, 4}; + byte[] mask = new byte[4]; + byte[] outB = new byte[4]; + simd.cmpGt(a, b, mask, 0, 4); + simd.select(mask, a, b, outB, 0, 4); + assertEquals(3, outB[0]); + assertEquals(2, outB[1]); + assertEquals(3, outB[2]); + assertEquals(4, outB[3]); + + int[] ia = new int[]{0x0f0f0f0f, 8, -16, 7}; + int[] ib = new int[]{0x00ff00ff, 1, 2, 
9}; + int[] io = new int[4]; + simd.and(ia, ib, io, 0, 4); + assertEquals(0x000f000f, io[0]); + simd.shrLogical(ia, 1, io, 0, 4); + assertEquals(4, io[1]); + simd.shrArithmetic(ia, 1, io, 0, 4); + assertEquals(-8, io[2]); + + byte[] intMask = new byte[4]; + simd.cmpLt(ia, ib, intMask, 0, 4); + simd.select(intMask, ia, ib, io, 0, 4); + assertEquals(0x00ff00ff, io[0]); + assertEquals(1, io[1]); + assertEquals(-16, io[2]); + assertEquals(7, io[3]); + + int[] unpack = new int[4]; + simd.unpackUnsignedByteToInt(new byte[]{-1, 0, 1, 127}, unpack, 0, 4); + assertEquals(255, unpack[0]); + assertEquals(127, unpack[3]); + + byte[] packed = new byte[4]; + simd.packIntToByteSaturating(new int[]{-129, -128, 127, 1000}, packed, 0, 4); + assertEquals(-128, packed[0]); + assertEquals(-128, packed[1]); + assertEquals(127, packed[2]); + assertEquals(127, packed[3]); + + byte[] permuted = new byte[4]; + simd.permuteBytes(new byte[]{10, 20, 30, 40}, new byte[]{3, 2, 1, -1}, permuted, 0, 4); + assertEquals(40, permuted[0]); + assertEquals(30, permuted[1]); + assertEquals(20, permuted[2]); + assertEquals(0, permuted[3]); + } + + @FormTest + void base64SimdMethodsMatchScalar() { + Simd simd = Simd.get(); + if (!simd.isSupported()) { + return; + } + + // Test that SIMD encode matches scalar encode + byte[] input = new byte[8192]; + for (int i = 0; i < input.length; i++) { + input[i] = (byte)(i * 31 + 17); + } + + int encodedLen = ((input.length + 2) / 3) * 4; + byte[] scalarEncoded = new byte[encodedLen]; + int scalarWritten = Base64.encodeNoNewline(input, scalarEncoded); + + byte[] simdInput = simd.allocByte(input.length); + System.arraycopy(input, 0, simdInput, 0, input.length); + byte[] simdEncoded = simd.allocByte(encodedLen); + int[] scratch = simd.allocInt(192); + int simdWritten = Base64.encodeNoNewlineSimd(simdInput, 0, simdInput.length, simdEncoded, 0, scratch); + + assertEquals(scalarWritten, simdWritten); + for (int i = 0; i < scalarWritten; i++) { + 
assertEquals(scalarEncoded[i], simdEncoded[i], "Encode mismatch at index " + i); + } + + // Test that SIMD decode matches scalar decode + byte[] scalarDecoded = new byte[input.length]; + int scalarDecLen = Base64.decode(scalarEncoded, scalarDecoded); + + byte[] simdDecoded = simd.allocByte(input.length); + int simdDecLen = Base64.decodeNoWhitespaceSimd(simdEncoded, 0, simdWritten, simdDecoded, 0, scratch); + + assertEquals(scalarDecLen, simdDecLen); + for (int i = 0; i < scalarDecLen; i++) { + assertEquals(scalarDecoded[i], simdDecoded[i], "Decode mismatch at index " + i); + } + } + + @FormTest + void byteShlAndShrLogicalWork() { + Simd simd = new Simd(); + byte[] src = new byte[]{(byte)0xAB, (byte)0x01, (byte)0xFF, (byte)0x80}; + byte[] dst = new byte[4]; + + simd.shl(src, 4, dst, 0, 4); + assertEquals((byte)0xB0, dst[0]); + assertEquals((byte)0x10, dst[1]); + assertEquals((byte)0xF0, dst[2]); + assertEquals((byte)0x00, dst[3]); + + simd.shrLogical(src, 4, dst, 0, 4); + assertEquals((byte)0x0A, dst[0]); + assertEquals((byte)0x00, dst[1]); + assertEquals((byte)0x0F, dst[2]); + assertEquals((byte)0x08, dst[3]); + } + + @FormTest + void addWrappingAndSubWrappingWork() { + Simd simd = new Simd(); + byte[] a = new byte[]{(byte)200, (byte)100, (byte)0, (byte)255}; + byte[] b = new byte[]{(byte)100, (byte)200, (byte)1, (byte)1}; + byte[] out = new byte[4]; + + simd.addWrapping(a, b, out, 0, 4); + assertEquals((byte)44, out[0]); // 200+100=300 mod 256=44 + assertEquals((byte)44, out[1]); // 100+200=300 mod 256=44 + assertEquals((byte)1, out[2]); // 0+1=1 + assertEquals((byte)0, out[3]); // 255+1=256 mod 256=0 + + simd.subWrapping(a, b, out, 0, 4); + assertEquals((byte)100, out[0]); // 200-100=100 + assertEquals((byte)156, out[1]); // 100-200=-100 mod 256=156 + assertEquals((byte)255, out[2]); // 0-1=-1 mod 256=255 + assertEquals((byte)254, out[3]); // 255-1=254 + } + + @FormTest + void offsetBasedIntOpsWork() { + Simd simd = new Simd(); + + // Test offset-based unpack + 
byte[] bytes = new byte[]{10, 20, (byte)200, (byte)255}; + int[] ints = new int[8]; + simd.unpackUnsignedByteToInt(bytes, 0, ints, 4, 4); + assertEquals(10, ints[4]); + assertEquals(20, ints[5]); + assertEquals(200, ints[6]); + assertEquals(255, ints[7]); + + // Test offset-based add + int[] a = new int[]{0, 0, 5, 10, 15, 20}; + int[] b = new int[]{1, 2, 3, 4, 5, 6}; + int[] out = new int[6]; + simd.add(a, 2, b, 0, out, 1, 4); + assertEquals(6, out[1]); // a[2]+b[0] = 5+1 + assertEquals(12, out[2]); // a[3]+b[1] = 10+2 + assertEquals(18, out[3]); // a[4]+b[2] = 15+3 + assertEquals(24, out[4]); // a[5]+b[3] = 20+4 + + // Test offset-based cmpLt + int[] vals = new int[]{5, 15, 25, 35}; + int[] thresh = new int[]{10, 10, 10, 10}; + byte[] mask = new byte[4]; + simd.cmpLt(vals, 0, thresh, 0, mask, 0, 4); + assertEquals((byte)-1, mask[0]); + assertEquals((byte)0, mask[1]); + assertEquals((byte)0, mask[2]); + assertEquals((byte)0, mask[3]); + + // Test offset-based cmpEq + int[] vals2 = new int[]{10, 20, 10, 30}; + simd.cmpEq(vals2, 0, thresh, 0, mask, 0, 4); + assertEquals((byte)-1, mask[0]); + assertEquals((byte)0, mask[1]); + assertEquals((byte)-1, mask[2]); + assertEquals((byte)0, mask[3]); + + // Test offset-based select + int[] trueV = new int[]{100, 200, 300, 400}; + int[] falseV = new int[]{-1, -2, -3, -4}; + int[] result = new int[4]; + mask[0] = -1; mask[1] = 0; mask[2] = -1; mask[3] = 0; + simd.select(mask, 0, trueV, 0, falseV, 0, result, 0, 4); + assertEquals(100, result[0]); + assertEquals(-2, result[1]); + assertEquals(300, result[2]); + assertEquals(-4, result[3]); + } +} diff --git a/scripts/build-ios-app.sh b/scripts/build-ios-app.sh index c88bc3e795..0f3f020a52 100755 --- a/scripts/build-ios-app.sh +++ b/scripts/build-ios-app.sh @@ -91,6 +91,90 @@ mkdir -p "$ARTIFACTS_DIR" export CN1_BUILD_STATS_FILE="$ARTIFACTS_DIR/iphone-builder-stats.txt" +copy_tree_contents() { + local src="$1" + local dest="$2" + mkdir -p "$dest" + if command -v rsync >/dev/null 
# Print the directory under $1 that scores highest as a ByteCodeTranslator output
# directory. Scoring per directory: 100 points if it contains cn1_globals.m, 100 points
# if it contains xmlvm.h, plus one point for every .m/.c/.h file directly inside it.
# Pods, build, Build, DerivedData and xcuserdata subtrees are not considered.
# Returns 1 when $1 is not a directory or when no directory scores above zero.
find_bytecode_translator_sources() {
  local search_root="$1"
  local winner=""
  local winner_score=0
  local candidate candidate_score source_count

  if [ ! -d "$search_root" ]; then
    return 1
  fi

  while IFS= read -r candidate; do
    if [ ! -d "$candidate" ]; then
      continue
    fi

    candidate_score=0
    if [ -f "$candidate/cn1_globals.m" ]; then
      candidate_score=$((candidate_score + 100))
    fi
    if [ -f "$candidate/xmlvm.h" ]; then
      candidate_score=$((candidate_score + 100))
    fi

    # One pass counts every direct child with a source/header extension; a file has
    # exactly one extension, so this equals the sum of the per-extension counts.
    source_count="$(find "$candidate" -maxdepth 1 -type f \( -name '*.m' -o -name '*.c' -o -name '*.h' \) 2>/dev/null | wc -l | tr -d ' ')"
    candidate_score=$((candidate_score + source_count))

    # Strictly greater: on a tie the directory listed first by find is kept.
    if [ "$candidate_score" -gt "$winner_score" ]; then
      winner="$candidate"
      winner_score="$candidate_score"
    fi
  done < <(
    find "$search_root" -type d \
      ! -path '*/Pods/*' \
      ! -path '*/build/*' \
      ! -path '*/Build/*' \
      ! -path '*/DerivedData/*' \
      ! -path '*/xcuserdata/*' \
      2>/dev/null
  )

  if [ -z "$winner" ]; then
    return 1
  fi
  printf '%s\n' "$winner"
}

# Copy the detected ByteCodeTranslator sources of project $1 into
# $2/bytecode-translator-sources, write a sorted listing of the staged .m/.c/.h files
# (at most two levels deep) to $2/bytecode-translator-files.txt, and zip the staged
# directory to $2/bytecode-translator-sources.zip.
# Logs and returns 0 without staging anything when no source directory is detected.
stage_bytecode_translator_sources() {
  local project_dir="$1"
  local artifacts_dir="$2"
  local out_dir="$artifacts_dir/bytecode-translator-sources"
  local zip_file="$artifacts_dir/bytecode-translator-sources.zip"
  local listing_file="$artifacts_dir/bytecode-translator-files.txt"
  local bt_dir=""

  # "|| true" keeps a failed lookup from being treated as a command failure here.
  bt_dir="$(find_bytecode_translator_sources "$project_dir" || true)"
  if [ -z "$bt_dir" ]; then
    bia_log "ByteCodeTranslator source directory not found under $project_dir"
    return 0
  fi
  bia_log "Detected ByteCodeTranslator sources at $bt_dir"

  # Start from a clean slate so stale files from an earlier run are not archived.
  rm -rf "$out_dir" "$zip_file"
  mkdir -p "$out_dir"
  copy_tree_contents "$bt_dir" "$out_dir"

  find "$out_dir" -maxdepth 2 -type f \( -name '*.m' -o -name '*.c' -o -name '*.h' \) \
    | sort > "$listing_file" || true

  # Zip from inside the artifacts directory so the archive holds relative paths.
  (
    cd "$artifacts_dir"
    zip -qry "${zip_file##*/}" "${out_dir##*/}"
  )

  bia_log "Staged ByteCodeTranslator sources in $out_dir"
  bia_log "Created archive $zip_file"
}
simdEncodedWritten = Base64.encodeNoNewlineSimd(simdPayloadBytes, 0, simdPayloadBytes.length, simdEncodedBytes, 0, simdScratch); + if (simdEncodedWritten != encodedLen) { + fail("SIMD Base64 encode returned unexpected length"); + return false; + } + if (!byteArraysEqual(cn1EncodedBytes, simdEncodedBytes, encodedLen)) { + fail("SIMD Base64 encode mismatch"); + return false; + } + int simdDecodedWritten = Base64.decodeNoWhitespaceSimd(simdEncodedBytes, 0, encodedLen, simdDecodedBuffer, 0, simdScratch); + if (simdDecodedWritten != payloadBytes.length) { + fail("SIMD Base64 decode returned unexpected length"); + return false; + } + if (!byteArraysEqual(payloadBytes, simdDecodedBuffer, payloadBytes.length)) { + fail("SIMD Base64 decode mismatch"); + return false; + } + } if (!isIos()) { - warmup(nativeBase64, payload, payloadBytes, nativeEncoded, cn1EncodedBytes, cn1DecodedBuffer); + warmup(nativeBase64, payload, payloadBytes, nativeEncoded, cn1EncodedBytes, cn1DecodedBuffer, + runSimdBenchmark, simdPayloadBytes, simdEncodedBytes, simdDecodedBuffer, simdScratch, encodedLen); + } + if (runSimdBenchmark) { + warmup(nativeBase64, payload, payloadBytes, nativeEncoded, cn1EncodedBytes, cn1DecodedBuffer, + true, simdPayloadBytes, simdEncodedBytes, simdDecodedBuffer, simdScratch, encodedLen); } long nativeEncodeMs = measureNativeEncode(nativeBase64, payload); long cn1EncodeMs = measureCn1Encode(payloadBytes, cn1EncodedBytes); long nativeDecodeMs = measureNativeDecode(nativeBase64, nativeEncoded); long cn1DecodeMs = measureCn1Decode(cn1EncodedBytes, cn1DecodedBuffer); + long simdEncodeMs = runSimdBenchmark ? measureSimdEncode(simdPayloadBytes, simdEncodedBytes, simdScratch) : -1; + long simdDecodeMs = runSimdBenchmark ? 
measureSimdDecode(simdEncodedBytes, simdDecodedBuffer, simdScratch) : -1; double encodeRatio = cn1EncodeMs / Math.max(1.0, (double) nativeEncodeMs); double decodeRatio = cn1DecodeMs / Math.max(1.0, (double) nativeDecodeMs); @@ -80,17 +120,47 @@ public boolean runTest() { emitStat("Base64 native decode", formatMs(nativeDecodeMs)); emitStat("Base64 CN1 decode", formatMs(cn1DecodeMs)); emitStat("Base64 decode ratio (CN1/native)", formatRatio(decodeRatio)); + if (runSimdBenchmark) { + double simdEncodeRatioVsNative = simdEncodeMs / Math.max(1.0, (double) nativeEncodeMs); + double simdDecodeRatioVsNative = simdDecodeMs / Math.max(1.0, (double) nativeDecodeMs); + double simdEncodeRatioVsCn1 = simdEncodeMs / Math.max(1.0, (double) cn1EncodeMs); + double simdDecodeRatioVsCn1 = simdDecodeMs / Math.max(1.0, (double) cn1DecodeMs); + emitStat("Base64 SIMD encode", formatMs(simdEncodeMs)); + emitStat("Base64 encode ratio (SIMD/native)", formatRatio(simdEncodeRatioVsNative)); + emitStat("Base64 encode ratio (SIMD/CN1)", formatRatio(simdEncodeRatioVsCn1)); + emitStat("Base64 SIMD decode", formatMs(simdDecodeMs)); + emitStat("Base64 decode ratio (SIMD/native)", formatRatio(simdDecodeRatioVsNative)); + emitStat("Base64 decode ratio (SIMD/CN1)", formatRatio(simdDecodeRatioVsCn1)); + } done(); return true; } - private static void warmup(Base64Native nativeBase64, String payload, byte[] payloadBytes, String nativeEncoded, byte[] cn1EncodedBytes, byte[] cn1DecodedBuffer) { + private static void warmup(Base64Native nativeBase64, String payload, byte[] payloadBytes, String nativeEncoded, byte[] cn1EncodedBytes, + byte[] cn1DecodedBuffer, boolean includeSimd, byte[] simdPayloadBytes, byte[] simdEncodedBytes, + byte[] simdDecodedBuffer, int[] simdScratch, int encodedLen) { for (int i = 0; i < 40; i++) { nativeBase64.encodeUtf8(payload); - Base64.encodeNoNewline(payloadBytes, cn1EncodedBytes); + int cn1EncodedWritten = Base64.encodeNoNewline(payloadBytes, cn1EncodedBytes); + if 
(cn1EncodedWritten != encodedLen) { + throw new IllegalStateException("Warmup CN1 encode length mismatch"); + } nativeBase64.decodeToUtf8(nativeEncoded); - Base64.decode(cn1EncodedBytes, cn1DecodedBuffer); + int cn1DecodedWritten = Base64.decode(cn1EncodedBytes, cn1DecodedBuffer); + if (cn1DecodedWritten != payloadBytes.length || !byteArraysEqual(payloadBytes, cn1DecodedBuffer, payloadBytes.length)) { + throw new IllegalStateException("Warmup CN1 decode mismatch"); + } + if (includeSimd) { + int simdEncodedWritten = Base64.encodeNoNewlineSimd(simdPayloadBytes, 0, simdPayloadBytes.length, simdEncodedBytes, 0, simdScratch); + if (simdEncodedWritten != encodedLen || !byteArraysEqual(cn1EncodedBytes, simdEncodedBytes, encodedLen)) { + throw new IllegalStateException("Warmup SIMD encode mismatch"); + } + int simdDecodedWritten = Base64.decodeNoWhitespaceSimd(simdEncodedBytes, 0, encodedLen, simdDecodedBuffer, 0, simdScratch); + if (simdDecodedWritten != payloadBytes.length || !byteArraysEqual(payloadBytes, simdDecodedBuffer, payloadBytes.length)) { + throw new IllegalStateException("Warmup SIMD decode mismatch"); + } + } } } @@ -126,6 +196,22 @@ private static long measureCn1Decode(byte[] encoded, byte[] outputBuffer) { return System.currentTimeMillis() - start; } + private static long measureSimdEncode(byte[] payloadBytes, byte[] outputBuffer, int[] scratch) { + long start = System.currentTimeMillis(); + for (int i = 0; i < ITERATIONS; i++) { + Base64.encodeNoNewlineSimd(payloadBytes, 0, payloadBytes.length, outputBuffer, 0, scratch); + } + return System.currentTimeMillis() - start; + } + + private static long measureSimdDecode(byte[] encoded, byte[] outputBuffer, int[] scratch) { + long start = System.currentTimeMillis(); + for (int i = 0; i < ITERATIONS; i++) { + Base64.decodeNoWhitespaceSimd(encoded, 0, encoded.length, outputBuffer, 0, scratch); + } + return System.currentTimeMillis() - start; + } + private static String decodeUtf8(String base64) { try { return new 
String(Base64.decode(base64.getBytes()), "UTF-8"); @@ -147,6 +233,21 @@ private static boolean isIos() { return platformName != null && platformName.toLowerCase().contains("ios"); } + private static boolean byteArraysEqual(byte[] a, byte[] b, int len) { + if (a == b) { + return true; + } + if (a == null || b == null || a.length < len || b.length < len) { + return false; + } + for (int i = 0; i < len; i++) { + if (a[i] != b[i]) { + return false; + } + } + return true; + } + private static String formatMs(double millis) { return formatDecimal(millis, 3) + " ms"; } diff --git a/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Cn1ssDeviceRunner.java b/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Cn1ssDeviceRunner.java index 7eb91d2869..5cd364a492 100644 --- a/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Cn1ssDeviceRunner.java +++ b/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Cn1ssDeviceRunner.java @@ -80,6 +80,7 @@ public final class Cn1ssDeviceRunner extends DeviceRunner { new OrientationLockScreenshotTest(), new InPlaceEditViewTest(), new BytecodeTranslatorRegressionTest(), + new SimdApiTest(), new StreamApiTest(), new TimeApiTest(), new Java17Tests(), diff --git a/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/SimdApiTest.java b/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/SimdApiTest.java new file mode 100644 index 0000000000..36c07f55a6 --- /dev/null +++ b/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/SimdApiTest.java @@ -0,0 +1,94 @@ +package com.codenameone.examples.hellocodenameone.tests; + +import com.codename1.ui.CN; +import com.codename1.util.Simd; + +public class SimdApiTest extends BaseTest { + @Override + public boolean 
runTest() { + try { + Simd simd = Simd.get(); + if (!simd.isSupported()) { + int[] a = new int[]{1, 2, 3, 4}; + int[] b = new int[]{9, 8, 7, 6}; + int[] out = new int[4]; + simd.add(a, b, out, 0, 4); + if (out[0] != 10 || out[1] != 10 || out[2] != 10 || out[3] != 10) { + fail("Fallback SIMD API add failed on unsupported platform"); + return false; + } + done(); + return true; + } + + int[] a = simd.allocInt(16); + int[] b = simd.allocInt(16); + int[] out = simd.allocInt(16); + for (int i = 0; i < 8; i++) { + a[i] = i + 1; + b[i] = 9 - i; + } + simd.add(a, b, out, 0, 8); + for (int i = 0; i < 8; i++) { + if (out[i] != 10) { + fail("Unexpected int add result at " + i + ": " + out[i]); + return false; + } + } + + float[] fa = simd.allocFloat(16); + float[] fb = simd.allocFloat(16); + float[] fo = simd.allocFloat(16); + fa[0] = 1.5f; + fa[1] = -2f; + fa[2] = 3f; + fa[3] = -4f; + fb[0] = 2f; + fb[1] = 3f; + fb[2] = -1f; + fb[3] = 0.5f; + simd.mul(fa, fb, fo, 0, 4); + if (Math.abs(fo[0] - 3f) > 0.0001f || Math.abs(fo[1] + 6f) > 0.0001f + || Math.abs(fo[2] + 3f) > 0.0001f || Math.abs(fo[3] + 2f) > 0.0001f) { + fail("Unexpected float mul results"); + return false; + } + + byte[] ba = simd.allocByte(16); + byte[] bb = simd.allocByte(16); + byte[] bo = simd.allocByte(16); + ba[0] = 120; + ba[1] = 10; + ba[2] = -120; + bb[0] = 20; + bb[1] = -40; + bb[2] = -20; + simd.add(ba, bb, bo, 0, 3); + if (bo[0] != 127 || bo[1] != -30 || bo[2] != -128) { + fail("Unexpected saturating byte add results"); + return false; + } + + if (CN.isSimulator()) { + try { + simd.add(new int[4], new int[4], new int[4], 0, 4); + fail("Expected simulator registry guard to reject non-alloc arrays"); + return false; + } catch (IllegalArgumentException expected) { + // expected + } + } + + done(); + return true; + } catch (Throwable t) { + fail("SimdApiTest failed: " + t); + return false; + } + } + + @Override + public boolean shouldTakeScreenshot() { + return false; + } +} diff --git 
a/vm/ByteCodeTranslator/src/cn1_globals.h b/vm/ByteCodeTranslator/src/cn1_globals.h index 5b3c8bfebe..dbd1cc1dac 100644 --- a/vm/ByteCodeTranslator/src/cn1_globals.h +++ b/vm/ByteCodeTranslator/src/cn1_globals.h @@ -10,6 +10,7 @@ #include #include #include +#include //#define DEBUG_GC_ALLOCATIONS @@ -1085,6 +1086,7 @@ extern void arrayFinalizerFunction(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT array) extern void gcReleaseObj(JAVA_OBJECT o); extern JAVA_OBJECT allocArray(CODENAME_ONE_THREAD_STATE, int length, struct clazz* type, int primitiveSize, int dim); +extern JAVA_OBJECT allocArrayAligned(CODENAME_ONE_THREAD_STATE, int length, struct clazz* type, int primitiveSize, int dim, int alignment); extern JAVA_OBJECT allocMultiArray(int* lengths, struct clazz* type, int primitiveSize, int dim); extern JAVA_OBJECT alloc2DArray(CODENAME_ONE_THREAD_STATE, int length1, int length2, struct clazz* parentType, struct clazz* childType, int primitiveSize); extern JAVA_OBJECT alloc3DArray(CODENAME_ONE_THREAD_STATE, int length1, int length2, int length3, struct clazz* parentType, struct clazz* childType, struct clazz* grandChildType, int primitiveSize); diff --git a/vm/ByteCodeTranslator/src/cn1_globals.m b/vm/ByteCodeTranslator/src/cn1_globals.m index 3269fa92e1..4b3e883111 100644 --- a/vm/ByteCodeTranslator/src/cn1_globals.m +++ b/vm/ByteCodeTranslator/src/cn1_globals.m @@ -1221,6 +1221,31 @@ JAVA_OBJECT allocArray(CODENAME_ONE_THREAD_STATE, int length, struct clazz* type return (JAVA_OBJECT)array; } +JAVA_OBJECT allocArrayAligned(CODENAME_ONE_THREAD_STATE, int length, struct clazz* type, int primitiveSize, int dim, int alignment) { + int actualSize = length * primitiveSize; + int requestedAlignment = alignment; + if (requestedAlignment < (int)sizeof(void*)) { + requestedAlignment = (int)sizeof(void*); + } + if ((requestedAlignment & (requestedAlignment - 1)) != 0) { + requestedAlignment = 16; + } + int extraPadding = requestedAlignment - 1; + JAVA_ARRAY array = 
(JAVA_ARRAY)codenameOneGcMalloc(threadStateData, sizeof(struct JavaArrayPrototype) + actualSize + sizeof(void*) + extraPadding, type); + (*array).length = length; + (*array).dimensions = dim; + (*array).primitiveSize = primitiveSize; + if (actualSize > 0) { + char* arr = (char*)(&(array->data)); + arr += sizeof(void*); + uintptr_t aligned = (((uintptr_t)arr) + ((uintptr_t)requestedAlignment - 1)) & ~((uintptr_t)requestedAlignment - 1); + (*array).data = (void*)aligned; + } else { + (*array).data = 0; + } + return (JAVA_OBJECT)array; +} + JAVA_OBJECT alloc2DArray(CODENAME_ONE_THREAD_STATE, int length2, int length1, struct clazz* parentType, struct clazz* childType, int primitiveSize) { JAVA_ARRAY base = (JAVA_ARRAY)allocArray(threadStateData, length1, parentType, sizeof(JAVA_OBJECT), 2); JAVA_ARRAY_OBJECT* objs = base->data;