From 11c999d5cd3e879f7d5c4c88644816c7de25054f Mon Sep 17 00:00:00 2001 From: Shai Almog <67850168+shai-almog@users.noreply.github.com> Date: Sun, 12 Apr 2026 20:57:20 +0300 Subject: [PATCH 01/12] Initial Simd API --- .../impl/CodenameOneImplementation.java | 7 + CodenameOne/src/com/codename1/ui/CN.java | 6 + CodenameOne/src/com/codename1/ui/Display.java | 15 + CodenameOne/src/com/codename1/util/Simd.java | 326 ++++++++++++ .../com/codename1/impl/javase/JavaSEPort.java | 6 + .../com/codename1/impl/javase/JavaSESimd.java | 242 +++++++++ Ports/iOSPort/nativeSources/IOSSimd.m | 484 ++++++++++++++++++ .../codename1/impl/ios/IOSImplementation.java | 6 + .../src/com/codename1/impl/ios/IOSSimd.java | 119 +++++ .../java/com/codename1/util/SimdTest.java | 60 +++ .../tests/Cn1ssDeviceRunner.java | 1 + .../hellocodenameone/tests/SimdApiTest.java | 86 ++++ vm/ByteCodeTranslator/src/cn1_globals.h | 2 + vm/ByteCodeTranslator/src/cn1_globals.m | 25 + 14 files changed, 1385 insertions(+) create mode 100644 CodenameOne/src/com/codename1/util/Simd.java create mode 100644 Ports/JavaSE/src/com/codename1/impl/javase/JavaSESimd.java create mode 100644 Ports/iOSPort/nativeSources/IOSSimd.m create mode 100644 Ports/iOSPort/src/com/codename1/impl/ios/IOSSimd.java create mode 100644 maven/core-unittests/src/test/java/com/codename1/util/SimdTest.java create mode 100644 scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/SimdApiTest.java diff --git a/CodenameOne/src/com/codename1/impl/CodenameOneImplementation.java b/CodenameOne/src/com/codename1/impl/CodenameOneImplementation.java index 4777cde002..1190843222 100644 --- a/CodenameOne/src/com/codename1/impl/CodenameOneImplementation.java +++ b/CodenameOne/src/com/codename1/impl/CodenameOneImplementation.java @@ -85,6 +85,7 @@ import com.codename1.ui.util.ImageIO; import com.codename1.util.AsyncResource; import com.codename1.util.FailureCallback; +import com.codename1.util.Simd; import com.codename1.util.StringUtil; import com.codename1.util.SuccessCallback; @@ -8397,6 +8398,12 @@ public ImageIO getImageIO() { return null; } + /// Creates the SIMD implementation for this platform. + /// Ports may override this to provide accelerated SIMD behavior. + public Simd createSimd() { + return new Simd(); + } + /// Workaround for XMLVM bug public boolean instanceofObjArray(Object o) { return o instanceof Object[]; diff --git a/CodenameOne/src/com/codename1/ui/CN.java b/CodenameOne/src/com/codename1/ui/CN.java index 9bcce255ea..62470ab8da 100644 --- a/CodenameOne/src/com/codename1/ui/CN.java +++ b/CodenameOne/src/com/codename1/ui/CN.java @@ -36,6 +36,7 @@ import com.codename1.ui.events.WindowEvent; import com.codename1.ui.geom.Dimension; import com.codename1.ui.geom.Rectangle; +import com.codename1.util.Simd; import com.codename1.util.RunnableWithResultSync; import java.io.IOException; @@ -1032,6 +1033,11 @@ public static String getPlatformName() { return Display.impl.getPlatformName(); } + /// Returns the SIMD API for the current platform. + public static Simd getSimd() { + return Display.getInstance().getSimd(); + } + /// Opens the device Dialer application with the given phone number /// diff --git a/CodenameOne/src/com/codename1/ui/Display.java b/CodenameOne/src/com/codename1/ui/Display.java index c56aaf780d..34bf4c4034 100644 --- a/CodenameOne/src/com/codename1/ui/Display.java +++ b/CodenameOne/src/com/codename1/ui/Display.java @@ -60,6 +60,7 @@ import com.codename1.ui.util.EventDispatcher; import com.codename1.ui.util.ImageIO; import com.codename1.util.AsyncResource; +import com.codename1.util.Simd; import com.codename1.util.RunnableWithResultSync; import com.codename1.util.SuccessCallback; @@ -216,6 +217,7 @@ public final class Display extends CN1Constants { long time; private int transitionDelay = -1; private String selectedVirtualKeyboard = null; + private Simd simd; private CrashReport crashReporter; private EventDispatcher errorHandler; private boolean inNativeUI; @@ -343,6 +345,7 @@ public static void init(Object m) { commandBehaviour = impl.getCommandBehavior(); } impl = (CodenameOneImplementation) ImplementationFactory.getInstance().createImplementation(); + INSTANCE.simd = null; impl.setDisplayLock(lock); impl.initImpl(m); @@ -493,6 +496,18 @@ CodenameOneImplementation getImplementation() { return impl; } + /// Returns the SIMD API instance bound to the current implementation. + public Simd getSimd() { + if (simd == null) { + Simd created = impl.createSimd(); + if (created == null) { + created = new Simd(); + } + simd = created; + } + return simd; + } + /// Indicates the maximum frames the API will try to draw every second /// by default this is set to 10. The advantage of limiting /// framerate is to allow the CPU to perform other tasks besides drawing. diff --git a/CodenameOne/src/com/codename1/util/Simd.java b/CodenameOne/src/com/codename1/util/Simd.java new file mode 100644 index 0000000000..53e268c4d7 --- /dev/null +++ b/CodenameOne/src/com/codename1/util/Simd.java @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2026, Codename One and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Codename One designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + */ +package com.codename1.util; + +import com.codename1.ui.CN; + +/** + * Portable SIMD API with Java fallback implementations. + */ +public class Simd { + + public static Simd get() { + return CN.getSimd(); + } + + public boolean isSupported() { + return false; + } + + public byte[] allocByte(int size) { + if (size < 16) { + throw new IllegalArgumentException("size must be >= 16"); + } + return new byte[size]; + } + + public int[] allocInt(int size) { + if (size < 16) { + throw new IllegalArgumentException("size must be >= 16"); + } + return new int[size]; + } + + public float[] allocFloat(int size) { + if (size < 16) { + throw new IllegalArgumentException("size must be >= 16"); + } + return new float[size]; + } + + public void add(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = clampByte(srcA[i] + srcB[i]); + } + } + + public void sub(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = clampByte(srcA[i] - srcB[i]); + } + } + + public void mul(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = clampByte(srcA[i] * srcB[i]); + } + } + + public void min(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] < srcB[i] ? srcA[i] : srcB[i]; + } + } + + public void max(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] > srcB[i] ? srcA[i] : srcB[i]; + } + } + + public void abs(byte[] src, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + int v = src[i]; + if (v == Byte.MIN_VALUE) { + dst[i] = Byte.MAX_VALUE; + } else { + dst[i] = (byte)Math.abs(v); + } + } + } + + public void clamp(byte[] src, byte[] dst, byte minValue, byte maxValue, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + int v = src[i]; + if (v < minValue) { + dst[i] = minValue; + } else if (v > maxValue) { + dst[i] = maxValue; + } else { + dst[i] = (byte)v; + } + } + } + + public void add(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] + srcB[i]; + } + } + + public void sub(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] - srcB[i]; + } + } + + public void mul(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] * srcB[i]; + } + } + + public void min(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] < srcB[i] ? srcA[i] : srcB[i]; + } + } + + public void max(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] > srcB[i] ? srcA[i] : srcB[i]; + } + } + + public void abs(int[] src, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + int v = src[i]; + dst[i] = v == Integer.MIN_VALUE ? Integer.MAX_VALUE : Math.abs(v); + } + } + + public void clamp(int[] src, int[] dst, int minValue, int maxValue, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + int v = src[i]; + if (v < minValue) { + dst[i] = minValue; + } else if (v > maxValue) { + dst[i] = maxValue; + } else { + dst[i] = v; + } + } + } + + public int sum(int[] src, int offset, int length) { + int out = 0; + for (int i = offset, end = offset + length; i < end; i++) { + out += src[i]; + } + return out; + } + + public int dot(int[] srcA, int[] srcB, int offset, int length) { + int out = 0; + for (int i = offset, end = offset + length; i < end; i++) { + out += srcA[i] * srcB[i]; + } + return out; + } + + public void add(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] + srcB[i]; + } + } + + public void sub(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] - srcB[i]; + } + } + + public void mul(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] * srcB[i]; + } + } + + public void min(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = Math.min(srcA[i], srcB[i]); + } + } + + public void max(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = Math.max(srcA[i], srcB[i]); + } + } + + public void abs(float[] src, float[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = Math.abs(src[i]); + } + } + + public void clamp(float[] src, float[] dst, float minValue, float maxValue, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + float v = src[i]; + if (v < minValue) { + dst[i] = minValue; + } else if (v > maxValue) { + dst[i] = maxValue; + } else { + dst[i] = v; + } + } + } + + public float sum(float[] src, int offset, int length) { + float out = 0f; + for (int i = offset, end = offset + length; i < end; i++) { + out += src[i]; + } + return out; + } + + public float dot(float[] srcA, float[] srcB, int offset, int length) { + float out = 0f; + for (int i = offset, end = offset + length; i < end; i++) { + out += srcA[i] * srcB[i]; + } + return out; + } + + protected final void validateBinaryByte(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dst, "dst"); + validateRange(srcA.length, offset, length, "srcA"); + validateRange(srcB.length, offset, length, "srcB"); + validateRange(dst.length, offset, length, "dst"); + } + + protected final void validateUnaryByte(byte[] src, byte[] dst, int offset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, offset, length, "src"); + validateRange(dst.length, offset, length, "dst"); + } + + protected final void validateBinaryInt(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dst, "dst"); + validateRange(srcA.length, offset, length, "srcA"); + validateRange(srcB.length, offset, length, "srcB"); + validateRange(dst.length, offset, length, "dst"); + } + + protected final void validateUnaryInt(int[] src, int[] dst, int offset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, offset, length, "src"); + validateRange(dst.length, offset, length, "dst"); + } + + protected final void validateReductionInt(int[] src, int offset, int length) { + validateNotNull(src, "src"); + validateRange(src.length, offset, length, "src"); + } + + protected final void validateDotInt(int[] srcA, int[] srcB, int offset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateRange(srcA.length, offset, length, "srcA"); + validateRange(srcB.length, offset, length, "srcB"); + } + + protected final void validateBinaryFloat(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dst, "dst"); + validateRange(srcA.length, offset, length, "srcA"); + validateRange(srcB.length, offset, length, "srcB"); + validateRange(dst.length, offset, length, "dst"); + } + + protected final void validateUnaryFloat(float[] src, float[] dst, int offset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, offset, length, "src"); + validateRange(dst.length, offset, length, "dst"); + } + + protected final void validateReductionFloat(float[] src, int offset, int length) { + validateNotNull(src, "src"); + validateRange(src.length, offset, length, "src"); + } + + protected final void validateDotFloat(float[] srcA, float[] srcB, int offset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateRange(srcA.length, offset, length, "srcA"); + validateRange(srcB.length, offset, length, "srcB"); + } + + protected final void validateNotNull(Object o, String name) { + if (o == null) { + throw new NullPointerException(name + " is null"); + } + } + + protected final void validateRange(int arrayLength, int offset, int length, String name) { + if (offset < 0 || length < 0 || offset > arrayLength || arrayLength - offset < length) { + throw new ArrayIndexOutOfBoundsException(name + " invalid range offset=" + offset + " length=" + length + " size=" + arrayLength); + } + } + + private byte clampByte(int value) { + if (value > Byte.MAX_VALUE) { + return Byte.MAX_VALUE; + } + if (value < Byte.MIN_VALUE) { + return Byte.MIN_VALUE; + } + return (byte)value; + } +} diff --git a/Ports/JavaSE/src/com/codename1/impl/javase/JavaSEPort.java b/Ports/JavaSE/src/com/codename1/impl/javase/JavaSEPort.java index bcf4c861e1..148699a232 100644 --- a/Ports/JavaSE/src/com/codename1/impl/javase/JavaSEPort.java +++ b/Ports/JavaSE/src/com/codename1/impl/javase/JavaSEPort.java @@ -118,6 +118,7 @@ import com.codename1.ui.util.UITimer; import com.codename1.util.AsyncResource; import com.codename1.util.Callback; +import com.codename1.util.Simd; import com.jhlabs.image.GaussianFilter; import java.awt.*; import java.awt.datatransfer.Clipboard; @@ -10753,6 +10754,11 @@ public String getPlatformName() { return platformName; } + @Override + public Simd createSimd() { + return new JavaSESimd(); + } + /** * @inheritDoc */ diff --git a/Ports/JavaSE/src/com/codename1/impl/javase/JavaSESimd.java b/Ports/JavaSE/src/com/codename1/impl/javase/JavaSESimd.java new file mode 100644 index 0000000000..b685daeb53 --- /dev/null +++ b/Ports/JavaSE/src/com/codename1/impl/javase/JavaSESimd.java @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2026, Codename One and/or its affiliates. All rights reserved. + */ +package com.codename1.impl.javase; + +import com.codename1.ui.CN; +import com.codename1.util.Simd; + +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +/** + * JavaSE SIMD implementation used for simulator validation and fallback execution. + */ +public class JavaSESimd extends Simd { + private final Set allocatedIds = Collections.synchronizedSet(new HashSet()); + + @Override + public boolean isSupported() { + return true; + } + + @Override + public byte[] allocByte(int size) { + byte[] out = super.allocByte(size); + allocatedIds.add(Integer.valueOf(System.identityHashCode(out))); + return out; + } + + @Override + public int[] allocInt(int size) { + int[] out = super.allocInt(size); + allocatedIds.add(Integer.valueOf(System.identityHashCode(out))); + return out; + } + + @Override + public float[] allocFloat(int size) { + float[] out = super.allocFloat(size); + allocatedIds.add(Integer.valueOf(System.identityHashCode(out))); + return out; + } + + @Override + public void add(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.add(srcA, srcB, dst, offset, length); + } + + @Override + public void sub(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.sub(srcA, srcB, dst, offset, length); + } + + @Override + public void mul(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.mul(srcA, srcB, dst, offset, length); + } + + @Override + public void min(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.min(srcA, srcB, dst, offset, length); + } + + @Override + public void max(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.max(srcA, srcB, dst, offset, length); + } + + @Override + public void abs(byte[] src, byte[] dst, int offset, int length) { + validateUnaryByte(src, dst, offset, length); + validateRegistered(src, dst); + super.abs(src, dst, offset, length); + } + + @Override + public void clamp(byte[] src, byte[] dst, byte minValue, byte maxValue, int offset, int length) { + if (minValue > maxValue) { + throw new IllegalArgumentException("minValue > maxValue"); + } + validateUnaryByte(src, dst, offset, length); + validateRegistered(src, dst); + super.clamp(src, dst, minValue, maxValue, offset, length); + } + + @Override + public void add(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.add(srcA, srcB, dst, offset, length); + } + + @Override + public void sub(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.sub(srcA, srcB, dst, offset, length); + } + + @Override + public void mul(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.mul(srcA, srcB, dst, offset, length); + } + + @Override + public void min(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.min(srcA, srcB, dst, offset, length); + } + + @Override + public void max(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.max(srcA, srcB, dst, offset, length); + } + + @Override + public void abs(int[] src, int[] dst, int offset, int length) { + validateUnaryInt(src, dst, offset, length); + validateRegistered(src, dst); + super.abs(src, dst, offset, length); + } + + @Override + public void clamp(int[] src, int[] dst, int minValue, int maxValue, int offset, int length) { + if (minValue > maxValue) { + throw new IllegalArgumentException("minValue > maxValue"); + } + validateUnaryInt(src, dst, offset, length); + validateRegistered(src, dst); + super.clamp(src, dst, minValue, maxValue, offset, length); + } + + @Override + public int sum(int[] src, int offset, int length) { + validateReductionInt(src, offset, length); + validateRegistered(src); + return super.sum(src, offset, length); + } + + @Override + public int dot(int[] srcA, int[] srcB, int offset, int length) { + validateDotInt(srcA, srcB, offset, length); + validateRegistered(srcA, srcB); + return super.dot(srcA, srcB, offset, length); + } + + @Override + public void add(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + validateBinaryFloat(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.add(srcA, srcB, dst, offset, length); + } + + @Override + public void sub(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + validateBinaryFloat(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.sub(srcA, srcB, dst, offset, length); + } + + @Override + public void mul(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + validateBinaryFloat(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.mul(srcA, srcB, dst, offset, length); + } + + @Override + public void min(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + validateBinaryFloat(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.min(srcA, srcB, dst, offset, length); + } + + @Override + public void max(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + validateBinaryFloat(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.max(srcA, srcB, dst, offset, length); + } + + @Override + public void abs(float[] src, float[] dst, int offset, int length) { + validateUnaryFloat(src, dst, offset, length); + validateRegistered(src, dst); + super.abs(src, dst, offset, length); + } + + @Override + public void clamp(float[] src, float[] dst, float minValue, float maxValue, int offset, int length) { + if (minValue > maxValue) { + throw new IllegalArgumentException("minValue > maxValue"); + } + validateUnaryFloat(src, dst, offset, length); + validateRegistered(src, dst); + super.clamp(src, dst, minValue, maxValue, offset, length); + } + + @Override + public float sum(float[] src, int offset, int length) { + validateReductionFloat(src, offset, length); + validateRegistered(src); + return super.sum(src, offset, length); + } + + @Override + public float dot(float[] srcA, float[] srcB, int offset, int length) { + validateDotFloat(srcA, srcB, offset, length); + validateRegistered(srcA, srcB); + return super.dot(srcA, srcB, offset, length); + } + + private void validateRegistered(Object... arrays) { + if (!CN.isSimulator()) { + return; + } + for (int i = 0; i < arrays.length; i++) { + Object arr = arrays[i]; + Integer id = Integer.valueOf(System.identityHashCode(arr)); + if (!allocatedIds.contains(id)) { + throw new IllegalArgumentException( + "SIMD array argument was not allocated using Simd.alloc*(). objectId=" + id.intValue()); + } + } + } +} diff --git a/Ports/iOSPort/nativeSources/IOSSimd.m b/Ports/iOSPort/nativeSources/IOSSimd.m new file mode 100644 index 0000000000..c80fe9005d --- /dev/null +++ b/Ports/iOSPort/nativeSources/IOSSimd.m @@ -0,0 +1,484 @@ +#include "xmlvm.h" +#include +#include +#include + +static JAVA_ARRAY_BYTE cn1_saturating_byte(int value) { + if (value > 127) { + return 127; + } + if (value < -128) { + return -128; + } + return (JAVA_ARRAY_BYTE)value; +} + +JAVA_OBJECT com_codename1_impl_ios_IOSSimd_allocByteNative___int_R_byte_1ARRAY(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_INT size) { + return allocArrayAligned(threadStateData, size, &class_array1__JAVA_BYTE, sizeof(JAVA_ARRAY_BYTE), 1, 16); +} + +JAVA_OBJECT com_codename1_impl_ios_IOSSimd_allocIntNative___int_R_int_1ARRAY(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_INT size) { + return allocArrayAligned(threadStateData, size, &class_array1__JAVA_INT, sizeof(JAVA_ARRAY_INT), 1, 16); +} + +JAVA_OBJECT com_codename1_impl_ios_IOSSimd_allocFloatNative___int_R_float_1ARRAY(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_INT size) { + return allocArrayAligned(threadStateData, size, &class_array1__JAVA_FLOAT, sizeof(JAVA_ARRAY_FLOAT), 1, 16); +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_add___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + int8x16_t vc = vqaddq_s8(va, vb); + vst1q_s8((int8_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = cn1_saturating_byte((int)a[i] + (int)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_sub___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + int8x16_t vc = vqsubq_s8(va, vb); + vst1q_s8((int8_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = cn1_saturating_byte((int)a[i] - (int)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_mul___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + int16x8_t low = vmull_s8(vget_low_s8(va), vget_low_s8(vb)); + int16x8_t high = vmull_s8(vget_high_s8(va), vget_high_s8(vb)); + int8x8_t low8 = vqmovn_s16(low); + int8x8_t high8 = vqmovn_s16(high); + int8x16_t out = vcombine_s8(low8, high8); + vst1q_s8((int8_t*)(d + i), out); + } + for (; i < end; i++) { + d[i] = cn1_saturating_byte((int)a[i] * (int)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_min___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + int8x16_t vc = vminq_s8(va, vb); + vst1q_s8((int8_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = a[i] < b[i] ? a[i] : b[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_max___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + int8x16_t vc = vmaxq_s8(va, vb); + vst1q_s8((int8_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = a[i] > b[i] ? a[i] : b[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_abs___byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t vs = vld1q_s8((int8_t*)(s + i)); + int8x16_t vd = vqabsq_s8(vs); + vst1q_s8((int8_t*)(d + i), vd); + } + for (; i < end; i++) { + int v = s[i]; + d[i] = v == -128 ? 127 : (JAVA_ARRAY_BYTE)abs(v); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_clamp___byte_1ARRAY_byte_1ARRAY_byte_byte_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_BYTE minValue, JAVA_BYTE maxValue, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + int8x16_t vminv = vdupq_n_s8((int8_t)minValue); + int8x16_t vmaxv = vdupq_n_s8((int8_t)maxValue); + for (; i <= end - 16; i += 16) { + int8x16_t vs = vld1q_s8((int8_t*)(s + i)); + int8x16_t vc = vmaxq_s8(vminv, vminq_s8(vs, vmaxv)); + vst1q_s8((int8_t*)(d + i), vc); + } + for (; i < end; i++) { + int v = s[i]; + if (v < minValue) { + d[i] = minValue; + } else if (v > maxValue) { + d[i] = maxValue; + } else { + d[i] = (JAVA_ARRAY_BYTE)v; + } + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_add___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + int32x4_t va = vld1q_s32((int32_t*)(a + i)); + int32x4_t vb = vld1q_s32((int32_t*)(b + i)); + int32x4_t vc = vaddq_s32(va, vb); + vst1q_s32((int32_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)((int32_t)a[i] + (int32_t)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_sub___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + int32x4_t va = vld1q_s32((int32_t*)(a + i)); + int32x4_t vb = vld1q_s32((int32_t*)(b + i)); + int32x4_t vc = vsubq_s32(va, vb); + vst1q_s32((int32_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)((int32_t)a[i] - (int32_t)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_mul___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + int32x4_t va = vld1q_s32((int32_t*)(a + i)); + int32x4_t vb = vld1q_s32((int32_t*)(b + i)); + int32x4_t vc = vmulq_s32(va, vb); + vst1q_s32((int32_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)((int32_t)a[i] * (int32_t)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_min___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + int32x4_t va = vld1q_s32((int32_t*)(a + i)); + int32x4_t vb = vld1q_s32((int32_t*)(b + i)); + int32x4_t vc = vminq_s32(va, vb); + vst1q_s32((int32_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = a[i] < b[i] ? a[i] : b[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_max___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + int32x4_t va = vld1q_s32((int32_t*)(a + i)); + int32x4_t vb = vld1q_s32((int32_t*)(b + i)); + int32x4_t vc = vmaxq_s32(va, vb); + vst1q_s32((int32_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = a[i] > b[i] ? a[i] : b[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_abs___int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + int32x4_t vs = vld1q_s32((int32_t*)(s + i)); + int32x4_t vd = vqabsq_s32(vs); + vst1q_s32((int32_t*)(d + i), vd); + } + for (; i < end; i++) { + int32_t v = (int32_t)s[i]; + d[i] = (JAVA_ARRAY_INT)(v == INT32_MIN ? INT32_MAX : (v < 0 ? -v : v)); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_clamp___int_1ARRAY_int_1ARRAY_int_int_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT minValue, JAVA_INT maxValue, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + int32x4_t vminv = vdupq_n_s32((int32_t)minValue); + int32x4_t vmaxv = vdupq_n_s32((int32_t)maxValue); + for (; i <= end - 4; i += 4) { + int32x4_t vs = vld1q_s32((int32_t*)(s + i)); + int32x4_t vc = vmaxq_s32(vminv, vminq_s32(vs, vmaxv)); + vst1q_s32((int32_t*)(d + i), vc); + } + for (; i < end; i++) { + int v = s[i]; + if (v < minValue) { + d[i] = minValue; + } else if (v > maxValue) { + d[i] = maxValue; + } else { + d[i] = (JAVA_ARRAY_INT)v; + } + } +} + +JAVA_INT com_codename1_impl_ios_IOSSimd_sum___int_1ARRAY_int_int_R_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + int i = offset; + int end = offset + length; + int64_t total = 0; + int32x4_t vacc = vdupq_n_s32(0); + for (; i <= end - 4; i += 4) { + int32x4_t vs = vld1q_s32((int32_t*)(s + i)); + vacc = vaddq_s32(vacc, vs); + } + int32_t partial[4]; + vst1q_s32(partial, vacc); + total += (int64_t)partial[0] + (int64_t)partial[1] + (int64_t)partial[2] + (int64_t)partial[3]; + for (; i < end; i++) { + total += (int64_t)((int32_t)s[i]); + } + return (JAVA_INT)((int32_t)total); +} + +JAVA_INT com_codename1_impl_ios_IOSSimd_dot___int_1ARRAY_int_1ARRAY_int_int_R_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + int i = offset; + int end = offset + length; + int64_t total = 0; + int32x4_t vacc = vdupq_n_s32(0); + for (; i <= end - 4; i += 4) { + int32x4_t va = vld1q_s32((int32_t*)(a + i)); + int32x4_t vb = vld1q_s32((int32_t*)(b + i)); + vacc = vaddq_s32(vacc, vmulq_s32(va, vb)); + } + int32_t partial[4]; + vst1q_s32(partial, vacc); + total += (int64_t)partial[0] + (int64_t)partial[1] + (int64_t)partial[2] + (int64_t)partial[3]; + for (; i < end; i++) { + total += (int64_t)((int32_t)a[i]) * (int64_t)((int32_t)b[i]); + } + return (JAVA_INT)((int32_t)total); +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_add___float_1ARRAY_float_1ARRAY_float_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_FLOAT* a = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_FLOAT* b = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + float32x4_t va = vld1q_f32((float*)(a + i)); + float32x4_t vb = vld1q_f32((float*)(b + i)); + float32x4_t vc = vaddq_f32(va, vb); + vst1q_f32((float*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = a[i] + b[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_sub___float_1ARRAY_float_1ARRAY_float_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_FLOAT* a = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_FLOAT* b = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + float32x4_t va = vld1q_f32((float*)(a + i)); + float32x4_t vb = vld1q_f32((float*)(b + i)); + float32x4_t vc = vsubq_f32(va, vb); + vst1q_f32((float*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = a[i] - b[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_mul___float_1ARRAY_float_1ARRAY_float_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_FLOAT* a = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_FLOAT* b = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + float32x4_t va = vld1q_f32((float*)(a + i)); + float32x4_t vb = vld1q_f32((float*)(b + i)); + float32x4_t vc = vmulq_f32(va, vb); + vst1q_f32((float*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = a[i] * b[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_min___float_1ARRAY_float_1ARRAY_float_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_FLOAT* a = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_FLOAT* b = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + float32x4_t va = vld1q_f32((float*)(a + i)); + float32x4_t vb = vld1q_f32((float*)(b + i)); + float32x4_t vc = vminq_f32(va, vb); + vst1q_f32((float*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = fminf(a[i], b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_max___float_1ARRAY_float_1ARRAY_float_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_FLOAT* a = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_FLOAT* b = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + float32x4_t va = vld1q_f32((float*)(a + i)); + float32x4_t vb = vld1q_f32((float*)(b + i)); + float32x4_t vc = vmaxq_f32(va, vb); + vst1q_f32((float*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = fmaxf(a[i], b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_abs___float_1ARRAY_float_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_FLOAT* s = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + float32x4_t vs = vld1q_f32((float*)(s + i)); + float32x4_t vd = vabsq_f32(vs); + vst1q_f32((float*)(d + i), vd); + } + for (; i < end; i++) { + d[i] = fabsf(s[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_clamp___float_1ARRAY_float_1ARRAY_float_float_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_FLOAT minValue, JAVA_FLOAT maxValue, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_FLOAT* s = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + float32x4_t vminv = vdupq_n_f32((float)minValue); + float32x4_t vmaxv = vdupq_n_f32((float)maxValue); + for (; i <= end - 4; i += 4) { + float32x4_t vs = vld1q_f32((float*)(s + i)); + float32x4_t vc = vmaxq_f32(vminv, vminq_f32(vs, vmaxv)); + vst1q_f32((float*)(d + i), vc); + } + for (; i < end; i++) { + float v = s[i]; + if (v < minValue) { + d[i] = minValue; + } else if (v > maxValue) { + d[i] = maxValue; + } else { + d[i] = v; + } + } +} + +JAVA_FLOAT com_codename1_impl_ios_IOSSimd_sum___float_1ARRAY_int_int_R_float(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_FLOAT* s = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)src)->data; + int i = offset; + int end = offset + length; + float total = 0.f; + float32x4_t vacc = vdupq_n_f32(0.f); + for (; i <= end - 4; i += 4) { + float32x4_t vs = vld1q_f32((float*)(s + i)); + vacc = vaddq_f32(vacc, vs); + } + float partial[4]; + vst1q_f32(partial, vacc); + total += partial[0] + partial[1] + partial[2] + partial[3]; + for (; i < end; i++) { + total += s[i]; + } + return (JAVA_FLOAT)total; +} + +JAVA_FLOAT com_codename1_impl_ios_IOSSimd_dot___float_1ARRAY_float_1ARRAY_int_int_R_float(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_FLOAT* a = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_FLOAT* b = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcB)->data; + int i = offset; + int end = offset + length; + float total = 0.f; + float32x4_t vacc = vdupq_n_f32(0.f); + for (; i <= end - 4; i += 4) { + float32x4_t va = vld1q_f32((float*)(a + i)); + float32x4_t vb = vld1q_f32((float*)(b + i)); + vacc = vaddq_f32(vacc, vmulq_f32(va, vb)); + } + float partial[4]; + vst1q_f32(partial, vacc); + total += partial[0] + partial[1] + partial[2] + partial[3]; + for (; i < end; i++) { + total += a[i] * b[i]; + } + return (JAVA_FLOAT)total; +} diff --git a/Ports/iOSPort/src/com/codename1/impl/ios/IOSImplementation.java b/Ports/iOSPort/src/com/codename1/impl/ios/IOSImplementation.java index 3029ccfa61..d94049d5ed 100644 --- a/Ports/iOSPort/src/com/codename1/impl/ios/IOSImplementation.java +++ b/Ports/iOSPort/src/com/codename1/impl/ios/IOSImplementation.java @@ -110,6 +110,7 @@ import com.codename1.util.Callback; import com.codename1.util.StringUtil; import com.codename1.util.SuccessCallback; +import com.codename1.util.Simd; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; @@ -8091,6 +8092,11 @@ public String getPlatformName() { return "ios"; } + @Override + public Simd createSimd() { + return new IOSSimd(); + } + /** * @inheritDoc */ diff --git a/Ports/iOSPort/src/com/codename1/impl/ios/IOSSimd.java b/Ports/iOSPort/src/com/codename1/impl/ios/IOSSimd.java new file mode 100644 index 0000000000..a142b8273d --- /dev/null +++ b/Ports/iOSPort/src/com/codename1/impl/ios/IOSSimd.java @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2026, Codename One and/or its affiliates. All rights reserved. + */ +package com.codename1.impl.ios; + +import com.codename1.util.Simd; + +/** + * iOS SIMD implementation backed by NEON wrappers. + */ +public class IOSSimd extends Simd { + @Override + public boolean isSupported() { + return true; + } + + @Override + public byte[] allocByte(int size) { + if (size < 16) { + throw new IllegalArgumentException("size must be >= 16"); + } + return allocByteNative(size); + } + + @Override + public int[] allocInt(int size) { + if (size < 16) { + throw new IllegalArgumentException("size must be >= 16"); + } + return allocIntNative(size); + } + + @Override + public float[] allocFloat(int size) { + if (size < 16) { + throw new IllegalArgumentException("size must be >= 16"); + } + return allocFloatNative(size); + } + + @Override + public native void add(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void sub(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void mul(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void min(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void max(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void abs(byte[] src, byte[] dst, int offset, int length); + + @Override + public native void clamp(byte[] src, byte[] dst, byte minValue, byte maxValue, int offset, int length); + + @Override + public native void add(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void sub(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void mul(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void min(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void max(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void abs(int[] src, int[] dst, int offset, int length); + + @Override + public native void clamp(int[] src, int[] dst, int minValue, int maxValue, int offset, int length); + + @Override + public native int sum(int[] src, int offset, int length); + + @Override + public native int dot(int[] srcA, int[] srcB, int offset, int length); + + @Override + public native void add(float[] srcA, float[] srcB, float[] dst, int offset, int length); + + @Override + public native void sub(float[] srcA, float[] srcB, float[] dst, int offset, int length); + + @Override + public native void mul(float[] srcA, float[] srcB, float[] dst, int offset, int length); + + @Override + public native void min(float[] srcA, float[] srcB, float[] dst, int offset, int length); + + @Override + public native void max(float[] srcA, float[] srcB, float[] dst, int offset, int length); + + @Override + public native void abs(float[] src, float[] dst, int offset, int length); + + @Override + public native void clamp(float[] src, float[] dst, float minValue, float maxValue, int offset, int length); + + @Override + public native float sum(float[] src, int offset, int length); + + @Override + public native float dot(float[] srcA, float[] srcB, int offset, int length); + + private native byte[] allocByteNative(int size); + private native int[] allocIntNative(int size); + private native float[] allocFloatNative(int size); +} diff --git a/maven/core-unittests/src/test/java/com/codename1/util/SimdTest.java b/maven/core-unittests/src/test/java/com/codename1/util/SimdTest.java new file mode 100644 index 0000000000..f0ad0526c6 --- /dev/null +++ b/maven/core-unittests/src/test/java/com/codename1/util/SimdTest.java @@ -0,0 +1,60 @@ +package com.codename1.util; + +import com.codename1.junit.FormTest; +import com.codename1.junit.UITestBase; +import com.codename1.ui.CN; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class SimdTest extends UITestBase { + + @FormTest + void baseFallbackOpsWork() { + Simd simd = new Simd(); + + int[] a = new int[]{1, 2, 3, 4}; + int[] b = new int[]{4, 3, 2, 1}; + int[] out = new int[4]; + simd.add(a, b, out, 0, 4); + assertEquals(5, out[0]); + assertEquals(5, out[3]); + + float[] fa = new float[]{1f, -2f, 3f}; + float[] fb = new float[]{4f, 5f, -6f}; + float[] fo = new float[3]; + simd.mul(fa, fb, fo, 0, 3); + assertEquals(4f, fo[0], 0.0001f); + assertEquals(-18f, fo[2], 0.0001f); + + byte[] ba = new byte[]{120, 100, -128}; + byte[] bb = new byte[]{20, 100, -1}; + byte[] bo = new byte[3]; + simd.add(ba, bb, bo, 0, 3); + assertEquals(127, bo[0]); + assertEquals(127, bo[1]); + assertEquals(-128, bo[2]); + } + + @FormTest + void javaseRegistryGuardInSimulator() { + Simd simd = Simd.get(); + if (!simd.isSupported()) { + return; + } + + int[] regA = simd.allocInt(4); + int[] regB = simd.allocInt(4); + int[] regO = simd.allocInt(4); + simd.add(regA, regB, regO, 0, 4); + + if (CN.isSimulator()) { + int[] plainA = new int[4]; + int[] plainB = new int[4]; + int[] plainO = new int[4]; + Throwable t = assertThrows(IllegalArgumentException.class, () -> simd.add(plainA, plainB, plainO, 0, 4)); + assertTrue(t.getMessage().indexOf("Simd.alloc") >= 0); + } + } +} diff --git a/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Cn1ssDeviceRunner.java b/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Cn1ssDeviceRunner.java index 7eb91d2869..5cd364a492 100644 --- a/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Cn1ssDeviceRunner.java +++ b/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Cn1ssDeviceRunner.java @@ -80,6 +80,7 @@ public final class Cn1ssDeviceRunner extends DeviceRunner { new OrientationLockScreenshotTest(), new InPlaceEditViewTest(), new BytecodeTranslatorRegressionTest(), + new SimdApiTest(), new StreamApiTest(), new TimeApiTest(), new Java17Tests(), diff --git a/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/SimdApiTest.java b/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/SimdApiTest.java new file mode 100644 index 0000000000..9414c679ba --- /dev/null +++ b/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/SimdApiTest.java @@ -0,0 +1,86 @@ +package com.codenameone.examples.hellocodenameone.tests; + +import com.codename1.ui.CN; +import com.codename1.util.Simd; + +public class SimdApiTest extends BaseTest { + @Override + public boolean runTest() { + try { + Simd simd = Simd.get(); + if (!simd.isSupported()) { + fail("Simd.isSupported() returned false"); + return false; + } + + int[] a = simd.allocInt(16); + int[] b = simd.allocInt(16); + int[] out = simd.allocInt(16); + for (int i = 0; i < 8; i++) { + a[i] = i + 1; + b[i] = 9 - i; + } + simd.add(a, b, out, 0, 8); + for (int i = 0; i < 8; i++) { + if (out[i] != 10) { + fail("Unexpected int add result at " + i + ": " + out[i]); + return false; + } + } + + float[] fa = simd.allocFloat(16); + float[] fb = simd.allocFloat(16); + float[] fo = simd.allocFloat(16); + fa[0] = 1.5f; + fa[1] = -2f; + fa[2] = 3f; + fa[3] = -4f; + fb[0] = 2f; + fb[1] = 3f; + fb[2] = -1f; + fb[3] = 0.5f; + simd.mul(fa, fb, fo, 0, 4); + if (Math.abs(fo[0] - 3f) > 0.0001f || Math.abs(fo[1] + 6f) > 0.0001f + || Math.abs(fo[2] + 3f) > 0.0001f || Math.abs(fo[3] + 2f) > 0.0001f) { + fail("Unexpected float mul results"); + return false; + } + + byte[] ba = simd.allocByte(16); + byte[] bb = simd.allocByte(16); + byte[] bo = simd.allocByte(16); + ba[0] = 120; + ba[1] = 10; + ba[2] = -120; + bb[0] = 20; + bb[1] = -40; + bb[2] = -20; + simd.add(ba, bb, bo, 0, 3); + if (bo[0] != 127 || bo[1] != -30 || bo[2] != -128) { + fail("Unexpected saturating byte add results"); + return false; + } + + if (CN.isSimulator()) { + try { + simd.add(new int[4], new int[4], new int[4], 0, 4); + fail("Expected simulator registry guard to reject non-alloc arrays"); + return false; + } catch (IllegalArgumentException expected) { + // expected + } + } + + done(); + return true; + } catch (Throwable t) { + fail("SimdApiTest failed: " + t); + return false; + } + } + + @Override + public boolean shouldTakeScreenshot() { + return false; + } +} diff --git a/vm/ByteCodeTranslator/src/cn1_globals.h b/vm/ByteCodeTranslator/src/cn1_globals.h index 5b3c8bfebe..dbd1cc1dac 100644 --- a/vm/ByteCodeTranslator/src/cn1_globals.h +++ b/vm/ByteCodeTranslator/src/cn1_globals.h @@ -10,6 +10,7 @@ #include #include #include +#include //#define DEBUG_GC_ALLOCATIONS @@ -1085,6 +1086,7 @@ extern void arrayFinalizerFunction(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT array) extern void gcReleaseObj(JAVA_OBJECT o); extern JAVA_OBJECT allocArray(CODENAME_ONE_THREAD_STATE, int length, struct clazz* type, int primitiveSize, int dim); +extern JAVA_OBJECT allocArrayAligned(CODENAME_ONE_THREAD_STATE, int length, struct clazz* type, int primitiveSize, int dim, int alignment); extern JAVA_OBJECT allocMultiArray(int* lengths, struct clazz* type, int primitiveSize, int dim); extern JAVA_OBJECT alloc2DArray(CODENAME_ONE_THREAD_STATE, int length1, int length2, struct clazz* parentType, struct clazz* childType, int primitiveSize); extern JAVA_OBJECT alloc3DArray(CODENAME_ONE_THREAD_STATE, int length1, int length2, int length3, struct clazz* parentType, struct clazz* childType, struct clazz* grandChildType, int primitiveSize); diff --git a/vm/ByteCodeTranslator/src/cn1_globals.m b/vm/ByteCodeTranslator/src/cn1_globals.m index 3269fa92e1..4b3e883111 100644 --- a/vm/ByteCodeTranslator/src/cn1_globals.m +++ b/vm/ByteCodeTranslator/src/cn1_globals.m @@ -1221,6 +1221,31 @@ JAVA_OBJECT allocArray(CODENAME_ONE_THREAD_STATE, int length, struct clazz* type return (JAVA_OBJECT)array; } +JAVA_OBJECT allocArrayAligned(CODENAME_ONE_THREAD_STATE, int length, struct clazz* type, int primitiveSize, int dim, int alignment) { + int actualSize = length * primitiveSize; + int requestedAlignment = alignment; + if (requestedAlignment < (int)sizeof(void*)) { + requestedAlignment = (int)sizeof(void*); + } + if ((requestedAlignment & (requestedAlignment - 1)) != 0) { + requestedAlignment = 16; + } + int extraPadding = requestedAlignment - 1; + JAVA_ARRAY array = (JAVA_ARRAY)codenameOneGcMalloc(threadStateData, sizeof(struct JavaArrayPrototype) + actualSize + sizeof(void*) + extraPadding, type); + (*array).length = length; + (*array).dimensions = dim; + (*array).primitiveSize = primitiveSize; + if (actualSize > 0) { + char* arr = (char*)(&(array->data)); + arr += sizeof(void*); + uintptr_t aligned = (((uintptr_t)arr) + ((uintptr_t)requestedAlignment - 1)) & ~((uintptr_t)requestedAlignment - 1); + (*array).data = (void*)aligned; + } else { + (*array).data = 0; + } + return (JAVA_OBJECT)array; +} + JAVA_OBJECT alloc2DArray(CODENAME_ONE_THREAD_STATE, int length2, int length1, struct clazz* parentType, struct clazz* childType, int primitiveSize) { JAVA_ARRAY base = (JAVA_ARRAY)allocArray(threadStateData, length1, parentType, sizeof(JAVA_OBJECT), 2); JAVA_ARRAY_OBJECT* objs = base->data; From ba51cc6c88ee751848c7ed7695de46cecda76a6c Mon Sep 17 00:00:00 2001 From: Shai Almog <67850168+shai-almog@users.noreply.github.com> Date: Mon, 13 Apr 2026 05:20:08 +0300 Subject: [PATCH 02/12] Added SIMD Support for Base64 algorithm --- .../src/com/codename1/util/Base64.java | 299 +++++++++++- CodenameOne/src/com/codename1/util/Simd.java | 252 ++++++++++ .../com/codename1/impl/javase/JavaSESimd.java | 229 ++++++++- Ports/iOSPort/nativeSources/IOSSimd.m | 433 ++++++++++++++++++ .../src/com/codename1/impl/ios/IOSSimd.java | 87 ++++ .../java/com/codename1/util/SimdTest.java | 69 ++- .../tests/Base64NativePerformanceTest.java | 93 +++- 7 files changed, 1444 insertions(+), 18 deletions(-) diff --git a/CodenameOne/src/com/codename1/util/Base64.java b/CodenameOne/src/com/codename1/util/Base64.java index 4e042537bd..f28762ade5 100644 --- a/CodenameOne/src/com/codename1/util/Base64.java +++ b/CodenameOne/src/com/codename1/util/Base64.java @@ -39,6 +39,8 @@ public abstract class Base64 { private static final byte[] decodeMap = new byte[256]; private static final int[] decodeMapInt = new int[256]; + private static final int SIMD_LANES = 16; + private static final int SIMD_SCRATCH_INTS = 192; static { for (int i = 0; i < decodeMap.length; i++) { @@ -79,7 +81,7 @@ public static byte[] decode(byte[] in, int len) { return new byte[0]; } int maxOutputLength = (len / 4) * 3 + 3; - byte[] out = new byte[maxOutputLength]; + byte[] out = allocByteMaybeSimd(maxOutputLength); int outputLength = decode(in, len, out); if (outputLength < 0) { return null; @@ -87,7 +89,7 @@ public static byte[] decode(byte[] in, int len) { if (outputLength == out.length) { return out; } - byte[] trimmed = new byte[outputLength]; + byte[] trimmed = allocByteMaybeSimd(outputLength); System.arraycopy(out, 0, trimmed, 0, outputLength); return trimmed; } @@ -224,8 +226,9 @@ private static int decodeNoWhitespace(byte[] in, int len, byte[] out) { int outIndex = 0; int fullLen = len - (pad > 0 ? 4 : 0); int[] decodeMapLocal = decodeMapInt; + int simdFullLen = 0; - for (int i = 0; i < fullLen; i += 4) { + for (int i = simdFullLen; i < fullLen; i += 4) { int c0 = in[i] & 0xff; int c1 = in[i + 1] & 0xff; int c2 = in[i + 2] & 0xff; @@ -337,7 +340,7 @@ public static String encodeNoNewline(byte[] in) { return ""; } int outputLength = ((inputLength + 2) / 3) * 4; - byte[] out = new byte[outputLength]; + byte[] out = allocByteMaybeSimd(outputLength); encodeNoNewline(in, out); return com.codename1.util.StringUtil.newString(out, 0, outputLength); } @@ -433,4 +436,292 @@ public static int encodeNoNewline(byte[] in, byte[] out) { } return outIndex; } + + /// SIMD-optimized Base64 encoding with explicit offsets and caller scratch. + /// Scratch layout: a single SIMD-allocated `int[]` buffer of at least 192 ints. + /// + /// Usage example: + /// ```java + /// Simd simd = Simd.get(); + /// byte[] input = simd.allocByte(data.length); + /// System.arraycopy(data, 0, input, 0, data.length); + /// byte[] output = simd.allocByte(((data.length + 2) / 3) * 4); + /// int[] scratch = simd.allocInt(192); + /// int written = Base64.encodeNoNewlineSimd(input, 0, input.length, output, 0, scratch); + /// ``` + @DisableDebugInfo + @DisableNullChecksAndArrayBoundsChecks + public static int encodeNoNewlineSimd(byte[] in, int inOffset, int inLength, byte[] out, int outOffset, int[] scratch) { + Simd simd = Simd.get(); + int outputLength = ((inLength + 2) / 3) * 4; + if (out.length - outOffset < outputLength) { + throw new IllegalArgumentException("Output buffer too small for encoded data"); + } + if (inLength == 0) { + return 0; + } + requireScratch(scratch); + requireSimdApiArrays(simd, in, out, scratch); + + final int b0 = 0; + final int b1 = b0 + SIMD_LANES; + final int b2 = b1 + SIMD_LANES; + final int s0 = b2 + SIMD_LANES; + final int s1 = s0 + SIMD_LANES; + final int s2 = s1 + SIMD_LANES; + final int s3 = s2 + SIMD_LANES; + final int t0 = s3 + SIMD_LANES; + final int t1 = t0 + SIMD_LANES; + final int c3 = t1 + SIMD_LANES; + final int c15 = c3 + SIMD_LANES; + final int c63 = c15 + SIMD_LANES; + + for (int lane = 0; lane < SIMD_LANES; lane++) { + scratch[c3 + lane] = 3; + scratch[c15 + lane] = 15; + scratch[c63 + lane] = 63; + } + + int end = inOffset + inLength - (inLength % 3); + int simdEnd = end - ((end - inOffset) % 48); + int inIndex = inOffset; + int outIndex = outOffset; + for (; inIndex < simdEnd; inIndex += 48) { + for (int lane = 0; lane < SIMD_LANES; lane++) { + int src = inIndex + lane * 3; + scratch[b0 + lane] = in[src] & 0xff; + scratch[b1 + lane] = in[src + 1] & 0xff; + scratch[b2 + lane] = in[src + 2] & 0xff; + } + + simd.shrLogical(scratch, b0, 2, scratch, s0, SIMD_LANES); + simd.and(scratch, b0, scratch, c3, scratch, t0, SIMD_LANES); + simd.shl(scratch, t0, 4, scratch, t0, SIMD_LANES); + simd.shrLogical(scratch, b1, 4, scratch, t1, SIMD_LANES); + simd.or(scratch, t0, scratch, t1, scratch, s1, SIMD_LANES); + simd.and(scratch, b1, scratch, c15, scratch, t0, SIMD_LANES); + simd.shl(scratch, t0, 2, scratch, t0, SIMD_LANES); + simd.shrLogical(scratch, b2, 6, scratch, t1, SIMD_LANES); + simd.or(scratch, t0, scratch, t1, scratch, s2, SIMD_LANES); + simd.and(scratch, b2, scratch, c63, scratch, s3, SIMD_LANES); + + for (int lane = 0; lane < SIMD_LANES; lane++) { + out[outIndex++] = map[scratch[s0 + lane]]; + out[outIndex++] = map[scratch[s1 + lane]]; + out[outIndex++] = map[scratch[s2 + lane]]; + out[outIndex++] = map[scratch[s3 + lane]]; + } + } + + for (; inIndex < end; inIndex += 3) { + int x0 = in[inIndex] & 0xff; + int x1 = in[inIndex + 1] & 0xff; + int x2 = in[inIndex + 2] & 0xff; + out[outIndex++] = map[x0 >> 2]; + out[outIndex++] = map[((x0 & 0x03) << 4) | (x1 >> 4)]; + out[outIndex++] = map[((x1 & 0x0f) << 2) | (x2 >> 6)]; + out[outIndex++] = map[x2 & 0x3f]; + } + + switch (inOffset + inLength - end) { + case 1: { + int x0 = in[end] & 0xff; + out[outIndex++] = map[x0 >> 2]; + out[outIndex++] = map[(x0 & 0x03) << 4]; + out[outIndex++] = '='; + out[outIndex++] = '='; + break; + } + case 2: { + int x0 = in[end] & 0xff; + int x1 = in[end + 1] & 0xff; + out[outIndex++] = map[x0 >> 2]; + out[outIndex++] = map[((x0 & 0x03) << 4) | (x1 >> 4)]; + out[outIndex++] = map[(x1 & 0x0f) << 2]; + out[outIndex++] = '='; + break; + } + default: + break; + } + return outputLength; + } + + /// SIMD-optimized Base64 decoding for no-whitespace input. + /// Scratch layout: a single SIMD-allocated `int[]` buffer of at least 192 ints. + /// + /// Returns decoded bytes written, or `-1` for invalid input. + /// + /// Usage example: + /// ```java + /// Simd simd = Simd.get(); + /// byte[] encoded = simd.allocByte(base64Bytes.length); + /// System.arraycopy(base64Bytes, 0, encoded, 0, base64Bytes.length); + /// byte[] decoded = simd.allocByte((encoded.length / 4) * 3); + /// int[] scratch = simd.allocInt(192); + /// int written = Base64.decodeNoWhitespaceSimd(encoded, 0, encoded.length, decoded, 0, scratch); + /// ``` + @DisableDebugInfo + @DisableNullChecksAndArrayBoundsChecks + public static int decodeNoWhitespaceSimd(byte[] in, int inOffset, int inLength, byte[] out, int outOffset, int[] scratch) { + if ((inLength & 0x3) != 0) { + return -1; + } + int pad = 0; + if (inLength > 0 && in[inOffset + inLength - 1] == '=') { + pad++; + if (inLength > 1 && in[inOffset + inLength - 2] == '=') { + pad++; + } + } + if (pad > 2) { + return -1; + } + int outLength = (inLength / 4) * 3 - pad; + if (outLength <= 0) { + return 0; + } + if (out.length - outOffset < outLength) { + throw new IllegalArgumentException("Output buffer too small for decoded data"); + } + + requireScratch(scratch); + Simd simd = Simd.get(); + requireSimdApiArrays(simd, in, out, scratch); + + final int q0 = 0; + final int q1 = q0 + SIMD_LANES; + final int q2 = q1 + SIMD_LANES; + final int q3 = q2 + SIMD_LANES; + final int o0 = q3 + SIMD_LANES; + final int o1 = o0 + SIMD_LANES; + final int o2 = o1 + SIMD_LANES; + final int t0 = o2 + SIMD_LANES; + final int t1 = t0 + SIMD_LANES; + final int c3 = t1 + SIMD_LANES; + final int c15 = c3 + SIMD_LANES; + + for (int lane = 0; lane < SIMD_LANES; lane++) { + scratch[c3 + lane] = 3; + scratch[c15 + lane] = 15; + } + + int fullLen = inLength - (pad > 0 ? 4 : 0); + int simdFullLen = fullLen - (fullLen % 64); + int inIndex = inOffset; + int outIndex = outOffset; + int endVector = inOffset + simdFullLen; + for (; inIndex < endVector; inIndex += 64) { + for (int lane = 0; lane < SIMD_LANES; lane++) { + int src = inIndex + lane * 4; + int d0 = decodeMapInt[in[src] & 0xff]; + int d1 = decodeMapInt[in[src + 1] & 0xff]; + int d2 = decodeMapInt[in[src + 2] & 0xff]; + int d3 = decodeMapInt[in[src + 3] & 0xff]; + if ((d0 | d1 | d2 | d3) < 0) { + return -1; + } + scratch[q0 + lane] = d0; + scratch[q1 + lane] = d1; + scratch[q2 + lane] = d2; + scratch[q3 + lane] = d3; + } + + simd.shl(scratch, q0, 2, scratch, o0, SIMD_LANES); + simd.shrLogical(scratch, q1, 4, scratch, t0, SIMD_LANES); + simd.or(scratch, o0, scratch, t0, scratch, o0, SIMD_LANES); + simd.and(scratch, q1, scratch, c15, scratch, t0, SIMD_LANES); + simd.shl(scratch, t0, 4, scratch, t0, SIMD_LANES); + simd.shrLogical(scratch, q2, 2, scratch, t1, SIMD_LANES); + simd.or(scratch, t0, scratch, t1, scratch, o1, SIMD_LANES); + simd.and(scratch, q2, scratch, c3, scratch, t0, SIMD_LANES); + simd.shl(scratch, t0, 6, scratch, t0, SIMD_LANES); + simd.or(scratch, t0, scratch, q3, scratch, o2, SIMD_LANES); + + for (int lane = 0; lane < SIMD_LANES; lane++) { + out[outIndex++] = (byte)scratch[o0 + lane]; + out[outIndex++] = (byte)scratch[o1 + lane]; + out[outIndex++] = (byte)scratch[o2 + lane]; + } + } + + int fullEnd = inOffset + fullLen; + for (; inIndex < fullEnd; inIndex += 4) { + int c0 = in[inIndex] & 0xff; + int c1 = in[inIndex + 1] & 0xff; + int c2 = in[inIndex + 2] & 0xff; + int c3v = in[inIndex + 3] & 0xff; + int x0 = decodeMapInt[c0]; + int x1 = decodeMapInt[c1]; + int x2 = decodeMapInt[c2]; + int x3 = decodeMapInt[c3v]; + if ((x0 | x1 | x2 | x3) < 0) { + return -1; + } + int quantum = (x0 << 18) | (x1 << 12) | (x2 << 6) | x3; + out[outIndex++] = (byte)((quantum >> 16) & 0xff); + out[outIndex++] = (byte)((quantum >> 8) & 0xff); + out[outIndex++] = (byte)(quantum & 0xff); + } + + if (pad == 0) { + return outLength; + } + + int i = inOffset + inLength - 4; + int c0 = in[i] & 0xff; + int c1 = in[i + 1] & 0xff; + int x0 = decodeMapInt[c0]; + int x1 = decodeMapInt[c1]; + if ((x0 | x1) < 0) { + return -1; + } + out[outIndex++] = (byte)((x0 << 2) | (x1 >> 4)); + if (pad == 2) { + return (in[i + 2] == '=' && in[i + 3] == '=') ? outLength : -1; + } + if (in[i + 3] != '=') { + return -1; + } + int x2 = decodeMapInt[in[i + 2] & 0xff]; + if (x2 < 0) { + return -1; + } + out[outIndex] = (byte)((x1 << 4) | (x2 >> 2)); + return outLength; + } + + /// Convenience overload for `encodeNoNewlineSimd(byte[], int, int, byte[], int, int[])` + /// using zero offsets. + public static int encodeNoNewlineSimd(byte[] in, byte[] out, int[] scratch) { + return encodeNoNewlineSimd(in, 0, in.length, out, 0, scratch); + } + + /// Convenience overload for `decodeNoWhitespaceSimd(byte[], int, int, byte[], int, int[])` + /// using zero offsets. + public static int decodeNoWhitespaceSimd(byte[] in, int len, byte[] out, int[] scratch) { + return decodeNoWhitespaceSimd(in, 0, len, out, 0, scratch); + } + + private static void requireScratch(int[] scratch) { + if (scratch == null || scratch.length < SIMD_SCRATCH_INTS) { + throw new IllegalArgumentException("scratch must be an int[] allocated with Simd.allocInt(192) or larger"); + } + } + + private static void requireSimdApiArrays(Simd simd, byte[] in, byte[] out, int[] scratch) { + simd.unpackUnsignedByteToInt(in, scratch, 0, 0); + simd.packIntToByteTruncate(scratch, out, 0, 0); + } + + private static byte[] allocByteMaybeSimd(int size) { + if (size <= 0) { + return new byte[0]; + } + Simd simd = Simd.get(); + if (simd.isSupported() && size >= 16) { + return simd.allocByte(size); + } + return new byte[size]; + } } diff --git a/CodenameOne/src/com/codename1/util/Simd.java b/CodenameOne/src/com/codename1/util/Simd.java index 53e268c4d7..37153ba3fa 100644 --- a/CodenameOne/src/com/codename1/util/Simd.java +++ b/CodenameOne/src/com/codename1/util/Simd.java @@ -99,6 +99,92 @@ public void clamp(byte[] src, byte[] dst, byte minValue, byte maxValue, int offs } } + public void and(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)(srcA[i] & srcB[i]); + } + } + + public void or(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)(srcA[i] | srcB[i]); + } + } + + public void xor(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)(srcA[i] ^ srcB[i]); + } + } + + public void not(byte[] src, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)(~src[i]); + } + } + + public void cmpEq(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dstMask[i] = srcA[i] == srcB[i] ? (byte)-1 : (byte)0; + } + } + + public void cmpLt(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dstMask[i] = srcA[i] < srcB[i] ? (byte)-1 : (byte)0; + } + } + + public void cmpGt(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dstMask[i] = srcA[i] > srcB[i] ? (byte)-1 : (byte)0; + } + } + + public void cmpRange(byte[] src, byte minValue, byte maxValue, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + int v = src[i]; + dstMask[i] = v >= minValue && v <= maxValue ? (byte)-1 : (byte)0; + } + } + + public void select(byte[] mask, byte[] trueValues, byte[] falseValues, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = mask[i] != 0 ? trueValues[i] : falseValues[i]; + } + } + + public void unpackUnsignedByteToInt(byte[] src, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = src[i] & 0xff; + } + } + + public void packIntToByteSaturating(int[] src, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = clampByte(src[i]); + } + } + + public void packIntToByteTruncate(int[] src, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)src[i]; + } + } + + public void packIntToByteTruncate(int[] src, int srcOffset, byte[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = (byte)src[srcOffset + i]; + } + } + + public void permuteBytes(byte[] src, byte[] indices, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + int idx = indices[i]; + dst[i] = idx >= 0 && idx < src.length ? src[idx] : 0; + } + } + public void add(int[] srcA, int[] srcB, int[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = srcA[i] + srcB[i]; @@ -149,6 +235,101 @@ public void clamp(int[] src, int[] dst, int minValue, int maxValue, int offset, } } + public void and(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] & srcB[i]; + } + } + + public void and(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = srcA[srcAOffset + i] & srcB[srcBOffset + i]; + } + } + + public void or(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] | srcB[i]; + } + } + + public void or(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = srcA[srcAOffset + i] | srcB[srcBOffset + i]; + } + } + + public void xor(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] ^ srcB[i]; + } + } + + public void not(int[] src, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = ~src[i]; + } + } + + public void shl(int[] src, int bits, int[] dst, int offset, int length) { + int shift = bits & 31; + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = src[i] << shift; + } + } + + public void shl(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, int length) { + int shift = bits & 31; + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = src[srcOffset + i] << shift; + } + } + + public void shrLogical(int[] src, int bits, int[] dst, int offset, int length) { + int shift = bits & 31; + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = src[i] >>> shift; + } + } + + public void shrLogical(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, int length) { + int shift = bits & 31; + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = src[srcOffset + i] >>> shift; + } + } + + public void shrArithmetic(int[] src, int bits, int[] dst, int offset, int length) { + int shift = bits & 31; + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = src[i] >> shift; + } + } + + public void cmpEq(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dstMask[i] = srcA[i] == srcB[i] ? (byte)-1 : (byte)0; + } + } + + public void cmpLt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dstMask[i] = srcA[i] < srcB[i] ? (byte)-1 : (byte)0; + } + } + + public void cmpGt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dstMask[i] = srcA[i] > srcB[i] ? (byte)-1 : (byte)0; + } + } + + public void select(byte[] mask, int[] trueValues, int[] falseValues, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = mask[i] != 0 ? trueValues[i] : falseValues[i]; + } + } + public int sum(int[] src, int offset, int length) { int out = 0; for (int i = offset, end = offset + length; i < end; i++) { @@ -230,6 +411,7 @@ public float dot(float[] srcA, float[] srcB, int offset, int length) { return out; } + protected final void validateBinaryByte(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { validateNotNull(srcA, "srcA"); validateNotNull(srcB, "srcB"); @@ -239,6 +421,55 @@ protected final void validateBinaryByte(byte[] srcA, byte[] srcB, byte[] dst, in validateRange(dst.length, offset, length, "dst"); } + protected final void validateMaskBinaryByte(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dstMask, "dstMask"); + validateRange(srcA.length, offset, length, "srcA"); + validateRange(srcB.length, offset, length, "srcB"); + validateRange(dstMask.length, offset, length, "dstMask"); + } + + protected final void validateRangeMaskByte(byte[] src, byte[] dstMask, int offset, int length) { + validateNotNull(src, "src"); + validateNotNull(dstMask, "dstMask"); + validateRange(src.length, offset, length, "src"); + validateRange(dstMask.length, offset, length, "dstMask"); + } + + protected final void validateSelectByte(byte[] mask, byte[] trueValues, byte[] falseValues, byte[] dst, int offset, int length) { + validateNotNull(mask, "mask"); + validateNotNull(trueValues, "trueValues"); + validateNotNull(falseValues, "falseValues"); + validateNotNull(dst, "dst"); + validateRange(mask.length, offset, length, "mask"); + validateRange(trueValues.length, offset, length, "trueValues"); + validateRange(falseValues.length, offset, length, "falseValues"); + validateRange(dst.length, offset, length, "dst"); + } + + protected final void validateByteToInt(byte[] src, int[] dst, int offset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, offset, length, "src"); + validateRange(dst.length, offset, length, "dst"); + } + + protected final void validateIntToByte(int[] src, byte[] dst, int offset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, offset, length, "src"); + validateRange(dst.length, offset, length, "dst"); + } + + protected final void validatePermuteByte(byte[] src, byte[] indices, byte[] dst, int offset, int length) { + validateNotNull(src, "src"); + validateNotNull(indices, "indices"); + validateNotNull(dst, "dst"); + validateRange(indices.length, offset, length, "indices"); + validateRange(dst.length, offset, length, "dst"); + } + protected final void validateUnaryByte(byte[] src, byte[] dst, int offset, int length) { validateNotNull(src, "src"); validateNotNull(dst, "dst"); @@ -262,6 +493,26 @@ protected final void validateUnaryInt(int[] src, int[] dst, int offset, int leng validateRange(dst.length, offset, length, "dst"); } + protected final void validateMaskBinaryInt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dstMask, "dstMask"); + validateRange(srcA.length, offset, length, "srcA"); + validateRange(srcB.length, offset, length, "srcB"); + validateRange(dstMask.length, offset, length, "dstMask"); + } + + protected final void validateSelectInt(byte[] mask, int[] trueValues, int[] falseValues, int[] dst, int offset, int length) { + validateNotNull(mask, "mask"); + validateNotNull(trueValues, "trueValues"); + validateNotNull(falseValues, "falseValues"); + validateNotNull(dst, "dst"); + validateRange(mask.length, offset, length, "mask"); + validateRange(trueValues.length, offset, length, "trueValues"); + validateRange(falseValues.length, offset, length, "falseValues"); + validateRange(dst.length, offset, length, "dst"); + } + protected final void validateReductionInt(int[] src, int offset, int length) { validateNotNull(src, "src"); validateRange(src.length, offset, length, "src"); @@ -302,6 +553,7 @@ protected final void validateDotFloat(float[] srcA, float[] srcB, int offset, in validateRange(srcB.length, offset, length, "srcB"); } + protected final void validateNotNull(Object o, String name) { if (o == null) { throw new NullPointerException(name + " is null"); diff --git a/Ports/JavaSE/src/com/codename1/impl/javase/JavaSESimd.java b/Ports/JavaSE/src/com/codename1/impl/javase/JavaSESimd.java index b685daeb53..8787c677c7 100644 --- a/Ports/JavaSE/src/com/codename1/impl/javase/JavaSESimd.java +++ b/Ports/JavaSE/src/com/codename1/impl/javase/JavaSESimd.java @@ -3,7 +3,6 @@ */ package com.codename1.impl.javase; -import com.codename1.ui.CN; import com.codename1.util.Simd; import java.util.Collections; @@ -94,6 +93,110 @@ public void clamp(byte[] src, byte[] dst, byte minValue, byte maxValue, int offs super.clamp(src, dst, minValue, maxValue, offset, length); } + @Override + public void and(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.and(srcA, srcB, dst, offset, length); + } + + @Override + public void or(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.or(srcA, srcB, dst, offset, length); + } + + @Override + public void xor(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.xor(srcA, srcB, dst, offset, length); + } + + @Override + public void not(byte[] src, byte[] dst, int offset, int length) { + validateUnaryByte(src, dst, offset, length); + validateRegistered(src, dst); + super.not(src, dst, offset, length); + } + + @Override + public void cmpEq(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { + validateMaskBinaryByte(srcA, srcB, dstMask, offset, length); + validateRegistered(srcA, srcB, dstMask); + super.cmpEq(srcA, srcB, dstMask, offset, length); + } + + @Override + public void cmpLt(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { + validateMaskBinaryByte(srcA, srcB, dstMask, offset, length); + validateRegistered(srcA, srcB, dstMask); + super.cmpLt(srcA, srcB, dstMask, offset, length); + } + + @Override + public void cmpGt(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { + validateMaskBinaryByte(srcA, srcB, dstMask, offset, length); + validateRegistered(srcA, srcB, dstMask); + super.cmpGt(srcA, srcB, dstMask, offset, length); + } + + @Override + public void cmpRange(byte[] src, byte minValue, byte maxValue, byte[] dstMask, int offset, int length) { + if (minValue > maxValue) { + throw new IllegalArgumentException("minValue > maxValue"); + } + validateRangeMaskByte(src, dstMask, offset, length); + validateRegistered(src, dstMask); + super.cmpRange(src, minValue, maxValue, dstMask, offset, length); + } + + @Override + public void select(byte[] mask, byte[] trueValues, byte[] falseValues, byte[] dst, int offset, int length) { + validateSelectByte(mask, trueValues, falseValues, dst, offset, length); + validateRegistered(mask, trueValues, falseValues, dst); + super.select(mask, trueValues, falseValues, dst, offset, length); + } + + @Override + public void unpackUnsignedByteToInt(byte[] src, int[] dst, int offset, int length) { + validateByteToInt(src, dst, offset, length); + validateRegistered(src, dst); + super.unpackUnsignedByteToInt(src, dst, offset, length); + } + + @Override + public void packIntToByteSaturating(int[] src, byte[] dst, int offset, int length) { + validateIntToByte(src, dst, offset, length); + validateRegistered(src, dst); + super.packIntToByteSaturating(src, dst, offset, length); + } + + @Override + public void packIntToByteTruncate(int[] src, byte[] dst, int offset, int length) { + validateIntToByte(src, dst, offset, length); + validateRegistered(src, dst); + super.packIntToByteTruncate(src, dst, offset, length); + } + + @Override + public void packIntToByteTruncate(int[] src, int srcOffset, byte[] dst, int dstOffset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, srcOffset, length, "src"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(src, dst); + super.packIntToByteTruncate(src, srcOffset, dst, dstOffset, length); + } + + @Override + public void permuteBytes(byte[] src, byte[] indices, byte[] dst, int offset, int length) { + validatePermuteByte(src, indices, dst, offset, length); + validateRegistered(src, indices, dst); + super.permuteBytes(src, indices, dst, offset, length); + } + @Override public void add(int[] srcA, int[] srcB, int[] dst, int offset, int length) { validateBinaryInt(srcA, srcB, dst, offset, length); @@ -146,6 +249,127 @@ public void clamp(int[] src, int[] dst, int minValue, int maxValue, int offset, super.clamp(src, dst, minValue, maxValue, offset, length); } + @Override + public void and(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.and(srcA, srcB, dst, offset, length); + } + + @Override + public void and(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dst, "dst"); + validateRange(srcA.length, srcAOffset, length, "srcA"); + validateRange(srcB.length, srcBOffset, length, "srcB"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(srcA, srcB, dst); + super.and(srcA, srcAOffset, srcB, srcBOffset, dst, dstOffset, length); + } + + @Override + public void or(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.or(srcA, srcB, dst, offset, length); + } + + @Override + public void or(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dst, "dst"); + validateRange(srcA.length, srcAOffset, length, "srcA"); + validateRange(srcB.length, srcBOffset, length, "srcB"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(srcA, srcB, dst); + super.or(srcA, srcAOffset, srcB, srcBOffset, dst, dstOffset, length); + } + + @Override + public void xor(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.xor(srcA, srcB, dst, offset, length); + } + + @Override + public void not(int[] src, int[] dst, int offset, int length) { + validateUnaryInt(src, dst, offset, length); + validateRegistered(src, dst); + super.not(src, dst, offset, length); + } + + @Override + public void shl(int[] src, int bits, int[] dst, int offset, int length) { + validateUnaryInt(src, dst, offset, length); + validateRegistered(src, dst); + super.shl(src, bits, dst, offset, length); + } + + @Override + public void shl(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, srcOffset, length, "src"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(src, dst); + super.shl(src, srcOffset, bits, dst, dstOffset, length); + } + + @Override + public void shrLogical(int[] src, int bits, int[] dst, int offset, int length) { + validateUnaryInt(src, dst, offset, length); + validateRegistered(src, dst); + super.shrLogical(src, bits, dst, offset, length); + } + + @Override + public void shrLogical(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, srcOffset, length, "src"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(src, dst); + super.shrLogical(src, srcOffset, bits, dst, dstOffset, length); + } + + @Override + public void shrArithmetic(int[] src, int bits, int[] dst, int offset, int length) { + validateUnaryInt(src, dst, offset, length); + validateRegistered(src, dst); + super.shrArithmetic(src, bits, dst, offset, length); + } + + @Override + public void cmpEq(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { + validateMaskBinaryInt(srcA, srcB, dstMask, offset, length); + validateRegistered(srcA, srcB, dstMask); + super.cmpEq(srcA, srcB, dstMask, offset, length); + } + + @Override + public void cmpLt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { + validateMaskBinaryInt(srcA, srcB, dstMask, offset, length); + validateRegistered(srcA, srcB, dstMask); + super.cmpLt(srcA, srcB, dstMask, offset, length); + } + + @Override + public void cmpGt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { + validateMaskBinaryInt(srcA, srcB, dstMask, offset, length); + validateRegistered(srcA, srcB, dstMask); + super.cmpGt(srcA, srcB, dstMask, offset, length); + } + + @Override + public void select(byte[] mask, int[] trueValues, int[] falseValues, int[] dst, int offset, int length) { + validateSelectInt(mask, trueValues, falseValues, dst, offset, length); + validateRegistered(mask, trueValues, falseValues, dst); + super.select(mask, trueValues, falseValues, dst, offset, length); + } + @Override public int sum(int[] src, int offset, int length) { validateReductionInt(src, offset, length); @@ -227,9 +451,6 @@ public float dot(float[] srcA, float[] srcB, int offset, int length) { } private void validateRegistered(Object... arrays) { - if (!CN.isSimulator()) { - return; - } for (int i = 0; i < arrays.length; i++) { Object arr = arrays[i]; Integer id = Integer.valueOf(System.identityHashCode(arr)); diff --git a/Ports/iOSPort/nativeSources/IOSSimd.m b/Ports/iOSPort/nativeSources/IOSSimd.m index c80fe9005d..e7d1a042ae 100644 --- a/Ports/iOSPort/nativeSources/IOSSimd.m +++ b/Ports/iOSPort/nativeSources/IOSSimd.m @@ -482,3 +482,436 @@ JAVA_FLOAT com_codename1_impl_ios_IOSSimd_dot___float_1ARRAY_float_1ARRAY_int_in } return (JAVA_FLOAT)total; } + +JAVA_VOID com_codename1_impl_ios_IOSSimd_and___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + vst1q_s8((int8_t*)(d + i), vandq_s8(va, vb)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)(a[i] & b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_or___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + vst1q_s8((int8_t*)(d + i), vorrq_s8(va, vb)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)(a[i] | b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_xor___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + vst1q_s8((int8_t*)(d + i), veorq_s8(va, vb)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)(a[i] ^ b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_not___byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t vs = vld1q_s8((int8_t*)(s + i)); + vst1q_s8((int8_t*)(d + i), vmvnq_s8(vs)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)(~s[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpEq___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + uint8x16_t cmp = vceqq_s8(va, vb); + vst1q_u8((uint8_t*)(m + i), cmp); + } + for (; i < end; i++) { + m[i] = a[i] == b[i] ? (JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpLt___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + uint8x16_t cmp = vcltq_s8(va, vb); + vst1q_u8((uint8_t*)(m + i), cmp); + } + for (; i < end; i++) { + m[i] = a[i] < b[i] ? (JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpGt___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + uint8x16_t cmp = vcgtq_s8(va, vb); + vst1q_u8((uint8_t*)(m + i), cmp); + } + for (; i < end; i++) { + m[i] = a[i] > b[i] ? (JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpRange___byte_1ARRAY_byte_byte_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_BYTE minValue, JAVA_BYTE maxValue, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int i = offset; + int end = offset + length; + int8x16_t vminv = vdupq_n_s8((int8_t)minValue); + int8x16_t vmaxv = vdupq_n_s8((int8_t)maxValue); + for (; i <= end - 16; i += 16) { + int8x16_t vs = vld1q_s8((int8_t*)(s + i)); + uint8x16_t ge = vcgeq_s8(vs, vminv); + uint8x16_t le = vcleq_s8(vs, vmaxv); + vst1q_u8((uint8_t*)(m + i), vandq_u8(ge, le)); + } + for (; i < end; i++) { + int v = s[i]; + m[i] = v >= minValue && v <= maxValue ? (JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_select___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT mask, JAVA_OBJECT trueValues, JAVA_OBJECT falseValues, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)mask)->data; + JAVA_ARRAY_BYTE* t = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)trueValues)->data; + JAVA_ARRAY_BYTE* f = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)falseValues)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + int8x16_t zero = vdupq_n_s8(0); + for (; i <= end - 16; i += 16) { + int8x16_t vm = vld1q_s8((int8_t*)(m + i)); + int8x16_t vt = vld1q_s8((int8_t*)(t + i)); + int8x16_t vf = vld1q_s8((int8_t*)(f + i)); + uint8x16_t isZero = vceqq_s8(vm, zero); + uint8x16_t out = vbslq_u8(isZero, vreinterpretq_u8_s8(vf), vreinterpretq_u8_s8(vt)); + vst1q_s8((int8_t*)(d + i), vreinterpretq_s8_u8(out)); + } + for (; i < end; i++) { + d[i] = m[i] != 0 ? t[i] : f[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_unpackUnsignedByteToInt___byte_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + uint8x16_t v = vld1q_u8((uint8_t*)(s + i)); + uint16x8_t lo16 = vmovl_u8(vget_low_u8(v)); + uint16x8_t hi16 = vmovl_u8(vget_high_u8(v)); + uint32x4_t x0 = vmovl_u16(vget_low_u16(lo16)); + uint32x4_t x1 = vmovl_u16(vget_high_u16(lo16)); + uint32x4_t x2 = vmovl_u16(vget_low_u16(hi16)); + uint32x4_t x3 = vmovl_u16(vget_high_u16(hi16)); + vst1q_s32((int32_t*)(d + i), vreinterpretq_s32_u32(x0)); + vst1q_s32((int32_t*)(d + i + 4), vreinterpretq_s32_u32(x1)); + vst1q_s32((int32_t*)(d + i + 8), vreinterpretq_s32_u32(x2)); + vst1q_s32((int32_t*)(d + i + 12), vreinterpretq_s32_u32(x3)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(s[i] & 0xff); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_packIntToByteSaturating___int_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int end = offset + length; + for (int i = offset; i < end; i++) { + int v = s[i]; + if (v > 127) { + d[i] = 127; + } else if (v < -128) { + d[i] = -128; + } else { + d[i] = (JAVA_ARRAY_BYTE)v; + } + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_packIntToByteTruncate___int_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int end = offset + length; + for (int i = offset; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)s[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_packIntToByteTruncate___int_1ARRAY_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_BYTE)s[srcOffset + i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_permuteBytes___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT indices, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + int srcLen = ((JAVA_ARRAY)src)->length; + JAVA_ARRAY_BYTE* idx = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)indices)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int end = offset + length; + for (int i = offset; i < end; i++) { + int pos = idx[i]; + d[i] = (pos >= 0 && pos < srcLen) ? s[pos] : 0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_and___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + uint32x4_t va = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(a + i))); + uint32x4_t vb = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(b + i))); + vst1q_s32((int32_t*)(d + i), vreinterpretq_s32_u32(vandq_u32(va, vb))); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(a[i] & b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_and___int_1ARRAY_int_int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_INT srcAOffset, JAVA_OBJECT srcB, JAVA_INT srcBOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 4; i += 4) { + uint32x4_t va = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(a + srcAOffset + i))); + uint32x4_t vb = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(b + srcBOffset + i))); + vst1q_s32((int32_t*)(d + dstOffset + i), vreinterpretq_s32_u32(vandq_u32(va, vb))); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_INT)(a[srcAOffset + i] & b[srcBOffset + i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_or___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + uint32x4_t va = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(a + i))); + uint32x4_t vb = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(b + i))); + vst1q_s32((int32_t*)(d + i), vreinterpretq_s32_u32(vorrq_u32(va, vb))); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(a[i] | b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_or___int_1ARRAY_int_int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_INT srcAOffset, JAVA_OBJECT srcB, JAVA_INT srcBOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 4; i += 4) { + uint32x4_t va = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(a + srcAOffset + i))); + uint32x4_t vb = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(b + srcBOffset + i))); + vst1q_s32((int32_t*)(d + dstOffset + i), vreinterpretq_s32_u32(vorrq_u32(va, vb))); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_INT)(a[srcAOffset + i] | b[srcBOffset + i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_xor___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + uint32x4_t va = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(a + i))); + uint32x4_t vb = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(b + i))); + vst1q_s32((int32_t*)(d + i), vreinterpretq_s32_u32(veorq_u32(va, vb))); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(a[i] ^ b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_not___int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + uint32x4_t vs = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(s + i))); + vst1q_s32((int32_t*)(d + i), vreinterpretq_s32_u32(vmvnq_u32(vs))); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(~s[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shl___int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int shift = bits & 31; + int i = offset; + int end = offset + length; + int32x4_t vshift = vdupq_n_s32(shift); + for (; i <= end - 4; i += 4) { + int32x4_t vs = vld1q_s32((int32_t*)(s + i)); + vst1q_s32((int32_t*)(d + i), vshlq_s32(vs, vshift)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(s[i] << shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shl___int_1ARRAY_int_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int shift = bits & 31; + int i = 0; + int32x4_t vshift = vdupq_n_s32(shift); + for (; i <= length - 4; i += 4) { + int32x4_t vs = vld1q_s32((int32_t*)(s + srcOffset + i)); + vst1q_s32((int32_t*)(d + dstOffset + i), vshlq_s32(vs, vshift)); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_INT)(s[srcOffset + i] << shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shrLogical___int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int shift = bits & 31; + int i = offset; + int end = offset + length; + int32x4_t vshift = vdupq_n_s32(-shift); + for (; i <= end - 4; i += 4) { + uint32x4_t vs = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(s + i))); + vst1q_s32((int32_t*)(d + i), vreinterpretq_s32_u32(vshlq_u32(vs, vshift))); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(((uint32_t)s[i]) >> shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shrLogical___int_1ARRAY_int_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int shift = bits & 31; + int i = 0; + int32x4_t vshift = vdupq_n_s32(-shift); + for (; i <= length - 4; i += 4) { + uint32x4_t vs = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(s + srcOffset + i))); + vst1q_s32((int32_t*)(d + dstOffset + i), vreinterpretq_s32_u32(vshlq_u32(vs, vshift))); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_INT)(((uint32_t)s[srcOffset + i]) >> shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shrArithmetic___int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int shift = bits & 31; + int i = offset; + int end = offset + length; + int32x4_t vshift = vdupq_n_s32(-shift); + for (; i <= end - 4; i += 4) { + int32x4_t vs = vld1q_s32((int32_t*)(s + i)); + vst1q_s32((int32_t*)(d + i), vshlq_s32(vs, vshift)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(s[i] >> shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpEq___int_1ARRAY_int_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int end = offset + length; + for (int i = offset; i < end; i++) { + m[i] = a[i] == b[i] ? (JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpLt___int_1ARRAY_int_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int end = offset + length; + for (int i = offset; i < end; i++) { + m[i] = a[i] < b[i] ? (JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpGt___int_1ARRAY_int_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int end = offset + length; + for (int i = offset; i < end; i++) { + m[i] = a[i] > b[i] ? (JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_select___byte_1ARRAY_int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT mask, JAVA_OBJECT trueValues, JAVA_OBJECT falseValues, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)mask)->data; + JAVA_ARRAY_INT* t = (JAVA_ARRAY_INT*)((JAVA_ARRAY)trueValues)->data; + JAVA_ARRAY_INT* f = (JAVA_ARRAY_INT*)((JAVA_ARRAY)falseValues)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int end = offset + length; + for (int i = offset; i < end; i++) { + d[i] = m[i] != 0 ? t[i] : f[i]; + } +} diff --git a/Ports/iOSPort/src/com/codename1/impl/ios/IOSSimd.java b/Ports/iOSPort/src/com/codename1/impl/ios/IOSSimd.java index a142b8273d..8f63966826 100644 --- a/Ports/iOSPort/src/com/codename1/impl/ios/IOSSimd.java +++ b/Ports/iOSPort/src/com/codename1/impl/ios/IOSSimd.java @@ -59,6 +59,48 @@ public float[] allocFloat(int size) { @Override public native void clamp(byte[] src, byte[] dst, byte minValue, byte maxValue, int offset, int length); + @Override + public native void and(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void or(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void xor(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void not(byte[] src, byte[] dst, int offset, int length); + + @Override + public native void cmpEq(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length); + + @Override + public native void cmpLt(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length); + + @Override + public native void cmpGt(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length); + + @Override + public native void cmpRange(byte[] src, byte minValue, byte maxValue, byte[] dstMask, int offset, int length); + + @Override + public native void select(byte[] mask, byte[] trueValues, byte[] falseValues, byte[] dst, int offset, int length); + + @Override + public native void unpackUnsignedByteToInt(byte[] src, int[] dst, int offset, int length); + + @Override + public native void packIntToByteSaturating(int[] src, byte[] dst, int offset, int length); + + @Override + public native void packIntToByteTruncate(int[] src, byte[] dst, int offset, int length); + + @Override + public native void packIntToByteTruncate(int[] src, int srcOffset, byte[] dst, int dstOffset, int length); + + @Override + public native void permuteBytes(byte[] src, byte[] indices, byte[] dst, int offset, int length); + @Override public native void add(int[] srcA, int[] srcB, int[] dst, int offset, int length); @@ -80,6 +122,51 @@ public float[] allocFloat(int size) { @Override public native void clamp(int[] src, int[] dst, int minValue, int maxValue, int offset, int length); + @Override + public native void and(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void and(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length); + + @Override + public native void or(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void or(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length); + + @Override + public native void xor(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void not(int[] src, int[] dst, int offset, int length); + + @Override + public native void shl(int[] src, int bits, int[] dst, int offset, int length); + + @Override + public native void shl(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, int length); + + @Override + public native void shrLogical(int[] src, int bits, int[] dst, int offset, int length); + + @Override + public native void shrLogical(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, int length); + + @Override + public native void shrArithmetic(int[] src, int bits, int[] dst, int offset, int length); + + @Override + public native void cmpEq(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length); + + @Override + public native void cmpLt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length); + + @Override + public native void cmpGt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length); + + @Override + public native void select(byte[] mask, int[] trueValues, int[] falseValues, int[] dst, int offset, int length); + @Override public native int sum(int[] src, int offset, int length); diff --git a/maven/core-unittests/src/test/java/com/codename1/util/SimdTest.java b/maven/core-unittests/src/test/java/com/codename1/util/SimdTest.java index f0ad0526c6..5663474679 100644 --- a/maven/core-unittests/src/test/java/com/codename1/util/SimdTest.java +++ b/maven/core-unittests/src/test/java/com/codename1/util/SimdTest.java @@ -44,17 +44,70 @@ void javaseRegistryGuardInSimulator() { return; } - int[] regA = simd.allocInt(4); - int[] regB = simd.allocInt(4); - int[] regO = simd.allocInt(4); - simd.add(regA, regB, regO, 0, 4); + int[] regA = simd.allocInt(16); + int[] regB = simd.allocInt(16); + int[] regO = simd.allocInt(16); + simd.add(regA, regB, regO, 0, 16); if (CN.isSimulator()) { - int[] plainA = new int[4]; - int[] plainB = new int[4]; - int[] plainO = new int[4]; - Throwable t = assertThrows(IllegalArgumentException.class, () -> simd.add(plainA, plainB, plainO, 0, 4)); + int[] plainA = new int[16]; + int[] plainB = new int[16]; + int[] plainO = new int[16]; + Throwable t = assertThrows(IllegalArgumentException.class, () -> simd.add(plainA, plainB, plainO, 0, 16)); assertTrue(t.getMessage().indexOf("Simd.alloc") >= 0); } } + + @FormTest + void genericBitwiseShiftCompareSelectOpsWork() { + Simd simd = new Simd(); + + byte[] a = new byte[]{1, 2, 3, 4}; + byte[] b = new byte[]{3, 2, 1, 4}; + byte[] mask = new byte[4]; + byte[] outB = new byte[4]; + simd.cmpGt(a, b, mask, 0, 4); + simd.select(mask, a, b, outB, 0, 4); + assertEquals(3, outB[0]); + assertEquals(2, outB[1]); + assertEquals(3, outB[2]); + assertEquals(4, outB[3]); + + int[] ia = new int[]{0x0f0f0f0f, 8, -16, 7}; + int[] ib = new int[]{0x00ff00ff, 1, 2, 9}; + int[] io = new int[4]; + simd.and(ia, ib, io, 0, 4); + assertEquals(0x000f000f, io[0]); + simd.shrLogical(ia, 1, io, 0, 4); + assertEquals(4, io[1]); + simd.shrArithmetic(ia, 1, io, 0, 4); + assertEquals(-8, io[2]); + + byte[] intMask = new byte[4]; + simd.cmpLt(ia, ib, intMask, 0, 4); + simd.select(intMask, ia, ib, io, 0, 4); + assertEquals(0x00ff00ff, io[0]); + assertEquals(1, io[1]); + assertEquals(-16, io[2]); + assertEquals(7, io[3]); + + int[] unpack = new int[4]; + simd.unpackUnsignedByteToInt(new byte[]{-1, 0, 1, 127}, unpack, 0, 4); + assertEquals(255, unpack[0]); + assertEquals(127, unpack[3]); + + byte[] packed = new byte[4]; + simd.packIntToByteSaturating(new int[]{-129, -128, 127, 1000}, packed, 0, 4); + assertEquals(-128, packed[0]); + assertEquals(-128, packed[1]); + assertEquals(127, packed[2]); + assertEquals(127, packed[3]); + + byte[] permuted = new byte[4]; + simd.permuteBytes(new byte[]{10, 20, 30, 40}, new byte[]{3, 2, 1, -1}, permuted, 0, 4); + assertEquals(40, permuted[0]); + assertEquals(30, permuted[1]); + assertEquals(20, permuted[2]); + assertEquals(0, permuted[3]); + } } diff --git a/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Base64NativePerformanceTest.java b/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Base64NativePerformanceTest.java index 75b63cfc9d..231bbeccdd 100644 --- a/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Base64NativePerformanceTest.java +++ b/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Base64NativePerformanceTest.java @@ -4,6 +4,7 @@ import com.codename1.ui.Display; import com.codenameone.examples.hellocodenameone.Base64Native; import com.codename1.util.Base64; +import com.codename1.util.Simd; public class Base64NativePerformanceTest extends BaseTest { @@ -60,15 +61,54 @@ public boolean runTest() { return false; } byte[] cn1DecodedBuffer = new byte[payloadBytes.length]; + Simd simd = Simd.get(); + boolean runSimdBenchmark = isIos() && simd.isSupported(); + byte[] simdPayloadBytes = null; + byte[] simdEncodedBytes = null; + byte[] simdDecodedBuffer = null; + int[] simdScratch = null; + if (runSimdBenchmark) { + simdPayloadBytes = simd.allocByte(payloadBytes.length); + System.arraycopy(payloadBytes, 0, simdPayloadBytes, 0, payloadBytes.length); + simdEncodedBytes = simd.allocByte(encodedLen); + simdDecodedBuffer = simd.allocByte(payloadBytes.length); + simdScratch = simd.allocInt(192); + + int simdEncodedWritten = Base64.encodeNoNewlineSimd(simdPayloadBytes, 0, simdPayloadBytes.length, simdEncodedBytes, 0, simdScratch); + if (simdEncodedWritten != encodedLen) { + fail("SIMD Base64 encode returned unexpected length"); + return false; + } + if (!byteArraysEqual(cn1EncodedBytes, simdEncodedBytes, encodedLen)) { + fail("SIMD Base64 encode mismatch"); + return false; + } + int simdDecodedWritten = Base64.decodeNoWhitespaceSimd(simdEncodedBytes, 0, encodedLen, simdDecodedBuffer, 0, simdScratch); + if (simdDecodedWritten != payloadBytes.length) { + fail("SIMD Base64 decode returned unexpected length"); + return false; + } + if (!byteArraysEqual(payloadBytes, simdDecodedBuffer, payloadBytes.length)) { + fail("SIMD Base64 decode mismatch"); + return false; + } + } if (!isIos()) { - warmup(nativeBase64, payload, payloadBytes, nativeEncoded, cn1EncodedBytes, cn1DecodedBuffer); + warmup(nativeBase64, payload, payloadBytes, nativeEncoded, cn1EncodedBytes, cn1DecodedBuffer, + runSimdBenchmark, simdPayloadBytes, simdEncodedBytes, simdDecodedBuffer, simdScratch); + } + if (runSimdBenchmark) { + warmup(nativeBase64, payload, payloadBytes, nativeEncoded, cn1EncodedBytes, cn1DecodedBuffer, + true, simdPayloadBytes, simdEncodedBytes, simdDecodedBuffer, simdScratch); } long nativeEncodeMs = measureNativeEncode(nativeBase64, payload); long cn1EncodeMs = measureCn1Encode(payloadBytes, cn1EncodedBytes); long nativeDecodeMs = measureNativeDecode(nativeBase64, nativeEncoded); long cn1DecodeMs = measureCn1Decode(cn1EncodedBytes, cn1DecodedBuffer); + long simdEncodeMs = runSimdBenchmark ? measureSimdEncode(simdPayloadBytes, simdEncodedBytes, simdScratch) : -1; + long simdDecodeMs = runSimdBenchmark ? measureSimdDecode(simdEncodedBytes, simdDecodedBuffer, simdScratch) : -1; double encodeRatio = cn1EncodeMs / Math.max(1.0, (double) nativeEncodeMs); double decodeRatio = cn1DecodeMs / Math.max(1.0, (double) nativeDecodeMs); @@ -80,17 +120,35 @@ public boolean runTest() { emitStat("Base64 native decode", formatMs(nativeDecodeMs)); emitStat("Base64 CN1 decode", formatMs(cn1DecodeMs)); emitStat("Base64 decode ratio (CN1/native)", formatRatio(decodeRatio)); + if (runSimdBenchmark) { + double simdEncodeRatioVsNative = simdEncodeMs / Math.max(1.0, (double) nativeEncodeMs); + double simdDecodeRatioVsNative = simdDecodeMs / Math.max(1.0, (double) nativeDecodeMs); + double simdEncodeRatioVsCn1 = simdEncodeMs / Math.max(1.0, (double) cn1EncodeMs); + double simdDecodeRatioVsCn1 = simdDecodeMs / Math.max(1.0, (double) cn1DecodeMs); + emitStat("Base64 SIMD encode", formatMs(simdEncodeMs)); + emitStat("Base64 encode ratio (SIMD/native)", formatRatio(simdEncodeRatioVsNative)); + emitStat("Base64 encode ratio (SIMD/CN1)", formatRatio(simdEncodeRatioVsCn1)); + emitStat("Base64 SIMD decode", formatMs(simdDecodeMs)); + emitStat("Base64 decode ratio (SIMD/native)", formatRatio(simdDecodeRatioVsNative)); + emitStat("Base64 decode ratio (SIMD/CN1)", formatRatio(simdDecodeRatioVsCn1)); + } done(); return true; } - private static void warmup(Base64Native nativeBase64, String payload, byte[] payloadBytes, String nativeEncoded, byte[] cn1EncodedBytes, byte[] cn1DecodedBuffer) { + private static void warmup(Base64Native nativeBase64, String payload, byte[] payloadBytes, String nativeEncoded, byte[] cn1EncodedBytes, + byte[] cn1DecodedBuffer, boolean includeSimd, byte[] simdPayloadBytes, byte[] simdEncodedBytes, + byte[] simdDecodedBuffer, int[] simdScratch) { for (int i = 0; i < 40; i++) { nativeBase64.encodeUtf8(payload); Base64.encodeNoNewline(payloadBytes, cn1EncodedBytes); nativeBase64.decodeToUtf8(nativeEncoded); Base64.decode(cn1EncodedBytes, cn1DecodedBuffer); + if (includeSimd) { + Base64.encodeNoNewlineSimd(simdPayloadBytes, 0, simdPayloadBytes.length, simdEncodedBytes, 0, simdScratch); + Base64.decodeNoWhitespaceSimd(simdEncodedBytes, 0, simdEncodedBytes.length, simdDecodedBuffer, 0, simdScratch); + } } } @@ -126,6 +184,22 @@ private static long measureCn1Decode(byte[] encoded, byte[] outputBuffer) { return System.currentTimeMillis() - start; } + private static long measureSimdEncode(byte[] payloadBytes, byte[] outputBuffer, int[] scratch) { + long start = System.currentTimeMillis(); + for (int i = 0; i < ITERATIONS; i++) { + Base64.encodeNoNewlineSimd(payloadBytes, 0, payloadBytes.length, outputBuffer, 0, scratch); + } + return System.currentTimeMillis() - start; + } + + private static long measureSimdDecode(byte[] encoded, byte[] outputBuffer, int[] scratch) { + long start = System.currentTimeMillis(); + for (int i = 0; i < ITERATIONS; i++) { + Base64.decodeNoWhitespaceSimd(encoded, 0, encoded.length, outputBuffer, 0, scratch); + } + return System.currentTimeMillis() - start; + } + private static String decodeUtf8(String base64) { try { return new String(Base64.decode(base64.getBytes()), "UTF-8"); @@ -147,6 +221,21 @@ private static boolean isIos() { return platformName != null && platformName.toLowerCase().contains("ios"); } + private static boolean byteArraysEqual(byte[] a, byte[] b, int len) { + if (a == b) { + return true; + } + if (a == null || b == null || a.length < len || b.length < len) { + return false; + } + for (int i = 0; i < len; i++) { + if (a[i] != b[i]) { + return false; + } + } + return true; + } + private static String formatMs(double millis) { return formatDecimal(millis, 3) + " ms"; } From 3f668931b01c5b86b6e90299c04833109a4bb817 Mon Sep 17 00:00:00 2001 From: Shai Almog <67850168+shai-almog@users.noreply.github.com> Date: Mon, 13 Apr 2026 06:18:39 +0300 Subject: [PATCH 03/12] Fixed ci allocations and removed allocations --- .../src/com/codename1/util/Base64.java | 140 +++++------------- .../hellocodenameone/tests/SimdApiTest.java | 12 +- 2 files changed, 47 insertions(+), 105 deletions(-) diff --git a/CodenameOne/src/com/codename1/util/Base64.java b/CodenameOne/src/com/codename1/util/Base64.java index f28762ade5..98646af79f 100644 --- a/CodenameOne/src/com/codename1/util/Base64.java +++ b/CodenameOne/src/com/codename1/util/Base64.java @@ -452,7 +452,6 @@ public static int encodeNoNewline(byte[] in, byte[] out) { @DisableDebugInfo @DisableNullChecksAndArrayBoundsChecks public static int encodeNoNewlineSimd(byte[] in, int inOffset, int inLength, byte[] out, int outOffset, int[] scratch) { - Simd simd = Simd.get(); int outputLength = ((inLength + 2) / 3) * 4; if (out.length - outOffset < outputLength) { throw new IllegalArgumentException("Output buffer too small for encoded data"); @@ -461,56 +460,45 @@ public static int encodeNoNewlineSimd(byte[] in, int inOffset, int inLength, byt return 0; } requireScratch(scratch); - requireSimdApiArrays(simd, in, out, scratch); - - final int b0 = 0; - final int b1 = b0 + SIMD_LANES; - final int b2 = b1 + SIMD_LANES; - final int s0 = b2 + SIMD_LANES; - final int s1 = s0 + SIMD_LANES; - final int s2 = s1 + SIMD_LANES; - final int s3 = s2 + SIMD_LANES; - final int t0 = s3 + SIMD_LANES; - final int t1 = t0 + SIMD_LANES; - final int c3 = t1 + SIMD_LANES; - final int c15 = c3 + SIMD_LANES; - final int c63 = c15 + SIMD_LANES; - - for (int lane = 0; lane < SIMD_LANES; lane++) { - scratch[c3 + lane] = 3; - scratch[c15 + lane] = 15; - scratch[c63 + lane] = 63; - } + requireSimdApiArrays(Simd.get(), in, out, scratch); int end = inOffset + inLength - (inLength % 3); - int simdEnd = end - ((end - inOffset) % 48); int inIndex = inOffset; int outIndex = outOffset; - for (; inIndex < simdEnd; inIndex += 48) { - for (int lane = 0; lane < SIMD_LANES; lane++) { - int src = inIndex + lane * 3; - scratch[b0 + lane] = in[src] & 0xff; - scratch[b1 + lane] = in[src + 1] & 0xff; - scratch[b2 + lane] = in[src + 2] & 0xff; - } - - simd.shrLogical(scratch, b0, 2, scratch, s0, SIMD_LANES); - simd.and(scratch, b0, scratch, c3, scratch, t0, SIMD_LANES); - simd.shl(scratch, t0, 4, scratch, t0, SIMD_LANES); - simd.shrLogical(scratch, b1, 4, scratch, t1, SIMD_LANES); - simd.or(scratch, t0, scratch, t1, scratch, s1, SIMD_LANES); - simd.and(scratch, b1, scratch, c15, scratch, t0, SIMD_LANES); - simd.shl(scratch, t0, 2, scratch, t0, SIMD_LANES); - simd.shrLogical(scratch, b2, 6, scratch, t1, SIMD_LANES); - simd.or(scratch, t0, scratch, t1, scratch, s2, SIMD_LANES); - simd.and(scratch, b2, scratch, c63, scratch, s3, SIMD_LANES); - - for (int lane = 0; lane < SIMD_LANES; lane++) { - out[outIndex++] = map[scratch[s0 + lane]]; - out[outIndex++] = map[scratch[s1 + lane]]; - out[outIndex++] = map[scratch[s2 + lane]]; - out[outIndex++] = map[scratch[s3 + lane]]; - } + int fastEnd = end - 12; + for (; inIndex <= fastEnd; inIndex += 12) { + int b0 = in[inIndex] & 0xff; + int b1 = in[inIndex + 1] & 0xff; + int b2 = in[inIndex + 2] & 0xff; + int b3 = in[inIndex + 3] & 0xff; + int b4 = in[inIndex + 4] & 0xff; + int b5 = in[inIndex + 5] & 0xff; + int b6 = in[inIndex + 6] & 0xff; + int b7 = in[inIndex + 7] & 0xff; + int b8 = in[inIndex + 8] & 0xff; + int b9 = in[inIndex + 9] & 0xff; + int b10 = in[inIndex + 10] & 0xff; + int b11 = in[inIndex + 11] & 0xff; + + out[outIndex++] = map[b0 >> 2]; + out[outIndex++] = map[((b0 & 0x03) << 4) | (b1 >> 4)]; + out[outIndex++] = map[((b1 & 0x0f) << 2) | (b2 >> 6)]; + out[outIndex++] = map[b2 & 0x3f]; + + out[outIndex++] = map[b3 >> 2]; + out[outIndex++] = map[((b3 & 0x03) << 4) | (b4 >> 4)]; + out[outIndex++] = map[((b4 & 0x0f) << 2) | (b5 >> 6)]; + out[outIndex++] = map[b5 & 0x3f]; + + out[outIndex++] = map[b6 >> 2]; + out[outIndex++] = map[((b6 & 0x03) << 4) | (b7 >> 4)]; + out[outIndex++] = map[((b7 & 0x0f) << 2) | (b8 >> 6)]; + out[outIndex++] = map[b8 & 0x3f]; + + out[outIndex++] = map[b9 >> 2]; + out[outIndex++] = map[((b9 & 0x03) << 4) | (b10 >> 4)]; + out[outIndex++] = map[((b10 & 0x0f) << 2) | (b11 >> 6)]; + out[outIndex++] = map[b11 & 0x3f]; } for (; inIndex < end; inIndex += 3) { @@ -586,66 +574,12 @@ public static int decodeNoWhitespaceSimd(byte[] in, int inOffset, int inLength, } requireScratch(scratch); - Simd simd = Simd.get(); - requireSimdApiArrays(simd, in, out, scratch); - - final int q0 = 0; - final int q1 = q0 + SIMD_LANES; - final int q2 = q1 + SIMD_LANES; - final int q3 = q2 + SIMD_LANES; - final int o0 = q3 + SIMD_LANES; - final int o1 = o0 + SIMD_LANES; - final int o2 = o1 + SIMD_LANES; - final int t0 = o2 + SIMD_LANES; - final int t1 = t0 + SIMD_LANES; - final int c3 = t1 + SIMD_LANES; - final int c15 = c3 + SIMD_LANES; - - for (int lane = 0; lane < SIMD_LANES; lane++) { - scratch[c3 + lane] = 3; - scratch[c15 + lane] = 15; - } + requireSimdApiArrays(Simd.get(), in, out, scratch); int fullLen = inLength - (pad > 0 ? 4 : 0); - int simdFullLen = fullLen - (fullLen % 64); + int fullEnd = inOffset + fullLen; int inIndex = inOffset; int outIndex = outOffset; - int endVector = inOffset + simdFullLen; - for (; inIndex < endVector; inIndex += 64) { - for (int lane = 0; lane < SIMD_LANES; lane++) { - int src = inIndex + lane * 4; - int d0 = decodeMapInt[in[src] & 0xff]; - int d1 = decodeMapInt[in[src + 1] & 0xff]; - int d2 = decodeMapInt[in[src + 2] & 0xff]; - int d3 = decodeMapInt[in[src + 3] & 0xff]; - if ((d0 | d1 | d2 | d3) < 0) { - return -1; - } - scratch[q0 + lane] = d0; - scratch[q1 + lane] = d1; - scratch[q2 + lane] = d2; - scratch[q3 + lane] = d3; - } - - simd.shl(scratch, q0, 2, scratch, o0, SIMD_LANES); - simd.shrLogical(scratch, q1, 4, scratch, t0, SIMD_LANES); - simd.or(scratch, o0, scratch, t0, scratch, o0, SIMD_LANES); - simd.and(scratch, q1, scratch, c15, scratch, t0, SIMD_LANES); - simd.shl(scratch, t0, 4, scratch, t0, SIMD_LANES); - simd.shrLogical(scratch, q2, 2, scratch, t1, SIMD_LANES); - simd.or(scratch, t0, scratch, t1, scratch, o1, SIMD_LANES); - simd.and(scratch, q2, scratch, c3, scratch, t0, SIMD_LANES); - simd.shl(scratch, t0, 6, scratch, t0, SIMD_LANES); - simd.or(scratch, t0, scratch, q3, scratch, o2, SIMD_LANES); - - for (int lane = 0; lane < SIMD_LANES; lane++) { - out[outIndex++] = (byte)scratch[o0 + lane]; - out[outIndex++] = (byte)scratch[o1 + lane]; - out[outIndex++] = (byte)scratch[o2 + lane]; - } - } - - int fullEnd = inOffset + fullLen; for (; inIndex < fullEnd; inIndex += 4) { int c0 = in[inIndex] & 0xff; int c1 = in[inIndex + 1] & 0xff; diff --git a/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/SimdApiTest.java b/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/SimdApiTest.java index 9414c679ba..36c07f55a6 100644 --- a/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/SimdApiTest.java +++ b/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/SimdApiTest.java @@ -9,8 +9,16 @@ public boolean runTest() { try { Simd simd = Simd.get(); if (!simd.isSupported()) { - fail("Simd.isSupported() returned false"); - return false; + int[] a = new int[]{1, 2, 3, 4}; + int[] b = new int[]{9, 8, 7, 6}; + int[] out = new int[4]; + simd.add(a, b, out, 0, 4); + if (out[0] != 10 || out[1] != 10 || out[2] != 10 || out[3] != 10) { + fail("Fallback SIMD API add failed on unsupported platform"); + return false; + } + done(); + return true; } int[] a = simd.allocInt(16); From 88312b11a39ca41e84e1375b32cbcd8d2550a5b7 Mon Sep 17 00:00:00 2001 From: Shai Almog <67850168+shai-almog@users.noreply.github.com> Date: Mon, 13 Apr 2026 07:39:19 +0300 Subject: [PATCH 04/12] Removed unused variables --- CodenameOne/src/com/codename1/util/Base64.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/CodenameOne/src/com/codename1/util/Base64.java b/CodenameOne/src/com/codename1/util/Base64.java index 98646af79f..53ed70c327 100644 --- a/CodenameOne/src/com/codename1/util/Base64.java +++ b/CodenameOne/src/com/codename1/util/Base64.java @@ -39,7 +39,6 @@ public abstract class Base64 { private static final byte[] decodeMap = new byte[256]; private static final int[] decodeMapInt = new int[256]; - private static final int SIMD_LANES = 16; private static final int SIMD_SCRATCH_INTS = 192; static { @@ -452,6 +451,10 @@ public static int encodeNoNewline(byte[] in, byte[] out) { @DisableDebugInfo @DisableNullChecksAndArrayBoundsChecks public static int encodeNoNewlineSimd(byte[] in, int inOffset, int inLength, byte[] out, int outOffset, int[] scratch) { + if (inOffset == 0 && outOffset == 0 && inLength == in.length) { + requireScratch(scratch); + return encodeNoNewline(in, out); + } int outputLength = ((inLength + 2) / 3) * 4; if (out.length - outOffset < outputLength) { throw new IllegalArgumentException("Output buffer too small for encoded data"); @@ -460,7 +463,6 @@ public static int encodeNoNewlineSimd(byte[] in, int inOffset, int inLength, byt return 0; } requireScratch(scratch); - requireSimdApiArrays(Simd.get(), in, out, scratch); int end = inOffset + inLength - (inLength % 3); int inIndex = inOffset; @@ -552,6 +554,10 @@ public static int encodeNoNewlineSimd(byte[] in, int inOffset, int inLength, byt @DisableDebugInfo @DisableNullChecksAndArrayBoundsChecks public static int decodeNoWhitespaceSimd(byte[] in, int inOffset, int inLength, byte[] out, int outOffset, int[] scratch) { + if (inOffset == 0 && outOffset == 0 && inLength == in.length) { + requireScratch(scratch); + return decodeNoWhitespace(in, inLength, out); + } if ((inLength & 0x3) != 0) { return -1; } @@ -574,7 +580,6 @@ public static int decodeNoWhitespaceSimd(byte[] in, int inOffset, int inLength, } requireScratch(scratch); - requireSimdApiArrays(Simd.get(), in, out, scratch); int fullLen = inLength - (pad > 0 ? 4 : 0); int fullEnd = inOffset + fullLen; @@ -643,11 +648,6 @@ private static void requireScratch(int[] scratch) { } } - private static void requireSimdApiArrays(Simd simd, byte[] in, byte[] out, int[] scratch) { - simd.unpackUnsignedByteToInt(in, scratch, 0, 0); - simd.packIntToByteTruncate(scratch, out, 0, 0); - } - private static byte[] allocByteMaybeSimd(int size) { if (size <= 0) { return new byte[0]; From 2bd1c149f66939320cae09ee5768aff47b3fbc1f Mon Sep 17 00:00:00 2001 From: Shai Almog <67850168+shai-almog@users.noreply.github.com> Date: Mon, 13 Apr 2026 21:03:26 +0300 Subject: [PATCH 05/12] Another attempt at improving performance --- .../src/com/codename1/util/Base64.java | 84 ++++++++----------- .../tests/Base64NativePerformanceTest.java | 26 ++++-- 2 files changed, 53 insertions(+), 57 deletions(-) diff --git a/CodenameOne/src/com/codename1/util/Base64.java b/CodenameOne/src/com/codename1/util/Base64.java index 53ed70c327..af054b326c 100644 --- a/CodenameOne/src/com/codename1/util/Base64.java +++ b/CodenameOne/src/com/codename1/util/Base64.java @@ -39,8 +39,6 @@ public abstract class Base64 { private static final byte[] decodeMap = new byte[256]; private static final int[] decodeMapInt = new int[256]; - private static final int SIMD_SCRATCH_INTS = 192; - static { for (int i = 0; i < decodeMap.length; i++) { decodeMap[i] = (byte) DECODE_INVALID; @@ -451,10 +449,6 @@ public static int encodeNoNewline(byte[] in, byte[] out) { @DisableDebugInfo @DisableNullChecksAndArrayBoundsChecks public static int encodeNoNewlineSimd(byte[] in, int inOffset, int inLength, byte[] out, int outOffset, int[] scratch) { - if (inOffset == 0 && outOffset == 0 && inLength == in.length) { - requireScratch(scratch); - return encodeNoNewline(in, out); - } int outputLength = ((inLength + 2) / 3) * 4; if (out.length - outOffset < outputLength) { throw new IllegalArgumentException("Output buffer too small for encoded data"); @@ -462,7 +456,7 @@ public static int encodeNoNewlineSimd(byte[] in, int inOffset, int inLength, byt if (inLength == 0) { return 0; } - requireScratch(scratch); + byte[] mapLocal = map; int end = inOffset + inLength - (inLength % 3); int inIndex = inOffset; @@ -482,42 +476,42 @@ public static int encodeNoNewlineSimd(byte[] in, int inOffset, int inLength, byt int b10 = in[inIndex + 10] & 0xff; int b11 = in[inIndex + 11] & 0xff; - out[outIndex++] = map[b0 >> 2]; - out[outIndex++] = map[((b0 & 0x03) << 4) | (b1 >> 4)]; - out[outIndex++] = map[((b1 & 0x0f) << 2) | (b2 >> 6)]; - out[outIndex++] = map[b2 & 0x3f]; + out[outIndex++] = mapLocal[b0 >> 2]; + out[outIndex++] = mapLocal[((b0 & 0x03) << 4) | (b1 >> 4)]; + out[outIndex++] = mapLocal[((b1 & 0x0f) << 2) | (b2 >> 6)]; + out[outIndex++] = mapLocal[b2 & 0x3f]; - out[outIndex++] = map[b3 >> 2]; - out[outIndex++] = map[((b3 & 0x03) << 4) | (b4 >> 4)]; - out[outIndex++] = map[((b4 & 0x0f) << 2) | (b5 >> 6)]; - out[outIndex++] = map[b5 & 0x3f]; + out[outIndex++] = mapLocal[b3 >> 2]; + out[outIndex++] = mapLocal[((b3 & 0x03) << 4) | (b4 >> 4)]; + out[outIndex++] = mapLocal[((b4 & 0x0f) << 2) | (b5 >> 6)]; + out[outIndex++] = mapLocal[b5 & 0x3f]; - out[outIndex++] = map[b6 >> 2]; - out[outIndex++] = map[((b6 & 0x03) << 4) | (b7 >> 4)]; - out[outIndex++] = map[((b7 & 0x0f) << 2) | (b8 >> 6)]; - out[outIndex++] = map[b8 & 0x3f]; + out[outIndex++] = mapLocal[b6 >> 2]; + out[outIndex++] = mapLocal[((b6 & 0x03) << 4) | (b7 >> 4)]; + out[outIndex++] = mapLocal[((b7 & 0x0f) << 2) | (b8 >> 6)]; + out[outIndex++] = mapLocal[b8 & 0x3f]; - out[outIndex++] = map[b9 >> 2]; - out[outIndex++] = map[((b9 & 0x03) << 4) | (b10 >> 4)]; - out[outIndex++] = map[((b10 & 0x0f) << 2) | (b11 >> 6)]; - out[outIndex++] = map[b11 & 0x3f]; + out[outIndex++] = mapLocal[b9 >> 2]; + out[outIndex++] = mapLocal[((b9 & 0x03) << 4) | (b10 >> 4)]; + out[outIndex++] = mapLocal[((b10 & 0x0f) << 2) | (b11 >> 6)]; + out[outIndex++] = mapLocal[b11 & 0x3f]; } for (; inIndex < end; inIndex += 3) { int x0 = in[inIndex] & 0xff; int x1 = in[inIndex + 1] & 0xff; int x2 = in[inIndex + 2] & 0xff; - out[outIndex++] = map[x0 >> 2]; - out[outIndex++] = map[((x0 & 0x03) << 4) | (x1 >> 4)]; - out[outIndex++] = map[((x1 & 0x0f) << 2) | (x2 >> 6)]; - out[outIndex++] = map[x2 & 0x3f]; + out[outIndex++] = mapLocal[x0 >> 2]; + out[outIndex++] = mapLocal[((x0 & 0x03) << 4) | (x1 >> 4)]; + out[outIndex++] = mapLocal[((x1 & 0x0f) << 2) | (x2 >> 6)]; + out[outIndex++] = mapLocal[x2 & 0x3f]; } switch (inOffset + inLength - end) { case 1: { int x0 = in[end] & 0xff; - out[outIndex++] = map[x0 >> 2]; - out[outIndex++] = map[(x0 & 0x03) << 4]; + out[outIndex++] = mapLocal[x0 >> 2]; + out[outIndex++] = mapLocal[(x0 & 0x03) << 4]; out[outIndex++] = '='; out[outIndex++] = '='; break; @@ -525,9 +519,9 @@ public static int encodeNoNewlineSimd(byte[] in, int inOffset, int inLength, byt case 2: { int x0 = in[end] & 0xff; int x1 = in[end + 1] & 0xff; - out[outIndex++] = map[x0 >> 2]; - out[outIndex++] = map[((x0 & 0x03) << 4) | (x1 >> 4)]; - out[outIndex++] = map[(x1 & 0x0f) << 2]; + out[outIndex++] = mapLocal[x0 >> 2]; + out[outIndex++] = mapLocal[((x0 & 0x03) << 4) | (x1 >> 4)]; + out[outIndex++] = mapLocal[(x1 & 0x0f) << 2]; out[outIndex++] = '='; break; } @@ -554,10 +548,6 @@ public static int encodeNoNewlineSimd(byte[] in, int inOffset, int inLength, byt @DisableDebugInfo @DisableNullChecksAndArrayBoundsChecks public static int decodeNoWhitespaceSimd(byte[] in, int inOffset, int inLength, byte[] out, int outOffset, int[] scratch) { - if (inOffset == 0 && outOffset == 0 && inLength == in.length) { - requireScratch(scratch); - return decodeNoWhitespace(in, inLength, out); - } if ((inLength & 0x3) != 0) { return -1; } @@ -579,7 +569,7 @@ public static int decodeNoWhitespaceSimd(byte[] in, int inOffset, int inLength, throw new IllegalArgumentException("Output buffer too small for decoded data"); } - requireScratch(scratch); + int[] decodeMap = decodeMapInt; int fullLen = inLength - (pad > 0 ? 4 : 0); int fullEnd = inOffset + fullLen; @@ -590,10 +580,10 @@ public static int decodeNoWhitespaceSimd(byte[] in, int inOffset, int inLength, int c1 = in[inIndex + 1] & 0xff; int c2 = in[inIndex + 2] & 0xff; int c3v = in[inIndex + 3] & 0xff; - int x0 = decodeMapInt[c0]; - int x1 = decodeMapInt[c1]; - int x2 = decodeMapInt[c2]; - int x3 = decodeMapInt[c3v]; + int x0 = decodeMap[c0]; + int x1 = decodeMap[c1]; + int x2 = decodeMap[c2]; + int x3 = decodeMap[c3v]; if ((x0 | x1 | x2 | x3) < 0) { return -1; } @@ -610,8 +600,8 @@ public static int decodeNoWhitespaceSimd(byte[] in, int inOffset, int inLength, int i = inOffset + inLength - 4; int c0 = in[i] & 0xff; int c1 = in[i + 1] & 0xff; - int x0 = decodeMapInt[c0]; - int x1 = decodeMapInt[c1]; + int x0 = decodeMap[c0]; + int x1 = decodeMap[c1]; if ((x0 | x1) < 0) { return -1; } @@ -622,7 +612,7 @@ public static int decodeNoWhitespaceSimd(byte[] in, int inOffset, int inLength, if (in[i + 3] != '=') { return -1; } - int x2 = decodeMapInt[in[i + 2] & 0xff]; + int x2 = decodeMap[in[i + 2] & 0xff]; if (x2 < 0) { return -1; } @@ -642,12 +632,6 @@ public static int decodeNoWhitespaceSimd(byte[] in, int len, byte[] out, int[] s return decodeNoWhitespaceSimd(in, 0, len, out, 0, scratch); } - private static void requireScratch(int[] scratch) { - if (scratch == null || scratch.length < SIMD_SCRATCH_INTS) { - throw new IllegalArgumentException("scratch must be an int[] allocated with Simd.allocInt(192) or larger"); - } - } - private static byte[] allocByteMaybeSimd(int size) { if (size <= 0) { return new byte[0]; diff --git a/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Base64NativePerformanceTest.java b/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Base64NativePerformanceTest.java index 231bbeccdd..e5d1b933cc 100644 --- a/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Base64NativePerformanceTest.java +++ b/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Base64NativePerformanceTest.java @@ -96,11 +96,11 @@ public boolean runTest() { if (!isIos()) { warmup(nativeBase64, payload, payloadBytes, nativeEncoded, cn1EncodedBytes, cn1DecodedBuffer, - runSimdBenchmark, simdPayloadBytes, simdEncodedBytes, simdDecodedBuffer, simdScratch); + runSimdBenchmark, simdPayloadBytes, simdEncodedBytes, simdDecodedBuffer, simdScratch, encodedLen); } if (runSimdBenchmark) { warmup(nativeBase64, payload, payloadBytes, nativeEncoded, cn1EncodedBytes, cn1DecodedBuffer, - true, simdPayloadBytes, simdEncodedBytes, simdDecodedBuffer, simdScratch); + true, simdPayloadBytes, simdEncodedBytes, simdDecodedBuffer, simdScratch, encodedLen); } long nativeEncodeMs = measureNativeEncode(nativeBase64, payload); @@ -139,15 +139,27 @@ public boolean runTest() { private static void warmup(Base64Native nativeBase64, String payload, byte[] payloadBytes, String nativeEncoded, byte[] cn1EncodedBytes, byte[] cn1DecodedBuffer, boolean includeSimd, byte[] simdPayloadBytes, byte[] simdEncodedBytes, - byte[] simdDecodedBuffer, int[] simdScratch) { + byte[] simdDecodedBuffer, int[] simdScratch, int encodedLen) { for (int i = 0; i < 40; i++) { nativeBase64.encodeUtf8(payload); - Base64.encodeNoNewline(payloadBytes, cn1EncodedBytes); + int cn1EncodedWritten = Base64.encodeNoNewline(payloadBytes, cn1EncodedBytes); + if (cn1EncodedWritten != encodedLen) { + throw new IllegalStateException("Warmup CN1 encode length mismatch"); + } nativeBase64.decodeToUtf8(nativeEncoded); - Base64.decode(cn1EncodedBytes, cn1DecodedBuffer); + int cn1DecodedWritten = Base64.decode(cn1EncodedBytes, cn1DecodedBuffer); + if (cn1DecodedWritten != payloadBytes.length || !byteArraysEqual(payloadBytes, cn1DecodedBuffer, payloadBytes.length)) { + throw new IllegalStateException("Warmup CN1 decode mismatch"); + } if (includeSimd) { - Base64.encodeNoNewlineSimd(simdPayloadBytes, 0, simdPayloadBytes.length, simdEncodedBytes, 0, simdScratch); - Base64.decodeNoWhitespaceSimd(simdEncodedBytes, 0, simdEncodedBytes.length, simdDecodedBuffer, 0, simdScratch); + int simdEncodedWritten = Base64.encodeNoNewlineSimd(simdPayloadBytes, 0, simdPayloadBytes.length, simdEncodedBytes, 0, simdScratch); + if (simdEncodedWritten != encodedLen || !byteArraysEqual(cn1EncodedBytes, simdEncodedBytes, encodedLen)) { + throw new IllegalStateException("Warmup SIMD encode mismatch"); + } + int simdDecodedWritten = Base64.decodeNoWhitespaceSimd(simdEncodedBytes, 0, encodedLen, simdDecodedBuffer, 0, simdScratch); + if (simdDecodedWritten != payloadBytes.length || !byteArraysEqual(payloadBytes, simdDecodedBuffer, payloadBytes.length)) { + throw new IllegalStateException("Warmup SIMD decode mismatch"); + } } } } From 1af686d84b7c0c7272d955cf8846bc30a1568ef4 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Mon, 13 Apr 2026 20:36:38 +0300 Subject: [PATCH 06/12] Replace hardcoded Base64 in Simd with generic primitives; rewrite Base64 SIMD in Java (#4745) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add NEON-accelerated base64Encode/base64Decode to Simd API and wire into Base64 SIMD methods Agent-Logs-Url: https://github.com/codenameone/CodenameOne/sessions/c218992e-943a-4ce5-8d63-f82c0792416f Co-authored-by: shai-almog <67850168+shai-almog@users.noreply.github.com> * Address code review: clarify loop conditions and comment in NEON base64 Agent-Logs-Url: https://github.com/codenameone/CodenameOne/sessions/c218992e-943a-4ce5-8d63-f82c0792416f Co-authored-by: shai-almog <67850168+shai-almog@users.noreply.github.com> * Add new SIMD generic primitive declarations and remove base64 methods from IOSSimd - Added shl, shrLogical, addWrapping, subWrapping for byte arrays - Added offset-based overloads for unpackUnsignedByteToInt, add, cmpEq, cmpLt, select - Removed base64Encode and base64Decode declarations Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: shai-almog <67850168+shai-almog@users.noreply.github.com> * Replace base64 overrides with generic SIMD primitive validation wrappers in JavaSESimd Remove base64Encode and base64Decode overrides. Add validation wrapper overrides for new generic Simd primitives: shl, shrLogical, addWrapping, subWrapping, unpackUnsignedByteToInt, add (int[]), cmpEq, cmpLt, and select. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: shai-almog <67850168+shai-almog@users.noreply.github.com> * Replace base64 NEON section with generic SIMD primitives in IOSSimd.m Remove NEON-accelerated Base64 encode/decode implementations and add NEON implementations for new generic Simd primitives: shl, shrLogical, addWrapping, subWrapping, unpackUnsignedByteToInt, add (int), cmpEq, cmpLt, and select (with offset parameters). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: shai-almog <67850168+shai-almog@users.noreply.github.com> * Remove base64 methods from Simd; add generic byte/int primitives; rewrite Base64 SIMD in Java - Remove base64Encode/base64Decode from Simd, IOSSimd, JavaSESimd, IOSSimd.m - Add generic byte shift primitives: shl(byte[]), shrLogical(byte[]) - Add wrapping byte arithmetic: addWrapping(byte[]), subWrapping(byte[]) - Add offset-based int overloads: unpackUnsignedByteToInt, add, cmpLt, cmpEq, select - Add NEON implementations for all new primitives in IOSSimd.m - Add validation wrappers in JavaSESimd.java - Rewrite encodeNoNewlineSimd in Java using int-domain SIMD compare/select - Rewrite decodeNoWhitespaceSimd in Java using int-domain SIMD shift/or - Update SimdTest with tests for new primitives - All 2345 tests pass Agent-Logs-Url: https://github.com/codenameone/CodenameOne/sessions/5fc6976c-f0ec-4638-b22b-2cbc9c9ca5dd Co-authored-by: shai-almog <67850168+shai-almog@users.noreply.github.com> * Optimize SIMD Base64: replace slow scatter/gather + per-element SIMD calls with fast inline scalar Java The previous approach called 15+ individual Simd operations per 48-byte chunk (each a virtual dispatch + JNI transition on iOS), plus scalar scatter/gather loops for byte↔int conversion. This added ~2500 JNI transitions per encode of 8KB, making it 64-109% slower than the already-fast scalar code. Replace with the same 4x-unrolled table-lookup approach used by encodeNoNewline(), now with offset support. This matches the scalar CN1 encode/decode performance while maintaining the same API contract. Agent-Logs-Url: https://github.com/codenameone/CodenameOne/sessions/90b8c36e-8f20-47da-9fb4-56344f18a336 Co-authored-by: shai-almog <67850168+shai-almog@users.noreply.github.com> * Revert "Optimize SIMD Base64: replace slow scatter/gather + per-element SIMD calls with fast inline scalar Java" This reverts commit 00e51031f951fd025776b4623d0372703c14bfd1. --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: shai-almog <67850168+shai-almog@users.noreply.github.com> --- .../src/com/codename1/util/Base64.java | 396 ++++++++++++------ CodenameOne/src/com/codename1/util/Simd.java | 56 +++ .../com/codename1/impl/javase/JavaSESimd.java | 88 ++++ Ports/iOSPort/nativeSources/IOSSimd.m | 132 ++++++ .../src/com/codename1/impl/ios/IOSSimd.java | 27 ++ .../java/com/codename1/util/SimdTest.java | 133 ++++++ 6 files changed, 708 insertions(+), 124 deletions(-) diff --git a/CodenameOne/src/com/codename1/util/Base64.java b/CodenameOne/src/com/codename1/util/Base64.java index af054b326c..7e4b7125d7 100644 --- a/CodenameOne/src/com/codename1/util/Base64.java +++ b/CodenameOne/src/com/codename1/util/Base64.java @@ -39,6 +39,8 @@ public abstract class Base64 { private static final byte[] decodeMap = new byte[256]; private static final int[] decodeMapInt = new int[256]; + private static final int SIMD_SCRATCH_INTS = 192; + static { for (int i = 0; i < decodeMap.length; i++) { decodeMap[i] = (byte) DECODE_INVALID; @@ -434,95 +436,183 @@ public static int encodeNoNewline(byte[] in, byte[] out) { return outIndex; } + // ---- SIMD constant tables (lazily initialized) ---- + private static volatile int[] simdEncConst; + + // Encode constant offsets (each sub-array is 64 ints) + private static final int ENC_K26 = 0; // threshold 26 + private static final int ENC_K52 = 64; // threshold 52 + private static final int ENC_K62 = 128; // threshold 62 + private static final int ENC_OFF_AZ = 192; // +65 for A-Z + private static final int ENC_OFF_az = 256; // +71 for a-z + private static final int ENC_OFF_09 = 320; // -4 for 0-9 + private static final int ENC_OFF_PLUS = 384; // -19 for + + private static final int ENC_OFF_SLASH = 448; // -16 for / + // masks (16 ints each at offset 512) + private static final int ENC_M03 = 512; + private static final int ENC_M0F = 528; + private static final int ENC_M3F = 544; + private static final int ENC_CONST_SIZE = 560; + + private static volatile byte[] simdMask; + + private static int[] getSimdEncConst(Simd simd) { + int[] c = simdEncConst; + if (c != null) { + return c; + } + c = simd.allocInt(ENC_CONST_SIZE); + fillRange(c, ENC_K26, 64, 26); + fillRange(c, ENC_K52, 64, 52); + fillRange(c, ENC_K62, 64, 62); + fillRange(c, ENC_OFF_AZ, 64, 65); + fillRange(c, ENC_OFF_az, 64, 71); + fillRange(c, ENC_OFF_09, 64, -4); + fillRange(c, ENC_OFF_PLUS, 64, -19); + fillRange(c, ENC_OFF_SLASH, 64, -16); + fillRange(c, ENC_M03, 16, 0x03); + fillRange(c, ENC_M0F, 16, 0x0F); + fillRange(c, ENC_M3F, 16, 0x3F); + simdEncConst = c; + return c; + } + + private static byte[] getSimdMask(Simd simd) { + byte[] m = simdMask; + if (m != null) { + return m; + } + m = simd.allocByte(64); + simdMask = m; + return m; + } + + private static void fillRange(int[] arr, int offset, int len, int val) { + for (int i = offset, end = offset + len; i < end; i++) { + arr[i] = val; + } + } + /// SIMD-optimized Base64 encoding with explicit offsets and caller scratch. - /// Scratch layout: a single SIMD-allocated `int[]` buffer of at least 192 ints. + /// Uses generic Simd int-domain operations to extract 6-bit indices and + /// map them to ASCII via branchless compare/select. /// - /// Usage example: - /// ```java - /// Simd simd = Simd.get(); - /// byte[] input = simd.allocByte(data.length); - /// System.arraycopy(data, 0, input, 0, data.length); - /// byte[] output = simd.allocByte(((data.length + 2) / 3) * 4); - /// int[] scratch = simd.allocInt(192); - /// int written = Base64.encodeNoNewlineSimd(input, 0, input.length, output, 0, scratch); - /// ``` + /// Scratch layout: a single SIMD-allocated `int[]` buffer of at least 192 ints. + /// Working regions within scratch: + /// - [0..47] : input bytes unpacked to ints (3 stripes of 16) + /// - [48..111] : output indices / ASCII values (4 stripes of 16) + /// - [112..175] : temporaries @DisableDebugInfo @DisableNullChecksAndArrayBoundsChecks public static int encodeNoNewlineSimd(byte[] in, int inOffset, int inLength, byte[] out, int outOffset, int[] scratch) { int outputLength = ((inLength + 2) / 3) * 4; - if (out.length - outOffset < outputLength) { - throw new IllegalArgumentException("Output buffer too small for encoded data"); - } if (inLength == 0) { return 0; } - byte[] mapLocal = map; - - int end = inOffset + inLength - (inLength % 3); - int inIndex = inOffset; - int outIndex = outOffset; - int fastEnd = end - 12; - for (; inIndex <= fastEnd; inIndex += 12) { - int b0 = in[inIndex] & 0xff; - int b1 = in[inIndex + 1] & 0xff; - int b2 = in[inIndex + 2] & 0xff; - int b3 = in[inIndex + 3] & 0xff; - int b4 = in[inIndex + 4] & 0xff; - int b5 = in[inIndex + 5] & 0xff; - int b6 = in[inIndex + 6] & 0xff; - int b7 = in[inIndex + 7] & 0xff; - int b8 = in[inIndex + 8] & 0xff; - int b9 = in[inIndex + 9] & 0xff; - int b10 = in[inIndex + 10] & 0xff; - int b11 = in[inIndex + 11] & 0xff; - - out[outIndex++] = mapLocal[b0 >> 2]; - out[outIndex++] = mapLocal[((b0 & 0x03) << 4) | (b1 >> 4)]; - out[outIndex++] = mapLocal[((b1 & 0x0f) << 2) | (b2 >> 6)]; - out[outIndex++] = mapLocal[b2 & 0x3f]; - - out[outIndex++] = mapLocal[b3 >> 2]; - out[outIndex++] = mapLocal[((b3 & 0x03) << 4) | (b4 >> 4)]; - out[outIndex++] = mapLocal[((b4 & 0x0f) << 2) | (b5 >> 6)]; - out[outIndex++] = mapLocal[b5 & 0x3f]; - - out[outIndex++] = mapLocal[b6 >> 2]; - out[outIndex++] = mapLocal[((b6 & 0x03) << 4) | (b7 >> 4)]; - out[outIndex++] = mapLocal[((b7 & 0x0f) << 2) | (b8 >> 6)]; - out[outIndex++] = mapLocal[b8 & 0x3f]; + requireScratch(scratch); + Simd simd = Simd.get(); + int[] ec = getSimdEncConst(simd); + byte[] mask = getSimdMask(simd); + + int end3 = inOffset + inLength - (inLength % 3); + int si = inOffset; + int di = outOffset; + + // Process 16 triplets (48 input bytes → 64 output bytes) per iteration + int simdEnd = end3 - 48 + 1; + while (si < simdEnd) { + // 1. Scatter input bytes into 3 int stripes (b0, b1, b2) + for (int j = 0; j < 16; j++) { + scratch[j] = in[si + j * 3] & 0xff; + scratch[16 + j] = in[si + j * 3 + 1] & 0xff; + scratch[32 + j] = in[si + j * 3 + 2] & 0xff; + } - out[outIndex++] = mapLocal[b9 >> 2]; - out[outIndex++] = mapLocal[((b9 & 0x03) << 4) | (b10 >> 4)]; - out[outIndex++] = mapLocal[((b10 & 0x0f) << 2) | (b11 >> 6)]; - out[outIndex++] = mapLocal[b11 & 0x3f]; - } + // 2. Extract 4 six-bit index stripes using SIMD int ops + // idx0 = b0 >> 2 + simd.shrLogical(scratch, 0, 2, scratch, 48, 16); + + // idx1 = ((b0 & 0x03) << 4) | (b1 >> 4) + simd.and(scratch, 0, ec, ENC_M03, scratch, 112, 16); + simd.shl(scratch, 112, 4, scratch, 112, 16); + simd.shrLogical(scratch, 16, 4, scratch, 128, 16); + simd.or(scratch, 112, scratch, 128, scratch, 64, 16); + + // idx2 = ((b1 & 0x0f) << 2) | (b2 >> 6) + simd.and(scratch, 16, ec, ENC_M0F, scratch, 112, 16); + simd.shl(scratch, 112, 2, scratch, 112, 16); + simd.shrLogical(scratch, 32, 6, scratch, 128, 16); + simd.or(scratch, 112, scratch, 128, scratch, 80, 16); + + // idx3 = b2 & 0x3f + simd.and(scratch, 32, ec, ENC_M3F, scratch, 96, 16); + + // 3. Map all 64 indices to ASCII in batch + // Initialize offset accumulator [112..175] with '/' offset (-16) + System.arraycopy(ec, ENC_OFF_SLASH, scratch, 112, 64); + + // eq62 → use '+' offset + simd.cmpEq(scratch, 48, ec, ENC_K62, mask, 0, 64); + simd.select(mask, 0, ec, ENC_OFF_PLUS, scratch, 112, scratch, 112, 64); + + // lt62 → use '0'-'9' offset + simd.cmpLt(scratch, 48, ec, ENC_K62, mask, 0, 64); + simd.select(mask, 0, ec, ENC_OFF_09, scratch, 112, scratch, 112, 64); + + // lt52 → use 'a'-'z' offset + simd.cmpLt(scratch, 48, ec, ENC_K52, mask, 0, 64); + simd.select(mask, 0, ec, ENC_OFF_az, scratch, 112, scratch, 112, 64); + + // lt26 → use 'A'-'Z' offset + simd.cmpLt(scratch, 48, ec, ENC_K26, mask, 0, 64); + simd.select(mask, 0, ec, ENC_OFF_AZ, scratch, 112, scratch, 112, 64); + + // ascii = indices + offset + simd.add(scratch, 48, scratch, 112, scratch, 48, 64); + + // 4. Interleave 4 output stripes into output bytes + for (int j = 0; j < 16; j++) { + out[di + j * 4] = (byte) scratch[48 + j]; + out[di + j * 4 + 1] = (byte) scratch[64 + j]; + out[di + j * 4 + 2] = (byte) scratch[80 + j]; + out[di + j * 4 + 3] = (byte) scratch[96 + j]; + } - for (; inIndex < end; inIndex += 3) { - int x0 = in[inIndex] & 0xff; - int x1 = in[inIndex + 1] & 0xff; - int x2 = in[inIndex + 2] & 0xff; - out[outIndex++] = mapLocal[x0 >> 2]; - out[outIndex++] = mapLocal[((x0 & 0x03) << 4) | (x1 >> 4)]; - out[outIndex++] = mapLocal[((x1 & 0x0f) << 2) | (x2 >> 6)]; - out[outIndex++] = mapLocal[x2 & 0x3f]; + si += 48; + di += 64; } - switch (inOffset + inLength - end) { + // Scalar tail for remaining complete triplets + byte[] mapLocal = map; + while (si < end3) { + int b0 = in[si] & 0xff; + int b1 = in[si + 1] & 0xff; + int b2 = in[si + 2] & 0xff; + out[di] = mapLocal[b0 >> 2]; + out[di + 1] = mapLocal[((b0 & 0x03) << 4) | (b1 >> 4)]; + out[di + 2] = mapLocal[((b1 & 0x0f) << 2) | (b2 >> 6)]; + out[di + 3] = mapLocal[b2 & 0x3f]; + si += 3; + di += 4; + } + + // Handle 1- or 2-byte remainder with padding + switch (inOffset + inLength - end3) { case 1: { - int x0 = in[end] & 0xff; - out[outIndex++] = mapLocal[x0 >> 2]; - out[outIndex++] = mapLocal[(x0 & 0x03) << 4]; - out[outIndex++] = '='; - out[outIndex++] = '='; + int b0 = in[si] & 0xff; + out[di] = mapLocal[b0 >> 2]; + out[di + 1] = mapLocal[(b0 & 0x03) << 4]; + out[di + 2] = '='; + out[di + 3] = '='; break; } case 2: { - int x0 = in[end] & 0xff; - int x1 = in[end + 1] & 0xff; - out[outIndex++] = mapLocal[x0 >> 2]; - out[outIndex++] = mapLocal[((x0 & 0x03) << 4) | (x1 >> 4)]; - out[outIndex++] = mapLocal[(x1 & 0x0f) << 2]; - out[outIndex++] = '='; + int b0 = in[si] & 0xff; + int b1 = in[si + 1] & 0xff; + out[di] = mapLocal[b0 >> 2]; + out[di + 1] = mapLocal[((b0 & 0x03) << 4) | (b1 >> 4)]; + out[di + 2] = mapLocal[(b1 & 0x0f) << 2]; + out[di + 3] = '='; break; } default: @@ -532,27 +622,29 @@ public static int encodeNoNewlineSimd(byte[] in, int inOffset, int inLength, byt } /// SIMD-optimized Base64 decoding for no-whitespace input. - /// Scratch layout: a single SIMD-allocated `int[]` buffer of at least 192 ints. + /// Uses generic Simd int-domain operations to map ASCII chars back to + /// 6-bit values via branchless compare/select, then combines into bytes. /// /// Returns decoded bytes written, or `-1` for invalid input. /// - /// Usage example: - /// ```java - /// Simd simd = Simd.get(); - /// byte[] encoded = simd.allocByte(base64Bytes.length); - /// System.arraycopy(base64Bytes, 0, encoded, 0, base64Bytes.length); - /// byte[] decoded = simd.allocByte((encoded.length / 4) * 3); - /// int[] scratch = simd.allocInt(192); - /// int written = Base64.decodeNoWhitespaceSimd(encoded, 0, encoded.length, decoded, 0, scratch); - /// ``` + /// Scratch layout: a single SIMD-allocated `int[]` buffer of at least 192 ints. + /// Working regions: + /// - [0..63] : input chars unpacked to ints / decoded 6-bit values + /// - [64..111] : output byte values (3 stripes of 16) + /// - [112..175] : temporaries @DisableDebugInfo @DisableNullChecksAndArrayBoundsChecks public static int decodeNoWhitespaceSimd(byte[] in, int inOffset, int inLength, byte[] out, int outOffset, int[] scratch) { + if (inLength == 0) { + return 0; + } if ((inLength & 0x3) != 0) { return -1; } + requireScratch(scratch); + int pad = 0; - if (inLength > 0 && in[inOffset + inLength - 1] == '=') { + if (in[inOffset + inLength - 1] == '=') { pad++; if (inLength > 1 && in[inOffset + inLength - 2] == '=') { pad++; @@ -565,58 +657,108 @@ public static int decodeNoWhitespaceSimd(byte[] in, int inOffset, int inLength, if (outLength <= 0) { return 0; } - if (out.length - outOffset < outLength) { - throw new IllegalArgumentException("Output buffer too small for decoded data"); - } - int[] decodeMap = decodeMapInt; + Simd simd = Simd.get(); + int[] decodeMapLocal = decodeMapInt; int fullLen = inLength - (pad > 0 ? 4 : 0); int fullEnd = inOffset + fullLen; - int inIndex = inOffset; - int outIndex = outOffset; - for (; inIndex < fullEnd; inIndex += 4) { - int c0 = in[inIndex] & 0xff; - int c1 = in[inIndex + 1] & 0xff; - int c2 = in[inIndex + 2] & 0xff; - int c3v = in[inIndex + 3] & 0xff; - int x0 = decodeMap[c0]; - int x1 = decodeMap[c1]; - int x2 = decodeMap[c2]; - int x3 = decodeMap[c3v]; - if ((x0 | x1 | x2 | x3) < 0) { + int si = inOffset; + int di = outOffset; + + // Process 16 quads (64 input bytes → 48 output bytes) per iteration + int simdEnd = fullEnd - 64 + 1; + while (si < simdEnd) { + // 1. De-interleave and decode: scatter 64 input bytes into 4 stripes, + // converting ASCII to 6-bit values using the scalar decode table + boolean invalid = false; + for (int j = 0; j < 16; j++) { + int v0 = decodeMapLocal[in[si + j * 4] & 0xff]; + int v1 = decodeMapLocal[in[si + j * 4 + 1] & 0xff]; + int v2 = decodeMapLocal[in[si + j * 4 + 2] & 0xff]; + int v3 = decodeMapLocal[in[si + j * 4 + 3] & 0xff]; + scratch[j] = v0; + scratch[16 + j] = v1; + scratch[32 + j] = v2; + scratch[48 + j] = v3; + if ((v0 | v1 | v2 | v3) < 0) { + invalid = true; + } + } + if (invalid) { return -1; } - int quantum = (x0 << 18) | (x1 << 12) | (x2 << 6) | x3; - out[outIndex++] = (byte)((quantum >> 16) & 0xff); - out[outIndex++] = (byte)((quantum >> 8) & 0xff); - out[outIndex++] = (byte)(quantum & 0xff); - } - if (pad == 0) { - return outLength; - } + // 2. Combine 4 six-bit values into 3 bytes using SIMD int ops + // o0 = (d0 << 2) | (d1 >> 4) + simd.shl(scratch, 0, 2, scratch, 64, 16); + simd.shrLogical(scratch, 16, 4, scratch, 112, 16); + simd.or(scratch, 64, scratch, 112, scratch, 64, 16); + + // o1 = (d1 << 4) | (d2 >> 2) + simd.shl(scratch, 16, 4, scratch, 80, 16); + simd.shrLogical(scratch, 32, 2, scratch, 112, 16); + simd.or(scratch, 80, scratch, 112, scratch, 80, 16); + + // o2 = (d2 << 6) | d3 + simd.shl(scratch, 32, 6, scratch, 96, 16); + simd.or(scratch, 96, scratch, 48, scratch, 96, 16); + + // 3. Interleave 3 output stripes into output bytes + for (int j = 0; j < 16; j++) { + out[di + j * 3] = (byte) scratch[64 + j]; + out[di + j * 3 + 1] = (byte) scratch[80 + j]; + out[di + j * 3 + 2] = (byte) scratch[96 + j]; + } - int i = inOffset + inLength - 4; - int c0 = in[i] & 0xff; - int c1 = in[i + 1] & 0xff; - int x0 = decodeMap[c0]; - int x1 = decodeMap[c1]; - if ((x0 | x1) < 0) { - return -1; + si += 64; + di += 48; } - out[outIndex++] = (byte)((x0 << 2) | (x1 >> 4)); - if (pad == 2) { - return (in[i + 2] == '=' && in[i + 3] == '=') ? outLength : -1; - } - if (in[i + 3] != '=') { - return -1; + + // Scalar tail for remaining complete quads + while (si < fullEnd) { + int c0 = in[si] & 0xff; + int c1 = in[si + 1] & 0xff; + int c2 = in[si + 2] & 0xff; + int c3 = in[si + 3] & 0xff; + int b0 = decodeMapLocal[c0]; + int b1 = decodeMapLocal[c1]; + int b2 = decodeMapLocal[c2]; + int b3 = decodeMapLocal[c3]; + if ((b0 | b1 | b2 | b3) < 0) { + return -1; + } + int quantum = (b0 << 18) | (b1 << 12) | (b2 << 6) | b3; + out[di++] = (byte) ((quantum >> 16) & 0xff); + out[di++] = (byte) ((quantum >> 8) & 0xff); + out[di++] = (byte) (quantum & 0xff); + si += 4; } - int x2 = decodeMap[in[i + 2] & 0xff]; - if (x2 < 0) { - return -1; + + // Handle last quad with padding + if (pad > 0) { + int i = inOffset + inLength - 4; + int c0 = in[i] & 0xff; + int c1 = in[i + 1] & 0xff; + int b0 = decodeMapLocal[c0]; + int b1 = decodeMapLocal[c1]; + if ((b0 | b1) < 0) { + return -1; + } + out[di++] = (byte) ((b0 << 2) | (b1 >> 4)); + if (pad == 2) { + return (in[i + 2] == '=' && in[i + 3] == '=') ? outLength : -1; + } + if (in[i + 3] != '=') { + return -1; + } + int b2 = decodeMapLocal[in[i + 2] & 0xff]; + if (b2 < 0) { + return -1; + } + out[di] = (byte) ((b1 << 4) | (b2 >> 2)); } - out[outIndex] = (byte)((x1 << 4) | (x2 >> 2)); + return outLength; } @@ -632,6 +774,12 @@ public static int decodeNoWhitespaceSimd(byte[] in, int len, byte[] out, int[] s return decodeNoWhitespaceSimd(in, 0, len, out, 0, scratch); } + private static void requireScratch(int[] scratch) { + if (scratch == null || scratch.length < SIMD_SCRATCH_INTS) { + throw new IllegalArgumentException("scratch must be an int[] allocated with Simd.allocInt(192) or larger"); + } + } + private static byte[] allocByteMaybeSimd(int size) { if (size <= 0) { return new byte[0]; diff --git a/CodenameOne/src/com/codename1/util/Simd.java b/CodenameOne/src/com/codename1/util/Simd.java index 37153ba3fa..81aaa07033 100644 --- a/CodenameOne/src/com/codename1/util/Simd.java +++ b/CodenameOne/src/com/codename1/util/Simd.java @@ -154,12 +154,44 @@ public void select(byte[] mask, byte[] trueValues, byte[] falseValues, byte[] ds } } + public void shl(byte[] src, int bits, byte[] dst, int offset, int length) { + int shift = bits & 7; + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)((src[i] & 0xff) << shift); + } + } + + public void shrLogical(byte[] src, int bits, byte[] dst, int offset, int length) { + int shift = bits & 7; + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)((src[i] & 0xff) >>> shift); + } + } + + public void addWrapping(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)(srcA[i] + srcB[i]); + } + } + + public void subWrapping(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)(srcA[i] - srcB[i]); + } + } + public void unpackUnsignedByteToInt(byte[] src, int[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = src[i] & 0xff; } } + public void unpackUnsignedByteToInt(byte[] src, int srcOffset, int[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = src[srcOffset + i] & 0xff; + } + } + public void packIntToByteSaturating(int[] src, byte[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = clampByte(src[i]); @@ -191,6 +223,12 @@ public void add(int[] srcA, int[] srcB, int[] dst, int offset, int length) { } } + public void add(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = srcA[srcAOffset + i] + srcB[srcBOffset + i]; + } + } + public void sub(int[] srcA, int[] srcB, int[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = srcA[i] - srcB[i]; @@ -312,12 +350,24 @@ public void cmpEq(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length } } + public void cmpEq(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, byte[] dstMask, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dstMask[dstOffset + i] = srcA[srcAOffset + i] == srcB[srcBOffset + i] ? (byte)-1 : (byte)0; + } + } + public void cmpLt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dstMask[i] = srcA[i] < srcB[i] ? (byte)-1 : (byte)0; } } + public void cmpLt(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, byte[] dstMask, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dstMask[dstOffset + i] = srcA[srcAOffset + i] < srcB[srcBOffset + i] ? (byte)-1 : (byte)0; + } + } + public void cmpGt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dstMask[i] = srcA[i] > srcB[i] ? (byte)-1 : (byte)0; @@ -330,6 +380,12 @@ public void select(byte[] mask, int[] trueValues, int[] falseValues, int[] dst, } } + public void select(byte[] mask, int maskOffset, int[] trueValues, int trueOffset, int[] falseValues, int falseOffset, int[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = mask[maskOffset + i] != 0 ? trueValues[trueOffset + i] : falseValues[falseOffset + i]; + } + } + public int sum(int[] src, int offset, int length) { int out = 0; for (int i = offset, end = offset + length; i < end; i++) { diff --git a/Ports/JavaSE/src/com/codename1/impl/javase/JavaSESimd.java b/Ports/JavaSE/src/com/codename1/impl/javase/JavaSESimd.java index 8787c677c7..d5cddbdd8b 100644 --- a/Ports/JavaSE/src/com/codename1/impl/javase/JavaSESimd.java +++ b/Ports/JavaSE/src/com/codename1/impl/javase/JavaSESimd.java @@ -450,6 +450,94 @@ public float dot(float[] srcA, float[] srcB, int offset, int length) { return super.dot(srcA, srcB, offset, length); } + @Override + public void shl(byte[] src, int bits, byte[] dst, int offset, int length) { + validateUnaryByte(src, dst, offset, length); + validateRegistered(src, dst); + super.shl(src, bits, dst, offset, length); + } + + @Override + public void shrLogical(byte[] src, int bits, byte[] dst, int offset, int length) { + validateUnaryByte(src, dst, offset, length); + validateRegistered(src, dst); + super.shrLogical(src, bits, dst, offset, length); + } + + @Override + public void addWrapping(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.addWrapping(srcA, srcB, dst, offset, length); + } + + @Override + public void subWrapping(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.subWrapping(srcA, srcB, dst, offset, length); + } + + @Override + public void unpackUnsignedByteToInt(byte[] src, int srcOffset, int[] dst, int dstOffset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, srcOffset, length, "src"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(src, dst); + super.unpackUnsignedByteToInt(src, srcOffset, dst, dstOffset, length); + } + + @Override + public void add(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dst, "dst"); + validateRange(srcA.length, srcAOffset, length, "srcA"); + validateRange(srcB.length, srcBOffset, length, "srcB"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(srcA, srcB, dst); + super.add(srcA, srcAOffset, srcB, srcBOffset, dst, dstOffset, length); + } + + @Override + public void cmpEq(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, byte[] dstMask, int dstOffset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dstMask, "dstMask"); + validateRange(srcA.length, srcAOffset, length, "srcA"); + validateRange(srcB.length, srcBOffset, length, "srcB"); + validateRange(dstMask.length, dstOffset, length, "dstMask"); + validateRegistered(srcA, srcB, dstMask); + super.cmpEq(srcA, srcAOffset, srcB, srcBOffset, dstMask, dstOffset, length); + } + + @Override + public void cmpLt(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, byte[] dstMask, int dstOffset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dstMask, "dstMask"); + validateRange(srcA.length, srcAOffset, length, "srcA"); + validateRange(srcB.length, srcBOffset, length, "srcB"); + validateRange(dstMask.length, dstOffset, length, "dstMask"); + validateRegistered(srcA, srcB, dstMask); + super.cmpLt(srcA, srcAOffset, srcB, srcBOffset, dstMask, dstOffset, length); + } + + @Override + public void select(byte[] mask, int maskOffset, int[] trueValues, int trueOffset, int[] falseValues, int falseOffset, int[] dst, int dstOffset, int length) { + validateNotNull(mask, "mask"); + validateNotNull(trueValues, "trueValues"); + validateNotNull(falseValues, "falseValues"); + validateNotNull(dst, "dst"); + validateRange(mask.length, maskOffset, length, "mask"); + validateRange(trueValues.length, trueOffset, length, "trueValues"); + validateRange(falseValues.length, falseOffset, length, "falseValues"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(mask, trueValues, falseValues, dst); + super.select(mask, maskOffset, trueValues, trueOffset, falseValues, falseOffset, dst, dstOffset, length); + } + private void validateRegistered(Object... arrays) { for (int i = 0; i < arrays.length; i++) { Object arr = arrays[i]; diff --git a/Ports/iOSPort/nativeSources/IOSSimd.m b/Ports/iOSPort/nativeSources/IOSSimd.m index e7d1a042ae..27040050fa 100644 --- a/Ports/iOSPort/nativeSources/IOSSimd.m +++ b/Ports/iOSPort/nativeSources/IOSSimd.m @@ -915,3 +915,135 @@ JAVA_VOID com_codename1_impl_ios_IOSSimd_select___byte_1ARRAY_int_1ARRAY_int_1AR d[i] = m[i] != 0 ? t[i] : f[i]; } } + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shl___byte_1ARRAY_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int shift = bits & 7; + int i = offset; + int end = offset + length; + int8x16_t vshift = vdupq_n_s8((int8_t)shift); + for (; i <= end - 16; i += 16) { + uint8x16_t vs = vld1q_u8((uint8_t*)(s + i)); + vst1q_u8((uint8_t*)(d + i), vshlq_u8(vs, vshift)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)(((uint8_t)s[i]) << shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shrLogical___byte_1ARRAY_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int shift = bits & 7; + int i = offset; + int end = offset + length; + int8x16_t vneg = vdupq_n_s8((int8_t)(-shift)); + for (; i <= end - 16; i += 16) { + uint8x16_t vs = vld1q_u8((uint8_t*)(s + i)); + vst1q_u8((uint8_t*)(d + i), vshlq_u8(vs, vneg)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)(((uint8_t)s[i]) >> shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_addWrapping___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + uint8x16_t va = vld1q_u8((uint8_t*)(a + i)); + uint8x16_t vb = vld1q_u8((uint8_t*)(b + i)); + vst1q_u8((uint8_t*)(d + i), vaddq_u8(va, vb)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)((uint8_t)a[i] + (uint8_t)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_subWrapping___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + uint8x16_t va = vld1q_u8((uint8_t*)(a + i)); + uint8x16_t vb = vld1q_u8((uint8_t*)(b + i)); + vst1q_u8((uint8_t*)(d + i), vsubq_u8(va, vb)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)((uint8_t)a[i] - (uint8_t)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_unpackUnsignedByteToInt___byte_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 16; i += 16) { + uint8x16_t v = vld1q_u8((uint8_t*)(s + srcOffset + i)); + uint16x8_t lo16 = vmovl_u8(vget_low_u8(v)); + uint16x8_t hi16 = vmovl_u8(vget_high_u8(v)); + uint32x4_t x0 = vmovl_u16(vget_low_u16(lo16)); + uint32x4_t x1 = vmovl_u16(vget_high_u16(lo16)); + uint32x4_t x2 = vmovl_u16(vget_low_u16(hi16)); + uint32x4_t x3 = vmovl_u16(vget_high_u16(hi16)); + vst1q_s32((int32_t*)(d + dstOffset + i), vreinterpretq_s32_u32(x0)); + vst1q_s32((int32_t*)(d + dstOffset + i + 4), vreinterpretq_s32_u32(x1)); + vst1q_s32((int32_t*)(d + dstOffset + i + 8), vreinterpretq_s32_u32(x2)); + vst1q_s32((int32_t*)(d + dstOffset + i + 12), vreinterpretq_s32_u32(x3)); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_INT)(s[srcOffset + i] & 0xff); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_add___int_1ARRAY_int_int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_INT srcAOffset, JAVA_OBJECT srcB, JAVA_INT srcBOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 4; i += 4) { + int32x4_t va = vld1q_s32((int32_t*)(a + srcAOffset + i)); + int32x4_t vb = vld1q_s32((int32_t*)(b + srcBOffset + i)); + vst1q_s32((int32_t*)(d + dstOffset + i), vaddq_s32(va, vb)); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_INT)((int32_t)a[srcAOffset + i] + (int32_t)b[srcBOffset + i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpEq___int_1ARRAY_int_int_1ARRAY_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_INT srcAOffset, JAVA_OBJECT srcB, JAVA_INT srcBOffset, JAVA_OBJECT dstMask, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int end = length; + for (int i = 0; i < end; i++) { + m[dstOffset + i] = a[srcAOffset + i] == b[srcBOffset + i] ? (JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpLt___int_1ARRAY_int_int_1ARRAY_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_INT srcAOffset, JAVA_OBJECT srcB, JAVA_INT srcBOffset, JAVA_OBJECT dstMask, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int end = length; + for (int i = 0; i < end; i++) { + m[dstOffset + i] = a[srcAOffset + i] < b[srcBOffset + i] ? (JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_select___byte_1ARRAY_int_int_1ARRAY_int_int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT mask, JAVA_INT maskOffset, JAVA_OBJECT trueValues, JAVA_INT trueOffset, JAVA_OBJECT falseValues, JAVA_INT falseOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)mask)->data; + JAVA_ARRAY_INT* t = (JAVA_ARRAY_INT*)((JAVA_ARRAY)trueValues)->data; + JAVA_ARRAY_INT* f = (JAVA_ARRAY_INT*)((JAVA_ARRAY)falseValues)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int end = length; + for (int i = 0; i < end; i++) { + d[dstOffset + i] = m[maskOffset + i] != 0 ? t[trueOffset + i] : f[falseOffset + i]; + } +} diff --git a/Ports/iOSPort/src/com/codename1/impl/ios/IOSSimd.java b/Ports/iOSPort/src/com/codename1/impl/ios/IOSSimd.java index 8f63966826..3e1cf604dc 100644 --- a/Ports/iOSPort/src/com/codename1/impl/ios/IOSSimd.java +++ b/Ports/iOSPort/src/com/codename1/impl/ios/IOSSimd.java @@ -200,6 +200,33 @@ public float[] allocFloat(int size) { @Override public native float dot(float[] srcA, float[] srcB, int offset, int length); + @Override + public native void shl(byte[] src, int bits, byte[] dst, int offset, int length); + + @Override + public native void shrLogical(byte[] src, int bits, byte[] dst, int offset, int length); + + @Override + public native void addWrapping(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void subWrapping(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void unpackUnsignedByteToInt(byte[] src, int srcOffset, int[] dst, int dstOffset, int length); + + @Override + public native void add(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length); + + @Override + public native void cmpEq(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, byte[] dstMask, int dstOffset, int length); + + @Override + public native void cmpLt(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, byte[] dstMask, int dstOffset, int length); + + @Override + public native void select(byte[] mask, int maskOffset, int[] trueValues, int trueOffset, int[] falseValues, int falseOffset, int[] dst, int dstOffset, int length); + private native byte[] allocByteNative(int size); private native int[] allocIntNative(int size); private native float[] allocFloatNative(int size); diff --git a/maven/core-unittests/src/test/java/com/codename1/util/SimdTest.java b/maven/core-unittests/src/test/java/com/codename1/util/SimdTest.java index 5663474679..d983e9708e 100644 --- a/maven/core-unittests/src/test/java/com/codename1/util/SimdTest.java +++ b/maven/core-unittests/src/test/java/com/codename1/util/SimdTest.java @@ -110,4 +110,137 @@ void genericBitwiseShiftCompareSelectOpsWork() { assertEquals(20, permuted[2]); assertEquals(0, permuted[3]); } + + @FormTest + void base64SimdMethodsMatchScalar() { + Simd simd = Simd.get(); + if (!simd.isSupported()) { + return; + } + + // Test that SIMD encode matches scalar encode + byte[] input = new byte[8192]; + for (int i = 0; i < input.length; i++) { + input[i] = (byte)(i * 31 + 17); + } + + int encodedLen = ((input.length + 2) / 3) * 4; + byte[] scalarEncoded = new byte[encodedLen]; + int scalarWritten = Base64.encodeNoNewline(input, scalarEncoded); + + byte[] simdInput = simd.allocByte(input.length); + System.arraycopy(input, 0, simdInput, 0, input.length); + byte[] simdEncoded = simd.allocByte(encodedLen); + int[] scratch = simd.allocInt(192); + int simdWritten = Base64.encodeNoNewlineSimd(simdInput, 0, simdInput.length, simdEncoded, 0, scratch); + + assertEquals(scalarWritten, simdWritten); + for (int i = 0; i < scalarWritten; i++) { + assertEquals(scalarEncoded[i], simdEncoded[i], "Encode mismatch at index " + i); + } + + // Test that SIMD decode matches scalar decode + byte[] scalarDecoded = new byte[input.length]; + int scalarDecLen = Base64.decode(scalarEncoded, scalarDecoded); + + byte[] simdDecoded = simd.allocByte(input.length); + int simdDecLen = Base64.decodeNoWhitespaceSimd(simdEncoded, 0, simdWritten, simdDecoded, 0, scratch); + + assertEquals(scalarDecLen, simdDecLen); + for (int i = 0; i < scalarDecLen; i++) { + assertEquals(scalarDecoded[i], simdDecoded[i], "Decode mismatch at index " + i); + } + } + + @FormTest + void byteShlAndShrLogicalWork() { + Simd simd = new Simd(); + byte[] src = new byte[]{(byte)0xAB, (byte)0x01, (byte)0xFF, (byte)0x80}; + byte[] dst = new byte[4]; + + simd.shl(src, 4, dst, 0, 4); + assertEquals((byte)0xB0, dst[0]); + assertEquals((byte)0x10, dst[1]); + assertEquals((byte)0xF0, dst[2]); + assertEquals((byte)0x00, dst[3]); + + simd.shrLogical(src, 4, dst, 0, 4); + assertEquals((byte)0x0A, dst[0]); + assertEquals((byte)0x00, dst[1]); + assertEquals((byte)0x0F, dst[2]); + assertEquals((byte)0x08, dst[3]); + } + + @FormTest + void addWrappingAndSubWrappingWork() { + Simd simd = new Simd(); + byte[] a = new byte[]{(byte)200, (byte)100, (byte)0, (byte)255}; + byte[] b = new byte[]{(byte)100, (byte)200, (byte)1, (byte)1}; + byte[] out = new byte[4]; + + simd.addWrapping(a, b, out, 0, 4); + assertEquals((byte)44, out[0]); // 200+100=300 mod 256=44 + assertEquals((byte)44, out[1]); // 100+200=300 mod 256=44 + assertEquals((byte)1, out[2]); // 0+1=1 + assertEquals((byte)0, out[3]); // 255+1=256 mod 256=0 + + simd.subWrapping(a, b, out, 0, 4); + assertEquals((byte)100, out[0]); // 200-100=100 + assertEquals((byte)156, out[1]); // 100-200=-100 mod 256=156 + assertEquals((byte)255, out[2]); // 0-1=-1 mod 256=255 + assertEquals((byte)254, out[3]); // 255-1=254 + } + + @FormTest + void offsetBasedIntOpsWork() { + Simd simd = new Simd(); + + // Test offset-based unpack + byte[] bytes = new byte[]{10, 20, (byte)200, (byte)255}; + int[] ints = new int[8]; + simd.unpackUnsignedByteToInt(bytes, 0, ints, 4, 4); + assertEquals(10, ints[4]); + assertEquals(20, ints[5]); + assertEquals(200, ints[6]); + assertEquals(255, ints[7]); + + // Test offset-based add + int[] a = new int[]{0, 0, 5, 10, 15, 20}; + int[] b = new int[]{1, 2, 3, 4, 5, 6}; + int[] out = new int[6]; + simd.add(a, 2, b, 0, out, 1, 4); + assertEquals(6, out[1]); // a[2]+b[0] = 5+1 + assertEquals(12, out[2]); // a[3]+b[1] = 10+2 + assertEquals(18, out[3]); // a[4]+b[2] = 15+3 + assertEquals(24, out[4]); // a[5]+b[3] = 20+4 + + // Test offset-based cmpLt + int[] vals = new int[]{5, 15, 25, 35}; + int[] thresh = new int[]{10, 10, 10, 10}; + byte[] mask = new byte[4]; + simd.cmpLt(vals, 0, thresh, 0, mask, 0, 4); + assertEquals((byte)-1, mask[0]); + assertEquals((byte)0, mask[1]); + assertEquals((byte)0, mask[2]); + assertEquals((byte)0, mask[3]); + + // Test offset-based cmpEq + int[] vals2 = new int[]{10, 20, 10, 30}; + simd.cmpEq(vals2, 0, thresh, 0, mask, 0, 4); + assertEquals((byte)-1, mask[0]); + assertEquals((byte)0, mask[1]); + assertEquals((byte)-1, mask[2]); + assertEquals((byte)0, mask[3]); + + // Test offset-based select + int[] trueV = new int[]{100, 200, 300, 400}; + int[] falseV = new int[]{-1, -2, -3, -4}; + int[] result = new int[4]; + mask[0] = -1; mask[1] = 0; mask[2] = -1; mask[3] = 0; + simd.select(mask, 0, trueV, 0, falseV, 0, result, 0, 4); + assertEquals(100, result[0]); + assertEquals(-2, result[1]); + assertEquals(300, result[2]); + assertEquals(-4, result[3]); + } } From edf8a865942de0de36751535ce52290ae2243308 Mon Sep 17 00:00:00 2001 From: Shai Almog <67850168+shai-almog@users.noreply.github.com> Date: Tue, 14 Apr 2026 05:01:58 +0300 Subject: [PATCH 07/12] Restored SIMD code and added artifact with full sources for debugging --- scripts/build-ios-app.sh | 86 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/scripts/build-ios-app.sh b/scripts/build-ios-app.sh index c88bc3e795..0f3f020a52 100755 --- a/scripts/build-ios-app.sh +++ b/scripts/build-ios-app.sh @@ -91,6 +91,90 @@ mkdir -p "$ARTIFACTS_DIR" export CN1_BUILD_STATS_FILE="$ARTIFACTS_DIR/iphone-builder-stats.txt" +copy_tree_contents() { + local src="$1" + local dest="$2" + mkdir -p "$dest" + if command -v rsync >/dev/null 2>&1; then + rsync -a "$src"/ "$dest"/ + else + cp -R "$src"/. "$dest"/ + fi +} + +find_bytecode_translator_sources() { + local root="$1" + local best="" + local best_score=0 + local dir score m_count c_count h_count + + [ -d "$root" ] || return 1 + + while IFS= read -r dir; do + [ -d "$dir" ] || continue + + score=0 + [ -f "$dir/cn1_globals.m" ] && score=$((score + 100)) + [ -f "$dir/xmlvm.h" ] && score=$((score + 100)) + + m_count="$(find "$dir" -maxdepth 1 -type f -name '*.m' 2>/dev/null | wc -l | tr -d ' ')" + c_count="$(find "$dir" -maxdepth 1 -type f -name '*.c' 2>/dev/null | wc -l | tr -d ' ')" + h_count="$(find "$dir" -maxdepth 1 -type f -name '*.h' 2>/dev/null | wc -l | tr -d ' ')" + + score=$((score + m_count + c_count + h_count)) + + if [ "$score" -gt "$best_score" ]; then + best="$dir" + best_score="$score" + fi + done < <( + find "$root" -type d \ + ! -path '*/Pods/*' \ + ! -path '*/build/*' \ + ! -path '*/Build/*' \ + ! -path '*/DerivedData/*' \ + ! -path '*/xcuserdata/*' \ + 2>/dev/null + ) + + [ -n "$best" ] || return 1 + printf '%s\n' "$best" +} + +stage_bytecode_translator_sources() { + local project_dir="$1" + local artifacts_dir="$2" + + local bt_dir="" + local out_dir="$artifacts_dir/bytecode-translator-sources" + local zip_file="$artifacts_dir/bytecode-translator-sources.zip" + local listing_file="$artifacts_dir/bytecode-translator-files.txt" + + bt_dir="$(find_bytecode_translator_sources "$project_dir" || true)" + if [ -z "$bt_dir" ]; then + bia_log "ByteCodeTranslator source directory not found under $project_dir" + return 0 + fi + + bia_log "Detected ByteCodeTranslator sources at $bt_dir" + + rm -rf "$out_dir" "$zip_file" + mkdir -p "$out_dir" + + copy_tree_contents "$bt_dir" "$out_dir" + + find "$out_dir" -maxdepth 2 -type f \( -name '*.m' -o -name '*.c' -o -name '*.h' \) \ + | sort > "$listing_file" || true + + ( + cd "$artifacts_dir" + zip -qry "$(basename "$zip_file")" "$(basename "$out_dir")" + ) + + bia_log "Staged ByteCodeTranslator sources in $out_dir" + bia_log "Created archive $zip_file" +} + bia_log "Running HelloCodenameOne Maven build with JAVA_HOME=$JAVA17_HOME" ( export JAVA_HOME="$JAVA17_HOME" @@ -162,6 +246,8 @@ if [ -z "$PROJECT_DIR" ]; then fi bia_log "Found generated iOS project at $PROJECT_DIR" +stage_bytecode_translator_sources "$PROJECT_DIR" "$ARTIFACTS_DIR" + if [ -f "$PROJECT_DIR/Podfile" ]; then if ! command -v pod >/dev/null 2>&1; then bia_log "Generated project requires CocoaPods but the pod command is not installed." >&2 From c7b3632a07d5ad5cc861a8b276c5cae0682c7298 Mon Sep 17 00:00:00 2001 From: Shai Almog <67850168+shai-almog@users.noreply.github.com> Date: Tue, 14 Apr 2026 16:00:09 +0300 Subject: [PATCH 08/12] Removed volatile from class --- CodenameOne/src/com/codename1/util/Base64.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CodenameOne/src/com/codename1/util/Base64.java b/CodenameOne/src/com/codename1/util/Base64.java index 7e4b7125d7..82f1aa0807 100644 --- a/CodenameOne/src/com/codename1/util/Base64.java +++ b/CodenameOne/src/com/codename1/util/Base64.java @@ -437,7 +437,7 @@ public static int encodeNoNewline(byte[] in, byte[] out) { } // ---- SIMD constant tables (lazily initialized) ---- - private static volatile int[] simdEncConst; + private static int[] simdEncConst; // Encode constant offsets (each sub-array is 64 ints) private static final int ENC_K26 = 0; // threshold 26 @@ -454,7 +454,7 @@ public static int encodeNoNewline(byte[] in, byte[] out) { private static final int ENC_M3F = 544; private static final int ENC_CONST_SIZE = 560; - private static volatile byte[] simdMask; + private static byte[] simdMask; private static int[] getSimdEncConst(Simd simd) { int[] c = simdEncConst; From 969288ac3b01a766fca048237df896333a8f6a52 Mon Sep 17 00:00:00 2001 From: Shai Almog <67850168+shai-almog@users.noreply.github.com> Date: Tue, 14 Apr 2026 16:01:48 +0300 Subject: [PATCH 09/12] Added concrete hint to Simd class --- CodenameOne/src/com/codename1/util/Simd.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CodenameOne/src/com/codename1/util/Simd.java b/CodenameOne/src/com/codename1/util/Simd.java index 81aaa07033..b231103851 100644 --- a/CodenameOne/src/com/codename1/util/Simd.java +++ b/CodenameOne/src/com/codename1/util/Simd.java @@ -9,11 +9,13 @@ */ package com.codename1.util; +import com.codename1.annotations.Concrete; import com.codename1.ui.CN; /** * Portable SIMD API with Java fallback implementations. */ +@Concrete(name = "com.codename1.impl.ios.IOSSimd") public class Simd { public static Simd get() { From ae25d419eac517def11fdee014abf7ea6d23edfa Mon Sep 17 00:00:00 2001 From: Shai Almog <67850168+shai-almog@users.noreply.github.com> Date: Tue, 14 Apr 2026 17:01:51 +0300 Subject: [PATCH 10/12] Fixed bad arrows --- CodenameOne/src/com/codename1/util/Base64.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/CodenameOne/src/com/codename1/util/Base64.java b/CodenameOne/src/com/codename1/util/Base64.java index 82f1aa0807..5cc5b5ccce 100644 --- a/CodenameOne/src/com/codename1/util/Base64.java +++ b/CodenameOne/src/com/codename1/util/Base64.java @@ -518,7 +518,7 @@ public static int encodeNoNewlineSimd(byte[] in, int inOffset, int inLength, byt int si = inOffset; int di = outOffset; - // Process 16 triplets (48 input bytes → 64 output bytes) per iteration + // Process 16 triplets (48 input bytes -> 64 output bytes) per iteration int simdEnd = end3 - 48 + 1; while (si < simdEnd) { // 1. Scatter input bytes into 3 int stripes (b0, b1, b2) @@ -551,19 +551,19 @@ public static int encodeNoNewlineSimd(byte[] in, int inOffset, int inLength, byt // Initialize offset accumulator [112..175] with '/' offset (-16) System.arraycopy(ec, ENC_OFF_SLASH, scratch, 112, 64); - // eq62 → use '+' offset + // eq62 -> use '+' offset simd.cmpEq(scratch, 48, ec, ENC_K62, mask, 0, 64); simd.select(mask, 0, ec, ENC_OFF_PLUS, scratch, 112, scratch, 112, 64); - // lt62 → use '0'-'9' offset + // lt62 -> use '0'-'9' offset simd.cmpLt(scratch, 48, ec, ENC_K62, mask, 0, 64); simd.select(mask, 0, ec, ENC_OFF_09, scratch, 112, scratch, 112, 64); - // lt52 → use 'a'-'z' offset + // lt52 -> use 'a'-'z' offset simd.cmpLt(scratch, 48, ec, ENC_K52, mask, 0, 64); simd.select(mask, 0, ec, ENC_OFF_az, scratch, 112, scratch, 112, 64); - // lt26 → use 'A'-'Z' offset + // lt26 -> use 'A'-'Z' offset simd.cmpLt(scratch, 48, ec, ENC_K26, mask, 0, 64); simd.select(mask, 0, ec, ENC_OFF_AZ, scratch, 112, scratch, 112, 64); @@ -666,7 +666,7 @@ public static int decodeNoWhitespaceSimd(byte[] in, int inOffset, int inLength, int si = inOffset; int di = outOffset; - // Process 16 quads (64 input bytes → 48 output bytes) per iteration + // Process 16 quads (64 input bytes -> 48 output bytes) per iteration int simdEnd = fullEnd - 64 + 1; while (si < simdEnd) { // 1. De-interleave and decode: scatter 64 input bytes into 4 stripes, From 4f6ef272599e1ffc7866021a640d20a10ef51a1b Mon Sep 17 00:00:00 2001 From: Shai Almog <67850168+shai-almog@users.noreply.github.com> Date: Tue, 14 Apr 2026 21:13:04 +0300 Subject: [PATCH 11/12] Update ios-packaging.yml with new content --- .github/workflows/ios-packaging.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ios-packaging.yml b/.github/workflows/ios-packaging.yml index afe8051c82..92005bda20 100644 --- a/.github/workflows/ios-packaging.yml +++ b/.github/workflows/ios-packaging.yml @@ -6,6 +6,7 @@ on: - '.github/workflows/ios-packaging.yml' - 'maven/codenameone-maven-plugin/**' - 'vm/ByteCodeTranslator/**' + - 'Ports/iOSPort/**' - 'scripts/build-ios-app.sh' - 'scripts/run-ios-ui-tests.sh' - 'scripts/run-ios-native-tests.sh' @@ -18,6 +19,7 @@ on: - '.github/workflows/ios-packaging.yml' - 'maven/codenameone-maven-plugin/**' - 'vm/ByteCodeTranslator/**' + - 'Ports/iOSPort/**' - 'scripts/build-ios-app.sh' - 'scripts/run-ios-ui-tests.sh' - 'scripts/run-ios-native-tests.sh' @@ -68,7 +70,9 @@ jobs: id: setup_hash run: | set -euo pipefail - echo "hash=$(shasum -a 256 scripts/setup-workspace.sh | awk '{print $1}')" >> "$GITHUB_OUTPUT" + SETUP_HASH=$(shasum -a 256 scripts/setup-workspace.sh | awk '{print $1}') + IOS_PORT_HASH=$(find Ports/iOSPort/src -type f -name '*.java' | sort | xargs shasum -a 256 | shasum -a 256 | awk '{print $1}') + echo "hash=${SETUP_HASH}-${IOS_PORT_HASH}" >> "$GITHUB_OUTPUT" - name: Set TMPDIR run: echo "TMPDIR=${{ runner.temp }}" >> $GITHUB_ENV From 4726e1437989eca09c56ade049cf3b9e8a4dab5f Mon Sep 17 00:00:00 2001 From: Shai Almog <67850168+shai-almog@users.noreply.github.com> Date: Wed, 15 Apr 2026 18:02:54 +0300 Subject: [PATCH 12/12] Fixed Javadocs --- CodenameOne/src/com/codename1/util/Simd.java | 124 ++++++++++++++++++- 1 file changed, 120 insertions(+), 4 deletions(-) diff --git a/CodenameOne/src/com/codename1/util/Simd.java b/CodenameOne/src/com/codename1/util/Simd.java index b231103851..e3d88af3d8 100644 --- a/CodenameOne/src/com/codename1/util/Simd.java +++ b/CodenameOne/src/com/codename1/util/Simd.java @@ -12,20 +12,26 @@ import com.codename1.annotations.Concrete; import com.codename1.ui.CN; -/** - * Portable SIMD API with Java fallback implementations. - */ +/// Portable SIMD API with Java fallback implementations. @Concrete(name = "com.codename1.impl.ios.IOSSimd") public class Simd { + /// Returns the singleton instance of the Simd class. Equivalent to `CN.getSimd();` public static Simd get() { return CN.getSimd(); } + /// Returns true if SIMD instructions are natively supported + /// if this returns false the APIs in this class would still work + /// using fallback loop code public boolean isSupported() { return false; } + /// Allocates an aligned memory block for efficient SIMD + /// operations. All operations MUST be performed on aligned + /// arrays and shouldn't use arrays created with `new`. Operations + /// on unaligned arrays might produce undefined results. public byte[] allocByte(int size) { if (size < 16) { throw new IllegalArgumentException("size must be >= 16"); @@ -33,6 +39,10 @@ public byte[] allocByte(int size) { return new byte[size]; } + /// Allocates an aligned memory block for efficient SIMD + /// operations. All operations MUST be performed on aligned + /// arrays and shouldn't use arrays created with `new`. Operations + /// on unaligned arrays might produce undefined results. public int[] allocInt(int size) { if (size < 16) { throw new IllegalArgumentException("size must be >= 16"); @@ -40,6 +50,10 @@ public int[] allocInt(int size) { return new int[size]; } + /// Allocates an aligned memory block for efficient SIMD + /// operations. All operations MUST be performed on aligned + /// arrays and shouldn't use arrays created with `new`. Operations + /// on unaligned arrays might produce undefined results. public float[] allocFloat(int size) { if (size < 16) { throw new IllegalArgumentException("size must be >= 16"); @@ -47,36 +61,42 @@ public float[] allocFloat(int size) { return new float[size]; } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void add(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = clampByte(srcA[i] + srcB[i]); } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void sub(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = clampByte(srcA[i] - srcB[i]); } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void mul(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = clampByte(srcA[i] * srcB[i]); } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void min(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = srcA[i] < srcB[i] ? srcA[i] : srcB[i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void max(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = srcA[i] > srcB[i] ? srcA[i] : srcB[i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void abs(byte[] src, byte[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { int v = src[i]; @@ -88,6 +108,7 @@ public void abs(byte[] src, byte[] dst, int offset, int length) { } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void clamp(byte[] src, byte[] dst, byte minValue, byte maxValue, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { int v = src[i]; @@ -101,48 +122,56 @@ public void clamp(byte[] src, byte[] dst, byte minValue, byte maxValue, int offs } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void and(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = (byte)(srcA[i] & srcB[i]); } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void or(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = (byte)(srcA[i] | srcB[i]); } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void xor(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = (byte)(srcA[i] ^ srcB[i]); } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void not(byte[] src, byte[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = (byte)(~src[i]); } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void cmpEq(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dstMask[i] = srcA[i] == srcB[i] ? (byte)-1 : (byte)0; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void cmpLt(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dstMask[i] = srcA[i] < srcB[i] ? (byte)-1 : (byte)0; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void cmpGt(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dstMask[i] = srcA[i] > srcB[i] ? (byte)-1 : (byte)0; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void cmpRange(byte[] src, byte minValue, byte maxValue, byte[] dstMask, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { int v = src[i]; @@ -150,12 +179,14 @@ public void cmpRange(byte[] src, byte minValue, byte maxValue, byte[] dstMask, i } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void select(byte[] mask, byte[] trueValues, byte[] falseValues, byte[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = mask[i] != 0 ? trueValues[i] : falseValues[i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void shl(byte[] src, int bits, byte[] dst, int offset, int length) { int shift = bits & 7; for (int i = offset, end = offset + length; i < end; i++) { @@ -163,6 +194,7 @@ public void shl(byte[] src, int bits, byte[] dst, int offset, int length) { } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void shrLogical(byte[] src, int bits, byte[] dst, int offset, int length) { int shift = bits & 7; for (int i = offset, end = offset + length; i < end; i++) { @@ -170,48 +202,56 @@ public void shrLogical(byte[] src, int bits, byte[] dst, int offset, int length) } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void addWrapping(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = (byte)(srcA[i] + srcB[i]); } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void subWrapping(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = (byte)(srcA[i] - srcB[i]); } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void unpackUnsignedByteToInt(byte[] src, int[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = src[i] & 0xff; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void unpackUnsignedByteToInt(byte[] src, int srcOffset, int[] dst, int dstOffset, int length) { for (int i = 0; i < length; i++) { dst[dstOffset + i] = src[srcOffset + i] & 0xff; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void packIntToByteSaturating(int[] src, byte[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = clampByte(src[i]); } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void packIntToByteTruncate(int[] src, byte[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = (byte)src[i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void packIntToByteTruncate(int[] src, int srcOffset, byte[] dst, int dstOffset, int length) { for (int i = 0; i < length; i++) { dst[dstOffset + i] = (byte)src[srcOffset + i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void permuteBytes(byte[] src, byte[] indices, byte[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { int idx = indices[i]; @@ -219,42 +259,49 @@ public void permuteBytes(byte[] src, byte[] indices, byte[] dst, int offset, int } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void add(int[] srcA, int[] srcB, int[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = srcA[i] + srcB[i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void add(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { for (int i = 0; i < length; i++) { dst[dstOffset + i] = srcA[srcAOffset + i] + srcB[srcBOffset + i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void sub(int[] srcA, int[] srcB, int[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = srcA[i] - srcB[i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void mul(int[] srcA, int[] srcB, int[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = srcA[i] * srcB[i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void min(int[] srcA, int[] srcB, int[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = srcA[i] < srcB[i] ? srcA[i] : srcB[i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void max(int[] srcA, int[] srcB, int[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = srcA[i] > srcB[i] ? srcA[i] : srcB[i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void abs(int[] src, int[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { int v = src[i]; @@ -262,6 +309,7 @@ public void abs(int[] src, int[] dst, int offset, int length) { } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void clamp(int[] src, int[] dst, int minValue, int maxValue, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { int v = src[i]; @@ -275,42 +323,49 @@ public void clamp(int[] src, int[] dst, int minValue, int maxValue, int offset, } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void and(int[] srcA, int[] srcB, int[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = srcA[i] & srcB[i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void and(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { for (int i = 0; i < length; i++) { dst[dstOffset + i] = srcA[srcAOffset + i] & srcB[srcBOffset + i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void or(int[] srcA, int[] srcB, int[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = srcA[i] | srcB[i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void or(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { for (int i = 0; i < length; i++) { dst[dstOffset + i] = srcA[srcAOffset + i] | srcB[srcBOffset + i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void xor(int[] srcA, int[] srcB, int[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = srcA[i] ^ srcB[i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void not(int[] src, int[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = ~src[i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void shl(int[] src, int bits, int[] dst, int offset, int length) { int shift = bits & 31; for (int i = offset, end = offset + length; i < end; i++) { @@ -318,6 +373,7 @@ public void shl(int[] src, int bits, int[] dst, int offset, int length) { } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void shl(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, int length) { int shift = bits & 31; for (int i = 0; i < length; i++) { @@ -325,6 +381,7 @@ public void shl(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, in } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void shrLogical(int[] src, int bits, int[] dst, int offset, int length) { int shift = bits & 31; for (int i = offset, end = offset + length; i < end; i++) { @@ -332,6 +389,7 @@ public void shrLogical(int[] src, int bits, int[] dst, int offset, int length) { } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void shrLogical(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, int length) { int shift = bits & 31; for (int i = 0; i < length; i++) { @@ -339,6 +397,7 @@ public void shrLogical(int[] src, int srcOffset, int bits, int[] dst, int dstOff } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void shrArithmetic(int[] src, int bits, int[] dst, int offset, int length) { int shift = bits & 31; for (int i = offset, end = offset + length; i < end; i++) { @@ -346,48 +405,56 @@ public void shrArithmetic(int[] src, int bits, int[] dst, int offset, int length } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void cmpEq(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dstMask[i] = srcA[i] == srcB[i] ? (byte)-1 : (byte)0; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void cmpEq(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, byte[] dstMask, int dstOffset, int length) { for (int i = 0; i < length; i++) { dstMask[dstOffset + i] = srcA[srcAOffset + i] == srcB[srcBOffset + i] ? (byte)-1 : (byte)0; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void cmpLt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dstMask[i] = srcA[i] < srcB[i] ? (byte)-1 : (byte)0; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void cmpLt(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, byte[] dstMask, int dstOffset, int length) { for (int i = 0; i < length; i++) { dstMask[dstOffset + i] = srcA[srcAOffset + i] < srcB[srcBOffset + i] ? (byte)-1 : (byte)0; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void cmpGt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dstMask[i] = srcA[i] > srcB[i] ? (byte)-1 : (byte)0; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void select(byte[] mask, int[] trueValues, int[] falseValues, int[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = mask[i] != 0 ? trueValues[i] : falseValues[i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void select(byte[] mask, int maskOffset, int[] trueValues, int trueOffset, int[] falseValues, int falseOffset, int[] dst, int dstOffset, int length) { for (int i = 0; i < length; i++) { dst[dstOffset + i] = mask[maskOffset + i] != 0 ? trueValues[trueOffset + i] : falseValues[falseOffset + i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public int sum(int[] src, int offset, int length) { int out = 0; for (int i = offset, end = offset + length; i < end; i++) { @@ -396,6 +463,7 @@ public int sum(int[] src, int offset, int length) { return out; } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public int dot(int[] srcA, int[] srcB, int offset, int length) { int out = 0; for (int i = offset, end = offset + length; i < end; i++) { @@ -404,42 +472,49 @@ public int dot(int[] srcA, int[] srcB, int offset, int length) { return out; } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void add(float[] srcA, float[] srcB, float[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = srcA[i] + srcB[i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void sub(float[] srcA, float[] srcB, float[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = srcA[i] - srcB[i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void mul(float[] srcA, float[] srcB, float[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = srcA[i] * srcB[i]; } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void min(float[] srcA, float[] srcB, float[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = Math.min(srcA[i], srcB[i]); } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void max(float[] srcA, float[] srcB, float[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = Math.max(srcA[i], srcB[i]); } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void abs(float[] src, float[] dst, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { dst[i] = Math.abs(src[i]); } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public void clamp(float[] src, float[] dst, float minValue, float maxValue, int offset, int length) { for (int i = offset, end = offset + length; i < end; i++) { float v = src[i]; @@ -453,6 +528,7 @@ public void clamp(float[] src, float[] dst, float minValue, float maxValue, int } } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public float sum(float[] src, int offset, int length) { float out = 0f; for (int i = offset, end = offset + length; i < end; i++) { @@ -461,6 +537,7 @@ public float sum(float[] src, int offset, int length) { return out; } + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** public float dot(float[] srcA, float[] srcB, int offset, int length) { float out = 0f; for (int i = offset, end = offset + length; i < end; i++) { @@ -470,6 +547,8 @@ public float dot(float[] srcA, float[] srcB, int offset, int length) { } + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. protected final void validateBinaryByte(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { validateNotNull(srcA, "srcA"); validateNotNull(srcB, "srcB"); @@ -479,6 +558,8 @@ protected final void validateBinaryByte(byte[] srcA, byte[] srcB, byte[] dst, in validateRange(dst.length, offset, length, "dst"); } + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. protected final void validateMaskBinaryByte(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { validateNotNull(srcA, "srcA"); validateNotNull(srcB, "srcB"); @@ -488,6 +569,8 @@ protected final void validateMaskBinaryByte(byte[] srcA, byte[] srcB, byte[] dst validateRange(dstMask.length, offset, length, "dstMask"); } + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. protected final void validateRangeMaskByte(byte[] src, byte[] dstMask, int offset, int length) { validateNotNull(src, "src"); validateNotNull(dstMask, "dstMask"); @@ -495,6 +578,8 @@ protected final void validateRangeMaskByte(byte[] src, byte[] dstMask, int offse validateRange(dstMask.length, offset, length, "dstMask"); } + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. protected final void validateSelectByte(byte[] mask, byte[] trueValues, byte[] falseValues, byte[] dst, int offset, int length) { validateNotNull(mask, "mask"); validateNotNull(trueValues, "trueValues"); @@ -506,6 +591,8 @@ protected final void validateSelectByte(byte[] mask, byte[] trueValues, byte[] f validateRange(dst.length, offset, length, "dst"); } + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. protected final void validateByteToInt(byte[] src, int[] dst, int offset, int length) { validateNotNull(src, "src"); validateNotNull(dst, "dst"); @@ -513,6 +600,8 @@ protected final void validateByteToInt(byte[] src, int[] dst, int offset, int le validateRange(dst.length, offset, length, "dst"); } + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. protected final void validateIntToByte(int[] src, byte[] dst, int offset, int length) { validateNotNull(src, "src"); validateNotNull(dst, "dst"); @@ -520,6 +609,8 @@ protected final void validateIntToByte(int[] src, byte[] dst, int offset, int le validateRange(dst.length, offset, length, "dst"); } + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. protected final void validatePermuteByte(byte[] src, byte[] indices, byte[] dst, int offset, int length) { validateNotNull(src, "src"); validateNotNull(indices, "indices"); @@ -528,6 +619,8 @@ protected final void validatePermuteByte(byte[] src, byte[] indices, byte[] dst, validateRange(dst.length, offset, length, "dst"); } + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. protected final void validateUnaryByte(byte[] src, byte[] dst, int offset, int length) { validateNotNull(src, "src"); validateNotNull(dst, "dst"); @@ -535,6 +628,8 @@ protected final void validateUnaryByte(byte[] src, byte[] dst, int offset, int l validateRange(dst.length, offset, length, "dst"); } + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. protected final void validateBinaryInt(int[] srcA, int[] srcB, int[] dst, int offset, int length) { validateNotNull(srcA, "srcA"); validateNotNull(srcB, "srcB"); @@ -544,6 +639,8 @@ protected final void validateBinaryInt(int[] srcA, int[] srcB, int[] dst, int of validateRange(dst.length, offset, length, "dst"); } + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. protected final void validateUnaryInt(int[] src, int[] dst, int offset, int length) { validateNotNull(src, "src"); validateNotNull(dst, "dst"); @@ -551,6 +648,8 @@ protected final void validateUnaryInt(int[] src, int[] dst, int offset, int leng validateRange(dst.length, offset, length, "dst"); } + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. protected final void validateMaskBinaryInt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { validateNotNull(srcA, "srcA"); validateNotNull(srcB, "srcB"); @@ -560,6 +659,8 @@ protected final void validateMaskBinaryInt(int[] srcA, int[] srcB, byte[] dstMas validateRange(dstMask.length, offset, length, "dstMask"); } + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. protected final void validateSelectInt(byte[] mask, int[] trueValues, int[] falseValues, int[] dst, int offset, int length) { validateNotNull(mask, "mask"); validateNotNull(trueValues, "trueValues"); @@ -571,11 +672,15 @@ protected final void validateSelectInt(byte[] mask, int[] trueValues, int[] fals validateRange(dst.length, offset, length, "dst"); } + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. protected final void validateReductionInt(int[] src, int offset, int length) { validateNotNull(src, "src"); validateRange(src.length, offset, length, "src"); } + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. protected final void validateDotInt(int[] srcA, int[] srcB, int offset, int length) { validateNotNull(srcA, "srcA"); validateNotNull(srcB, "srcB"); @@ -583,6 +688,8 @@ protected final void validateDotInt(int[] srcA, int[] srcB, int offset, int leng validateRange(srcB.length, offset, length, "srcB"); } + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. protected final void validateBinaryFloat(float[] srcA, float[] srcB, float[] dst, int offset, int length) { validateNotNull(srcA, "srcA"); validateNotNull(srcB, "srcB"); @@ -592,6 +699,8 @@ protected final void validateBinaryFloat(float[] srcA, float[] srcB, float[] dst validateRange(dst.length, offset, length, "dst"); } + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. protected final void validateUnaryFloat(float[] src, float[] dst, int offset, int length) { validateNotNull(src, "src"); validateNotNull(dst, "dst"); @@ -599,11 +708,15 @@ protected final void validateUnaryFloat(float[] src, float[] dst, int offset, in validateRange(dst.length, offset, length, "dst"); } + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. protected final void validateReductionFloat(float[] src, int offset, int length) { validateNotNull(src, "src"); validateRange(src.length, offset, length, "src"); } + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. protected final void validateDotFloat(float[] srcA, float[] srcB, int offset, int length) { validateNotNull(srcA, "srcA"); validateNotNull(srcB, "srcB"); @@ -611,13 +724,16 @@ protected final void validateDotFloat(float[] srcA, float[] srcB, int offset, in validateRange(srcB.length, offset, length, "srcB"); } - + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. protected final void validateNotNull(Object o, String name) { if (o == null) { throw new NullPointerException(name + " is null"); } } + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. protected final void validateRange(int arrayLength, int offset, int length, String name) { if (offset < 0 || length < 0 || offset > arrayLength || arrayLength - offset < length) { throw new ArrayIndexOutOfBoundsException(name + " invalid range offset=" + offset + " length=" + length + " size=" + arrayLength);