diff --git a/.github/workflows/ios-packaging.yml b/.github/workflows/ios-packaging.yml index afe8051c82..92005bda20 100644 --- a/.github/workflows/ios-packaging.yml +++ b/.github/workflows/ios-packaging.yml @@ -6,6 +6,7 @@ on: - '.github/workflows/ios-packaging.yml' - 'maven/codenameone-maven-plugin/**' - 'vm/ByteCodeTranslator/**' + - 'Ports/iOSPort/**' - 'scripts/build-ios-app.sh' - 'scripts/run-ios-ui-tests.sh' - 'scripts/run-ios-native-tests.sh' @@ -18,6 +19,7 @@ on: - '.github/workflows/ios-packaging.yml' - 'maven/codenameone-maven-plugin/**' - 'vm/ByteCodeTranslator/**' + - 'Ports/iOSPort/**' - 'scripts/build-ios-app.sh' - 'scripts/run-ios-ui-tests.sh' - 'scripts/run-ios-native-tests.sh' @@ -68,7 +70,9 @@ jobs: id: setup_hash run: | set -euo pipefail - echo "hash=$(shasum -a 256 scripts/setup-workspace.sh | awk '{print $1}')" >> "$GITHUB_OUTPUT" + SETUP_HASH=$(shasum -a 256 scripts/setup-workspace.sh | awk '{print $1}') + IOS_PORT_HASH=$(find Ports/iOSPort/src -type f -name '*.java' | sort | xargs shasum -a 256 | shasum -a 256 | awk '{print $1}') + echo "hash=${SETUP_HASH}-${IOS_PORT_HASH}" >> "$GITHUB_OUTPUT" - name: Set TMPDIR run: echo "TMPDIR=${{ runner.temp }}" >> $GITHUB_ENV diff --git a/CodenameOne/src/com/codename1/impl/CodenameOneImplementation.java b/CodenameOne/src/com/codename1/impl/CodenameOneImplementation.java index 4777cde002..1190843222 100644 --- a/CodenameOne/src/com/codename1/impl/CodenameOneImplementation.java +++ b/CodenameOne/src/com/codename1/impl/CodenameOneImplementation.java @@ -85,6 +85,7 @@ import com.codename1.ui.util.ImageIO; import com.codename1.util.AsyncResource; import com.codename1.util.FailureCallback; +import com.codename1.util.Simd; import com.codename1.util.StringUtil; import com.codename1.util.SuccessCallback; @@ -8397,6 +8398,12 @@ public ImageIO getImageIO() { return null; } + /// Creates the SIMD implementation for this platform. + /// Ports may override this to provide accelerated SIMD behavior. 
+ public Simd createSimd() { + return new Simd(); + } + /// Workaround for XMLVM bug public boolean instanceofObjArray(Object o) { return o instanceof Object[]; diff --git a/CodenameOne/src/com/codename1/ui/CN.java b/CodenameOne/src/com/codename1/ui/CN.java index 9bcce255ea..62470ab8da 100644 --- a/CodenameOne/src/com/codename1/ui/CN.java +++ b/CodenameOne/src/com/codename1/ui/CN.java @@ -36,6 +36,7 @@ import com.codename1.ui.events.WindowEvent; import com.codename1.ui.geom.Dimension; import com.codename1.ui.geom.Rectangle; +import com.codename1.util.Simd; import com.codename1.util.RunnableWithResultSync; import java.io.IOException; @@ -1032,6 +1033,11 @@ public static String getPlatformName() { return Display.impl.getPlatformName(); } + /// Returns the SIMD API for the current platform. + public static Simd getSimd() { + return Display.getInstance().getSimd(); + } + /// Opens the device Dialer application with the given phone number /// diff --git a/CodenameOne/src/com/codename1/ui/Display.java b/CodenameOne/src/com/codename1/ui/Display.java index c56aaf780d..34bf4c4034 100644 --- a/CodenameOne/src/com/codename1/ui/Display.java +++ b/CodenameOne/src/com/codename1/ui/Display.java @@ -60,6 +60,7 @@ import com.codename1.ui.util.EventDispatcher; import com.codename1.ui.util.ImageIO; import com.codename1.util.AsyncResource; +import com.codename1.util.Simd; import com.codename1.util.RunnableWithResultSync; import com.codename1.util.SuccessCallback; @@ -216,6 +217,7 @@ public final class Display extends CN1Constants { long time; private int transitionDelay = -1; private String selectedVirtualKeyboard = null; + private Simd simd; private CrashReport crashReporter; private EventDispatcher errorHandler; private boolean inNativeUI; @@ -343,6 +345,7 @@ public static void init(Object m) { commandBehaviour = impl.getCommandBehavior(); } impl = (CodenameOneImplementation) ImplementationFactory.getInstance().createImplementation(); + INSTANCE.simd = null; 
impl.setDisplayLock(lock); impl.initImpl(m); @@ -493,6 +496,18 @@ CodenameOneImplementation getImplementation() { return impl; } + /// Returns the SIMD API instance bound to the current implementation. + public Simd getSimd() { + if (simd == null) { + Simd created = impl.createSimd(); + if (created == null) { + created = new Simd(); + } + simd = created; + } + return simd; + } + /// Indicates the maximum frames the API will try to draw every second /// by default this is set to 10. The advantage of limiting /// framerate is to allow the CPU to perform other tasks besides drawing. diff --git a/CodenameOne/src/com/codename1/ui/Image.java b/CodenameOne/src/com/codename1/ui/Image.java index e399d609ae..0acd29ac4b 100644 --- a/CodenameOne/src/com/codename1/ui/Image.java +++ b/CodenameOne/src/com/codename1/ui/Image.java @@ -33,6 +33,7 @@ import com.codename1.ui.geom.Dimension; import com.codename1.ui.util.EventDispatcher; import com.codename1.ui.util.ImageIO; +import com.codename1.util.Simd; import java.io.IOException; import java.io.InputStream; @@ -44,6 +45,8 @@ /// /// @author Chen Fishbein public class Image implements ActionSource { + private static final int SIMD_BLOCK_SIZE = 64; + private static boolean simdOptimizationsEnabled = Simd.get().isSupported(); int transform; private EventDispatcher listeners; private Object rgbCache; @@ -57,6 +60,23 @@ public class Image implements ActionSource { private byte[] svgData; private String imageName; + /// Indicates whether Image SIMD optimizations are enabled. When unset this defaults + /// to the current platform SIMD support. + public static boolean isSimdOptimizationsEnabled() { + return simdOptimizationsEnabled; + } + + /// Enables or disables Image SIMD optimizations explicitly. 
+ public static void setSimdOptimizationsEnabled(boolean enabled) { + simdOptimizationsEnabled = enabled; + } + + /// Clears the explicit Image SIMD override and restores the default behavior of + /// using SIMD whenever it is supported by the current platform. + public static void resetSimdOptimizationsEnabled() { + simdOptimizationsEnabled = Simd.get().isSupported(); + } + /// Subclasses may use this and point to an underlying native image which might be /// null for a case of an image that doesn't use native drawing /// @@ -1053,9 +1073,22 @@ public boolean isSVG() { public Object createMask() { int[] rgb = getRGBCached(); int rlen = rgb.length; - byte[] mask = new byte[rlen]; - for (int iter = 0; iter < rlen; iter++) { - mask[iter] = (byte) (rgb[iter] & 0xff); + byte[] mask; + if (isSimdOptimizationsEnabled() && rlen >= 16) { + Simd simd = Simd.get(); + mask = simd.allocByte(rlen); + int blockSize = Math.min(rlen, SIMD_BLOCK_SIZE); + int[] scratch = simd.allocInt(blockSize); + for (int offset = 0; offset < rlen; offset += blockSize) { + int length = Math.min(blockSize, rlen - offset); + System.arraycopy(rgb, offset, scratch, 0, length); + simd.packIntToByteTruncate(scratch, 0, mask, offset, length); + } + } else { + mask = new byte[rlen]; + for (int iter = 0; iter < rlen; iter++) { + mask[iter] = (byte) (rgb[iter] & 0xff); + } } return new IndexedImage(getWidth(), getHeight(), null, mask); } @@ -1156,11 +1189,34 @@ public Image applyMask(Object mask) { if (mWidth != getWidth() || mHeight != getHeight()) { throw new IllegalArgumentException("Mask and image sizes don't match"); } - int mdlen = maskData.length; - for (int iter = 0; iter < mdlen; iter++) { - int maskAlpha = maskData[iter] & 0xff; - maskAlpha = (maskAlpha << 24) & 0xff000000; - rgb[iter] = (rgb[iter] & 0xffffff) | maskAlpha; + if (isSimdOptimizationsEnabled() && maskData.length >= 16) { + Simd simd = Simd.get(); + int blockSize = Math.min(maskData.length, SIMD_BLOCK_SIZE); + int srcOffset = 0; + int 
alphaOffset = blockSize; + int maskOffset = blockSize * 2; + int[] scratch = simd.allocInt(blockSize * 3); + byte[] scratchBytes = simd.allocByte(blockSize); + for (int iter = 0; iter < blockSize; iter++) { + scratch[maskOffset + iter] = 0xffffff; + } + for (int offset = 0; offset < maskData.length; offset += blockSize) { + int length = Math.min(blockSize, maskData.length - offset); + System.arraycopy(rgb, offset, scratch, srcOffset, length); + System.arraycopy(maskData, offset, scratchBytes, 0, length); + simd.and(scratch, srcOffset, scratch, maskOffset, scratch, srcOffset, length); + simd.unpackUnsignedByteToInt(scratchBytes, 0, scratch, alphaOffset, length); + simd.shl(scratch, alphaOffset, 24, scratch, alphaOffset, length); + simd.or(scratch, srcOffset, scratch, alphaOffset, scratch, srcOffset, length); + System.arraycopy(scratch, srcOffset, rgb, offset, length); + } + } else { + int mdlen = maskData.length; + for (int iter = 0; iter < mdlen; iter++) { + int maskAlpha = maskData[iter] & 0xff; + maskAlpha = (maskAlpha << 24) & 0xff000000; + rgb[iter] = (rgb[iter] & 0xffffff) | maskAlpha; + } } return createImage(rgb, mWidth, mHeight); } @@ -1306,11 +1362,38 @@ public Image modifyAlpha(byte alpha) { int h = getHeight(); int size = w * h; int[] arr = getRGB(); - int alphaInt = (((int) alpha) << 24) & 0xff000000; - for (int iter = 0; iter < size; iter++) { - int currentAlpha = (arr[iter] >> 24) & 0xff; - if (currentAlpha != 0) { - arr[iter] = (arr[iter] & 0xffffff) | alphaInt; + if (isSimdOptimizationsEnabled() && size >= 16) { + Simd simd = Simd.get(); + int blockSize = Math.min(size, SIMD_BLOCK_SIZE); + int srcOffset = 0; + int workOffset = blockSize; + int maskOffset = blockSize * 2; + int alphaOffset = blockSize * 3; + int zeroOffset = blockSize * 4; + int[] scratch = simd.allocInt(blockSize * 5); + byte[] scratchBytes = simd.allocByte(blockSize); + int alphaInt = (((int) alpha) << 24) & 0xff000000; + for (int iter = 0; iter < blockSize; iter++) { + 
scratch[maskOffset + iter] = 0xffffff; + scratch[alphaOffset + iter] = alphaInt; + } + for (int offset = 0; offset < size; offset += blockSize) { + int length = Math.min(blockSize, size - offset); + System.arraycopy(arr, offset, scratch, srcOffset, length); + simd.shrLogical(scratch, srcOffset, 24, scratch, workOffset, length); + simd.cmpEq(scratch, workOffset, scratch, zeroOffset, scratchBytes, 0, length); + simd.and(scratch, srcOffset, scratch, maskOffset, scratch, workOffset, length); + simd.or(scratch, workOffset, scratch, alphaOffset, scratch, workOffset, length); + simd.select(scratchBytes, 0, scratch, srcOffset, scratch, workOffset, scratch, srcOffset, length); + System.arraycopy(scratch, srcOffset, arr, offset, length); + } + } else { + int alphaInt = (((int) alpha) << 24) & 0xff000000; + for (int iter = 0; iter < size; iter++) { + int currentAlpha = (arr[iter] >> 24) & 0xff; + if (currentAlpha != 0) { + arr[iter] = (arr[iter] & 0xffffff) | alphaInt; + } } } Image i = new Image(arr, w, h); @@ -1378,12 +1461,44 @@ public Image modifyAlpha(byte alpha, int removeColor) { int size = w * h; int[] arr = new int[size]; getRGB(arr, 0, 0, 0, w, h); - int alphaInt = (((int) alpha) << 24) & 0xff000000; - for (int iter = 0; iter < size; iter++) { - if ((arr[iter] & 0xff000000) != 0) { - arr[iter] = (arr[iter] & 0xffffff) | alphaInt; - if (removeColor == (0xffffff & arr[iter])) { - arr[iter] = 0; + if (isSimdOptimizationsEnabled() && size >= 16) { + Simd simd = Simd.get(); + int blockSize = Math.min(size, SIMD_BLOCK_SIZE); + int srcOffset = 0; + int workOffset = blockSize; + int maskOffset = blockSize * 2; + int alphaOffset = blockSize * 3; + int zeroOffset = blockSize * 4; + int removeColorOffset = blockSize * 5; + int[] scratch = simd.allocInt(blockSize * 6); + byte[] scratchBytes = simd.allocByte(blockSize); + int alphaInt = (((int) alpha) << 24) & 0xff000000; + for (int iter = 0; iter < blockSize; iter++) { + scratch[maskOffset + iter] = 0xffffff; + 
scratch[alphaOffset + iter] = alphaInt; + scratch[removeColorOffset + iter] = removeColor & 0xffffff; + } + for (int offset = 0; offset < size; offset += blockSize) { + int length = Math.min(blockSize, size - offset); + System.arraycopy(arr, offset, scratch, srcOffset, length); + simd.shrLogical(scratch, srcOffset, 24, scratch, workOffset, length); + simd.cmpEq(scratch, workOffset, scratch, zeroOffset, scratchBytes, 0, length); + simd.and(scratch, srcOffset, scratch, maskOffset, scratch, workOffset, length); + simd.or(scratch, workOffset, scratch, alphaOffset, scratch, workOffset, length); + simd.select(scratchBytes, 0, scratch, srcOffset, scratch, workOffset, scratch, srcOffset, length); + simd.and(scratch, srcOffset, scratch, maskOffset, scratch, workOffset, length); + simd.cmpEq(scratch, workOffset, scratch, removeColorOffset, scratchBytes, 0, length); + simd.select(scratchBytes, 0, scratch, zeroOffset, scratch, srcOffset, scratch, srcOffset, length); + System.arraycopy(scratch, srcOffset, arr, offset, length); + } + } else { + int alphaInt = (((int) alpha) << 24) & 0xff000000; + for (int iter = 0; iter < size; iter++) { + if ((arr[iter] & 0xff000000) != 0) { + arr[iter] = (arr[iter] & 0xffffff) | alphaInt; + if (removeColor == (0xffffff & arr[iter])) { + arr[iter] = 0; + } } } } diff --git a/CodenameOne/src/com/codename1/ui/RGBImage.java b/CodenameOne/src/com/codename1/ui/RGBImage.java index fb54e7d5a9..df1ee0207e 100644 --- a/CodenameOne/src/com/codename1/ui/RGBImage.java +++ b/CodenameOne/src/com/codename1/ui/RGBImage.java @@ -23,6 +23,8 @@ */ package com.codename1.ui; +import com.codename1.util.Simd; + /// An image that stores its data as an integer RGB array internally, /// this image cannot be manipulated via Graphics primitives however its /// array is accessible and modifiable programmatically. 
This is very useful @@ -140,11 +142,38 @@ public Image rotate(int degrees) { public Image modifyAlpha(byte alpha) { int[] arr = new int[rgb.length]; System.arraycopy(rgb, 0, arr, 0, rgb.length); - int alphaInt = (((int) alpha) << 24) & 0xff000000; - int rlen = rgb.length; - for (int iter = 0; iter < rlen; iter++) { - if ((arr[iter] & 0xff000000) != 0) { - arr[iter] = (arr[iter] & 0xffffff) | alphaInt; + if (Image.isSimdOptimizationsEnabled() && arr.length >= 16) { + Simd simd = Simd.get(); + int blockSize = Math.min(arr.length, 64); + int srcOffset = 0; + int workOffset = blockSize; + int maskOffset = blockSize * 2; + int alphaOffset = blockSize * 3; + int zeroOffset = blockSize * 4; + int[] scratch = simd.allocInt(blockSize * 5); + byte[] scratchBytes = simd.allocByte(blockSize); + int alphaInt = (((int) alpha) << 24) & 0xff000000; + for (int iter = 0; iter < blockSize; iter++) { + scratch[maskOffset + iter] = 0xffffff; + scratch[alphaOffset + iter] = alphaInt; + } + for (int offset = 0; offset < arr.length; offset += blockSize) { + int length = Math.min(blockSize, arr.length - offset); + System.arraycopy(arr, offset, scratch, srcOffset, length); + simd.shrLogical(scratch, srcOffset, 24, scratch, workOffset, length); + simd.cmpEq(scratch, workOffset, scratch, zeroOffset, scratchBytes, 0, length); + simd.and(scratch, srcOffset, scratch, maskOffset, scratch, workOffset, length); + simd.or(scratch, workOffset, scratch, alphaOffset, scratch, workOffset, length); + simd.select(scratchBytes, 0, scratch, srcOffset, scratch, workOffset, scratch, srcOffset, length); + System.arraycopy(scratch, srcOffset, arr, offset, length); + } + } else { + int alphaInt = (((int) alpha) << 24) & 0xff000000; + int rlen = rgb.length; + for (int iter = 0; iter < rlen; iter++) { + if ((arr[iter] & 0xff000000) != 0) { + arr[iter] = (arr[iter] & 0xffffff) | alphaInt; + } } } return new RGBImage(arr, width, height); diff --git a/CodenameOne/src/com/codename1/util/Base64.java 
b/CodenameOne/src/com/codename1/util/Base64.java index e4e6b6b740..817093144f 100644 --- a/CodenameOne/src/com/codename1/util/Base64.java +++ b/CodenameOne/src/com/codename1/util/Base64.java @@ -39,6 +39,7 @@ public abstract class Base64 { private static final byte[] decodeMap = new byte[256]; private static final int[] decodeMapInt = new int[256]; + private static final int SIMD_BYTE_LANES = 16; static { for (int i = 0; i < decodeMap.length; i++) { @@ -79,7 +80,7 @@ public static byte[] decode(byte[] in, int len) { return new byte[0]; } int maxOutputLength = (len / 4) * 3 + 3; - byte[] out = new byte[maxOutputLength]; + byte[] out = allocByteMaybeSimd(maxOutputLength); int outputLength = decode(in, len, out); if (outputLength < 0) { return null; @@ -87,7 +88,7 @@ public static byte[] decode(byte[] in, int len) { if (outputLength == out.length) { return out; } - byte[] trimmed = new byte[outputLength]; + byte[] trimmed = allocByteMaybeSimd(outputLength); System.arraycopy(out, 0, trimmed, 0, outputLength); return trimmed; } @@ -229,8 +230,9 @@ private static int decodeNoWhitespace(byte[] in, int len, byte[] out) { int outIndex = 0; int fullLen = len - (pad > 0 ? 
4 : 0); int[] decodeMapLocal = decodeMapInt; + int simdFullLen = 0; - for (int i = 0; i < fullLen; i += 4) { + for (int i = simdFullLen; i < fullLen; i += 4) { int c0 = in[i] & 0xff; int c1 = in[i + 1] & 0xff; int c2 = in[i + 2] & 0xff; @@ -342,7 +344,7 @@ public static String encodeNoNewline(byte[] in) { return ""; } int outputLength = ((inputLength + 2) / 3) * 4; - byte[] out = new byte[outputLength]; + byte[] out = allocByteMaybeSimd(outputLength); encodeNoNewline(in, out); return com.codename1.util.StringUtil.newString(out, 0, outputLength); } @@ -442,4 +444,321 @@ public static int encodeNoNewline(byte[] in, byte[] out) { } return outIndex; } + + private static final int SIMD_ENC_LANE0 = 0; + private static final int SIMD_ENC_LANE1 = SIMD_ENC_LANE0 + SIMD_BYTE_LANES; + private static final int SIMD_ENC_LANE2 = SIMD_ENC_LANE1 + SIMD_BYTE_LANES; + private static final int SIMD_ENC_OUT0 = SIMD_ENC_LANE2 + SIMD_BYTE_LANES; + private static final int SIMD_ENC_TMP = SIMD_ENC_OUT0 + SIMD_BYTE_LANES; + private static final int SIMD_ENC_OUT1 = SIMD_ENC_TMP + SIMD_BYTE_LANES; + private static final int SIMD_ENC_OUT2 = SIMD_ENC_OUT1 + SIMD_BYTE_LANES; + private static final int SIMD_ENCODE_SCRATCH_BYTES = SIMD_ENC_OUT2 + SIMD_BYTE_LANES; + + private static final int SIMD_DEC_OUT0 = 0; + private static final int SIMD_DEC_OUT1 = SIMD_DEC_OUT0 + SIMD_BYTE_LANES; + private static final int SIMD_DEC_OUT2 = SIMD_DEC_OUT1 + SIMD_BYTE_LANES; + private static final int SIMD_DEC_OUT3 = SIMD_DEC_OUT2 + SIMD_BYTE_LANES; + private static final int SIMD_DEC_TMP = SIMD_DEC_OUT3 + SIMD_BYTE_LANES; + private static final int SIMD_DECODE_SCRATCH_BYTES = SIMD_DEC_TMP + SIMD_BYTE_LANES; + + private static byte[] simdEncodeMap; + private static byte[] simdDecodeMap; + private static byte[] simdConst03; + private static byte[] simdConst0F; + private static byte[] simdConst3F; + + private static byte[] getSimdEncodeMap(Simd simd) { + byte[] out = simdEncodeMap; + if (out != null) { + return 
out; + } + out = simd.allocByte(64); + System.arraycopy(map, 0, out, 0, 64); + simdEncodeMap = out; + return out; + } + + private static byte[] getSimdDecodeMap(Simd simd) { + byte[] out = simdDecodeMap; + if (out != null) { + return out; + } + out = simd.allocByte(256); + fillRange(out, (byte) -1); + for (int i = 0; i < 64; i++) { + out[map[i] & 0xff] = (byte) i; + } + simdDecodeMap = out; + return out; + } + + private static byte[] getSimdConst03(Simd simd) { + byte[] out = simdConst03; + if (out != null) { + return out; + } + out = simd.allocByte(SIMD_BYTE_LANES); + fillRange(out, (byte) 0x03); + simdConst03 = out; + return out; + } + + private static byte[] getSimdConst0F(Simd simd) { + byte[] out = simdConst0F; + if (out != null) { + return out; + } + out = simd.allocByte(SIMD_BYTE_LANES); + fillRange(out, (byte) 0x0F); + simdConst0F = out; + return out; + } + + private static byte[] getSimdConst3F(Simd simd) { + byte[] out = simdConst3F; + if (out != null) { + return out; + } + out = simd.allocByte(SIMD_BYTE_LANES); + fillRange(out, (byte) 0x3F); + simdConst3F = out; + return out; + } + + private static void fillRange(byte[] arr, byte value) { + for (int i = 0; i < arr.length; i++) { + arr[i] = value; + } + } + + /// SIMD-optimized Base64 encoding with explicit offsets. + /// Uses generic SIMD byte primitives over a single caller-invisible scratch slab. 
+ @DisableDebugInfo + @DisableNullChecksAndArrayBoundsChecks + public static int encodeNoNewlineSimd(byte[] in, int inOffset, int inLength, byte[] out, int outOffset) { + int outputLength = ((inLength + 2) / 3) * 4; + if (inLength == 0) { + return 0; + } + Simd simd = Simd.get(); + byte[] encodeMap = getSimdEncodeMap(simd); + + byte[] const03 = getSimdConst03(simd); + byte[] const0F = getSimdConst0F(simd); + byte[] const3F = getSimdConst3F(simd); + int end3 = inOffset + inLength - (inLength % 3); + int si = inOffset; + int di = outOffset; + int simdEnd = end3 - SIMD_BYTE_LANES * 3 + 1; + byte[] scratch = simdEnd > si ? simd.allocByte(SIMD_ENCODE_SCRATCH_BYTES) : null; + + while (si < simdEnd) { + simd.unpackBytesInterleaved3(in, si, scratch, SIMD_ENC_LANE0, SIMD_ENC_LANE1, SIMD_ENC_LANE2, SIMD_BYTE_LANES); + simd.shrLogical(scratch, SIMD_ENC_LANE0, 2, scratch, SIMD_ENC_OUT0, SIMD_BYTE_LANES); + + simd.and(scratch, SIMD_ENC_LANE0, const03, 0, scratch, SIMD_ENC_TMP, SIMD_BYTE_LANES); + simd.shl(scratch, SIMD_ENC_TMP, 4, scratch, SIMD_ENC_TMP, SIMD_BYTE_LANES); + simd.shrLogical(scratch, SIMD_ENC_LANE1, 4, scratch, SIMD_ENC_OUT1, SIMD_BYTE_LANES); + simd.or(scratch, SIMD_ENC_TMP, scratch, SIMD_ENC_OUT1, scratch, SIMD_ENC_OUT1, SIMD_BYTE_LANES); + + simd.and(scratch, SIMD_ENC_LANE1, const0F, 0, scratch, SIMD_ENC_TMP, SIMD_BYTE_LANES); + simd.shl(scratch, SIMD_ENC_TMP, 2, scratch, SIMD_ENC_TMP, SIMD_BYTE_LANES); + simd.shrLogical(scratch, SIMD_ENC_LANE2, 6, scratch, SIMD_ENC_OUT2, SIMD_BYTE_LANES); + simd.or(scratch, SIMD_ENC_TMP, scratch, SIMD_ENC_OUT2, scratch, SIMD_ENC_OUT2, SIMD_BYTE_LANES); + + simd.and(scratch, SIMD_ENC_LANE2, const3F, 0, scratch, SIMD_ENC_LANE2, SIMD_BYTE_LANES); + + simd.lookupBytes(encodeMap, scratch, SIMD_ENC_OUT0, scratch, SIMD_ENC_LANE0, SIMD_BYTE_LANES); + simd.lookupBytes(encodeMap, scratch, SIMD_ENC_OUT1, scratch, SIMD_ENC_LANE1, SIMD_BYTE_LANES); + simd.lookupBytes(encodeMap, scratch, SIMD_ENC_OUT2, scratch, SIMD_ENC_OUT0, 
SIMD_BYTE_LANES); + simd.lookupBytes(encodeMap, scratch, SIMD_ENC_LANE2, scratch, SIMD_ENC_LANE2, SIMD_BYTE_LANES); + + simd.packBytesInterleaved4(scratch, SIMD_ENC_LANE0, SIMD_ENC_LANE1, SIMD_ENC_OUT0, SIMD_ENC_LANE2, out, di, SIMD_BYTE_LANES); + si += SIMD_BYTE_LANES * 3; + di += SIMD_BYTE_LANES * 4; + } + + byte[] mapLocal = map; + while (si < end3) { + int b0 = in[si] & 0xff; + int b1 = in[si + 1] & 0xff; + int b2 = in[si + 2] & 0xff; + out[di] = mapLocal[b0 >> 2]; + out[di + 1] = mapLocal[((b0 & 0x03) << 4) | (b1 >> 4)]; + out[di + 2] = mapLocal[((b1 & 0x0f) << 2) | (b2 >> 6)]; + out[di + 3] = mapLocal[b2 & 0x3f]; + si += 3; + di += 4; + } + + switch (inOffset + inLength - end3) { + case 1: { + int b0 = in[si] & 0xff; + out[di] = mapLocal[b0 >> 2]; + out[di + 1] = mapLocal[(b0 & 0x03) << 4]; + out[di + 2] = '='; + out[di + 3] = '='; + break; + } + case 2: { + int b0 = in[si] & 0xff; + int b1 = in[si + 1] & 0xff; + out[di] = mapLocal[b0 >> 2]; + out[di + 1] = mapLocal[((b0 & 0x03) << 4) | (b1 >> 4)]; + out[di + 2] = mapLocal[(b1 & 0x0f) << 2]; + out[di + 3] = '='; + break; + } + default: + break; + } + return outputLength; + } + + /// Compatibility overload that ignores the legacy scratch parameter. + public static int encodeNoNewlineSimd(byte[] in, int inOffset, int inLength, byte[] out, int outOffset, int[] scratch) { + return encodeNoNewlineSimd(in, inOffset, inLength, out, outOffset); + } + + /// SIMD-optimized Base64 decoding for no-whitespace input. + /// Uses generic SIMD byte primitives over a single caller-invisible scratch slab. 
+ @DisableDebugInfo + @DisableNullChecksAndArrayBoundsChecks + public static int decodeNoWhitespaceSimd(byte[] in, int inOffset, int inLength, byte[] out, int outOffset) { + if (inLength == 0) { + return 0; + } + if ((inLength & 0x3) != 0) { + return -1; + } + Simd simd = Simd.get(); + + int pad = 0; + if (in[inOffset + inLength - 1] == '=') { + pad++; + if (inLength > 1 && in[inOffset + inLength - 2] == '=') { + pad++; + } + } + if (pad > 2) { + return -1; + } + int outLength = (inLength / 4) * 3 - pad; + if (outLength <= 0) { + return 0; + } + + byte[] decodeMap = getSimdDecodeMap(simd); + int[] decodeMapLocal = decodeMapInt; + + int fullLen = inLength - (pad > 0 ? 4 : 0); + int fullEnd = inOffset + fullLen; + int si = inOffset; + int di = outOffset; + int simdEnd = fullEnd - SIMD_BYTE_LANES * 4 + 1; + byte[] scratch = simdEnd > si ? simd.allocByte(SIMD_DECODE_SCRATCH_BYTES) : null; + while (si < simdEnd) { + if (simd.unpackLookupBytesInterleaved4(decodeMap, in, si, scratch, SIMD_DEC_OUT0, SIMD_DEC_OUT1, SIMD_DEC_OUT2, SIMD_DEC_OUT3, SIMD_BYTE_LANES) < 0) { + return -1; + } + simd.shl(scratch, SIMD_DEC_OUT0, 2, scratch, SIMD_DEC_OUT0, SIMD_BYTE_LANES); + simd.shrLogical(scratch, SIMD_DEC_OUT1, 4, scratch, SIMD_DEC_TMP, SIMD_BYTE_LANES); + simd.or(scratch, SIMD_DEC_OUT0, scratch, SIMD_DEC_TMP, scratch, SIMD_DEC_OUT0, SIMD_BYTE_LANES); + + simd.shl(scratch, SIMD_DEC_OUT1, 4, scratch, SIMD_DEC_OUT1, SIMD_BYTE_LANES); + simd.shrLogical(scratch, SIMD_DEC_OUT2, 2, scratch, SIMD_DEC_TMP, SIMD_BYTE_LANES); + simd.or(scratch, SIMD_DEC_OUT1, scratch, SIMD_DEC_TMP, scratch, SIMD_DEC_OUT1, SIMD_BYTE_LANES); + + simd.shl(scratch, SIMD_DEC_OUT2, 6, scratch, SIMD_DEC_OUT2, SIMD_BYTE_LANES); + simd.or(scratch, SIMD_DEC_OUT2, scratch, SIMD_DEC_OUT3, scratch, SIMD_DEC_OUT2, SIMD_BYTE_LANES); + + simd.packBytesInterleaved3(scratch, SIMD_DEC_OUT0, SIMD_DEC_OUT1, SIMD_DEC_OUT2, out, di, SIMD_BYTE_LANES); + si += SIMD_BYTE_LANES * 4; + di += SIMD_BYTE_LANES * 3; + } + + while (si < 
fullEnd) { + int c0 = in[si] & 0xff; + int c1 = in[si + 1] & 0xff; + int c2 = in[si + 2] & 0xff; + int c3 = in[si + 3] & 0xff; + int b0 = decodeMapLocal[c0]; + int b1 = decodeMapLocal[c1]; + int b2 = decodeMapLocal[c2]; + int b3 = decodeMapLocal[c3]; + if ((b0 | b1 | b2 | b3) < 0) { + return -1; + } + int quantum = (b0 << 18) | (b1 << 12) | (b2 << 6) | b3; + out[di++] = (byte) ((quantum >> 16) & 0xff); + out[di++] = (byte) ((quantum >> 8) & 0xff); + out[di++] = (byte) (quantum & 0xff); + si += 4; + } + + if (pad > 0) { + int i = inOffset + inLength - 4; + int c0 = in[i] & 0xff; + int c1 = in[i + 1] & 0xff; + int b0 = decodeMapLocal[c0]; + int b1 = decodeMapLocal[c1]; + if ((b0 | b1) < 0) { + return -1; + } + out[di++] = (byte) ((b0 << 2) | (b1 >> 4)); + if (pad == 2) { + return (in[i + 2] == '=' && in[i + 3] == '=') ? outLength : -1; + } + if (in[i + 3] != '=') { + return -1; + } + int b2 = decodeMapLocal[in[i + 2] & 0xff]; + if (b2 < 0) { + return -1; + } + out[di] = (byte) ((b1 << 4) | (b2 >> 2)); + } + + return outLength; + } + + /// Compatibility overload that ignores the legacy scratch parameter. + public static int decodeNoWhitespaceSimd(byte[] in, int inOffset, int inLength, byte[] out, int outOffset, int[] scratch) { + return decodeNoWhitespaceSimd(in, inOffset, inLength, out, outOffset); + } + + /// Convenience overload for `encodeNoNewlineSimd(byte[], int, int, byte[], int)` + /// using zero offsets. + public static int encodeNoNewlineSimd(byte[] in, byte[] out) { + return encodeNoNewlineSimd(in, 0, in.length, out, 0); + } + + /// Convenience overload for `encodeNoNewlineSimd(byte[], int, int, byte[], int)` + /// that preserves the legacy scratch-bearing signature. + public static int encodeNoNewlineSimd(byte[] in, byte[] out, int[] scratch) { + return encodeNoNewlineSimd(in, 0, in.length, out, 0); + } + + /// Convenience overload for `decodeNoWhitespaceSimd(byte[], int, int, byte[], int)` + /// using zero input offset. 
+ public static int decodeNoWhitespaceSimd(byte[] in, int len, byte[] out) { + return decodeNoWhitespaceSimd(in, 0, len, out, 0); + } + + /// Convenience overload for `decodeNoWhitespaceSimd(byte[], int, int, byte[], int)` + /// that preserves the legacy scratch-bearing signature. + public static int decodeNoWhitespaceSimd(byte[] in, int len, byte[] out, int[] scratch) { + return decodeNoWhitespaceSimd(in, 0, len, out, 0); + } + + private static byte[] allocByteMaybeSimd(int size) { + if (size <= 0) { + return new byte[0]; + } + Simd simd = Simd.get(); + if (simd.isSupported() && size >= 16) { + return simd.allocByte(size); + } + return new byte[size]; + } } diff --git a/CodenameOne/src/com/codename1/util/Simd.java b/CodenameOne/src/com/codename1/util/Simd.java new file mode 100644 index 0000000000..14c48d4932 --- /dev/null +++ b/CodenameOne/src/com/codename1/util/Simd.java @@ -0,0 +1,967 @@ +/* + * Copyright (c) 2026, Codename One and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Codename One designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + */ +package com.codename1.util; + +import com.codename1.annotations.Concrete; +import com.codename1.ui.CN; + +/// Portable SIMD API with Java fallback implementations. +@Concrete(name = "com.codename1.impl.ios.IOSSimd") +public class Simd { + /// Returns the singleton instance of the Simd class. 
Equivalent to `CN.getSimd();` + public static Simd get() { + return CN.getSimd(); + } + + /// Returns true if SIMD instructions are natively supported + /// if this returns false the APIs in this class would still work + /// using fallback loop code + public boolean isSupported() { + return false; + } + + /// Allocates an aligned memory block for efficient SIMD + /// operations. All operations MUST be performed on aligned + /// arrays and shouldn't use arrays created with `new`. Operations + /// on unaligned arrays might produce undefined results. + public byte[] allocByte(int size) { + if (size < 16) { + throw new IllegalArgumentException("size must be >= 16"); + } + return new byte[size]; + } + + /// Allocates an aligned memory block for efficient SIMD + /// operations. All operations MUST be performed on aligned + /// arrays and shouldn't use arrays created with `new`. Operations + /// on unaligned arrays might produce undefined results. + public int[] allocInt(int size) { + if (size < 16) { + throw new IllegalArgumentException("size must be >= 16"); + } + return new int[size]; + } + + /// Allocates an aligned memory block for efficient SIMD + /// operations. All operations MUST be performed on aligned + /// arrays and shouldn't use arrays created with `new`. Operations + /// on unaligned arrays might produce undefined results. + public float[] allocFloat(int size) { + if (size < 16) { + throw new IllegalArgumentException("size must be >= 16"); + } + return new float[size]; + } + + /// Looks up values from a table using unsigned byte indices. + public void lookupBytes(byte[] table, byte[] indices, byte[] dst, int offset, int length) { + lookupBytes(table, indices, offset, dst, offset, length); + } + + /// Looks up values from a table using unsigned byte indices. 
+ public void lookupBytes(byte[] table, byte[] indices, int indicesOffset, byte[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + int idx = indices[indicesOffset + i] & 0xff; + dst[dstOffset + i] = idx < table.length ? table[idx] : 0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void and(byte[] srcA, int srcAOffset, byte[] srcB, int srcBOffset, byte[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = (byte)(srcA[srcAOffset + i] & srcB[srcBOffset + i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void or(byte[] srcA, int srcAOffset, byte[] srcB, int srcBOffset, byte[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = (byte)(srcA[srcAOffset + i] | srcB[srcBOffset + i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void shl(byte[] src, int srcOffset, int bits, byte[] dst, int dstOffset, int length) { + int shift = bits & 7; + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = (byte)((src[srcOffset + i] & 0xff) << shift); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void shrLogical(byte[] src, int srcOffset, int bits, byte[] dst, int dstOffset, int length) { + int shift = bits & 7; + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = (byte)((src[srcOffset + i] & 0xff) >>> shift); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void unpackBytesInterleaved3(byte[] src, int srcOffset, byte[] dst, int dst0Offset, int dst1Offset, int dst2Offset, int length) { + for (int i = 0; i < length; i++) { + int srcIndex = srcOffset + i * 3; + dst[dst0Offset + i] = src[srcIndex]; + dst[dst1Offset + i] = src[srcIndex + 1]; + dst[dst2Offset + i] = src[srcIndex + 2]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned 
arrays** + public void add(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = clampByte(srcA[i] + srcB[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void sub(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = clampByte(srcA[i] - srcB[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void mul(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = clampByte(srcA[i] * srcB[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void min(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] < srcB[i] ? srcA[i] : srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void max(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] > srcB[i] ? 
srcA[i] : srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void abs(byte[] src, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + int v = src[i]; + if (v == Byte.MIN_VALUE) { + dst[i] = Byte.MAX_VALUE; + } else { + dst[i] = (byte)Math.abs(v); + } + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void clamp(byte[] src, byte[] dst, byte minValue, byte maxValue, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + int v = src[i]; + if (v < minValue) { + dst[i] = minValue; + } else if (v > maxValue) { + dst[i] = maxValue; + } else { + dst[i] = (byte)v; + } + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void and(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + and(srcA, offset, srcB, offset, dst, offset, length); + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void or(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + or(srcA, offset, srcB, offset, dst, offset, length); + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void xor(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)(srcA[i] ^ srcB[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void not(byte[] src, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)(~src[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void cmpEq(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dstMask[i] = srcA[i] == srcB[i] ? 
(byte)-1 : (byte)0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void cmpEq(byte[] src, byte value, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dstMask[i] = src[i] == value ? (byte)-1 : (byte)0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void cmpLt(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dstMask[i] = srcA[i] < srcB[i] ? (byte)-1 : (byte)0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void cmpLt(byte[] src, byte value, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dstMask[i] = src[i] < value ? (byte)-1 : (byte)0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void cmpGt(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dstMask[i] = srcA[i] > srcB[i] ? (byte)-1 : (byte)0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void cmpRange(byte[] src, byte minValue, byte maxValue, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + int v = src[i]; + dstMask[i] = v >= minValue && v <= maxValue ? (byte)-1 : (byte)0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void select(byte[] mask, byte[] trueValues, byte[] falseValues, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = mask[i] != 0 ? 
trueValues[i] : falseValues[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void shl(byte[] src, int bits, byte[] dst, int offset, int length) { + shl(src, offset, bits, dst, offset, length); + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void shrLogical(byte[] src, int bits, byte[] dst, int offset, int length) { + shrLogical(src, offset, bits, dst, offset, length); + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void addWrapping(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)(srcA[i] + srcB[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void addWrapping(byte[] src, byte value, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)(src[i] + value); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void subWrapping(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)(srcA[i] - srcB[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void subWrapping(byte[] src, byte value, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)(src[i] - value); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void unpackUnsignedByteToInt(byte[] src, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = src[i] & 0xff; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void unpackUnsignedByteToInt(byte[] src, int srcOffset, int[] dst, int dstOffset, int length) { + for (int i = 0; i < length; 
i++) { + dst[dstOffset + i] = src[srcOffset + i] & 0xff; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void unpackUnsignedByteToIntInterleaved3(byte[] src, int srcOffset, int[] dst, int dst0Offset, int dst1Offset, int dst2Offset, int length) { + for (int i = 0; i < length; i++) { + int srcIndex = srcOffset + i * 3; + dst[dst0Offset + i] = src[srcIndex] & 0xff; + dst[dst1Offset + i] = src[srcIndex + 1] & 0xff; + dst[dst2Offset + i] = src[srcIndex + 2] & 0xff; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void unpackBytesInterleaved3(byte[] src, int srcOffset, byte[] dst0, byte[] dst1, byte[] dst2, int length) { + for (int i = 0; i < length; i++) { + int srcIndex = srcOffset + i * 3; + dst0[i] = src[srcIndex]; + dst1[i] = src[srcIndex + 1]; + dst2[i] = src[srcIndex + 2]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void unpackBytesInterleaved4(byte[] src, int srcOffset, byte[] dst0, byte[] dst1, byte[] dst2, byte[] dst3, int length) { + for (int i = 0; i < length; i++) { + int srcIndex = srcOffset + i * 4; + dst0[i] = src[srcIndex]; + dst1[i] = src[srcIndex + 1]; + dst2[i] = src[srcIndex + 2]; + dst3[i] = src[srcIndex + 3]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void unpackBytesInterleaved4(byte[] src, int srcOffset, byte[] dst, int dst0Offset, int dst1Offset, int dst2Offset, int dst3Offset, int length) { + for (int i = 0; i < length; i++) { + int srcIndex = srcOffset + i * 4; + dst[dst0Offset + i] = src[srcIndex]; + dst[dst1Offset + i] = src[srcIndex + 1]; + dst[dst2Offset + i] = src[srcIndex + 2]; + dst[dst3Offset + i] = src[srcIndex + 3]; + } + } + + /// Unpacks interleaved bytes, looks each byte up in the provided table, stores the + /// looked-up values into separate lane arrays, and returns the bitwise OR of all + /// written values. 
+ public int unpackLookupBytesInterleaved4(byte[] table, byte[] src, int srcOffset, byte[] dst0, byte[] dst1, byte[] dst2, byte[] dst3, int length) { + int or = 0; + int tableLength = table.length; + for (int i = 0; i < length; i++) { + int srcIndex = srcOffset + i * 4; + int idx0 = src[srcIndex] & 0xff; + int idx1 = src[srcIndex + 1] & 0xff; + int idx2 = src[srcIndex + 2] & 0xff; + int idx3 = src[srcIndex + 3] & 0xff; + byte v0 = idx0 < tableLength ? table[idx0] : 0; + byte v1 = idx1 < tableLength ? table[idx1] : 0; + byte v2 = idx2 < tableLength ? table[idx2] : 0; + byte v3 = idx3 < tableLength ? table[idx3] : 0; + dst0[i] = v0; + dst1[i] = v1; + dst2[i] = v2; + dst3[i] = v3; + or |= v0 | v1 | v2 | v3; + } + return or; + } + + /// Unpacks interleaved bytes, looks each byte up in the provided table, stores the + /// looked-up values into virtual lane ranges in a destination array, and returns + /// the bitwise OR of all written values. + public int unpackLookupBytesInterleaved4(byte[] table, byte[] src, int srcOffset, byte[] dst, int dst0Offset, int dst1Offset, int dst2Offset, int dst3Offset, int length) { + int or = 0; + int tableLength = table.length; + for (int i = 0; i < length; i++) { + int srcIndex = srcOffset + i * 4; + int idx0 = src[srcIndex] & 0xff; + int idx1 = src[srcIndex + 1] & 0xff; + int idx2 = src[srcIndex + 2] & 0xff; + int idx3 = src[srcIndex + 3] & 0xff; + byte v0 = idx0 < tableLength ? table[idx0] : 0; + byte v1 = idx1 < tableLength ? table[idx1] : 0; + byte v2 = idx2 < tableLength ? table[idx2] : 0; + byte v3 = idx3 < tableLength ? 
table[idx3] : 0; + dst[dst0Offset + i] = v0; + dst[dst1Offset + i] = v1; + dst[dst2Offset + i] = v2; + dst[dst3Offset + i] = v3; + or |= v0 | v1 | v2 | v3; + } + return or; + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void packIntToByteSaturating(int[] src, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = clampByte(src[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void packIntToByteTruncate(int[] src, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = (byte)src[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void packIntToByteTruncate(int[] src, int srcOffset, byte[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = (byte)src[srcOffset + i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void packIntToByteTruncateInterleaved4(int[] src, int src0Offset, int src1Offset, int src2Offset, int src3Offset, byte[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + int dstIndex = dstOffset + i * 4; + dst[dstIndex] = (byte)src[src0Offset + i]; + dst[dstIndex + 1] = (byte)src[src1Offset + i]; + dst[dstIndex + 2] = (byte)src[src2Offset + i]; + dst[dstIndex + 3] = (byte)src[src3Offset + i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void packBytesInterleaved3(byte[] src0, byte[] src1, byte[] src2, byte[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + int dstIndex = dstOffset + i * 3; + dst[dstIndex] = src0[i]; + dst[dstIndex + 1] = src1[i]; + dst[dstIndex + 2] = src2[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void packBytesInterleaved3(byte[] src, int src0Offset, int src1Offset, int src2Offset, 
byte[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + int dstIndex = dstOffset + i * 3; + dst[dstIndex] = src[src0Offset + i]; + dst[dstIndex + 1] = src[src1Offset + i]; + dst[dstIndex + 2] = src[src2Offset + i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void packBytesInterleaved4(byte[] src0, byte[] src1, byte[] src2, byte[] src3, byte[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + int dstIndex = dstOffset + i * 4; + dst[dstIndex] = src0[i]; + dst[dstIndex + 1] = src1[i]; + dst[dstIndex + 2] = src2[i]; + dst[dstIndex + 3] = src3[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void packBytesInterleaved4(byte[] src, int src0Offset, int src1Offset, int src2Offset, int src3Offset, byte[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + int dstIndex = dstOffset + i * 4; + dst[dstIndex] = src[src0Offset + i]; + dst[dstIndex + 1] = src[src1Offset + i]; + dst[dstIndex + 2] = src[src2Offset + i]; + dst[dstIndex + 3] = src[src3Offset + i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void permuteBytes(byte[] src, byte[] indices, byte[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + int idx = indices[i]; + dst[i] = idx >= 0 && idx < src.length ? 
src[idx] : 0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void add(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] + srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void add(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = srcA[srcAOffset + i] + srcB[srcBOffset + i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void sub(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] - srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void mul(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] * srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void min(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] < srcB[i] ? srcA[i] : srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void max(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] > srcB[i] ? srcA[i] : srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void abs(int[] src, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + int v = src[i]; + dst[i] = v == Integer.MIN_VALUE ? 
Integer.MAX_VALUE : Math.abs(v); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void clamp(int[] src, int[] dst, int minValue, int maxValue, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + int v = src[i]; + if (v < minValue) { + dst[i] = minValue; + } else if (v > maxValue) { + dst[i] = maxValue; + } else { + dst[i] = v; + } + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void and(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] & srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void and(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = srcA[srcAOffset + i] & srcB[srcBOffset + i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void or(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] | srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void or(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = srcA[srcAOffset + i] | srcB[srcBOffset + i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void xor(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] ^ srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void not(int[] src, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = ~src[i]; + } 
+ } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void shl(int[] src, int bits, int[] dst, int offset, int length) { + int shift = bits & 31; + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = src[i] << shift; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void shl(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, int length) { + int shift = bits & 31; + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = src[srcOffset + i] << shift; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void shrLogical(int[] src, int bits, int[] dst, int offset, int length) { + int shift = bits & 31; + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = src[i] >>> shift; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void shrLogical(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, int length) { + int shift = bits & 31; + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = src[srcOffset + i] >>> shift; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void shrArithmetic(int[] src, int bits, int[] dst, int offset, int length) { + int shift = bits & 31; + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = src[i] >> shift; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void cmpEq(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dstMask[i] = srcA[i] == srcB[i] ? 
(byte)-1 : (byte)0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void cmpEq(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, byte[] dstMask, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dstMask[dstOffset + i] = srcA[srcAOffset + i] == srcB[srcBOffset + i] ? (byte)-1 : (byte)0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void cmpLt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dstMask[i] = srcA[i] < srcB[i] ? (byte)-1 : (byte)0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void cmpLt(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, byte[] dstMask, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dstMask[dstOffset + i] = srcA[srcAOffset + i] < srcB[srcBOffset + i] ? (byte)-1 : (byte)0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void cmpGt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dstMask[i] = srcA[i] > srcB[i] ? (byte)-1 : (byte)0; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void select(byte[] mask, int[] trueValues, int[] falseValues, int[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = mask[i] != 0 ? trueValues[i] : falseValues[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void select(byte[] mask, int maskOffset, int[] trueValues, int trueOffset, int[] falseValues, int falseOffset, int[] dst, int dstOffset, int length) { + for (int i = 0; i < length; i++) { + dst[dstOffset + i] = mask[maskOffset + i] != 0 ? 
trueValues[trueOffset + i] : falseValues[falseOffset + i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public int sum(int[] src, int offset, int length) { + int out = 0; + for (int i = offset, end = offset + length; i < end; i++) { + out += src[i]; + } + return out; + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public int dot(int[] srcA, int[] srcB, int offset, int length) { + int out = 0; + for (int i = offset, end = offset + length; i < end; i++) { + out += srcA[i] * srcB[i]; + } + return out; + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void add(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] + srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void sub(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] - srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void mul(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = srcA[i] * srcB[i]; + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void min(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = Math.min(srcA[i], srcB[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void max(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = Math.max(srcA[i], srcB[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void abs(float[] src, float[] 
dst, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + dst[i] = Math.abs(src[i]); + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public void clamp(float[] src, float[] dst, float minValue, float maxValue, int offset, int length) { + for (int i = offset, end = offset + length; i < end; i++) { + float v = src[i]; + if (v < minValue) { + dst[i] = minValue; + } else if (v > maxValue) { + dst[i] = maxValue; + } else { + dst[i] = v; + } + } + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public float sum(float[] src, int offset, int length) { + float out = 0f; + for (int i = offset, end = offset + length; i < end; i++) { + out += src[i]; + } + return out; + } + + /// Exposes SIMD APIs directly **all arrays MUST be aligned arrays** + public float dot(float[] srcA, float[] srcB, int offset, int length) { + float out = 0f; + for (int i = offset, end = offset + length; i < end; i++) { + out += srcA[i] * srcB[i]; + } + return out; + } + + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. + protected final void validateBinaryByte(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dst, "dst"); + validateRange(srcA.length, offset, length, "srcA"); + validateRange(srcB.length, offset, length, "srcB"); + validateRange(dst.length, offset, length, "dst"); + } + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. 
+ protected final void validateMaskBinaryByte(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dstMask, "dstMask"); + validateRange(srcA.length, offset, length, "srcA"); + validateRange(srcB.length, offset, length, "srcB"); + validateRange(dstMask.length, offset, length, "dstMask"); + } + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. + protected final void validateRangeMaskByte(byte[] src, byte[] dstMask, int offset, int length) { + validateNotNull(src, "src"); + validateNotNull(dstMask, "dstMask"); + validateRange(src.length, offset, length, "src"); + validateRange(dstMask.length, offset, length, "dstMask"); + } + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. + protected final void validateSelectByte(byte[] mask, byte[] trueValues, byte[] falseValues, byte[] dst, int offset, int length) { + validateNotNull(mask, "mask"); + validateNotNull(trueValues, "trueValues"); + validateNotNull(falseValues, "falseValues"); + validateNotNull(dst, "dst"); + validateRange(mask.length, offset, length, "mask"); + validateRange(trueValues.length, offset, length, "trueValues"); + validateRange(falseValues.length, offset, length, "falseValues"); + validateRange(dst.length, offset, length, "dst"); + } + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. 
+ protected final void validateByteToInt(byte[] src, int[] dst, int offset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, offset, length, "src"); + validateRange(dst.length, offset, length, "dst"); + } + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. + protected final void validateIntToByte(int[] src, byte[] dst, int offset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, offset, length, "src"); + validateRange(dst.length, offset, length, "dst"); + } + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. + protected final void validatePermuteByte(byte[] src, byte[] indices, byte[] dst, int offset, int length) { + validateNotNull(src, "src"); + validateNotNull(indices, "indices"); + validateNotNull(dst, "dst"); + validateRange(indices.length, offset, length, "indices"); + validateRange(dst.length, offset, length, "dst"); + } + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. + protected final void validateUnaryByte(byte[] src, byte[] dst, int offset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, offset, length, "src"); + validateRange(dst.length, offset, length, "dst"); + } + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. 
+ protected final void validateBinaryInt(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dst, "dst"); + validateRange(srcA.length, offset, length, "srcA"); + validateRange(srcB.length, offset, length, "srcB"); + validateRange(dst.length, offset, length, "dst"); + } + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. + protected final void validateUnaryInt(int[] src, int[] dst, int offset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, offset, length, "src"); + validateRange(dst.length, offset, length, "dst"); + } + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. + protected final void validateMaskBinaryInt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dstMask, "dstMask"); + validateRange(srcA.length, offset, length, "srcA"); + validateRange(srcB.length, offset, length, "srcB"); + validateRange(dstMask.length, offset, length, "dstMask"); + } + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. 
+ protected final void validateSelectInt(byte[] mask, int[] trueValues, int[] falseValues, int[] dst, int offset, int length) { + validateNotNull(mask, "mask"); + validateNotNull(trueValues, "trueValues"); + validateNotNull(falseValues, "falseValues"); + validateNotNull(dst, "dst"); + validateRange(mask.length, offset, length, "mask"); + validateRange(trueValues.length, offset, length, "trueValues"); + validateRange(falseValues.length, offset, length, "falseValues"); + validateRange(dst.length, offset, length, "dst"); + } + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. + protected final void validateReductionInt(int[] src, int offset, int length) { + validateNotNull(src, "src"); + validateRange(src.length, offset, length, "src"); + } + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. + protected final void validateDotInt(int[] srcA, int[] srcB, int offset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateRange(srcA.length, offset, length, "srcA"); + validateRange(srcB.length, offset, length, "srcB"); + } + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. + protected final void validateBinaryFloat(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dst, "dst"); + validateRange(srcA.length, offset, length, "srcA"); + validateRange(srcB.length, offset, length, "srcB"); + validateRange(dst.length, offset, length, "dst"); + } + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. 
+ protected final void validateUnaryFloat(float[] src, float[] dst, int offset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, offset, length, "src"); + validateRange(dst.length, offset, length, "dst"); + } + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. + protected final void validateReductionFloat(float[] src, int offset, int length) { + validateNotNull(src, "src"); + validateRange(src.length, offset, length, "src"); + } + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. + protected final void validateDotFloat(float[] srcA, float[] srcB, int offset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateRange(srcA.length, offset, length, "srcA"); + validateRange(srcB.length, offset, length, "srcB"); + } + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. + protected final void validateNotNull(Object o, String name) { + if (o == null) { + throw new NullPointerException(name + " is null"); + } + } + + /// This API is used internally to verify valid array arguments in the simulator + /// notice that no validation occurs on the devices. 
+    protected final void validateRange(int arrayLength, int offset, int length, String name) {
+        // The last clause compares arrayLength - offset with length instead of offset + length
+        // with arrayLength: the earlier clauses already pin offset to [0, arrayLength], so the
+        // subtraction cannot overflow even for very large offset/length values.
+        if (offset < 0 || length < 0 || offset > arrayLength || arrayLength - offset < length) {
+            throw new ArrayIndexOutOfBoundsException(name + " invalid range offset=" + offset + " length=" + length + " size=" + arrayLength);
+        }
+    }
+
+    // Saturates an int to the signed byte range [Byte.MIN_VALUE, Byte.MAX_VALUE].
+    private byte clampByte(int value) {
+        if (value > Byte.MAX_VALUE) {
+            return Byte.MAX_VALUE;
+        }
+        if (value < Byte.MIN_VALUE) {
+            return Byte.MIN_VALUE;
+        }
+        // In range at this point, so the narrowing cast keeps the value unchanged.
+        return (byte)value;
+    }
+}
diff --git a/Ports/JavaSE/src/com/codename1/impl/javase/JavaSEPort.java b/Ports/JavaSE/src/com/codename1/impl/javase/JavaSEPort.java
index bcf4c861e1..148699a232 100644
--- a/Ports/JavaSE/src/com/codename1/impl/javase/JavaSEPort.java
+++ b/Ports/JavaSE/src/com/codename1/impl/javase/JavaSEPort.java
@@ -118,6 +118,7 @@ import com.codename1.ui.util.UITimer;
 import com.codename1.util.AsyncResource;
 import com.codename1.util.Callback;
+import com.codename1.util.Simd;
 import com.jhlabs.image.GaussianFilter;
 import java.awt.*;
 import java.awt.datatransfer.Clipboard;
@@ -10753,6 +10754,11 @@ public String getPlatformName() {
         return platformName;
     }
 
+    @Override
+    public Simd createSimd() {
+        return new JavaSESimd();
+    }
+
     /**
      * @inheritDoc
      */
diff --git a/Ports/JavaSE/src/com/codename1/impl/javase/JavaSESimd.java b/Ports/JavaSE/src/com/codename1/impl/javase/JavaSESimd.java
new file mode 100644
index 0000000000..ebc4766e70
--- /dev/null
+++ b/Ports/JavaSE/src/com/codename1/impl/javase/JavaSESimd.java
@@ -0,0 +1,783 @@
+/*
+ * Copyright (c) 2026, Codename One and/or its affiliates. All rights reserved.
+ */
+package com.codename1.impl.javase;
+
+import com.codename1.util.Simd;
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * JavaSE SIMD implementation used for simulator validation and fallback execution.
+ */ +public class JavaSESimd extends Simd { + private final Set allocatedIds = Collections.synchronizedSet(new HashSet()); + + @Override + public boolean isSupported() { + return true; + } + + @Override + public byte[] allocByte(int size) { + byte[] out = super.allocByte(size); + allocatedIds.add(Integer.valueOf(System.identityHashCode(out))); + return out; + } + + @Override + public int[] allocInt(int size) { + int[] out = super.allocInt(size); + allocatedIds.add(Integer.valueOf(System.identityHashCode(out))); + return out; + } + + @Override + public float[] allocFloat(int size) { + float[] out = super.allocFloat(size); + allocatedIds.add(Integer.valueOf(System.identityHashCode(out))); + return out; + } + + @Override + public void lookupBytes(byte[] table, byte[] indices, int indicesOffset, byte[] dst, int dstOffset, int length) { + validateNotNull(table, "table"); + validateNotNull(indices, "indices"); + validateNotNull(dst, "dst"); + validateRange(indices.length, indicesOffset, length, "indices"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(table, indices, dst); + super.lookupBytes(table, indices, indicesOffset, dst, dstOffset, length); + } + + @Override + public void add(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.add(srcA, srcB, dst, offset, length); + } + + @Override + public void sub(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.sub(srcA, srcB, dst, offset, length); + } + + @Override + public void mul(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.mul(srcA, srcB, dst, offset, length); + } + + @Override + public void min(byte[] srcA, byte[] srcB, byte[] dst, int offset, 
int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.min(srcA, srcB, dst, offset, length); + } + + @Override + public void max(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.max(srcA, srcB, dst, offset, length); + } + + @Override + public void abs(byte[] src, byte[] dst, int offset, int length) { + validateUnaryByte(src, dst, offset, length); + validateRegistered(src, dst); + super.abs(src, dst, offset, length); + } + + @Override + public void clamp(byte[] src, byte[] dst, byte minValue, byte maxValue, int offset, int length) { + if (minValue > maxValue) { + throw new IllegalArgumentException("minValue > maxValue"); + } + validateUnaryByte(src, dst, offset, length); + validateRegistered(src, dst); + super.clamp(src, dst, minValue, maxValue, offset, length); + } + + @Override + public void and(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.and(srcA, srcB, dst, offset, length); + } + + @Override + public void and(byte[] srcA, int srcAOffset, byte[] srcB, int srcBOffset, byte[] dst, int dstOffset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dst, "dst"); + validateRange(srcA.length, srcAOffset, length, "srcA"); + validateRange(srcB.length, srcBOffset, length, "srcB"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(srcA, srcB, dst); + super.and(srcA, srcAOffset, srcB, srcBOffset, dst, dstOffset, length); + } + + @Override + public void or(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.or(srcA, srcB, dst, offset, length); + } + + @Override + public void or(byte[] srcA, int 
srcAOffset, byte[] srcB, int srcBOffset, byte[] dst, int dstOffset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dst, "dst"); + validateRange(srcA.length, srcAOffset, length, "srcA"); + validateRange(srcB.length, srcBOffset, length, "srcB"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(srcA, srcB, dst); + super.or(srcA, srcAOffset, srcB, srcBOffset, dst, dstOffset, length); + } + + @Override + public void xor(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.xor(srcA, srcB, dst, offset, length); + } + + @Override + public void not(byte[] src, byte[] dst, int offset, int length) { + validateUnaryByte(src, dst, offset, length); + validateRegistered(src, dst); + super.not(src, dst, offset, length); + } + + @Override + public void cmpEq(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { + validateMaskBinaryByte(srcA, srcB, dstMask, offset, length); + validateRegistered(srcA, srcB, dstMask); + super.cmpEq(srcA, srcB, dstMask, offset, length); + } + + @Override + public void cmpEq(byte[] src, byte value, byte[] dstMask, int offset, int length) { + validateRangeMaskByte(src, dstMask, offset, length); + validateRegistered(src, dstMask); + super.cmpEq(src, value, dstMask, offset, length); + } + + @Override + public void cmpLt(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length) { + validateMaskBinaryByte(srcA, srcB, dstMask, offset, length); + validateRegistered(srcA, srcB, dstMask); + super.cmpLt(srcA, srcB, dstMask, offset, length); + } + + @Override + public void cmpLt(byte[] src, byte value, byte[] dstMask, int offset, int length) { + validateRangeMaskByte(src, dstMask, offset, length); + validateRegistered(src, dstMask); + super.cmpLt(src, value, dstMask, offset, length); + } + + @Override + public void cmpGt(byte[] srcA, byte[] srcB, byte[] 
dstMask, int offset, int length) { + validateMaskBinaryByte(srcA, srcB, dstMask, offset, length); + validateRegistered(srcA, srcB, dstMask); + super.cmpGt(srcA, srcB, dstMask, offset, length); + } + + @Override + public void cmpRange(byte[] src, byte minValue, byte maxValue, byte[] dstMask, int offset, int length) { + if (minValue > maxValue) { + throw new IllegalArgumentException("minValue > maxValue"); + } + validateRangeMaskByte(src, dstMask, offset, length); + validateRegistered(src, dstMask); + super.cmpRange(src, minValue, maxValue, dstMask, offset, length); + } + + @Override + public void select(byte[] mask, byte[] trueValues, byte[] falseValues, byte[] dst, int offset, int length) { + validateSelectByte(mask, trueValues, falseValues, dst, offset, length); + validateRegistered(mask, trueValues, falseValues, dst); + super.select(mask, trueValues, falseValues, dst, offset, length); + } + + @Override + public void unpackUnsignedByteToInt(byte[] src, int[] dst, int offset, int length) { + validateByteToInt(src, dst, offset, length); + validateRegistered(src, dst); + super.unpackUnsignedByteToInt(src, dst, offset, length); + } + + @Override + public void packIntToByteSaturating(int[] src, byte[] dst, int offset, int length) { + validateIntToByte(src, dst, offset, length); + validateRegistered(src, dst); + super.packIntToByteSaturating(src, dst, offset, length); + } + + @Override + public void packIntToByteTruncate(int[] src, byte[] dst, int offset, int length) { + validateIntToByte(src, dst, offset, length); + validateRegistered(src, dst); + super.packIntToByteTruncate(src, dst, offset, length); + } + + @Override + public void packIntToByteTruncate(int[] src, int srcOffset, byte[] dst, int dstOffset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, srcOffset, length, "src"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(src, dst); + super.packIntToByteTruncate(src, srcOffset, 
dst, dstOffset, length); + } + + @Override + public void permuteBytes(byte[] src, byte[] indices, byte[] dst, int offset, int length) { + validatePermuteByte(src, indices, dst, offset, length); + validateRegistered(src, indices, dst); + super.permuteBytes(src, indices, dst, offset, length); + } + + @Override + public void add(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.add(srcA, srcB, dst, offset, length); + } + + @Override + public void sub(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.sub(srcA, srcB, dst, offset, length); + } + + @Override + public void mul(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.mul(srcA, srcB, dst, offset, length); + } + + @Override + public void min(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.min(srcA, srcB, dst, offset, length); + } + + @Override + public void max(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.max(srcA, srcB, dst, offset, length); + } + + @Override + public void abs(int[] src, int[] dst, int offset, int length) { + validateUnaryInt(src, dst, offset, length); + validateRegistered(src, dst); + super.abs(src, dst, offset, length); + } + + @Override + public void clamp(int[] src, int[] dst, int minValue, int maxValue, int offset, int length) { + if (minValue > maxValue) { + throw new IllegalArgumentException("minValue > maxValue"); + } + validateUnaryInt(src, dst, offset, length); + validateRegistered(src, dst); + super.clamp(src, dst, minValue, 
maxValue, offset, length); + } + + @Override + public void and(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.and(srcA, srcB, dst, offset, length); + } + + @Override + public void and(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dst, "dst"); + validateRange(srcA.length, srcAOffset, length, "srcA"); + validateRange(srcB.length, srcBOffset, length, "srcB"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(srcA, srcB, dst); + super.and(srcA, srcAOffset, srcB, srcBOffset, dst, dstOffset, length); + } + + @Override + public void or(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.or(srcA, srcB, dst, offset, length); + } + + @Override + public void or(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dst, "dst"); + validateRange(srcA.length, srcAOffset, length, "srcA"); + validateRange(srcB.length, srcBOffset, length, "srcB"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(srcA, srcB, dst); + super.or(srcA, srcAOffset, srcB, srcBOffset, dst, dstOffset, length); + } + + @Override + public void xor(int[] srcA, int[] srcB, int[] dst, int offset, int length) { + validateBinaryInt(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.xor(srcA, srcB, dst, offset, length); + } + + @Override + public void not(int[] src, int[] dst, int offset, int length) { + validateUnaryInt(src, dst, offset, length); + validateRegistered(src, dst); + super.not(src, dst, offset, length); + } + + @Override + public void shl(int[] 
src, int bits, int[] dst, int offset, int length) { + validateUnaryInt(src, dst, offset, length); + validateRegistered(src, dst); + super.shl(src, bits, dst, offset, length); + } + + @Override + public void shl(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, srcOffset, length, "src"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(src, dst); + super.shl(src, srcOffset, bits, dst, dstOffset, length); + } + + @Override + public void shrLogical(int[] src, int bits, int[] dst, int offset, int length) { + validateUnaryInt(src, dst, offset, length); + validateRegistered(src, dst); + super.shrLogical(src, bits, dst, offset, length); + } + + @Override + public void shrLogical(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, srcOffset, length, "src"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(src, dst); + super.shrLogical(src, srcOffset, bits, dst, dstOffset, length); + } + + @Override + public void shrArithmetic(int[] src, int bits, int[] dst, int offset, int length) { + validateUnaryInt(src, dst, offset, length); + validateRegistered(src, dst); + super.shrArithmetic(src, bits, dst, offset, length); + } + + @Override + public void cmpEq(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { + validateMaskBinaryInt(srcA, srcB, dstMask, offset, length); + validateRegistered(srcA, srcB, dstMask); + super.cmpEq(srcA, srcB, dstMask, offset, length); + } + + @Override + public void cmpLt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length) { + validateMaskBinaryInt(srcA, srcB, dstMask, offset, length); + validateRegistered(srcA, srcB, dstMask); + super.cmpLt(srcA, srcB, dstMask, offset, length); + } + + @Override + public void cmpGt(int[] srcA, int[] srcB, byte[] 
dstMask, int offset, int length) { + validateMaskBinaryInt(srcA, srcB, dstMask, offset, length); + validateRegistered(srcA, srcB, dstMask); + super.cmpGt(srcA, srcB, dstMask, offset, length); + } + + @Override + public void select(byte[] mask, int[] trueValues, int[] falseValues, int[] dst, int offset, int length) { + validateSelectInt(mask, trueValues, falseValues, dst, offset, length); + validateRegistered(mask, trueValues, falseValues, dst); + super.select(mask, trueValues, falseValues, dst, offset, length); + } + + @Override + public int sum(int[] src, int offset, int length) { + validateReductionInt(src, offset, length); + validateRegistered(src); + return super.sum(src, offset, length); + } + + @Override + public int dot(int[] srcA, int[] srcB, int offset, int length) { + validateDotInt(srcA, srcB, offset, length); + validateRegistered(srcA, srcB); + return super.dot(srcA, srcB, offset, length); + } + + @Override + public void add(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + validateBinaryFloat(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.add(srcA, srcB, dst, offset, length); + } + + @Override + public void sub(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + validateBinaryFloat(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.sub(srcA, srcB, dst, offset, length); + } + + @Override + public void mul(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + validateBinaryFloat(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.mul(srcA, srcB, dst, offset, length); + } + + @Override + public void min(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + validateBinaryFloat(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.min(srcA, srcB, dst, offset, length); + } + + @Override + public void max(float[] srcA, float[] srcB, float[] dst, int offset, int length) { + 
validateBinaryFloat(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.max(srcA, srcB, dst, offset, length); + } + + @Override + public void abs(float[] src, float[] dst, int offset, int length) { + validateUnaryFloat(src, dst, offset, length); + validateRegistered(src, dst); + super.abs(src, dst, offset, length); + } + + @Override + public void clamp(float[] src, float[] dst, float minValue, float maxValue, int offset, int length) { + if (minValue > maxValue) { + throw new IllegalArgumentException("minValue > maxValue"); + } + validateUnaryFloat(src, dst, offset, length); + validateRegistered(src, dst); + super.clamp(src, dst, minValue, maxValue, offset, length); + } + + @Override + public float sum(float[] src, int offset, int length) { + validateReductionFloat(src, offset, length); + validateRegistered(src); + return super.sum(src, offset, length); + } + + @Override + public float dot(float[] srcA, float[] srcB, int offset, int length) { + validateDotFloat(srcA, srcB, offset, length); + validateRegistered(srcA, srcB); + return super.dot(srcA, srcB, offset, length); + } + + @Override + public void shl(byte[] src, int bits, byte[] dst, int offset, int length) { + validateUnaryByte(src, dst, offset, length); + validateRegistered(src, dst); + super.shl(src, bits, dst, offset, length); + } + + @Override + public void shl(byte[] src, int srcOffset, int bits, byte[] dst, int dstOffset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, srcOffset, length, "src"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(src, dst); + super.shl(src, srcOffset, bits, dst, dstOffset, length); + } + + @Override + public void shrLogical(byte[] src, int bits, byte[] dst, int offset, int length) { + validateUnaryByte(src, dst, offset, length); + validateRegistered(src, dst); + super.shrLogical(src, bits, dst, offset, length); + } + + @Override + public void shrLogical(byte[] 
src, int srcOffset, int bits, byte[] dst, int dstOffset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, srcOffset, length, "src"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(src, dst); + super.shrLogical(src, srcOffset, bits, dst, dstOffset, length); + } + + @Override + public void addWrapping(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.addWrapping(srcA, srcB, dst, offset, length); + } + + @Override + public void addWrapping(byte[] src, byte value, byte[] dst, int offset, int length) { + validateUnaryByte(src, dst, offset, length); + validateRegistered(src, dst); + super.addWrapping(src, value, dst, offset, length); + } + + @Override + public void subWrapping(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length) { + validateBinaryByte(srcA, srcB, dst, offset, length); + validateRegistered(srcA, srcB, dst); + super.subWrapping(srcA, srcB, dst, offset, length); + } + + @Override + public void subWrapping(byte[] src, byte value, byte[] dst, int offset, int length) { + validateUnaryByte(src, dst, offset, length); + validateRegistered(src, dst); + super.subWrapping(src, value, dst, offset, length); + } + + @Override + public void unpackUnsignedByteToInt(byte[] src, int srcOffset, int[] dst, int dstOffset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, srcOffset, length, "src"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(src, dst); + super.unpackUnsignedByteToInt(src, srcOffset, dst, dstOffset, length); + } + + @Override + public void unpackUnsignedByteToIntInterleaved3(byte[] src, int srcOffset, int[] dst, int dst0Offset, int dst1Offset, int dst2Offset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, srcOffset, 
length * 3, "src"); + validateRange(dst.length, dst0Offset, length, "dst"); + validateRange(dst.length, dst1Offset, length, "dst"); + validateRange(dst.length, dst2Offset, length, "dst"); + validateRegistered(src, dst); + super.unpackUnsignedByteToIntInterleaved3(src, srcOffset, dst, dst0Offset, dst1Offset, dst2Offset, length); + } + + @Override + public void unpackBytesInterleaved3(byte[] src, int srcOffset, byte[] dst0, byte[] dst1, byte[] dst2, int length) { + validateNotNull(src, "src"); + validateNotNull(dst0, "dst0"); + validateNotNull(dst1, "dst1"); + validateNotNull(dst2, "dst2"); + validateRange(src.length, srcOffset, length * 3, "src"); + validateRange(dst0.length, 0, length, "dst0"); + validateRange(dst1.length, 0, length, "dst1"); + validateRange(dst2.length, 0, length, "dst2"); + validateRegistered(src, dst0, dst1, dst2); + super.unpackBytesInterleaved3(src, srcOffset, dst0, dst1, dst2, length); + } + + @Override + public void unpackBytesInterleaved3(byte[] src, int srcOffset, byte[] dst, int dst0Offset, int dst1Offset, int dst2Offset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, srcOffset, length * 3, "src"); + validateRange(dst.length, dst0Offset, length, "dst"); + validateRange(dst.length, dst1Offset, length, "dst"); + validateRange(dst.length, dst2Offset, length, "dst"); + validateRegistered(src, dst); + super.unpackBytesInterleaved3(src, srcOffset, dst, dst0Offset, dst1Offset, dst2Offset, length); + } + + @Override + public void unpackBytesInterleaved4(byte[] src, int srcOffset, byte[] dst0, byte[] dst1, byte[] dst2, byte[] dst3, int length) { + validateNotNull(src, "src"); + validateNotNull(dst0, "dst0"); + validateNotNull(dst1, "dst1"); + validateNotNull(dst2, "dst2"); + validateNotNull(dst3, "dst3"); + validateRange(src.length, srcOffset, length * 4, "src"); + validateRange(dst0.length, 0, length, "dst0"); + validateRange(dst1.length, 0, length, "dst1"); + validateRange(dst2.length, 0, 
length, "dst2"); + validateRange(dst3.length, 0, length, "dst3"); + validateRegistered(src, dst0, dst1, dst2, dst3); + super.unpackBytesInterleaved4(src, srcOffset, dst0, dst1, dst2, dst3, length); + } + + @Override + public void unpackBytesInterleaved4(byte[] src, int srcOffset, byte[] dst, int dst0Offset, int dst1Offset, int dst2Offset, int dst3Offset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, srcOffset, length * 4, "src"); + validateRange(dst.length, dst0Offset, length, "dst"); + validateRange(dst.length, dst1Offset, length, "dst"); + validateRange(dst.length, dst2Offset, length, "dst"); + validateRange(dst.length, dst3Offset, length, "dst"); + validateRegistered(src, dst); + super.unpackBytesInterleaved4(src, srcOffset, dst, dst0Offset, dst1Offset, dst2Offset, dst3Offset, length); + } + + @Override + public int unpackLookupBytesInterleaved4(byte[] table, byte[] src, int srcOffset, byte[] dst, int dst0Offset, int dst1Offset, int dst2Offset, int dst3Offset, int length) { + validateNotNull(table, "table"); + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, srcOffset, length * 4, "src"); + validateRange(dst.length, dst0Offset, length, "dst"); + validateRange(dst.length, dst1Offset, length, "dst"); + validateRange(dst.length, dst2Offset, length, "dst"); + validateRange(dst.length, dst3Offset, length, "dst"); + validateRegistered(table, src, dst); + return super.unpackLookupBytesInterleaved4(table, src, srcOffset, dst, dst0Offset, dst1Offset, dst2Offset, dst3Offset, length); + } + + @Override + public void add(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dst, "dst"); + validateRange(srcA.length, srcAOffset, length, "srcA"); + validateRange(srcB.length, srcBOffset, length, "srcB"); + validateRange(dst.length, dstOffset, length, "dst"); 
+ validateRegistered(srcA, srcB, dst); + super.add(srcA, srcAOffset, srcB, srcBOffset, dst, dstOffset, length); + } + + @Override + public void cmpEq(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, byte[] dstMask, int dstOffset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dstMask, "dstMask"); + validateRange(srcA.length, srcAOffset, length, "srcA"); + validateRange(srcB.length, srcBOffset, length, "srcB"); + validateRange(dstMask.length, dstOffset, length, "dstMask"); + validateRegistered(srcA, srcB, dstMask); + super.cmpEq(srcA, srcAOffset, srcB, srcBOffset, dstMask, dstOffset, length); + } + + @Override + public void cmpLt(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, byte[] dstMask, int dstOffset, int length) { + validateNotNull(srcA, "srcA"); + validateNotNull(srcB, "srcB"); + validateNotNull(dstMask, "dstMask"); + validateRange(srcA.length, srcAOffset, length, "srcA"); + validateRange(srcB.length, srcBOffset, length, "srcB"); + validateRange(dstMask.length, dstOffset, length, "dstMask"); + validateRegistered(srcA, srcB, dstMask); + super.cmpLt(srcA, srcAOffset, srcB, srcBOffset, dstMask, dstOffset, length); + } + + @Override + public void select(byte[] mask, int maskOffset, int[] trueValues, int trueOffset, int[] falseValues, int falseOffset, int[] dst, int dstOffset, int length) { + validateNotNull(mask, "mask"); + validateNotNull(trueValues, "trueValues"); + validateNotNull(falseValues, "falseValues"); + validateNotNull(dst, "dst"); + validateRange(mask.length, maskOffset, length, "mask"); + validateRange(trueValues.length, trueOffset, length, "trueValues"); + validateRange(falseValues.length, falseOffset, length, "falseValues"); + validateRange(dst.length, dstOffset, length, "dst"); + validateRegistered(mask, trueValues, falseValues, dst); + super.select(mask, maskOffset, trueValues, trueOffset, falseValues, falseOffset, dst, dstOffset, length); + } + + @Override + public void 
packIntToByteTruncateInterleaved4(int[] src, int src0Offset, int src1Offset, int src2Offset, int src3Offset, byte[] dst, int dstOffset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, src0Offset, length, "src"); + validateRange(src.length, src1Offset, length, "src"); + validateRange(src.length, src2Offset, length, "src"); + validateRange(src.length, src3Offset, length, "src"); + validateRange(dst.length, dstOffset, length * 4, "dst"); + validateRegistered(src, dst); + super.packIntToByteTruncateInterleaved4(src, src0Offset, src1Offset, src2Offset, src3Offset, dst, dstOffset, length); + } + + @Override + public void packBytesInterleaved3(byte[] src0, byte[] src1, byte[] src2, byte[] dst, int dstOffset, int length) { + validateNotNull(src0, "src0"); + validateNotNull(src1, "src1"); + validateNotNull(src2, "src2"); + validateNotNull(dst, "dst"); + validateRange(src0.length, 0, length, "src0"); + validateRange(src1.length, 0, length, "src1"); + validateRange(src2.length, 0, length, "src2"); + validateRange(dst.length, dstOffset, length * 3, "dst"); + validateRegistered(src0, src1, src2, dst); + super.packBytesInterleaved3(src0, src1, src2, dst, dstOffset, length); + } + + @Override + public void packBytesInterleaved3(byte[] src, int src0Offset, int src1Offset, int src2Offset, byte[] dst, int dstOffset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, src0Offset, length, "src"); + validateRange(src.length, src1Offset, length, "src"); + validateRange(src.length, src2Offset, length, "src"); + validateRange(dst.length, dstOffset, length * 3, "dst"); + validateRegistered(src, dst); + super.packBytesInterleaved3(src, src0Offset, src1Offset, src2Offset, dst, dstOffset, length); + } + + @Override + public void packBytesInterleaved4(byte[] src0, byte[] src1, byte[] src2, byte[] src3, byte[] dst, int dstOffset, int length) { + validateNotNull(src0, "src0"); + 
validateNotNull(src1, "src1"); + validateNotNull(src2, "src2"); + validateNotNull(src3, "src3"); + validateNotNull(dst, "dst"); + validateRange(src0.length, 0, length, "src0"); + validateRange(src1.length, 0, length, "src1"); + validateRange(src2.length, 0, length, "src2"); + validateRange(src3.length, 0, length, "src3"); + validateRange(dst.length, dstOffset, length * 4, "dst"); + validateRegistered(src0, src1, src2, src3, dst); + super.packBytesInterleaved4(src0, src1, src2, src3, dst, dstOffset, length); + } + + @Override + public void packBytesInterleaved4(byte[] src, int src0Offset, int src1Offset, int src2Offset, int src3Offset, byte[] dst, int dstOffset, int length) { + validateNotNull(src, "src"); + validateNotNull(dst, "dst"); + validateRange(src.length, src0Offset, length, "src"); + validateRange(src.length, src1Offset, length, "src"); + validateRange(src.length, src2Offset, length, "src"); + validateRange(src.length, src3Offset, length, "src"); + validateRange(dst.length, dstOffset, length * 4, "dst"); + validateRegistered(src, dst); + super.packBytesInterleaved4(src, src0Offset, src1Offset, src2Offset, src3Offset, dst, dstOffset, length); + } + + private void validateRegistered(Object... arrays) { + for (int i = 0; i < arrays.length; i++) { + Object arr = arrays[i]; + Integer id = Integer.valueOf(System.identityHashCode(arr)); + if (!allocatedIds.contains(id)) { + throw new IllegalArgumentException( + "SIMD array argument was not allocated using Simd.alloc*(). 
objectId=" + id.intValue()); + } + } + } +} diff --git a/Ports/iOSPort/nativeSources/IOSSimd.m b/Ports/iOSPort/nativeSources/IOSSimd.m new file mode 100644 index 0000000000..362a6fdb3f --- /dev/null +++ b/Ports/iOSPort/nativeSources/IOSSimd.m @@ -0,0 +1,1535 @@ +#include "xmlvm.h" +#include +#include +#include +#include + +static JAVA_ARRAY_BYTE cn1_saturating_byte(int value) { + if (value > 127) { + return 127; + } + if (value < -128) { + return -128; + } + return (JAVA_ARRAY_BYTE)value; +} + +static void cn1_store_u8x16_to_ints(uint8x16_t v, JAVA_ARRAY_INT* d, int offset) { + uint16x8_t lo16 = vmovl_u8(vget_low_u8(v)); + uint16x8_t hi16 = vmovl_u8(vget_high_u8(v)); + uint32x4_t x0 = vmovl_u16(vget_low_u16(lo16)); + uint32x4_t x1 = vmovl_u16(vget_high_u16(lo16)); + uint32x4_t x2 = vmovl_u16(vget_low_u16(hi16)); + uint32x4_t x3 = vmovl_u16(vget_high_u16(hi16)); + vst1q_s32((int32_t*)(d + offset), vreinterpretq_s32_u32(x0)); + vst1q_s32((int32_t*)(d + offset + 4), vreinterpretq_s32_u32(x1)); + vst1q_s32((int32_t*)(d + offset + 8), vreinterpretq_s32_u32(x2)); + vst1q_s32((int32_t*)(d + offset + 12), vreinterpretq_s32_u32(x3)); +} + +static uint8x16_t cn1_load_ints_to_u8x16(JAVA_ARRAY_INT* s, int offset) { + int16x8_t lo16 = vcombine_s16( + vmovn_s32(vld1q_s32((int32_t*)(s + offset))), + vmovn_s32(vld1q_s32((int32_t*)(s + offset + 4)))); + int16x8_t hi16 = vcombine_s16( + vmovn_s32(vld1q_s32((int32_t*)(s + offset + 8))), + vmovn_s32(vld1q_s32((int32_t*)(s + offset + 12)))); + int8x16_t out = vcombine_s8(vmovn_s16(lo16), vmovn_s16(hi16)); + return vreinterpretq_u8_s8(out); +} + +JAVA_OBJECT com_codename1_impl_ios_IOSSimd_allocByteNative___int_R_byte_1ARRAY(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_INT size) { + return allocArrayAligned(threadStateData, size, &class_array1__JAVA_BYTE, sizeof(JAVA_ARRAY_BYTE), 1, 16); +} + +JAVA_OBJECT com_codename1_impl_ios_IOSSimd_allocIntNative___int_R_int_1ARRAY(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, 
JAVA_INT size) { + return allocArrayAligned(threadStateData, size, &class_array1__JAVA_INT, sizeof(JAVA_ARRAY_INT), 1, 16); +} + +JAVA_OBJECT com_codename1_impl_ios_IOSSimd_allocFloatNative___int_R_float_1ARRAY(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_INT size) { + return allocArrayAligned(threadStateData, size, &class_array1__JAVA_FLOAT, sizeof(JAVA_ARRAY_FLOAT), 1, 16); +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_lookupBytes___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT table, JAVA_OBJECT indices, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* t = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)table)->data; + JAVA_ARRAY_BYTE* idx = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)indices)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int tableLen = ((JAVA_ARRAY)table)->length; + int i = offset; + int end = offset + length; + for (; i < end; i++) { + // Java byte values are signed, but lookup-style consumers intentionally use + // them as unsigned indices into byte tables. Indices beyond the table length + // resolve to 0 to mirror the generic Java fallback. + int lookupIndex = idx[i] & 0xff; + d[i] = lookupIndex < tableLen ? t[lookupIndex] : 0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_lookupBytes___byte_1ARRAY_byte_1ARRAY_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT table, JAVA_OBJECT indices, JAVA_INT indicesOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_BYTE* t = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)table)->data; + JAVA_ARRAY_BYTE* idx = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)indices)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int tableLen = ((JAVA_ARRAY)table)->length; + for (int i = 0; i < length; i++) { + int lookupIndex = idx[indicesOffset + i] & 0xff; + d[dstOffset + i] = lookupIndex < tableLen ? 
t[lookupIndex] : 0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_add___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + int8x16_t vc = vqaddq_s8(va, vb); + vst1q_s8((int8_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = cn1_saturating_byte((int)a[i] + (int)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_sub___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + int8x16_t vc = vqsubq_s8(va, vb); + vst1q_s8((int8_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = cn1_saturating_byte((int)a[i] - (int)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_mul___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = 
(JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + int16x8_t low = vmull_s8(vget_low_s8(va), vget_low_s8(vb)); + int16x8_t high = vmull_s8(vget_high_s8(va), vget_high_s8(vb)); + int8x8_t low8 = vqmovn_s16(low); + int8x8_t high8 = vqmovn_s16(high); + int8x16_t out = vcombine_s8(low8, high8); + vst1q_s8((int8_t*)(d + i), out); + } + for (; i < end; i++) { + d[i] = cn1_saturating_byte((int)a[i] * (int)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_min___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + int8x16_t vc = vminq_s8(va, vb); + vst1q_s8((int8_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = a[i] < b[i] ? 
a[i] : b[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_max___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + int8x16_t vc = vmaxq_s8(va, vb); + vst1q_s8((int8_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = a[i] > b[i] ? a[i] : b[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_abs___byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t vs = vld1q_s8((int8_t*)(s + i)); + int8x16_t vd = vqabsq_s8(vs); + vst1q_s8((int8_t*)(d + i), vd); + } + for (; i < end; i++) { + int v = s[i]; + d[i] = v == -128 ? 
127 : (JAVA_ARRAY_BYTE)abs(v);
+    }
+}
+
+// Clamps every src[i], i in [offset, offset + length), into
+// [minValue, maxValue] and writes it to dst[i]. 16 bytes are processed per NEON
+// iteration; the scalar loop finishes the remaining (< 16) elements.
+// No null or bounds checks are done here - presumably the Java caller validates
+// the arrays and range (TODO confirm).
+// NOTE(review): for an inverted range (minValue > maxValue) the vector path
+// always produces minValue, while the scalar tail produces maxValue for inputs
+// >= minValue - confirm callers never pass such a range.
+JAVA_VOID com_codename1_impl_ios_IOSSimd_clamp___byte_1ARRAY_byte_1ARRAY_byte_byte_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_BYTE minValue, JAVA_BYTE maxValue, JAVA_INT offset, JAVA_INT length) {
+    JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data;
+    JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data;
+    int i = offset;
+    int end = offset + length;
+    int8x16_t vminv = vdupq_n_s8((int8_t)minValue);
+    int8x16_t vmaxv = vdupq_n_s8((int8_t)maxValue);
+    for (; i <= end - 16; i += 16) {
+        int8x16_t vs = vld1q_s8((int8_t*)(s + i));
+        int8x16_t vc = vmaxq_s8(vminv, vminq_s8(vs, vmaxv));
+        vst1q_s8((int8_t*)(d + i), vc);
+    }
+    for (; i < end; i++) {
+        int v = s[i];
+        if (v < minValue) {
+            d[i] = minValue;
+        } else if (v > maxValue) {
+            d[i] = maxValue;
+        } else {
+            d[i] = (JAVA_ARRAY_BYTE)v;
+        }
+    }
+}
+
+// Element-wise dst[i] = srcA[i] + srcB[i] for i in [offset, offset + length),
+// with 32-bit wraparound on overflow. Four ints are processed per NEON
+// iteration; the scalar loop finishes the remaining (< 4) elements.
+// No null or bounds checks are done here - presumably the Java caller validates
+// the arrays and range (TODO confirm).
+JAVA_VOID com_codename1_impl_ios_IOSSimd_add___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) {
+    JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data;
+    JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data;
+    JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data;
+    int i = offset;
+    int end = offset + length;
+    for (; i <= end - 4; i += 4) {
+        int32x4_t va = vld1q_s32((int32_t*)(a + i));
+        int32x4_t vb = vld1q_s32((int32_t*)(b + i));
+        int32x4_t vc = vaddq_s32(va, vb);
+        vst1q_s32((int32_t*)(d + i), vc);
+    }
+    for (; i < end; i++) {
+        // Add as unsigned: signed overflow is undefined in C, whereas the
+        // vector path (vaddq_s32) wraps modulo 2^32.
+        d[i] = (JAVA_ARRAY_INT)((uint32_t)a[i] + (uint32_t)b[i]);
+    }
+}
+
+JAVA_VOID com_codename1_impl_ios_IOSSimd_sub___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) {
+    JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data;
+    JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data;
+    JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data;
+    int i = offset;
+    int end = offset + length;
+    for (; i <= end - 4; i += 4) {
+        int32x4_t va = vld1q_s32((int32_t*)(a + i));
+        int32x4_t vb = vld1q_s32((int32_t*)(b + i));
+        int32x4_t vc = vsubq_s32(va, vb);
+        vst1q_s32((int32_t*)(d + i), vc);
+    }
+    for (; i < end; i++) {
+        // Subtract as unsigned: signed overflow is undefined in C, whereas the
+        // vector path (vsubq_s32) wraps modulo 2^32.
+        d[i] = (JAVA_ARRAY_INT)((uint32_t)a[i] - (uint32_t)b[i]);
+    }
+}
+
+// Element-wise dst[i] = srcA[i] * srcB[i] for i in [offset, offset + length),
+// keeping the low 32 bits of each product. Four ints are processed per NEON
+// iteration; the scalar loop finishes the remaining (< 4) elements.
+// No null or bounds checks are done here - presumably the Java caller validates
+// the arrays and range (TODO confirm).
+JAVA_VOID com_codename1_impl_ios_IOSSimd_mul___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) {
+    JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data;
+    JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data;
+    JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data;
+    int i = offset;
+    int end = offset + length;
+    for (; i <= end - 4; i += 4) {
+        int32x4_t va = vld1q_s32((int32_t*)(a + i));
+        int32x4_t vb = vld1q_s32((int32_t*)(b + i));
+        int32x4_t vc = vmulq_s32(va, vb);
+        vst1q_s32((int32_t*)(d + i), vc);
+    }
+    for (; i < end; i++) {
+        // Multiply as unsigned: signed overflow is undefined in C, whereas the
+        // vector path (vmulq_s32) keeps the low 32 bits of the product.
+        d[i] = (JAVA_ARRAY_INT)((uint32_t)a[i] * (uint32_t)b[i]);
+    }
+}
+
+JAVA_VOID com_codename1_impl_ios_IOSSimd_min___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) {
+    JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data;
+    JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data;
+    JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data;
+    int i = offset;
+    int end = offset + length;
+    for (; i <= end - 4; i += 4) {
+        int32x4_t va = vld1q_s32((int32_t*)(a + i));
+        int32x4_t vb = vld1q_s32((int32_t*)(b + i));
+        int32x4_t vc = vminq_s32(va, vb);
+        vst1q_s32((int32_t*)(d + i), vc);
+    }
+    for (; i < end; i++) {
+        d[i] = a[i] < b[i] ?
a[i] : b[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_max___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + int32x4_t va = vld1q_s32((int32_t*)(a + i)); + int32x4_t vb = vld1q_s32((int32_t*)(b + i)); + int32x4_t vc = vmaxq_s32(va, vb); + vst1q_s32((int32_t*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = a[i] > b[i] ? a[i] : b[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_abs___int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + int32x4_t vs = vld1q_s32((int32_t*)(s + i)); + int32x4_t vd = vqabsq_s32(vs); + vst1q_s32((int32_t*)(d + i), vd); + } + for (; i < end; i++) { + int32_t v = (int32_t)s[i]; + d[i] = (JAVA_ARRAY_INT)(v == INT32_MIN ? INT32_MAX : (v < 0 ? 
-v : v)); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_clamp___int_1ARRAY_int_1ARRAY_int_int_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT minValue, JAVA_INT maxValue, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + int32x4_t vminv = vdupq_n_s32((int32_t)minValue); + int32x4_t vmaxv = vdupq_n_s32((int32_t)maxValue); + for (; i <= end - 4; i += 4) { + int32x4_t vs = vld1q_s32((int32_t*)(s + i)); + int32x4_t vc = vmaxq_s32(vminv, vminq_s32(vs, vmaxv)); + vst1q_s32((int32_t*)(d + i), vc); + } + for (; i < end; i++) { + int v = s[i]; + if (v < minValue) { + d[i] = minValue; + } else if (v > maxValue) { + d[i] = maxValue; + } else { + d[i] = (JAVA_ARRAY_INT)v; + } + } +} + +JAVA_INT com_codename1_impl_ios_IOSSimd_sum___int_1ARRAY_int_int_R_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + int i = offset; + int end = offset + length; + int64_t total = 0; + int32x4_t vacc = vdupq_n_s32(0); + for (; i <= end - 4; i += 4) { + int32x4_t vs = vld1q_s32((int32_t*)(s + i)); + vacc = vaddq_s32(vacc, vs); + } + int32_t partial[4]; + vst1q_s32(partial, vacc); + total += (int64_t)partial[0] + (int64_t)partial[1] + (int64_t)partial[2] + (int64_t)partial[3]; + for (; i < end; i++) { + total += (int64_t)((int32_t)s[i]); + } + return (JAVA_INT)((int32_t)total); +} + +JAVA_INT com_codename1_impl_ios_IOSSimd_dot___int_1ARRAY_int_1ARRAY_int_int_R_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + int i = offset; + int end = 
offset + length; + int64_t total = 0; + int32x4_t vacc = vdupq_n_s32(0); + for (; i <= end - 4; i += 4) { + int32x4_t va = vld1q_s32((int32_t*)(a + i)); + int32x4_t vb = vld1q_s32((int32_t*)(b + i)); + vacc = vaddq_s32(vacc, vmulq_s32(va, vb)); + } + int32_t partial[4]; + vst1q_s32(partial, vacc); + total += (int64_t)partial[0] + (int64_t)partial[1] + (int64_t)partial[2] + (int64_t)partial[3]; + for (; i < end; i++) { + total += (int64_t)((int32_t)a[i]) * (int64_t)((int32_t)b[i]); + } + return (JAVA_INT)((int32_t)total); +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_add___float_1ARRAY_float_1ARRAY_float_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_FLOAT* a = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_FLOAT* b = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + float32x4_t va = vld1q_f32((float*)(a + i)); + float32x4_t vb = vld1q_f32((float*)(b + i)); + float32x4_t vc = vaddq_f32(va, vb); + vst1q_f32((float*)(d + i), vc); + } + for (; i < end; i++) { + d[i] = a[i] + b[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_sub___float_1ARRAY_float_1ARRAY_float_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_FLOAT* a = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_FLOAT* b = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + float32x4_t va = vld1q_f32((float*)(a + i)); + float32x4_t vb = vld1q_f32((float*)(b + i)); + float32x4_t vc = vsubq_f32(va, vb); + vst1q_f32((float*)(d + i), vc); + } + for (; i < end; 
i++) {
+        d[i] = a[i] - b[i];
+    }
+}
+
+// Element-wise dst[i] = srcA[i] * srcB[i] for i in [offset, offset + length).
+// Four floats are processed per NEON iteration; the scalar loop finishes the
+// remaining (< 4) elements.
+// No null or bounds checks are done here - presumably the Java caller validates
+// the arrays and range (TODO confirm).
+JAVA_VOID com_codename1_impl_ios_IOSSimd_mul___float_1ARRAY_float_1ARRAY_float_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) {
+    JAVA_ARRAY_FLOAT* a = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcA)->data;
+    JAVA_ARRAY_FLOAT* b = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcB)->data;
+    JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data;
+    int i = offset;
+    int end = offset + length;
+    for (; i <= end - 4; i += 4) {
+        float32x4_t va = vld1q_f32((float*)(a + i));
+        float32x4_t vb = vld1q_f32((float*)(b + i));
+        float32x4_t vc = vmulq_f32(va, vb);
+        vst1q_f32((float*)(d + i), vc);
+    }
+    for (; i < end; i++) {
+        d[i] = a[i] * b[i];
+    }
+}
+
+// Element-wise dst[i] = min(srcA[i], srcB[i]) for i in [offset, offset + length).
+// Four floats are processed per NEON iteration; the scalar loop finishes the
+// remaining (< 4) elements using the same NEON min operation, so NaN and
+// signed-zero handling do not depend on where an element sits in the range.
+// No null or bounds checks are done here - presumably the Java caller validates
+// the arrays and range (TODO confirm).
+JAVA_VOID com_codename1_impl_ios_IOSSimd_min___float_1ARRAY_float_1ARRAY_float_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) {
+    JAVA_ARRAY_FLOAT* a = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcA)->data;
+    JAVA_ARRAY_FLOAT* b = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcB)->data;
+    JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data;
+    int i = offset;
+    int end = offset + length;
+    for (; i <= end - 4; i += 4) {
+        float32x4_t va = vld1q_f32((float*)(a + i));
+        float32x4_t vb = vld1q_f32((float*)(b + i));
+        float32x4_t vc = vminq_f32(va, vb);
+        vst1q_f32((float*)(d + i), vc);
+    }
+    for (; i < end; i++) {
+        // fminf() returns the non-NaN operand when exactly one input is NaN,
+        // while vminq_f32 above yields NaN (as Java's Math.min does). Use the
+        // 64-bit NEON min so the tail matches the vector body exactly.
+        d[i] = vget_lane_f32(vmin_f32(vdup_n_f32(a[i]), vdup_n_f32(b[i])), 0);
+    }
+}
+
+JAVA_VOID com_codename1_impl_ios_IOSSimd_max___float_1ARRAY_float_1ARRAY_float_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) {
+    JAVA_ARRAY_FLOAT* a = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcA)->data;
+    JAVA_ARRAY_FLOAT* b = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcB)->data;
+    JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data;
+    int i
= offset;
+    int end = offset + length;
+    for (; i <= end - 4; i += 4) {
+        float32x4_t va = vld1q_f32((float*)(a + i));
+        float32x4_t vb = vld1q_f32((float*)(b + i));
+        float32x4_t vc = vmaxq_f32(va, vb);
+        vst1q_f32((float*)(d + i), vc);
+    }
+    for (; i < end; i++) {
+        // fmaxf() returns the non-NaN operand when exactly one input is NaN,
+        // while vmaxq_f32 above yields NaN (as Java's Math.max does). Use the
+        // 64-bit NEON max so the tail matches the vector body exactly.
+        d[i] = vget_lane_f32(vmax_f32(vdup_n_f32(a[i]), vdup_n_f32(b[i])), 0);
+    }
+}
+
+// Element-wise dst[i] = |src[i]| for i in [offset, offset + length).
+// Four floats are processed per NEON iteration; the scalar loop finishes the
+// remaining (< 4) elements.
+// No null or bounds checks are done here - presumably the Java caller validates
+// the arrays and range (TODO confirm).
+JAVA_VOID com_codename1_impl_ios_IOSSimd_abs___float_1ARRAY_float_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) {
+    JAVA_ARRAY_FLOAT* s = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)src)->data;
+    JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data;
+    int i = offset;
+    int end = offset + length;
+    for (; i <= end - 4; i += 4) {
+        float32x4_t vs = vld1q_f32((float*)(s + i));
+        float32x4_t vd = vabsq_f32(vs);
+        vst1q_f32((float*)(d + i), vd);
+    }
+    for (; i < end; i++) {
+        d[i] = fabsf(s[i]);
+    }
+}
+
+// Clamps every src[i], i in [offset, offset + length), into
+// [minValue, maxValue] and writes it to dst[i]. Four floats are processed per
+// NEON iteration; the scalar loop finishes the remaining (< 4) elements.
+// No null or bounds checks are done here - presumably the Java caller validates
+// the arrays and range (TODO confirm).
+// NOTE(review): for an inverted range (minValue > maxValue) the vector path
+// always produces minValue, while the scalar tail produces maxValue for inputs
+// >= minValue - confirm callers never pass such a range.
+JAVA_VOID com_codename1_impl_ios_IOSSimd_clamp___float_1ARRAY_float_1ARRAY_float_float_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_FLOAT minValue, JAVA_FLOAT maxValue, JAVA_INT offset, JAVA_INT length) {
+    JAVA_ARRAY_FLOAT* s = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)src)->data;
+    JAVA_ARRAY_FLOAT* d = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)dst)->data;
+    int i = offset;
+    int end = offset + length;
+    float32x4_t vminv = vdupq_n_f32((float)minValue);
+    float32x4_t vmaxv = vdupq_n_f32((float)maxValue);
+    for (; i <= end - 4; i += 4) {
+        float32x4_t vs = vld1q_f32((float*)(s + i));
+        float32x4_t vc = vmaxq_f32(vminv, vminq_f32(vs, vmaxv));
+        vst1q_f32((float*)(d + i), vc);
+    }
+    for (; i < end; i++) {
+        float v = s[i];
+        if (v < minValue) {
+            d[i] = minValue;
+        } else if (v > maxValue) {
+            d[i] = maxValue;
+        } else {
+            d[i] = v;
+        }
+    }
+}
+
+JAVA_FLOAT com_codename1_impl_ios_IOSSimd_sum___float_1ARRAY_int_int_R_float(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT offset, JAVA_INT length) {
+    
JAVA_ARRAY_FLOAT* s = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)src)->data; + int i = offset; + int end = offset + length; + float total = 0.f; + float32x4_t vacc = vdupq_n_f32(0.f); + for (; i <= end - 4; i += 4) { + float32x4_t vs = vld1q_f32((float*)(s + i)); + vacc = vaddq_f32(vacc, vs); + } + float partial[4]; + vst1q_f32(partial, vacc); + total += partial[0] + partial[1] + partial[2] + partial[3]; + for (; i < end; i++) { + total += s[i]; + } + return (JAVA_FLOAT)total; +} + +JAVA_FLOAT com_codename1_impl_ios_IOSSimd_dot___float_1ARRAY_float_1ARRAY_int_int_R_float(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_FLOAT* a = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_FLOAT* b = (JAVA_ARRAY_FLOAT*)((JAVA_ARRAY)srcB)->data; + int i = offset; + int end = offset + length; + float total = 0.f; + float32x4_t vacc = vdupq_n_f32(0.f); + for (; i <= end - 4; i += 4) { + float32x4_t va = vld1q_f32((float*)(a + i)); + float32x4_t vb = vld1q_f32((float*)(b + i)); + vacc = vaddq_f32(vacc, vmulq_f32(va, vb)); + } + float partial[4]; + vst1q_f32(partial, vacc); + total += partial[0] + partial[1] + partial[2] + partial[3]; + for (; i < end; i++) { + total += a[i] * b[i]; + } + return (JAVA_FLOAT)total; +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_and___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + vst1q_s8((int8_t*)(d + i), vandq_s8(va, vb)); + } + for (; i < end; i++) { + 
d[i] = (JAVA_ARRAY_BYTE)(a[i] & b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_and___byte_1ARRAY_int_byte_1ARRAY_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_INT srcAOffset, JAVA_OBJECT srcB, JAVA_INT srcBOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + srcAOffset + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + srcBOffset + i)); + vst1q_s8((int8_t*)(d + dstOffset + i), vandq_s8(va, vb)); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_BYTE)(a[srcAOffset + i] & b[srcBOffset + i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_or___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + vst1q_s8((int8_t*)(d + i), vorrq_s8(va, vb)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)(a[i] | b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_or___byte_1ARRAY_int_byte_1ARRAY_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_INT srcAOffset, JAVA_OBJECT srcB, JAVA_INT srcBOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = 
(JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + srcAOffset + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + srcBOffset + i)); + vst1q_s8((int8_t*)(d + dstOffset + i), vorrq_s8(va, vb)); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_BYTE)(a[srcAOffset + i] | b[srcBOffset + i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_xor___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + vst1q_s8((int8_t*)(d + i), veorq_s8(va, vb)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)(a[i] ^ b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_not___byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t vs = vld1q_s8((int8_t*)(s + i)); + vst1q_s8((int8_t*)(d + i), vmvnq_s8(vs)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)(~s[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpEq___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dstMask, JAVA_INT offset, 
JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + uint8x16_t cmp = vceqq_s8(va, vb); + vst1q_u8((uint8_t*)(m + i), cmp); + } + for (; i < end; i++) { + m[i] = a[i] == b[i] ? (JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpEq___byte_1ARRAY_byte_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_BYTE value, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int i = offset; + int end = offset + length; + int8x16_t vv = vdupq_n_s8((int8_t)value); + for (; i <= end - 16; i += 16) { + int8x16_t vs = vld1q_s8((int8_t*)(s + i)); + vst1q_s8((int8_t*)(m + i), vreinterpretq_s8_u8(vceqq_s8(vs, vv))); + } + for (; i < end; i++) { + m[i] = s[i] == value ? 
(JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpLt___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + uint8x16_t cmp = vcltq_s8(va, vb); + vst1q_u8((uint8_t*)(m + i), cmp); + } + for (; i < end; i++) { + m[i] = a[i] < b[i] ? (JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpLt___byte_1ARRAY_byte_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_BYTE value, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int i = offset; + int end = offset + length; + int8x16_t vv = vdupq_n_s8((int8_t)value); + for (; i <= end - 16; i += 16) { + int8x16_t vs = vld1q_s8((int8_t*)(s + i)); + vst1q_s8((int8_t*)(m + i), vreinterpretq_s8_u8(vcltq_s8(vs, vv))); + } + for (; i < end; i++) { + m[i] = s[i] < value ? 
(JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpGt___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + int8x16_t va = vld1q_s8((int8_t*)(a + i)); + int8x16_t vb = vld1q_s8((int8_t*)(b + i)); + uint8x16_t cmp = vcgtq_s8(va, vb); + vst1q_u8((uint8_t*)(m + i), cmp); + } + for (; i < end; i++) { + m[i] = a[i] > b[i] ? (JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpRange___byte_1ARRAY_byte_byte_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_BYTE minValue, JAVA_BYTE maxValue, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int i = offset; + int end = offset + length; + int8x16_t vminv = vdupq_n_s8((int8_t)minValue); + int8x16_t vmaxv = vdupq_n_s8((int8_t)maxValue); + for (; i <= end - 16; i += 16) { + int8x16_t vs = vld1q_s8((int8_t*)(s + i)); + uint8x16_t ge = vcgeq_s8(vs, vminv); + uint8x16_t le = vcleq_s8(vs, vmaxv); + vst1q_u8((uint8_t*)(m + i), vandq_u8(ge, le)); + } + for (; i < end; i++) { + int v = s[i]; + m[i] = v >= minValue && v <= maxValue ? 
(JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_select___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT mask, JAVA_OBJECT trueValues, JAVA_OBJECT falseValues, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)mask)->data; + JAVA_ARRAY_BYTE* t = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)trueValues)->data; + JAVA_ARRAY_BYTE* f = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)falseValues)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + int8x16_t zero = vdupq_n_s8(0); + for (; i <= end - 16; i += 16) { + int8x16_t vm = vld1q_s8((int8_t*)(m + i)); + int8x16_t vt = vld1q_s8((int8_t*)(t + i)); + int8x16_t vf = vld1q_s8((int8_t*)(f + i)); + uint8x16_t isZero = vceqq_s8(vm, zero); + uint8x16_t out = vbslq_u8(isZero, vreinterpretq_u8_s8(vf), vreinterpretq_u8_s8(vt)); + vst1q_s8((int8_t*)(d + i), vreinterpretq_s8_u8(out)); + } + for (; i < end; i++) { + d[i] = m[i] != 0 ? 
t[i] : f[i];
+    }
+}
+
+// Zero-extends every byte src[i], i in [offset, offset + length), to an int in
+// 0..255 and writes it to dst[i]. 16 elements are widened per NEON iteration
+// through cn1_store_u8x16_to_ints; the scalar loop finishes the remaining
+// (< 16) elements.
+// No null or bounds checks are done here - presumably the Java caller validates
+// the arrays and range (TODO confirm).
+JAVA_VOID com_codename1_impl_ios_IOSSimd_unpackUnsignedByteToInt___byte_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) {
+    JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data;
+    JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data;
+    int i = offset;
+    int end = offset + length;
+    for (; i <= end - 16; i += 16) {
+        // Shared helper performs the u8 -> u16 -> u32 widening and the four
+        // 4-int stores starting at d[i].
+        cn1_store_u8x16_to_ints(vld1q_u8((uint8_t*)(s + i)), d, i);
+    }
+    for (; i < end; i++) {
+        d[i] = (JAVA_ARRAY_INT)(s[i] & 0xff);
+    }
+}
+
+// Narrows every int src[i], i in [offset, offset + length), to a byte with
+// signed saturation (values above 127 become 127, values below -128 become
+// -128) and writes it to dst[i]. 16 elements are narrowed per NEON iteration;
+// the scalar loop finishes the remaining (< 16) elements.
+// No null or bounds checks are done here - presumably the Java caller validates
+// the arrays and range (TODO confirm).
+JAVA_VOID com_codename1_impl_ios_IOSSimd_packIntToByteSaturating___int_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) {
+    JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data;
+    JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data;
+    int i = offset;
+    int end = offset + length;
+    for (; i <= end - 16; i += 16) {
+        // Two saturating narrows (s32 -> s16 -> s8) give the same result as
+        // clamping each int directly to [-128, 127].
+        int16x8_t lo16 = vcombine_s16(
+            vqmovn_s32(vld1q_s32((int32_t*)(s + i))),
+            vqmovn_s32(vld1q_s32((int32_t*)(s + i + 4))));
+        int16x8_t hi16 = vcombine_s16(
+            vqmovn_s32(vld1q_s32((int32_t*)(s + i + 8))),
+            vqmovn_s32(vld1q_s32((int32_t*)(s + i + 12))));
+        vst1q_s8((int8_t*)(d + i), vcombine_s8(vqmovn_s16(lo16), vqmovn_s16(hi16)));
+    }
+    for (; i < end; i++) {
+        d[i] = cn1_saturating_byte((int)s[i]);
+    }
+}
+
+JAVA_VOID com_codename1_impl_ios_IOSSimd_packIntToByteTruncate___int_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) {
+    JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data;
+    JAVA_ARRAY_BYTE* d =
(JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int end = offset + length; + for (int i = offset; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)s[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_packIntToByteTruncate___int_1ARRAY_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_BYTE)s[srcOffset + i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_packIntToByteTruncateInterleaved4___int_1ARRAY_int_int_int_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT src0Offset, JAVA_INT src1Offset, JAVA_INT src2Offset, JAVA_INT src3Offset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 16; i += 16) { + uint8x16x4_t v; + v.val[0] = cn1_load_ints_to_u8x16(s, src0Offset + i); + v.val[1] = cn1_load_ints_to_u8x16(s, src1Offset + i); + v.val[2] = cn1_load_ints_to_u8x16(s, src2Offset + i); + v.val[3] = cn1_load_ints_to_u8x16(s, src3Offset + i); + vst4q_u8((uint8_t*)(d + dstOffset + i * 4), v); + } + for (; i < length; i++) { + int dstIndex = dstOffset + i * 4; + d[dstIndex] = (JAVA_ARRAY_BYTE)s[src0Offset + i]; + d[dstIndex + 1] = (JAVA_ARRAY_BYTE)s[src1Offset + i]; + d[dstIndex + 2] = (JAVA_ARRAY_BYTE)s[src2Offset + i]; + d[dstIndex + 3] = (JAVA_ARRAY_BYTE)s[src3Offset + i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_packBytesInterleaved3___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src0, JAVA_OBJECT src1, JAVA_OBJECT src2, JAVA_OBJECT dst, 
JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s0 = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src0)->data; + JAVA_ARRAY_BYTE* s1 = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src1)->data; + JAVA_ARRAY_BYTE* s2 = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src2)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 16; i += 16) { + uint8x16x3_t v; + v.val[0] = vld1q_u8((uint8_t*)(s0 + i)); + v.val[1] = vld1q_u8((uint8_t*)(s1 + i)); + v.val[2] = vld1q_u8((uint8_t*)(s2 + i)); + vst3q_u8((uint8_t*)(d + dstOffset + i * 3), v); + } + for (; i < length; i++) { + int dstIndex = dstOffset + i * 3; + d[dstIndex] = s0[i]; + d[dstIndex + 1] = s1[i]; + d[dstIndex + 2] = s2[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_packBytesInterleaved3___byte_1ARRAY_int_int_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT src0Offset, JAVA_INT src1Offset, JAVA_INT src2Offset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 16; i += 16) { + uint8x16x3_t v; + v.val[0] = vld1q_u8((uint8_t*)(s + src0Offset + i)); + v.val[1] = vld1q_u8((uint8_t*)(s + src1Offset + i)); + v.val[2] = vld1q_u8((uint8_t*)(s + src2Offset + i)); + vst3q_u8((uint8_t*)(d + dstOffset + i * 3), v); + } + for (; i < length; i++) { + int dstIndex = dstOffset + i * 3; + d[dstIndex] = s[src0Offset + i]; + d[dstIndex + 1] = s[src1Offset + i]; + d[dstIndex + 2] = s[src2Offset + i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_packBytesInterleaved4___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src0, JAVA_OBJECT src1, JAVA_OBJECT src2, JAVA_OBJECT src3, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s0 = 
(JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src0)->data; + JAVA_ARRAY_BYTE* s1 = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src1)->data; + JAVA_ARRAY_BYTE* s2 = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src2)->data; + JAVA_ARRAY_BYTE* s3 = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src3)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 16; i += 16) { + uint8x16x4_t v; + v.val[0] = vld1q_u8((uint8_t*)(s0 + i)); + v.val[1] = vld1q_u8((uint8_t*)(s1 + i)); + v.val[2] = vld1q_u8((uint8_t*)(s2 + i)); + v.val[3] = vld1q_u8((uint8_t*)(s3 + i)); + vst4q_u8((uint8_t*)(d + dstOffset + i * 4), v); + } + for (; i < length; i++) { + int dstIndex = dstOffset + i * 4; + d[dstIndex] = s0[i]; + d[dstIndex + 1] = s1[i]; + d[dstIndex + 2] = s2[i]; + d[dstIndex + 3] = s3[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_packBytesInterleaved4___byte_1ARRAY_int_int_int_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT src0Offset, JAVA_INT src1Offset, JAVA_INT src2Offset, JAVA_INT src3Offset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 16; i += 16) { + uint8x16x4_t v; + v.val[0] = vld1q_u8((uint8_t*)(s + src0Offset + i)); + v.val[1] = vld1q_u8((uint8_t*)(s + src1Offset + i)); + v.val[2] = vld1q_u8((uint8_t*)(s + src2Offset + i)); + v.val[3] = vld1q_u8((uint8_t*)(s + src3Offset + i)); + vst4q_u8((uint8_t*)(d + dstOffset + i * 4), v); + } + for (; i < length; i++) { + int dstIndex = dstOffset + i * 4; + d[dstIndex] = s[src0Offset + i]; + d[dstIndex + 1] = s[src1Offset + i]; + d[dstIndex + 2] = s[src2Offset + i]; + d[dstIndex + 3] = s[src3Offset + i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_permuteBytes___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT 
indices, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) {
+    /*
+     * Byte permutation: dst[i] = src[indices[i]] for i in [offset, offset + length).
+     * An index that is negative or >= src.length produces 0.
+     */
+    JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data;
+    int srcLen = ((JAVA_ARRAY)src)->length;
+    JAVA_ARRAY_BYTE* idx = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)indices)->data;
+    JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data;
+    int i = offset;
+    int end = offset + length;
+#if defined(__aarch64__)
+    if (srcLen <= 16) {
+        /*
+         * Stage the table in a zero-filled 16-byte buffer instead of loading 16 bytes
+         * straight from src: when src holds fewer than 16 bytes a direct load reads past
+         * the array, and indices in [srcLen, 16) would select those stray bytes rather
+         * than the 0 that the scalar loop below produces.
+         */
+        uint8_t padded[16] = {0};
+        if (srcLen > 0) {
+            memcpy(padded, s, (size_t)srcLen);
+        }
+        uint8x16_t table = vld1q_u8(padded);
+        int8x16_t zero = vdupq_n_s8(0);
+        for (; i <= end - 16; i += 16) {
+            int8x16_t rawIdx = vld1q_s8((int8_t*)(idx + i));
+            /* Negative indices are masked to 0; vqtbl1q_u8 itself yields 0 for lanes >= 16. */
+            uint8x16_t valid = vcgeq_s8(rawIdx, zero);
+            uint8x16_t selected = vqtbl1q_u8(table, vreinterpretq_u8_s8(rawIdx));
+            vst1q_u8((uint8_t*)(d + i), vandq_u8(selected, valid));
+        }
+    }
+#endif
+    /* Scalar path: the whole range when the table exceeds 16 bytes or off aarch64, otherwise the tail. */
+    for (; i < end; i++) {
+        int pos = idx[i];
+        d[i] = (pos >= 0 && pos < srcLen) ? s[pos] : 0;
+    }
+}
+
+/* Element-wise bitwise AND: dst[i] = srcA[i] & srcB[i] for i in [offset, offset + length). */
+JAVA_VOID com_codename1_impl_ios_IOSSimd_and___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) {
+    JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data;
+    JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data;
+    JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data;
+    int i = offset;
+    int end = offset + length;
+    /* Four 32-bit lanes per iteration; the scalar loop below finishes the remainder. */
+    for (; i <= end - 4; i += 4) {
+        uint32x4_t va = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(a + i)));
+        uint32x4_t vb = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(b + i)));
+        vst1q_s32((int32_t*)(d + i), vreinterpretq_s32_u32(vandq_u32(va, vb)));
+    }
+    for (; i < end; i++) {
+        d[i] = (JAVA_ARRAY_INT)(a[i] & b[i]);
+    }
+}
+
+JAVA_VOID com_codename1_impl_ios_IOSSimd_and___int_1ARRAY_int_int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_INT srcAOffset, JAVA_OBJECT srcB, JAVA_INT srcBOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) {
+    JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; +
JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 4; i += 4) { + uint32x4_t va = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(a + srcAOffset + i))); + uint32x4_t vb = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(b + srcBOffset + i))); + vst1q_s32((int32_t*)(d + dstOffset + i), vreinterpretq_s32_u32(vandq_u32(va, vb))); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_INT)(a[srcAOffset + i] & b[srcBOffset + i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_or___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + uint32x4_t va = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(a + i))); + uint32x4_t vb = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(b + i))); + vst1q_s32((int32_t*)(d + i), vreinterpretq_s32_u32(vorrq_u32(va, vb))); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(a[i] | b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_or___int_1ARRAY_int_int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_INT srcAOffset, JAVA_OBJECT srcB, JAVA_INT srcBOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 4; i += 4) { + uint32x4_t va = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(a + srcAOffset + i))); + uint32x4_t vb = 
vreinterpretq_u32_s32(vld1q_s32((int32_t*)(b + srcBOffset + i))); + vst1q_s32((int32_t*)(d + dstOffset + i), vreinterpretq_s32_u32(vorrq_u32(va, vb))); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_INT)(a[srcAOffset + i] | b[srcBOffset + i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_xor___int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + uint32x4_t va = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(a + i))); + uint32x4_t vb = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(b + i))); + vst1q_s32((int32_t*)(d + i), vreinterpretq_s32_u32(veorq_u32(va, vb))); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(a[i] ^ b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_not___int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + uint32x4_t vs = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(s + i))); + vst1q_s32((int32_t*)(d + i), vreinterpretq_s32_u32(vmvnq_u32(vs))); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(~s[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shl___int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* 
d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int shift = bits & 31; + int i = offset; + int end = offset + length; + int32x4_t vshift = vdupq_n_s32(shift); + for (; i <= end - 4; i += 4) { + int32x4_t vs = vld1q_s32((int32_t*)(s + i)); + vst1q_s32((int32_t*)(d + i), vshlq_s32(vs, vshift)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(s[i] << shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shl___int_1ARRAY_int_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int shift = bits & 31; + int i = 0; + int32x4_t vshift = vdupq_n_s32(shift); + for (; i <= length - 4; i += 4) { + int32x4_t vs = vld1q_s32((int32_t*)(s + srcOffset + i)); + vst1q_s32((int32_t*)(d + dstOffset + i), vshlq_s32(vs, vshift)); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_INT)(s[srcOffset + i] << shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shrLogical___int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int shift = bits & 31; + int i = offset; + int end = offset + length; + int32x4_t vshift = vdupq_n_s32(-shift); + for (; i <= end - 4; i += 4) { + uint32x4_t vs = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(s + i))); + vst1q_s32((int32_t*)(d + i), vreinterpretq_s32_u32(vshlq_u32(vs, vshift))); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(((uint32_t)s[i]) >> shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shrLogical___int_1ARRAY_int_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, 
JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int shift = bits & 31; + int i = 0; + int32x4_t vshift = vdupq_n_s32(-shift); + for (; i <= length - 4; i += 4) { + uint32x4_t vs = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(s + srcOffset + i))); + vst1q_s32((int32_t*)(d + dstOffset + i), vreinterpretq_s32_u32(vshlq_u32(vs, vshift))); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_INT)(((uint32_t)s[srcOffset + i]) >> shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shrArithmetic___int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* s = (JAVA_ARRAY_INT*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int shift = bits & 31; + int i = offset; + int end = offset + length; + int32x4_t vshift = vdupq_n_s32(-shift); + for (; i <= end - 4; i += 4) { + int32x4_t vs = vld1q_s32((int32_t*)(s + i)); + vst1q_s32((int32_t*)(d + i), vshlq_s32(vs, vshift)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_INT)(s[i] >> shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpEq___int_1ARRAY_int_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 8; i += 8) { + uint16x4_t lo16 = vmovn_u32(vceqq_s32(vld1q_s32((int32_t*)(a + i)), vld1q_s32((int32_t*)(b + i)))); + uint16x4_t hi16 = 
vmovn_u32(vceqq_s32(vld1q_s32((int32_t*)(a + i + 4)), vld1q_s32((int32_t*)(b + i + 4))));
+        vst1_u8((uint8_t*)(m + i), vmovn_u16(vcombine_u16(lo16, hi16)));
+    }
+    for (; i < end; i++) {
+        m[i] = a[i] == b[i] ? (JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0;
+    }
+}
+
+/*
+ * Signed per-element "less than": dstMask[i] = (srcA[i] < srcB[i]) ? -1 : 0
+ * for i in [offset, offset + length).
+ */
+JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpLt___int_1ARRAY_int_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) {
+    JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data;
+    JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data;
+    JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data;
+    int i = offset;
+    int end = offset + length;
+    for (; i <= end - 8; i += 8) {
+        /* vcltq_s32 already yields uint32x4_t lane masks, so they narrow directly (as in cmpEq). */
+        uint16x4_t lo16 = vmovn_u32(vcltq_s32(vld1q_s32((int32_t*)(a + i)), vld1q_s32((int32_t*)(b + i))));
+        uint16x4_t hi16 = vmovn_u32(vcltq_s32(vld1q_s32((int32_t*)(a + i + 4)), vld1q_s32((int32_t*)(b + i + 4))));
+        /* Narrowing 32 -> 16 -> 8 bits keeps all-ones lanes as 0xFF (-1) and zero lanes as 0. */
+        vst1_u8((uint8_t*)(m + i), vmovn_u16(vcombine_u16(lo16, hi16)));
+    }
+    for (; i < end; i++) {
+        m[i] = a[i] < b[i] ? (JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0;
+    }
+}
+
+/*
+ * Signed per-element "greater than": dstMask[i] = (srcA[i] > srcB[i]) ? -1 : 0
+ * for i in [offset, offset + length). Eight elements are compared per NEON
+ * iteration, mirroring cmpLt; the scalar loop handles the remainder.
+ */
+JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpGt___int_1ARRAY_int_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dstMask, JAVA_INT offset, JAVA_INT length) {
+    JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data;
+    JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data;
+    JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data;
+    int i = offset;
+    int end = offset + length;
+    for (; i <= end - 8; i += 8) {
+        uint16x4_t lo16 = vmovn_u32(vcgtq_s32(vld1q_s32((int32_t*)(a + i)), vld1q_s32((int32_t*)(b + i))));
+        uint16x4_t hi16 = vmovn_u32(vcgtq_s32(vld1q_s32((int32_t*)(a + i + 4)), vld1q_s32((int32_t*)(b + i + 4))));
+        vst1_u8((uint8_t*)(m + i), vmovn_u16(vcombine_u16(lo16, hi16)));
+    }
+    for (; i < end; i++) {
+        m[i] = a[i] > b[i] ? 
(JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_select___byte_1ARRAY_int_1ARRAY_int_1ARRAY_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT mask, JAVA_OBJECT trueValues, JAVA_OBJECT falseValues, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)mask)->data; + JAVA_ARRAY_INT* t = (JAVA_ARRAY_INT*)((JAVA_ARRAY)trueValues)->data; + JAVA_ARRAY_INT* f = (JAVA_ARRAY_INT*)((JAVA_ARRAY)falseValues)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 4; i += 4) { + uint32_t packedMask; + memcpy(&packedMask, m + i, sizeof(packedMask)); + uint8x8_t maskBytes = vreinterpret_u8_u32(vdup_n_u32(packedMask)); + uint32x4_t vm = vcgtq_u32(vmovl_u16(vget_low_u16(vmovl_u8(maskBytes))), vdupq_n_u32(0)); + uint32x4_t vt = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(t + i))); + uint32x4_t vf = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(f + i))); + vst1q_s32((int32_t*)(d + i), vreinterpretq_s32_u32(vbslq_u32(vm, vt, vf))); + } + for (; i < end; i++) { + d[i] = m[i] != 0 ? 
t[i] : f[i]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shl___byte_1ARRAY_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int shift = bits & 7; + int i = offset; + int end = offset + length; + int8x16_t vshift = vdupq_n_s8((int8_t)shift); + for (; i <= end - 16; i += 16) { + uint8x16_t vs = vld1q_u8((uint8_t*)(s + i)); + vst1q_u8((uint8_t*)(d + i), vshlq_u8(vs, vshift)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)(((uint8_t)s[i]) << shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shl___byte_1ARRAY_int_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int shift = bits & 7; + int i = 0; + int8x16_t vshift = vdupq_n_s8((int8_t)shift); + for (; i <= length - 16; i += 16) { + uint8x16_t vs = vld1q_u8((uint8_t*)(s + srcOffset + i)); + vst1q_u8((uint8_t*)(d + dstOffset + i), vshlq_u8(vs, vshift)); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_BYTE)(((uint8_t)s[srcOffset + i]) << shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shrLogical___byte_1ARRAY_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int shift = bits & 7; + int i = offset; + int end = offset + length; + int8x16_t vneg = vdupq_n_s8((int8_t)(-shift)); + for (; i <= end - 16; i += 16) { + 
uint8x16_t vs = vld1q_u8((uint8_t*)(s + i)); + vst1q_u8((uint8_t*)(d + i), vshlq_u8(vs, vneg)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)(((uint8_t)s[i]) >> shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_shrLogical___byte_1ARRAY_int_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_INT bits, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int shift = bits & 7; + int i = 0; + int8x16_t vneg = vdupq_n_s8((int8_t)(-shift)); + for (; i <= length - 16; i += 16) { + uint8x16_t vs = vld1q_u8((uint8_t*)(s + srcOffset + i)); + vst1q_u8((uint8_t*)(d + dstOffset + i), vshlq_u8(vs, vneg)); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_BYTE)(((uint8_t)s[srcOffset + i]) >> shift); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_addWrapping___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + uint8x16_t va = vld1q_u8((uint8_t*)(a + i)); + uint8x16_t vb = vld1q_u8((uint8_t*)(b + i)); + vst1q_u8((uint8_t*)(d + i), vaddq_u8(va, vb)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)((uint8_t)a[i] + (uint8_t)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_addWrapping___byte_1ARRAY_byte_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_BYTE value, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = 
(JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + uint8x16_t vv = vdupq_n_u8((uint8_t)value); + for (; i <= end - 16; i += 16) { + uint8x16_t vs = vld1q_u8((uint8_t*)(s + i)); + vst1q_u8((uint8_t*)(d + i), vaddq_u8(vs, vv)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)((uint8_t)s[i] + (uint8_t)value); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_subWrapping___byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_OBJECT srcB, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* a = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_BYTE* b = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + for (; i <= end - 16; i += 16) { + uint8x16_t va = vld1q_u8((uint8_t*)(a + i)); + uint8x16_t vb = vld1q_u8((uint8_t*)(b + i)); + vst1q_u8((uint8_t*)(d + i), vsubq_u8(va, vb)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)((uint8_t)a[i] - (uint8_t)b[i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_subWrapping___byte_1ARRAY_byte_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_BYTE value, JAVA_OBJECT dst, JAVA_INT offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = offset; + int end = offset + length; + uint8x16_t vv = vdupq_n_u8((uint8_t)value); + for (; i <= end - 16; i += 16) { + uint8x16_t vs = vld1q_u8((uint8_t*)(s + i)); + vst1q_u8((uint8_t*)(d + i), vsubq_u8(vs, vv)); + } + for (; i < end; i++) { + d[i] = (JAVA_ARRAY_BYTE)((uint8_t)s[i] - (uint8_t)value); + } +} + +JAVA_VOID 
com_codename1_impl_ios_IOSSimd_unpackUnsignedByteToInt___byte_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 16; i += 16) { + uint8x16_t v = vld1q_u8((uint8_t*)(s + srcOffset + i)); + uint16x8_t lo16 = vmovl_u8(vget_low_u8(v)); + uint16x8_t hi16 = vmovl_u8(vget_high_u8(v)); + uint32x4_t x0 = vmovl_u16(vget_low_u16(lo16)); + uint32x4_t x1 = vmovl_u16(vget_high_u16(lo16)); + uint32x4_t x2 = vmovl_u16(vget_low_u16(hi16)); + uint32x4_t x3 = vmovl_u16(vget_high_u16(hi16)); + vst1q_s32((int32_t*)(d + dstOffset + i), vreinterpretq_s32_u32(x0)); + vst1q_s32((int32_t*)(d + dstOffset + i + 4), vreinterpretq_s32_u32(x1)); + vst1q_s32((int32_t*)(d + dstOffset + i + 8), vreinterpretq_s32_u32(x2)); + vst1q_s32((int32_t*)(d + dstOffset + i + 12), vreinterpretq_s32_u32(x3)); + } + for (; i < length; i++) { + d[dstOffset + i] = (JAVA_ARRAY_INT)(s[srcOffset + i] & 0xff); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_unpackUnsignedByteToIntInterleaved3___byte_1ARRAY_int_int_1ARRAY_int_int_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_OBJECT dst, JAVA_INT dst0Offset, JAVA_INT dst1Offset, JAVA_INT dst2Offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 16; i += 16) { + uint8x16x3_t v = vld3q_u8((uint8_t*)(s + srcOffset + i * 3)); + cn1_store_u8x16_to_ints(v.val[0], d, dst0Offset + i); + cn1_store_u8x16_to_ints(v.val[1], d, dst1Offset + i); + cn1_store_u8x16_to_ints(v.val[2], d, dst2Offset + i); + } + for (; i < length; i++) { + int srcIndex = srcOffset + i * 3; + d[dst0Offset + i] = 
(JAVA_ARRAY_INT)(s[srcIndex] & 0xff); + d[dst1Offset + i] = (JAVA_ARRAY_INT)(s[srcIndex + 1] & 0xff); + d[dst2Offset + i] = (JAVA_ARRAY_INT)(s[srcIndex + 2] & 0xff); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_unpackBytesInterleaved3___byte_1ARRAY_int_byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_OBJECT dst0, JAVA_OBJECT dst1, JAVA_OBJECT dst2, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d0 = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst0)->data; + JAVA_ARRAY_BYTE* d1 = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst1)->data; + JAVA_ARRAY_BYTE* d2 = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst2)->data; + int i = 0; + for (; i <= length - 16; i += 16) { + uint8x16x3_t v = vld3q_u8((uint8_t*)(s + srcOffset + i * 3)); + vst1q_u8((uint8_t*)(d0 + i), v.val[0]); + vst1q_u8((uint8_t*)(d1 + i), v.val[1]); + vst1q_u8((uint8_t*)(d2 + i), v.val[2]); + } + for (; i < length; i++) { + int srcIndex = srcOffset + i * 3; + d0[i] = s[srcIndex]; + d1[i] = s[srcIndex + 1]; + d2[i] = s[srcIndex + 2]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_unpackBytesInterleaved3___byte_1ARRAY_int_byte_1ARRAY_int_int_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_OBJECT dst, JAVA_INT dst0Offset, JAVA_INT dst1Offset, JAVA_INT dst2Offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 16; i += 16) { + uint8x16x3_t v = vld3q_u8((uint8_t*)(s + srcOffset + i * 3)); + vst1q_u8((uint8_t*)(d + dst0Offset + i), v.val[0]); + vst1q_u8((uint8_t*)(d + dst1Offset + i), v.val[1]); + vst1q_u8((uint8_t*)(d + dst2Offset + i), v.val[2]); + } + for (; i < length; i++) { + int srcIndex = srcOffset + i * 3; + d[dst0Offset + i] = s[srcIndex]; + d[dst1Offset + i] = s[srcIndex + 1]; + d[dst2Offset + 
i] = s[srcIndex + 2]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_unpackBytesInterleaved4___byte_1ARRAY_int_byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_OBJECT dst0, JAVA_OBJECT dst1, JAVA_OBJECT dst2, JAVA_OBJECT dst3, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d0 = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst0)->data; + JAVA_ARRAY_BYTE* d1 = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst1)->data; + JAVA_ARRAY_BYTE* d2 = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst2)->data; + JAVA_ARRAY_BYTE* d3 = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst3)->data; + int i = 0; + for (; i <= length - 16; i += 16) { + uint8x16x4_t v = vld4q_u8((uint8_t*)(s + srcOffset + i * 4)); + vst1q_u8((uint8_t*)(d0 + i), v.val[0]); + vst1q_u8((uint8_t*)(d1 + i), v.val[1]); + vst1q_u8((uint8_t*)(d2 + i), v.val[2]); + vst1q_u8((uint8_t*)(d3 + i), v.val[3]); + } + for (; i < length; i++) { + int srcIndex = srcOffset + i * 4; + d0[i] = s[srcIndex]; + d1[i] = s[srcIndex + 1]; + d2[i] = s[srcIndex + 2]; + d3[i] = s[srcIndex + 3]; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_unpackBytesInterleaved4___byte_1ARRAY_int_byte_1ARRAY_int_int_int_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_OBJECT dst, JAVA_INT dst0Offset, JAVA_INT dst1Offset, JAVA_INT dst2Offset, JAVA_INT dst3Offset, JAVA_INT length) { + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 16; i += 16) { + uint8x16x4_t v = vld4q_u8((uint8_t*)(s + srcOffset + i * 4)); + vst1q_u8((uint8_t*)(d + dst0Offset + i), v.val[0]); + vst1q_u8((uint8_t*)(d + dst1Offset + i), v.val[1]); + vst1q_u8((uint8_t*)(d + dst2Offset + i), v.val[2]); + vst1q_u8((uint8_t*)(d + dst3Offset + i), v.val[3]); + } + for (; i < length; i++) { + int srcIndex = 
srcOffset + i * 4; + d[dst0Offset + i] = s[srcIndex]; + d[dst1Offset + i] = s[srcIndex + 1]; + d[dst2Offset + i] = s[srcIndex + 2]; + d[dst3Offset + i] = s[srcIndex + 3]; + } +} + +JAVA_INT com_codename1_impl_ios_IOSSimd_unpackLookupBytesInterleaved4___byte_1ARRAY_byte_1ARRAY_int_byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_byte_1ARRAY_int_R_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT table, JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_OBJECT dst0, JAVA_OBJECT dst1, JAVA_OBJECT dst2, JAVA_OBJECT dst3, JAVA_INT length) { + JAVA_ARRAY_BYTE* t = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)table)->data; + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d0 = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst0)->data; + JAVA_ARRAY_BYTE* d1 = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst1)->data; + JAVA_ARRAY_BYTE* d2 = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst2)->data; + JAVA_ARRAY_BYTE* d3 = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst3)->data; + int tableLen = ((JAVA_ARRAY)table)->length; + JAVA_INT orValue = 0; + for (int i = 0; i < length; i++) { + int srcIndex = srcOffset + i * 4; + int idx0 = s[srcIndex] & 0xff; + int idx1 = s[srcIndex + 1] & 0xff; + int idx2 = s[srcIndex + 2] & 0xff; + int idx3 = s[srcIndex + 3] & 0xff; + JAVA_ARRAY_BYTE v0 = idx0 < tableLen ? t[idx0] : 0; + JAVA_ARRAY_BYTE v1 = idx1 < tableLen ? t[idx1] : 0; + JAVA_ARRAY_BYTE v2 = idx2 < tableLen ? t[idx2] : 0; + JAVA_ARRAY_BYTE v3 = idx3 < tableLen ? 
t[idx3] : 0; + d0[i] = v0; + d1[i] = v1; + d2[i] = v2; + d3[i] = v3; + orValue |= v0 | v1 | v2 | v3; /* accumulates the OR of every looked-up value; NOTE(review): presumably JAVA_ARRAY_BYTE is signed, so a negative table byte sign-extends here - confirm */ + } + return orValue; +} + +JAVA_INT com_codename1_impl_ios_IOSSimd_unpackLookupBytesInterleaved4___byte_1ARRAY_byte_1ARRAY_int_byte_1ARRAY_int_int_int_int_int_R_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT table, JAVA_OBJECT src, JAVA_INT srcOffset, JAVA_OBJECT dst, JAVA_INT dst0Offset, JAVA_INT dst1Offset, JAVA_INT dst2Offset, JAVA_INT dst3Offset, JAVA_INT length) { /* For each of `length` 4-byte groups in src (starting at srcOffset), maps the four bytes through `table` and stores result k at dst[dstKOffset + i]. Returns the bitwise OR of all stored values. Scalar loop only; instanceObject is unused. */ + JAVA_ARRAY_BYTE* t = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)table)->data; + JAVA_ARRAY_BYTE* s = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)src)->data; + JAVA_ARRAY_BYTE* d = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dst)->data; + int tableLen = ((JAVA_ARRAY)table)->length; + JAVA_INT orValue = 0; + for (int i = 0; i < length; i++) { + int srcIndex = srcOffset + i * 4; + int idx0 = s[srcIndex] & 0xff; /* unsigned 0..255 index */ + int idx1 = s[srcIndex + 1] & 0xff; + int idx2 = s[srcIndex + 2] & 0xff; + int idx3 = s[srcIndex + 3] & 0xff; + JAVA_ARRAY_BYTE v0 = idx0 < tableLen ? t[idx0] : 0; /* indices outside the table map to 0 */ + JAVA_ARRAY_BYTE v1 = idx1 < tableLen ? t[idx1] : 0; + JAVA_ARRAY_BYTE v2 = idx2 < tableLen ? t[idx2] : 0; + JAVA_ARRAY_BYTE v3 = idx3 < tableLen ? 
t[idx3] : 0; + d[dst0Offset + i] = v0; + d[dst1Offset + i] = v1; + d[dst2Offset + i] = v2; + d[dst3Offset + i] = v3; + orValue |= v0 | v1 | v2 | v3; + } + return orValue; +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_add___int_1ARRAY_int_int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_INT srcAOffset, JAVA_OBJECT srcB, JAVA_INT srcBOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { /* Element-wise 32-bit add: dst[dstOffset + i] = srcA[srcAOffset + i] + srcB[srcBOffset + i] for i in [0, length). Each array has its own offset; no bounds checks are made here. */ + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = 0; + for (; i <= length - 4; i += 4) { /* NEON path: four int lanes per iteration */ + int32x4_t va = vld1q_s32((int32_t*)(a + srcAOffset + i)); + int32x4_t vb = vld1q_s32((int32_t*)(b + srcBOffset + i)); + vst1q_s32((int32_t*)(d + dstOffset + i), vaddq_s32(va, vb)); + } + for (; i < length; i++) { /* scalar tail for the last length % 4 elements */ + d[dstOffset + i] = (JAVA_ARRAY_INT)((int32_t)a[srcAOffset + i] + (int32_t)b[srcBOffset + i]); + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpEq___int_1ARRAY_int_int_1ARRAY_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_INT srcAOffset, JAVA_OBJECT srcB, JAVA_INT srcBOffset, JAVA_OBJECT dstMask, JAVA_INT dstOffset, JAVA_INT length) { /* Writes one mask byte per element to dstMask[dstOffset + i]: -1 where srcA[srcAOffset + i] == srcB[srcBOffset + i], otherwise 0. */ + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int i = 0; + int end = length; + for (; i <= end - 8; i += 8) { /* NEON path: two 4-lane compares, each narrowed 32 -> 16 bits, then combined and narrowed 16 -> 8 bits to give 8 mask bytes per iteration */ + uint16x4_t lo16 = vmovn_u32(vceqq_s32(vld1q_s32((int32_t*)(a + srcAOffset + i)), vld1q_s32((int32_t*)(b + srcBOffset + i)))); + uint16x4_t hi16 = vmovn_u32(vceqq_s32(vld1q_s32((int32_t*)(a + srcAOffset + i + 4)), vld1q_s32((int32_t*)(b + srcBOffset + i + 4)))); + vst1_u8((uint8_t*)(m + dstOffset + i), vmovn_u16(vcombine_u16(lo16, hi16))); + } + for (; i < end; i++) { /* scalar tail for the last length % 8 elements */ + m[dstOffset + i] = a[srcAOffset + i] == 
b[srcBOffset + i] ? (JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_cmpLt___int_1ARRAY_int_int_1ARRAY_int_byte_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT srcA, JAVA_INT srcAOffset, JAVA_OBJECT srcB, JAVA_INT srcBOffset, JAVA_OBJECT dstMask, JAVA_INT dstOffset, JAVA_INT length) { /* Writes one mask byte per element to dstMask[dstOffset + i]: -1 where srcA[srcAOffset + i] < srcB[srcBOffset + i] (signed compare), otherwise 0. instanceObject is unused; no bounds checks are made here. */ + JAVA_ARRAY_INT* a = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcA)->data; + JAVA_ARRAY_INT* b = (JAVA_ARRAY_INT*)((JAVA_ARRAY)srcB)->data; + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)dstMask)->data; + int i = 0; + int end = length; + for (; i <= end - 8; i += 8) { /* NEON path, 8 elements per iteration; vcltq_s32 already yields uint32x4_t, so it feeds vmovn_u32 directly */ + uint16x4_t lo16 = vmovn_u32(vcltq_s32(vld1q_s32((int32_t*)(a + srcAOffset + i)), vld1q_s32((int32_t*)(b + srcBOffset + i)))); + uint16x4_t hi16 = vmovn_u32(vcltq_s32(vld1q_s32((int32_t*)(a + srcAOffset + i + 4)), vld1q_s32((int32_t*)(b + srcBOffset + i + 4)))); + vst1_u8((uint8_t*)(m + dstOffset + i), vmovn_u16(vcombine_u16(lo16, hi16))); + } + for (; i < end; i++) { /* scalar tail for the last length % 8 elements */ + m[dstOffset + i] = a[srcAOffset + i] < b[srcBOffset + i] ? 
(JAVA_ARRAY_BYTE)-1 : (JAVA_ARRAY_BYTE)0; + } +} + +JAVA_VOID com_codename1_impl_ios_IOSSimd_select___byte_1ARRAY_int_int_1ARRAY_int_int_1ARRAY_int_int_1ARRAY_int_int(CN1_THREAD_STATE_MULTI_ARG JAVA_OBJECT instanceObject, JAVA_OBJECT mask, JAVA_INT maskOffset, JAVA_OBJECT trueValues, JAVA_INT trueOffset, JAVA_OBJECT falseValues, JAVA_INT falseOffset, JAVA_OBJECT dst, JAVA_INT dstOffset, JAVA_INT length) { /* Per-element select: dst[dstOffset + i] = trueValues[trueOffset + i] when mask[maskOffset + i] is non-zero, else falseValues[falseOffset + i]. The mask holds one byte per int element. */ + JAVA_ARRAY_BYTE* m = (JAVA_ARRAY_BYTE*)((JAVA_ARRAY)mask)->data; + JAVA_ARRAY_INT* t = (JAVA_ARRAY_INT*)((JAVA_ARRAY)trueValues)->data; + JAVA_ARRAY_INT* f = (JAVA_ARRAY_INT*)((JAVA_ARRAY)falseValues)->data; + JAVA_ARRAY_INT* d = (JAVA_ARRAY_INT*)((JAVA_ARRAY)dst)->data; + int i = 0; + int end = length; + for (; i <= end - 4; i += 4) { /* NEON path: four elements per iteration */ + uint32_t packedMask; + memcpy(&packedMask, m + maskOffset + i, sizeof(packedMask)); /* copies the four mask bytes for this iteration into one 32-bit word */ + uint8x8_t maskBytes = vreinterpret_u8_u32(vdup_n_u32(packedMask)); + uint32x4_t vm = vcgtq_u32(vmovl_u16(vget_low_u16(vmovl_u8(maskBytes))), vdupq_n_u32(0)); /* widens the low four bytes to 32-bit lanes; any non-zero byte becomes an all-ones lane, matching the scalar `!= 0` test */ + uint32x4_t vt = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(t + trueOffset + i))); + uint32x4_t vf = vreinterpretq_u32_s32(vld1q_s32((int32_t*)(f + falseOffset + i))); + vst1q_s32((int32_t*)(d + dstOffset + i), vreinterpretq_s32_u32(vbslq_u32(vm, vt, vf))); /* bitwise select: lanes set in vm take vt, the rest take vf */ + } + for (; i < end; i++) { /* scalar tail for the last length % 4 elements */ + d[dstOffset + i] = m[maskOffset + i] != 0 ? 
t[trueOffset + i] : f[falseOffset + i]; + } +} diff --git a/Ports/iOSPort/src/com/codename1/impl/ios/IOSImplementation.java b/Ports/iOSPort/src/com/codename1/impl/ios/IOSImplementation.java index 3029ccfa61..d94049d5ed 100644 --- a/Ports/iOSPort/src/com/codename1/impl/ios/IOSImplementation.java +++ b/Ports/iOSPort/src/com/codename1/impl/ios/IOSImplementation.java @@ -110,6 +110,7 @@ import com.codename1.util.Callback; import com.codename1.util.StringUtil; import com.codename1.util.SuccessCallback; +import com.codename1.util.Simd; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; @@ -8091,6 +8092,11 @@ public String getPlatformName() { return "ios"; } + @Override + public Simd createSimd() { /* returns the iOS-specific Simd subclass, whose vector operations are native methods */ + return new IOSSimd(); + } + /** * @inheritDoc */ diff --git a/Ports/iOSPort/src/com/codename1/impl/ios/IOSSimd.java b/Ports/iOSPort/src/com/codename1/impl/ios/IOSSimd.java new file mode 100644 index 0000000000..437c79c57f --- /dev/null +++ b/Ports/iOSPort/src/com/codename1/impl/ios/IOSSimd.java @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2026, Codename One and/or its affiliates. All rights reserved. + */ +package com.codename1.impl.ios; + +import com.codename1.util.Simd; + +/** + * iOS SIMD implementation backed by NEON wrappers. Every vector operation in this class is declared native; only isSupported() and the alloc* methods have Java bodies. 
+ */ +public class IOSSimd extends Simd { + @Override + public boolean isSupported() { /* unconditionally true for this port */ + return true; + } + + @Override + public byte[] allocByte(int size) { /* rejects sizes below 16, then allocates through the native helper */ + if (size < 16) { + throw new IllegalArgumentException("size must be >= 16"); + } + return allocByteNative(size); + } + + @Override + public int[] allocInt(int size) { /* same 16-element minimum as allocByte */ + if (size < 16) { + throw new IllegalArgumentException("size must be >= 16"); + } + return allocIntNative(size); + } + + @Override + public float[] allocFloat(int size) { /* same 16-element minimum as allocByte */ + if (size < 16) { + throw new IllegalArgumentException("size must be >= 16"); + } + return allocFloatNative(size); + } + + @Override + public native void lookupBytes(byte[] table, byte[] indices, byte[] dst, int offset, int length); + + @Override + public native void lookupBytes(byte[] table, byte[] indices, int indicesOffset, byte[] dst, int dstOffset, int length); + + @Override + public native void add(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void sub(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void mul(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void min(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void max(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void abs(byte[] src, byte[] dst, int offset, int length); + + @Override + public native void clamp(byte[] src, byte[] dst, byte minValue, byte maxValue, int offset, int length); + + @Override + public native void and(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void and(byte[] srcA, int srcAOffset, byte[] srcB, int srcBOffset, byte[] dst, int dstOffset, int length); + + @Override + public native void or(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void or(byte[] srcA, int srcAOffset, 
byte[] srcB, int srcBOffset, byte[] dst, int dstOffset, int length); + + @Override + public native void xor(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void not(byte[] src, byte[] dst, int offset, int length); + + @Override + public native void cmpEq(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length); + + @Override + public native void cmpEq(byte[] src, byte value, byte[] dstMask, int offset, int length); + + @Override + public native void cmpLt(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length); + + @Override + public native void cmpLt(byte[] src, byte value, byte[] dstMask, int offset, int length); + + @Override + public native void cmpGt(byte[] srcA, byte[] srcB, byte[] dstMask, int offset, int length); + + @Override + public native void cmpRange(byte[] src, byte minValue, byte maxValue, byte[] dstMask, int offset, int length); + + @Override + public native void select(byte[] mask, byte[] trueValues, byte[] falseValues, byte[] dst, int offset, int length); + + @Override + public native void unpackUnsignedByteToInt(byte[] src, int[] dst, int offset, int length); + + @Override + public native void packIntToByteSaturating(int[] src, byte[] dst, int offset, int length); + + @Override + public native void packIntToByteTruncate(int[] src, byte[] dst, int offset, int length); + + @Override + public native void packIntToByteTruncate(int[] src, int srcOffset, byte[] dst, int dstOffset, int length); + + @Override + public native void permuteBytes(byte[] src, byte[] indices, byte[] dst, int offset, int length); + + @Override + public native void add(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void sub(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void mul(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void min(int[] srcA, int[] srcB, int[] dst, int offset, int 
length); + + @Override + public native void max(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void abs(int[] src, int[] dst, int offset, int length); + + @Override + public native void clamp(int[] src, int[] dst, int minValue, int maxValue, int offset, int length); + + @Override + public native void and(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void and(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length); + + @Override + public native void or(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void or(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length); + + @Override + public native void xor(int[] srcA, int[] srcB, int[] dst, int offset, int length); + + @Override + public native void not(int[] src, int[] dst, int offset, int length); + + @Override + public native void shl(int[] src, int bits, int[] dst, int offset, int length); + + @Override + public native void shl(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, int length); + + @Override + public native void shrLogical(int[] src, int bits, int[] dst, int offset, int length); + + @Override + public native void shrLogical(int[] src, int srcOffset, int bits, int[] dst, int dstOffset, int length); + + @Override + public native void shrArithmetic(int[] src, int bits, int[] dst, int offset, int length); + + @Override + public native void cmpEq(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length); + + @Override + public native void cmpLt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length); + + @Override + public native void cmpGt(int[] srcA, int[] srcB, byte[] dstMask, int offset, int length); + + @Override + public native void select(byte[] mask, int[] trueValues, int[] falseValues, int[] dst, int offset, int length); + + @Override + public native int sum(int[] src, int 
offset, int length); + + @Override + public native int dot(int[] srcA, int[] srcB, int offset, int length); + + @Override + public native void add(float[] srcA, float[] srcB, float[] dst, int offset, int length); + + @Override + public native void sub(float[] srcA, float[] srcB, float[] dst, int offset, int length); + + @Override + public native void mul(float[] srcA, float[] srcB, float[] dst, int offset, int length); + + @Override + public native void min(float[] srcA, float[] srcB, float[] dst, int offset, int length); + + @Override + public native void max(float[] srcA, float[] srcB, float[] dst, int offset, int length); + + @Override + public native void abs(float[] src, float[] dst, int offset, int length); + + @Override + public native void clamp(float[] src, float[] dst, float minValue, float maxValue, int offset, int length); + + @Override + public native float sum(float[] src, int offset, int length); + + @Override + public native float dot(float[] srcA, float[] srcB, int offset, int length); + + @Override + public native void shl(byte[] src, int bits, byte[] dst, int offset, int length); + + @Override + public native void shl(byte[] src, int srcOffset, int bits, byte[] dst, int dstOffset, int length); + + @Override + public native void shrLogical(byte[] src, int bits, byte[] dst, int offset, int length); + + @Override + public native void shrLogical(byte[] src, int srcOffset, int bits, byte[] dst, int dstOffset, int length); + + @Override + public native void addWrapping(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void addWrapping(byte[] src, byte value, byte[] dst, int offset, int length); + + @Override + public native void subWrapping(byte[] srcA, byte[] srcB, byte[] dst, int offset, int length); + + @Override + public native void subWrapping(byte[] src, byte value, byte[] dst, int offset, int length); + + @Override + public native void unpackUnsignedByteToInt(byte[] src, int srcOffset, int[] dst, int 
dstOffset, int length); + + @Override + public native void unpackUnsignedByteToIntInterleaved3(byte[] src, int srcOffset, int[] dst, int dst0Offset, int dst1Offset, int dst2Offset, int length); + + @Override + public native void unpackBytesInterleaved3(byte[] src, int srcOffset, byte[] dst0, byte[] dst1, byte[] dst2, int length); + + @Override + public native void unpackBytesInterleaved3(byte[] src, int srcOffset, byte[] dst, int dst0Offset, int dst1Offset, int dst2Offset, int length); + + @Override + public native void unpackBytesInterleaved4(byte[] src, int srcOffset, byte[] dst0, byte[] dst1, byte[] dst2, byte[] dst3, int length); + + @Override + public native void unpackBytesInterleaved4(byte[] src, int srcOffset, byte[] dst, int dst0Offset, int dst1Offset, int dst2Offset, int dst3Offset, int length); + + @Override + public native int unpackLookupBytesInterleaved4(byte[] table, byte[] src, int srcOffset, byte[] dst0, byte[] dst1, byte[] dst2, byte[] dst3, int length); + + @Override + public native int unpackLookupBytesInterleaved4(byte[] table, byte[] src, int srcOffset, byte[] dst, int dst0Offset, int dst1Offset, int dst2Offset, int dst3Offset, int length); + + @Override + public native void add(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, int[] dst, int dstOffset, int length); + + @Override + public native void packIntToByteTruncateInterleaved4(int[] src, int src0Offset, int src1Offset, int src2Offset, int src3Offset, byte[] dst, int dstOffset, int length); + + @Override + public native void packBytesInterleaved3(byte[] src0, byte[] src1, byte[] src2, byte[] dst, int dstOffset, int length); + + @Override + public native void packBytesInterleaved3(byte[] src, int src0Offset, int src1Offset, int src2Offset, byte[] dst, int dstOffset, int length); + + @Override + public native void packBytesInterleaved4(byte[] src0, byte[] src1, byte[] src2, byte[] src3, byte[] dst, int dstOffset, int length); + + @Override + public native void 
packBytesInterleaved4(byte[] src, int src0Offset, int src1Offset, int src2Offset, int src3Offset, byte[] dst, int dstOffset, int length); + + @Override + public native void cmpEq(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, byte[] dstMask, int dstOffset, int length); + + @Override + public native void cmpLt(int[] srcA, int srcAOffset, int[] srcB, int srcBOffset, byte[] dstMask, int dstOffset, int length); + + @Override + public native void select(byte[] mask, int maskOffset, int[] trueValues, int trueOffset, int[] falseValues, int falseOffset, int[] dst, int dstOffset, int length); + + private native byte[] allocByteNative(int size); /* native allocators called by the alloc* overrides after their size check */ + private native int[] allocIntNative(int size); + private native float[] allocFloatNative(int size); +} diff --git a/maven/core-unittests/src/test/java/com/codename1/ui/ImageTest.java b/maven/core-unittests/src/test/java/com/codename1/ui/ImageTest.java index 6ad4056020..3bb1d9a6b8 100644 --- a/maven/core-unittests/src/test/java/com/codename1/ui/ImageTest.java +++ b/maven/core-unittests/src/test/java/com/codename1/ui/ImageTest.java @@ -4,6 +4,7 @@ import com.codename1.junit.UITestBase; import com.codename1.ui.events.ActionEvent; import com.codename1.ui.events.ActionListener; +import com.codename1.util.Simd; import java.io.ByteArrayInputStream; import java.io.IOException; @@ -371,6 +372,55 @@ void testApplyMaskAutoScale() { assertEquals(50, masked.getHeight()); } + @FormTest + void testImageSimdToggleDefaultsToPlatformSupport() { /* after a reset the Image toggle is expected to equal Simd.get().isSupported(), or false when Display is not initialized */ + boolean supported = Display.isInitialized() && Simd.get() != null && Simd.get().isSupported(); + Image.resetSimdOptimizationsEnabled(); + assertEquals(supported, Image.isSimdOptimizationsEnabled()); + } + + @FormTest + void testImageSimdToggleOverrideAndReset() { + Image.setSimdOptimizationsEnabled(false); + assertFalse(Image.isSimdOptimizationsEnabled()); + Image.setSimdOptimizationsEnabled(true); + assertTrue(Image.isSimdOptimizationsEnabled()); + Image.resetSimdOptimizationsEnabled(); + boolean 
supported = Display.isInitialized() && Simd.get() != null && Simd.get().isSupported(); + assertEquals(supported, Image.isSimdOptimizationsEnabled()); + } + + @FormTest + void testImageSimdAndScalarPathsMatch() { /* runs createMask, applyMask and both modifyAlpha overloads with SIMD disabled and then enabled, and requires identical results */ + int[] rgb = new int[]{ + 0x00FF0000, 0xFFFF0000, 0x8000FF00, 0xFF0000FF, + 0xFFFFFFFF, 0x7F123456, 0x00000000, 0xFFABCDEF, + 0x800000FF, 0xFF00FFFF, 0x40010203, 0xFFFFFFFF, + 0x11223344, 0xFF445566, 0xFF0000FF, 0x00FFFFFF + }; + Image source = Image.createImage(rgb, 4, 4); + try { + Image.setSimdOptimizationsEnabled(false); + Object scalarMask = source.createMask(); + Image scalarApplied = source.applyMask(scalarMask); + Image scalarAlpha = source.modifyAlpha((byte) 0x66); + Image scalarAlphaRemoveColor = source.modifyAlpha((byte) 0x66, 0xFF0000FF); + + Image.setSimdOptimizationsEnabled(true); + Object simdMask = source.createMask(); + Image simdApplied = source.applyMask(simdMask); + Image simdAlpha = source.modifyAlpha((byte) 0x66); + Image simdAlphaRemoveColor = source.modifyAlpha((byte) 0x66, 0xFF0000FF); + + assertArrayEquals(((IndexedImage) scalarMask).getImageDataByte(), ((IndexedImage) simdMask).getImageDataByte()); + assertArrayEquals(scalarApplied.getRGB(), simdApplied.getRGB()); + assertArrayEquals(scalarAlpha.getRGB(), simdAlpha.getRGB()); + assertArrayEquals(scalarAlphaRemoveColor.getRGB(), simdAlphaRemoveColor.getRGB()); + } finally { /* resets the global toggle whether or not the assertions passed */ + Image.resetSimdOptimizationsEnabled(); + } + } + @FormTest void testScaledWithSameDimensionsReturnsSameDimensions() { Image image = Image.createImage(50, 50); diff --git a/maven/core-unittests/src/test/java/com/codename1/ui/RGBImageTest.java b/maven/core-unittests/src/test/java/com/codename1/ui/RGBImageTest.java index ee2d5060b5..bac2e48480 100644 --- a/maven/core-unittests/src/test/java/com/codename1/ui/RGBImageTest.java +++ b/maven/core-unittests/src/test/java/com/codename1/ui/RGBImageTest.java @@ -44,6 +44,23 @@ void testModifyAlphaAndOpaque() { assertTrue(image.isOpaque()); } + @FormTest + void 
testRgbImageModifyAlphaSimdMatchesScalar() { /* RGBImage.modifyAlpha must return the same pixels with the SIMD toggle off and on */ + RGBImage image = new RGBImage(new int[]{ + 0x00FF0000, 0xFFFF0000, + 0x8000FF00, 0xFF0000FF + }, 2, 2); + try { + Image.setSimdOptimizationsEnabled(false); + RGBImage scalar = (RGBImage) image.modifyAlpha((byte) 0x40); + Image.setSimdOptimizationsEnabled(true); + RGBImage simd = (RGBImage) image.modifyAlpha((byte) 0x40); + assertArrayEquals(scalar.getRGB(), simd.getRGB()); + } finally { + Image.resetSimdOptimizationsEnabled(); + } + } + @FormTest void testDrawImageAndGetRGB() { RGBImage image = createSampleImage(); diff --git a/maven/core-unittests/src/test/java/com/codename1/util/SimdTest.java b/maven/core-unittests/src/test/java/com/codename1/util/SimdTest.java new file mode 100644 index 0000000000..e2c0a75a38 --- /dev/null +++ b/maven/core-unittests/src/test/java/com/codename1/util/SimdTest.java @@ -0,0 +1,532 @@ +package com.codename1.util; + +import com.codename1.junit.FormTest; +import com.codename1.junit.UITestBase; +import com.codename1.ui.CN; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class SimdTest extends UITestBase { + + @FormTest + void baseFallbackOpsWork() { /* exercises the base Simd class directly (new Simd()), not the platform instance; the byte add assertions expect saturation at 127 and -128 */ + Simd simd = new Simd(); + + int[] a = new int[]{1, 2, 3, 4}; + int[] b = new int[]{4, 3, 2, 1}; + int[] out = new int[4]; + simd.add(a, b, out, 0, 4); + assertEquals(5, out[0]); + assertEquals(5, out[3]); + + float[] fa = new float[]{1f, -2f, 3f}; + float[] fb = new float[]{4f, 5f, -6f}; + float[] fo = new float[3]; + simd.mul(fa, fb, fo, 0, 3); + assertEquals(4f, fo[0], 0.0001f); + assertEquals(-18f, fo[2], 0.0001f); + + byte[] ba = new byte[]{120, 100, -128}; + byte[] bb = new byte[]{20, 100, -1}; + byte[] bo = new byte[3]; + simd.add(ba, bb, bo, 0, 3); + assertEquals(127, bo[0]); + assertEquals(127, bo[1]); + assertEquals(-128, bo[2]); + } + + @FormTest + void javaseRegistryGuardInSimulator() { + Simd 
simd = Simd.get(); + if (!simd.isSupported()) { /* nothing to check when the platform Simd reports no support */ + return; + } + + int[] regA = simd.allocInt(16); + int[] regB = simd.allocInt(16); + int[] regO = simd.allocInt(16); + simd.add(regA, regB, regO, 0, 16); + + if (CN.isSimulator()) { /* in the simulator, arrays created with plain `new` instead of simd.allocInt are expected to be rejected */ + int[] plainA = new int[16]; + int[] plainB = new int[16]; + int[] plainO = new int[16]; + Throwable t = assertThrows(IllegalArgumentException.class, () -> simd.add(plainA, plainB, plainO, 0, 16)); + assertTrue(t.getMessage().indexOf("Simd.alloc") >= 0); + } + } + + @FormTest + void genericBitwiseShiftCompareSelectOpsWork() { /* covers compare/select, bitwise, shift, pack/unpack, permute and lookup operations on the base Simd class */ + Simd simd = new Simd(); + + byte[] a = new byte[]{1, 2, 3, 4}; + byte[] b = new byte[]{3, 2, 1, 4}; + byte[] mask = new byte[4]; + byte[] outB = new byte[4]; + simd.cmpGt(a, b, mask, 0, 4); + simd.select(mask, a, b, outB, 0, 4); + assertEquals(3, outB[0]); + assertEquals(2, outB[1]); + assertEquals(3, outB[2]); + assertEquals(4, outB[3]); + + int[] ia = new int[]{0x0f0f0f0f, 8, -16, 7}; + int[] ib = new int[]{0x00ff00ff, 1, 2, 9}; + int[] io = new int[4]; + simd.and(ia, ib, io, 0, 4); + assertEquals(0x000f000f, io[0]); + simd.shrLogical(ia, 1, io, 0, 4); + assertEquals(4, io[1]); + simd.shrArithmetic(ia, 1, io, 0, 4); + assertEquals(-8, io[2]); + + byte[] intMask = new byte[4]; + simd.cmpLt(ia, ib, intMask, 0, 4); + simd.select(intMask, ia, ib, io, 0, 4); + assertEquals(0x00ff00ff, io[0]); + assertEquals(1, io[1]); + assertEquals(-16, io[2]); + assertEquals(7, io[3]); + + int[] unpack = new int[4]; + simd.unpackUnsignedByteToInt(new byte[]{-1, 0, 1, 127}, unpack, 0, 4); + assertEquals(255, unpack[0]); + assertEquals(127, unpack[3]); + + byte[] packed = new byte[4]; + simd.packIntToByteSaturating(new int[]{-129, -128, 127, 1000}, packed, 0, 4); + assertEquals(-128, packed[0]); + assertEquals(-128, packed[1]); + assertEquals(127, packed[2]); + assertEquals(127, packed[3]); + + byte[] permuted = new byte[4]; + simd.permuteBytes(new byte[]{10, 20, 30, 40}, new byte[]{3, 2, 1, -1}, permuted, 0, 4); + assertEquals(40, 
permuted[0]); + assertEquals(30, permuted[1]); + assertEquals(20, permuted[2]); + assertEquals(0, permuted[3]); + + byte[] lookedUp = new byte[4]; + simd.lookupBytes(new byte[]{11, 22, 33, 44}, new byte[]{3, 0, 2, 9}, lookedUp, 0, 4); /* index 9 is outside the 4-entry table and is expected to produce 0 */ + assertEquals(44, lookedUp[0]); + assertEquals(11, lookedUp[1]); + assertEquals(33, lookedUp[2]); + assertEquals(0, lookedUp[3]); + + byte[] unpacked0 = new byte[2]; + byte[] unpacked1 = new byte[2]; + byte[] unpacked2 = new byte[2]; + byte[] unpacked3 = new byte[2]; + int lookupOr = simd.unpackLookupBytesInterleaved4( + new byte[]{10, 20, 30, 40, -1}, + new byte[]{3, 1, 0, 2, 4, 0, 1, 2}, + 0, + unpacked0, + unpacked1, + unpacked2, + unpacked3, + 2); + assertEquals(40, unpacked0[0]); + assertEquals(20, unpacked1[0]); + assertEquals(10, unpacked2[0]); + assertEquals(30, unpacked3[0]); + assertEquals(-1, unpacked0[1]); + assertEquals(10, unpacked1[1]); + assertEquals(20, unpacked2[1]); + assertEquals(30, unpacked3[1]); + assertTrue(lookupOr < 0); + + byte[] offsetLookup = new byte[8]; + simd.lookupBytes(new byte[]{11, 22, 33, 44}, new byte[]{9, 9, 3, 0, 2, 9, 9, 9}, 2, offsetLookup, 1, 4); + assertEquals(44, offsetLookup[1]); + assertEquals(11, offsetLookup[2]); + assertEquals(33, offsetLookup[3]); + assertEquals(0, offsetLookup[4]); + + byte[] offsetBitwise = new byte[8]; + simd.and(new byte[]{0, (byte)0xF3, (byte)0xCC, 0, 0}, 1, new byte[]{0, (byte)0x3F, (byte)0x0F, 0, 0}, 1, offsetBitwise, 2, 2); + assertEquals((byte)0x33, offsetBitwise[2]); + assertEquals((byte)0x0C, offsetBitwise[3]); + simd.or(new byte[]{0, (byte)0xF0, (byte)0xC0, 0, 0}, 1, new byte[]{0, (byte)0x0F, (byte)0x0C, 0, 0}, 1, offsetBitwise, 4, 2); + assertEquals((byte)0xFF, offsetBitwise[4]); + assertEquals((byte)0xCC, offsetBitwise[5]); + } + + @FormTest + void base64SimdMethodsMatchScalar() { /* compares the Base64 SIMD encode/decode entry points (with and without the scratch argument) against the scalar ones; returns early when SIMD is unsupported */ + Simd simd = Simd.get(); + if (!simd.isSupported()) { + return; + } + + // Test that SIMD encode matches scalar encode + byte[] input = new byte[8192]; + for (int i = 0; i < 
input.length; i++) { + input[i] = (byte)(i * 31 + 17); + } + + int encodedLen = ((input.length + 2) / 3) * 4; + byte[] scalarEncoded = new byte[encodedLen]; + int scalarWritten = Base64.encodeNoNewline(input, scalarEncoded); + + byte[] simdInput = simd.allocByte(input.length); + System.arraycopy(input, 0, simdInput, 0, input.length); + byte[] simdEncoded = simd.allocByte(encodedLen); + int[] scratch = simd.allocInt(192); + int simdWritten = Base64.encodeNoNewlineSimd(simdInput, 0, simdInput.length, simdEncoded, 0); + + assertEquals(scalarWritten, simdWritten); + for (int i = 0; i < scalarWritten; i++) { + assertEquals(scalarEncoded[i], simdEncoded[i], "Encode mismatch at index " + i); + } + + // Test that SIMD decode matches scalar decode + byte[] scalarDecoded = new byte[input.length]; + int scalarDecLen = Base64.decode(scalarEncoded, scalarDecoded); + + byte[] simdDecoded = simd.allocByte(input.length); + int simdDecLen = Base64.decodeNoWhitespaceSimd(simdEncoded, 0, simdWritten, simdDecoded, 0); + + assertEquals(scalarDecLen, simdDecLen); + for (int i = 0; i < scalarDecLen; i++) { + assertEquals(scalarDecoded[i], simdDecoded[i], "Decode mismatch at index " + i); + } + + byte[] legacyEncoded = simd.allocByte(encodedLen); + byte[] legacyDecoded = simd.allocByte(input.length); + int legacyWritten = Base64.encodeNoNewlineSimd(simdInput, 0, simdInput.length, legacyEncoded, 0, scratch); + int legacyDecodedLen = Base64.decodeNoWhitespaceSimd(legacyEncoded, 0, legacyWritten, legacyDecoded, 0, scratch); + assertEquals(simdWritten, legacyWritten); + assertEquals(simdDecLen, legacyDecodedLen); + for (int i = 0; i < legacyWritten; i++) { + assertEquals(simdEncoded[i], legacyEncoded[i], "Legacy encode mismatch at index " + i); + } + for (int i = 0; i < legacyDecodedLen; i++) { + assertEquals(simdDecoded[i], legacyDecoded[i], "Legacy decode mismatch at index " + i); + } + } + + @FormTest + void byteShlAndShrLogicalWork() { /* byte shifts on the base Simd class: bits shifted out are dropped and shrLogical fills with zeros (0xAB >>> 4 gives 0x0A) */ + Simd simd = new Simd(); + byte[] src = new 
byte[]{(byte)0xAB, (byte)0x01, (byte)0xFF, (byte)0x80}; + byte[] dst = new byte[4]; + + simd.shl(src, 4, dst, 0, 4); + assertEquals((byte)0xB0, dst[0]); + assertEquals((byte)0x10, dst[1]); + assertEquals((byte)0xF0, dst[2]); + assertEquals((byte)0x00, dst[3]); + + simd.shrLogical(src, 4, dst, 0, 4); + assertEquals((byte)0x0A, dst[0]); + assertEquals((byte)0x00, dst[1]); + assertEquals((byte)0x0F, dst[2]); + assertEquals((byte)0x08, dst[3]); + + byte[] offsetDst = new byte[8]; + simd.shl(new byte[]{0, (byte)0xAB, (byte)0x01, (byte)0xFF, (byte)0x80, 0}, 1, 4, offsetDst, 2, 4); + assertEquals((byte)0xB0, offsetDst[2]); + assertEquals((byte)0x10, offsetDst[3]); + assertEquals((byte)0xF0, offsetDst[4]); + assertEquals((byte)0x00, offsetDst[5]); + + simd.shrLogical(new byte[]{0, (byte)0xAB, (byte)0x01, (byte)0xFF, (byte)0x80, 0}, 1, 4, offsetDst, 0, 4); + assertEquals((byte)0x0A, offsetDst[0]); + assertEquals((byte)0x00, offsetDst[1]); + assertEquals((byte)0x0F, offsetDst[2]); + assertEquals((byte)0x08, offsetDst[3]); + } + + @FormTest + void addWrappingAndSubWrappingWork() { /* results are expected to wrap modulo 256 rather than saturate */ + Simd simd = new Simd(); + byte[] a = new byte[]{(byte)200, (byte)100, (byte)0, (byte)255}; + byte[] b = new byte[]{(byte)100, (byte)200, (byte)1, (byte)1}; + byte[] out = new byte[4]; + + simd.addWrapping(a, b, out, 0, 4); + assertEquals((byte)44, out[0]); // 200+100=300 mod 256=44 + assertEquals((byte)44, out[1]); // 100+200=300 mod 256=44 + assertEquals((byte)1, out[2]); // 0+1=1 + assertEquals((byte)0, out[3]); // 255+1=256 mod 256=0 + + simd.subWrapping(a, b, out, 0, 4); + assertEquals((byte)100, out[0]); // 200-100=100 + assertEquals((byte)156, out[1]); // 100-200=-100 mod 256=156 + assertEquals((byte)255, out[2]); // 0-1=-1 mod 256=255 + assertEquals((byte)254, out[3]); // 255-1=254 + } + + @FormTest + void offsetBasedIntOpsWork() { + Simd simd = new Simd(); + + // Test offset-based unpack + byte[] bytes = new byte[]{10, 20, (byte)200, (byte)255}; + int[] ints = new int[8]; + 
simd.unpackUnsignedByteToInt(bytes, 0, ints, 4, 4); + assertEquals(10, ints[4]); + assertEquals(20, ints[5]); + assertEquals(200, ints[6]); + assertEquals(255, ints[7]); + + // Test offset-based add + int[] a = new int[]{0, 0, 5, 10, 15, 20}; + int[] b = new int[]{1, 2, 3, 4, 5, 6}; + int[] out = new int[6]; + simd.add(a, 2, b, 0, out, 1, 4); + assertEquals(6, out[1]); // a[2]+b[0] = 5+1 + assertEquals(12, out[2]); // a[3]+b[1] = 10+2 + assertEquals(18, out[3]); // a[4]+b[2] = 15+3 + assertEquals(24, out[4]); // a[5]+b[3] = 20+4 + + // Test offset-based cmpLt + int[] vals = new int[]{5, 15, 25, 35}; + int[] thresh = new int[]{10, 10, 10, 10}; + byte[] mask = new byte[4]; + simd.cmpLt(vals, 0, thresh, 0, mask, 0, 4); + assertEquals((byte)-1, mask[0]); + assertEquals((byte)0, mask[1]); + assertEquals((byte)0, mask[2]); + assertEquals((byte)0, mask[3]); + + // Test offset-based cmpEq + int[] vals2 = new int[]{10, 20, 10, 30}; + simd.cmpEq(vals2, 0, thresh, 0, mask, 0, 4); + assertEquals((byte)-1, mask[0]); + assertEquals((byte)0, mask[1]); + assertEquals((byte)-1, mask[2]); + assertEquals((byte)0, mask[3]); + + // Test offset-based select + int[] trueV = new int[]{100, 200, 300, 400}; + int[] falseV = new int[]{-1, -2, -3, -4}; + int[] result = new int[4]; + mask[0] = -1; mask[1] = 0; mask[2] = -1; mask[3] = 0; + simd.select(mask, 0, trueV, 0, falseV, 0, result, 0, 4); + assertEquals(100, result[0]); + assertEquals(-2, result[1]); + assertEquals(300, result[2]); + assertEquals(-4, result[3]); + + byte[] maskOut = new byte[4]; + simd.cmpEq(new byte[]{4, 5, 4, 6}, (byte)4, maskOut, 0, 4); + assertEquals((byte)-1, maskOut[0]); + assertEquals((byte)0, maskOut[1]); + assertEquals((byte)-1, maskOut[2]); + assertEquals((byte)0, maskOut[3]); + + simd.cmpLt(new byte[]{1, 2, 3, 4}, (byte)3, maskOut, 0, 4); + assertEquals((byte)-1, maskOut[0]); + assertEquals((byte)-1, maskOut[1]); + assertEquals((byte)0, maskOut[2]); + assertEquals((byte)0, maskOut[3]); + + byte[] wrapped = new 
byte[4]; + simd.addWrapping(new byte[]{1, 2, (byte)255, (byte)128}, (byte)2, wrapped, 0, 4); + assertEquals((byte)3, wrapped[0]); + assertEquals((byte)4, wrapped[1]); + assertEquals((byte)1, wrapped[2]); + assertEquals((byte)130, wrapped[3]); + + simd.subWrapping(new byte[]{1, 2, 0, (byte)128}, (byte)2, wrapped, 0, 4); + assertEquals((byte)255, wrapped[0]); + assertEquals((byte)0, wrapped[1]); + assertEquals((byte)254, wrapped[2]); + assertEquals((byte)126, wrapped[3]); + + int[] interleavedInts = new int[48]; + simd.unpackUnsignedByteToIntInterleaved3( + new byte[]{ + 10, 11, 12, + 20, 21, 22, + 30, 31, 32, + 40, 41, 42 + }, + 0, + interleavedInts, + 0, + 16, + 32, + 4); + assertEquals(10, interleavedInts[0]); + assertEquals(20, interleavedInts[1]); + assertEquals(41, interleavedInts[16 + 3]); + assertEquals(32, interleavedInts[32 + 2]); + + byte[] interleavedBytes = new byte[16]; + simd.packIntToByteTruncateInterleaved4( + new int[]{ + 65, 66, 67, 68, + 69, 70, 71, 72, + 73, 74, 75, 76, + 77, 78, 79, 80 + }, + 0, + 4, + 8, + 12, + interleavedBytes, + 0, + 4); + assertEquals((byte)65, interleavedBytes[0]); + assertEquals((byte)69, interleavedBytes[1]); + assertEquals((byte)73, interleavedBytes[2]); + assertEquals((byte)77, interleavedBytes[3]); + assertEquals((byte)68, interleavedBytes[12]); + assertEquals((byte)72, interleavedBytes[13]); + assertEquals((byte)76, interleavedBytes[14]); + assertEquals((byte)80, interleavedBytes[15]); + + byte[] stripe0 = new byte[4]; + byte[] stripe1 = new byte[4]; + byte[] stripe2 = new byte[4]; + byte[] stripe3 = new byte[4]; + simd.unpackBytesInterleaved3( + new byte[]{ + 10, 11, 12, + 20, 21, 22, + 30, 31, 32, + 40, 41, 42 + }, + 0, + stripe0, + stripe1, + stripe2, + 4); + assertEquals((byte)10, stripe0[0]); + assertEquals((byte)20, stripe0[1]); + assertEquals((byte)31, stripe1[2]); + assertEquals((byte)42, stripe2[3]); + + simd.unpackBytesInterleaved4( + new byte[]{ + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16 
+ }, + 0, + stripe0, + stripe1, + stripe2, + stripe3, + 4); + assertEquals((byte)1, stripe0[0]); + assertEquals((byte)5, stripe0[1]); + assertEquals((byte)10, stripe1[2]); + assertEquals((byte)15, stripe2[3]); + assertEquals((byte)16, stripe3[3]); + + byte[] slab = new byte[20]; + simd.unpackBytesInterleaved3( + new byte[]{ + 10, 11, 12, + 20, 21, 22, + 30, 31, 32, + 40, 41, 42 + }, + 0, + slab, + 1, + 6, + 11, + 4); + assertEquals((byte)10, slab[1]); + assertEquals((byte)20, slab[2]); + assertEquals((byte)31, slab[8]); + assertEquals((byte)42, slab[14]); + + simd.unpackBytesInterleaved4( + new byte[]{ + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16 + }, + 0, + slab, + 0, + 4, + 8, + 12, + 4); + assertEquals((byte)1, slab[0]); + assertEquals((byte)5, slab[1]); + assertEquals((byte)10, slab[6]); + assertEquals((byte)15, slab[11]); + assertEquals((byte)16, slab[15]); + + byte[] packed3 = new byte[12]; + simd.packBytesInterleaved3( + new byte[]{1, 5, 9, 13}, + new byte[]{2, 6, 10, 14}, + new byte[]{3, 7, 11, 15}, + packed3, + 0, + 4); + assertEquals((byte)1, packed3[0]); + assertEquals((byte)2, packed3[1]); + assertEquals((byte)3, packed3[2]); + assertEquals((byte)13, packed3[9]); + assertEquals((byte)14, packed3[10]); + assertEquals((byte)15, packed3[11]); + + byte[] packed4 = new byte[16]; + simd.packBytesInterleaved4( + new byte[]{1, 5, 9, 13}, + new byte[]{2, 6, 10, 14}, + new byte[]{3, 7, 11, 15}, + new byte[]{4, 8, 12, 16}, + packed4, + 0, + 4); + assertEquals((byte)1, packed4[0]); + assertEquals((byte)4, packed4[3]); + assertEquals((byte)9, packed4[8]); + assertEquals((byte)16, packed4[15]); + + byte[] packedFromSlab3 = new byte[12]; + simd.packBytesInterleaved3(new byte[]{ + 1, 5, 9, 13, + 2, 6, 10, 14, + 3, 7, 11, 15 + }, + 0, + 4, + 8, + packedFromSlab3, + 0, + 4); + assertEquals((byte)1, packedFromSlab3[0]); + assertEquals((byte)2, packedFromSlab3[1]); + assertEquals((byte)3, packedFromSlab3[2]); + assertEquals((byte)13, packedFromSlab3[9]); + 
assertEquals((byte)14, packedFromSlab3[10]); + assertEquals((byte)15, packedFromSlab3[11]); + + byte[] packedFromSlab4 = new byte[16]; + simd.packBytesInterleaved4(new byte[]{ + 1, 5, 9, 13, + 2, 6, 10, 14, + 3, 7, 11, 15, + 4, 8, 12, 16 + }, + 0, + 4, + 8, + 12, + packedFromSlab4, + 0, + 4); + assertEquals((byte)1, packedFromSlab4[0]); + assertEquals((byte)4, packedFromSlab4[3]); + assertEquals((byte)9, packedFromSlab4[8]); + assertEquals((byte)16, packedFromSlab4[15]); + } +} diff --git a/scripts/build-ios-app.sh b/scripts/build-ios-app.sh index c88bc3e795..0f3f020a52 100755 --- a/scripts/build-ios-app.sh +++ b/scripts/build-ios-app.sh @@ -91,6 +91,90 @@ mkdir -p "$ARTIFACTS_DIR" export CN1_BUILD_STATS_FILE="$ARTIFACTS_DIR/iphone-builder-stats.txt" +copy_tree_contents() { + local src="$1" + local dest="$2" + mkdir -p "$dest" + if command -v rsync >/dev/null 2>&1; then + rsync -a "$src"/ "$dest"/ + else + cp -R "$src"/. "$dest"/ + fi +} + +find_bytecode_translator_sources() { + local root="$1" + local best="" + local best_score=0 + local dir score m_count c_count h_count + + [ -d "$root" ] || return 1 + + while IFS= read -r dir; do + [ -d "$dir" ] || continue + + score=0 + [ -f "$dir/cn1_globals.m" ] && score=$((score + 100)) + [ -f "$dir/xmlvm.h" ] && score=$((score + 100)) + + m_count="$(find "$dir" -maxdepth 1 -type f -name '*.m' 2>/dev/null | wc -l | tr -d ' ')" + c_count="$(find "$dir" -maxdepth 1 -type f -name '*.c' 2>/dev/null | wc -l | tr -d ' ')" + h_count="$(find "$dir" -maxdepth 1 -type f -name '*.h' 2>/dev/null | wc -l | tr -d ' ')" + + score=$((score + m_count + c_count + h_count)) + + if [ "$score" -gt "$best_score" ]; then + best="$dir" + best_score="$score" + fi + done < <( + find "$root" -type d \ + ! -path '*/Pods/*' \ + ! -path '*/build/*' \ + ! -path '*/Build/*' \ + ! -path '*/DerivedData/*' \ + ! 
-path '*/xcuserdata/*' \ + 2>/dev/null + ) + + [ -n "$best" ] || return 1 + printf '%s\n' "$best" +} + +stage_bytecode_translator_sources() { + local project_dir="$1" + local artifacts_dir="$2" + + local bt_dir="" + local out_dir="$artifacts_dir/bytecode-translator-sources" + local zip_file="$artifacts_dir/bytecode-translator-sources.zip" + local listing_file="$artifacts_dir/bytecode-translator-files.txt" + + bt_dir="$(find_bytecode_translator_sources "$project_dir" || true)" + if [ -z "$bt_dir" ]; then + bia_log "ByteCodeTranslator source directory not found under $project_dir" + return 0 + fi + + bia_log "Detected ByteCodeTranslator sources at $bt_dir" + + rm -rf "$out_dir" "$zip_file" + mkdir -p "$out_dir" + + copy_tree_contents "$bt_dir" "$out_dir" + + find "$out_dir" -maxdepth 2 -type f \( -name '*.m' -o -name '*.c' -o -name '*.h' \) \ + | sort > "$listing_file" || true + + ( + cd "$artifacts_dir" + zip -qry "$(basename "$zip_file")" "$(basename "$out_dir")" + ) + + bia_log "Staged ByteCodeTranslator sources in $out_dir" + bia_log "Created archive $zip_file" +} + bia_log "Running HelloCodenameOne Maven build with JAVA_HOME=$JAVA17_HOME" ( export JAVA_HOME="$JAVA17_HOME" @@ -162,6 +246,8 @@ if [ -z "$PROJECT_DIR" ]; then fi bia_log "Found generated iOS project at $PROJECT_DIR" +stage_bytecode_translator_sources "$PROJECT_DIR" "$ARTIFACTS_DIR" + if [ -f "$PROJECT_DIR/Podfile" ]; then if ! command -v pod >/dev/null 2>&1; then bia_log "Generated project requires CocoaPods but the pod command is not installed." 
>&2 diff --git a/scripts/common/java/RenderScreenshotReport.java b/scripts/common/java/RenderScreenshotReport.java index 10d72106ae..e2d128aff6 100644 --- a/scripts/common/java/RenderScreenshotReport.java +++ b/scripts/common/java/RenderScreenshotReport.java @@ -32,21 +32,24 @@ public static void main(String[] args) throws Exception { CoverageSummary coverage = loadCoverage(arguments.coverageSummary, arguments.coverageHtmlUrl); - Map extraStats = new LinkedHashMap<>(); + Map benchmarkStats = new LinkedHashMap<>(); + Map timingStats = new LinkedHashMap<>(); if (arguments.extraStats != null) { for (Path p : arguments.extraStats) { if (Files.isRegularFile(p)) { - parseStatsFile(p, extraStats); + parseStatsFile(p, benchmarkStats, timingStats); } } } - SummaryAndComment output = buildSummaryAndComment(data, title, marker, successMessage, coverage, arguments.vmTime, arguments.compilationTime, extraStats); + SummaryAndComment output = buildSummaryAndComment(data, title, marker, successMessage, coverage, + arguments.vmTime, arguments.compilationTime, benchmarkStats, timingStats); writeLines(arguments.summaryOut, output.summaryLines); writeLines(arguments.commentOut, output.commentLines); } - private static void parseStatsFile(Path p, Map extraStats) { + private static void parseStatsFile(Path p, Map benchmarkStats, Map timingStats) { + Map target = isBenchmarkStatsFile(p) ? 
benchmarkStats : timingStats; try { List lines = Files.readAllLines(p, StandardCharsets.UTF_8); for (String line : lines) { @@ -58,7 +61,7 @@ private static void parseStatsFile(Path p, Map extraStats) { if (colon > 0) { String key = line.substring(0, colon).trim(); String val = line.substring(colon + 1).trim(); - extraStats.put(key, val); + target.put(key, val); } } } catch (IOException e) { @@ -66,6 +69,11 @@ private static void parseStatsFile(Path p, Map extraStats) { } } + private static boolean isBenchmarkStatsFile(Path p) { + Path name = p.getFileName(); + return name != null && "base64-performance-stats.txt".equals(name.toString()); + } + private static void writeLines(Path path, List lines) throws IOException { StringBuilder sb = new StringBuilder(); for (int i = 0; i < lines.size(); i++) { @@ -80,7 +88,11 @@ private static void writeLines(Path path, List lines) throws IOException Files.writeString(path, sb.toString(), StandardCharsets.UTF_8); } - private static SummaryAndComment buildSummaryAndComment(Map data, String title, String marker, String successMessage, CoverageSummary coverage, Long vmTime, Long compilationTime, Map extraStats) { + private static SummaryAndComment buildSummaryAndComment(Map data, String title, String marker, + String successMessage, CoverageSummary coverage, + Long vmTime, Long compilationTime, + Map benchmarkStats, + Map timingStats) { List summaryLines = new ArrayList<>(); List commentLines = new ArrayList<>(); Object resultsObj = data.get("results"); @@ -190,7 +202,7 @@ private static SummaryAndComment buildSummaryAndComment(Map data } // Add benchmark results at the end - appendBenchmarkResults(commentLines, vmTime, compilationTime, extraStats); + appendBenchmarkResults(commentLines, vmTime, compilationTime, benchmarkStats, timingStats); if (commentLines.isEmpty() || (commentLines.size() == 1 && commentLines.get(0).isEmpty())) { // This fallback block might be redundant now, but kept for safety. 
@@ -260,8 +272,11 @@ private static void appendCoverageSummary(List summaryLines, CoverageSum } } - private static void appendBenchmarkResults(List commentLines, Long vmTime, Long compilationTime, Map extraStats) { - if (vmTime == null && compilationTime == null && (extraStats == null || extraStats.isEmpty())) { + private static void appendBenchmarkResults(List commentLines, Long vmTime, Long compilationTime, + Map benchmarkStats, Map timingStats) { + if (vmTime == null && compilationTime == null + && (benchmarkStats == null || benchmarkStats.isEmpty()) + && (timingStats == null || timingStats.isEmpty())) { return; } if (!commentLines.isEmpty() && !commentLines.get(commentLines.size() - 1).isEmpty()) { @@ -275,12 +290,21 @@ private static void appendBenchmarkResults(List commentLines, Long vmTim if (compilationTime != null) { commentLines.add(String.format("- **Compilation Time:** %d seconds", compilationTime)); } - if (extraStats != null && !extraStats.isEmpty()) { + if (timingStats != null && !timingStats.isEmpty()) { + commentLines.add(""); + commentLines.add("#### Build and Run Timing"); + commentLines.add("| Metric | Duration |"); + commentLines.add("| --- | --- |"); + for (Map.Entry entry : timingStats.entrySet()) { + commentLines.add(String.format("| %s | %s |", entry.getKey(), entry.getValue())); + } + } + if (benchmarkStats != null && !benchmarkStats.isEmpty()) { commentLines.add(""); commentLines.add("#### Detailed Performance Metrics"); commentLines.add("| Metric | Duration |"); commentLines.add("| --- | --- |"); - for (Map.Entry entry : extraStats.entrySet()) { + for (Map.Entry entry : benchmarkStats.entrySet()) { commentLines.add(String.format("| %s | %s |", entry.getKey(), entry.getValue())); } } diff --git a/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Base64NativePerformanceTest.java 
b/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Base64NativePerformanceTest.java index 75b63cfc9d..31b5147627 100644 --- a/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Base64NativePerformanceTest.java +++ b/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Base64NativePerformanceTest.java @@ -2,13 +2,20 @@ import com.codename1.system.NativeLookup; import com.codename1.ui.Display; +import com.codename1.ui.Image; +import com.codename1.ui.util.ImageIO; import com.codenameone.examples.hellocodenameone.Base64Native; import com.codename1.util.Base64; +import com.codename1.util.Simd; +import java.io.ByteArrayOutputStream; +import java.io.IOException; public class Base64NativePerformanceTest extends BaseTest { private static final int PAYLOAD_BYTES = 8192; private static final int ITERATIONS = 6000; + private static final int IMAGE_BENCHMARK_ITERATIONS = 250; + private static final int IMAGE_BENCHMARK_SIZE = 96; @Override public boolean shouldTakeScreenshot() { @@ -17,80 +24,203 @@ public boolean shouldTakeScreenshot() { @Override public boolean runTest() { - Base64Native nativeBase64 = NativeLookup.create(Base64Native.class); - if (nativeBase64 == null || !nativeBase64.isSupported()) { - System.out.println("CN1SS:STAT:Base64 benchmark status: skipped (native base64 bridge unavailable)"); - done(); - return true; - } + try { + Base64Native nativeBase64 = NativeLookup.create(Base64Native.class); + if (nativeBase64 == null || !nativeBase64.isSupported()) { + emitStat("Base64 benchmark status", "skipped (native base64 bridge unavailable)"); + done(); + return true; + } - String payload = buildPayload(); - String nativeEncoded = nativeBase64.encodeUtf8(payload); - if (nativeEncoded == null || nativeEncoded.length() == 0) { - fail("Native Base64 encode returned empty result"); - return false; - } + String payload = buildPayload(); + String 
nativeEncoded = nativeBase64.encodeUtf8(payload); + if (nativeEncoded == null || nativeEncoded.length() == 0) { + fail("Native Base64 encode returned empty result"); + return false; + } - byte[] payloadBytes; - try { - payloadBytes = payload.getBytes("UTF-8"); - } catch (Exception ex) { - fail("Failed to encode payload to UTF-8: " + ex); - return false; - } + byte[] payloadBytes; + try { + payloadBytes = payload.getBytes("UTF-8"); + } catch (Exception ex) { + fail("Failed to encode payload to UTF-8: " + ex); + return false; + } - String cn1Encoded = Base64.encodeNoNewline(payloadBytes); - String nativeDecoded = nativeBase64.decodeToUtf8(nativeEncoded); - if (!payload.equals(nativeDecoded)) { - fail("Native Base64 decode mismatch"); - return false; - } + String cn1Encoded = Base64.encodeNoNewline(payloadBytes); + String nativeDecoded = nativeBase64.decodeToUtf8(nativeEncoded); + if (!payload.equals(nativeDecoded)) { + fail("Native Base64 decode mismatch"); + return false; + } - String cn1Decoded = decodeUtf8(cn1Encoded); - if (!payload.equals(cn1Decoded)) { - fail("CN1 Base64 decode mismatch"); - return false; - } + String cn1Decoded = decodeUtf8(cn1Encoded); + if (!payload.equals(cn1Decoded)) { + fail("CN1 Base64 decode mismatch"); + return false; + } - int encodedLen = ((payloadBytes.length + 2) / 3) * 4; - byte[] cn1EncodedBytes = new byte[encodedLen]; - int encodedWritten = Base64.encodeNoNewline(payloadBytes, cn1EncodedBytes); - if (encodedWritten != encodedLen) { - fail("CN1 preallocated Base64 encode returned unexpected length"); - return false; - } - byte[] cn1DecodedBuffer = new byte[payloadBytes.length]; + int encodedLen = ((payloadBytes.length + 2) / 3) * 4; + byte[] cn1EncodedBytes = new byte[encodedLen]; + int encodedWritten = Base64.encodeNoNewline(payloadBytes, cn1EncodedBytes); + if (encodedWritten != encodedLen) { + fail("CN1 preallocated Base64 encode returned unexpected length"); + return false; + } + byte[] cn1DecodedBuffer = new 
byte[payloadBytes.length]; + boolean ios = isIos(); + Simd simd = Simd.get(); + boolean runSimdBenchmark = false; + String simdStatus = null; + Throwable simdFailure = null; + byte[] simdPayloadBytes = null; + byte[] simdEncodedBytes = null; + byte[] simdDecodedBuffer = null; + if (ios) { + if (simd == null) { + simdStatus = "unavailable (Simd.get() returned null)"; + } else if (!simd.isSupported()) { + simdStatus = "unsupported on this iOS runtime"; + } else { + try { + simdPayloadBytes = simd.allocByte(payloadBytes.length); + System.arraycopy(payloadBytes, 0, simdPayloadBytes, 0, payloadBytes.length); + simdEncodedBytes = simd.allocByte(encodedLen); + simdDecodedBuffer = simd.allocByte(payloadBytes.length); - if (!isIos()) { - warmup(nativeBase64, payload, payloadBytes, nativeEncoded, cn1EncodedBytes, cn1DecodedBuffer); - } + int simdEncodedWritten = Base64.encodeNoNewlineSimd(simdPayloadBytes, 0, simdPayloadBytes.length, simdEncodedBytes, 0); + if (simdEncodedWritten != encodedLen) { + simdStatus = "unavailable (unexpected SIMD encode length " + simdEncodedWritten + ")"; + } else if (!byteArraysEqual(cn1EncodedBytes, simdEncodedBytes, encodedLen)) { + simdStatus = "unavailable (SIMD encode mismatch)"; + } else { + int simdDecodedWritten = Base64.decodeNoWhitespaceSimd(simdEncodedBytes, 0, encodedLen, simdDecodedBuffer, 0); + if (simdDecodedWritten != payloadBytes.length) { + simdStatus = "unavailable (unexpected SIMD decode length " + simdDecodedWritten + ")"; + } else if (!byteArraysEqual(payloadBytes, simdDecodedBuffer, payloadBytes.length)) { + simdStatus = "unavailable (SIMD decode mismatch)"; + } else { + runSimdBenchmark = true; + } + } + } catch (Throwable t) { + simdFailure = t; + simdStatus = "failed (" + formatThrowable(t) + ")"; + logThrowable("CN1SS:ERR:Base64 SIMD benchmark exception", t); + } + } + } - long nativeEncodeMs = measureNativeEncode(nativeBase64, payload); - long cn1EncodeMs = measureCn1Encode(payloadBytes, cn1EncodedBytes); - long 
nativeDecodeMs = measureNativeDecode(nativeBase64, nativeEncoded); - long cn1DecodeMs = measureCn1Decode(cn1EncodedBytes, cn1DecodedBuffer); + if (!ios) { + warmup(nativeBase64, payload, payloadBytes, nativeEncoded, cn1EncodedBytes, cn1DecodedBuffer, + false, simdPayloadBytes, simdEncodedBytes, simdDecodedBuffer, encodedLen); + } + if (runSimdBenchmark) { + warmup(nativeBase64, payload, payloadBytes, nativeEncoded, cn1EncodedBytes, cn1DecodedBuffer, + true, simdPayloadBytes, simdEncodedBytes, simdDecodedBuffer, encodedLen); + } - double encodeRatio = cn1EncodeMs / Math.max(1.0, (double) nativeEncodeMs); - double decodeRatio = cn1DecodeMs / Math.max(1.0, (double) nativeDecodeMs); - emitStat("Base64 payload size", payloadBytes.length + " bytes"); - emitStat("Base64 benchmark iterations", String.valueOf(ITERATIONS)); - emitStat("Base64 native encode", formatMs(nativeEncodeMs)); - emitStat("Base64 CN1 encode", formatMs(cn1EncodeMs)); - emitStat("Base64 encode ratio (CN1/native)", formatRatio(encodeRatio)); - emitStat("Base64 native decode", formatMs(nativeDecodeMs)); - emitStat("Base64 CN1 decode", formatMs(cn1DecodeMs)); - emitStat("Base64 decode ratio (CN1/native)", formatRatio(decodeRatio)); + long nativeEncodeMs = measureNativeEncode(nativeBase64, payload); + long cn1EncodeMs = measureCn1Encode(payloadBytes, cn1EncodedBytes); + long nativeDecodeMs = measureNativeDecode(nativeBase64, nativeEncoded); + long cn1DecodeMs = measureCn1Decode(cn1EncodedBytes, cn1DecodedBuffer); + long simdEncodeMs = runSimdBenchmark ? measureSimdEncode(simdPayloadBytes, simdEncodedBytes) : -1; + long simdDecodeMs = runSimdBenchmark ? 
measureSimdDecode(simdEncodedBytes, simdDecodedBuffer) : -1; - done(); - return true; + double encodeRatio = cn1EncodeMs / Math.max(1.0, (double) nativeEncodeMs); + double decodeRatio = cn1DecodeMs / Math.max(1.0, (double) nativeDecodeMs); + emitStat("Base64 payload size", payloadBytes.length + " bytes"); + emitStat("Base64 benchmark iterations", String.valueOf(ITERATIONS)); + emitStat("Base64 native encode", formatMs(nativeEncodeMs)); + emitStat("Base64 CN1 encode", formatMs(cn1EncodeMs)); + emitStat("Base64 encode ratio (CN1/native)", formatRatio(encodeRatio)); + emitStat("Base64 native decode", formatMs(nativeDecodeMs)); + emitStat("Base64 CN1 decode", formatMs(cn1DecodeMs)); + emitStat("Base64 decode ratio (CN1/native)", formatRatio(decodeRatio)); + if (runSimdBenchmark) { + double simdEncodeRatioVsNative = simdEncodeMs / Math.max(1.0, (double) nativeEncodeMs); + double simdDecodeRatioVsNative = simdDecodeMs / Math.max(1.0, (double) nativeDecodeMs); + double simdEncodeRatioVsCn1 = simdEncodeMs / Math.max(1.0, (double) cn1EncodeMs); + double simdDecodeRatioVsCn1 = simdDecodeMs / Math.max(1.0, (double) cn1DecodeMs); + emitStat("Base64 SIMD encode", formatMs(simdEncodeMs)); + emitStat("Base64 encode ratio (SIMD/native)", formatRatio(simdEncodeRatioVsNative)); + emitStat("Base64 encode ratio (SIMD/CN1)", formatRatio(simdEncodeRatioVsCn1)); + emitStat("Base64 SIMD decode", formatMs(simdDecodeMs)); + emitStat("Base64 decode ratio (SIMD/native)", formatRatio(simdDecodeRatioVsNative)); + emitStat("Base64 decode ratio (SIMD/CN1)", formatRatio(simdDecodeRatioVsCn1)); + } else if (simdStatus != null) { + emitStat("Base64 SIMD benchmark status", simdStatus); + } + + if (simd != null && simd.isSupported()) { + ImageIO imageIo = ImageIO.getImageIO(); + if (imageIo == null) { + emitStat("Image encode benchmark status", "skipped (ImageIO unavailable)"); + } else if (!imageIo.isFormatSupported(ImageIO.FORMAT_PNG)) { + emitStat("Image encode benchmark status", "skipped (PNG 
unsupported)"); + } else { + Image benchmarkImage = buildBenchmarkImage(IMAGE_BENCHMARK_SIZE, IMAGE_BENCHMARK_SIZE, false); + Image benchmarkMaskImage = buildBenchmarkImage(IMAGE_BENCHMARK_SIZE, IMAGE_BENCHMARK_SIZE, true); + warmupImageEncode(imageIo, benchmarkImage, benchmarkMaskImage); + long pngScalarMs = measureImageEncode(imageIo, benchmarkImage, benchmarkMaskImage, ImageIO.FORMAT_PNG, 1f, false); + long pngSimdMs = measureImageEncode(imageIo, benchmarkImage, benchmarkMaskImage, ImageIO.FORMAT_PNG, 1f, true); + emitStat("Image encode benchmark iterations", String.valueOf(IMAGE_BENCHMARK_ITERATIONS)); + emitStat("Image PNG encode (SIMD off)", formatMs(pngScalarMs)); + emitStat("Image PNG encode (SIMD on)", formatMs(pngSimdMs)); + emitStat("Image PNG encode ratio (SIMD on/off)", formatRatio(pngSimdMs, pngScalarMs)); + if (imageIo.isFormatSupported(ImageIO.FORMAT_JPEG)) { + long jpegScalarMs = measureImageEncode(imageIo, benchmarkImage, benchmarkMaskImage, ImageIO.FORMAT_JPEG, 0.82f, false); + long jpegSimdMs = measureImageEncode(imageIo, benchmarkImage, benchmarkMaskImage, ImageIO.FORMAT_JPEG, 0.82f, true); + emitStat("Image JPEG encode (SIMD off)", formatMs(jpegScalarMs)); + emitStat("Image JPEG encode (SIMD on)", formatMs(jpegSimdMs)); + emitStat("Image JPEG encode ratio (SIMD on/off)", formatRatio(jpegSimdMs, jpegScalarMs)); + } else { + emitStat("Image JPEG encode benchmark status", "skipped (JPEG unsupported)"); + } + } + } else { + emitStat("Image encode benchmark status", "skipped (SIMD unsupported)"); + } + + if (simdFailure != null) { + fail("Base64 SIMD benchmark failed: " + formatThrowable(simdFailure)); + return false; + } + + done(); + return true; + } catch (Throwable t) { + emitStat("Base64 benchmark status", "failed (" + formatThrowable(t) + ")"); + logThrowable("CN1SS:ERR:Base64 benchmark exception", t); + fail("Base64 benchmark failed: " + t); + return false; + } } - private static void warmup(Base64Native nativeBase64, String payload, byte[] 
payloadBytes, String nativeEncoded, byte[] cn1EncodedBytes, byte[] cn1DecodedBuffer) { + private static void warmup(Base64Native nativeBase64, String payload, byte[] payloadBytes, String nativeEncoded, byte[] cn1EncodedBytes, + byte[] cn1DecodedBuffer, boolean includeSimd, byte[] simdPayloadBytes, byte[] simdEncodedBytes, + byte[] simdDecodedBuffer, int encodedLen) { for (int i = 0; i < 40; i++) { nativeBase64.encodeUtf8(payload); - Base64.encodeNoNewline(payloadBytes, cn1EncodedBytes); + int cn1EncodedWritten = Base64.encodeNoNewline(payloadBytes, cn1EncodedBytes); + if (cn1EncodedWritten != encodedLen) { + throw new IllegalStateException("Warmup CN1 encode length mismatch"); + } nativeBase64.decodeToUtf8(nativeEncoded); - Base64.decode(cn1EncodedBytes, cn1DecodedBuffer); + int cn1DecodedWritten = Base64.decode(cn1EncodedBytes, cn1DecodedBuffer); + if (cn1DecodedWritten != payloadBytes.length || !byteArraysEqual(payloadBytes, cn1DecodedBuffer, payloadBytes.length)) { + throw new IllegalStateException("Warmup CN1 decode mismatch"); + } + if (includeSimd) { + int simdEncodedWritten = Base64.encodeNoNewlineSimd(simdPayloadBytes, 0, simdPayloadBytes.length, simdEncodedBytes, 0); + if (simdEncodedWritten != encodedLen || !byteArraysEqual(cn1EncodedBytes, simdEncodedBytes, encodedLen)) { + throw new IllegalStateException("Warmup SIMD encode mismatch"); + } + int simdDecodedWritten = Base64.decodeNoWhitespaceSimd(simdEncodedBytes, 0, encodedLen, simdDecodedBuffer, 0); + if (simdDecodedWritten != payloadBytes.length || !byteArraysEqual(payloadBytes, simdDecodedBuffer, payloadBytes.length)) { + throw new IllegalStateException("Warmup SIMD decode mismatch"); + } + } } } @@ -126,6 +256,71 @@ private static long measureCn1Decode(byte[] encoded, byte[] outputBuffer) { return System.currentTimeMillis() - start; } + private static long measureSimdEncode(byte[] payloadBytes, byte[] outputBuffer) { + long start = System.currentTimeMillis(); + for (int i = 0; i < ITERATIONS; i++) { 
+ Base64.encodeNoNewlineSimd(payloadBytes, 0, payloadBytes.length, outputBuffer, 0); + } + return System.currentTimeMillis() - start; + } + + private static long measureSimdDecode(byte[] encoded, byte[] outputBuffer) { + long start = System.currentTimeMillis(); + for (int i = 0; i < ITERATIONS; i++) { + Base64.decodeNoWhitespaceSimd(encoded, 0, encoded.length, outputBuffer, 0); + } + return System.currentTimeMillis() - start; + } + + private static void warmupImageEncode(ImageIO imageIo, Image benchmarkImage, Image benchmarkMaskImage) throws IOException { + measureImageEncode(imageIo, benchmarkImage, benchmarkMaskImage, ImageIO.FORMAT_PNG, 1f, false, 20); + measureImageEncode(imageIo, benchmarkImage, benchmarkMaskImage, ImageIO.FORMAT_PNG, 1f, true, 20); + } + + private static long measureImageEncode(ImageIO imageIo, Image benchmarkImage, Image benchmarkMaskImage, + String format, float quality, boolean enableSimd) throws IOException { + return measureImageEncode(imageIo, benchmarkImage, benchmarkMaskImage, format, quality, enableSimd, IMAGE_BENCHMARK_ITERATIONS); + } + + private static long measureImageEncode(ImageIO imageIo, Image benchmarkImage, Image benchmarkMaskImage, + String format, float quality, boolean enableSimd, int iterations) throws IOException { + boolean originalSimd = Image.isSimdOptimizationsEnabled(); + try { + Image.setSimdOptimizationsEnabled(enableSimd); + long start = System.currentTimeMillis(); + for (int i = 0; i < iterations; i++) { + Image alphaAdjusted = benchmarkImage.modifyAlpha((byte) 0x90); + Object mask = benchmarkMaskImage.createMask(); + Image masked = alphaAdjusted.applyMask(mask); + ByteArrayOutputStream out = new ByteArrayOutputStream(4096); + imageIo.save(masked, out, format, quality); + } + return System.currentTimeMillis() - start; + } finally { + Image.setSimdOptimizationsEnabled(originalSimd); + } + } + + private static Image buildBenchmarkImage(int width, int height, boolean maskImage) { + int[] rgb = new int[width * 
height]; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + int offset = x + y * width; + int alpha; + if (maskImage) { + alpha = 0xff; + } else { + alpha = 80 + ((x * 17 + y * 29) & 0x7f); + } + int red = (x * 13 + y * 7) & 0xff; + int green = (x * 5 + y * 19) & 0xff; + int blue = (x * 23 + y * 11) & 0xff; + rgb[offset] = (alpha << 24) | (red << 16) | (green << 8) | blue; + } + } + return Image.createImage(rgb, width, height); + } + private static String decodeUtf8(String base64) { try { return new String(Base64.decode(base64.getBytes()), "UTF-8"); @@ -147,6 +342,21 @@ private static boolean isIos() { return platformName != null && platformName.toLowerCase().contains("ios"); } + private static boolean byteArraysEqual(byte[] a, byte[] b, int len) { + if (a == b) { + return true; + } + if (a == null || b == null || a.length < len || b.length < len) { + return false; + } + for (int i = 0; i < len; i++) { + if (a[i] != b[i]) { + return false; + } + } + return true; + } + private static String formatMs(double millis) { return formatDecimal(millis, 3) + " ms"; } @@ -156,6 +366,13 @@ private static String formatRatio(double ratio) { return formatDecimal(ratio, 3) + "x (" + formatDecimal(Math.abs(slowerPct), 1) + "% " + (slowerPct >= 0 ? "slower" : "faster") + ")"; } + private static String formatRatio(long value, long reference) { + if (reference <= 0) { + return "N/A (reference time was 0ms)"; + } + return formatRatio(value / (double) reference); + } + private static String formatDecimal(double value, int decimals) { boolean negative = value < 0; double abs = Math.abs(value); @@ -174,6 +391,43 @@ private static String formatDecimal(double value, int decimals) { return negative ? 
"-" + formatted : formatted; } + private static String formatThrowable(Throwable t) { + if (t == null) { + return "unknown error"; + } + String type = t.getClass().getSimpleName(); + String message = t.getMessage(); + if (message == null || message.length() == 0) { + return type; + } + return type + ": " + message; + } + + private static void logThrowable(String prefix, Throwable t) { + if (t == null) { + System.out.println(prefix + "=unknown error"); + return; + } + System.out.println(prefix + "=" + t); + StackTraceElement[] stack = t.getStackTrace(); + if (stack == null) { + return; + } + for (StackTraceElement element : stack) { + System.out.println("CN1SS:ERR:Stack:" + element); + } + Throwable cause = t.getCause(); + if (cause != null && cause != t) { + System.out.println("CN1SS:ERR:Cause=" + cause); + StackTraceElement[] causeStack = cause.getStackTrace(); + if (causeStack != null) { + for (StackTraceElement element : causeStack) { + System.out.println("CN1SS:ERR:CauseStack:" + element); + } + } + } + } + private static void emitStat(String metric, String value) { System.out.println("CN1SS:STAT:" + metric + ": " + value); } diff --git a/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Cn1ssDeviceRunner.java b/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Cn1ssDeviceRunner.java index 7eb91d2869..5cd364a492 100644 --- a/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Cn1ssDeviceRunner.java +++ b/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/Cn1ssDeviceRunner.java @@ -80,6 +80,7 @@ public final class Cn1ssDeviceRunner extends DeviceRunner { new OrientationLockScreenshotTest(), new InPlaceEditViewTest(), new BytecodeTranslatorRegressionTest(), + new SimdApiTest(), new StreamApiTest(), new TimeApiTest(), new Java17Tests(), diff --git 
a/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/SimdApiTest.java b/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/SimdApiTest.java new file mode 100644 index 0000000000..36c07f55a6 --- /dev/null +++ b/scripts/hellocodenameone/common/src/main/java/com/codenameone/examples/hellocodenameone/tests/SimdApiTest.java @@ -0,0 +1,94 @@ +package com.codenameone.examples.hellocodenameone.tests; + +import com.codename1.ui.CN; +import com.codename1.util.Simd; + +/** Device-runner test for the Simd instance returned by Simd.get(): checks int add, float mul and byte add on arrays obtained from allocInt, allocFloat and allocByte. */ public class SimdApiTest extends BaseTest { + /** Runs the checks; calls fail and returns false on the first mismatch or on any Throwable, otherwise calls done and returns true. */ @Override + public boolean runTest() { + try { + Simd simd = Simd.get(); + /* Unsupported platform: only verify that add still works on plain arrays, then finish early. */ if (!simd.isSupported()) { + int[] a = new int[]{1, 2, 3, 4}; + int[] b = new int[]{9, 8, 7, 6}; + int[] out = new int[4]; + simd.add(a, b, out, 0, 4); + if (out[0] != 10 || out[1] != 10 || out[2] != 10 || out[3] != 10) { + fail("Fallback SIMD API add failed on unsupported platform"); + return false; + } + done(); + return true; + } + + /* Only the first 8 of the 16 allocated ints are filled and added; every pair sums to 10. */ int[] a = simd.allocInt(16); + int[] b = simd.allocInt(16); + int[] out = simd.allocInt(16); + for (int i = 0; i < 8; i++) { + a[i] = i + 1; + b[i] = 9 - i; + } + simd.add(a, b, out, 0, 8); + for (int i = 0; i < 8; i++) { + if (out[i] != 10) { + fail("Unexpected int add result at " + i + ": " + out[i]); + return false; + } + } + + /* Float products are compared with a 0.0001 tolerance rather than exact equality. */ float[] fa = simd.allocFloat(16); + float[] fb = simd.allocFloat(16); + float[] fo = simd.allocFloat(16); + fa[0] = 1.5f; + fa[1] = -2f; + fa[2] = 3f; + fa[3] = -4f; + fb[0] = 2f; + fb[1] = 3f; + fb[2] = -1f; + fb[3] = 0.5f; + simd.mul(fa, fb, fo, 0, 4); + if (Math.abs(fo[0] - 3f) > 0.0001f || Math.abs(fo[1] + 6f) > 0.0001f + || Math.abs(fo[2] + 3f) > 0.0001f || Math.abs(fo[3] + 2f) > 0.0001f) { + fail("Unexpected float mul results"); + return false; + } + + /* The expected values 127 and -128 mean byte add must clamp to the byte range instead of wrapping. */ byte[] ba = simd.allocByte(16); + byte[] bb = simd.allocByte(16); + byte[] bo = simd.allocByte(16); + ba[0] = 120; + ba[1] = 10; + ba[2] = -120; + bb[0] = 20; + bb[1] = -40; + bb[2] = -20; +
simd.add(ba, bb, bo, 0, 3); + if (bo[0] != 127 || bo[1] != -30 || bo[2] != -128) { + fail("Unexpected saturating byte add results"); + return false; + } + + /* In the simulator, arrays created with new instead of the alloc methods are expected to be rejected with IllegalArgumentException. */ if (CN.isSimulator()) { + try { + simd.add(new int[4], new int[4], new int[4], 0, 4); + fail("Expected simulator registry guard to reject non-alloc arrays"); + return false; + } catch (IllegalArgumentException expected) { + // expected + } + } + + done(); + return true; + } catch (Throwable t) { + fail("SimdApiTest failed: " + t); + return false; + } + } + + /** Returns false: this test does not request a screenshot. */ @Override + public boolean shouldTakeScreenshot() { + return false; + } +} diff --git a/scripts/run-ios-ui-tests.sh b/scripts/run-ios-ui-tests.sh index 12789aa059..935d690d34 100755 --- a/scripts/run-ios-ui-tests.sh +++ b/scripts/run-ios-ui-tests.sh @@ -7,21 +7,29 @@ ri_log() { echo "[run-ios-ui-tests] $1"; } ensure_dir() { mkdir -p "$1" 2>/dev/null || true; } extract_base64_stats() { - local log_file="$1" - local out_file="$2" - [ -f "$log_file" ] || return 0 - - local lines - lines="$(grep 'CN1SS:STAT:' "$log_file" 2>/dev/null | sed -E 's/^.*CN1SS:STAT://')" || true - if [ -z "${lines:-}" ]; then - return 0 - fi + local out_file="$1" + shift + local log_file lines found=0 : > "$out_file" - while IFS= read -r line; do - [ -n "$line" ] || continue - echo "$line" >> "$out_file" - done <<< "$lines" + for log_file in "$@"; do + [ -f "$log_file" ] || continue + lines="$(grep 'CN1SS:STAT:' "$log_file" 2>/dev/null | sed -E 's/^.*CN1SS:STAT://')" || true + if [ -z "${lines:-}" ]; then + continue + fi + found=1 + while IFS= read -r line; do + [ -n "$line" ] || continue + echo "$line" >> "$out_file" + done <<< "$lines" + done + + if [ "$found" -eq 1 ] && [ -f "$out_file" ]; then + awk '!seen[$0]++' "$out_file" > "$out_file.tmp" && mv "$out_file.tmp" "$out_file" + else + rm -f "$out_file" + fi } if [ $# -lt 1 ]; then @@ -672,11 +680,6 @@ while true; do done END_TIME=$(date +%s) echo "Test Execution : $(( (END_TIME - START_TIME) * 1000 )) ms" >>
"$ARTIFACTS_DIR/ios-test-stats.txt" -BASE64_STATS_FILE="$ARTIFACTS_DIR/base64-performance-stats.txt" -extract_base64_stats "$TEST_LOG" "$BASE64_STATS_FILE" -if [ -s "$BASE64_STATS_FILE" ]; then - ri_log "Base64 benchmark stats captured at $BASE64_STATS_FILE" -fi sleep 3 @@ -690,6 +693,17 @@ xcrun simctl spawn "$SIM_DEVICE_ID" \ --predicate '(composedMessage CONTAINS "CN1SS") OR (eventMessage CONTAINS "CN1SS")' \ > "$FALLBACK_LOG" 2>/dev/null || true +BASE64_STATS_FILE="$ARTIFACTS_DIR/base64-performance-stats.txt" +extract_base64_stats "$BASE64_STATS_FILE" "$TEST_LOG" "$FALLBACK_LOG" +if [ -s "$BASE64_STATS_FILE" ]; then + ri_log "Base64 benchmark stats captured at $BASE64_STATS_FILE" +fi + +BASE64_BENCHMARK_FAILURE_LINE="$( (grep -h "CN1SS:ERR:suite test=Base64NativePerformanceTest failed" "$TEST_LOG" "$FALLBACK_LOG" || true) | tail -n 1 )" +if [ -n "$BASE64_BENCHMARK_FAILURE_LINE" ]; then + ri_log "Detected Base64 benchmark failure line: $BASE64_BENCHMARK_FAILURE_LINE" +fi + SWIFT_DIAG_LINE="$( (grep -h "CN1SS:INFO:swift_diag_status=" "$TEST_LOG" "$FALLBACK_LOG" || true) | tail -n 1 )" if [ -n "$SWIFT_DIAG_LINE" ]; then ri_log "Detected swift diagnostic status line: $SWIFT_DIAG_LINE" @@ -822,4 +836,9 @@ comment_rc=$? 
cp -f "$BUILD_LOG" "$ARTIFACTS_DIR/xcodebuild-build.log" 2>/dev/null || true cp -f "$TEST_LOG" "$ARTIFACTS_DIR/device-runner.log" 2>/dev/null || true +if [ -n "$BASE64_BENCHMARK_FAILURE_LINE" ]; then + ri_log "STAGE:BENCHMARK_FAILED -> $BASE64_BENCHMARK_FAILURE_LINE" + exit 16 +fi + exit $comment_rc diff --git a/vm/ByteCodeTranslator/src/cn1_globals.h b/vm/ByteCodeTranslator/src/cn1_globals.h index 5b3c8bfebe..dbd1cc1dac 100644 --- a/vm/ByteCodeTranslator/src/cn1_globals.h +++ b/vm/ByteCodeTranslator/src/cn1_globals.h @@ -10,6 +10,7 @@ #include #include #include +#include //#define DEBUG_GC_ALLOCATIONS @@ -1085,6 +1086,7 @@ extern void arrayFinalizerFunction(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT array) extern void gcReleaseObj(JAVA_OBJECT o); extern JAVA_OBJECT allocArray(CODENAME_ONE_THREAD_STATE, int length, struct clazz* type, int primitiveSize, int dim); +extern JAVA_OBJECT allocArrayAligned(CODENAME_ONE_THREAD_STATE, int length, struct clazz* type, int primitiveSize, int dim, int alignment); extern JAVA_OBJECT allocMultiArray(int* lengths, struct clazz* type, int primitiveSize, int dim); extern JAVA_OBJECT alloc2DArray(CODENAME_ONE_THREAD_STATE, int length1, int length2, struct clazz* parentType, struct clazz* childType, int primitiveSize); extern JAVA_OBJECT alloc3DArray(CODENAME_ONE_THREAD_STATE, int length1, int length2, int length3, struct clazz* parentType, struct clazz* childType, struct clazz* grandChildType, int primitiveSize); diff --git a/vm/ByteCodeTranslator/src/cn1_globals.m b/vm/ByteCodeTranslator/src/cn1_globals.m index 3269fa92e1..4b3e883111 100644 --- a/vm/ByteCodeTranslator/src/cn1_globals.m +++ b/vm/ByteCodeTranslator/src/cn1_globals.m @@ -1221,6 +1221,31 @@ JAVA_OBJECT allocArray(CODENAME_ONE_THREAD_STATE, int length, struct clazz* type return (JAVA_OBJECT)array; } +/* Allocates a Java array through codenameOneGcMalloc and points its data field at an address rounded up to the requested alignment. An alignment below sizeof(void*) is raised to sizeof(void*); an alignment that is not a power of two falls back to 16. data is set to 0 when length * primitiveSize is not positive. NOTE(review): length * primitiveSize is computed in int with no overflow check - confirm callers bound length. */ JAVA_OBJECT allocArrayAligned(CODENAME_ONE_THREAD_STATE, int length, struct clazz* type, int primitiveSize, int dim, int alignment) { + int actualSize = length * primitiveSize;
+ int requestedAlignment = alignment; + if (requestedAlignment < (int)sizeof(void*)) { + requestedAlignment = (int)sizeof(void*); + } + /* x & (x - 1) is non-zero exactly when x is not a power of two. */ if ((requestedAlignment & (requestedAlignment - 1)) != 0) { + requestedAlignment = 16; + } + /* Worst-case slack needed to round the data pointer up to the alignment; added to the allocation size below. */ int extraPadding = requestedAlignment - 1; + JAVA_ARRAY array = (JAVA_ARRAY)codenameOneGcMalloc(threadStateData, sizeof(struct JavaArrayPrototype) + actualSize + sizeof(void*) + extraPadding, type); + (*array).length = length; + (*array).dimensions = dim; + (*array).primitiveSize = primitiveSize; + if (actualSize > 0) { + /* Start at the address of the data field, skip sizeof(void*) bytes, then round up to the alignment. */ char* arr = (char*)(&(array->data)); + arr += sizeof(void*); + uintptr_t aligned = (((uintptr_t)arr) + ((uintptr_t)requestedAlignment - 1)) & ~((uintptr_t)requestedAlignment - 1); + (*array).data = (void*)aligned; + } else { + (*array).data = 0; + } + return (JAVA_OBJECT)array; +} + JAVA_OBJECT alloc2DArray(CODENAME_ONE_THREAD_STATE, int length2, int length1, struct clazz* parentType, struct clazz* childType, int primitiveSize) { JAVA_ARRAY base = (JAVA_ARRAY)allocArray(threadStateData, length1, parentType, sizeof(JAVA_OBJECT), 2); JAVA_ARRAY_OBJECT* objs = base->data;