Просмотр исходного кода

Add Smlal_Ve, Smlsl_Ve, Smull_Ve, Umlal_Ve, Umlsl_Ve, Umull_Ve Inst.; add Tests. Add Sse Opt. for Trn1/2_V and Uzp1/2_V Inst. Nits. (#566)

* Update OpCodeTable.cs

* Update InstEmitSimdArithmetic.cs

* Update InstEmitSimdHelper.cs

* Update CpuTestSimdRegElem.cs

* Update InstEmitSimdMove.cs

* Update InstEmitSimdCvt.cs

* Update SoftFallback.cs

* Update InstEmitSimdHelper.cs

* Update SoftFloat.cs

* Update CryptoHelper.cs

* Update InstEmitSimdArithmetic.cs

* Update InstEmitSimdCmp.cs

* Address PR feedback.

* Address PR feedback.
LDj3SNuD 7 лет назад
Родитель
Commit
8f7fcede7f

+ 34 - 31
ChocolArm64/Instructions/CryptoHelper.cs

@@ -9,7 +9,7 @@ namespace ChocolArm64.Instructions
     static class CryptoHelper
     {
 #region "LookUp Tables"
-        private static byte[] _sBox =
+        private static readonly byte[] _sBox = new byte[]
         {
             0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
             0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
@@ -29,7 +29,7 @@ namespace ChocolArm64.Instructions
             0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
         };
 
-        private static byte[] _invSBox =
+        private static readonly byte[] _invSBox = new byte[]
         {
             0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
             0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
@@ -49,7 +49,7 @@ namespace ChocolArm64.Instructions
             0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
         };
 
-        private static byte[] _gfMul02 =
+        private static readonly byte[] _gfMul02 = new byte[]
         {
             0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e,
             0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
@@ -69,7 +69,7 @@ namespace ChocolArm64.Instructions
             0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5
         };
 
-        private static byte[] _gfMul03 =
+        private static readonly byte[] _gfMul03 = new byte[]
         {
             0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11,
             0x30, 0x33, 0x36, 0x35, 0x3c, 0x3f, 0x3a, 0x39, 0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21,
@@ -89,7 +89,7 @@ namespace ChocolArm64.Instructions
             0x0b, 0x08, 0x0d, 0x0e, 0x07, 0x04, 0x01, 0x02, 0x13, 0x10, 0x15, 0x16, 0x1f, 0x1c, 0x19, 0x1a
         };
 
-        private static byte[] _gfMul09 =
+        private static readonly byte[] _gfMul09 = new byte[]
         {
             0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77,
             0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf, 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7,
@@ -109,7 +109,7 @@ namespace ChocolArm64.Instructions
             0x31, 0x38, 0x23, 0x2a, 0x15, 0x1c, 0x07, 0x0e, 0x79, 0x70, 0x6b, 0x62, 0x5d, 0x54, 0x4f, 0x46
         };
 
-        private static byte[] _gfMul0B =
+        private static readonly byte[] _gfMul0B = new byte[]
         {
             0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69,
             0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81, 0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9,
@@ -129,7 +129,7 @@ namespace ChocolArm64.Instructions
             0xca, 0xc1, 0xdc, 0xd7, 0xe6, 0xed, 0xf0, 0xfb, 0x92, 0x99, 0x84, 0x8f, 0xbe, 0xb5, 0xa8, 0xa3
         };
 
-        private static byte[] _gfMul0D =
+        private static readonly byte[] _gfMul0D = new byte[]
         {
             0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b,
             0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3, 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b,
@@ -149,7 +149,7 @@ namespace ChocolArm64.Instructions
             0xdc, 0xd1, 0xc6, 0xcb, 0xe8, 0xe5, 0xf2, 0xff, 0xb4, 0xb9, 0xae, 0xa3, 0x80, 0x8d, 0x9a, 0x97
         };
 
-        private static byte[] _gfMul0E =
+        private static readonly byte[] _gfMul0E = new byte[]
         {
             0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a,
             0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca, 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba,
@@ -169,9 +169,15 @@ namespace ChocolArm64.Instructions
             0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0xa7, 0xa9, 0xbb, 0xb5, 0x9f, 0x91, 0x83, 0x8d
         };
 
-        private static byte[] _srPerm = { 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3 };
+        private static readonly byte[] _srPerm = new byte[]
+        {
+            0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3
+        };
 
-        private static byte[] _isrPerm = { 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11 };
+        private static readonly byte[] _isrPerm = new byte[]
+        {
+            0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11
+        };
 #endregion
 
         public static Vector128<float> AesInvMixColumns(Vector128<float> op)
@@ -179,7 +185,7 @@ namespace ChocolArm64.Instructions
             byte[] inState  = new byte[16];
             byte[] outState = new byte[16];
 
-            FromVectorToByteArray(inState, ref op);
+            FromVectorToByteArray(op, inState);
 
             for (int columns = 0; columns <= 3; columns++)
             {
@@ -206,7 +212,7 @@ namespace ChocolArm64.Instructions
             byte[] inState  = new byte[16];
             byte[] outState = new byte[16];
 
-            FromVectorToByteArray(inState, ref op);
+            FromVectorToByteArray(op, inState);
 
             for (int idx = 0; idx <= 15; idx++)
             {
@@ -223,7 +229,7 @@ namespace ChocolArm64.Instructions
             byte[] inState  = new byte[16];
             byte[] outState = new byte[16];
 
-            FromVectorToByteArray(inState, ref op);
+            FromVectorToByteArray(op, inState);
 
             for (int idx = 0; idx <= 15; idx++)
             {
@@ -240,7 +246,7 @@ namespace ChocolArm64.Instructions
             byte[] inState  = new byte[16];
             byte[] outState = new byte[16];
 
-            FromVectorToByteArray(inState, ref op);
+            FromVectorToByteArray(op, inState);
 
             for (int columns = 0; columns <= 3; columns++)
             {
@@ -267,7 +273,7 @@ namespace ChocolArm64.Instructions
             byte[] inState  = new byte[16];
             byte[] outState = new byte[16];
 
-            FromVectorToByteArray(inState, ref op);
+            FromVectorToByteArray(op, inState);
 
             for (int idx = 0; idx <= 15; idx++)
             {
@@ -284,7 +290,7 @@ namespace ChocolArm64.Instructions
             byte[] inState  = new byte[16];
             byte[] outState = new byte[16];
 
-            FromVectorToByteArray(inState, ref op);
+            FromVectorToByteArray(op, inState);
 
             for (int idx = 0; idx <= 15; idx++)
             {
@@ -296,33 +302,30 @@ namespace ChocolArm64.Instructions
             return op;
         }
 
-        private static void FromVectorToByteArray(byte[] state, ref Vector128<float> op)
+        private unsafe static void FromVectorToByteArray(Vector128<float> op, byte[] state)
         {
-            ulong uLongLow  = VectorHelper.VectorExtractIntZx((op), (byte)0, 3);
-            ulong uLongHigh = VectorHelper.VectorExtractIntZx((op), (byte)1, 3);
-
-            for (int idx = 0; idx <= 7; idx++)
+            if (!Sse2.IsSupported)
             {
-                state[idx + 0] = (byte)(uLongLow  & 0xFFUL);
-                state[idx + 8] = (byte)(uLongHigh & 0xFFUL);
+                throw new PlatformNotSupportedException();
+            }
 
-                uLongLow  >>= 8;
-                uLongHigh >>= 8;
+            fixed (byte* ptr = &state[0])
+            {
+                Sse2.Store(ptr, Sse.StaticCast<float, byte>(op));
             }
         }
 
-        private static void FromByteArrayToVector(byte[] state, ref Vector128<float> op)
+        private unsafe static void FromByteArrayToVector(byte[] state, ref Vector128<float> op)
         {
             if (!Sse2.IsSupported)
             {
                 throw new PlatformNotSupportedException();
             }
 
-            op = Sse.StaticCast<byte, float>(Sse2.SetVector128(
-                state[15], state[14], state[13], state[12],
-                state[11], state[10], state[9],  state[8],
-                state[7],  state[6],  state[5],  state[4],
-                state[3],  state[2],  state[1],  state[0]));
+            fixed (byte* ptr = &state[0])
+            {
+                op = Sse.StaticCast<byte, float>(Sse2.LoadVector128(ptr));
+            }
         }
     }
 }

+ 63 - 34
ChocolArm64/Instructions/InstEmitSimdArithmetic.cs

@@ -392,8 +392,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fadd_S(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.AddScalar));
             }
@@ -408,8 +407,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fadd_V(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Add));
             }
@@ -470,8 +468,7 @@ namespace ChocolArm64.Instructions
 
         public static void Faddp_V(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Add));
             }
@@ -486,8 +483,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fdiv_S(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.DivideScalar));
             }
@@ -502,8 +498,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fdiv_V(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Divide));
             }
@@ -564,8 +559,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fmax_S(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.MaxScalar));
             }
@@ -580,8 +574,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fmax_V(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Max));
             }
@@ -612,8 +605,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fmaxp_V(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Max));
             }
@@ -628,8 +620,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fmin_S(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.MinScalar));
             }
@@ -644,8 +635,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fmin_V(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Min));
             }
@@ -676,8 +666,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fminp_V(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Min));
             }
@@ -984,8 +973,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fmul_S(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.MultiplyScalar));
             }
@@ -1005,8 +993,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fmul_V(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Multiply));
             }
@@ -1753,8 +1740,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fsqrt_S(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.SqrtScalar));
             }
@@ -1769,8 +1755,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fsqrt_V(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Sqrt));
             }
@@ -1785,8 +1770,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fsub_S(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.SubtractScalar));
             }
@@ -1801,8 +1785,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fsub_V(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Subtract));
             }
@@ -2268,6 +2251,15 @@ namespace ChocolArm64.Instructions
             }
         }
 
+        public static void Smlal_Ve(ILEmitterCtx context)
+        {
+            EmitVectorWidenTernaryOpByElemSx(context, () =>
+            {
+                context.Emit(OpCodes.Mul);
+                context.Emit(OpCodes.Add);
+            });
+        }
+
         public static void Smlsl_V(ILEmitterCtx context)
         {
             OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
@@ -2319,11 +2311,25 @@ namespace ChocolArm64.Instructions
             }
         }
 
+        public static void Smlsl_Ve(ILEmitterCtx context)
+        {
+            EmitVectorWidenTernaryOpByElemSx(context, () =>
+            {
+                context.Emit(OpCodes.Mul);
+                context.Emit(OpCodes.Sub);
+            });
+        }
+
         public static void Smull_V(ILEmitterCtx context)
         {
             EmitVectorWidenRnRmBinaryOpSx(context, () => context.Emit(OpCodes.Mul));
         }
 
+        public static void Smull_Ve(ILEmitterCtx context)
+        {
+            EmitVectorWidenBinaryOpByElemSx(context, () => context.Emit(OpCodes.Mul));
+        }
+
         public static void Sqabs_S(ILEmitterCtx context)
         {
             EmitScalarSaturatingUnaryOpSx(context, () => EmitAbs(context));
@@ -2929,6 +2935,15 @@ namespace ChocolArm64.Instructions
             }
         }
 
+        public static void Umlal_Ve(ILEmitterCtx context)
+        {
+            EmitVectorWidenTernaryOpByElemZx(context, () =>
+            {
+                context.Emit(OpCodes.Mul);
+                context.Emit(OpCodes.Add);
+            });
+        }
+
         public static void Umlsl_V(ILEmitterCtx context)
         {
             OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
@@ -2980,11 +2995,25 @@ namespace ChocolArm64.Instructions
             }
         }
 
+        public static void Umlsl_Ve(ILEmitterCtx context)
+        {
+            EmitVectorWidenTernaryOpByElemZx(context, () =>
+            {
+                context.Emit(OpCodes.Mul);
+                context.Emit(OpCodes.Sub);
+            });
+        }
+
         public static void Umull_V(ILEmitterCtx context)
         {
             EmitVectorWidenRnRmBinaryOpZx(context, () => context.Emit(OpCodes.Mul));
         }
 
+        public static void Umull_Ve(ILEmitterCtx context)
+        {
+            EmitVectorWidenBinaryOpByElemZx(context, () => context.Emit(OpCodes.Mul));
+        }
+
         public static void Uqadd_S(ILEmitterCtx context)
         {
             EmitScalarSaturatingBinaryOpZx(context, SaturatingFlags.Add);

+ 10 - 20
ChocolArm64/Instructions/InstEmitSimdCmp.cs

@@ -173,8 +173,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fcmeq_S(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareEqualScalar), scalar: true);
             }
@@ -186,8 +185,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fcmeq_V(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareEqual), scalar: false);
             }
@@ -199,8 +197,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fcmge_S(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqualScalar), scalar: true);
             }
@@ -212,8 +209,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fcmge_V(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqual), scalar: false);
             }
@@ -225,8 +221,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fcmgt_S(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanScalar), scalar: true);
             }
@@ -238,8 +233,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fcmgt_V(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThan), scalar: false);
             }
@@ -251,8 +245,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fcmle_S(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqualScalar), scalar: true, isLeOrLt: true);
             }
@@ -264,8 +257,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fcmle_V(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqual), scalar: false, isLeOrLt: true);
             }
@@ -277,8 +269,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fcmlt_S(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanScalar), scalar: true, isLeOrLt: true);
             }
@@ -290,8 +281,7 @@ namespace ChocolArm64.Instructions
 
         public static void Fcmlt_V(ILEmitterCtx context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThan), scalar: false, isLeOrLt: true);
             }

+ 4 - 6
ChocolArm64/Instructions/InstEmitSimdCvt.cs

@@ -78,7 +78,6 @@ namespace ChocolArm64.Instructions
 
             if (Optimizations.UseSse2 && sizeF == 1)
             {
-                Type[] typesMov = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
                 Type[] typesCvt = new Type[] { typeof(Vector128<float>) };
 
                 string nameMov = op.RegisterSize == RegisterSize.Simd128
@@ -88,7 +87,7 @@ namespace ChocolArm64.Instructions
                 context.EmitLdvec(op.Rn);
                 context.Emit(OpCodes.Dup);
 
-                context.EmitCall(typeof(Sse).GetMethod(nameMov, typesMov));
+                context.EmitCall(typeof(Sse).GetMethod(nameMov));
 
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Double), typesCvt));
 
@@ -144,7 +143,6 @@ namespace ChocolArm64.Instructions
 
             if (Optimizations.UseSse2 && sizeF == 1)
             {
-                Type[] typesMov = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
                 Type[] typesCvt = new Type[] { typeof(Vector128<double>) };
 
                 string nameMov = op.RegisterSize == RegisterSize.Simd128
@@ -154,15 +152,15 @@ namespace ChocolArm64.Instructions
                 context.EmitLdvec(op.Rd);
                 VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
 
-                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), typesMov));
+                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh)));
 
                 EmitLdvecWithCastToDouble(context, op.Rn);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Single), typesCvt));
                 context.Emit(OpCodes.Dup);
 
-                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), typesMov));
+                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh)));
 
-                context.EmitCall(typeof(Sse).GetMethod(nameMov, typesMov));
+                context.EmitCall(typeof(Sse).GetMethod(nameMov));
 
                 context.EmitStvec(op.Rd);
             }

+ 62 - 4
ChocolArm64/Instructions/InstEmitSimdHelper.cs

@@ -642,21 +642,21 @@ namespace ChocolArm64.Instructions
         {
             OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
 
-            EmitVectorOpByElem(context, emit, op.Index, false, true);
+            EmitVectorOpByElem(context, emit, op.Index, ternary: false, signed: true);
         }
 
         public static void EmitVectorBinaryOpByElemZx(ILEmitterCtx context, Action emit)
         {
             OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
 
-            EmitVectorOpByElem(context, emit, op.Index, false, false);
+            EmitVectorOpByElem(context, emit, op.Index, ternary: false, signed: false);
         }
 
         public static void EmitVectorTernaryOpByElemZx(ILEmitterCtx context, Action emit)
         {
             OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
 
-            EmitVectorOpByElem(context, emit, op.Index, true, false);
+            EmitVectorOpByElem(context, emit, op.Index, ternary: true, signed: false);
         }
 
         public static void EmitVectorOpByElem(ILEmitterCtx context, Action emit, int elem, bool ternary, bool signed)
@@ -809,6 +809,64 @@ namespace ChocolArm64.Instructions
             context.EmitStvec(op.Rd);
         }
 
+        public static void EmitVectorWidenBinaryOpByElemSx(ILEmitterCtx context, Action emit)
+        {
+            OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
+
+            EmitVectorWidenOpByElem(context, emit, op.Index, ternary: false, signed: true);
+        }
+
+        public static void EmitVectorWidenBinaryOpByElemZx(ILEmitterCtx context, Action emit)
+        {
+            OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
+
+            EmitVectorWidenOpByElem(context, emit, op.Index, ternary: false, signed: false);
+        }
+
+        public static void EmitVectorWidenTernaryOpByElemSx(ILEmitterCtx context, Action emit)
+        {
+            OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
+
+            EmitVectorWidenOpByElem(context, emit, op.Index, ternary: true, signed: true);
+        }
+
+        public static void EmitVectorWidenTernaryOpByElemZx(ILEmitterCtx context, Action emit)
+        {
+            OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
+
+            EmitVectorWidenOpByElem(context, emit, op.Index, ternary: true, signed: false);
+        }
+
+        public static void EmitVectorWidenOpByElem(ILEmitterCtx context, Action emit, int elem, bool ternary, bool signed)
+        {
+            OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+            int elems = 8 >> op.Size;
+
+            int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
+
+            EmitVectorExtract(context, op.Rm, elem, op.Size, signed);
+            context.EmitSttmp();
+
+            for (int index = 0; index < elems; index++)
+            {
+                if (ternary)
+                {
+                    EmitVectorExtract(context, op.Rd, index, op.Size + 1, signed);
+                }
+
+                EmitVectorExtract(context, op.Rn, part + index, op.Size, signed);
+                context.EmitLdtmp();
+
+                emit();
+
+                EmitVectorInsertTmp(context, index, op.Size + 1);
+            }
+
+            context.EmitLdvectmp();
+            context.EmitStvec(op.Rd);
+        }
+
         public static void EmitVectorPairwiseOpSx(ILEmitterCtx context, Action emit)
         {
             EmitVectorPairwiseOp(context, emit, true);
@@ -1416,7 +1474,7 @@ namespace ChocolArm64.Instructions
             if (Optimizations.UseSse)
             {
                 //TODO: Use Sse2.MoveScalar once it is fixed,
-                //as of the time of writing it just crashes the JIT (SDK 2.1.500).
+                //as of the time of writing it just crashes the JIT (SDK 2.1.503).
 
                 /*Type[] typesMov = new Type[] { typeof(Vector128<ulong>) };
 

+ 180 - 62
ChocolArm64/Instructions/InstEmitSimdMove.cs

@@ -12,6 +12,34 @@ namespace ChocolArm64.Instructions
 {
     static partial class InstEmit
     {
+#region "Masks"
+        private static readonly long[] _masksE0_TrnUzpXtn = new long[]
+        {
+            14L << 56 | 12L << 48 | 10L << 40 | 08L << 32 | 06L << 24 | 04L << 16 | 02L << 8 | 00L << 0,
+            13L << 56 | 12L << 48 | 09L << 40 | 08L << 32 | 05L << 24 | 04L << 16 | 01L << 8 | 00L << 0,
+            11L << 56 | 10L << 48 | 09L << 40 | 08L << 32 | 03L << 24 | 02L << 16 | 01L << 8 | 00L << 0
+        };
+
+        private static readonly long[] _masksE1_TrnUzp = new long[]
+        {
+            15L << 56 | 13L << 48 | 11L << 40 | 09L << 32 | 07L << 24 | 05L << 16 | 03L << 8 | 01L << 0,
+            15L << 56 | 14L << 48 | 11L << 40 | 10L << 32 | 07L << 24 | 06L << 16 | 03L << 8 | 02L << 0,
+            15L << 56 | 14L << 48 | 13L << 40 | 12L << 32 | 07L << 24 | 06L << 16 | 05L << 8 | 04L << 0
+        };
+
+        private static readonly long[] _masksE0_Uzp = new long[]
+        {
+            13L << 56 | 09L << 48 | 05L << 40 | 01L << 32 | 12L << 24 | 08L << 16 | 04L << 8 | 00L << 0,
+            11L << 56 | 10L << 48 | 03L << 40 | 02L << 32 | 09L << 24 | 08L << 16 | 01L << 8 | 00L << 0
+        };
+
+        private static readonly long[] _masksE1_Uzp = new long[]
+        {
+            15L << 56 | 11L << 48 | 07L << 40 | 03L << 32 | 14L << 24 | 10L << 16 | 06L << 8 | 02L << 0,
+            15L << 56 | 14L << 48 | 07L << 40 | 06L << 32 | 13L << 24 | 12L << 16 | 05L << 8 | 04L << 0
+        };
+#endregion
+
         public static void Dup_Gp(ILEmitterCtx context)
         {
             OpCodeSimdIns64 op = (OpCodeSimdIns64)context.CurrOp;
@@ -379,15 +407,6 @@ namespace ChocolArm64.Instructions
 
             if (Optimizations.UseSsse3)
             {
-                long[] masks = new long[]
-                {
-                    14L << 56 | 12L << 48 | 10L << 40 | 08L << 32 | 06L << 24 | 04L << 16 | 02L << 8 | 00L << 0,
-                    13L << 56 | 12L << 48 | 09L << 40 | 08L << 32 | 05L << 24 | 04L << 16 | 01L << 8 | 00L << 0,
-                    11L << 56 | 10L << 48 | 09L << 40 | 08L << 32 | 03L << 24 | 02L << 16 | 01L << 8 | 00L << 0
-                };
-
-                Type[] typesMov = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
-                Type[] typesSfl = new Type[] { typeof(Vector128<sbyte>), typeof(Vector128<sbyte>) };
                 Type[] typesSve = new Type[] { typeof(long), typeof(long) };
 
                 string nameMov = op.RegisterSize == RegisterSize.Simd128
@@ -397,18 +416,18 @@ namespace ChocolArm64.Instructions
                 context.EmitLdvec(op.Rd);
                 VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
 
-                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), typesMov));
+                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh)));
 
-                EmitLdvecWithSignedCast(context, op.Rn, 0);
+                EmitLdvecWithSignedCast(context, op.Rn, 0); // value
 
-                context.EmitLdc_I8(masks[op.Size]);
-                context.Emit(OpCodes.Dup);
+                context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // mask
+                context.Emit(OpCodes.Dup); // mask
 
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
 
-                context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), typesSfl));
+                context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
 
-                context.EmitCall(typeof(Sse).GetMethod(nameMov, typesMov));
+                context.EmitCall(typeof(Sse).GetMethod(nameMov));
 
                 context.EmitStvec(op.Rd);
             }
@@ -465,22 +484,61 @@ namespace ChocolArm64.Instructions
         {
             OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
 
-            int words = op.GetBitsCount() >> 4;
-            int pairs = words >> op.Size;
-
-            for (int index = 0; index < pairs; index++)
+            if (Optimizations.UseSsse3)
             {
-                int idx = index << 1;
+                Type[] typesSve = new Type[] { typeof(long), typeof(long) };
+
+                string nameUpk = part == 0
+                    ? nameof(Sse2.UnpackLow)
+                    : nameof(Sse2.UnpackHigh);
+
+                EmitLdvecWithSignedCast(context, op.Rn, op.Size); // value
+
+                if (op.Size < 3)
+                {
+                    context.EmitLdc_I8(_masksE1_TrnUzp   [op.Size]); // maskE1
+                    context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // maskE0
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
+
+                    context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
+                }
+
+                EmitLdvecWithSignedCast(context, op.Rm, op.Size); // value
+
+                if (op.Size < 3)
+                {
+                    context.EmitLdc_I8(_masksE1_TrnUzp   [op.Size]); // maskE1
+                    context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // maskE0
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
+
+                    context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
+                }
 
-                EmitVectorExtractZx(context, op.Rn, idx + part, op.Size);
-                EmitVectorExtractZx(context, op.Rm, idx + part, op.Size);
+                context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(op.Size)));
 
-                EmitVectorInsertTmp(context, idx + 1, op.Size);
-                EmitVectorInsertTmp(context, idx,     op.Size);
+                EmitStvecWithSignedCast(context, op.Rd, op.Size);
             }
+            else
+            {
+                int words = op.GetBitsCount() >> 4;
+                int pairs = words >> op.Size;
 
-            context.EmitLdvectmp();
-            context.EmitStvec(op.Rd);
+                for (int index = 0; index < pairs; index++)
+                {
+                    int idx = index << 1;
+
+                    EmitVectorExtractZx(context, op.Rn, idx + part, op.Size);
+                    EmitVectorExtractZx(context, op.Rm, idx + part, op.Size);
+
+                    EmitVectorInsertTmp(context, idx + 1, op.Size);
+                    EmitVectorInsertTmp(context, idx,     op.Size);
+                }
+
+                context.EmitLdvectmp();
+                context.EmitStvec(op.Rd);
+            }
 
             if (op.RegisterSize == RegisterSize.Simd64)
             {
@@ -492,26 +550,91 @@ namespace ChocolArm64.Instructions
         {
             OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
 
-            int words = op.GetBitsCount() >> 4;
-            int pairs = words >> op.Size;
-
-            for (int index = 0; index < pairs; index++)
+            if (Optimizations.UseSsse3)
             {
-                int idx = index << 1;
+                Type[] typesSve = new Type[] { typeof(long), typeof(long) };
+
+                string nameUpk = part == 0
+                    ? nameof(Sse2.UnpackLow)
+                    : nameof(Sse2.UnpackHigh);
 
-                EmitVectorExtractZx(context, op.Rn, idx + part, op.Size);
-                EmitVectorExtractZx(context, op.Rm, idx + part, op.Size);
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    EmitLdvecWithSignedCast(context, op.Rn, op.Size); // value
 
-                EmitVectorInsertTmp(context, pairs + index, op.Size);
-                EmitVectorInsertTmp(context,         index, op.Size);
-            }
+                    if (op.Size < 3)
+                    {
+                        context.EmitLdc_I8(_masksE1_TrnUzp   [op.Size]); // maskE1
+                        context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // maskE0
 
-            context.EmitLdvectmp();
-            context.EmitStvec(op.Rd);
+                        context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
 
-            if (op.RegisterSize == RegisterSize.Simd64)
+                        context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
+                    }
+
+                    EmitLdvecWithSignedCast(context, op.Rm, op.Size); // value
+
+                    if (op.Size < 3)
+                    {
+                        context.EmitLdc_I8(_masksE1_TrnUzp   [op.Size]); // maskE1
+                        context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // maskE0
+
+                        context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
+
+                        context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
+                    }
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(3)));
+
+                    EmitStvecWithSignedCast(context, op.Rd, op.Size);
+                }
+                else
+                {
+                    EmitLdvecWithSignedCast(context, op.Rn, op.Size);
+                    EmitLdvecWithSignedCast(context, op.Rm, op.Size);
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.UnpackLow), GetTypesSflUpk(op.Size))); // value
+
+                    if (op.Size < 2)
+                    {
+                        context.EmitLdc_I8(_masksE1_Uzp[op.Size]); // maskE1
+                        context.EmitLdc_I8(_masksE0_Uzp[op.Size]); // maskE0
+
+                        context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
+
+                        context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
+                    }
+
+                    VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInt64Zero));
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(3)));
+
+                    EmitStvecWithSignedCast(context, op.Rd, op.Size);
+                }
+            }
+            else
             {
-                EmitVectorZeroUpper(context, op.Rd);
+                int words = op.GetBitsCount() >> 4;
+                int pairs = words >> op.Size;
+
+                for (int index = 0; index < pairs; index++)
+                {
+                    int idx = index << 1;
+
+                    EmitVectorExtractZx(context, op.Rn, idx + part, op.Size);
+                    EmitVectorExtractZx(context, op.Rm, idx + part, op.Size);
+
+                    EmitVectorInsertTmp(context, pairs + index, op.Size);
+                    EmitVectorInsertTmp(context,         index, op.Size);
+                }
+
+                context.EmitLdvectmp();
+                context.EmitStvec(op.Rd);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    EmitVectorZeroUpper(context, op.Rd);
+                }
             }
         }
 
@@ -521,36 +644,26 @@ namespace ChocolArm64.Instructions
 
             if (Optimizations.UseSse2)
             {
-                EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
-                EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
-
-                Type[] types = new Type[]
-                {
-                    VectorUIntTypesPerSizeLog2[op.Size],
-                    VectorUIntTypesPerSizeLog2[op.Size]
-                };
-
-                string name = part == 0 || (part != 0 && op.RegisterSize == RegisterSize.Simd64)
+                string nameUpk = part == 0
                     ? nameof(Sse2.UnpackLow)
                     : nameof(Sse2.UnpackHigh);
 
-                context.EmitCall(typeof(Sse2).GetMethod(name, types));
+                EmitLdvecWithSignedCast(context, op.Rn, op.Size);
+                EmitLdvecWithSignedCast(context, op.Rm, op.Size);
 
-                if (op.RegisterSize == RegisterSize.Simd64 && part != 0)
+                if (op.RegisterSize == RegisterSize.Simd128)
                 {
-                    context.EmitLdc_I4(8);
-
-                    Type[] shTypes = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) };
-
-                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), shTypes));
+                    context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(op.Size)));
                 }
-
-                EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
-
-                if (op.RegisterSize == RegisterSize.Simd64 && part == 0)
+                else
                 {
-                    EmitVectorZeroUpper(context, op.Rd);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.UnpackLow), GetTypesSflUpk(op.Size)));
+                    VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInt64Zero));
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(3)));
                 }
+
+                EmitStvecWithSignedCast(context, op.Rd, op.Size);
             }
             else
             {
@@ -579,5 +692,10 @@ namespace ChocolArm64.Instructions
                 }
             }
         }
+
+        private static Type[] GetTypesSflUpk(int size) // Builds the two-entry parameter-type list passed to GetMethod when resolving the Shuffle / Unpack overloads; "size" indexes VectorIntTypesPerSizeLog2.
+        {
+            return new Type[] { VectorIntTypesPerSizeLog2[size], VectorIntTypesPerSizeLog2[size] }; // Both operands use the same vector type.
+        }
     }
 }

+ 2 - 7
ChocolArm64/Instructions/SoftFallback.cs

@@ -664,7 +664,7 @@ namespace ChocolArm64.Instructions
 
             for (int bit = highBit; bit >= 0; bit--)
             {
-                if (((value >> bit) & 0b1) != 0)
+                if (((int)(value >> bit) & 0b1) != 0)
                 {
                     return (ulong)(highBit - bit);
                 }
@@ -688,7 +688,7 @@ namespace ChocolArm64.Instructions
             do
             {
                 nibbleIdx -= 4;
-                preCount = ClzNibbleTbl[(value >> nibbleIdx) & 0b1111];
+                preCount = ClzNibbleTbl[(int)(value >> nibbleIdx) & 0b1111];
                 count += preCount;
             }
             while (preCount == 4);
@@ -698,11 +698,6 @@ namespace ChocolArm64.Instructions
 
         public static ulong CountSetBits8(ulong value) // "size" is 8 (SIMD&FP Inst.).
         {
-            if (value == 0xfful)
-            {
-                return 8ul;
-            }
-
             value = ((value >> 1) & 0x55ul) + (value & 0x55ul);
             value = ((value >> 2) & 0x33ul) + (value & 0x33ul);
 

+ 4 - 4
ChocolArm64/Instructions/SoftFloat.cs

@@ -1545,9 +1545,9 @@ namespace ChocolArm64.Instructions
             return -value;
         }
 
-        private static float ZerosOrOnes(bool zeros)
+        private static float ZerosOrOnes(bool ones) // Returns the float whose 32-bit pattern is all ones when "ones" is true, otherwise all zeros.
         {
-            return BitConverter.Int32BitsToSingle(!zeros ? 0 : -1);
+            return BitConverter.Int32BitsToSingle(ones ? -1 : 0); // -1 is the all-ones 32-bit pattern; the result is a raw bit mask, not a numeric value.
         }
 
         private static float FPUnpack(
@@ -2629,9 +2629,9 @@ namespace ChocolArm64.Instructions
             return -value;
         }
 
-        private static double ZerosOrOnes(bool zeros)
+        private static double ZerosOrOnes(bool ones) // Returns the double whose 64-bit pattern is all ones when "ones" is true, otherwise all zeros.
         {
-            return BitConverter.Int64BitsToDouble(!zeros ? 0L : -1L);
+            return BitConverter.Int64BitsToDouble(ones ? -1L : 0L); // -1L is the all-ones 64-bit pattern; the result is a raw bit mask, not a numeric value.
         }
 
         private static double FPUnpack(

+ 6 - 0
ChocolArm64/OpCodeTable.cs

@@ -445,9 +445,12 @@ namespace ChocolArm64
             SetA64("0x001110<<1xxxxx011011xxxxxxxxxx", InstEmit.Smin_V,          typeof(OpCodeSimdReg64));
             SetA64("0x001110<<1xxxxx101011xxxxxxxxxx", InstEmit.Sminp_V,         typeof(OpCodeSimdReg64));
             SetA64("0x001110<<1xxxxx100000xxxxxxxxxx", InstEmit.Smlal_V,         typeof(OpCodeSimdReg64));
+            SetA64("0x001111xxxxxxxx0010x0xxxxxxxxxx", InstEmit.Smlal_Ve,        typeof(OpCodeSimdRegElem64));
             SetA64("0x001110<<1xxxxx101000xxxxxxxxxx", InstEmit.Smlsl_V,         typeof(OpCodeSimdReg64));
+            SetA64("0x001111xxxxxxxx0110x0xxxxxxxxxx", InstEmit.Smlsl_Ve,        typeof(OpCodeSimdRegElem64));
             SetA64("0x001110000xxxxx001011xxxxxxxxxx", InstEmit.Smov_S,          typeof(OpCodeSimdIns64));
             SetA64("0x001110<<1xxxxx110000xxxxxxxxxx", InstEmit.Smull_V,         typeof(OpCodeSimdReg64));
+            SetA64("0x001111xxxxxxxx1010x0xxxxxxxxxx", InstEmit.Smull_Ve,        typeof(OpCodeSimdRegElem64));
             SetA64("01011110xx100000011110xxxxxxxxxx", InstEmit.Sqabs_S,         typeof(OpCodeSimd64));
             SetA64("0>001110<<100000011110xxxxxxxxxx", InstEmit.Sqabs_V,         typeof(OpCodeSimd64));
             SetA64("01011110xx1xxxxx000011xxxxxxxxxx", InstEmit.Sqadd_S,         typeof(OpCodeSimdReg64));
@@ -534,9 +537,12 @@ namespace ChocolArm64
             SetA64("0x101110<<1xxxxx011011xxxxxxxxxx", InstEmit.Umin_V,          typeof(OpCodeSimdReg64));
             SetA64("0x101110<<1xxxxx101011xxxxxxxxxx", InstEmit.Uminp_V,         typeof(OpCodeSimdReg64));
             SetA64("0x101110<<1xxxxx100000xxxxxxxxxx", InstEmit.Umlal_V,         typeof(OpCodeSimdReg64));
+            SetA64("0x101111xxxxxxxx0010x0xxxxxxxxxx", InstEmit.Umlal_Ve,        typeof(OpCodeSimdRegElem64));
             SetA64("0x101110<<1xxxxx101000xxxxxxxxxx", InstEmit.Umlsl_V,         typeof(OpCodeSimdReg64));
+            SetA64("0x101111xxxxxxxx0110x0xxxxxxxxxx", InstEmit.Umlsl_Ve,        typeof(OpCodeSimdRegElem64));
             SetA64("0x001110000xxxxx001111xxxxxxxxxx", InstEmit.Umov_S,          typeof(OpCodeSimdIns64));
             SetA64("0x101110<<1xxxxx110000xxxxxxxxxx", InstEmit.Umull_V,         typeof(OpCodeSimdReg64));
+            SetA64("0x101111xxxxxxxx1010x0xxxxxxxxxx", InstEmit.Umull_Ve,        typeof(OpCodeSimdRegElem64));
             SetA64("01111110xx1xxxxx000011xxxxxxxxxx", InstEmit.Uqadd_S,         typeof(OpCodeSimdReg64));
             SetA64("0>101110<<1xxxxx000011xxxxxxxxxx", InstEmit.Uqadd_V,         typeof(OpCodeSimdReg64));
             SetA64("0>101110<<1xxxxx010111xxxxxxxxxx", InstEmit.Uqrshl_V,        typeof(OpCodeSimdReg64));

+ 81 - 0
Ryujinx.Tests/Cpu/CpuTestSimdRegElem.cs

@@ -45,6 +45,32 @@ namespace Ryujinx.Tests.Cpu
                 0x0F808000u  // MUL V0.2S, V0.2S, V0.S[0]
             };
         }
+
+        private static uint[] _SU_Mlal_Mlsl_Mull_Ve_4H4S_8H4S_() // Base opcodes for the H-element by-element tests; the test ORs register, index and Q fields into them.
+        {
+            return new uint[]
+            {
+                0x0F402000u, // SMLAL V0.4S, V0.4H, V0.H[0]
+                0x0F406000u, // SMLSL V0.4S, V0.4H, V0.H[0]
+                0x0F40A000u, // SMULL V0.4S, V0.4H, V0.H[0]
+                0x2F402000u, // UMLAL V0.4S, V0.4H, V0.H[0]
+                0x2F406000u, // UMLSL V0.4S, V0.4H, V0.H[0]
+                0x2F40A000u  // UMULL V0.4S, V0.4H, V0.H[0]
+            };
+        }
+
+        private static uint[] _SU_Mlal_Mlsl_Mull_Ve_2S2D_4S2D_() // Base opcodes for the S-element by-element tests; the test ORs register, index and Q fields into them.
+        {
+            return new uint[]
+            {
+                0x0F802000u, // SMLAL V0.2D, V0.2S, V0.S[0]
+                0x0F806000u, // SMLSL V0.2D, V0.2S, V0.S[0]
+                0x0F80A000u, // SMULL V0.2D, V0.2S, V0.S[0]
+                0x2F802000u, // UMLAL V0.2D, V0.2S, V0.S[0]
+                0x2F806000u, // UMLSL V0.2D, V0.2S, V0.S[0]
+                0x2F80A000u  // UMULL V0.2D, V0.2S, V0.S[0]
+            };
+        }
 #endregion
 
         private const int RndCnt = 2;
@@ -103,6 +129,61 @@ namespace Ryujinx.Tests.Cpu
 
             CompareAgainstUnicorn();
         }
+
+        [Test, Pairwise] // Encodes one by-element opcode with the given registers, element index and Q bit, runs it, and compares the result against Unicorn.
+        public void SU_Mlal_Mlsl_Mull_Ve_4H4S_8H4S([ValueSource("_SU_Mlal_Mlsl_Mull_Ve_4H4S_8H4S_")] uint opcodes,
+                                                   [Values(0u)]     uint rd,
+                                                   [Values(1u, 0u)] uint rn,
+                                                   [Values(2u, 0u)] uint rm,
+                                                   [ValueSource("_4H_")] [Random(RndCnt)] ulong z,
+                                                   [ValueSource("_4H_")] [Random(RndCnt)] ulong a,
+                                                   [ValueSource("_4H_")] [Random(RndCnt)] ulong b,
+                                                   [Values(0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u)] uint index,
+                                                   [Values(0b0u, 0b1u)] uint q) // <4H4S, 8H4S>
+        {
+            uint h = (index >> 2) & 1; // Bit 2 of the 3-bit element index.
+            uint l = (index >> 1) & 1; // Bit 1 of the element index.
+            uint m = index & 1; // Bit 0 of the element index.
+
+            opcodes |= ((rm & 15) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); // rm is masked to 4 bits here (bit 20 is used for "m" below); rn and rd use 5 bits.
+            opcodes |= (l << 21) | (m << 20) | (h << 11); // Index bits land in opcode bits 21, 20 and 11.
+            opcodes |= ((q & 1) << 30); // q sets opcode bit 30 (selects the 4H or 8H source form per the parameter comment above).
+
+            Vector128<float> v0 = MakeVectorE0E1(z, z);
+            Vector128<float> v1 = MakeVectorE0E1(q == 0u ? a : 0ul, q == 1u ? a : 0ul); // "a" goes in the first argument when q == 0 and in the second when q == 1; the other is zero. Presumably E0/E1 are the low/high 64-bit halves - confirm against MakeVectorE0E1.
+            Vector128<float> v2 = MakeVectorE0E1(b, b * h); // Second argument is b only when h == 1, otherwise zero.
+
+            SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2);
+
+            CompareAgainstUnicorn();
+        }
+
+        [Test, Pairwise] // Encodes one by-element opcode with the given registers, element index and Q bit, runs it, and compares the result against Unicorn.
+        public void SU_Mlal_Mlsl_Mull_Ve_2S2D_4S2D([ValueSource("_SU_Mlal_Mlsl_Mull_Ve_2S2D_4S2D_")] uint opcodes,
+                                                   [Values(0u)]     uint rd,
+                                                   [Values(1u, 0u)] uint rn,
+                                                   [Values(2u, 0u)] uint rm,
+                                                   [ValueSource("_2S_")] [Random(RndCnt)] ulong z,
+                                                   [ValueSource("_2S_")] [Random(RndCnt)] ulong a,
+                                                   [ValueSource("_2S_")] [Random(RndCnt)] ulong b,
+                                                   [Values(0u, 1u, 2u, 3u)] uint index,
+                                                   [Values(0b0u, 0b1u)] uint q) // <2S2D, 4S2D>
+        {
+            uint h = (index >> 1) & 1; // Bit 1 of the 2-bit element index.
+            uint l = index & 1; // Bit 0 of the element index.
+
+            opcodes |= ((rm & 15) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); // rm is masked to 4 bits, so opcode bit 20 stays clear; rn and rd use 5 bits.
+            opcodes |= (l << 21) | (h << 11); // Index bits land in opcode bits 21 and 11.
+            opcodes |= ((q & 1) << 30); // q sets opcode bit 30 (selects the 2S or 4S source form per the parameter comment above).
+
+            Vector128<float> v0 = MakeVectorE0E1(z, z);
+            Vector128<float> v1 = MakeVectorE0E1(q == 0u ? a : 0ul, q == 1u ? a : 0ul); // "a" goes in the first argument when q == 0 and in the second when q == 1; the other is zero. Presumably E0/E1 are the low/high 64-bit halves - confirm against MakeVectorE0E1.
+            Vector128<float> v2 = MakeVectorE0E1(b, b * h); // Second argument is b only when h == 1, otherwise zero.
+
+            SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2);
+
+            CompareAgainstUnicorn();
+        }
 #endif
     }
 }