Просмотр исходного кода

Add the Smlal_Ve, Smlsl_Ve, Smull_Ve, Umlal_Ve, Umlsl_Ve and Umull_Ve instructions; add tests. Add SSE optimizations for the Trn1/2_V and Uzp1/2_V instructions. Nits. (#566)

* Update OpCodeTable.cs

* Update InstEmitSimdArithmetic.cs

* Update InstEmitSimdHelper.cs

* Update CpuTestSimdRegElem.cs

* Update InstEmitSimdMove.cs

* Update InstEmitSimdCvt.cs

* Update SoftFallback.cs

* Update InstEmitSimdHelper.cs

* Update SoftFloat.cs

* Update CryptoHelper.cs

* Update InstEmitSimdArithmetic.cs

* Update InstEmitSimdCmp.cs

* Address PR feedback.

* Address PR feedback.
LDj3SNuD 7 лет назад
Родитель
Commit
8f7fcede7f

+ 34 - 31
ChocolArm64/Instructions/CryptoHelper.cs

@@ -9,7 +9,7 @@ namespace ChocolArm64.Instructions
     static class CryptoHelper
     static class CryptoHelper
     {
     {
 #region "LookUp Tables"
 #region "LookUp Tables"
-        private static byte[] _sBox =
+        private static readonly byte[] _sBox = new byte[]
         {
         {
             0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
             0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
             0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
             0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
@@ -29,7 +29,7 @@ namespace ChocolArm64.Instructions
             0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
             0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
         };
         };
 
 
-        private static byte[] _invSBox =
+        private static readonly byte[] _invSBox = new byte[]
         {
         {
             0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
             0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
             0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
             0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
@@ -49,7 +49,7 @@ namespace ChocolArm64.Instructions
             0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
             0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
         };
         };
 
 
-        private static byte[] _gfMul02 =
+        private static readonly byte[] _gfMul02 = new byte[]
         {
         {
             0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e,
             0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e,
             0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
             0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
@@ -69,7 +69,7 @@ namespace ChocolArm64.Instructions
             0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5
             0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5
         };
         };
 
 
-        private static byte[] _gfMul03 =
+        private static readonly byte[] _gfMul03 = new byte[]
         {
         {
             0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11,
             0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11,
             0x30, 0x33, 0x36, 0x35, 0x3c, 0x3f, 0x3a, 0x39, 0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21,
             0x30, 0x33, 0x36, 0x35, 0x3c, 0x3f, 0x3a, 0x39, 0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21,
@@ -89,7 +89,7 @@ namespace ChocolArm64.Instructions
             0x0b, 0x08, 0x0d, 0x0e, 0x07, 0x04, 0x01, 0x02, 0x13, 0x10, 0x15, 0x16, 0x1f, 0x1c, 0x19, 0x1a
             0x0b, 0x08, 0x0d, 0x0e, 0x07, 0x04, 0x01, 0x02, 0x13, 0x10, 0x15, 0x16, 0x1f, 0x1c, 0x19, 0x1a
         };
         };
 
 
-        private static byte[] _gfMul09 =
+        private static readonly byte[] _gfMul09 = new byte[]
         {
         {
             0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77,
             0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77,
             0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf, 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7,
             0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf, 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7,
@@ -109,7 +109,7 @@ namespace ChocolArm64.Instructions
             0x31, 0x38, 0x23, 0x2a, 0x15, 0x1c, 0x07, 0x0e, 0x79, 0x70, 0x6b, 0x62, 0x5d, 0x54, 0x4f, 0x46
             0x31, 0x38, 0x23, 0x2a, 0x15, 0x1c, 0x07, 0x0e, 0x79, 0x70, 0x6b, 0x62, 0x5d, 0x54, 0x4f, 0x46
         };
         };
 
 
-        private static byte[] _gfMul0B =
+        private static readonly byte[] _gfMul0B = new byte[]
         {
         {
             0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69,
             0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69,
             0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81, 0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9,
             0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81, 0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9,
@@ -129,7 +129,7 @@ namespace ChocolArm64.Instructions
             0xca, 0xc1, 0xdc, 0xd7, 0xe6, 0xed, 0xf0, 0xfb, 0x92, 0x99, 0x84, 0x8f, 0xbe, 0xb5, 0xa8, 0xa3
             0xca, 0xc1, 0xdc, 0xd7, 0xe6, 0xed, 0xf0, 0xfb, 0x92, 0x99, 0x84, 0x8f, 0xbe, 0xb5, 0xa8, 0xa3
         };
         };
 
 
-        private static byte[] _gfMul0D =
+        private static readonly byte[] _gfMul0D = new byte[]
         {
         {
             0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b,
             0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b,
             0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3, 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b,
             0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3, 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b,
@@ -149,7 +149,7 @@ namespace ChocolArm64.Instructions
             0xdc, 0xd1, 0xc6, 0xcb, 0xe8, 0xe5, 0xf2, 0xff, 0xb4, 0xb9, 0xae, 0xa3, 0x80, 0x8d, 0x9a, 0x97
             0xdc, 0xd1, 0xc6, 0xcb, 0xe8, 0xe5, 0xf2, 0xff, 0xb4, 0xb9, 0xae, 0xa3, 0x80, 0x8d, 0x9a, 0x97
         };
         };
 
 
-        private static byte[] _gfMul0E =
+        private static readonly byte[] _gfMul0E = new byte[]
         {
         {
             0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a,
             0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a,
             0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca, 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba,
             0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca, 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba,
@@ -169,9 +169,15 @@ namespace ChocolArm64.Instructions
             0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0xa7, 0xa9, 0xbb, 0xb5, 0x9f, 0x91, 0x83, 0x8d
             0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0xa7, 0xa9, 0xbb, 0xb5, 0x9f, 0x91, 0x83, 0x8d
         };
         };
 
 
-        private static byte[] _srPerm = { 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3 };
+        private static readonly byte[] _srPerm = new byte[]
+        {
+            0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3
+        };
 
 
-        private static byte[] _isrPerm = { 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11 };
+        private static readonly byte[] _isrPerm = new byte[]
+        {
+            0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11
+        };
 #endregion
 #endregion
 
 
         public static Vector128<float> AesInvMixColumns(Vector128<float> op)
         public static Vector128<float> AesInvMixColumns(Vector128<float> op)
@@ -179,7 +185,7 @@ namespace ChocolArm64.Instructions
             byte[] inState  = new byte[16];
             byte[] inState  = new byte[16];
             byte[] outState = new byte[16];
             byte[] outState = new byte[16];
 
 
-            FromVectorToByteArray(inState, ref op);
+            FromVectorToByteArray(op, inState);
 
 
             for (int columns = 0; columns <= 3; columns++)
             for (int columns = 0; columns <= 3; columns++)
             {
             {
@@ -206,7 +212,7 @@ namespace ChocolArm64.Instructions
             byte[] inState  = new byte[16];
             byte[] inState  = new byte[16];
             byte[] outState = new byte[16];
             byte[] outState = new byte[16];
 
 
-            FromVectorToByteArray(inState, ref op);
+            FromVectorToByteArray(op, inState);
 
 
             for (int idx = 0; idx <= 15; idx++)
             for (int idx = 0; idx <= 15; idx++)
             {
             {
@@ -223,7 +229,7 @@ namespace ChocolArm64.Instructions
             byte[] inState  = new byte[16];
             byte[] inState  = new byte[16];
             byte[] outState = new byte[16];
             byte[] outState = new byte[16];
 
 
-            FromVectorToByteArray(inState, ref op);
+            FromVectorToByteArray(op, inState);
 
 
             for (int idx = 0; idx <= 15; idx++)
             for (int idx = 0; idx <= 15; idx++)
             {
             {
@@ -240,7 +246,7 @@ namespace ChocolArm64.Instructions
             byte[] inState  = new byte[16];
             byte[] inState  = new byte[16];
             byte[] outState = new byte[16];
             byte[] outState = new byte[16];
 
 
-            FromVectorToByteArray(inState, ref op);
+            FromVectorToByteArray(op, inState);
 
 
             for (int columns = 0; columns <= 3; columns++)
             for (int columns = 0; columns <= 3; columns++)
             {
             {
@@ -267,7 +273,7 @@ namespace ChocolArm64.Instructions
             byte[] inState  = new byte[16];
             byte[] inState  = new byte[16];
             byte[] outState = new byte[16];
             byte[] outState = new byte[16];
 
 
-            FromVectorToByteArray(inState, ref op);
+            FromVectorToByteArray(op, inState);
 
 
             for (int idx = 0; idx <= 15; idx++)
             for (int idx = 0; idx <= 15; idx++)
             {
             {
@@ -284,7 +290,7 @@ namespace ChocolArm64.Instructions
             byte[] inState  = new byte[16];
             byte[] inState  = new byte[16];
             byte[] outState = new byte[16];
             byte[] outState = new byte[16];
 
 
-            FromVectorToByteArray(inState, ref op);
+            FromVectorToByteArray(op, inState);
 
 
             for (int idx = 0; idx <= 15; idx++)
             for (int idx = 0; idx <= 15; idx++)
             {
             {
@@ -296,33 +302,30 @@ namespace ChocolArm64.Instructions
             return op;
             return op;
         }
         }
 
 
-        private static void FromVectorToByteArray(byte[] state, ref Vector128<float> op)
+        private unsafe static void FromVectorToByteArray(Vector128<float> op, byte[] state)
         {
         {
-            ulong uLongLow  = VectorHelper.VectorExtractIntZx((op), (byte)0, 3);
-            ulong uLongHigh = VectorHelper.VectorExtractIntZx((op), (byte)1, 3);
-
-            for (int idx = 0; idx <= 7; idx++)
+            if (!Sse2.IsSupported)
             {
             {
-                state[idx + 0] = (byte)(uLongLow  & 0xFFUL);
-                state[idx + 8] = (byte)(uLongHigh & 0xFFUL);
+                throw new PlatformNotSupportedException();
+            }
 
 
-                uLongLow  >>= 8;
-                uLongHigh >>= 8;
+            fixed (byte* ptr = &state[0])
+            {
+                Sse2.Store(ptr, Sse.StaticCast<float, byte>(op));
             }
             }
         }
         }
 
 
-        private static void FromByteArrayToVector(byte[] state, ref Vector128<float> op)
+        private unsafe static void FromByteArrayToVector(byte[] state, ref Vector128<float> op)
         {
         {
             if (!Sse2.IsSupported)
             if (!Sse2.IsSupported)
             {
             {
                 throw new PlatformNotSupportedException();
                 throw new PlatformNotSupportedException();
             }
             }
 
 
-            op = Sse.StaticCast<byte, float>(Sse2.SetVector128(
-                state[15], state[14], state[13], state[12],
-                state[11], state[10], state[9],  state[8],
-                state[7],  state[6],  state[5],  state[4],
-                state[3],  state[2],  state[1],  state[0]));
+            fixed (byte* ptr = &state[0])
+            {
+                op = Sse.StaticCast<byte, float>(Sse2.LoadVector128(ptr));
+            }
         }
         }
     }
     }
 }
 }

+ 63 - 34
ChocolArm64/Instructions/InstEmitSimdArithmetic.cs

@@ -392,8 +392,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fadd_S(ILEmitterCtx context)
         public static void Fadd_S(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.AddScalar));
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.AddScalar));
             }
             }
@@ -408,8 +407,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fadd_V(ILEmitterCtx context)
         public static void Fadd_V(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Add));
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Add));
             }
             }
@@ -470,8 +468,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Faddp_V(ILEmitterCtx context)
         public static void Faddp_V(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Add));
                 EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Add));
             }
             }
@@ -486,8 +483,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fdiv_S(ILEmitterCtx context)
         public static void Fdiv_S(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.DivideScalar));
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.DivideScalar));
             }
             }
@@ -502,8 +498,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fdiv_V(ILEmitterCtx context)
         public static void Fdiv_V(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Divide));
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Divide));
             }
             }
@@ -564,8 +559,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fmax_S(ILEmitterCtx context)
         public static void Fmax_S(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.MaxScalar));
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.MaxScalar));
             }
             }
@@ -580,8 +574,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fmax_V(ILEmitterCtx context)
         public static void Fmax_V(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Max));
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Max));
             }
             }
@@ -612,8 +605,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fmaxp_V(ILEmitterCtx context)
         public static void Fmaxp_V(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Max));
                 EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Max));
             }
             }
@@ -628,8 +620,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fmin_S(ILEmitterCtx context)
         public static void Fmin_S(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.MinScalar));
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.MinScalar));
             }
             }
@@ -644,8 +635,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fmin_V(ILEmitterCtx context)
         public static void Fmin_V(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Min));
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Min));
             }
             }
@@ -676,8 +666,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fminp_V(ILEmitterCtx context)
         public static void Fminp_V(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Min));
                 EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Min));
             }
             }
@@ -984,8 +973,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fmul_S(ILEmitterCtx context)
         public static void Fmul_S(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.MultiplyScalar));
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.MultiplyScalar));
             }
             }
@@ -1005,8 +993,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fmul_V(ILEmitterCtx context)
         public static void Fmul_V(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Multiply));
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Multiply));
             }
             }
@@ -1753,8 +1740,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fsqrt_S(ILEmitterCtx context)
         public static void Fsqrt_S(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.SqrtScalar));
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.SqrtScalar));
             }
             }
@@ -1769,8 +1755,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fsqrt_V(ILEmitterCtx context)
         public static void Fsqrt_V(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Sqrt));
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Sqrt));
             }
             }
@@ -1785,8 +1770,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fsub_S(ILEmitterCtx context)
         public static void Fsub_S(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.SubtractScalar));
                 EmitScalarSseOrSse2OpF(context, nameof(Sse.SubtractScalar));
             }
             }
@@ -1801,8 +1785,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fsub_V(ILEmitterCtx context)
         public static void Fsub_V(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Subtract));
                 EmitVectorSseOrSse2OpF(context, nameof(Sse.Subtract));
             }
             }
@@ -2268,6 +2251,15 @@ namespace ChocolArm64.Instructions
             }
             }
         }
         }
 
 
+        public static void Smlal_Ve(ILEmitterCtx context)
+        {
+            EmitVectorWidenTernaryOpByElemSx(context, () =>
+            {
+                context.Emit(OpCodes.Mul);
+                context.Emit(OpCodes.Add);
+            });
+        }
+
         public static void Smlsl_V(ILEmitterCtx context)
         public static void Smlsl_V(ILEmitterCtx context)
         {
         {
             OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
             OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
@@ -2319,11 +2311,25 @@ namespace ChocolArm64.Instructions
             }
             }
         }
         }
 
 
+        public static void Smlsl_Ve(ILEmitterCtx context)
+        {
+            EmitVectorWidenTernaryOpByElemSx(context, () =>
+            {
+                context.Emit(OpCodes.Mul);
+                context.Emit(OpCodes.Sub);
+            });
+        }
+
         public static void Smull_V(ILEmitterCtx context)
         public static void Smull_V(ILEmitterCtx context)
         {
         {
             EmitVectorWidenRnRmBinaryOpSx(context, () => context.Emit(OpCodes.Mul));
             EmitVectorWidenRnRmBinaryOpSx(context, () => context.Emit(OpCodes.Mul));
         }
         }
 
 
+        public static void Smull_Ve(ILEmitterCtx context)
+        {
+            EmitVectorWidenBinaryOpByElemSx(context, () => context.Emit(OpCodes.Mul));
+        }
+
         public static void Sqabs_S(ILEmitterCtx context)
         public static void Sqabs_S(ILEmitterCtx context)
         {
         {
             EmitScalarSaturatingUnaryOpSx(context, () => EmitAbs(context));
             EmitScalarSaturatingUnaryOpSx(context, () => EmitAbs(context));
@@ -2929,6 +2935,15 @@ namespace ChocolArm64.Instructions
             }
             }
         }
         }
 
 
+        public static void Umlal_Ve(ILEmitterCtx context)
+        {
+            EmitVectorWidenTernaryOpByElemZx(context, () =>
+            {
+                context.Emit(OpCodes.Mul);
+                context.Emit(OpCodes.Add);
+            });
+        }
+
         public static void Umlsl_V(ILEmitterCtx context)
         public static void Umlsl_V(ILEmitterCtx context)
         {
         {
             OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
             OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
@@ -2980,11 +2995,25 @@ namespace ChocolArm64.Instructions
             }
             }
         }
         }
 
 
+        public static void Umlsl_Ve(ILEmitterCtx context)
+        {
+            EmitVectorWidenTernaryOpByElemZx(context, () =>
+            {
+                context.Emit(OpCodes.Mul);
+                context.Emit(OpCodes.Sub);
+            });
+        }
+
         public static void Umull_V(ILEmitterCtx context)
         public static void Umull_V(ILEmitterCtx context)
         {
         {
             EmitVectorWidenRnRmBinaryOpZx(context, () => context.Emit(OpCodes.Mul));
             EmitVectorWidenRnRmBinaryOpZx(context, () => context.Emit(OpCodes.Mul));
         }
         }
 
 
+        public static void Umull_Ve(ILEmitterCtx context)
+        {
+            EmitVectorWidenBinaryOpByElemZx(context, () => context.Emit(OpCodes.Mul));
+        }
+
         public static void Uqadd_S(ILEmitterCtx context)
         public static void Uqadd_S(ILEmitterCtx context)
         {
         {
             EmitScalarSaturatingBinaryOpZx(context, SaturatingFlags.Add);
             EmitScalarSaturatingBinaryOpZx(context, SaturatingFlags.Add);

+ 10 - 20
ChocolArm64/Instructions/InstEmitSimdCmp.cs

@@ -173,8 +173,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fcmeq_S(ILEmitterCtx context)
         public static void Fcmeq_S(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareEqualScalar), scalar: true);
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareEqualScalar), scalar: true);
             }
             }
@@ -186,8 +185,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fcmeq_V(ILEmitterCtx context)
         public static void Fcmeq_V(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareEqual), scalar: false);
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareEqual), scalar: false);
             }
             }
@@ -199,8 +197,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fcmge_S(ILEmitterCtx context)
         public static void Fcmge_S(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqualScalar), scalar: true);
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqualScalar), scalar: true);
             }
             }
@@ -212,8 +209,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fcmge_V(ILEmitterCtx context)
         public static void Fcmge_V(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqual), scalar: false);
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqual), scalar: false);
             }
             }
@@ -225,8 +221,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fcmgt_S(ILEmitterCtx context)
         public static void Fcmgt_S(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanScalar), scalar: true);
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanScalar), scalar: true);
             }
             }
@@ -238,8 +233,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fcmgt_V(ILEmitterCtx context)
         public static void Fcmgt_V(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThan), scalar: false);
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThan), scalar: false);
             }
             }
@@ -251,8 +245,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fcmle_S(ILEmitterCtx context)
         public static void Fcmle_S(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqualScalar), scalar: true, isLeOrLt: true);
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqualScalar), scalar: true, isLeOrLt: true);
             }
             }
@@ -264,8 +257,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fcmle_V(ILEmitterCtx context)
         public static void Fcmle_V(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqual), scalar: false, isLeOrLt: true);
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqual), scalar: false, isLeOrLt: true);
             }
             }
@@ -277,8 +269,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fcmlt_S(ILEmitterCtx context)
         public static void Fcmlt_S(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanScalar), scalar: true, isLeOrLt: true);
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanScalar), scalar: true, isLeOrLt: true);
             }
             }
@@ -290,8 +281,7 @@ namespace ChocolArm64.Instructions
 
 
         public static void Fcmlt_V(ILEmitterCtx context)
         public static void Fcmlt_V(ILEmitterCtx context)
         {
         {
-            if (Optimizations.FastFP && Optimizations.UseSse
-                                     && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
             {
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThan), scalar: false, isLeOrLt: true);
                 EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThan), scalar: false, isLeOrLt: true);
             }
             }

+ 4 - 6
ChocolArm64/Instructions/InstEmitSimdCvt.cs

@@ -78,7 +78,6 @@ namespace ChocolArm64.Instructions
 
 
             if (Optimizations.UseSse2 && sizeF == 1)
             if (Optimizations.UseSse2 && sizeF == 1)
             {
             {
-                Type[] typesMov = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
                 Type[] typesCvt = new Type[] { typeof(Vector128<float>) };
                 Type[] typesCvt = new Type[] { typeof(Vector128<float>) };
 
 
                 string nameMov = op.RegisterSize == RegisterSize.Simd128
                 string nameMov = op.RegisterSize == RegisterSize.Simd128
@@ -88,7 +87,7 @@ namespace ChocolArm64.Instructions
                 context.EmitLdvec(op.Rn);
                 context.EmitLdvec(op.Rn);
                 context.Emit(OpCodes.Dup);
                 context.Emit(OpCodes.Dup);
 
 
-                context.EmitCall(typeof(Sse).GetMethod(nameMov, typesMov));
+                context.EmitCall(typeof(Sse).GetMethod(nameMov));
 
 
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Double), typesCvt));
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Double), typesCvt));
 
 
@@ -144,7 +143,6 @@ namespace ChocolArm64.Instructions
 
 
             if (Optimizations.UseSse2 && sizeF == 1)
             if (Optimizations.UseSse2 && sizeF == 1)
             {
             {
-                Type[] typesMov = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
                 Type[] typesCvt = new Type[] { typeof(Vector128<double>) };
                 Type[] typesCvt = new Type[] { typeof(Vector128<double>) };
 
 
                 string nameMov = op.RegisterSize == RegisterSize.Simd128
                 string nameMov = op.RegisterSize == RegisterSize.Simd128
@@ -154,15 +152,15 @@ namespace ChocolArm64.Instructions
                 context.EmitLdvec(op.Rd);
                 context.EmitLdvec(op.Rd);
                 VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
                 VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
 
 
-                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), typesMov));
+                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh)));
 
 
                 EmitLdvecWithCastToDouble(context, op.Rn);
                 EmitLdvecWithCastToDouble(context, op.Rn);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Single), typesCvt));
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Single), typesCvt));
                 context.Emit(OpCodes.Dup);
                 context.Emit(OpCodes.Dup);
 
 
-                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), typesMov));
+                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh)));
 
 
-                context.EmitCall(typeof(Sse).GetMethod(nameMov, typesMov));
+                context.EmitCall(typeof(Sse).GetMethod(nameMov));
 
 
                 context.EmitStvec(op.Rd);
                 context.EmitStvec(op.Rd);
             }
             }

+ 62 - 4
ChocolArm64/Instructions/InstEmitSimdHelper.cs

@@ -642,21 +642,21 @@ namespace ChocolArm64.Instructions
         {
         {
             OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
             OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
 
 
-            EmitVectorOpByElem(context, emit, op.Index, false, true);
+            EmitVectorOpByElem(context, emit, op.Index, ternary: false, signed: true);
         }
         }
 
 
         public static void EmitVectorBinaryOpByElemZx(ILEmitterCtx context, Action emit)
         public static void EmitVectorBinaryOpByElemZx(ILEmitterCtx context, Action emit)
         {
         {
             OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
             OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
 
 
-            EmitVectorOpByElem(context, emit, op.Index, false, false);
+            EmitVectorOpByElem(context, emit, op.Index, ternary: false, signed: false);
         }
         }
 
 
         public static void EmitVectorTernaryOpByElemZx(ILEmitterCtx context, Action emit)
         public static void EmitVectorTernaryOpByElemZx(ILEmitterCtx context, Action emit)
         {
         {
             OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
             OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
 
 
-            EmitVectorOpByElem(context, emit, op.Index, true, false);
+            EmitVectorOpByElem(context, emit, op.Index, ternary: true, signed: false);
         }
         }
 
 
         public static void EmitVectorOpByElem(ILEmitterCtx context, Action emit, int elem, bool ternary, bool signed)
         public static void EmitVectorOpByElem(ILEmitterCtx context, Action emit, int elem, bool ternary, bool signed)
@@ -809,6 +809,64 @@ namespace ChocolArm64.Instructions
             context.EmitStvec(op.Rd);
             context.EmitStvec(op.Rd);
         }
         }
 
 
+        public static void EmitVectorWidenBinaryOpByElemSx(ILEmitterCtx context, Action emit)
+        { // Binary widening by-element op with signed extraction: forwards op.Index to EmitVectorWidenOpByElem.
+            OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
+
+            EmitVectorWidenOpByElem(context, emit, op.Index, ternary: false, signed: true); // ternary: false => Rd is not read as an input
+        }
+
+        public static void EmitVectorWidenBinaryOpByElemZx(ILEmitterCtx context, Action emit)
+        { // Binary widening by-element op with signed: false: forwards op.Index to EmitVectorWidenOpByElem.
+            OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
+
+            EmitVectorWidenOpByElem(context, emit, op.Index, ternary: false, signed: false); // ternary: false => Rd is not read as an input
+        }
+
+        public static void EmitVectorWidenTernaryOpByElemSx(ILEmitterCtx context, Action emit)
+        { // Ternary widening by-element op with signed extraction: forwards op.Index to EmitVectorWidenOpByElem.
+            OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
+
+            EmitVectorWidenOpByElem(context, emit, op.Index, ternary: true, signed: true); // ternary: true => the widened Rd element is extracted first for each iteration
+        }
+
+        public static void EmitVectorWidenTernaryOpByElemZx(ILEmitterCtx context, Action emit)
+        { // Ternary widening by-element op with signed: false: forwards op.Index to EmitVectorWidenOpByElem.
+            OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
+
+            EmitVectorWidenOpByElem(context, emit, op.Index, ternary: true, signed: false); // ternary: true => the widened Rd element is extracted first for each iteration
+        }
+
+        public static void EmitVectorWidenOpByElem(ILEmitterCtx context, Action emit, int elem, bool ternary, bool signed)
+        { // Shared emitter for widening by-element ops: for each Rn element, extracts it plus element `elem` of Rm, invokes `emit`, and inserts the result at size op.Size + 1.
+            OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+            int elems = 8 >> op.Size; // loop count: 8 at Size 0, 4 at Size 1, 2 at Size 2
+
+            int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0; // for Simd128 the Rn reads start at index `elems`, otherwise at index 0
+
+            EmitVectorExtract(context, op.Rm, elem, op.Size, signed); // the one Rm element shared by every iteration
+            context.EmitSttmp(); // presumably parks that element in a temp so EmitLdtmp can reload it below — confirm against ILEmitterCtx
+
+            for (int index = 0; index < elems; index++)
+            {
+                if (ternary)
+                {
+                    EmitVectorExtract(context, op.Rd, index, op.Size + 1, signed); // ternary forms also read Rd, at the widened size
+                }
+
+                EmitVectorExtract(context, op.Rn, part + index, op.Size, signed);
+                context.EmitLdtmp();
+
+                emit(); // caller-supplied callback, invoked once per element after the operands have been emitted
+
+                EmitVectorInsertTmp(context, index, op.Size + 1); // result goes to slot `index` at the widened size
+            }
+
+            context.EmitLdvectmp();
+            context.EmitStvec(op.Rd); // NOTE(review): looks like the temp vector built above is stored into Rd — confirm helper semantics
+        }
+
         public static void EmitVectorPairwiseOpSx(ILEmitterCtx context, Action emit)
         public static void EmitVectorPairwiseOpSx(ILEmitterCtx context, Action emit)
         {
         {
             EmitVectorPairwiseOp(context, emit, true);
             EmitVectorPairwiseOp(context, emit, true);
@@ -1416,7 +1474,7 @@ namespace ChocolArm64.Instructions
             if (Optimizations.UseSse)
             if (Optimizations.UseSse)
             {
             {
                 //TODO: Use Sse2.MoveScalar once it is fixed,
                 //TODO: Use Sse2.MoveScalar once it is fixed,
-                //as of the time of writing it just crashes the JIT (SDK 2.1.500).
+                //as of the time of writing it just crashes the JIT (SDK 2.1.503).
 
 
                 /*Type[] typesMov = new Type[] { typeof(Vector128<ulong>) };
                 /*Type[] typesMov = new Type[] { typeof(Vector128<ulong>) };
 
 

+ 180 - 62
ChocolArm64/Instructions/InstEmitSimdMove.cs

@@ -12,6 +12,34 @@ namespace ChocolArm64.Instructions
 {
 {
     static partial class InstEmit
     static partial class InstEmit
     {
     {
+#region "Masks"
+        private static readonly long[] _masksE0_TrnUzpXtn = new long[] // indexed by op.Size; each long packs eight byte indices, combined via Sse2.SetVector128 for Ssse3.Shuffle
+        {
+            14L << 56 | 12L << 48 | 10L << 40 | 08L << 32 | 06L << 24 | 04L << 16 | 02L << 8 | 00L << 0, // Size 0: even-numbered bytes 0, 2, ..., 14
+            13L << 56 | 12L << 48 | 09L << 40 | 08L << 32 | 05L << 24 | 04L << 16 | 01L << 8 | 00L << 0, // Size 1: bytes of even-numbered 16-bit elements
+            11L << 56 | 10L << 48 | 09L << 40 | 08L << 32 | 03L << 24 | 02L << 16 | 01L << 8 | 00L << 0 // Size 2: bytes of even-numbered 32-bit elements
+        };
+
+        private static readonly long[] _masksE1_TrnUzp = new long[] // indexed by op.Size; odd-element counterpart of _masksE0_TrnUzpXtn
+        {
+            15L << 56 | 13L << 48 | 11L << 40 | 09L << 32 | 07L << 24 | 05L << 16 | 03L << 8 | 01L << 0, // Size 0: odd-numbered bytes 1, 3, ..., 15
+            15L << 56 | 14L << 48 | 11L << 40 | 10L << 32 | 07L << 24 | 06L << 16 | 03L << 8 | 02L << 0, // Size 1: bytes of odd-numbered 16-bit elements
+            15L << 56 | 14L << 48 | 13L << 40 | 12L << 32 | 07L << 24 | 06L << 16 | 05L << 8 | 04L << 0 // Size 2: bytes of odd-numbered 32-bit elements
+        };
+
+        private static readonly long[] _masksE0_Uzp = new long[] // indexed by op.Size (0 or 1); used only by the non-Simd128 SSSE3 branch of the Uzp emitter
+        {
+            13L << 56 | 09L << 48 | 05L << 40 | 01L << 32 | 12L << 24 | 08L << 16 | 04L << 8 | 00L << 0, // Size 0: byte indices 0, 4, 8, 12, 1, 5, 9, 13 (low to high)
+            11L << 56 | 10L << 48 | 03L << 40 | 02L << 32 | 09L << 24 | 08L << 16 | 01L << 8 | 00L << 0 // Size 1: byte indices 0, 1, 8, 9, 2, 3, 10, 11 (low to high)
+        };
+
+        private static readonly long[] _masksE1_Uzp = new long[] // indexed by op.Size (0 or 1); paired with _masksE0_Uzp in the same branch
+        {
+            15L << 56 | 11L << 48 | 07L << 40 | 03L << 32 | 14L << 24 | 10L << 16 | 06L << 8 | 02L << 0, // Size 0: byte indices 2, 6, 10, 14, 3, 7, 11, 15 (low to high)
+            15L << 56 | 14L << 48 | 07L << 40 | 06L << 32 | 13L << 24 | 12L << 16 | 05L << 8 | 04L << 0 // Size 1: byte indices 4, 5, 12, 13, 6, 7, 14, 15 (low to high)
+        };
+#endregion
+
         public static void Dup_Gp(ILEmitterCtx context)
         public static void Dup_Gp(ILEmitterCtx context)
         {
         {
             OpCodeSimdIns64 op = (OpCodeSimdIns64)context.CurrOp;
             OpCodeSimdIns64 op = (OpCodeSimdIns64)context.CurrOp;
@@ -379,15 +407,6 @@ namespace ChocolArm64.Instructions
 
 
             if (Optimizations.UseSsse3)
             if (Optimizations.UseSsse3)
             {
             {
-                long[] masks = new long[]
-                {
-                    14L << 56 | 12L << 48 | 10L << 40 | 08L << 32 | 06L << 24 | 04L << 16 | 02L << 8 | 00L << 0,
-                    13L << 56 | 12L << 48 | 09L << 40 | 08L << 32 | 05L << 24 | 04L << 16 | 01L << 8 | 00L << 0,
-                    11L << 56 | 10L << 48 | 09L << 40 | 08L << 32 | 03L << 24 | 02L << 16 | 01L << 8 | 00L << 0
-                };
-
-                Type[] typesMov = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
-                Type[] typesSfl = new Type[] { typeof(Vector128<sbyte>), typeof(Vector128<sbyte>) };
                 Type[] typesSve = new Type[] { typeof(long), typeof(long) };
                 Type[] typesSve = new Type[] { typeof(long), typeof(long) };
 
 
                 string nameMov = op.RegisterSize == RegisterSize.Simd128
                 string nameMov = op.RegisterSize == RegisterSize.Simd128
@@ -397,18 +416,18 @@ namespace ChocolArm64.Instructions
                 context.EmitLdvec(op.Rd);
                 context.EmitLdvec(op.Rd);
                 VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
                 VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
 
 
-                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), typesMov));
+                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh)));
 
 
-                EmitLdvecWithSignedCast(context, op.Rn, 0);
+                EmitLdvecWithSignedCast(context, op.Rn, 0); // value
 
 
-                context.EmitLdc_I8(masks[op.Size]);
-                context.Emit(OpCodes.Dup);
+                context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // mask
+                context.Emit(OpCodes.Dup); // mask
 
 
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
 
 
-                context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), typesSfl));
+                context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
 
 
-                context.EmitCall(typeof(Sse).GetMethod(nameMov, typesMov));
+                context.EmitCall(typeof(Sse).GetMethod(nameMov));
 
 
                 context.EmitStvec(op.Rd);
                 context.EmitStvec(op.Rd);
             }
             }
@@ -465,22 +484,61 @@ namespace ChocolArm64.Instructions
         {
         {
             OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
             OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
 
 
-            int words = op.GetBitsCount() >> 4;
-            int pairs = words >> op.Size;
-
-            for (int index = 0; index < pairs; index++)
+            if (Optimizations.UseSsse3)
             {
             {
-                int idx = index << 1;
+                Type[] typesSve = new Type[] { typeof(long), typeof(long) };
+
+                string nameUpk = part == 0
+                    ? nameof(Sse2.UnpackLow)
+                    : nameof(Sse2.UnpackHigh);
+
+                EmitLdvecWithSignedCast(context, op.Rn, op.Size); // value
+
+                if (op.Size < 3)
+                {
+                    context.EmitLdc_I8(_masksE1_TrnUzp   [op.Size]); // maskE1
+                    context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // maskE0
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
+
+                    context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
+                }
+
+                EmitLdvecWithSignedCast(context, op.Rm, op.Size); // value
+
+                if (op.Size < 3)
+                {
+                    context.EmitLdc_I8(_masksE1_TrnUzp   [op.Size]); // maskE1
+                    context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // maskE0
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
+
+                    context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
+                }
 
 
-                EmitVectorExtractZx(context, op.Rn, idx + part, op.Size);
-                EmitVectorExtractZx(context, op.Rm, idx + part, op.Size);
+                context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(op.Size)));
 
 
-                EmitVectorInsertTmp(context, idx + 1, op.Size);
-                EmitVectorInsertTmp(context, idx,     op.Size);
+                EmitStvecWithSignedCast(context, op.Rd, op.Size);
             }
             }
+            else
+            {
+                int words = op.GetBitsCount() >> 4;
+                int pairs = words >> op.Size;
 
 
-            context.EmitLdvectmp();
-            context.EmitStvec(op.Rd);
+                for (int index = 0; index < pairs; index++)
+                {
+                    int idx = index << 1;
+
+                    EmitVectorExtractZx(context, op.Rn, idx + part, op.Size);
+                    EmitVectorExtractZx(context, op.Rm, idx + part, op.Size);
+
+                    EmitVectorInsertTmp(context, idx + 1, op.Size);
+                    EmitVectorInsertTmp(context, idx,     op.Size);
+                }
+
+                context.EmitLdvectmp();
+                context.EmitStvec(op.Rd);
+            }
 
 
             if (op.RegisterSize == RegisterSize.Simd64)
             if (op.RegisterSize == RegisterSize.Simd64)
             {
             {
@@ -492,26 +550,91 @@ namespace ChocolArm64.Instructions
         {
         {
             OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
             OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
 
 
-            int words = op.GetBitsCount() >> 4;
-            int pairs = words >> op.Size;
-
-            for (int index = 0; index < pairs; index++)
+            if (Optimizations.UseSsse3)
             {
             {
-                int idx = index << 1;
+                Type[] typesSve = new Type[] { typeof(long), typeof(long) };
+
+                string nameUpk = part == 0
+                    ? nameof(Sse2.UnpackLow)
+                    : nameof(Sse2.UnpackHigh);
 
 
-                EmitVectorExtractZx(context, op.Rn, idx + part, op.Size);
-                EmitVectorExtractZx(context, op.Rm, idx + part, op.Size);
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    EmitLdvecWithSignedCast(context, op.Rn, op.Size); // value
 
 
-                EmitVectorInsertTmp(context, pairs + index, op.Size);
-                EmitVectorInsertTmp(context,         index, op.Size);
-            }
+                    if (op.Size < 3)
+                    {
+                        context.EmitLdc_I8(_masksE1_TrnUzp   [op.Size]); // maskE1
+                        context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // maskE0
 
 
-            context.EmitLdvectmp();
-            context.EmitStvec(op.Rd);
+                        context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
 
 
-            if (op.RegisterSize == RegisterSize.Simd64)
+                        context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
+                    }
+
+                    EmitLdvecWithSignedCast(context, op.Rm, op.Size); // value
+
+                    if (op.Size < 3)
+                    {
+                        context.EmitLdc_I8(_masksE1_TrnUzp   [op.Size]); // maskE1
+                        context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // maskE0
+
+                        context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
+
+                        context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
+                    }
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(3)));
+
+                    EmitStvecWithSignedCast(context, op.Rd, op.Size);
+                }
+                else
+                {
+                    EmitLdvecWithSignedCast(context, op.Rn, op.Size);
+                    EmitLdvecWithSignedCast(context, op.Rm, op.Size);
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.UnpackLow), GetTypesSflUpk(op.Size))); // value
+
+                    if (op.Size < 2)
+                    {
+                        context.EmitLdc_I8(_masksE1_Uzp[op.Size]); // maskE1
+                        context.EmitLdc_I8(_masksE0_Uzp[op.Size]); // maskE0
+
+                        context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
+
+                        context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
+                    }
+
+                    VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInt64Zero));
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(3)));
+
+                    EmitStvecWithSignedCast(context, op.Rd, op.Size);
+                }
+            }
+            else
             {
             {
-                EmitVectorZeroUpper(context, op.Rd);
+                int words = op.GetBitsCount() >> 4;
+                int pairs = words >> op.Size;
+
+                for (int index = 0; index < pairs; index++)
+                {
+                    int idx = index << 1;
+
+                    EmitVectorExtractZx(context, op.Rn, idx + part, op.Size);
+                    EmitVectorExtractZx(context, op.Rm, idx + part, op.Size);
+
+                    EmitVectorInsertTmp(context, pairs + index, op.Size);
+                    EmitVectorInsertTmp(context,         index, op.Size);
+                }
+
+                context.EmitLdvectmp();
+                context.EmitStvec(op.Rd);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    EmitVectorZeroUpper(context, op.Rd);
+                }
             }
             }
         }
         }
 
 
@@ -521,36 +644,26 @@ namespace ChocolArm64.Instructions
 
 
             if (Optimizations.UseSse2)
             if (Optimizations.UseSse2)
             {
             {
-                EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
-                EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
-
-                Type[] types = new Type[]
-                {
-                    VectorUIntTypesPerSizeLog2[op.Size],
-                    VectorUIntTypesPerSizeLog2[op.Size]
-                };
-
-                string name = part == 0 || (part != 0 && op.RegisterSize == RegisterSize.Simd64)
+                string nameUpk = part == 0
                     ? nameof(Sse2.UnpackLow)
                     ? nameof(Sse2.UnpackLow)
                     : nameof(Sse2.UnpackHigh);
                     : nameof(Sse2.UnpackHigh);
 
 
-                context.EmitCall(typeof(Sse2).GetMethod(name, types));
+                EmitLdvecWithSignedCast(context, op.Rn, op.Size);
+                EmitLdvecWithSignedCast(context, op.Rm, op.Size);
 
 
-                if (op.RegisterSize == RegisterSize.Simd64 && part != 0)
+                if (op.RegisterSize == RegisterSize.Simd128)
                 {
                 {
-                    context.EmitLdc_I4(8);
-
-                    Type[] shTypes = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) };
-
-                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), shTypes));
+                    context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(op.Size)));
                 }
                 }
-
-                EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
-
-                if (op.RegisterSize == RegisterSize.Simd64 && part == 0)
+                else
                 {
                 {
-                    EmitVectorZeroUpper(context, op.Rd);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.UnpackLow), GetTypesSflUpk(op.Size)));
+                    VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInt64Zero));
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(3)));
                 }
                 }
+
+                EmitStvecWithSignedCast(context, op.Rd, op.Size);
             }
             }
             else
             else
             {
             {
@@ -579,5 +692,10 @@ namespace ChocolArm64.Instructions
                 }
                 }
             }
             }
         }
         }
+
+        private static Type[] GetTypesSflUpk(int size)
+        { // Builds the two-entry parameter type list (the same VectorIntTypesPerSizeLog2[size] twice) passed to GetMethod for the Shuffle and UnpackLow/UnpackHigh lookups.
+            return new Type[] { VectorIntTypesPerSizeLog2[size], VectorIntTypesPerSizeLog2[size] };
+        }
     }
     }
 }
 }

+ 2 - 7
ChocolArm64/Instructions/SoftFallback.cs

@@ -664,7 +664,7 @@ namespace ChocolArm64.Instructions
 
 
             for (int bit = highBit; bit >= 0; bit--)
             for (int bit = highBit; bit >= 0; bit--)
             {
             {
-                if (((value >> bit) & 0b1) != 0)
+                if (((int)(value >> bit) & 0b1) != 0)
                 {
                 {
                     return (ulong)(highBit - bit);
                     return (ulong)(highBit - bit);
                 }
                 }
@@ -688,7 +688,7 @@ namespace ChocolArm64.Instructions
             do
             do
             {
             {
                 nibbleIdx -= 4;
                 nibbleIdx -= 4;
-                preCount = ClzNibbleTbl[(value >> nibbleIdx) & 0b1111];
+                preCount = ClzNibbleTbl[(int)(value >> nibbleIdx) & 0b1111];
                 count += preCount;
                 count += preCount;
             }
             }
             while (preCount == 4);
             while (preCount == 4);
@@ -698,11 +698,6 @@ namespace ChocolArm64.Instructions
 
 
         public static ulong CountSetBits8(ulong value) // "size" is 8 (SIMD&FP Inst.).
         public static ulong CountSetBits8(ulong value) // "size" is 8 (SIMD&FP Inst.).
         {
         {
-            if (value == 0xfful)
-            {
-                return 8ul;
-            }
-
             value = ((value >> 1) & 0x55ul) + (value & 0x55ul);
             value = ((value >> 1) & 0x55ul) + (value & 0x55ul);
             value = ((value >> 2) & 0x33ul) + (value & 0x33ul);
             value = ((value >> 2) & 0x33ul) + (value & 0x33ul);
 
 

+ 4 - 4
ChocolArm64/Instructions/SoftFloat.cs

@@ -1545,9 +1545,9 @@ namespace ChocolArm64.Instructions
             return -value;
             return -value;
         }
         }
 
 
-        private static float ZerosOrOnes(bool zeros)
+        private static float ZerosOrOnes(bool ones)
         {
         {
-            return BitConverter.Int32BitsToSingle(!zeros ? 0 : -1);
+            return BitConverter.Int32BitsToSingle(ones ? -1 : 0);
         }
         }
 
 
         private static float FPUnpack(
         private static float FPUnpack(
@@ -2629,9 +2629,9 @@ namespace ChocolArm64.Instructions
             return -value;
             return -value;
         }
         }
 
 
-        private static double ZerosOrOnes(bool zeros)
+        private static double ZerosOrOnes(bool ones)
         {
         {
-            return BitConverter.Int64BitsToDouble(!zeros ? 0L : -1L);
+            return BitConverter.Int64BitsToDouble(ones ? -1L : 0L);
         }
         }
 
 
         private static double FPUnpack(
         private static double FPUnpack(

+ 6 - 0
ChocolArm64/OpCodeTable.cs

@@ -445,9 +445,12 @@ namespace ChocolArm64
             SetA64("0x001110<<1xxxxx011011xxxxxxxxxx", InstEmit.Smin_V,          typeof(OpCodeSimdReg64));
             SetA64("0x001110<<1xxxxx011011xxxxxxxxxx", InstEmit.Smin_V,          typeof(OpCodeSimdReg64));
             SetA64("0x001110<<1xxxxx101011xxxxxxxxxx", InstEmit.Sminp_V,         typeof(OpCodeSimdReg64));
             SetA64("0x001110<<1xxxxx101011xxxxxxxxxx", InstEmit.Sminp_V,         typeof(OpCodeSimdReg64));
             SetA64("0x001110<<1xxxxx100000xxxxxxxxxx", InstEmit.Smlal_V,         typeof(OpCodeSimdReg64));
             SetA64("0x001110<<1xxxxx100000xxxxxxxxxx", InstEmit.Smlal_V,         typeof(OpCodeSimdReg64));
+            SetA64("0x001111xxxxxxxx0010x0xxxxxxxxxx", InstEmit.Smlal_Ve,        typeof(OpCodeSimdRegElem64));
             SetA64("0x001110<<1xxxxx101000xxxxxxxxxx", InstEmit.Smlsl_V,         typeof(OpCodeSimdReg64));
             SetA64("0x001110<<1xxxxx101000xxxxxxxxxx", InstEmit.Smlsl_V,         typeof(OpCodeSimdReg64));
+            SetA64("0x001111xxxxxxxx0110x0xxxxxxxxxx", InstEmit.Smlsl_Ve,        typeof(OpCodeSimdRegElem64));
             SetA64("0x001110000xxxxx001011xxxxxxxxxx", InstEmit.Smov_S,          typeof(OpCodeSimdIns64));
             SetA64("0x001110000xxxxx001011xxxxxxxxxx", InstEmit.Smov_S,          typeof(OpCodeSimdIns64));
             SetA64("0x001110<<1xxxxx110000xxxxxxxxxx", InstEmit.Smull_V,         typeof(OpCodeSimdReg64));
             SetA64("0x001110<<1xxxxx110000xxxxxxxxxx", InstEmit.Smull_V,         typeof(OpCodeSimdReg64));
+            SetA64("0x001111xxxxxxxx1010x0xxxxxxxxxx", InstEmit.Smull_Ve,        typeof(OpCodeSimdRegElem64));
             SetA64("01011110xx100000011110xxxxxxxxxx", InstEmit.Sqabs_S,         typeof(OpCodeSimd64));
             SetA64("01011110xx100000011110xxxxxxxxxx", InstEmit.Sqabs_S,         typeof(OpCodeSimd64));
             SetA64("0>001110<<100000011110xxxxxxxxxx", InstEmit.Sqabs_V,         typeof(OpCodeSimd64));
             SetA64("0>001110<<100000011110xxxxxxxxxx", InstEmit.Sqabs_V,         typeof(OpCodeSimd64));
             SetA64("01011110xx1xxxxx000011xxxxxxxxxx", InstEmit.Sqadd_S,         typeof(OpCodeSimdReg64));
             SetA64("01011110xx1xxxxx000011xxxxxxxxxx", InstEmit.Sqadd_S,         typeof(OpCodeSimdReg64));
@@ -534,9 +537,12 @@ namespace ChocolArm64
             SetA64("0x101110<<1xxxxx011011xxxxxxxxxx", InstEmit.Umin_V,          typeof(OpCodeSimdReg64));
             SetA64("0x101110<<1xxxxx011011xxxxxxxxxx", InstEmit.Umin_V,          typeof(OpCodeSimdReg64));
             SetA64("0x101110<<1xxxxx101011xxxxxxxxxx", InstEmit.Uminp_V,         typeof(OpCodeSimdReg64));
             SetA64("0x101110<<1xxxxx101011xxxxxxxxxx", InstEmit.Uminp_V,         typeof(OpCodeSimdReg64));
             SetA64("0x101110<<1xxxxx100000xxxxxxxxxx", InstEmit.Umlal_V,         typeof(OpCodeSimdReg64));
             SetA64("0x101110<<1xxxxx100000xxxxxxxxxx", InstEmit.Umlal_V,         typeof(OpCodeSimdReg64));
+            SetA64("0x101111xxxxxxxx0010x0xxxxxxxxxx", InstEmit.Umlal_Ve,        typeof(OpCodeSimdRegElem64));
             SetA64("0x101110<<1xxxxx101000xxxxxxxxxx", InstEmit.Umlsl_V,         typeof(OpCodeSimdReg64));
             SetA64("0x101110<<1xxxxx101000xxxxxxxxxx", InstEmit.Umlsl_V,         typeof(OpCodeSimdReg64));
+            SetA64("0x101111xxxxxxxx0110x0xxxxxxxxxx", InstEmit.Umlsl_Ve,        typeof(OpCodeSimdRegElem64));
             SetA64("0x001110000xxxxx001111xxxxxxxxxx", InstEmit.Umov_S,          typeof(OpCodeSimdIns64));
             SetA64("0x001110000xxxxx001111xxxxxxxxxx", InstEmit.Umov_S,          typeof(OpCodeSimdIns64));
             SetA64("0x101110<<1xxxxx110000xxxxxxxxxx", InstEmit.Umull_V,         typeof(OpCodeSimdReg64));
             SetA64("0x101110<<1xxxxx110000xxxxxxxxxx", InstEmit.Umull_V,         typeof(OpCodeSimdReg64));
+            SetA64("0x101111xxxxxxxx1010x0xxxxxxxxxx", InstEmit.Umull_Ve,        typeof(OpCodeSimdRegElem64));
             SetA64("01111110xx1xxxxx000011xxxxxxxxxx", InstEmit.Uqadd_S,         typeof(OpCodeSimdReg64));
             SetA64("01111110xx1xxxxx000011xxxxxxxxxx", InstEmit.Uqadd_S,         typeof(OpCodeSimdReg64));
             SetA64("0>101110<<1xxxxx000011xxxxxxxxxx", InstEmit.Uqadd_V,         typeof(OpCodeSimdReg64));
             SetA64("0>101110<<1xxxxx000011xxxxxxxxxx", InstEmit.Uqadd_V,         typeof(OpCodeSimdReg64));
             SetA64("0>101110<<1xxxxx010111xxxxxxxxxx", InstEmit.Uqrshl_V,        typeof(OpCodeSimdReg64));
             SetA64("0>101110<<1xxxxx010111xxxxxxxxxx", InstEmit.Uqrshl_V,        typeof(OpCodeSimdReg64));

+ 81 - 0
Ryujinx.Tests/Cpu/CpuTestSimdRegElem.cs

@@ -45,6 +45,32 @@ namespace Ryujinx.Tests.Cpu
                 0x0F808000u  // MUL V0.2S, V0.2S, V0.S[0]
                 0x0F808000u  // MUL V0.2S, V0.2S, V0.S[0]
             };
             };
         }
         }
+
+        // Value source: base encodings of the by-element SMLAL/SMLSL/SMULL/UMLAL/UMLSL/UMULL
+        // forms with 16-bit source lanes. Per the mnemonics below, every register field is V0
+        // and the lane index is 0; the consuming test ORs the remaining fields in.
+        private static uint[] _SU_Mlal_Mlsl_Mull_Ve_4H4S_8H4S_()
+        {
+            uint[] baseEncodings =
+            {
+                0x0F402000u, // SMLAL V0.4S, V0.4H, V0.H[0]
+                0x0F406000u, // SMLSL V0.4S, V0.4H, V0.H[0]
+                0x0F40A000u, // SMULL V0.4S, V0.4H, V0.H[0]
+                0x2F402000u, // UMLAL V0.4S, V0.4H, V0.H[0]
+                0x2F406000u, // UMLSL V0.4S, V0.4H, V0.H[0]
+                0x2F40A000u  // UMULL V0.4S, V0.4H, V0.H[0]
+            };

+            return baseEncodings;
+        }
+
+        // Value source: base encodings of the by-element SMLAL/SMLSL/SMULL/UMLAL/UMLSL/UMULL
+        // forms with 32-bit source lanes. Per the mnemonics below, every register field is V0
+        // and the lane index is 0; the consuming test ORs the remaining fields in.
+        private static uint[] _SU_Mlal_Mlsl_Mull_Ve_2S2D_4S2D_()
+        {
+            uint[] baseEncodings =
+            {
+                0x0F802000u, // SMLAL V0.2D, V0.2S, V0.S[0]
+                0x0F806000u, // SMLSL V0.2D, V0.2S, V0.S[0]
+                0x0F80A000u, // SMULL V0.2D, V0.2S, V0.S[0]
+                0x2F802000u, // UMLAL V0.2D, V0.2S, V0.S[0]
+                0x2F806000u, // UMLSL V0.2D, V0.2S, V0.S[0]
+                0x2F80A000u  // UMULL V0.2D, V0.2S, V0.S[0]
+            };

+            return baseEncodings;
+        }
 #endregion
 #endregion
 
 
         private const int RndCnt = 2;
         private const int RndCnt = 2;
@@ -103,6 +129,61 @@ namespace Ryujinx.Tests.Cpu
 
 
             CompareAgainstUnicorn();
             CompareAgainstUnicorn();
         }
         }
+
+        // Builds one by-element multiply(-accumulate) long opcode with 16-bit source lanes from
+        // the base encoding plus register, lane-index and Q fields, executes it with the given
+        // vector inputs and then calls CompareAgainstUnicorn().
+        [Test, Pairwise]
+        public void SU_Mlal_Mlsl_Mull_Ve_4H4S_8H4S([ValueSource("_SU_Mlal_Mlsl_Mull_Ve_4H4S_8H4S_")] uint opcodes,
+                                                   [Values(0u)]     uint rd,
+                                                   [Values(1u, 0u)] uint rn,
+                                                   [Values(2u, 0u)] uint rm,
+                                                   [ValueSource("_4H_")] [Random(RndCnt)] ulong z,
+                                                   [ValueSource("_4H_")] [Random(RndCnt)] ulong a,
+                                                   [ValueSource("_4H_")] [Random(RndCnt)] ulong b,
+                                                   [Values(0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u)] uint index,
+                                                   [Values(0b0u, 0b1u)] uint q) // <4H4S, 8H4S>
+        {
+            // The 3-bit lane index is scattered over three opcode bits: H (11), L (21), M (20).
+            uint hBit = (index >> 2) & 1;
+            uint lBit = (index >> 1) & 1;
+            uint mBit = (index >> 0) & 1;

+            uint regFields   = ((rm & 15) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0);
+            uint indexFields = (lBit << 21) | (mBit << 20) | (hBit << 11);
+            uint qField      = (q & 1) << 30;

+            opcodes |= regFields | indexFields | qField;

+            // `a` goes into the first MakeVectorE0E1 argument when q is 0 and into the second
+            // when q is 1; the other argument is zero.
+            ulong aFirst  = (q == 0u) ? a : 0ul;
+            ulong aSecond = (q == 1u) ? a : 0ul;

+            Vector128<float> v0 = MakeVectorE0E1(z, z);
+            Vector128<float> v1 = MakeVectorE0E1(aFirst, aSecond);
+            // The second argument is `b` only when the H bit is set, otherwise zero.
+            Vector128<float> v2 = MakeVectorE0E1(b, b * hBit);

+            SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2);

+            CompareAgainstUnicorn();
+        }
+
+        // Builds one by-element multiply(-accumulate) long opcode with 32-bit source lanes from
+        // the base encoding plus register, lane-index and Q fields, executes it with the given
+        // vector inputs and then calls CompareAgainstUnicorn().
+        [Test, Pairwise]
+        public void SU_Mlal_Mlsl_Mull_Ve_2S2D_4S2D([ValueSource("_SU_Mlal_Mlsl_Mull_Ve_2S2D_4S2D_")] uint opcodes,
+                                                   [Values(0u)]     uint rd,
+                                                   [Values(1u, 0u)] uint rn,
+                                                   [Values(2u, 0u)] uint rm,
+                                                   [ValueSource("_2S_")] [Random(RndCnt)] ulong z,
+                                                   [ValueSource("_2S_")] [Random(RndCnt)] ulong a,
+                                                   [ValueSource("_2S_")] [Random(RndCnt)] ulong b,
+                                                   [Values(0u, 1u, 2u, 3u)] uint index,
+                                                   [Values(0b0u, 0b1u)] uint q) // <2S2D, 4S2D>
+        {
+            // The 2-bit lane index is split over two opcode bits: H (11) and L (21).
+            uint hBit = (index >> 1) & 1;
+            uint lBit = (index >> 0) & 1;

+            uint regFields   = ((rm & 15) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0);
+            uint indexFields = (lBit << 21) | (hBit << 11);
+            uint qField      = (q & 1) << 30;

+            opcodes |= regFields | indexFields | qField;

+            // `a` goes into the first MakeVectorE0E1 argument when q is 0 and into the second
+            // when q is 1; the other argument is zero.
+            ulong aFirst  = (q == 0u) ? a : 0ul;
+            ulong aSecond = (q == 1u) ? a : 0ul;

+            Vector128<float> v0 = MakeVectorE0E1(z, z);
+            Vector128<float> v1 = MakeVectorE0E1(aFirst, aSecond);
+            // The second argument is `b` only when the H bit is set, otherwise zero.
+            Vector128<float> v2 = MakeVectorE0E1(b, b * hBit);

+            SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2);

+            CompareAgainstUnicorn();
+        }
 #endif
 #endif
     }
     }
 }
 }