فهرست منبع

Implement VMULL, VMLSL, VRSHR, VQRSHRN, VQRSHRUN AArch32 instructions + other fixes (#977)

* Implement VMULL, VMLSL, VQRSHRN, VQRSHRUN AArch32 instructions plus other fixes

* Re-align opcode table

* Re-enable undefined, use subclasses to fix checks

* Add test and fix VRSHR instruction

* PR feedback
gdkchan 6 سال پیش
والد
کامیت
c26f3774bd

+ 10 - 3
ARMeilleure/Decoders/OpCode32SimdRegElem.cs

@@ -6,13 +6,20 @@
         {
             Q = ((opCode >> 24) & 0x1) != 0;
             F = ((opCode >> 8) & 0x1) != 0;
-            Size = ((opCode >> 20) & 0x3);
+            Size = (opCode >> 20) & 0x3;
 
             RegisterSize = Q ? RegisterSize.Simd128 : RegisterSize.Simd64;
 
-            Vm = ((opCode >> 5) & 0x1) | ((opCode << 1) & 0x1e);
+            if (Size == 1)
+            {
+                Vm = ((opCode >> 3) & 0x1) | ((opCode >> 4) & 0x2) | ((opCode << 2) & 0x1c);
+            }
+            else /* if (Size == 2) */
+            {
+                Vm = ((opCode >> 5) & 0x1) | ((opCode << 1) & 0x1e);
+            }
 
-            if (DecoderHelper.VectorArgumentsInvalid(Q, Vd, Vn) || Size == 0 || (Size == 1 && F))
+            if (GetType() == typeof(OpCode32SimdRegElem) && DecoderHelper.VectorArgumentsInvalid(Q, Vd, Vn) || Size == 0 || (Size == 1 && F))
             {
                 Instruction = InstDescriptor.Undefined;
             }

+ 19 - 0
ARMeilleure/Decoders/OpCode32SimdRegElemLong.cs

@@ -0,0 +1,19 @@
+namespace ARMeilleure.Decoders
+{
+    class OpCode32SimdRegElemLong : OpCode32SimdRegElem // Opcode class the opcode table pairs with InstName.Vmull / InstEmit32.Vmull_1.
+    {
+        public OpCode32SimdRegElemLong(InstDescriptor inst, ulong address, int opCode) : base(inst, address, opCode) // The base constructor runs first and decodes the opcode fields.
+        {
+            Q = false; // Discard the Q value the base constructor decoded from the opcode.
+            F = false; // Likewise discard the decoded F value.
+
+            RegisterSize = RegisterSize.Simd64; // Fixed to Simd64 here, replacing the Q-dependent choice made by the base.
+
+            // (Vd & 1) != 0 || Size == 3 are also invalid, but they are checked on encoding (presumably by the bit pattern registered in the opcode table; confirm).
+            if (Size == 0) // Size is not reassigned in this class; NOTE(review): the base condition in this commit also contains "|| Size == 0", so this looks redundant; confirm.
+            {
+                Instruction = InstDescriptor.Undefined; // Mark the decoded opcode as undefined.
+            }
+        }
+    }
+}

+ 14 - 0
ARMeilleure/Decoders/OpCode32SimdRegLong.cs

@@ -0,0 +1,14 @@
+namespace ARMeilleure.Decoders
+{
+    class OpCode32SimdRegLong : OpCode32SimdReg // Opcode class the opcode table uses for the Vmlsl_I and Vmull_I entries.
+    {
+        public bool Polynomial { get; private set; } // Set once in the constructor from opcode bit 9.
+
+        public OpCode32SimdRegLong(InstDescriptor inst, ulong address, int opCode) : base(inst, address, opCode) // OpCode32SimdReg's constructor runs first.
+        {
+            Q = false; // Overwrite Q after the base constructor has run.
+            RegisterSize = RegisterSize.Simd64; // Always Simd64 for this opcode class.
+            Polynomial = ((opCode >> 9) & 0x1) != 0; // NOTE(review): the Vmlsl_I table pattern also has bit 9 set; confirm that only Vmull_I reads this flag.
+        }
+    }
+}

+ 1 - 1
ARMeilleure/Decoders/OpCode32SimdShImm.cs

@@ -35,7 +35,7 @@
                 Instruction = InstDescriptor.Undefined;
             }
 
-            if (DecoderHelper.VectorArgumentsInvalid(Q, Vd, Vm))
+            if (GetType() == typeof(OpCode32SimdShImm) && DecoderHelper.VectorArgumentsInvalid(Q, Vd, Vm))
             {
                 Instruction = InstDescriptor.Undefined;
             }

+ 7 - 0
ARMeilleure/Decoders/OpCode32SimdShImmNarrow.cs

@@ -0,0 +1,7 @@
+namespace ARMeilleure.Decoders
+{
+    class OpCode32SimdShImmNarrow : OpCode32SimdShImm // Adds no members; being a distinct type makes the base's "GetType() == typeof(OpCode32SimdShImm)" test false, so that VectorArgumentsInvalid check is skipped.
+    {
+        public OpCode32SimdShImmNarrow(InstDescriptor inst, ulong address, int opCode) : base(inst, address, opCode) { } // All decoding is delegated to the base constructor.
+    }
+}

+ 146 - 138
ARMeilleure/Decoders/OpCodeTable.cs

@@ -752,144 +752,152 @@ namespace ARMeilleure.Decoders
             SetA32("<<<<01101111xxxxxxxxxx000111xxxx", InstName.Uxth,    InstEmit32.Uxth,    typeof(OpCode32AluUx));
 
             // FP & SIMD
-            SetA32("<<<<11101x110000xxxx10xx11x0xxxx", InstName.Vabs,    InstEmit32.Vabs_S,   typeof(OpCode32SimdRegS));
-            SetA32("111100111x11xx01xxxx0x110xx0xxxx", InstName.Vabs,    InstEmit32.Vabs_V,   typeof(OpCode32SimdReg));
-            SetA32("111100100xxxxxxxxxxx1000xxx0xxxx", InstName.Vadd,    InstEmit32.Vadd_I,   typeof(OpCode32SimdReg));
-            SetA32("<<<<11100x11xxxxxxxx101xx0x0xxxx", InstName.Vadd,    InstEmit32.Vadd_S,   typeof(OpCode32SimdRegS));
-            SetA32("111100100x00xxxxxxxx1101xxx0xxxx", InstName.Vadd,    InstEmit32.Vadd_V,   typeof(OpCode32SimdReg));
-            SetA32("111100100x00xxxxxxxx0001xxx1xxxx", InstName.Vand,    InstEmit32.Vand_I,   typeof(OpCode32SimdBinary));
-            SetA32("111100110x11xxxxxxxx0001xxx1xxxx", InstName.Vbif,    InstEmit32.Vbif,     typeof(OpCode32SimdBinary));
-            SetA32("111100110x10xxxxxxxx0001xxx1xxxx", InstName.Vbit,    InstEmit32.Vbit,     typeof(OpCode32SimdBinary));
-            SetA32("111100110x01xxxxxxxx0001xxx1xxxx", InstName.Vbsl,    InstEmit32.Vbsl,     typeof(OpCode32SimdBinary));
-            SetA32("111100110x<<xxxxxxxx1000xxx1xxxx", InstName.Vceq,    InstEmit32.Vceq_I,   typeof(OpCode32SimdReg));
-            SetA32("111100100x00xxxxxxxx1110xxx0xxxx", InstName.Vceq,    InstEmit32.Vceq_V,   typeof(OpCode32SimdReg));
-            SetA32("111100111x11xx01xxxx0x010xx0xxxx", InstName.Vceq,    InstEmit32.Vceq_Z,   typeof(OpCode32SimdCmpZ));
-            SetA32("1111001x0x<<xxxxxxxx0011xxx1xxxx", InstName.Vcge,    InstEmit32.Vcge_I,   typeof(OpCode32SimdReg));
-            SetA32("111100110x00xxxxxxxx1110xxx0xxxx", InstName.Vcge,    InstEmit32.Vcge_V,   typeof(OpCode32SimdReg));
-            SetA32("111100111x11xx01xxxx0x001xx0xxxx", InstName.Vcge,    InstEmit32.Vcge_Z,   typeof(OpCode32SimdCmpZ));
-            SetA32("1111001x0x<<xxxxxxxx0011xxx0xxxx", InstName.Vcgt,    InstEmit32.Vcgt_I,   typeof(OpCode32SimdReg));
-            SetA32("111100110x10xxxxxxxx1110xxx0xxxx", InstName.Vcgt,    InstEmit32.Vcgt_V,   typeof(OpCode32SimdReg));
-            SetA32("111100111x11xx01xxxx0x000xx0xxxx", InstName.Vcgt,    InstEmit32.Vcgt_Z,   typeof(OpCode32SimdCmpZ));
-            SetA32("111100111x11xx01xxxx0x011xx0xxxx", InstName.Vcle,    InstEmit32.Vcle_Z,   typeof(OpCode32SimdCmpZ));
-            SetA32("111100111x11xx01xxxx0x100xx0xxxx", InstName.Vclt,    InstEmit32.Vclt_Z,   typeof(OpCode32SimdCmpZ));
-            SetA32("<<<<11101x11010xxxxx101x01x0xxxx", InstName.Vcmp,    InstEmit32.Vcmp,     typeof(OpCode32SimdS));
-            SetA32("<<<<11101x11010xxxxx101x11x0xxxx", InstName.Vcmpe,   InstEmit32.Vcmpe,    typeof(OpCode32SimdS));
-            SetA32("<<<<11101x110111xxxx101x11x0xxxx", InstName.Vcvt,    InstEmit32.Vcvt_FD,  typeof(OpCode32SimdS)); // FP 32 and 64, scalar.
-            SetA32("<<<<11101x11110xxxxx10xx11x0xxxx", InstName.Vcvt,    InstEmit32.Vcvt_FI,  typeof(OpCode32SimdCvtFI)); // FP32 to int.
-            SetA32("<<<<11101x111000xxxx10xxx1x0xxxx", InstName.Vcvt,    InstEmit32.Vcvt_FI,  typeof(OpCode32SimdCvtFI)); // Int to FP32.
-            SetA32("111111101x1111xxxxxx10>>x1x0xxxx", InstName.Vcvt,    InstEmit32.Vcvt_R,   typeof(OpCode32SimdCvtFI)); // The many FP32 to int encodings (fp).
-            SetA32("111100111x111011xxxx011xxxx0xxxx", InstName.Vcvt,    InstEmit32.Vcvt_V,   typeof(OpCode32SimdCmpZ)); // FP and integer, vector.
-            SetA32("<<<<11101x00xxxxxxxx101xx0x0xxxx", InstName.Vdiv,    InstEmit32.Vdiv_S,   typeof(OpCode32SimdRegS));
-            SetA32("<<<<11101xx0xxxxxxxx1011x0x10000", InstName.Vdup,    InstEmit32.Vdup,     typeof(OpCode32SimdDupGP));
-            SetA32("111100111x11xxxxxxxx11000xx0xxxx", InstName.Vdup,    InstEmit32.Vdup_1,   typeof(OpCode32SimdDupElem));
-            SetA32("111100101x11xxxxxxxxxxxxxxx0xxxx", InstName.Vext,    InstEmit32.Vext,     typeof(OpCode32SimdExt));
-            SetA32("111101001x10xxxxxxxxxx00xxxxxxxx", InstName.Vld1,    InstEmit32.Vld1,     typeof(OpCode32SimdMemSingle));
-            SetA32("111101000x10xxxxxxxx0111xxxxxxxx", InstName.Vld1,    InstEmit32.Vld1,     typeof(OpCode32SimdMemPair)); // Regs = 1.
-            SetA32("111101000x10xxxxxxxx1010xxxxxxxx", InstName.Vld1,    InstEmit32.Vld1,     typeof(OpCode32SimdMemPair)); // Regs = 2.
-            SetA32("111101000x10xxxxxxxx0110xxxxxxxx", InstName.Vld1,    InstEmit32.Vld1,     typeof(OpCode32SimdMemPair)); // Regs = 3.
-            SetA32("111101000x10xxxxxxxx0010xxxxxxxx", InstName.Vld1,    InstEmit32.Vld1,     typeof(OpCode32SimdMemPair)); // Regs = 4.
-            SetA32("111101001x10xxxxxxxxxx01xxxxxxxx", InstName.Vld2,    InstEmit32.Vld2,     typeof(OpCode32SimdMemSingle));
-            SetA32("111101000x10xxxxxxxx100xxxxxxxxx", InstName.Vld2,    InstEmit32.Vld2,     typeof(OpCode32SimdMemPair)); // Regs = 1, inc = 1/2 (itype).
-            SetA32("111101000x10xxxxxxxx0011xxxxxxxx", InstName.Vld2,    InstEmit32.Vld2,     typeof(OpCode32SimdMemPair)); // Regs = 2, inc = 2.
-            SetA32("111101001x10xxxxxxxxxx10xxxxxxxx", InstName.Vld3,    InstEmit32.Vld3,     typeof(OpCode32SimdMemSingle));
-            SetA32("111101000x10xxxxxxxx010xxxxxxxxx", InstName.Vld3,    InstEmit32.Vld3,     typeof(OpCode32SimdMemPair)); // Inc = 1/2 (itype).
-            SetA32("111101001x10xxxxxxxxxx11xxxxxxxx", InstName.Vld4,    InstEmit32.Vld4,     typeof(OpCode32SimdMemSingle));
-            SetA32("111101000x10xxxxxxxx000xxxxxxxxx", InstName.Vld4,    InstEmit32.Vld4,     typeof(OpCode32SimdMemPair)); // Inc = 1/2 (itype).
-            SetA32("<<<<11001x01xxxxxxxx1011xxxxxxx0", InstName.Vldm,    InstEmit32.Vldm,     typeof(OpCode32SimdMemMult));
-            SetA32("<<<<11001x11xxxxxxxx1011xxxxxxx0", InstName.Vldm,    InstEmit32.Vldm,     typeof(OpCode32SimdMemMult));
-            SetA32("<<<<11010x11xxxxxxxx1011xxxxxxx0", InstName.Vldm,    InstEmit32.Vldm,     typeof(OpCode32SimdMemMult));
-            SetA32("<<<<11001x01xxxxxxxx1010xxxxxxxx", InstName.Vldm,    InstEmit32.Vldm,     typeof(OpCode32SimdMemMult));
-            SetA32("<<<<11001x11xxxxxxxx1010xxxxxxxx", InstName.Vldm,    InstEmit32.Vldm,     typeof(OpCode32SimdMemMult));
-            SetA32("<<<<11010x11xxxxxxxx1010xxxxxxxx", InstName.Vldm,    InstEmit32.Vldm,     typeof(OpCode32SimdMemMult));
-            SetA32("<<<<1101xx01xxxxxxxx101xxxxxxxxx", InstName.Vldr,    InstEmit32.Vldr,     typeof(OpCode32SimdMemImm));
-            SetA32("1111001x0x<<xxxxxxxx0110xxx0xxxx", InstName.Vmax,    InstEmit32.Vmax_I,   typeof(OpCode32SimdReg));
-            SetA32("111100100x00xxxxxxxx1111xxx0xxxx", InstName.Vmax,    InstEmit32.Vmax_V,   typeof(OpCode32SimdReg));
-            SetA32("1111001x0x<<xxxxxxxx0110xxx1xxxx", InstName.Vmin,    InstEmit32.Vmin_I,   typeof(OpCode32SimdReg));
-            SetA32("111100100x10xxxxxxxx1111xxx0xxxx", InstName.Vmin,    InstEmit32.Vmin_V,   typeof(OpCode32SimdReg));
-            SetA32("111111101x00xxxxxxxx10>>x0x0xxxx", InstName.Vmaxnm,  InstEmit32.Vmaxnm_S, typeof(OpCode32SimdRegS));
-            SetA32("111100110x0xxxxxxxxx1111xxx1xxxx", InstName.Vmaxnm,  InstEmit32.Vmaxnm_V, typeof(OpCode32SimdReg));
-            SetA32("111111101x00xxxxxxxx10>>x1x0xxxx", InstName.Vminnm,  InstEmit32.Vminnm_S, typeof(OpCode32SimdRegS));
-            SetA32("111100110x1xxxxxxxxx1111xxx1xxxx", InstName.Vminnm,  InstEmit32.Vminnm_V, typeof(OpCode32SimdReg));
-            SetA32("1111001x1x<<xxxxxxxx000xx1x0xxxx", InstName.Vmla,    InstEmit32.Vmla_1,   typeof(OpCode32SimdRegElem));
-            SetA32("111100100xxxxxxxxxxx1001xxx0xxxx", InstName.Vmla,    InstEmit32.Vmla_I,   typeof(OpCode32SimdReg));
-            SetA32("<<<<11100x00xxxxxxxx101xx0x0xxxx", InstName.Vmla,    InstEmit32.Vmla_S,   typeof(OpCode32SimdRegS));
-            SetA32("111100100x00xxxxxxxx1101xxx1xxxx", InstName.Vmla,    InstEmit32.Vmla_V,   typeof(OpCode32SimdReg));
-            SetA32("1111001x1x<<xxxxxxxx010xx1x0xxxx", InstName.Vmls,    InstEmit32.Vmls_1,   typeof(OpCode32SimdRegElem));
-            SetA32("<<<<11100x00xxxxxxxx101xx1x0xxxx", InstName.Vmls,    InstEmit32.Vmls_S,   typeof(OpCode32SimdRegS));
-            SetA32("111100100x10xxxxxxxx1101xxx1xxxx", InstName.Vmls,    InstEmit32.Vmls_V,   typeof(OpCode32SimdReg));
-            SetA32("111100110xxxxxxxxxxx1001xxx0xxxx", InstName.Vmls,    InstEmit32.Vmls_I,   typeof(OpCode32SimdReg));
-            SetA32("<<<<11100xx0xxxxxxxx1011xxx10000", InstName.Vmov,    InstEmit32.Vmov_G1,  typeof(OpCode32SimdMovGpElem)); // From gen purpose.
-            SetA32("<<<<1110xxx1xxxxxxxx1011xxx10000", InstName.Vmov,    InstEmit32.Vmov_G1,  typeof(OpCode32SimdMovGpElem)); // To gen purpose.
-            SetA32("<<<<1100010xxxxxxxxx101000x1xxxx", InstName.Vmov,    InstEmit32.Vmov_G2,  typeof(OpCode32SimdMovGpDouble)); // To/from gen purpose x2 and single precision x2.
-            SetA32("<<<<1100010xxxxxxxxx101100x1xxxx", InstName.Vmov,    InstEmit32.Vmov_GD,  typeof(OpCode32SimdMovGpDouble)); // To/from gen purpose x2 and double precision.
-            SetA32("<<<<1110000xxxxxxxxx1010x0010000", InstName.Vmov,    InstEmit32.Vmov_GS,  typeof(OpCode32SimdMovGp)); // To/from gen purpose and single precision.
-            SetA32("1111001x1x000xxxxxxx0xx00x01xxxx", InstName.Vmov,    InstEmit32.Vmov_I,   typeof(OpCode32SimdImm)); // D/Q vector I32.
-            SetA32("<<<<11101x11xxxxxxxx101x0000xxxx", InstName.Vmov,    InstEmit32.Vmov_I,   typeof(OpCode32SimdImm44)); // Scalar f16/32/64 based on size 01 10 11.
-            SetA32("1111001x1x000xxxxxxx10x00x01xxxx", InstName.Vmov,    InstEmit32.Vmov_I,   typeof(OpCode32SimdImm)); // D/Q I16.
-            SetA32("1111001x1x000xxxxxxx11xx0x01xxxx", InstName.Vmov,    InstEmit32.Vmov_I,   typeof(OpCode32SimdImm)); // D/Q (dt - from cmode).
-            SetA32("1111001x1x000xxxxxxx11100x11xxxx", InstName.Vmov,    InstEmit32.Vmov_I,   typeof(OpCode32SimdImm)); // D/Q I64.
-            SetA32("<<<<11101x110000xxxx101x01x0xxxx", InstName.Vmov,    InstEmit32.Vmov_S,   typeof(OpCode32SimdS));
-            SetA32("1111001x1x001000xxx0101000x1xxxx", InstName.Vmovl,   InstEmit32.Vmovl,    typeof(OpCode32SimdLong));
-            SetA32("1111001x1x010000xxx0101000x1xxxx", InstName.Vmovl,   InstEmit32.Vmovl,    typeof(OpCode32SimdLong));
-            SetA32("1111001x1x100000xxx0101000x1xxxx", InstName.Vmovl,   InstEmit32.Vmovl,    typeof(OpCode32SimdLong));
-            SetA32("111100111x11xx10xxxx001000x0xxx0", InstName.Vmovn,   InstEmit32.Vmovn,    typeof(OpCode32SimdCmpZ));
-            SetA32("<<<<11101111xxxxxxxx101000010000", InstName.Vmrs,    InstEmit32.Vmrs,     typeof(OpCode32SimdSpecial));
-            SetA32("<<<<11101110xxxxxxxx101000010000", InstName.Vmsr,    InstEmit32.Vmsr,     typeof(OpCode32SimdSpecial));
-            SetA32("1111001x1x<<xxxxxxxx100xx1x0xxxx", InstName.Vmul,    InstEmit32.Vmul_1,   typeof(OpCode32SimdRegElem));
-            SetA32("1111001x0xxxxxxxxxxx1001xxx1xxxx", InstName.Vmul,    InstEmit32.Vmul_I,   typeof(OpCode32SimdReg));
-            SetA32("<<<<11100x10xxxxxxxx101xx0x0xxxx", InstName.Vmul,    InstEmit32.Vmul_S,   typeof(OpCode32SimdRegS));
-            SetA32("111100110x00xxxxxxxx1101xxx1xxxx", InstName.Vmul,    InstEmit32.Vmul_V,   typeof(OpCode32SimdReg));
-            SetA32("1111001x1x000xxxxxxx0xx00x11xxxx", InstName.Vmvn,    InstEmit32.Vmvn_I,   typeof(OpCode32SimdImm)); // D/Q vector I32.
-            SetA32("1111001x1x000xxxxxxx10x00x11xxxx", InstName.Vmvn,    InstEmit32.Vmvn_I,   typeof(OpCode32SimdImm));
-            SetA32("1111001x1x000xxxxxxx110x0x11xxxx", InstName.Vmvn,    InstEmit32.Vmvn_I,   typeof(OpCode32SimdImm));
-            SetA32("<<<<11101x110001xxxx101x01x0xxxx", InstName.Vneg,    InstEmit32.Vneg_S,   typeof(OpCode32SimdS));
-            SetA32("111100111x11xx01xxxx0x111xx0xxxx", InstName.Vneg,    InstEmit32.Vneg_V,   typeof(OpCode32Simd));
-            SetA32("<<<<11100x01xxxxxxxx101xx1x0xxxx", InstName.Vnmla,   InstEmit32.Vnmla_S,  typeof(OpCode32SimdRegS));
-            SetA32("<<<<11100x01xxxxxxxx101xx0x0xxxx", InstName.Vnmls,   InstEmit32.Vnmls_S,  typeof(OpCode32SimdRegS));
-            SetA32("<<<<11100x10xxxxxxxx101xx1x0xxxx", InstName.Vnmul,   InstEmit32.Vnmul_S,  typeof(OpCode32SimdRegS));
-            SetA32("111100100x10xxxxxxxx0001xxx1xxxx", InstName.Vorr,    InstEmit32.Vorr_I,   typeof(OpCode32SimdBinary));
-            SetA32("1111001x1x000xxxxxxx0xx10x01xxxx", InstName.Vorr,    InstEmit32.Vorr_II,  typeof(OpCode32SimdImm));
-            SetA32("111100100x<<xxxxxxxx1011x0x1xxxx", InstName.Vpadd,   InstEmit32.Vpadd_I,  typeof(OpCode32SimdReg));
-            SetA32("111100110x00xxxxxxxx1101x0x0xxxx", InstName.Vpadd,   InstEmit32.Vpadd_V,  typeof(OpCode32SimdReg));
-            SetA32("111100111x111011xxxx010x0xx0xxxx", InstName.Vrecpe,  InstEmit32.Vrecpe,   typeof(OpCode32SimdSqrte));
-            SetA32("111100100x00xxxxxxxx1111xxx1xxxx", InstName.Vrecps,  InstEmit32.Vrecps,   typeof(OpCode32SimdReg));
-            SetA32("111100111x11xx00xxxx000<<xx0xxxx", InstName.Vrev,    InstEmit32.Vrev,     typeof(OpCode32SimdRev));
-            SetA32("111111101x1110xxxxxx101x01x0xxxx", InstName.Vrint,   InstEmit32.Vrint_RM, typeof(OpCode32SimdCvtFI));
-            SetA32("<<<<11101x110110xxxx101x11x0xxxx", InstName.Vrint,   InstEmit32.Vrint_Z,  typeof(OpCode32SimdCvtFI));
-            SetA32("111100111x111011xxxx010x1xx0xxxx", InstName.Vrsqrte, InstEmit32.Vrsqrte,  typeof(OpCode32SimdSqrte));
-            SetA32("111100100x10xxxxxxxx1111xxx1xxxx", InstName.Vrsqrts, InstEmit32.Vrsqrts,  typeof(OpCode32SimdReg));
-            SetA32("111111100xxxxxxxxxxx101xx0x0xxxx", InstName.Vsel,    InstEmit32.Vsel,     typeof(OpCode32SimdSel));
-            SetA32("111100101x>>>xxxxxxx0101>xx1xxxx", InstName.Vshl,    InstEmit32.Vshl,     typeof(OpCode32SimdShImm));
-            SetA32("1111001x0xxxxxxxxxxx0100xxx0xxxx", InstName.Vshl,    InstEmit32.Vshl_I,   typeof(OpCode32SimdReg));
-            SetA32("1111001x1x>>>xxxxxxx0000>xx1xxxx", InstName.Vshr,    InstEmit32.Vshr,     typeof(OpCode32SimdShImm));
-            SetA32("111100101x>>>xxxxxxx100000x1xxx0", InstName.Vshrn,   InstEmit32.Vshrn,    typeof(OpCode32SimdShImm));
-            SetA32("<<<<11101x110001xxxx101x11x0xxxx", InstName.Vsqrt,   InstEmit32.Vsqrt_S,  typeof(OpCode32SimdS));
-            SetA32("111101001x00xxxxxxxx<<00xxxxxxxx", InstName.Vst1,    InstEmit32.Vst1,     typeof(OpCode32SimdMemSingle));
-            SetA32("111101000x00xxxxxxxx0111xxxxxxxx", InstName.Vst1,    InstEmit32.Vst1,     typeof(OpCode32SimdMemPair)); // Regs = 1.
-            SetA32("111101000x00xxxxxxxx1010xxxxxxxx", InstName.Vst1,    InstEmit32.Vst1,     typeof(OpCode32SimdMemPair)); // Regs = 2.
-            SetA32("111101000x00xxxxxxxx0110xxxxxxxx", InstName.Vst1,    InstEmit32.Vst1,     typeof(OpCode32SimdMemPair)); // Regs = 3.
-            SetA32("111101000x00xxxxxxxx0010xxxxxxxx", InstName.Vst1,    InstEmit32.Vst1,     typeof(OpCode32SimdMemPair)); // Regs = 4.
-            SetA32("111101001x00xxxxxxxx<<01xxxxxxxx", InstName.Vst2,    InstEmit32.Vst2,     typeof(OpCode32SimdMemSingle));
-            SetA32("111101000x00xxxxxxxx100xxxxxxxxx", InstName.Vst2,    InstEmit32.Vst2,     typeof(OpCode32SimdMemPair)); // Regs = 1, inc = 1/2 (itype).
-            SetA32("111101000x00xxxxxxxx0011xxxxxxxx", InstName.Vst2,    InstEmit32.Vst2,     typeof(OpCode32SimdMemPair)); // Regs = 2, inc = 2.
-            SetA32("111101001x00xxxxxxxx<<10xxxxxxxx", InstName.Vst3,    InstEmit32.Vst3,     typeof(OpCode32SimdMemSingle));
-            SetA32("111101000x00xxxxxxxx010xxxxxxxxx", InstName.Vst3,    InstEmit32.Vst3,     typeof(OpCode32SimdMemPair)); // Inc = 1/2 (itype).
-            SetA32("111101001x00xxxxxxxx<<11xxxxxxxx", InstName.Vst4,    InstEmit32.Vst4,     typeof(OpCode32SimdMemSingle));
-            SetA32("111101000x00xxxxxxxx000xxxxxxxxx", InstName.Vst4,    InstEmit32.Vst4,     typeof(OpCode32SimdMemPair)); // Inc = 1/2 (itype).
-            SetA32("<<<<11001x00xxxxxxxx1011xxxxxxx0", InstName.Vstm,    InstEmit32.Vstm,     typeof(OpCode32SimdMemMult));
-            SetA32("<<<<11001x10xxxxxxxx1011xxxxxxx0", InstName.Vstm,    InstEmit32.Vstm,     typeof(OpCode32SimdMemMult));
-            SetA32("<<<<11010x10xxxxxxxx1011xxxxxxx0", InstName.Vstm,    InstEmit32.Vstm,     typeof(OpCode32SimdMemMult));
-            SetA32("<<<<11001x00xxxxxxxx1010xxxxxxxx", InstName.Vstm,    InstEmit32.Vstm,     typeof(OpCode32SimdMemMult));
-            SetA32("<<<<11001x10xxxxxxxx1010xxxxxxxx", InstName.Vstm,    InstEmit32.Vstm,     typeof(OpCode32SimdMemMult));
-            SetA32("<<<<11010x10xxxxxxxx1010xxxxxxxx", InstName.Vstm,    InstEmit32.Vstm,     typeof(OpCode32SimdMemMult));
-            SetA32("<<<<1101xx00xxxxxxxx101xxxxxxxxx", InstName.Vstr,    InstEmit32.Vstr,     typeof(OpCode32SimdMemImm));
-            SetA32("111100110xxxxxxxxxxx1000xxx0xxxx", InstName.Vsub,    InstEmit32.Vsub_I,   typeof(OpCode32SimdReg));
-            SetA32("<<<<11100x11xxxxxxxx101xx1x0xxxx", InstName.Vsub,    InstEmit32.Vsub_S,   typeof(OpCode32SimdRegS));
-            SetA32("111100100x10xxxxxxxx1101xxx0xxxx", InstName.Vsub,    InstEmit32.Vsub_V,   typeof(OpCode32SimdReg));
-            SetA32("111100111x11xxxxxxxx10xxxxx0xxxx", InstName.Vtbl,    InstEmit32.Vtbl,     typeof(OpCode32SimdTbl));
-            SetA32("111100111x11<<10xxxx00001xx0xxxx", InstName.Vtrn,    InstEmit32.Vtrn,     typeof(OpCode32SimdCmpZ));
-            SetA32("111100111x11<<10xxxx00010xx0xxxx", InstName.Vuzp,    InstEmit32.Vuzp,     typeof(OpCode32SimdCmpZ));
-            SetA32("111100111x11<<10xxxx00011xx0xxxx", InstName.Vzip,    InstEmit32.Vzip,     typeof(OpCode32SimdCmpZ));
+            SetA32("<<<<11101x110000xxxx10xx11x0xxxx", InstName.Vabs,     InstEmit32.Vabs_S,   typeof(OpCode32SimdRegS));
+            SetA32("111100111x11xx01xxxx0x110xx0xxxx", InstName.Vabs,     InstEmit32.Vabs_V,   typeof(OpCode32SimdReg));
+            SetA32("111100100xxxxxxxxxxx1000xxx0xxxx", InstName.Vadd,     InstEmit32.Vadd_I,   typeof(OpCode32SimdReg));
+            SetA32("<<<<11100x11xxxxxxxx101xx0x0xxxx", InstName.Vadd,     InstEmit32.Vadd_S,   typeof(OpCode32SimdRegS));
+            SetA32("111100100x00xxxxxxxx1101xxx0xxxx", InstName.Vadd,     InstEmit32.Vadd_V,   typeof(OpCode32SimdReg));
+            SetA32("111100100x00xxxxxxxx0001xxx1xxxx", InstName.Vand,     InstEmit32.Vand_I,   typeof(OpCode32SimdBinary));
+            SetA32("111100110x11xxxxxxxx0001xxx1xxxx", InstName.Vbif,     InstEmit32.Vbif,     typeof(OpCode32SimdBinary));
+            SetA32("111100110x10xxxxxxxx0001xxx1xxxx", InstName.Vbit,     InstEmit32.Vbit,     typeof(OpCode32SimdBinary));
+            SetA32("111100110x01xxxxxxxx0001xxx1xxxx", InstName.Vbsl,     InstEmit32.Vbsl,     typeof(OpCode32SimdBinary));
+            SetA32("111100110x<<xxxxxxxx1000xxx1xxxx", InstName.Vceq,     InstEmit32.Vceq_I,   typeof(OpCode32SimdReg));
+            SetA32("111100100x00xxxxxxxx1110xxx0xxxx", InstName.Vceq,     InstEmit32.Vceq_V,   typeof(OpCode32SimdReg));
+            SetA32("111100111x11xx01xxxx0x010xx0xxxx", InstName.Vceq,     InstEmit32.Vceq_Z,   typeof(OpCode32SimdCmpZ));
+            SetA32("1111001x0x<<xxxxxxxx0011xxx1xxxx", InstName.Vcge,     InstEmit32.Vcge_I,   typeof(OpCode32SimdReg));
+            SetA32("111100110x00xxxxxxxx1110xxx0xxxx", InstName.Vcge,     InstEmit32.Vcge_V,   typeof(OpCode32SimdReg));
+            SetA32("111100111x11xx01xxxx0x001xx0xxxx", InstName.Vcge,     InstEmit32.Vcge_Z,   typeof(OpCode32SimdCmpZ));
+            SetA32("1111001x0x<<xxxxxxxx0011xxx0xxxx", InstName.Vcgt,     InstEmit32.Vcgt_I,   typeof(OpCode32SimdReg));
+            SetA32("111100110x10xxxxxxxx1110xxx0xxxx", InstName.Vcgt,     InstEmit32.Vcgt_V,   typeof(OpCode32SimdReg));
+            SetA32("111100111x11xx01xxxx0x000xx0xxxx", InstName.Vcgt,     InstEmit32.Vcgt_Z,   typeof(OpCode32SimdCmpZ));
+            SetA32("111100111x11xx01xxxx0x011xx0xxxx", InstName.Vcle,     InstEmit32.Vcle_Z,   typeof(OpCode32SimdCmpZ));
+            SetA32("111100111x11xx01xxxx0x100xx0xxxx", InstName.Vclt,     InstEmit32.Vclt_Z,   typeof(OpCode32SimdCmpZ));
+            SetA32("<<<<11101x11010xxxxx101x01x0xxxx", InstName.Vcmp,     InstEmit32.Vcmp,     typeof(OpCode32SimdS));
+            SetA32("<<<<11101x11010xxxxx101x11x0xxxx", InstName.Vcmpe,    InstEmit32.Vcmpe,    typeof(OpCode32SimdS));
+            SetA32("<<<<11101x110111xxxx101x11x0xxxx", InstName.Vcvt,     InstEmit32.Vcvt_FD,  typeof(OpCode32SimdS)); // FP 32 and 64, scalar.
+            SetA32("<<<<11101x11110xxxxx10xx11x0xxxx", InstName.Vcvt,     InstEmit32.Vcvt_FI,  typeof(OpCode32SimdCvtFI)); // FP32 to int.
+            SetA32("<<<<11101x111000xxxx10xxx1x0xxxx", InstName.Vcvt,     InstEmit32.Vcvt_FI,  typeof(OpCode32SimdCvtFI)); // Int to FP32.
+            SetA32("111111101x1111xxxxxx10>>x1x0xxxx", InstName.Vcvt,     InstEmit32.Vcvt_R,   typeof(OpCode32SimdCvtFI)); // The many FP32 to int encodings (fp).
+            SetA32("111100111x111011xxxx011xxxx0xxxx", InstName.Vcvt,     InstEmit32.Vcvt_V,   typeof(OpCode32SimdCmpZ)); // FP and integer, vector.
+            SetA32("<<<<11101x00xxxxxxxx101xx0x0xxxx", InstName.Vdiv,     InstEmit32.Vdiv_S,   typeof(OpCode32SimdRegS));
+            SetA32("<<<<11101xx0xxxxxxxx1011x0x10000", InstName.Vdup,     InstEmit32.Vdup,     typeof(OpCode32SimdDupGP));
+            SetA32("111100111x11xxxxxxxx11000xx0xxxx", InstName.Vdup,     InstEmit32.Vdup_1,   typeof(OpCode32SimdDupElem));
+            SetA32("111100101x11xxxxxxxxxxxxxxx0xxxx", InstName.Vext,     InstEmit32.Vext,     typeof(OpCode32SimdExt));
+            SetA32("111101001x10xxxxxxxxxx00xxxxxxxx", InstName.Vld1,     InstEmit32.Vld1,     typeof(OpCode32SimdMemSingle));
+            SetA32("111101000x10xxxxxxxx0111xxxxxxxx", InstName.Vld1,     InstEmit32.Vld1,     typeof(OpCode32SimdMemPair)); // Regs = 1.
+            SetA32("111101000x10xxxxxxxx1010xxxxxxxx", InstName.Vld1,     InstEmit32.Vld1,     typeof(OpCode32SimdMemPair)); // Regs = 2.
+            SetA32("111101000x10xxxxxxxx0110xxxxxxxx", InstName.Vld1,     InstEmit32.Vld1,     typeof(OpCode32SimdMemPair)); // Regs = 3.
+            SetA32("111101000x10xxxxxxxx0010xxxxxxxx", InstName.Vld1,     InstEmit32.Vld1,     typeof(OpCode32SimdMemPair)); // Regs = 4.
+            SetA32("111101001x10xxxxxxxxxx01xxxxxxxx", InstName.Vld2,     InstEmit32.Vld2,     typeof(OpCode32SimdMemSingle));
+            SetA32("111101000x10xxxxxxxx100xxxxxxxxx", InstName.Vld2,     InstEmit32.Vld2,     typeof(OpCode32SimdMemPair)); // Regs = 1, inc = 1/2 (itype).
+            SetA32("111101000x10xxxxxxxx0011xxxxxxxx", InstName.Vld2,     InstEmit32.Vld2,     typeof(OpCode32SimdMemPair)); // Regs = 2, inc = 2.
+            SetA32("111101001x10xxxxxxxxxx10xxxxxxxx", InstName.Vld3,     InstEmit32.Vld3,     typeof(OpCode32SimdMemSingle));
+            SetA32("111101000x10xxxxxxxx010xxxxxxxxx", InstName.Vld3,     InstEmit32.Vld3,     typeof(OpCode32SimdMemPair)); // Inc = 1/2 (itype).
+            SetA32("111101001x10xxxxxxxxxx11xxxxxxxx", InstName.Vld4,     InstEmit32.Vld4,     typeof(OpCode32SimdMemSingle));
+            SetA32("111101000x10xxxxxxxx000xxxxxxxxx", InstName.Vld4,     InstEmit32.Vld4,     typeof(OpCode32SimdMemPair)); // Inc = 1/2 (itype).
+            SetA32("<<<<11001x01xxxxxxxx1011xxxxxxx0", InstName.Vldm,     InstEmit32.Vldm,     typeof(OpCode32SimdMemMult));
+            SetA32("<<<<11001x11xxxxxxxx1011xxxxxxx0", InstName.Vldm,     InstEmit32.Vldm,     typeof(OpCode32SimdMemMult));
+            SetA32("<<<<11010x11xxxxxxxx1011xxxxxxx0", InstName.Vldm,     InstEmit32.Vldm,     typeof(OpCode32SimdMemMult));
+            SetA32("<<<<11001x01xxxxxxxx1010xxxxxxxx", InstName.Vldm,     InstEmit32.Vldm,     typeof(OpCode32SimdMemMult));
+            SetA32("<<<<11001x11xxxxxxxx1010xxxxxxxx", InstName.Vldm,     InstEmit32.Vldm,     typeof(OpCode32SimdMemMult));
+            SetA32("<<<<11010x11xxxxxxxx1010xxxxxxxx", InstName.Vldm,     InstEmit32.Vldm,     typeof(OpCode32SimdMemMult));
+            SetA32("<<<<1101xx01xxxxxxxx101xxxxxxxxx", InstName.Vldr,     InstEmit32.Vldr,     typeof(OpCode32SimdMemImm));
+            SetA32("1111001x0x<<xxxxxxxx0110xxx0xxxx", InstName.Vmax,     InstEmit32.Vmax_I,   typeof(OpCode32SimdReg));
+            SetA32("111100100x00xxxxxxxx1111xxx0xxxx", InstName.Vmax,     InstEmit32.Vmax_V,   typeof(OpCode32SimdReg));
+            SetA32("1111001x0x<<xxxxxxxx0110xxx1xxxx", InstName.Vmin,     InstEmit32.Vmin_I,   typeof(OpCode32SimdReg));
+            SetA32("111100100x10xxxxxxxx1111xxx0xxxx", InstName.Vmin,     InstEmit32.Vmin_V,   typeof(OpCode32SimdReg));
+            SetA32("111111101x00xxxxxxxx10>>x0x0xxxx", InstName.Vmaxnm,   InstEmit32.Vmaxnm_S, typeof(OpCode32SimdRegS));
+            SetA32("111100110x0xxxxxxxxx1111xxx1xxxx", InstName.Vmaxnm,   InstEmit32.Vmaxnm_V, typeof(OpCode32SimdReg));
+            SetA32("111111101x00xxxxxxxx10>>x1x0xxxx", InstName.Vminnm,   InstEmit32.Vminnm_S, typeof(OpCode32SimdRegS));
+            SetA32("111100110x1xxxxxxxxx1111xxx1xxxx", InstName.Vminnm,   InstEmit32.Vminnm_V, typeof(OpCode32SimdReg));
+            SetA32("1111001x1x<<xxxxxxxx000xx1x0xxxx", InstName.Vmla,     InstEmit32.Vmla_1,   typeof(OpCode32SimdRegElem));
+            SetA32("111100100xxxxxxxxxxx1001xxx0xxxx", InstName.Vmla,     InstEmit32.Vmla_I,   typeof(OpCode32SimdReg));
+            SetA32("<<<<11100x00xxxxxxxx101xx0x0xxxx", InstName.Vmla,     InstEmit32.Vmla_S,   typeof(OpCode32SimdRegS));
+            SetA32("111100100x00xxxxxxxx1101xxx1xxxx", InstName.Vmla,     InstEmit32.Vmla_V,   typeof(OpCode32SimdReg));
+            SetA32("1111001x1x<<xxxxxxxx010xx1x0xxxx", InstName.Vmls,     InstEmit32.Vmls_1,   typeof(OpCode32SimdRegElem));
+            SetA32("<<<<11100x00xxxxxxxx101xx1x0xxxx", InstName.Vmls,     InstEmit32.Vmls_S,   typeof(OpCode32SimdRegS));
+            SetA32("111100100x10xxxxxxxx1101xxx1xxxx", InstName.Vmls,     InstEmit32.Vmls_V,   typeof(OpCode32SimdReg));
+            SetA32("111100110xxxxxxxxxxx1001xxx0xxxx", InstName.Vmls,     InstEmit32.Vmls_I,   typeof(OpCode32SimdReg));
+            SetA32("1111001x1x<<xxxxxxx01010x0x0xxxx", InstName.Vmlsl,    InstEmit32.Vmlsl_I,  typeof(OpCode32SimdRegLong));
+            SetA32("<<<<11100xx0xxxxxxxx1011xxx10000", InstName.Vmov,     InstEmit32.Vmov_G1,  typeof(OpCode32SimdMovGpElem)); // From gen purpose.
+            SetA32("<<<<1110xxx1xxxxxxxx1011xxx10000", InstName.Vmov,     InstEmit32.Vmov_G1,  typeof(OpCode32SimdMovGpElem)); // To gen purpose.
+            SetA32("<<<<1100010xxxxxxxxx101000x1xxxx", InstName.Vmov,     InstEmit32.Vmov_G2,  typeof(OpCode32SimdMovGpDouble)); // To/from gen purpose x2 and single precision x2.
+            SetA32("<<<<1100010xxxxxxxxx101100x1xxxx", InstName.Vmov,     InstEmit32.Vmov_GD,  typeof(OpCode32SimdMovGpDouble)); // To/from gen purpose x2 and double precision.
+            SetA32("<<<<1110000xxxxxxxxx1010x0010000", InstName.Vmov,     InstEmit32.Vmov_GS,  typeof(OpCode32SimdMovGp)); // To/from gen purpose and single precision.
+            SetA32("1111001x1x000xxxxxxx0xx00x01xxxx", InstName.Vmov,     InstEmit32.Vmov_I,   typeof(OpCode32SimdImm)); // D/Q vector I32.
+            SetA32("<<<<11101x11xxxxxxxx101x0000xxxx", InstName.Vmov,     InstEmit32.Vmov_I,   typeof(OpCode32SimdImm44)); // Scalar f16/32/64 based on size 01 10 11.
+            SetA32("1111001x1x000xxxxxxx10x00x01xxxx", InstName.Vmov,     InstEmit32.Vmov_I,   typeof(OpCode32SimdImm)); // D/Q I16.
+            SetA32("1111001x1x000xxxxxxx11xx0x01xxxx", InstName.Vmov,     InstEmit32.Vmov_I,   typeof(OpCode32SimdImm)); // D/Q (dt - from cmode).
+            SetA32("1111001x1x000xxxxxxx11100x11xxxx", InstName.Vmov,     InstEmit32.Vmov_I,   typeof(OpCode32SimdImm)); // D/Q I64.
+            SetA32("<<<<11101x110000xxxx101x01x0xxxx", InstName.Vmov,     InstEmit32.Vmov_S,   typeof(OpCode32SimdS));
+            SetA32("1111001x1x001000xxx0101000x1xxxx", InstName.Vmovl,    InstEmit32.Vmovl,    typeof(OpCode32SimdLong));
+            SetA32("1111001x1x010000xxx0101000x1xxxx", InstName.Vmovl,    InstEmit32.Vmovl,    typeof(OpCode32SimdLong));
+            SetA32("1111001x1x100000xxx0101000x1xxxx", InstName.Vmovl,    InstEmit32.Vmovl,    typeof(OpCode32SimdLong));
+            SetA32("111100111x11xx10xxxx001000x0xxx0", InstName.Vmovn,    InstEmit32.Vmovn,    typeof(OpCode32SimdCmpZ));
+            SetA32("<<<<11101111xxxxxxxx101000010000", InstName.Vmrs,     InstEmit32.Vmrs,     typeof(OpCode32SimdSpecial));
+            SetA32("<<<<11101110xxxxxxxx101000010000", InstName.Vmsr,     InstEmit32.Vmsr,     typeof(OpCode32SimdSpecial));
+            SetA32("1111001x1x<<xxxxxxxx100xx1x0xxxx", InstName.Vmul,     InstEmit32.Vmul_1,   typeof(OpCode32SimdRegElem));
+            SetA32("111100100x<<xxxxxxxx1001xxx1xxxx", InstName.Vmul,     InstEmit32.Vmul_I,   typeof(OpCode32SimdReg));
+            SetA32("111100110x00xxxxxxxx1001xxx1xxxx", InstName.Vmul,     InstEmit32.Vmul_I,   typeof(OpCode32SimdReg));
+            SetA32("<<<<11100x10xxxxxxxx101xx0x0xxxx", InstName.Vmul,     InstEmit32.Vmul_S,   typeof(OpCode32SimdRegS));
+            SetA32("111100110x00xxxxxxxx1101xxx1xxxx", InstName.Vmul,     InstEmit32.Vmul_V,   typeof(OpCode32SimdReg));
+            SetA32("1111001x1x<<xxxxxxx01010x1x0xxxx", InstName.Vmull,    InstEmit32.Vmull_1,  typeof(OpCode32SimdRegElemLong));
+            SetA32("1111001x1x<<xxxxxxx01100x0x0xxxx", InstName.Vmull,    InstEmit32.Vmull_I,  typeof(OpCode32SimdRegLong));
+            SetA32("111100101x00xxxxxxx01110x0x0xxxx", InstName.Vmull,    InstEmit32.Vmull_I,  typeof(OpCode32SimdRegLong)); // Polynomial
+            SetA32("1111001x1x000xxxxxxx0xx00x11xxxx", InstName.Vmvn,     InstEmit32.Vmvn_I,   typeof(OpCode32SimdImm)); // D/Q vector I32.
+            SetA32("1111001x1x000xxxxxxx10x00x11xxxx", InstName.Vmvn,     InstEmit32.Vmvn_I,   typeof(OpCode32SimdImm));
+            SetA32("1111001x1x000xxxxxxx110x0x11xxxx", InstName.Vmvn,     InstEmit32.Vmvn_I,   typeof(OpCode32SimdImm));
+            SetA32("<<<<11101x110001xxxx101x01x0xxxx", InstName.Vneg,     InstEmit32.Vneg_S,   typeof(OpCode32SimdS));
+            SetA32("111100111x11xx01xxxx0x111xx0xxxx", InstName.Vneg,     InstEmit32.Vneg_V,   typeof(OpCode32Simd));
+            SetA32("<<<<11100x01xxxxxxxx101xx1x0xxxx", InstName.Vnmla,    InstEmit32.Vnmla_S,  typeof(OpCode32SimdRegS));
+            SetA32("<<<<11100x01xxxxxxxx101xx0x0xxxx", InstName.Vnmls,    InstEmit32.Vnmls_S,  typeof(OpCode32SimdRegS));
+            SetA32("<<<<11100x10xxxxxxxx101xx1x0xxxx", InstName.Vnmul,    InstEmit32.Vnmul_S,  typeof(OpCode32SimdRegS));
+            SetA32("111100100x10xxxxxxxx0001xxx1xxxx", InstName.Vorr,     InstEmit32.Vorr_I,   typeof(OpCode32SimdBinary));
+            SetA32("1111001x1x000xxxxxxx0xx10x01xxxx", InstName.Vorr,     InstEmit32.Vorr_II,  typeof(OpCode32SimdImm));
+            SetA32("111100100x<<xxxxxxxx1011x0x1xxxx", InstName.Vpadd,    InstEmit32.Vpadd_I,  typeof(OpCode32SimdReg));
+            SetA32("111100110x00xxxxxxxx1101x0x0xxxx", InstName.Vpadd,    InstEmit32.Vpadd_V,  typeof(OpCode32SimdReg));
+            SetA32("1111001x1x>>>xxxxxxx100101x1xxx0", InstName.Vqrshrn,  InstEmit32.Vqrshrn,  typeof(OpCode32SimdShImmNarrow));
+            SetA32("111100111x>>>xxxxxxx100001x1xxx0", InstName.Vqrshrun, InstEmit32.Vqrshrun, typeof(OpCode32SimdShImmNarrow));
+            SetA32("111100111x111011xxxx010x0xx0xxxx", InstName.Vrecpe,   InstEmit32.Vrecpe,   typeof(OpCode32SimdSqrte));
+            SetA32("111100100x00xxxxxxxx1111xxx1xxxx", InstName.Vrecps,   InstEmit32.Vrecps,   typeof(OpCode32SimdReg));
+            SetA32("111100111x11xx00xxxx000<<xx0xxxx", InstName.Vrev,     InstEmit32.Vrev,     typeof(OpCode32SimdRev));
+            SetA32("111111101x1110xxxxxx101x01x0xxxx", InstName.Vrint,    InstEmit32.Vrint_RM, typeof(OpCode32SimdCvtFI));
+            SetA32("<<<<11101x110110xxxx101x11x0xxxx", InstName.Vrint,    InstEmit32.Vrint_Z,  typeof(OpCode32SimdCvtFI));
+            SetA32("1111001x1x>>>xxxxxxx0010>xx1xxxx", InstName.Vrshr,    InstEmit32.Vrshr,    typeof(OpCode32SimdShImm));
+            SetA32("111100111x111011xxxx010x1xx0xxxx", InstName.Vrsqrte,  InstEmit32.Vrsqrte,  typeof(OpCode32SimdSqrte));
+            SetA32("111100100x10xxxxxxxx1111xxx1xxxx", InstName.Vrsqrts,  InstEmit32.Vrsqrts,  typeof(OpCode32SimdReg));
+            SetA32("111111100xxxxxxxxxxx101xx0x0xxxx", InstName.Vsel,     InstEmit32.Vsel,     typeof(OpCode32SimdSel));
+            SetA32("111100101x>>>xxxxxxx0101>xx1xxxx", InstName.Vshl,     InstEmit32.Vshl,     typeof(OpCode32SimdShImm));
+            SetA32("1111001x0xxxxxxxxxxx0100xxx0xxxx", InstName.Vshl,     InstEmit32.Vshl_I,   typeof(OpCode32SimdReg));
+            SetA32("1111001x1x>>>xxxxxxx0000>xx1xxxx", InstName.Vshr,     InstEmit32.Vshr,     typeof(OpCode32SimdShImm));
+            SetA32("111100101x>>>xxxxxxx100000x1xxx0", InstName.Vshrn,    InstEmit32.Vshrn,    typeof(OpCode32SimdShImmNarrow));
+            SetA32("<<<<11101x110001xxxx101x11x0xxxx", InstName.Vsqrt,    InstEmit32.Vsqrt_S,  typeof(OpCode32SimdS));
+            SetA32("111101001x00xxxxxxxx<<00xxxxxxxx", InstName.Vst1,     InstEmit32.Vst1,     typeof(OpCode32SimdMemSingle));
+            SetA32("111101000x00xxxxxxxx0111xxxxxxxx", InstName.Vst1,     InstEmit32.Vst1,     typeof(OpCode32SimdMemPair)); // Regs = 1.
+            SetA32("111101000x00xxxxxxxx1010xxxxxxxx", InstName.Vst1,     InstEmit32.Vst1,     typeof(OpCode32SimdMemPair)); // Regs = 2.
+            SetA32("111101000x00xxxxxxxx0110xxxxxxxx", InstName.Vst1,     InstEmit32.Vst1,     typeof(OpCode32SimdMemPair)); // Regs = 3.
+            SetA32("111101000x00xxxxxxxx0010xxxxxxxx", InstName.Vst1,     InstEmit32.Vst1,     typeof(OpCode32SimdMemPair)); // Regs = 4.
+            SetA32("111101001x00xxxxxxxx<<01xxxxxxxx", InstName.Vst2,     InstEmit32.Vst2,     typeof(OpCode32SimdMemSingle));
+            SetA32("111101000x00xxxxxxxx100xxxxxxxxx", InstName.Vst2,     InstEmit32.Vst2,     typeof(OpCode32SimdMemPair)); // Regs = 1, inc = 1/2 (itype).
+            SetA32("111101000x00xxxxxxxx0011xxxxxxxx", InstName.Vst2,     InstEmit32.Vst2,     typeof(OpCode32SimdMemPair)); // Regs = 2, inc = 2.
+            SetA32("111101001x00xxxxxxxx<<10xxxxxxxx", InstName.Vst3,     InstEmit32.Vst3,     typeof(OpCode32SimdMemSingle));
+            SetA32("111101000x00xxxxxxxx010xxxxxxxxx", InstName.Vst3,     InstEmit32.Vst3,     typeof(OpCode32SimdMemPair)); // Inc = 1/2 (itype).
+            SetA32("111101001x00xxxxxxxx<<11xxxxxxxx", InstName.Vst4,     InstEmit32.Vst4,     typeof(OpCode32SimdMemSingle));
+            SetA32("111101000x00xxxxxxxx000xxxxxxxxx", InstName.Vst4,     InstEmit32.Vst4,     typeof(OpCode32SimdMemPair)); // Inc = 1/2 (itype).
+            SetA32("<<<<11001x00xxxxxxxx1011xxxxxxx0", InstName.Vstm,     InstEmit32.Vstm,     typeof(OpCode32SimdMemMult));
+            SetA32("<<<<11001x10xxxxxxxx1011xxxxxxx0", InstName.Vstm,     InstEmit32.Vstm,     typeof(OpCode32SimdMemMult));
+            SetA32("<<<<11010x10xxxxxxxx1011xxxxxxx0", InstName.Vstm,     InstEmit32.Vstm,     typeof(OpCode32SimdMemMult));
+            SetA32("<<<<11001x00xxxxxxxx1010xxxxxxxx", InstName.Vstm,     InstEmit32.Vstm,     typeof(OpCode32SimdMemMult));
+            SetA32("<<<<11001x10xxxxxxxx1010xxxxxxxx", InstName.Vstm,     InstEmit32.Vstm,     typeof(OpCode32SimdMemMult));
+            SetA32("<<<<11010x10xxxxxxxx1010xxxxxxxx", InstName.Vstm,     InstEmit32.Vstm,     typeof(OpCode32SimdMemMult));
+            SetA32("<<<<1101xx00xxxxxxxx101xxxxxxxxx", InstName.Vstr,     InstEmit32.Vstr,     typeof(OpCode32SimdMemImm));
+            SetA32("111100110xxxxxxxxxxx1000xxx0xxxx", InstName.Vsub,     InstEmit32.Vsub_I,   typeof(OpCode32SimdReg));
+            SetA32("<<<<11100x11xxxxxxxx101xx1x0xxxx", InstName.Vsub,     InstEmit32.Vsub_S,   typeof(OpCode32SimdRegS));
+            SetA32("111100100x10xxxxxxxx1101xxx0xxxx", InstName.Vsub,     InstEmit32.Vsub_V,   typeof(OpCode32SimdReg));
+            SetA32("111100111x11xxxxxxxx10xxxxx0xxxx", InstName.Vtbl,     InstEmit32.Vtbl,     typeof(OpCode32SimdTbl));
+            SetA32("111100111x11<<10xxxx00001xx0xxxx", InstName.Vtrn,     InstEmit32.Vtrn,     typeof(OpCode32SimdCmpZ));
+            SetA32("111100111x11<<10xxxx00010xx0xxxx", InstName.Vuzp,     InstEmit32.Vuzp,     typeof(OpCode32SimdCmpZ));
+            SetA32("111100111x11<<10xxxx00011xx0xxxx", InstName.Vzip,     InstEmit32.Vzip,     typeof(OpCode32SimdCmpZ));
 #endregion
 
             FillFastLookupTable(_instA32FastLookup, _allInstA32);

+ 0 - 2
ARMeilleure/Instructions/InstEmitAlu32.cs

@@ -128,8 +128,6 @@ namespace ARMeilleure.Instructions
 
         public static void Cmp(ArmEmitterContext context)
         {
-            IOpCode32Alu op = (IOpCode32Alu)context.CurrOp;
-
             Operand n = GetAluN(context);
             Operand m = GetAluM(context, setCarry: false);
 

+ 124 - 79
ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs

@@ -2,6 +2,7 @@
 using ARMeilleure.IntermediateRepresentation;
 using ARMeilleure.Translation;
 using System;
+using System.Diagnostics;
 
 using static ARMeilleure.Instructions.InstEmitFlowHelper;
 using static ARMeilleure.Instructions.InstEmitHelper;
@@ -113,20 +114,13 @@ namespace ARMeilleure.Instructions
             Operand insert = GetIntA32(context, op.Rt);
 
             // Zero extend into an I64, then replicate. Saves the most time over elementwise inserts.
-            switch (op.Size)
+            insert = op.Size switch
             {
-                case 2:
-                    insert = context.Multiply(context.ZeroExtend32(OperandType.I64, insert), Const(0x0000000100000001u));
-                    break;
-                case 1:
-                    insert = context.Multiply(context.ZeroExtend16(OperandType.I64, insert), Const(0x0001000100010001u));
-                    break;
-                case 0:
-                    insert = context.Multiply(context.ZeroExtend8(OperandType.I64, insert), Const(0x0101010101010101u));
-                    break;
-                default:
-                    throw new InvalidOperationException("Unknown Vdup Size.");
-            }
+                2 => context.Multiply(context.ZeroExtend32(OperandType.I64, insert), Const(0x0000000100000001u)),
+                1 => context.Multiply(context.ZeroExtend16(OperandType.I64, insert), Const(0x0001000100010001u)),
+                0 => context.Multiply(context.ZeroExtend8(OperandType.I64, insert), Const(0x0101010101010101u)),
+                _ => throw new InvalidOperationException($"Invalid Vdup size \"{op.Size}\".")
+            };
 
             InsertScalar(context, op.Vd, insert);
             if (op.Q)
@@ -142,20 +136,13 @@ namespace ARMeilleure.Instructions
             Operand insert = EmitVectorExtractZx32(context, op.Vm >> 1, ((op.Vm & 1) << (3 - op.Size)) + op.Index, op.Size);
 
             // Zero extend into an I64, then replicate. Saves the most time over elementwise inserts.
-            switch (op.Size)
+            insert = op.Size switch
             {
-                case 2:
-                    insert = context.Multiply(context.ZeroExtend32(OperandType.I64, insert), Const(0x0000000100000001u));
-                    break;
-                case 1:
-                    insert = context.Multiply(context.ZeroExtend16(OperandType.I64, insert), Const(0x0001000100010001u));
-                    break;
-                case 0:
-                    insert = context.Multiply(context.ZeroExtend8(OperandType.I64, insert), Const(0x0101010101010101u));
-                    break;
-                default:
-                    throw new InvalidOperationException("Unknown Vdup Size.");
-            }
+                2 => context.Multiply(context.ZeroExtend32(OperandType.I64, insert), Const(0x0000000100000001u)),
+                1 => context.Multiply(context.ZeroExtend16(OperandType.I64, insert), Const(0x0001000100010001u)),
+                0 => context.Multiply(context.ZeroExtend8(OperandType.I64, insert), Const(0x0101010101010101u)),
+                _ => throw new InvalidOperationException($"Invalid Vdup size \"{op.Size}\".")
+            };
 
             InsertScalar(context, op.Vd, insert);
             if (op.Q)
@@ -575,51 +562,53 @@ namespace ARMeilleure.Instructions
             }
         }
 
-        public static void Vmul_S(ArmEmitterContext context)
+        public static void Vmla_S(ArmEmitterContext context)
         {
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                EmitScalarBinaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd);
+                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd);
             }
             else if (Optimizations.FastFP)
             {
-                EmitScalarBinaryOpF32(context, (op1, op2) => context.Multiply(op1, op2));
+                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
+                {
+                    return context.Add(op1, context.Multiply(op2, op3));
+                });
             }
             else
             {
-                EmitScalarBinaryOpF32(context, (op1, op2) =>
+                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return EmitSoftFloatCall(context, SoftFloat32.FPMul, SoftFloat64.FPMul, op1, op2);
+                    return EmitSoftFloatCall(context, SoftFloat32.FPMulAdd, SoftFloat64.FPMulAdd, op1, op2, op3);
                 });
             }
         }
 
-        public static void Vmul_V(ArmEmitterContext context)
+        public static void Vmla_V(ArmEmitterContext context)
         {
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                EmitVectorBinaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
+                EmitVectorTernaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd);
             }
             else if (Optimizations.FastFP)
             {
-                EmitVectorBinaryOpF32(context, (op1, op2) => context.Multiply(op1, op2));
+                EmitVectorTernaryOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
             }
             else
             {
-                EmitVectorBinaryOpF32(context, (op1, op2) =>
+                EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulFpscr, SoftFloat64.FPMulFpscr, op1, op2);
+                    return EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulAddFpscr, SoftFloat64.FPMulAddFpscr, op1, op2, op3);
                 });
             }
         }
 
-        public static void Vmul_I(ArmEmitterContext context)
+        public static void Vmla_I(ArmEmitterContext context)
         {
-            if ((context.CurrOp as OpCode32SimdReg).U) throw new NotImplementedException("Polynomial mode not implemented");
-            EmitVectorBinaryOpSx32(context, (op1, op2) => context.Multiply(op1, op2));
+            EmitVectorTernaryOpZx32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
         }
 
-        public static void Vmul_1(ArmEmitterContext context)
+        public static void Vmla_1(ArmEmitterContext context)
         {
             OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
 
@@ -627,70 +616,70 @@ namespace ARMeilleure.Instructions
             {
                 if (Optimizations.FastFP && Optimizations.UseSse2)
                 {
-                    EmitVectorByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
+                    EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd);
                 }
                 else if (Optimizations.FastFP)
                 {
-                    EmitVectorByScalarOpF32(context, (op1, op2) => context.Multiply(op1, op2));
+                    EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
                 }
                 else
                 {
-                    EmitVectorByScalarOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulFpscr, SoftFloat64.FPMulFpscr, op1, op2));
+                    EmitVectorsByScalarOpF32(context, (op1, op2, op3) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulAddFpscr, SoftFloat64.FPMulAddFpscr, op1, op2, op3));
                 }
             }
             else
             {
-                EmitVectorByScalarOpI32(context, (op1, op2) => context.Multiply(op1, op2), false);
+                EmitVectorsByScalarOpI32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)), false);
             }
         }
 
-        public static void Vmla_S(ArmEmitterContext context)
+        public static void Vmls_S(ArmEmitterContext context)
         {
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd);
+                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd);
             }
             else if (Optimizations.FastFP)
             {
                 EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return context.Add(op1, context.Multiply(op2, op3));
+                    return context.Subtract(op1, context.Multiply(op2, op3));
                 });
             }
             else
             {
                 EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return EmitSoftFloatCall(context, SoftFloat32.FPMulAdd, SoftFloat64.FPMulAdd, op1, op2, op3);
+                    return EmitSoftFloatCall(context, SoftFloat32.FPMulSub, SoftFloat64.FPMulSub, op1, op2, op3);
                 });
             }
         }
 
-        public static void Vmla_V(ArmEmitterContext context)
+        public static void Vmls_V(ArmEmitterContext context)
         {
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                EmitVectorTernaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd);
+                EmitVectorTernaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd);
             }
             else if (Optimizations.FastFP)
             {
-                EmitVectorTernaryOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
+                EmitVectorTernaryOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
             }
             else
             {
                 EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulAddFpscr, SoftFloat64.FPMulAddFpscr, op1, op2, op3);
+                    return EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulSubFpscr, SoftFloat64.FPMulSubFpscr, op1, op2, op3);
                 });
             }
         }
 
-        public static void Vmla_I(ArmEmitterContext context)
+        public static void Vmls_I(ArmEmitterContext context)
         {
-            EmitVectorTernaryOpZx32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
+            EmitVectorTernaryOpZx32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
         }
 
-        public static void Vmla_1(ArmEmitterContext context)
+        public static void Vmls_1(ArmEmitterContext context)
         {
             OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
 
@@ -698,70 +687,83 @@ namespace ARMeilleure.Instructions
             {
                 if (Optimizations.FastFP && Optimizations.UseSse2)
                 {
-                    EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd);
+                    EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd);
                 }
                 else if (Optimizations.FastFP)
                 {
-                    EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
+                    EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
                 }
                 else
                 {
-                    EmitVectorsByScalarOpF32(context, (op1, op2, op3) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulAddFpscr, SoftFloat64.FPMulAddFpscr, op1, op2, op3));
+                    EmitVectorsByScalarOpF32(context, (op1, op2, op3) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulSubFpscr, SoftFloat64.FPMulSubFpscr, op1, op2, op3));
                 }
             }
             else
             {
-                EmitVectorsByScalarOpI32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)), false);
+                EmitVectorsByScalarOpI32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)), false);
             }
         }
 
-        public static void Vmls_S(ArmEmitterContext context)
+        public static void Vmlsl_I(ArmEmitterContext context) // Emits the integer multiply-subtract-long form: destination element minus the product of the matching source elements.
+        {
+            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+
+            EmitVectorTernaryLongOpI32(context, (opD, op1, op2) => context.Subtract(opD, context.Multiply(op1, op2)), !op.U); // Last argument is the helper's "signed" flag: elements are read as signed when U is clear.
+        }
+
+        public static void Vmul_S(ArmEmitterContext context)
         {
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd);
+                EmitScalarBinaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd);
             }
             else if (Optimizations.FastFP)
             {
-                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
-                {
-                    return context.Subtract(op1, context.Multiply(op2, op3));
-                });
+                EmitScalarBinaryOpF32(context, (op1, op2) => context.Multiply(op1, op2));
             }
             else
             {
-                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
+                EmitScalarBinaryOpF32(context, (op1, op2) =>
                 {
-                    return EmitSoftFloatCall(context, SoftFloat32.FPMulSub, SoftFloat64.FPMulSub, op1, op2, op3);
+                    return EmitSoftFloatCall(context, SoftFloat32.FPMul, SoftFloat64.FPMul, op1, op2);
                 });
             }
         }
 
-        public static void Vmls_V(ArmEmitterContext context)
+        public static void Vmul_V(ArmEmitterContext context)
         {
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                EmitVectorTernaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd);
+                EmitVectorBinaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
             }
             else if (Optimizations.FastFP)
             {
-                EmitVectorTernaryOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
+                EmitVectorBinaryOpF32(context, (op1, op2) => context.Multiply(op1, op2));
             }
             else
             {
-                EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
+                EmitVectorBinaryOpF32(context, (op1, op2) =>
                 {
-                    return EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulSubFpscr, SoftFloat64.FPMulSubFpscr, op1, op2, op3);
+                    return EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulFpscr, SoftFloat64.FPMulFpscr, op1, op2);
                 });
             }
         }
 
-        public static void Vmls_I(ArmEmitterContext context)
+        public static void Vmul_I(ArmEmitterContext context)
         {
-            EmitVectorTernaryOpZx32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
+            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+
+            if (op.U) // This instruction is always signed, U indicates polynomial mode.
+            {
+                EmitVectorBinaryOpZx32(context, (op1, op2) => EmitPolynomialMultiply(context, op1, op2, 8 << op.Size));
+            }
+            else
+            {
+                EmitVectorBinaryOpSx32(context, (op1, op2) => context.Multiply(op1, op2));
+            }
         }
 
-        public static void Vmls_1(ArmEmitterContext context)
+        public static void Vmul_1(ArmEmitterContext context)
         {
             OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
 
@@ -769,20 +771,41 @@ namespace ARMeilleure.Instructions
             {
                 if (Optimizations.FastFP && Optimizations.UseSse2)
                 {
-                    EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd);
+                    EmitVectorByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
                 }
                 else if (Optimizations.FastFP)
                 {
-                    EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
+                    EmitVectorByScalarOpF32(context, (op1, op2) => context.Multiply(op1, op2));
                 }
                 else
                 {
-                    EmitVectorsByScalarOpF32(context, (op1, op2, op3) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulSubFpscr, SoftFloat64.FPMulSubFpscr, op1, op2, op3));
+                    EmitVectorByScalarOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulFpscr, SoftFloat64.FPMulFpscr, op1, op2));
                 }
             }
             else
             {
-                EmitVectorsByScalarOpI32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)), false);
+                EmitVectorByScalarOpI32(context, (op1, op2) => context.Multiply(op1, op2), false);
+            }
+        }
+
+        public static void Vmull_1(ArmEmitterContext context) // Emits the by-scalar multiply-long form: each element of the first source times one selected element, via the long by-scalar helper.
+        {
+            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
+
+            EmitVectorByScalarLongOpI32(context, (op1, op2) => context.Multiply(op1, op2), !op.U); // Last argument is the helper's "signed" flag: signed when U is clear.
+        }
+
+        public static void Vmull_I(ArmEmitterContext context) // Emits the vector multiply-long form, choosing between polynomial and ordinary integer multiplication.
+        {
+            OpCode32SimdRegLong op = (OpCode32SimdRegLong)context.CurrOp;
+
+            if (op.Polynomial) // Polynomial variant: elements are always read unsigned (signed: false).
+            {
+                EmitVectorBinaryLongOpI32(context, (op1, op2) => EmitPolynomialMultiply(context, op1, op2, 8 << op.Size), false); // 8 << op.Size is the element width in bits.
+            }
+            else
+            {
+                EmitVectorBinaryLongOpI32(context, (op1, op2) => context.Multiply(op1, op2), !op.U); // Integer variant: signed when U is clear.
             }
         }
 
@@ -1157,5 +1180,27 @@ namespace ARMeilleure.Instructions
                 EmitVectorBinaryOpSimd32(context, genericEmit);
             }
         }
+
+        private static Operand EmitPolynomialMultiply(ArmEmitterContext context, Operand op1, Operand op2, int eSize) // Emits a polynomial product of op1 and op2: XOR-accumulates op2 multiplied by each isolated bit of op1, for bits 0..eSize-1.
+        {
+            Debug.Assert(eSize <= 32); // Element widths above 32 bits are not handled here.
+
+            Operand result = eSize == 32 ? Const(0L) : Const(0); // Zero accumulator; a long constant is used for 32-bit elements (presumably I64 to match the widened operands below).
+
+            if (eSize == 32) // 32-bit inputs are zero-extended to I64 before multiplying (presumably so the wider product is not truncated).
+            {
+                op1 = context.ZeroExtend32(OperandType.I64, op1);
+                op2 = context.ZeroExtend32(OperandType.I64, op2);
+            }
+
+            for (int i = 0; i < eSize; i++) // One iteration per bit of op1.
+            {
+                Operand mask = context.BitwiseAnd(op1, Const(op1.Type, 1L << i)); // Either 0 or (1 << i), depending on bit i of op1.
+
+                result = context.BitwiseExclusiveOr(result, context.Multiply(op2, mask)); // op2 * mask equals op2 << i when the bit is set, else 0; XOR combines partial products without carries.
+            }
+
+            return result;
+        }
     }
 }

+ 90 - 5
ARMeilleure/Instructions/InstEmitSimdHelper32.cs

@@ -57,7 +57,6 @@ namespace ARMeilleure.Instructions
                 // From dreg.
                 vec = GetVecA32(reg >> 1);
                 insert = context.VectorInsert(vec, value, reg & 1);
-                
             }
             else
             {
@@ -69,6 +68,11 @@ namespace ARMeilleure.Instructions
             context.Copy(vec, insert);
         }
 
+        public static Operand ExtractElement(ArmEmitterContext context, int reg, int size, bool signed) // Extracts a single element addressed by a combined index: the high bits of reg pick the vector, the low bits pick the element inside it.
+        {
+            return EmitVectorExtract32(context, reg >> (4 - size), reg & ((16 >> size) - 1), size, signed); // 16 >> size is the element count for this size in 16 bytes, so the mask keeps the in-vector index and the shift keeps the vector number.
+        }
+
         public static void EmitVectorImmUnaryOp32(ArmEmitterContext context, Func1I emit)
         {
             IOpCode32SimdImm op = (IOpCode32SimdImm)context.CurrOp;
@@ -250,6 +254,57 @@ namespace ARMeilleure.Instructions
             context.Copy(GetVecA32(op.Qd), res);
         }
 
+        public static void EmitVectorBinaryLongOpI32(ArmEmitterContext context, Func2I emit, bool signed) // Applies emit to each pair of source elements (from Qn/Qm) and stores each result as an element of size op.Size + 1 in Qd.
+        {
+            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+
+            Operand res = context.VectorZero(); // The result vector is built up from zero; the old destination contents are not read.
+
+            int elems = op.GetBytesCount() >> op.Size; // Byte count divided by the source element size in bytes.
+
+            for (int index = 0; index < elems; index++)
+            {
+                Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed); // Source elements are extracted sign- or zero-extended according to "signed".
+                Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, signed);
+
+                if (op.Size == 2) // 32-bit elements are widened to I64 before emit is applied.
+                {
+                    ne = signed ? context.SignExtend32(OperandType.I64, ne) : context.ZeroExtend32(OperandType.I64, ne);
+                    me = signed ? context.SignExtend32(OperandType.I64, me) : context.ZeroExtend32(OperandType.I64, me);
+                }
+
+                res = EmitVectorInsert(context, res, emit(ne, me), index, op.Size + 1); // Inserted at the next larger element size, at the same element index.
+            }
+
+            context.Copy(GetVecA32(op.Qd), res); // Write the assembled vector to the destination register.
+        }
+
+        public static void EmitVectorTernaryLongOpI32(ArmEmitterContext context, Func3I emit, bool signed) // Like the binary long helper, but emit also receives the current destination element (read at size op.Size + 1) as its first argument.
+        {
+            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+
+            Operand res = context.VectorZero(); // The result vector is built up from zero; destination elements are fed in through "de" below.
+
+            int elems = op.GetBytesCount() >> op.Size; // Byte count divided by the source element size in bytes.
+
+            for (int index = 0; index < elems; index++)
+            {
+                Operand de = EmitVectorExtract32(context, op.Qd, op.Id + index, op.Size + 1, signed); // Destination element, read at the widened size.
+                Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size,     signed); // Source elements, read at the original size.
+                Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size,     signed);
+
+                if (op.Size == 2) // Only the 32-bit sources are widened to I64 here; "de" is passed to emit as extracted.
+                {
+                    ne = signed ? context.SignExtend32(OperandType.I64, ne) : context.ZeroExtend32(OperandType.I64, ne);
+                    me = signed ? context.SignExtend32(OperandType.I64, me) : context.ZeroExtend32(OperandType.I64, me);
+                }
+
+                res = EmitVectorInsert(context, res, emit(de, ne, me), index, op.Size + 1); // Inserted at the next larger element size.
+            }
+
+            context.Copy(GetVecA32(op.Qd), res); // Write the assembled vector to the destination register.
+        }
+
         public static void EmitVectorTernaryOpI32(ArmEmitterContext context, Func3I emit, bool signed)
         {
             OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
@@ -330,7 +385,7 @@ namespace ARMeilleure.Instructions
         {
             OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
 
-            Operand m = EmitVectorExtract32(context, op.Vm >> (4 - op.Size), op.Vm & ((1 << (4 - op.Size)) - 1), op.Size, signed);
+            Operand m = ExtractElement(context, op.Vm, op.Size, signed);
 
             Operand res = GetVecA32(op.Qd);
 
@@ -340,7 +395,37 @@ namespace ARMeilleure.Instructions
             {
                 Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed);
 
-                res = EmitVectorInsert(context, res, emit(ne, m), op.In + index, op.Size);
+                res = EmitVectorInsert(context, res, emit(ne, m), op.Id + index, op.Size);
+            }
+
+            context.Copy(GetVecA32(op.Qd), res);
+        }
+
+        public static void EmitVectorByScalarLongOpI32(ArmEmitterContext context, Func2I emit, bool signed)
+        {
+            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
+
+            Operand m = ExtractElement(context, op.Vm, op.Size, signed);
+
+            if (op.Size == 2)
+            {
+                m = signed ? context.SignExtend32(OperandType.I64, m) : context.ZeroExtend32(OperandType.I64, m);
+            }
+
+            Operand res = context.VectorZero();
+
+            int elems = op.GetBytesCount() >> op.Size;
+
+            for (int index = 0; index < elems; index++)
+            {
+                Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed);
+
+                if (op.Size == 2)
+                {
+                    ne = signed ? context.SignExtend32(OperandType.I64, ne) : context.ZeroExtend32(OperandType.I64, ne);
+                }
+
+                res = EmitVectorInsert(context, res, emit(ne, m), index, op.Size + 1);
             }
 
             context.Copy(GetVecA32(op.Qd), res);
@@ -454,7 +539,7 @@ namespace ARMeilleure.Instructions
 
         // Narrow
 
-        public static void EmitVectorUnaryNarrowOp32(ArmEmitterContext context, Func1I emit)
+        public static void EmitVectorUnaryNarrowOp32(ArmEmitterContext context, Func1I emit, bool signed = false)
         {
             OpCode32Simd op = (OpCode32Simd)context.CurrOp;
 
@@ -465,7 +550,7 @@ namespace ARMeilleure.Instructions
 
             for (int index = 0; index < elems; index++)
             {
-                Operand m = EmitVectorExtract32(context, op.Qm, index, op.Size + 1, false);
+                Operand m = EmitVectorExtract32(context, op.Qm, index, op.Size + 1, signed);
 
                 res = EmitVectorInsert(context, res, emit(m), id + index, op.Size);
             }

+ 180 - 2
ARMeilleure/Instructions/InstEmitSimdShift32.cs

@@ -1,5 +1,6 @@
 using ARMeilleure.Decoders;
 using ARMeilleure.IntermediateRepresentation;
+using ARMeilleure.State;
 using ARMeilleure.Translation;
 using System;
 using System.Diagnostics;
@@ -11,6 +12,78 @@ namespace ARMeilleure.Instructions
 {
     static partial class InstEmit32
     {
+        // VQRSHRN: rounding shift right by immediate with saturating narrow. The U bit
+        // of the opcode selects the unsigned (Zx -> Zx) or the signed (Sx -> Sx) form.
+        public static void Vqrshrn(ArmEmitterContext context)
+        {
+            bool isUnsigned = ((OpCode32SimdShImm)context.CurrOp).U;
+
+            ShrImmSaturatingNarrowFlags flags = ShrImmSaturatingNarrowFlags.VectorSxSx;
+
+            if (isUnsigned)
+            {
+                flags = ShrImmSaturatingNarrowFlags.VectorZxZx;
+            }
+
+            EmitRoundShrImmSaturatingNarrowOp(context, flags);
+        }
+
+        // VQRSHRUN: rounding shift right by immediate with saturating narrow, reading
+        // the source as signed and saturating to an unsigned result (VectorSxZx).
+        public static void Vqrshrun(ArmEmitterContext context)
+        {
+            const ShrImmSaturatingNarrowFlags signedToUnsigned = ShrImmSaturatingNarrowFlags.VectorSxZx;
+
+            EmitRoundShrImmSaturatingNarrowOp(context, signedToUnsigned);
+        }
+
+        // VRSHR: rounding shift right by immediate. The rounding constant
+        // 1 << (shift - 1) is added to every element before it is shifted right by
+        // `shift`. op.U selects the unsigned form (zero-extended elements, logical
+        // shift) or the signed form (sign-extended elements, arithmetic shift).
+        public static void Vrshr(ArmEmitterContext context)
+        {
+            OpCode32SimdShImm op = (OpCode32SimdShImm)context.CurrOp;
+            int shift = GetImmShr(op);
+            long roundConst = 1L << (shift - 1);
+
+            if (op.U)
+            {
+                if (op.Size < 2)
+                {
+                    // Sizes 0 and 1: round and shift directly on the extracted operand.
+                    EmitVectorUnaryOpZx32(context, (op1) =>
+                    {
+                        op1 = context.Add(op1, Const(op1.Type, roundConst));
+
+                        return context.ShiftRightUI(op1, Const(shift));
+                    });
+                }
+                else if (op.Size == 2)
+                {
+                    // Size 2: widen to 64 bits so the rounding add cannot wrap, then
+                    // narrow the shifted value back to 32 bits.
+                    EmitVectorUnaryOpZx32(context, (op1) =>
+                    {
+                        op1 = context.ZeroExtend32(OperandType.I64, op1);
+                        op1 = context.Add(op1, Const(op1.Type, roundConst));
+
+                        return context.ConvertI64ToI32(context.ShiftRightUI(op1, Const(shift)));
+                    });
+                }
+                else /* if (op.Size == 3) */
+                {
+                    // Size 3: the rounding shift is delegated to EmitShrImm64.
+                    EmitVectorUnaryOpZx32(context, (op1) => EmitShrImm64(context, op1, signed: false, roundConst, shift));
+                }
+            }
+            else
+            {
+                if (op.Size < 2)
+                {
+                    // Sizes 0 and 1: round and shift directly on the extracted operand.
+                    EmitVectorUnaryOpSx32(context, (op1) =>
+                    {
+                        op1 = context.Add(op1, Const(op1.Type, roundConst));
+
+                        return context.ShiftRightSI(op1, Const(shift));
+                    });
+                }
+                else if (op.Size == 2)
+                {
+                    // Size 2: widen to 64 bits so the rounding add cannot wrap, then
+                    // narrow the shifted value back to 32 bits.
+                    EmitVectorUnaryOpSx32(context, (op1) =>
+                    {
+                        op1 = context.SignExtend32(OperandType.I64, op1);
+                        op1 = context.Add(op1, Const(op1.Type, roundConst));
+
+                        return context.ConvertI64ToI32(context.ShiftRightSI(op1, Const(shift)));
+                    });
+                }
+                else /* if (op.Size == 3) */
+                {
+                    // Size 3: the rounding shift is delegated to EmitShrImm64. The signed
+                    // wrapper is used here so that every signed branch goes through the
+                    // same Sx path (presumably no extension applies to 64-bit elements,
+                    // so the result should be unchanged - confirm).
+                    EmitVectorUnaryOpSx32(context, (op1) => EmitShrImm64(context, op1, signed: true, roundConst, shift));
+                }
+            }
+        }
+
         public static void Vshl(ArmEmitterContext context)
         {
             OpCode32SimdShImm op = (OpCode32SimdShImm)context.CurrOp;
@@ -35,7 +108,7 @@ namespace ARMeilleure.Instructions
         public static void Vshr(ArmEmitterContext context)
         {
             OpCode32SimdShImm op = (OpCode32SimdShImm)context.CurrOp;
-            int shift = (8 << op.Size) - op.Shift; // Shr amount is flipped.
+            int shift = GetImmShr(op);
             int maxShift = (8 << op.Size) - 1;
 
             if (op.U)
@@ -51,7 +124,7 @@ namespace ARMeilleure.Instructions
         public static void Vshrn(ArmEmitterContext context)
         {
             OpCode32SimdShImm op = (OpCode32SimdShImm)context.CurrOp;
-            int shift = (8 << op.Size) - op.Shift; // Shr amount is flipped.
+            int shift = GetImmShr(op);
 
             EmitVectorUnaryNarrowOp32(context, (op1) => context.ShiftRightUI(op1, Const(shift)));
         }
@@ -96,5 +169,110 @@ namespace ARMeilleure.Instructions
                 return context.ConditionalSelect(isOutOfRange0, Const(op.Type, 0), context.ConditionalSelect(isOutOfRangeN, min, res));
             }
         }
+
+        // Option bits consumed by EmitShrImmSaturatingNarrowOp, plus named combinations
+        // for the scalar/vector and signed/unsigned source -> destination variants.
+        [Flags]
+        private enum ShrImmSaturatingNarrowFlags
+        {
+            Scalar = 1 << 0, // Scalar form requested.
+            SignedSrc = 1 << 1, // Source elements are treated as signed.
+            SignedDst = 1 << 2, // Result is saturated to the signed range.
+
+            Round = 1 << 3, // Add the rounding constant before shifting.
+
+            // Scalar variants: <source extension><destination extension>.
+            ScalarSxSx = Scalar | SignedSrc | SignedDst,
+            ScalarSxZx = Scalar | SignedSrc,
+            ScalarZxZx = Scalar,
+
+            // Vector variants: <source extension><destination extension>.
+            VectorSxSx = SignedSrc | SignedDst,
+            VectorSxZx = SignedSrc,
+            VectorZxZx = 0
+        }
+
+        // Rounding variant: forwards to EmitShrImmSaturatingNarrowOp with the Round
+        // flag added to the caller's flags.
+        private static void EmitRoundShrImmSaturatingNarrowOp(ArmEmitterContext context, ShrImmSaturatingNarrowFlags flags)
+        {
+            flags |= ShrImmSaturatingNarrowFlags.Round;
+
+            EmitShrImmSaturatingNarrowOp(context, flags);
+        }
+
+        // Emits a shift right by immediate (optionally rounding) followed by a
+        // saturating narrow, applied per element through EmitVectorUnaryNarrowOp32.
+        // flags: Scalar is not supported and throws NotImplementedException; SignedSrc
+        // selects an arithmetic instead of a logical shift; SignedDst selects the
+        // saturation range; Round adds 1 << (shift - 1) before shifting.
+        private static void EmitShrImmSaturatingNarrowOp(ArmEmitterContext context, ShrImmSaturatingNarrowFlags flags)
+        {
+            OpCode32SimdShImm op = (OpCode32SimdShImm)context.CurrOp;
+
+            bool scalar    = (flags & ShrImmSaturatingNarrowFlags.Scalar)    != 0;
+            bool signedSrc = (flags & ShrImmSaturatingNarrowFlags.SignedSrc) != 0;
+            bool signedDst = (flags & ShrImmSaturatingNarrowFlags.SignedDst) != 0;
+            bool round     = (flags & ShrImmSaturatingNarrowFlags.Round)     != 0;
+
+            if (scalar)
+            {
+                // TODO: Support scalar operation.
+                throw new NotImplementedException();
+            }
+
+            int shift = GetImmShr(op);
+            long roundConst = 1L << (shift - 1);
+
+            EmitVectorUnaryNarrowOp32(context, (op1) =>
+            {
+                // Inline path: shift (and round, if requested) directly on the operand.
+                // NOTE(review): Vrshr widens 32-bit elements to I64 before its rounding
+                // add; if op1 is likewise a 32-bit operand when op.Size == 1, the add
+                // below could wrap - confirm.
+                if (op.Size <= 1 || !round)
+                {
+                    if (round)
+                    {
+                        op1 = context.Add(op1, Const(op1.Type, roundConst));
+                    }
+
+                    op1 = signedSrc ? context.ShiftRightSI(op1, Const(shift)) : context.ShiftRightUI(op1, Const(shift));
+                }
+                else /* if (op.Size == 2 && round) */
+                {
+                    // Rounding with op.Size == 2 is delegated to the EmitShrImm64 helper.
+                    op1 = EmitShrImm64(context, op1, signedSrc, roundConst, shift); // shift <= 32
+                }
+
+                // Saturate to the narrowed element width (8 << op.Size bits).
+                return EmitSatQ(context, op1, 8 << op.Size, signedDst);
+            }, signedSrc);
+        }
+
+        // Decodes the right-shift amount of an immediate shift opcode.
+        private static int GetImmShr(OpCode32SimdShImm op)
+        {
+            int elementBits = 8 << op.Size;
+
+            // Shr amount is flipped: it is the element width minus the decoded Shift.
+            return elementBits - op.Shift;
+        }
+
+        // dst64 = (Int(src64, signed) + roundConst) >> shift;
+        // Emitted as a call to the matching SoftFallback routine, chosen by `signed`.
+        private static Operand EmitShrImm64(
+            ArmEmitterContext context,
+            Operand value,
+            bool signed,
+            long roundConst,
+            int shift)
+        {
+            Delegate fallback;
+
+            if (signed)
+            {
+                fallback = new _S64_S64_S64_S32(SoftFallback.SignedShrImm64);
+            }
+            else
+            {
+                fallback = new _U64_U64_S64_S32(SoftFallback.UnsignedShrImm64);
+            }
+
+            Operand roundOperand = Const(roundConst);
+            Operand shiftOperand = Const(shift);
+
+            return context.Call(fallback, value, roundOperand, shiftOperand);
+        }
+
+        // Clamps `value` to the range of an eSize-bit integer and returns the clamped
+        // operand. `signed` selects the signed range [-(2^(eSize-1)), 2^(eSize-1) - 1]
+        // or the unsigned range [0, 2^eSize - 1]. eSize must not exceed 32.
+        private static Operand EmitSatQ(ArmEmitterContext context, Operand value, int eSize, bool signed)
+        {
+            Debug.Assert(eSize <= 32);
+
+            long intMin = signed ? -(1L << (eSize - 1)) : 0;
+            long intMax = signed ? (1L << (eSize - 1)) - 1 : (1L << eSize) - 1;
+
+            // Both comparisons use the unclamped input value.
+            Operand gt = context.ICompareGreater(value, Const(value.Type, intMax));
+            Operand lt = context.ICompareLess(value, Const(value.Type, intMin));
+
+            value = context.ConditionalSelect(gt, Const(value.Type, intMax), value);
+            value = context.ConditionalSelect(lt, Const(value.Type, intMin), value);
+
+            // Placeholder for the saturation flag update: the branch skips to the label
+            // when no clamping happened, but nothing is emitted in between yet.
+            Operand lblNoSat = Label();
+
+            context.BranchIfFalse(lblNoSat, context.BitwiseOr(gt, lt));
+
+            // TODO: Set QC (to 1) on FPSCR here.
+
+            context.MarkLabel(lblNoSat);
+
+            return value;
+        }
     }
 }

+ 5 - 0
ARMeilleure/Instructions/InstName.cs

@@ -560,12 +560,14 @@ namespace ARMeilleure.Instructions
         Vminnm,
         Vmla,
         Vmls,
+        Vmlsl,
         Vmov,
         Vmovl,
         Vmovn,
         Vmrs,
         Vmsr,
         Vmul,
+        Vmull,
         Vmvn,
         Vneg,
         Vnmul,
@@ -573,8 +575,11 @@ namespace ARMeilleure.Instructions
         Vnmls,
         Vorr,
         Vpadd,
+        Vqrshrn,
+        Vqrshrun,
         Vrev,
         Vrint,
+        Vrshr,
         Vsel,
         Vshl,
         Vshr,

+ 73 - 1
Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs

@@ -256,7 +256,7 @@ namespace Ryujinx.Tests.Cpu
             {
                 opcode |= ((rm & 0xf) << 0)  | ((rm & 0x10) << 1);
                 opcode |= ((rd & 0xf) << 12) | ((rd & 0x10) << 18);
-            } 
+            }
             else
             {
                 opcode |= ((rm & 0x1e) >> 1)  | ((rm & 0x1) << 5);
@@ -284,6 +284,78 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn();
         }
 
+        // Assembles a VMLSL (integer) opcode from the parameters, runs it with random
+        // register contents and compares the resulting state against Unicorn.
+        [Test, Pairwise, Description("VMLSL.<type><size> <Vd>, <Vn>, <Vm>")]
+        public void Vmlsl_I([Values(0u)] uint rd,
+                            [Values(1u, 0u)] uint rn,
+                            [Values(2u, 0u)] uint rm,
+                            [Values(0u, 1u, 2u)] uint size,
+                            [Random(RndCnt)] ulong z,
+                            [Random(RndCnt)] ulong a,
+                            [Random(RndCnt)] ulong b,
+                            [Values] bool u)
+        {
+            // VMLSL.S8 Q0, D0, D0
+            uint opcode = 0xf2800a00u;
+
+            // Each register number is split into its low four bits and a fifth bit
+            // that is placed elsewhere in the opcode.
+            opcode |= (rm & 0xf) << 0;
+            opcode |= (rm & 0x10) << 1;
+            opcode |= (rd & 0xf) << 12;
+            opcode |= (rd & 0x10) << 18;
+            opcode |= (rn & 0xf) << 16;
+            opcode |= (rn & 0x10) << 3;
+
+            opcode |= size << 20;
+
+            // Bit 24 is set for the `u` variant.
+            opcode |= (u ? 1u : 0u) << 24;
+
+            V128 reg0 = MakeVectorE0E1(z, z);
+            V128 reg1 = MakeVectorE0E1(a, z);
+            V128 reg2 = MakeVectorE0E1(b, z);
+
+            SingleOpcode(opcode, v0: reg0, v1: reg1, v2: reg2);
+
+            CompareAgainstUnicorn();
+        }
+
+        // Assembles a VMULL (integer) opcode from the parameters, runs it with random
+        // register contents and compares the resulting state against Unicorn.
+        [Test, Pairwise, Description("VMULL.<size> <Vd>, <Vn>, <Vm>")]
+        public void Vmull_I([Values(0u)] uint rd,
+                            [Values(1u, 0u)] uint rn,
+                            [Values(2u, 0u)] uint rm,
+                            [Values(0u, 1u, 2u)] uint size,
+                            [Random(RndCnt)] ulong z,
+                            [Random(RndCnt)] ulong a,
+                            [Random(RndCnt)] ulong b,
+                            [Values] bool op,
+                            [Values] bool u)
+        {
+            // VMULL.S8 Q0, D0, D0
+            uint opcode = 0xf2800c00u;
+
+            // Each register number is split into its low four bits and a fifth bit
+            // that is placed elsewhere in the opcode.
+            opcode |= (rm & 0xf) << 0;
+            opcode |= (rm & 0x10) << 1;
+            opcode |= (rd & 0xf) << 12;
+            opcode |= (rd & 0x10) << 18;
+            opcode |= (rn & 0xf) << 16;
+            opcode |= (rn & 0x10) << 3;
+
+            if (op)
+            {
+                // The bit 9 variant is only exercised with size 0 and without the U bit.
+                opcode |= 1 << 9;
+                size = 0;
+                u = false;
+            }
+
+            opcode |= size << 20;
+
+            // Bit 24 is set for the `u` variant.
+            opcode |= (u ? 1u : 0u) << 24;
+
+            V128 reg0 = MakeVectorE0E1(z, z);
+            V128 reg1 = MakeVectorE0E1(a, z);
+            V128 reg2 = MakeVectorE0E1(b, z);
+
+            SingleOpcode(opcode, v0: reg0, v1: reg1, v2: reg2);
+
+            CompareAgainstUnicorn();
+        }
+
         [Test, Pairwise, Description("VSHL.<size> {<Vd>}, <Vm>, <Vn>")]
         public void Vshl([Values(0u)] uint rd,
                          [Values(1u, 0u)] uint rn,

+ 81 - 0
Ryujinx.Tests/Cpu/CpuTestSimdRegElem32.cs

@@ -0,0 +1,81 @@
+#define SimdRegElem32
+
+using ARMeilleure.State;
+using NUnit.Framework;
+using System;
+
+namespace Ryujinx.Tests.Cpu
+{
+    // Tests for AArch32 SIMD "by scalar element" instructions. Each test assembles an
+    // opcode, executes it with random register contents and compares the resulting
+    // state against Unicorn.
+    [Category("SimdRegElem32")]
+    public sealed class CpuTestSimdRegElem32 : CpuTest32
+    {
+#if SimdRegElem32
+        private const int RndCnt = 2;
+
+        [Test, Pairwise, Description("VMUL.<size> {<Vd>}, <Vn>, <Vm>[<index>]")]
+        public void Vmul_1I([Values(1u, 0u)] uint rd,
+                            [Values(1u, 0u)] uint rn,
+                            [Values(26u, 25u, 10u, 9u, 2u, 0u)] uint rm,
+                            [Values(1u, 2u)] uint size,
+                            [Random(RndCnt)] ulong z,
+                            [Random(RndCnt)] ulong a,
+                            [Random(RndCnt)] ulong b,
+                            [Values] bool q)
+        {
+            // VMUL.I16 D0, D0, D0[0], with the size field (bits 21:20) cleared.
+            uint opcode = 0xf2900840u & ~(3u << 20);
+
+            if (q)
+            {
+                // Quad form: set bit 24 and double the Vd/Vn register numbers.
+                opcode |= 1 << 24;
+                rn <<= 1;
+                rd <<= 1;
+            }
+
+            // Each register number is split into its low four bits and a fifth bit
+            // that is placed elsewhere in the opcode.
+            opcode |= (rm & 0xf) << 0;
+            opcode |= (rm & 0x10) << 1;
+            opcode |= (rd & 0xf) << 12;
+            opcode |= (rd & 0x10) << 18;
+            opcode |= (rn & 0xf) << 16;
+            opcode |= (rn & 0x10) << 3;
+
+            opcode |= size << 20;
+
+            V128 reg0 = MakeVectorE0E1(z, z);
+            V128 reg1 = MakeVectorE0E1(a, z);
+            V128 reg2 = MakeVectorE0E1(b, z);
+
+            SingleOpcode(opcode, v0: reg0, v1: reg1, v2: reg2);
+
+            CompareAgainstUnicorn();
+        }
+
+        [Test, Pairwise, Description("VMULL.<size> <Vd>, <Vn>, <Vm>[<index>]")]
+        public void Vmull_1([Values(2u, 0u)] uint rd,
+                            [Values(1u, 0u)] uint rn,
+                            [Values(26u, 25u, 10u, 9u, 2u, 0u)] uint rm,
+                            [Values(1u, 2u)] uint size,
+                            [Random(RndCnt)] ulong z,
+                            [Random(RndCnt)] ulong a,
+                            [Random(RndCnt)] ulong b,
+                            [Values] bool u)
+        {
+            // VMULL.S16 Q0, D0, D0[0], with the size field (bits 21:20) cleared.
+            uint opcode = 0xf2900a40u & ~(3u << 20);
+
+            // Each register number is split into its low four bits and a fifth bit
+            // that is placed elsewhere in the opcode.
+            opcode |= (rm & 0xf) << 0;
+            opcode |= (rm & 0x10) << 1;
+            opcode |= (rd & 0xf) << 12;
+            opcode |= (rd & 0x10) << 18;
+            opcode |= (rn & 0xf) << 16;
+            opcode |= (rn & 0x10) << 3;
+
+            opcode |= size << 20;
+
+            // Bit 24 is set for the `u` variant.
+            opcode |= (u ? 1u : 0u) << 24;
+
+            V128 reg0 = MakeVectorE0E1(z, z);
+            V128 reg1 = MakeVectorE0E1(a, z);
+            V128 reg2 = MakeVectorE0E1(b, z);
+
+            SingleOpcode(opcode, v0: reg0, v1: reg1, v2: reg2);
+
+            CompareAgainstUnicorn();
+        }
+#endif
+    }
+}

+ 80 - 14
Ryujinx.Tests/Cpu/CpuTestSimdShImm32.cs

@@ -11,17 +11,19 @@ namespace Ryujinx.Tests.Cpu
 #if SimdShImm32
         private const int RndCnt = 2;
 
-        [Test, Pairwise, Description("VSHL.<size> {<Vd>}, <Vm>, #<imm>")]
-        public void Vshl_Imm([Values(0u)] uint rd,
+        [Test, Pairwise]
+        public void Vrshr_Vshr_Imm([Values(0u)] uint rd,
                              [Values(2u, 0u)] uint rm,
                              [Values(0u, 1u, 2u, 3u)] uint size,
                              [Random(RndCnt), Values(0u)] uint shiftImm,
                              [Random(RndCnt)] ulong z,
                              [Random(RndCnt)] ulong a,
                              [Random(RndCnt)] ulong b,
-                             [Values] bool q)
+                             [Values] bool u,
+                             [Values] bool q,
+                             [Values] bool round)
         {
-            uint opcode = 0xf2800510u; // VORR.I32 D0, #0 (immediate value changes it into SHL)
+            uint opcode = 0xf2800010u; // VMOV.I32 D0, #0 (immediate value changes it into SHR)
             if (q)
             {
                 opcode |= 1 << 6;
@@ -29,6 +31,16 @@ namespace Ryujinx.Tests.Cpu
                 rd <<= 1;
             }
 
+            if (round)
+            {
+                opcode |= 1 << 9; // Turn into VRSHR
+            }
+
+            if (u)
+            {
+                opcode |= 1 << 24;
+            }
+
             uint imm = 1u << ((int)size + 3);
             imm |= shiftImm & (imm - 1);
 
@@ -45,18 +57,17 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn();
         }
 
-        [Test, Pairwise, Description("VSHR.<size> {<Vd>}, <Vm>, #<imm>")]
-        public void Vshr_Imm([Values(0u)] uint rd,
+        [Test, Pairwise, Description("VSHL.<size> {<Vd>}, <Vm>, #<imm>")]
+        public void Vshl_Imm([Values(0u)] uint rd,
                              [Values(2u, 0u)] uint rm,
                              [Values(0u, 1u, 2u, 3u)] uint size,
                              [Random(RndCnt), Values(0u)] uint shiftImm,
                              [Random(RndCnt)] ulong z,
                              [Random(RndCnt)] ulong a,
                              [Random(RndCnt)] ulong b,
-                             [Values] bool u,
                              [Values] bool q)
         {
-            uint opcode = 0xf2800010u; // VMOV.I32 D0, #0 (immediate value changes it into SHR)
+            uint opcode = 0xf2800510u; // VORR.I32 D0, #0 (immediate value changes it into SHL)
             if (q)
             {
                 opcode |= 1 << 6;
@@ -64,11 +75,6 @@ namespace Ryujinx.Tests.Cpu
                 rd <<= 1;
             }
 
-            if (u)
-            {
-                opcode |= 1 << 24;
-            }
-
             uint imm = 1u << ((int)size + 3);
             imm |= shiftImm & (imm - 1);
 
@@ -85,7 +91,7 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn();
         }
 
-        [Test, Pairwise, Description("VSHRN.<size> {<Vd>}, <Vm>, #<imm>")]
+        [Test, Pairwise, Description("VSHRN.<size> <Vd>, <Vm>, #<imm>")]
         public void Vshrn_Imm([Values(0u, 1u)] uint rd,
                               [Values(2u, 0u)] uint rm,
                               [Values(0u, 1u, 2u)] uint size,
@@ -111,6 +117,66 @@ namespace Ryujinx.Tests.Cpu
 
             CompareAgainstUnicorn();
         }
+
+        // Assembles a VQRSHRN opcode from the parameters, runs it with random register
+        // contents and compares the resulting state against Unicorn.
+        [Test, Pairwise, Description("VQRSHRN.<type><size> <Vd>, <Vm>, #<imm>")]
+        public void Vqrshrn_Imm([Values(0u, 1u)] uint rd,
+                                [Values(2u, 0u)] uint rm,
+                                [Values(0u, 1u, 2u)] uint size,
+                                [Random(RndCnt), Values(0u)] uint shiftImm,
+                                [Random(RndCnt)] ulong z,
+                                [Random(RndCnt)] ulong a,
+                                [Random(RndCnt)] ulong b,
+                                [Values] bool u)
+        {
+            // VORR.I16 Q0, #0 (immediate value changes it into QRSHRN)
+            uint opcode = 0xf2800950u;
+
+            // The immediate is a size marker bit with the shift bits stored below it.
+            uint sizeMarker = 1u << ((int)size + 3);
+            uint imm = sizeMarker | (shiftImm & (sizeMarker - 1));
+
+            opcode |= (rm & 0xf) << 0;
+            opcode |= (rm & 0x10) << 1;
+            opcode |= (rd & 0xf) << 12;
+            opcode |= (rd & 0x10) << 18;
+            opcode |= (imm & 0x3f) << 16;
+
+            // Bit 24 is set for the `u` variant.
+            opcode |= (u ? 1u : 0u) << 24;
+
+            V128 reg0 = MakeVectorE0E1(z, z);
+            V128 reg1 = MakeVectorE0E1(a, z);
+            V128 reg2 = MakeVectorE0E1(b, z);
+
+            SingleOpcode(opcode, v0: reg0, v1: reg1, v2: reg2);
+
+            CompareAgainstUnicorn();
+        }
+
+        // Assembles a VQRSHRUN opcode from the parameters, runs it with random register
+        // contents and compares the resulting state against Unicorn.
+        [Test, Pairwise, Description("VQRSHRUN.<type><size> <Vd>, <Vm>, #<imm>")]
+        public void Vqrshrun_Imm([Values(0u, 1u)] uint rd,
+                                 [Values(2u, 0u)] uint rm,
+                                 [Values(0u, 1u, 2u)] uint size,
+                                 [Random(RndCnt), Values(0u)] uint shiftImm,
+                                 [Random(RndCnt)] ulong z,
+                                 [Random(RndCnt)] ulong a,
+                                 [Random(RndCnt)] ulong b)
+        {
+            // VMOV.I16 Q0, #0x80 (immediate value changes it into QRSHRUN)
+            uint opcode = 0xf3800850u;
+
+            // The immediate is a size marker bit with the shift bits stored below it.
+            uint sizeMarker = 1u << ((int)size + 3);
+            uint imm = sizeMarker | (shiftImm & (sizeMarker - 1));
+
+            opcode |= (rm & 0xf) << 0;
+            opcode |= (rm & 0x10) << 1;
+            opcode |= (rd & 0xf) << 12;
+            opcode |= (rd & 0x10) << 18;
+            opcode |= (imm & 0x3f) << 16;
+
+            V128 reg0 = MakeVectorE0E1(z, z);
+            V128 reg1 = MakeVectorE0E1(a, z);
+            V128 reg2 = MakeVectorE0E1(b, z);
+
+            SingleOpcode(opcode, v0: reg0, v1: reg1, v2: reg2);
+
+            CompareAgainstUnicorn();
+        }
 #endif
     }
 }