7 سال پیش · e603b7afbc
--- a/ChocolArm64/Instructions/InstEmitAluHelper.cs
+++ b/ChocolArm64/Instructions/InstEmitAluHelper.cs
@@ -190,23 +190,32 @@ namespace ChocolArm64.Instructions
 
				             }
			
 
				         }
			
 
				 
			
 
				// NOTE(review): EmitSetNzcv stores the four condition flags from a packed value whose
				// bit 0 is V, bit 1 is C, bit 2 is Z and bit 3 is N. The removed overload received that
				// value as a compile-time int and emitted each bit as a constant. The added overload has
				// no nzcv parameter: its first emitted opcode is Dup, so the packed value must already be
				// on the IL evaluation stack when it is called.
				-        public static void EmitSetNzcv(ILEmitterCtx context, int nzcv)
			
 
				+        public static void EmitSetNzcv(ILEmitterCtx context)
			
 
				         {
			
 
				-            context.EmitLdc_I4((nzcv >> 0) & 1);
			
 
				-
			
 
				             // Bit 0 -> V. Dup keeps one copy of the packed value for the following flags.
				+            context.Emit(OpCodes.Dup);
			
 
				+            context.Emit(OpCodes.Ldc_I4_1);
			
 
				+            context.Emit(OpCodes.And);
			
 
				             context.EmitStflg((int)PState.VBit);
			
 
				 
			
 
				-            context.EmitLdc_I4((nzcv >> 1) & 1);
			
 
				-
			
 
				             // Shift the remaining copy right by one so the next flag sits in bit 0 (-> C).
				+            context.Emit(OpCodes.Ldc_I4_1);
			
 
				+            context.Emit(OpCodes.Shr);
			
 
				+            context.Emit(OpCodes.Dup);
			
 
				+            context.Emit(OpCodes.Ldc_I4_1);
			
 
				+            context.Emit(OpCodes.And);
			
 
				             context.EmitStflg((int)PState.CBit);
			
 
				 
			
 
				-            context.EmitLdc_I4((nzcv >> 2) & 1);
			
 
				-
			
 
				             // Same shift/mask step again for Z.
				+            context.Emit(OpCodes.Ldc_I4_1);
			
 
				+            context.Emit(OpCodes.Shr);
			
 
				+            context.Emit(OpCodes.Dup);
			
 
				+            context.Emit(OpCodes.Ldc_I4_1);
			
 
				+            context.Emit(OpCodes.And);
			
 
				             context.EmitStflg((int)PState.ZBit);
			
 
				 
			
 
				-            context.EmitLdc_I4((nzcv >> 3) & 1);
			
 
				-
			
 
				             // Last flag (N): no Dup here, so the remaining copy is consumed by And.
				+            context.Emit(OpCodes.Ldc_I4_1);
			
 
				+            context.Emit(OpCodes.Shr);
			
 
				+            context.Emit(OpCodes.Ldc_I4_1);
			
 
				+            context.Emit(OpCodes.And);
			
 
				             context.EmitStflg((int)PState.NBit);
			
 
				         }
			
 
				     }
			
 
				-}
			
 
				+}
			
--- a/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs
+++ b/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs
@@ -186,18 +186,101 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				         // NOTE(review): Scalar FABS. This hunk adds an SSE path taken when Optimizations.UseSse2
				         // is set: a vector with -0 in its scalar lane is built (SetScalarVector128) and combined
				         // with Rn through AndNot. AndNot computes (~mask) & value, and -0 has only the sign bit
				         // set, so the sign bit of the low lane is cleared. When UseSse2 is not set, the previous
				         // Math.Abs-based emitter is kept as the fallback.
				         public static void Fabs_S(ILEmitterCtx context)
			
 
				         {
			
 
				-            EmitScalarUnaryOpF(context, () =>
			
 
				+            if (Optimizations.UseSse2)
			
 
				             {
			
 
				-                EmitUnaryMathCall(context, nameof(Math.Abs));
			
 
				-            });
			
 
				+                OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
			
 
				+
			
 
				                 // op.Size == 0 takes the float path; any other value takes the double path.
				+                if (op.Size == 0)
			
 
				+                {
			
 
				+                    Type[] typesSsv    = new Type[] { typeof(float) };
			
 
				+                    Type[] typesAndNot = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
			
 
				+
			
 
				+                    context.EmitLdc_R4(-0f);
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SetScalarVector128), typesSsv));
			
 
				+
			
 
				+                    context.EmitLdvec(op.Rn);
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.AndNot), typesAndNot));
			
 
				+
			
 
				+                    context.EmitStvec(op.Rd);
			
 
				+
			
 
				                     // Helper is not visible here; presumably clears Rd above its low 32 bits - confirm.
				+                    EmitVectorZero32_128(context, op.Rd);
			
 
				+                }
			
 
				+                else /* if (op.Size == 1) */
			
 
				+                {
			
 
				+                    Type[] typesSsv    = new Type[] { typeof(double) };
			
 
				+                    Type[] typesAndNot = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
			
 
				+
			
 
				+                    context.EmitLdc_R8(-0d);
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetScalarVector128), typesSsv));
			
 
				+
			
 
				+                    EmitLdvecWithCastToDouble(context, op.Rn);
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndNot));
			
 
				+
			
 
				+                    EmitStvecWithCastFromDouble(context, op.Rd);
			
 
				+
			
 
				                     // Helper is not visible here; presumably clears the upper half of Rd - confirm.
				+                    EmitVectorZeroUpper(context, op.Rd);
			
 
				+                }
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                EmitScalarUnaryOpF(context, () =>
			
 
				+                {
			
 
				+                    EmitUnaryMathCall(context, nameof(Math.Abs));
			
 
				+                });
			
 
				+            }
			
 
				         }
			
 
				 
			
 
				         // NOTE(review): Vector FABS. The added SSE path mirrors Fabs_S but builds the mask with
				         // SetAllVector128, so every lane holds -0 and AndNot ((~mask) & value) clears the sign
				         // bit of every element of Rn. When UseSse2 is not set, the previous Math.Abs-based
				         // emitter is kept as the fallback.
				         public static void Fabs_V(ILEmitterCtx context)
			
 
				         {
			
 
				-            EmitVectorUnaryOpF(context, () =>
			
 
				+            if (Optimizations.UseSse2)
			
 
				             {
			
 
				-                EmitUnaryMathCall(context, nameof(Math.Abs));
			
 
				-            });
			
 
				+                OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
			
 
				+
			
 
				                 // Only the low bit of op.Size selects between the float (0) and double (1) paths.
				+                int sizeF = op.Size & 1;
			
 
				+
			
 
				+                if (sizeF == 0)
			
 
				+                {
			
 
				+                    Type[] typesSav    = new Type[] { typeof(float) };
			
 
				+                    Type[] typesAndNot = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
			
 
				+
			
 
				+                    context.EmitLdc_R4(-0f);
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SetAllVector128), typesSav));
			
 
				+
			
 
				+                    context.EmitLdvec(op.Rn);
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.AndNot), typesAndNot));
			
 
				+
			
 
				+                    context.EmitStvec(op.Rd);
			
 
				+
			
 
				                     // Extra step only for the 64-bit register form; helper presumably clears the upper half - confirm.
				+                    if (op.RegisterSize == RegisterSize.Simd64)
			
 
				+                    {
			
 
				+                        EmitVectorZeroUpper(context, op.Rd);
			
 
				+                    }
			
 
				+                }
			
 
				+                else /* if (sizeF == 1) */
			
 
				+                {
			
 
				+                    Type[] typesSav    = new Type[] { typeof(double) };
			
 
				+                    Type[] typesAndNot = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
			
 
				+
			
 
				+                    context.EmitLdc_R8(-0d);
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
			
 
				+
			
 
				+                    EmitLdvecWithCastToDouble(context, op.Rn);
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndNot));
			
 
				+
			
 
				+                    EmitStvecWithCastFromDouble(context, op.Rd);
			
 
				+                }
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                EmitVectorUnaryOpF(context, () =>
			
 
				+                {
			
 
				+                    EmitUnaryMathCall(context, nameof(Math.Abs));
			
 
				+                });
			
 
				+            }
			
 
				         }
			
 
				 
			
 
				         public static void Fadd_S(ILEmitterCtx context)
			
@@ -283,7 +366,7 @@ namespace ChocolArm64.Instructions
 
				             }
			
 
				         }
			
 
				 
			
 
				-        public static void Fmadd_S(ILEmitterCtx context)
			
 
				+        public static void Fmadd_S(ILEmitterCtx context) // Fused.
			
 
				         {
			
 
				             if (Optimizations.FastFP && Optimizations.UseSse2)
			
 
				             {
			
@@ -450,22 +533,118 @@ namespace ChocolArm64.Instructions
 
				             });
			
 
				         }
			
 
				 
			
 
				// NOTE(review): Vector FMLA (Rd = Rd + Rn * Rm). The removed body emitted plain IL Mul/Add
				// through EmitVectorTernaryOpF. The added body has two paths:
				//   - FastFP && UseSse2: pushes Rd, Rn, Rm, then calls Multiply (Rn * Rm) and Add
				//     (Rd + product) as two separate SSE calls, and stores the result to Rd.
				//   - otherwise: EmitVectorTernaryOpF with the SoftFloat FPMulAdd routine.
				// The "// Fused." tag on the signature presumably describes the guest instruction; the fast
				// path itself is a separate multiply followed by an add - confirm the intent.
				-        public static void Fmla_V(ILEmitterCtx context)
			
 
				+        public static void Fmla_V(ILEmitterCtx context) // Fused.
			
 
				         {
			
 
				-            EmitVectorTernaryOpF(context, () =>
			
 
				+            if (Optimizations.FastFP && Optimizations.UseSse2)
			
 
				             {
			
 
				-                context.Emit(OpCodes.Mul);
			
 
				-                context.Emit(OpCodes.Add);
			
 
				-            });
			
 
				+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
			
 
				+
			
 
				                 // Only the low bit of op.Size selects between the float (0) and double (1) paths.
				+                int sizeF = op.Size & 1;
			
 
				+
			
 
				+                if (sizeF == 0)
			
 
				+                {
			
 
				+                    Type[] typesMulAdd = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
			
 
				+
			
 
				                     // Push order matters: Multiply consumes Rn and Rm, then Add consumes Rd and the product.
				+                    context.EmitLdvec(op.Rd);
			
 
				+                    context.EmitLdvec(op.Rn);
			
 
				+                    context.EmitLdvec(op.Rm);
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Multiply), typesMulAdd));
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Add),      typesMulAdd));
			
 
				+
			
 
				+                    context.EmitStvec(op.Rd);
			
 
				+
			
 
				                     // Extra step only for the 64-bit register form; helper presumably clears the upper half - confirm.
				+                    if (op.RegisterSize == RegisterSize.Simd64)
			
 
				+                    {
			
 
				+                        EmitVectorZeroUpper(context, op.Rd);
			
 
				+                    }
			
 
				+                }
			
 
				+                else /* if (sizeF == 1) */
			
 
				+                {
			
 
				+                    Type[] typesMulAdd = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
			
 
				+
			
 
				+                    EmitLdvecWithCastToDouble(context, op.Rd);
			
 
				+                    EmitLdvecWithCastToDouble(context, op.Rn);
			
 
				+                    EmitLdvecWithCastToDouble(context, op.Rm);
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Multiply), typesMulAdd));
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add),      typesMulAdd));
			
 
				+
			
 
				+                    EmitStvecWithCastFromDouble(context, op.Rd);
			
 
				+                }
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                EmitVectorTernaryOpF(context, () =>
			
 
				+                {
			
 
				+                    EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd));
			
 
				+                });
			
 
				+            }
			
 
				         }
			
 
				 
			
 
				// NOTE(review): Vector-by-element FMLA (Rd = Rd + Rn * Rm[Index]). The removed body emitted
				// plain IL Mul/Add through EmitVectorTernaryOpByElemF. The added body has two paths:
				//   - FastFP && UseSse2: Rm is pushed twice (Dup) and passed to Shuffle with an immediate
				//     that repeats op.Index in every selector field, so the selected element of Rm fills
				//     all lanes; the result is then multiplied by Rn and added to Rd with separate
				//     Multiply and Add calls.
				//   - otherwise: EmitVectorTernaryOpByElemF with the SoftFloat FPMulAdd routine.
				-        public static void Fmla_Ve(ILEmitterCtx context)
			
 
				+        public static void Fmla_Ve(ILEmitterCtx context) // Fused.
			
 
				         {
			
 
				-            EmitVectorTernaryOpByElemF(context, () =>
			
 
				+            if (Optimizations.FastFP && Optimizations.UseSse2)
			
 
				             {
			
 
				-                context.Emit(OpCodes.Mul);
			
 
				-                context.Emit(OpCodes.Add);
			
 
				-            });
			
 
				+                OpCodeSimdRegElemF64 op = (OpCodeSimdRegElemF64)context.CurrOp;
			
 
				+
			
 
				                 // Only the low bit of op.Size selects between the float (0) and double (1) paths.
				+                int sizeF = op.Size & 1;
			
 
				+
			
 
				+                if (sizeF == 0)
			
 
				+                {
			
 
				+                    Type[] typesSfl    = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>), typeof(byte) };
			
 
				+                    Type[] typesMulAdd = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
			
 
				+
			
 
				+                    context.EmitLdvec(op.Rd);
			
 
				+
			
 
				+                    context.EmitLdvec(op.Rn);
			
 
				+
			
 
				+                    context.EmitLdvec(op.Rm);
			
 
				+                    context.Emit(OpCodes.Dup);
			
 
				+
			
 
				                     // Four 2-bit selector fields, each set to op.Index.
				+                    context.EmitLdc_I4(op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6);
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Shuffle), typesSfl));
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Multiply), typesMulAdd));
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Add), typesMulAdd));
			
 
				+
			
 
				+                    context.EmitStvec(op.Rd);
			
 
				+
			
 
				                     // Extra step only for the 64-bit register form; helper presumably clears the upper half - confirm.
				+                    if (op.RegisterSize == RegisterSize.Simd64)
			
 
				+                    {
			
 
				+                        EmitVectorZeroUpper(context, op.Rd);
			
 
				+                    }
			
 
				+                }
			
 
				+                else /* if (sizeF == 1) */
			
 
				+                {
			
 
				+                    Type[] typesSfl    = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>), typeof(byte) };
			
 
				+                    Type[] typesMulAdd = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
			
 
				+
			
 
				+                    EmitLdvecWithCastToDouble(context, op.Rd);
			
 
				+
			
 
				+                    EmitLdvecWithCastToDouble(context, op.Rn);
			
 
				+
			
 
				+                    EmitLdvecWithCastToDouble(context, op.Rm);
			
 
				+                    context.Emit(OpCodes.Dup);
			
 
				+
			
 
				                     // Two 1-bit selector fields, each set to op.Index.
				+                    context.EmitLdc_I4(op.Index | op.Index << 1);
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Shuffle), typesSfl));
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Multiply), typesMulAdd));
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesMulAdd));
			
 
				+
			
 
				+                    EmitStvecWithCastFromDouble(context, op.Rd);
			
 
				+                }
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                EmitVectorTernaryOpByElemF(context, () =>
			
 
				+                {
			
 
				+                    EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd));
			
 
				+                });
			
 
				+            }
			
 
				         }
			
 
				 
			
 
				         public static void Fmls_Se(ILEmitterCtx context)
			
@@ -477,25 +656,121 @@ namespace ChocolArm64.Instructions
 
				             });
			
 
				         }
			
 
				 
			
 
				// NOTE(review): Vector FMLS (Rd = Rd - Rn * Rm). The removed body emitted plain IL Mul/Sub
				// through EmitVectorTernaryOpF. The added body has two paths:
				//   - FastFP && UseSse2: pushes Rd, Rn, Rm, then calls Multiply (Rn * Rm) and Subtract
				//     (Rd - product) as two separate SSE calls, and stores the result to Rd.
				//   - otherwise: EmitVectorTernaryOpF with the SoftFloat FPMulSub routine.
				-        public static void Fmls_V(ILEmitterCtx context)
			
 
				+        public static void Fmls_V(ILEmitterCtx context) // Fused.
			
 
				         {
			
 
				-            EmitVectorTernaryOpF(context, () =>
			
 
				+            if (Optimizations.FastFP && Optimizations.UseSse2)
			
 
				             {
			
 
				-                context.Emit(OpCodes.Mul);
			
 
				-                context.Emit(OpCodes.Sub);
			
 
				-            });
			
 
				+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
			
 
				+
			
 
				                 // Only the low bit of op.Size selects between the float (0) and double (1) paths.
				+                int sizeF = op.Size & 1;
			
 
				+
			
 
				+                if (sizeF == 0)
			
 
				+                {
			
 
				+                    Type[] typesMulSub = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
			
 
				+
			
 
				                     // Push order matters: Multiply consumes Rn and Rm, then Subtract computes Rd minus the product.
				+                    context.EmitLdvec(op.Rd);
			
 
				+                    context.EmitLdvec(op.Rn);
			
 
				+                    context.EmitLdvec(op.Rm);
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Multiply), typesMulSub));
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Subtract), typesMulSub));
			
 
				+
			
 
				+                    context.EmitStvec(op.Rd);
			
 
				+
			
 
				                     // Extra step only for the 64-bit register form; helper presumably clears the upper half - confirm.
				+                    if (op.RegisterSize == RegisterSize.Simd64)
			
 
				+                    {
			
 
				+                        EmitVectorZeroUpper(context, op.Rd);
			
 
				+                    }
			
 
				+                }
			
 
				+                else /* if (sizeF == 1) */
			
 
				+                {
			
 
				+                    Type[] typesMulSub = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
			
 
				+
			
 
				+                    EmitLdvecWithCastToDouble(context, op.Rd);
			
 
				+                    EmitLdvecWithCastToDouble(context, op.Rn);
			
 
				+                    EmitLdvecWithCastToDouble(context, op.Rm);
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Multiply), typesMulSub));
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesMulSub));
			
 
				+
			
 
				+                    EmitStvecWithCastFromDouble(context, op.Rd);
			
 
				+                }
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                EmitVectorTernaryOpF(context, () =>
			
 
				+                {
			
 
				+                    EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulSub));
			
 
				+                });
			
 
				+            }
			
 
				         }
			
 
				 
			
 
				// NOTE(review): Vector-by-element FMLS (Rd = Rd - Rn * Rm[Index]). The removed body emitted
				// plain IL Mul/Sub through EmitVectorTernaryOpByElemF. The added body has two paths:
				//   - FastFP && UseSse2: Rm is pushed twice (Dup) and passed to Shuffle with an immediate
				//     that repeats op.Index in every selector field, so the selected element of Rm fills
				//     all lanes; the result is multiplied by Rn and the product is subtracted from Rd
				//     with separate Multiply and Subtract calls.
				//   - otherwise: EmitVectorTernaryOpByElemF with the SoftFloat FPMulSub routine.
				-        public static void Fmls_Ve(ILEmitterCtx context)
			
 
				+        public static void Fmls_Ve(ILEmitterCtx context) // Fused.
			
 
				         {
			
 
				-            EmitVectorTernaryOpByElemF(context, () =>
			
 
				+            if (Optimizations.FastFP && Optimizations.UseSse2)
			
 
				             {
			
 
				-                context.Emit(OpCodes.Mul);
			
 
				-                context.Emit(OpCodes.Sub);
			
 
				-            });
			
 
				+                OpCodeSimdRegElemF64 op = (OpCodeSimdRegElemF64)context.CurrOp;
			
 
				+
			
 
				                 // Only the low bit of op.Size selects between the float (0) and double (1) paths.
				+                int sizeF = op.Size & 1;
			
 
				+
			
 
				+                if (sizeF == 0)
			
 
				+                {
			
 
				+                    Type[] typesSfl    = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>), typeof(byte) };
			
 
				+                    Type[] typesMulSub = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
			
 
				+
			
 
				+                    context.EmitLdvec(op.Rd);
			
 
				+
			
 
				+                    context.EmitLdvec(op.Rn);
			
 
				+
			
 
				+                    context.EmitLdvec(op.Rm);
			
 
				+                    context.Emit(OpCodes.Dup);
			
 
				+
			
 
				                     // Four 2-bit selector fields, each set to op.Index.
				+                    context.EmitLdc_I4(op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6);
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Shuffle), typesSfl));
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Multiply), typesMulSub));
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Subtract), typesMulSub));
			
 
				+
			
 
				+                    context.EmitStvec(op.Rd);
			
 
				+
			
 
				                     // Extra step only for the 64-bit register form; helper presumably clears the upper half - confirm.
				+                    if (op.RegisterSize == RegisterSize.Simd64)
			
 
				+                    {
			
 
				+                        EmitVectorZeroUpper(context, op.Rd);
			
 
				+                    }
			
 
				+                }
			
 
				+                else /* if (sizeF == 1) */
			
 
				+                {
			
 
				+                    Type[] typesSfl    = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>), typeof(byte) };
			
 
				+                    Type[] typesMulSub = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
			
 
				+
			
 
				+                    EmitLdvecWithCastToDouble(context, op.Rd);
			
 
				+
			
 
				+                    EmitLdvecWithCastToDouble(context, op.Rn);
			
 
				+
			
 
				+                    EmitLdvecWithCastToDouble(context, op.Rm);
			
 
				+                    context.Emit(OpCodes.Dup);
			
 
				+
			
 
				                     // Two 1-bit selector fields, each set to op.Index.
				+                    context.EmitLdc_I4(op.Index | op.Index << 1);
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Shuffle), typesSfl));
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Multiply), typesMulSub));
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesMulSub));
			
 
				+
			
 
				+                    EmitStvecWithCastFromDouble(context, op.Rd);
			
 
				+                }
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                EmitVectorTernaryOpByElemF(context, () =>
			
 
				+                {
			
 
				+                    EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulSub));
			
 
				+                });
			
 
				+            }
			
 
				         }
			
 
				 
			
 
				-        public static void Fmsub_S(ILEmitterCtx context)
			
 
				+        public static void Fmsub_S(ILEmitterCtx context) // Fused.
			
 
				         {
			
 
				             if (Optimizations.FastFP && Optimizations.UseSse2)
			
 
				             {
			
@@ -580,7 +855,59 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				         // NOTE(review): Vector-by-element FMUL (Rd = Rn * Rm[Index]). The removed one-liner emitted
				         // a plain IL Mul through EmitVectorBinaryOpByElemF. The added body has two paths:
				         //   - FastFP && UseSse2: Rm is pushed twice (Dup) and passed to Shuffle with an immediate
				         //     that repeats op.Index in every selector field, so the selected element of Rm fills
				         //     all lanes; the result is multiplied by Rn and stored to Rd.
				         //   - otherwise: EmitVectorBinaryOpByElemF with the SoftFloat FPMul routine.
				         public static void Fmul_Ve(ILEmitterCtx context)
			
 
				         {
			
 
				-            EmitVectorBinaryOpByElemF(context, () => context.Emit(OpCodes.Mul));
			
 
				+            if (Optimizations.FastFP && Optimizations.UseSse2)
			
 
				+            {
			
 
				+                OpCodeSimdRegElemF64 op = (OpCodeSimdRegElemF64)context.CurrOp;
			
 
				+
			
 
				                 // Only the low bit of op.Size selects between the float (0) and double (1) paths.
				+                int sizeF = op.Size & 1;
			
 
				+
			
 
				+                if (sizeF == 0)
			
 
				+                {
			
 
				+                    Type[] typesSfl = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>), typeof(byte) };
			
 
				+                    Type[] typesMul = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
			
 
				+
			
 
				+                    context.EmitLdvec(op.Rn);
			
 
				+
			
 
				+                    context.EmitLdvec(op.Rm);
			
 
				+                    context.Emit(OpCodes.Dup);
			
 
				+
			
 
				                     // Four 2-bit selector fields, each set to op.Index.
				+                    context.EmitLdc_I4(op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6);
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Shuffle), typesSfl));
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Multiply), typesMul));
			
 
				+
			
 
				+                    context.EmitStvec(op.Rd);
			
 
				+
			
 
				                     // Extra step only for the 64-bit register form; helper presumably clears the upper half - confirm.
				+                    if (op.RegisterSize == RegisterSize.Simd64)
			
 
				+                    {
			
 
				+                        EmitVectorZeroUpper(context, op.Rd);
			
 
				+                    }
			
 
				+                }
			
 
				+                else /* if (sizeF == 1) */
			
 
				+                {
			
 
				+                    Type[] typesSfl = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>), typeof(byte) };
			
 
				+                    Type[] typesMul = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
			
 
				+
			
 
				+                    EmitLdvecWithCastToDouble(context, op.Rn);
			
 
				+
			
 
				+                    EmitLdvecWithCastToDouble(context, op.Rm);
			
 
				+                    context.Emit(OpCodes.Dup);
			
 
				+
			
 
				                     // Two 1-bit selector fields, each set to op.Index.
				+                    context.EmitLdc_I4(op.Index | op.Index << 1);
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Shuffle), typesSfl));
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Multiply), typesMul));
			
 
				+
			
 
				+                    EmitStvecWithCastFromDouble(context, op.Rd);
			
 
				+                }
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                EmitVectorBinaryOpByElemF(context, () =>
			
 
				+                {
			
 
				+                    EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulX));
			
 
				+                });
			
 
				+            }
			
 
				         }
			
 
				 
			
 
				         public static void Fmulx_S(ILEmitterCtx context)
			
@@ -607,22 +934,105 @@ namespace ChocolArm64.Instructions
 
				             });
			
 
				         }
			
 
				 
			
 
				// NOTE(review): The diff alignment interleaves three methods in this hunk.
				//   - Fmulx_Ve is shown as removed and re-added; the removed and added bodies are the same
				//     EmitVectorBinaryOpByElemF / FPMulX call.
				//   - Fneg_S and Fneg_V previously emitted only OpCodes.Neg. The added versions take an SSE
				//     path when Optimizations.UseSse2 is set: a -0 mask (scalar lane only for Fneg_S, all
				//     lanes for Fneg_V) is Xor-ed with Rn, which flips the sign bit because -0 has only
				//     the sign bit set. The OpCodes.Neg emitters remain as the fallback.
				-        public static void Fmulx_Ve(ILEmitterCtx context)
			
 
				+        public static void Fmulx_Ve(ILEmitterCtx context)
			
 
				+        {
			
 
				+            EmitVectorBinaryOpByElemF(context, () =>
			
 
				+            {
			
 
				+                EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulX));
			
 
				+            });
			
 
				+        }
			
 
				+
			
 
				+        public static void Fneg_S(ILEmitterCtx context)
			
 
				+        {
			
 
				+            if (Optimizations.UseSse2)
			
 
				+            {
			
 
				+                OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
			
 
				+
			
 
				                 // op.Size == 0 takes the float path; any other value takes the double path.
				+                if (op.Size == 0)
			
 
				+                {
			
 
				+                    Type[] typesSsv = new Type[] { typeof(float) };
			
 
				+                    Type[] typesXor = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
			
 
				+
			
 
				+                    context.EmitLdc_R4(-0f);
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SetScalarVector128), typesSsv));
			
 
				+
			
 
				+                    context.EmitLdvec(op.Rn);
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Xor), typesXor));
			
 
				+
			
 
				+                    context.EmitStvec(op.Rd);
			
 
				+
			
 
				                     // Helper is not visible here; presumably clears Rd above its low 32 bits - confirm.
				+                    EmitVectorZero32_128(context, op.Rd);
			
 
				+                }
			
 
				+                else /* if (op.Size == 1) */
			
 
				+                {
			
 
				+                    Type[] typesSsv = new Type[] { typeof(double) };
			
 
				+                    Type[] typesXor = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
			
 
				+
			
 
				+                    context.EmitLdc_R8(-0d);
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetScalarVector128), typesSsv));
			
 
				+
			
 
				+                    EmitLdvecWithCastToDouble(context, op.Rn);
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesXor));
			
 
				+
			
 
				+                    EmitStvecWithCastFromDouble(context, op.Rd);
			
 
				+
			
 
				                     // Helper is not visible here; presumably clears the upper half of Rd - confirm.
				+                    EmitVectorZeroUpper(context, op.Rd);
			
 
				+                }
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                EmitScalarUnaryOpF(context, () => context.Emit(OpCodes.Neg));
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				+        public static void Fneg_V(ILEmitterCtx context)
			
 
				         {
			
 
				-            EmitVectorBinaryOpByElemF(context, () =>
			
 
				+            if (Optimizations.UseSse2)
			
 
				             {
			
 
				-                EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulX));
			
 
				-            });
			
 
				-        }
			
 
				+                OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
			
 
				 
			
 
				-        public static void Fneg_S(ILEmitterCtx context)
			
 
				-        {
			
 
				-            EmitScalarUnaryOpF(context, () => context.Emit(OpCodes.Neg));
			
 
				-        }
			
 
				                 // Only the low bit of op.Size selects between the float (0) and double (1) paths.
				+                int sizeF = op.Size & 1;
			
 
				 
			
 
				-        public static void Fneg_V(ILEmitterCtx context)
			
 
				-        {
			
 
				-            EmitVectorUnaryOpF(context, () => context.Emit(OpCodes.Neg));
			
 
				+                if (sizeF == 0)
			
 
				+                {
			
 
				+                    Type[] typesSav = new Type[] { typeof(float) };
			
 
				+                    Type[] typesXor = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
			
 
				+
			
 
				+                    context.EmitLdc_R4(-0f);
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SetAllVector128), typesSav));
			
 
				+
			
 
				+                    context.EmitLdvec(op.Rn);
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Xor), typesXor));
			
 
				+
			
 
				+                    context.EmitStvec(op.Rd);
			
 
				+
			
 
				                     // Extra step only for the 64-bit register form; helper presumably clears the upper half - confirm.
				+                    if (op.RegisterSize == RegisterSize.Simd64)
			
 
				+                    {
			
 
				+                        EmitVectorZeroUpper(context, op.Rd);
			
 
				+                    }
			
 
				+                }
			
 
				+                else /* if (sizeF == 1) */
			
 
				+                {
			
 
				+                    Type[] typesSav = new Type[] { typeof(double) };
			
 
				+                    Type[] typesXor = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
			
 
				+
			
 
				+                    context.EmitLdc_R8(-0d);
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
			
 
				+
			
 
				+                    EmitLdvecWithCastToDouble(context, op.Rn);
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesXor));
			
 
				+
			
 
				+                    EmitStvecWithCastFromDouble(context, op.Rd);
			
 
				+                }
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                EmitVectorUnaryOpF(context, () => context.Emit(OpCodes.Neg));
			
 
				+            }
			
 
				         }
			
 
				 
			
 
				         public static void Fnmadd_S(ILEmitterCtx context)
			
@@ -689,7 +1099,7 @@ namespace ChocolArm64.Instructions
 
				             });
			
 
				         }
			
 
				 
			
 
				-        public static void Frecps_S(ILEmitterCtx context)
			
 
				+        public static void Frecps_S(ILEmitterCtx context) // Fused.
			
 
				         {
			
 
				             if (Optimizations.FastFP && Optimizations.UseSse2)
			
 
				             {
			
@@ -743,7 +1153,7 @@ namespace ChocolArm64.Instructions
 
				             }
			
 
				         }
			
 
				 
			
 
				-        public static void Frecps_V(ILEmitterCtx context)
			
 
				+        public static void Frecps_V(ILEmitterCtx context) // Fused.
			
 
				         {
			
 
				             if (Optimizations.FastFP && Optimizations.UseSse2)
			
 
				             {
			
@@ -986,7 +1396,7 @@ namespace ChocolArm64.Instructions
 
				             });
			
 
				         }
			
 
				 
			
 
				-        public static void Frsqrts_S(ILEmitterCtx context)
			
 
				+        public static void Frsqrts_S(ILEmitterCtx context) // Fused.
			
 
				         {
			
 
				             if (Optimizations.FastFP && Optimizations.UseSse2)
			
 
				             {
			
@@ -1048,7 +1458,7 @@ namespace ChocolArm64.Instructions
 
				             }
			
 
				         }
			
 
				 
			
 
				-        public static void Frsqrts_V(ILEmitterCtx context)
			
 
				+        public static void Frsqrts_V(ILEmitterCtx context) // Fused.
			
 
				         {
			
 
				             if (Optimizations.FastFP && Optimizations.UseSse2)
			
 
				             {
			
@@ -1310,7 +1720,7 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				                 context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
			
 
				 
			
 
				-                EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
			
 
				+                EmitLdvecWithSignedCast(context, op.Rm, op.Size);
			
 
				 
			
 
				                 context.EmitLdc_I4(numBytes);
			
 
				                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
			
@@ -1334,7 +1744,38 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				         public static void Saddw_V(ILEmitterCtx context)
			
 
				         {
			
 
				-            EmitVectorWidenRmBinaryOpSx(context, () => context.Emit(OpCodes.Add));
			
 
				+            if (Optimizations.UseSse41)
			
 
				+            {
			
 
				+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
			
 
				+
			
 
				+                Type[] typesSrl = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
			
 
				+                Type[] typesCvt = new Type[] { VectorIntTypesPerSizeLog2[op.Size] };
			
 
				+                Type[] typesAdd = new Type[] { VectorIntTypesPerSizeLog2[op.Size + 1],
			
 
				+                                               VectorIntTypesPerSizeLog2[op.Size + 1] };
			
 
				+
			
 
				+                string[] namesCvt = new string[] { nameof(Sse41.ConvertToVector128Int16),
			
 
				+                                                   nameof(Sse41.ConvertToVector128Int32),
			
 
				+                                                   nameof(Sse41.ConvertToVector128Int64) };
			
 
				+
			
 
				+                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
			
 
				+
			
 
				+                EmitLdvecWithSignedCast(context, op.Rn, op.Size + 1);
			
 
				+
			
 
				+                EmitLdvecWithSignedCast(context, op.Rm, op.Size);
			
 
				+
			
 
				+                context.EmitLdc_I4(numBytes);
			
 
				+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
			
 
				+
			
 
				+                context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
			
 
				+
			
 
				+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAdd));
			
 
				+
			
 
				+                EmitStvecWithSignedCast(context, op.Rd, op.Size + 1);
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                EmitVectorWidenRmBinaryOpSx(context, () => context.Emit(OpCodes.Add));
			
 
				+            }
			
 
				         }
			
 
				 
			
 
				         public static void Shadd_V(ILEmitterCtx context)
			
@@ -1439,11 +1880,34 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				         public static void Smax_V(ILEmitterCtx context)
			
 
				         {
			
 
				-            Type[] types = new Type[] { typeof(long), typeof(long) };
			
 
				+            if (Optimizations.UseSse41)
			
 
				+            {
			
 
				+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
			
 
				 
			
 
				-            MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Max), types);
			
 
				+                Type[] typesMax = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
			
 
				+
			
 
				+                Type typeSse = op.Size == 1 ? typeof(Sse2) : typeof(Sse41);
			
 
				+
			
 
				+                EmitLdvecWithSignedCast(context, op.Rn, op.Size);
			
 
				+                EmitLdvecWithSignedCast(context, op.Rm, op.Size);
			
 
				+
			
 
				+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.Max), typesMax));
			
 
				+
			
 
				+                EmitStvecWithSignedCast(context, op.Rd, op.Size);
			
 
				+
			
 
				+                if (op.RegisterSize == RegisterSize.Simd64)
			
 
				+                {
			
 
				+                    EmitVectorZeroUpper(context, op.Rd);
			
 
				+                }
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                Type[] types = new Type[] { typeof(long), typeof(long) };
			
 
				 
			
 
				-            EmitVectorBinaryOpSx(context, () => context.EmitCall(mthdInfo));
			
 
				+                MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Max), types);
			
 
				+
			
 
				+                EmitVectorBinaryOpSx(context, () => context.EmitCall(mthdInfo));
			
 
				+            }
			
 
				         }
			
 
				 
			
 
				         public static void Smaxp_V(ILEmitterCtx context)
			
@@ -1457,11 +1921,34 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				         public static void Smin_V(ILEmitterCtx context)
			
 
				         {
			
 
				-            Type[] types = new Type[] { typeof(long), typeof(long) };
			
 
				+            if (Optimizations.UseSse41)
			
 
				+            {
			
 
				+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
			
 
				 
			
 
				-            MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Min), types);
			
 
				+                Type[] typesMin = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
			
 
				+
			
 
				+                Type typeSse = op.Size == 1 ? typeof(Sse2) : typeof(Sse41);
			
 
				+
			
 
				+                EmitLdvecWithSignedCast(context, op.Rn, op.Size);
			
 
				+                EmitLdvecWithSignedCast(context, op.Rm, op.Size);
			
 
				+
			
 
				+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.Min), typesMin));
			
 
				+
			
 
				+                EmitStvecWithSignedCast(context, op.Rd, op.Size);
			
 
				+
			
 
				+                if (op.RegisterSize == RegisterSize.Simd64)
			
 
				+                {
			
 
				+                    EmitVectorZeroUpper(context, op.Rd);
			
 
				+                }
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                Type[] types = new Type[] { typeof(long), typeof(long) };
			
 
				 
			
 
				-            EmitVectorBinaryOpSx(context, () => context.EmitCall(mthdInfo));
			
 
				+                MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Min), types);
			
 
				+
			
 
				+                EmitVectorBinaryOpSx(context, () => context.EmitCall(mthdInfo));
			
 
				+            }
			
 
				         }
			
 
				 
			
 
				         public static void Sminp_V(ILEmitterCtx context)
			
@@ -1484,7 +1971,7 @@ namespace ChocolArm64.Instructions
 
				                 Type[] typesMulAdd = new Type[] { VectorIntTypesPerSizeLog2[op.Size + 1],
			
 
				                                                   VectorIntTypesPerSizeLog2[op.Size + 1] };
			
 
				 
			
 
				-                Type typeMul = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
			
 
				+                Type typeSse = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
			
 
				 
			
 
				                 string nameCvt = op.Size == 0
			
 
				                     ? nameof(Sse41.ConvertToVector128Int16)
			
@@ -1508,7 +1995,7 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				                 context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));
			
 
				 
			
 
				-                context.EmitCall(typeMul.GetMethod(nameof(Sse2.MultiplyLow), typesMulAdd));
			
 
				+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.MultiplyLow), typesMulAdd));
			
 
				 
			
 
				                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesMulAdd));
			
 
				 
			
@@ -1535,7 +2022,7 @@ namespace ChocolArm64.Instructions
 
				                 Type[] typesMulSub = new Type[] { VectorIntTypesPerSizeLog2[op.Size + 1],
			
 
				                                                   VectorIntTypesPerSizeLog2[op.Size + 1] };
			
 
				 
			
 
				-                Type typeMul = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
			
 
				+                Type typeSse = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
			
 
				 
			
 
				                 string nameCvt = op.Size == 0
			
 
				                     ? nameof(Sse41.ConvertToVector128Int16)
			
@@ -1559,7 +2046,7 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				                 context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));
			
 
				 
			
 
				-                context.EmitCall(typeMul.GetMethod(nameof(Sse2.MultiplyLow), typesMulSub));
			
 
				+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.MultiplyLow), typesMulSub));
			
 
				 
			
 
				                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesMulSub));
			
 
				 
			
@@ -1735,7 +2222,7 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				                 context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
			
 
				 
			
 
				-                EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
			
 
				+                EmitLdvecWithSignedCast(context, op.Rm, op.Size);
			
 
				 
			
 
				                 context.EmitLdc_I4(numBytes);
			
 
				                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
			
@@ -1754,7 +2241,38 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				         public static void Ssubw_V(ILEmitterCtx context)
			
 
				         {
			
 
				-            EmitVectorWidenRmBinaryOpSx(context, () => context.Emit(OpCodes.Sub));
			
 
				+            if (Optimizations.UseSse41)
			
 
				+            {
			
 
				+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
			
 
				+
			
 
				+                Type[] typesSrl = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
			
 
				+                Type[] typesCvt = new Type[] { VectorIntTypesPerSizeLog2[op.Size] };
			
 
				+                Type[] typesSub = new Type[] { VectorIntTypesPerSizeLog2[op.Size + 1],
			
 
				+                                               VectorIntTypesPerSizeLog2[op.Size + 1] };
			
 
				+
			
 
				+                string[] namesCvt = new string[] { nameof(Sse41.ConvertToVector128Int16),
			
 
				+                                                   nameof(Sse41.ConvertToVector128Int32),
			
 
				+                                                   nameof(Sse41.ConvertToVector128Int64) };
			
 
				+
			
 
				+                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
			
 
				+
			
 
				+                EmitLdvecWithSignedCast(context, op.Rn, op.Size + 1);
			
 
				+
			
 
				+                EmitLdvecWithSignedCast(context, op.Rm, op.Size);
			
 
				+
			
 
				+                context.EmitLdc_I4(numBytes);
			
 
				+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
			
 
				+
			
 
				+                context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
			
 
				+
			
 
				+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesSub));
			
 
				+
			
 
				+                EmitStvecWithSignedCast(context, op.Rd, op.Size + 1);
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                EmitVectorWidenRmBinaryOpSx(context, () => context.Emit(OpCodes.Sub));
			
 
				+            }
			
 
				         }
			
 
				 
			
 
				         public static void Sub_S(ILEmitterCtx context)
			
@@ -1901,7 +2419,38 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				         public static void Uaddw_V(ILEmitterCtx context)
			
 
				         {
			
 
				-            EmitVectorWidenRmBinaryOpZx(context, () => context.Emit(OpCodes.Add));
			
 
				+            if (Optimizations.UseSse41)
			
 
				+            {
			
 
				+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
			
 
				+
			
 
				+                Type[] typesSrl = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) };
			
 
				+                Type[] typesCvt = new Type[] { VectorUIntTypesPerSizeLog2[op.Size] };
			
 
				+                Type[] typesAdd = new Type[] { VectorUIntTypesPerSizeLog2[op.Size + 1],
			
 
				+                                               VectorUIntTypesPerSizeLog2[op.Size + 1] };
			
 
				+
			
 
				+                string[] namesCvt = new string[] { nameof(Sse41.ConvertToVector128Int16),
			
 
				+                                                   nameof(Sse41.ConvertToVector128Int32),
			
 
				+                                                   nameof(Sse41.ConvertToVector128Int64) };
			
 
				+
			
 
				+                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
			
 
				+
			
 
				+                EmitLdvecWithUnsignedCast(context, op.Rn, op.Size + 1);
			
 
				+
			
 
				+                EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
			
 
				+
			
 
				+                context.EmitLdc_I4(numBytes);
			
 
				+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
			
 
				+
			
 
				+                context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
			
 
				+
			
 
				+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAdd));
			
 
				+
			
 
				+                EmitStvecWithUnsignedCast(context, op.Rd, op.Size + 1);
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                EmitVectorWidenRmBinaryOpZx(context, () => context.Emit(OpCodes.Add));
			
 
				+            }
			
 
				         }
			
 
				 
			
 
				         public static void Uhadd_V(ILEmitterCtx context)
			
@@ -1992,11 +2541,34 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				         public static void Umax_V(ILEmitterCtx context)
			
 
				         {
			
 
				-            Type[] types = new Type[] { typeof(ulong), typeof(ulong) };
			
 
				+            if (Optimizations.UseSse41)
			
 
				+            {
			
 
				+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
			
 
				 
			
 
				-            MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Max), types);
			
 
				+                Type[] typesMax = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], VectorUIntTypesPerSizeLog2[op.Size] };
			
 
				+
			
 
				+                Type typeSse = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
			
 
				+
			
 
				+                EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
			
 
				+                EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
			
 
				+
			
 
				+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.Max), typesMax));
			
 
				 
			
 
				-            EmitVectorBinaryOpZx(context, () => context.EmitCall(mthdInfo));
			
 
				+                EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
			
 
				+
			
 
				+                if (op.RegisterSize == RegisterSize.Simd64)
			
 
				+                {
			
 
				+                    EmitVectorZeroUpper(context, op.Rd);
			
 
				+                }
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                Type[] types = new Type[] { typeof(ulong), typeof(ulong) };
			
 
				+
			
 
				+                MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Max), types);
			
 
				+
			
 
				+                EmitVectorBinaryOpZx(context, () => context.EmitCall(mthdInfo));
			
 
				+            }
			
 
				         }
			
 
				 
			
 
				         public static void Umaxp_V(ILEmitterCtx context)
			
@@ -2010,11 +2582,34 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				         public static void Umin_V(ILEmitterCtx context)
			
 
				         {
			
 
				-            Type[] types = new Type[] { typeof(ulong), typeof(ulong) };
			
 
				+            if (Optimizations.UseSse41)
			
 
				+            {
			
 
				+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
			
 
				 
			
 
				-            MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Min), types);
			
 
				+                Type[] typesMin = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], VectorUIntTypesPerSizeLog2[op.Size] };
			
 
				+
			
 
				+                Type typeSse = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
			
 
				+
			
 
				+                EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
			
 
				+                EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
			
 
				+
			
 
				+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.Min), typesMin));
			
 
				+
			
 
				+                EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
			
 
				+
			
 
				+                if (op.RegisterSize == RegisterSize.Simd64)
			
 
				+                {
			
 
				+                    EmitVectorZeroUpper(context, op.Rd);
			
 
				+                }
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                Type[] types = new Type[] { typeof(ulong), typeof(ulong) };
			
 
				 
			
 
				-            EmitVectorBinaryOpZx(context, () => context.EmitCall(mthdInfo));
			
 
				+                MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Min), types);
			
 
				+
			
 
				+                EmitVectorBinaryOpZx(context, () => context.EmitCall(mthdInfo));
			
 
				+            }
			
 
				         }
			
 
				 
			
 
				         public static void Uminp_V(ILEmitterCtx context)
			
@@ -2037,7 +2632,7 @@ namespace ChocolArm64.Instructions
 
				                 Type[] typesMulAdd = new Type[] { VectorIntTypesPerSizeLog2 [op.Size + 1],
			
 
				                                                   VectorIntTypesPerSizeLog2 [op.Size + 1] };
			
 
				 
			
 
				-                Type typeMul = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
			
 
				+                Type typeSse = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
			
 
				 
			
 
				                 string nameCvt = op.Size == 0
			
 
				                     ? nameof(Sse41.ConvertToVector128Int16)
			
@@ -2061,7 +2656,7 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				                 context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));
			
 
				 
			
 
				-                context.EmitCall(typeMul.GetMethod(nameof(Sse2.MultiplyLow), typesMulAdd));
			
 
				+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.MultiplyLow), typesMulAdd));
			
 
				 
			
 
				                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesMulAdd));
			
 
				 
			
@@ -2088,7 +2683,7 @@ namespace ChocolArm64.Instructions
 
				                 Type[] typesMulSub = new Type[] { VectorIntTypesPerSizeLog2 [op.Size + 1],
			
 
				                                                   VectorIntTypesPerSizeLog2 [op.Size + 1] };
			
 
				 
			
 
				-                Type typeMul = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
			
 
				+                Type typeSse = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
			
 
				 
			
 
				                 string nameCvt = op.Size == 0
			
 
				                     ? nameof(Sse41.ConvertToVector128Int16)
			
@@ -2112,7 +2707,7 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				                 context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));
			
 
				 
			
 
				-                context.EmitCall(typeMul.GetMethod(nameof(Sse2.MultiplyLow), typesMulSub));
			
 
				+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.MultiplyLow), typesMulSub));
			
 
				 
			
 
				                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesMulSub));
			
 
				 
			
@@ -2251,7 +2846,38 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				         public static void Usubw_V(ILEmitterCtx context)
			
 
				         {
			
 
				-            EmitVectorWidenRmBinaryOpZx(context, () => context.Emit(OpCodes.Sub));
			
 
				+            if (Optimizations.UseSse41)
			
 
				+            {
			
 
				+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
			
 
				+
			
 
				+                Type[] typesSrl = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) };
			
 
				+                Type[] typesCvt = new Type[] { VectorUIntTypesPerSizeLog2[op.Size] };
			
 
				+                Type[] typesSub = new Type[] { VectorUIntTypesPerSizeLog2[op.Size + 1],
			
 
				+                                               VectorUIntTypesPerSizeLog2[op.Size + 1] };
			
 
				+
			
 
				+                string[] namesCvt = new string[] { nameof(Sse41.ConvertToVector128Int16),
			
 
				+                                                   nameof(Sse41.ConvertToVector128Int32),
			
 
				+                                                   nameof(Sse41.ConvertToVector128Int64) };
			
 
				+
			
 
				+                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
			
 
				+
			
 
				+                EmitLdvecWithUnsignedCast(context, op.Rn, op.Size + 1);
			
 
				+
			
 
				+                EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
			
 
				+
			
 
				+                context.EmitLdc_I4(numBytes);
			
 
				+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
			
 
				+
			
 
				+                context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
			
 
				+
			
 
				+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesSub));
			
 
				+
			
 
				+                EmitStvecWithUnsignedCast(context, op.Rd, op.Size + 1);
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                EmitVectorWidenRmBinaryOpZx(context, () => context.Emit(OpCodes.Sub));
			
 
				+            }
			
 
				         }
			
 
				 
			
 
				         private static void EmitAbs(ILEmitterCtx context)
			
--- a/ChocolArm64/Instructions/InstEmitSimdCmp.cs
+++ b/ChocolArm64/Instructions/InstEmitSimdCmp.cs
@@ -3,6 +3,7 @@ using ChocolArm64.State;
 
				 using ChocolArm64.Translation;
			
 
				 using System;
			
 
				 using System.Reflection.Emit;
			
 
				+using System.Runtime.Intrinsics;
			
 
				 using System.Runtime.Intrinsics.X86;
			
 
				 
			
 
				 using static ChocolArm64.Instructions.InstEmitAluHelper;
			
@@ -137,26 +138,43 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				             context.EmitCondBranch(lblTrue, op.Cond);
			
 
				 
			
 
				-            EmitSetNzcv(context, op.Nzcv);
			
 
				+            context.EmitLdc_I4(op.Nzcv);
			
 
				+            EmitSetNzcv(context);
			
 
				 
			
 
				             context.Emit(OpCodes.Br, lblEnd);
			
 
				 
			
 
				             context.MarkLabel(lblTrue);
			
 
				 
			
 
				-            Fcmp_S(context);
			
 
				+            EmitFcmpE(context, signalNaNs: false);
			
 
				 
			
 
				             context.MarkLabel(lblEnd);
			
 
				         }
			
 
				 
			
 
				         public static void Fccmpe_S(ILEmitterCtx context)
			
 
				         {
			
 
				-            Fccmp_S(context);
			
 
				+            OpCodeSimdFcond64 op = (OpCodeSimdFcond64)context.CurrOp;
			
 
				+
			
 
				+            ILLabel lblTrue = new ILLabel();
			
 
				+            ILLabel lblEnd  = new ILLabel();
			
 
				+
			
 
				+            context.EmitCondBranch(lblTrue, op.Cond);
			
 
				+
			
 
				+            context.EmitLdc_I4(op.Nzcv);
			
 
				+            EmitSetNzcv(context);
			
 
				+
			
 
				+            context.Emit(OpCodes.Br, lblEnd);
			
 
				+
			
 
				+            context.MarkLabel(lblTrue);
			
 
				+
			
 
				+            EmitFcmpE(context, signalNaNs: true);
			
 
				+
			
 
				+            context.MarkLabel(lblEnd);
			
 
				         }
			
 
				 
			
 
				         public static void Fcmeq_S(ILEmitterCtx context)
			
 
				         {
			
 
				             if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse
			
 
				-                                                 && Optimizations.UseSse2)
			
 
				+                                                  && Optimizations.UseSse2)
			
 
				             {
			
 
				                 EmitScalarSseOrSse2OpF(context, nameof(Sse.CompareEqualScalar));
			
 
				             }
			
@@ -169,7 +187,7 @@ namespace ChocolArm64.Instructions
 
				         public static void Fcmeq_V(ILEmitterCtx context)
			
 
				         {
			
 
				             if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse
			
 
				-                                                 && Optimizations.UseSse2)
			
 
				+                                                  && Optimizations.UseSse2)
			
 
				             {
			
 
				                 EmitVectorSseOrSse2OpF(context, nameof(Sse.CompareEqual));
			
 
				             }
			
@@ -182,7 +200,7 @@ namespace ChocolArm64.Instructions
 
				         public static void Fcmge_S(ILEmitterCtx context)
			
 
				         {
			
 
				             if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse
			
 
				-                                                 && Optimizations.UseSse2)
			
 
				+                                                  && Optimizations.UseSse2)
			
 
				             {
			
 
				                 EmitScalarSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqualScalar));
			
 
				             }
			
@@ -195,7 +213,7 @@ namespace ChocolArm64.Instructions
 
				         public static void Fcmge_V(ILEmitterCtx context)
			
 
				         {
			
 
				             if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse
			
 
				-                                                 && Optimizations.UseSse2)
			
 
				+                                                  && Optimizations.UseSse2)
			
 
				             {
			
 
				                 EmitVectorSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqual));
			
 
				             }
			
@@ -208,7 +226,7 @@ namespace ChocolArm64.Instructions
 
				         public static void Fcmgt_S(ILEmitterCtx context)
			
 
				         {
			
 
				             if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse
			
 
				-                                                 && Optimizations.UseSse2)
			
 
				+                                                  && Optimizations.UseSse2)
			
 
				             {
			
 
				                 EmitScalarSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanScalar));
			
 
				             }
			
@@ -221,7 +239,7 @@ namespace ChocolArm64.Instructions
 
				         public static void Fcmgt_V(ILEmitterCtx context)
			
 
				         {
			
 
				             if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse
			
 
				-                                                 && Optimizations.UseSse2)
			
 
				+                                                  && Optimizations.UseSse2)
			
 
				             {
			
 
				                 EmitVectorSseOrSse2OpF(context, nameof(Sse.CompareGreaterThan));
			
 
				             }
			
@@ -252,110 +270,181 @@ namespace ChocolArm64.Instructions
 
				         }
			
 
				 
			
 
				         public static void Fcmp_S(ILEmitterCtx context)
			
 
				+        {
			
 
				+            EmitFcmpE(context, signalNaNs: false);
			
 
				+        }
			
 
				+
			
 
				+        public static void Fcmpe_S(ILEmitterCtx context)
			
 
				+        {
			
 
				+            EmitFcmpE(context, signalNaNs: true);
			
 
				+        }
			
 
				+
			
 
				+        private static void EmitFcmpE(ILEmitterCtx context, bool signalNaNs)
			
 
				         {
			
 
				             OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
			
 
				 
			
 
				             bool cmpWithZero = !(op is OpCodeSimdFcond64) ? op.Bit3 : false;
			
 
				 
			
 
				-            //Handle NaN case.
			
 
				-            //If any number is NaN, then NZCV = 0011.
			
 
				-            if (cmpWithZero)
			
 
				-            {
			
 
				-                EmitNaNCheck(context, op.Rn);
			
 
				-            }
			
 
				-            else
			
 
				+            if (Optimizations.FastFP && Optimizations.UseSse2)
			
 
				             {
			
 
				-                EmitNaNCheck(context, op.Rn);
			
 
				-                EmitNaNCheck(context, op.Rm);
			
 
				+                if (op.Size == 0)
			
 
				+                {
			
 
				+                    Type[] typesCmp = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
			
 
				 
			
 
				-                context.Emit(OpCodes.Or);
			
 
				-            }
			
 
				+                    ILLabel lblNaN = new ILLabel();
			
 
				+                    ILLabel lblEnd = new ILLabel();
			
 
				 
			
 
				-            ILLabel lblNaN = new ILLabel();
			
 
				-            ILLabel lblEnd = new ILLabel();
			
 
				+                    context.EmitLdvec(op.Rn);
			
 
				 
			
 
				-            context.Emit(OpCodes.Brtrue_S, lblNaN);
			
 
				+                    context.Emit(OpCodes.Dup);
			
 
				+                    context.EmitStvectmp();
			
 
				 
			
 
				-            void EmitLoadOpers()
			
 
				-            {
			
 
				-                EmitVectorExtractF(context, op.Rn, 0, op.Size);
			
 
				-
			
 
				-                if (cmpWithZero)
			
 
				-                {
			
 
				-                    if (op.Size == 0)
			
 
				+                    if (cmpWithZero)
			
 
				                     {
			
 
				-                        context.EmitLdc_R4(0f);
			
 
				+                        VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
			
 
				                     }
			
 
				-                    else /* if (Op.Size == 1) */
			
 
				+                    else
			
 
				                     {
			
 
				-                        context.EmitLdc_R8(0d);
			
 
				+                        context.EmitLdvec(op.Rm);
			
 
				                     }
			
 
				+
			
 
				+                    context.Emit(OpCodes.Dup);
			
 
				+                    context.EmitStvectmp2();
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.CompareOrderedScalar), typesCmp));
			
 
				+                    VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.CompareEqualOrderedScalar), typesCmp));
			
 
				+
			
 
				+                    context.Emit(OpCodes.Brtrue_S, lblNaN);
			
 
				+
			
 
				+                    context.EmitLdc_I4(0);
			
 
				+
			
 
				+                    context.EmitLdvectmp();
			
 
				+                    context.EmitLdvectmp2();
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.CompareGreaterThanOrEqualOrderedScalar), typesCmp));
			
 
				+
			
 
				+                    context.EmitLdvectmp();
			
 
				+                    context.EmitLdvectmp2();
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.CompareEqualOrderedScalar), typesCmp));
			
 
				+
			
 
				+                    context.EmitLdvectmp();
			
 
				+                    context.EmitLdvectmp2();
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.CompareLessThanOrderedScalar), typesCmp));
			
 
				+
			
 
				+                    context.EmitStflg((int)PState.NBit);
			
 
				+                    context.EmitStflg((int)PState.ZBit);
			
 
				+                    context.EmitStflg((int)PState.CBit);
			
 
				+                    context.EmitStflg((int)PState.VBit);
			
 
				+
			
 
				+                    context.Emit(OpCodes.Br_S, lblEnd);
			
 
				+
			
 
				+                    context.MarkLabel(lblNaN);
			
 
				+
			
 
				+                    context.EmitLdc_I4(1);
			
 
				+                    context.Emit(OpCodes.Dup);
			
 
				+                    context.EmitLdc_I4(0);
			
 
				+                    context.Emit(OpCodes.Dup);
			
 
				+
			
 
				+                    context.EmitStflg((int)PState.NBit);
			
 
				+                    context.EmitStflg((int)PState.ZBit);
			
 
				+                    context.EmitStflg((int)PState.CBit);
			
 
				+                    context.EmitStflg((int)PState.VBit);
			
 
				+
			
 
				+                    context.MarkLabel(lblEnd);
			
 
				                 }
			
 
				-                else
			
 
				+                else /* if (op.Size == 1) */
			
 
				                 {
			
 
				-                    EmitVectorExtractF(context, op.Rm, 0, op.Size);
			
 
				-                }
			
 
				-            }
			
 
				-
			
 
				-            //Z = Rn == Rm
			
 
				-            EmitLoadOpers();
			
 
				+                    Type[] typesCmp = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
			
 
				 
			
 
				-            context.Emit(OpCodes.Ceq);
			
 
				-            context.Emit(OpCodes.Dup);
			
 
				+                    ILLabel lblNaN = new ILLabel();
			
 
				+                    ILLabel lblEnd = new ILLabel();
			
 
				 
			
 
				-            context.EmitStflg((int)PState.ZBit);
			
 
				+                    EmitLdvecWithCastToDouble(context, op.Rn);
			
 
				 
			
 
				-            //C = Rn >= Rm
			
 
				-            EmitLoadOpers();
			
 
				+                    context.Emit(OpCodes.Dup);
			
 
				+                    context.EmitStvectmp();
			
 
				 
			
 
				-            context.Emit(OpCodes.Cgt);
			
 
				-            context.Emit(OpCodes.Or);
			
 
				+                    if (cmpWithZero)
			
 
				+                    {
			
 
				+                        VectorHelper.EmitCall(context, nameof(VectorHelper.VectorDoubleZero));
			
 
				+                    }
			
 
				+                    else
			
 
				+                    {
			
 
				+                        EmitLdvecWithCastToDouble(context, op.Rm);
			
 
				+                    }
			
 
				 
			
 
				-            context.EmitStflg((int)PState.CBit);
			
 
				+                    context.Emit(OpCodes.Dup);
			
 
				+                    context.EmitStvectmp2();
			
 
				 
			
 
				-            //N = Rn < Rm
			
 
				-            EmitLoadOpers();
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareOrderedScalar), typesCmp));
			
 
				+                    VectorHelper.EmitCall(context, nameof(VectorHelper.VectorDoubleZero));
			
 
				 
			
 
				-            context.Emit(OpCodes.Clt);
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareEqualOrderedScalar), typesCmp));
			
 
				 
			
 
				-            context.EmitStflg((int)PState.NBit);
			
 
				+                    context.Emit(OpCodes.Brtrue_S, lblNaN);
			
 
				 
			
 
				-            //V = 0
			
 
				-            context.EmitLdc_I4(0);
			
 
				+                    context.EmitLdc_I4(0);
			
 
				 
			
 
				-            context.EmitStflg((int)PState.VBit);
			
 
				+                    context.EmitLdvectmp();
			
 
				+                    context.EmitLdvectmp2();
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareGreaterThanOrEqualOrderedScalar), typesCmp));
			
 
				 
			
 
				-            context.Emit(OpCodes.Br_S, lblEnd);
			
 
				+                    context.EmitLdvectmp();
			
 
				+                    context.EmitLdvectmp2();
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareEqualOrderedScalar), typesCmp));
			
 
				 
			
 
				-            context.MarkLabel(lblNaN);
			
 
				+                    context.EmitLdvectmp();
			
 
				+                    context.EmitLdvectmp2();
			
 
				+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareLessThanOrderedScalar), typesCmp));
			
 
				 
			
 
				-            EmitSetNzcv(context, 0b0011);
			
 
				+                    context.EmitStflg((int)PState.NBit);
			
 
				+                    context.EmitStflg((int)PState.ZBit);
			
 
				+                    context.EmitStflg((int)PState.CBit);
			
 
				+                    context.EmitStflg((int)PState.VBit);
			
 
				 
			
 
				-            context.MarkLabel(lblEnd);
			
 
				-        }
			
 
				+                    context.Emit(OpCodes.Br_S, lblEnd);
			
 
				 
			
 
				-        public static void Fcmpe_S(ILEmitterCtx context)
			
 
				-        {
			
 
				-            Fcmp_S(context);
			
 
				-        }
			
 
				+                    context.MarkLabel(lblNaN);
			
 
				 
			
 
				-        private static void EmitNaNCheck(ILEmitterCtx context, int reg)
			
 
				-        {
			
 
				-            IOpCodeSimd64 op = (IOpCodeSimd64)context.CurrOp;
			
 
				+                    context.EmitLdc_I4(1);
			
 
				+                    context.Emit(OpCodes.Dup);
			
 
				+                    context.EmitLdc_I4(0);
			
 
				+                    context.Emit(OpCodes.Dup);
			
 
				 
			
 
				-            EmitVectorExtractF(context, reg, 0, op.Size);
			
 
				+                    context.EmitStflg((int)PState.NBit);
			
 
				+                    context.EmitStflg((int)PState.ZBit);
			
 
				+                    context.EmitStflg((int)PState.CBit);
			
 
				+                    context.EmitStflg((int)PState.VBit);
			
 
				 
			
 
				-            if (op.Size == 0)
			
 
				-            {
			
 
				-                context.EmitCall(typeof(float), nameof(float.IsNaN));
			
 
				-            }
			
 
				-            else if (op.Size == 1)
			
 
				-            {
			
 
				-                context.EmitCall(typeof(double), nameof(double.IsNaN));
			
 
				+                    context.MarkLabel(lblEnd);
			
 
				+                }
			
 
				             }
			
 
				             else
			
 
				             {
			
 
				-                throw new InvalidOperationException();
			
 
				+                EmitVectorExtractF(context, op.Rn, 0, op.Size);
			
 
				+
			
 
				+                if (cmpWithZero)
			
 
				+                {
			
 
				+                    if (op.Size == 0)
			
 
				+                    {
			
 
				+                        context.EmitLdc_R4(0f);
			
 
				+                    }
			
 
				+                    else // if (op.Size == 1)
			
 
				+                    {
			
 
				+                        context.EmitLdc_R8(0d);
			
 
				+                    }
			
 
				+                }
			
 
				+                else
			
 
				+                {
			
 
				+                    EmitVectorExtractF(context, op.Rm, 0, op.Size);
			
 
				+                }
			
 
				+
			
 
				+                context.EmitLdc_I4(!signalNaNs ? 0 : 1);
			
 
				+
			
 
				+                EmitSoftFloatCall(context, nameof(SoftFloat32.FPCompare));
			
 
				+
			
 
				+                EmitSetNzcv(context);
			
 
				             }
			
 
				         }
			
 
				 
			
@@ -486,7 +575,7 @@ namespace ChocolArm64.Instructions
 
				             {
			
 
				                 context.EmitLdc_R4(0f);
			
 
				             }
			
 
				-            else /* if (SizeF == 1) */
			
 
				+            else /* if (sizeF == 1) */
			
 
				             {
			
 
				                 context.EmitLdc_R8(0d);
			
 
				             }
			
--- a/ChocolArm64/Instructions/InstEmitSimdCvt.cs
+++ b/ChocolArm64/Instructions/InstEmitSimdCvt.cs
@@ -76,33 +76,54 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				             int sizeF = op.Size & 1;
			
 
				 
			
 
				-            int elems = 4 >> sizeF;
			
 
				+            if (Optimizations.UseSse2 && sizeF == 1)
			
 
				+            {
			
 
				+                Type[] typesMov = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
			
 
				+                Type[] typesCvt = new Type[] { typeof(Vector128<float>) };
			
 
				 
			
 
				-            int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
			
 
				+                string nameMov = op.RegisterSize == RegisterSize.Simd128
			
 
				+                    ? nameof(Sse.MoveHighToLow)
			
 
				+                    : nameof(Sse.MoveLowToHigh);
			
 
				 
			
 
				-            for (int index = 0; index < elems; index++)
			
 
				+                context.EmitLdvec(op.Rn);
			
 
				+                context.Emit(OpCodes.Dup);
			
 
				+
			
 
				+                context.EmitCall(typeof(Sse).GetMethod(nameMov, typesMov));
			
 
				+
			
 
				+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Double), typesCvt));
			
 
				+
			
 
				+                EmitStvecWithCastFromDouble(context, op.Rd);
			
 
				+            }
			
 
				+            else
			
 
				             {
			
 
				-                if (sizeF == 0)
			
 
				-                {
			
 
				-                    EmitVectorExtractZx(context, op.Rn, part + index, 1);
			
 
				-                    context.Emit(OpCodes.Conv_U2);
			
 
				+                int elems = 4 >> sizeF;
			
 
				 
			
 
				-                    context.EmitLdarg(TranslatedSub.StateArgIdx);
			
 
				+                int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
			
 
				 
			
 
				-                    context.EmitCall(typeof(SoftFloat16_32), nameof(SoftFloat16_32.FPConvert));
			
 
				-                }
			
 
				-                else /* if (sizeF == 1) */
			
 
				+                for (int index = 0; index < elems; index++)
			
 
				                 {
			
 
				-                    EmitVectorExtractF(context, op.Rn, part + index, 0);
			
 
				+                    if (sizeF == 0)
			
 
				+                    {
			
 
				+                        EmitVectorExtractZx(context, op.Rn, part + index, 1);
			
 
				+                        context.Emit(OpCodes.Conv_U2);
			
 
				+
			
 
				+                        context.EmitLdarg(TranslatedSub.StateArgIdx);
			
 
				+
			
 
				+                        context.EmitCall(typeof(SoftFloat16_32), nameof(SoftFloat16_32.FPConvert));
			
 
				+                    }
			
 
				+                    else /* if (sizeF == 1) */
			
 
				+                    {
			
 
				+                        EmitVectorExtractF(context, op.Rn, part + index, 0);
			
 
				 
			
 
				-                    context.Emit(OpCodes.Conv_R8);
			
 
				+                        context.Emit(OpCodes.Conv_R8);
			
 
				+                    }
			
 
				+
			
 
				+                    EmitVectorInsertTmpF(context, index, sizeF);
			
 
				                 }
			
 
				 
			
 
				-                EmitVectorInsertTmpF(context, index, sizeF);
			
 
				+                context.EmitLdvectmp();
			
 
				+                context.EmitStvec(op.Rd);
			
 
				             }
			
 
				-
			
 
				-            context.EmitLdvectmp();
			
 
				-            context.EmitStvec(op.Rd);
			
 
				         }
			
 
				 
			
 
				         public static void Fcvtms_Gp(ILEmitterCtx context)
			
@@ -121,43 +142,70 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				             int sizeF = op.Size & 1;
			
 
				 
			
 
				-            int elems = 4 >> sizeF;
			
 
				+            if (Optimizations.UseSse2 && sizeF == 1)
			
 
				+            {
			
 
				+                Type[] typesMov = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
			
 
				+                Type[] typesCvt = new Type[] { typeof(Vector128<double>) };
			
 
				 
			
 
				-            int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
			
 
				+                string nameMov = op.RegisterSize == RegisterSize.Simd128
			
 
				+                    ? nameof(Sse.MoveLowToHigh)
			
 
				+                    : nameof(Sse.MoveHighToLow);
			
 
				 
			
 
				-            if (part != 0)
			
 
				-            {
			
 
				                 context.EmitLdvec(op.Rd);
			
 
				-                context.EmitStvectmp();
			
 
				-            }
			
 
				+                VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
			
 
				 
			
 
				-            for (int index = 0; index < elems; index++)
			
 
				-            {
			
 
				-                EmitVectorExtractF(context, op.Rn, index, sizeF);
			
 
				+                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), typesMov));
			
 
				 
			
 
				-                if (sizeF == 0)
			
 
				-                {
			
 
				-                    context.EmitLdarg(TranslatedSub.StateArgIdx);
			
 
				+                EmitLdvecWithCastToDouble(context, op.Rn);
			
 
				+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Single), typesCvt));
			
 
				+                context.Emit(OpCodes.Dup);
			
 
				 
			
 
				-                    context.EmitCall(typeof(SoftFloat32_16), nameof(SoftFloat32_16.FPConvert));
			
 
				+                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), typesMov));
			
 
				 
			
 
				-                    context.Emit(OpCodes.Conv_U8);
			
 
				-                    EmitVectorInsertTmp(context, part + index, 1);
			
 
				+                context.EmitCall(typeof(Sse).GetMethod(nameMov, typesMov));
			
 
				+
			
 
				+                context.EmitStvec(op.Rd);
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                int elems = 4 >> sizeF;
			
 
				+
			
 
				+                int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
			
 
				+
			
 
				+                if (part != 0)
			
 
				+                {
			
 
				+                    context.EmitLdvec(op.Rd);
			
 
				+                    context.EmitStvectmp();
			
 
				                 }
			
 
				-                else /* if (sizeF == 1) */
			
 
				+
			
 
				+                for (int index = 0; index < elems; index++)
			
 
				                 {
			
 
				-                    context.Emit(OpCodes.Conv_R4);
			
 
				+                    EmitVectorExtractF(context, op.Rn, index, sizeF);
			
 
				+
			
 
				+                    if (sizeF == 0)
			
 
				+                    {
			
 
				+                        context.EmitLdarg(TranslatedSub.StateArgIdx);
			
 
				+
			
 
				+                        context.EmitCall(typeof(SoftFloat32_16), nameof(SoftFloat32_16.FPConvert));
			
 
				+
			
 
				+                        context.Emit(OpCodes.Conv_U8);
			
 
				+                        EmitVectorInsertTmp(context, part + index, 1);
			
 
				+                    }
			
 
				+                    else /* if (sizeF == 1) */
			
 
				+                    {
			
 
				+                        context.Emit(OpCodes.Conv_R4);
			
 
				 
			
 
				-                    EmitVectorInsertTmpF(context, part + index, 0);
			
 
				+                        EmitVectorInsertTmpF(context, part + index, 0);
			
 
				+                    }
			
 
				                 }
			
 
				-            }
			
 
				 
			
 
				-            context.EmitLdvectmp();
			
 
				-            context.EmitStvec(op.Rd);
			
 
				+                context.EmitLdvectmp();
			
 
				+                context.EmitStvec(op.Rd);
			
 
				 
			
 
				-            if (part == 0)
			
 
				-            {
			
 
				-                EmitVectorZeroUpper(context, op.Rd);
			
 
				+                if (part == 0)
			
 
				+                {
			
 
				+                    EmitVectorZeroUpper(context, op.Rd);
			
 
				+                }
			
 
				             }
			
 
				         }
			
 
				 
			
@@ -260,7 +308,29 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				         public static void Scvtf_V(ILEmitterCtx context)
			
 
				         {
			
 
				-            EmitVectorCvtf(context, signed: true);
			
 
				+            OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
			
 
				+
			
 
				+            int sizeF = op.Size & 1;
			
 
				+
			
 
				+            if (Optimizations.UseSse2 && sizeF == 0)
			
 
				+            {
			
 
				+                Type[] typesCvt = new Type[] { typeof(Vector128<int>) };
			
 
				+
			
 
				+                EmitLdvecWithSignedCast(context, op.Rn, 2);
			
 
				+
			
 
				+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Single), typesCvt));
			
 
				+
			
 
				+                context.EmitStvec(op.Rd);
			
 
				+
			
 
				+                if (op.RegisterSize == RegisterSize.Simd64)
			
 
				+                {
			
 
				+                    EmitVectorZeroUpper(context, op.Rd);
			
 
				+                }
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                EmitVectorCvtf(context, signed: true);
			
 
				+            }
			
 
				         }
			
 
				 
			
 
				         public static void Ucvtf_Gp(ILEmitterCtx context)
			
@@ -441,16 +511,6 @@ namespace ChocolArm64.Instructions
 
				             context.EmitStintzr(op.Rd);
			
 
				         }
			
 
				 
			
 
				-        private static void EmitVectorScvtf(ILEmitterCtx context)
			
 
				-        {
			
 
				-            EmitVectorCvtf(context, true);
			
 
				-        }
			
 
				-
			
 
				-        private static void EmitVectorUcvtf(ILEmitterCtx context)
			
 
				-        {
			
 
				-            EmitVectorCvtf(context, false);
			
 
				-        }
			
 
				-
			
 
				         private static void EmitVectorCvtf(ILEmitterCtx context, bool signed)
			
 
				         {
			
 
				             OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
			
--- a/ChocolArm64/Instructions/InstEmitSimdHelper.cs
+++ b/ChocolArm64/Instructions/InstEmitSimdHelper.cs
@@ -219,7 +219,7 @@ namespace ChocolArm64.Instructions
 
				                 type     = typeof(Sse);
			
 
				                 baseType = typeof(Vector128<float>);
			
 
				             }
			
 
				-            else /* if (SizeF == 1) */
			
 
				+            else /* if (sizeF == 1) */
			
 
				             {
			
 
				                 type     = typeof(Sse2);
			
 
				                 baseType = typeof(Vector128<double>);
			
@@ -249,7 +249,7 @@ namespace ChocolArm64.Instructions
 
				                 {
			
 
				                     EmitVectorZero32_128(context, op.Rd);
			
 
				                 }
			
 
				-                else /* if (SizeF == 1) */
			
 
				+                else /* if (sizeF == 1) */
			
 
				                 {
			
 
				                     EmitVectorZeroUpper(context, op.Rd);
			
 
				                 }
			
@@ -272,7 +272,7 @@ namespace ChocolArm64.Instructions
 
				             {
			
 
				                 mthdInfo = typeof(MathF).GetMethod(name, new Type[] { typeof(float) });
			
 
				             }
			
 
				-            else /* if (SizeF == 1) */
			
 
				+            else /* if (sizeF == 1) */
			
 
				             {
			
 
				                 mthdInfo = typeof(Math).GetMethod(name, new Type[] { typeof(double) });
			
 
				             }
			
@@ -292,7 +292,7 @@ namespace ChocolArm64.Instructions
 
				             {
			
 
				                 mthdInfo = typeof(MathF).GetMethod(name, new Type[] { typeof(float), typeof(float) });
			
 
				             }
			
 
				-            else /* if (SizeF == 1) */
			
 
				+            else /* if (sizeF == 1) */
			
 
				             {
			
 
				                 mthdInfo = typeof(Math).GetMethod(name, new Type[] { typeof(double), typeof(double) });
			
 
				             }
			
@@ -312,7 +312,7 @@ namespace ChocolArm64.Instructions
 
				             {
			
 
				                 mthdInfo = typeof(MathF).GetMethod(nameof(MathF.Round), new Type[] { typeof(float), typeof(MidpointRounding) });
			
 
				             }
			
 
				-            else /* if (SizeF == 1) */
			
 
				+            else /* if (sizeF == 1) */
			
 
				             {
			
 
				                 mthdInfo = typeof(Math).GetMethod(nameof(Math.Round), new Type[] { typeof(double), typeof(MidpointRounding) });
			
 
				             }
			
@@ -334,7 +334,7 @@ namespace ChocolArm64.Instructions
 
				             {
			
 
				                 mthdInfo = typeof(SoftFloat).GetMethod(name, new Type[] { typeof(float) });
			
 
				             }
			
 
				-            else /* if (SizeF == 1) */
			
 
				+            else /* if (sizeF == 1) */
			
 
				             {
			
 
				                 mthdInfo = typeof(SoftFloat).GetMethod(name, new Type[] { typeof(double) });
			
 
				             }
			
@@ -961,7 +961,7 @@ namespace ChocolArm64.Instructions
 
				                 {
			
 
				                     EmitSatQ(context, op.Size, true, true);
			
 
				                 }
			
 
				-                else /* if (Op.Size == 3) */
			
 
				+                else /* if (op.Size == 3) */
			
 
				                 {
			
 
				                     EmitUnarySignedSatQAbsOrNeg(context);
			
 
				                 }
			
@@ -1022,7 +1022,7 @@ namespace ChocolArm64.Instructions
 
				             {
			
 
				                 for (int index = 0; index < elems; index++)
			
 
				                 {
			
 
				-                    EmitVectorExtract(context,                   op.Rn, index, op.Size, signed);
			
 
				+                    EmitVectorExtract(context,                   op.Rn,  index, op.Size, signed);
			
 
				                     EmitVectorExtract(context, ((OpCodeSimdReg64)op).Rm, index, op.Size, signed);
			
 
				 
			
 
				                     if (op.Size <= 2)
			
@@ -1031,13 +1031,13 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				                         EmitSatQ(context, op.Size, true, signed);
			
 
				                     }
			
 
				-                    else /* if (Op.Size == 3) */
			
 
				+                    else /* if (op.Size == 3) */
			
 
				                     {
			
 
				                         if (add)
			
 
				                         {
			
 
				                             EmitBinarySatQAdd(context, signed);
			
 
				                         }
			
 
				-                        else /* if (Sub) */
			
 
				+                        else /* if (sub) */
			
 
				                         {
			
 
				                             EmitBinarySatQSub(context, signed);
			
 
				                         }
			
@@ -1059,7 +1059,7 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				                         EmitSatQ(context, op.Size, true, signed);
			
 
				                     }
			
 
				-                    else /* if (Op.Size == 3) */
			
 
				+                    else /* if (op.Size == 3) */
			
 
				                     {
			
 
				                         EmitBinarySatQAccumulate(context, signed);
			
 
				                     }
			
@@ -1071,7 +1071,7 @@ namespace ChocolArm64.Instructions
 
				             {
			
 
				                 for (int index = 0; index < elems; index++)
			
 
				                 {
			
 
				-                    EmitVectorExtract(context,                   op.Rn, index, op.Size, signed);
			
 
				+                    EmitVectorExtract(context,                   op.Rn,  index, op.Size, signed);
			
 
				                     EmitVectorExtract(context, ((OpCodeSimdReg64)op).Rm, index, op.Size, signed);
			
 
				 
			
 
				                     emit();
			
@@ -1304,52 +1304,64 @@ namespace ChocolArm64.Instructions
 
				             }
			
 
				         }
			
 
				 
			
 
				-        public static void EmitVectorZeroAll(ILEmitterCtx context, int rd)
			
 
				+        public static void EmitVectorZeroAll(ILEmitterCtx context, int reg)
			
 
				         {
			
 
				-            if (Optimizations.UseSse2)
			
 
				+            if (Optimizations.UseSse)
			
 
				             {
			
 
				                 VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
			
 
				 
			
 
				-                context.EmitStvec(rd);
			
 
				+                context.EmitStvec(reg);
			
 
				             }
			
 
				             else
			
 
				             {
			
 
				-                EmitVectorZeroLower(context, rd);
			
 
				-                EmitVectorZeroUpper(context, rd);
			
 
				+                EmitVectorZeroLower(context, reg);
			
 
				+                EmitVectorZeroUpper(context, reg);
			
 
				             }
			
 
				         }
			
 
				 
			
 
				-        public static void EmitVectorZeroLower(ILEmitterCtx context, int rd)
			
 
				+        public static void EmitVectorZeroLower(ILEmitterCtx context, int reg)
			
 
				         {
			
 
				-            EmitVectorInsert(context, rd, 0, 3, 0);
			
 
				+            EmitVectorInsert(context, reg, 0, 3, 0);
			
 
				         }
			
 
				 
			
 
				         public static void EmitVectorZeroLowerTmp(ILEmitterCtx context)
			
 
				         {
			
 
				-            EmitVectorInsertTmp(context, 0, 3, 0);
			
 
				+            if (Optimizations.UseSse)
			
 
				+            {
			
 
				+                context.EmitLdvectmp();
			
 
				+                VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
			
 
				+
			
 
				+                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveHighToLow)));
			
 
				+
			
 
				+                context.EmitStvectmp();
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                EmitVectorInsertTmp(context, 0, 3, 0);
			
 
				+            }
			
 
				         }
			
 
				 
			
 
				         public static void EmitVectorZeroUpper(ILEmitterCtx context, int reg)
			
 
				         {
			
 
				-            if (Optimizations.UseSse2)
			
 
				+            if (Optimizations.UseSse)
			
 
				             {
			
 
				-                //TODO: Use MoveScalar once it is fixed, as of the
			
 
				-                //time of writing it just crashes the JIT.
			
 
				-                EmitLdvecWithUnsignedCast(context, reg, 3);
			
 
				+                //TODO: Use Sse2.MoveScalar once it is fixed,
			
 
				+                //as of the time of writing it just crashes the JIT (SDK 2.1.500).
			
 
				 
			
 
				-                Type[] types = new Type[] { typeof(Vector128<ulong>), typeof(byte) };
			
 
				+                /*Type[] typesMov = new Type[] { typeof(Vector128<ulong>) };
			
 
				 
			
 
				-                //Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.MoveScalar), Types));
			
 
				+                EmitLdvecWithUnsignedCast(context, reg, 3);
			
 
				 
			
 
				-                context.EmitLdc_I4(8);
			
 
				+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.MoveScalar), typesMov));
			
 
				 
			
 
				-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical128BitLane), types));
			
 
				+                EmitStvecWithUnsignedCast(context, reg, 3);*/
			
 
				 
			
 
				-                context.EmitLdc_I4(8);
			
 
				+                context.EmitLdvec(reg);
			
 
				+                VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
			
 
				 
			
 
				-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), types));
			
 
				+                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh)));
			
 
				 
			
 
				-                EmitStvecWithUnsignedCast(context, reg, 3);
			
 
				+                context.EmitStvec(reg);
			
 
				             }
			
 
				             else
			
 
				             {
			
@@ -1359,9 +1371,15 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				         public static void EmitVectorZero32_128(ILEmitterCtx context, int reg)
			
 
				         {
			
 
				+            if (!Sse.IsSupported)
			
 
				+            {
			
 
				+                throw new PlatformNotSupportedException();
			
 
				+            }
			
 
				+
			
 
				+            VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
			
 
				             context.EmitLdvec(reg);
			
 
				 
			
 
				-            VectorHelper.EmitCall(context, nameof(VectorHelper.VectorZero32_128));
			
 
				+            context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveScalar)));
			
 
				 
			
 
				             context.EmitStvec(reg);
			
 
				         }
			
--- a/ChocolArm64/Instructions/InstEmitSimdMove.cs
+++ b/ChocolArm64/Instructions/InstEmitSimdMove.cs
@@ -3,6 +3,7 @@ using ChocolArm64.State;
 
				 using ChocolArm64.Translation;
			
 
				 using System;
			
 
				 using System.Reflection.Emit;
			
 
				+using System.Runtime.Intrinsics;
			
 
				 using System.Runtime.Intrinsics.X86;
			
 
				 
			
 
				 using static ChocolArm64.Instructions.InstEmitSimdHelper;
			
@@ -17,6 +18,8 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				             if (Optimizations.UseSse2)
			
 
				             {
			
 
				+                Type[] typesSav = new Type[] { UIntTypesPerSizeLog2[op.Size] };
			
 
				+
			
 
				                 context.EmitLdintzr(op.Rn);
			
 
				 
			
 
				                 switch (op.Size)
			
@@ -26,16 +29,9 @@ namespace ChocolArm64.Instructions
 
				                     case 2: context.Emit(OpCodes.Conv_U4); break;
			
 
				                 }
			
 
				 
			
 
				-                Type[] types = new Type[] { UIntTypesPerSizeLog2[op.Size] };
			
 
				-
			
 
				-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), types));
			
 
				+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
			
 
				 
			
 
				                 EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
			
 
				-
			
 
				-                if (op.RegisterSize == RegisterSize.Simd64)
			
 
				-                {
			
 
				-                    EmitVectorZeroUpper(context, op.Rd);
			
 
				-                }
			
 
				             }
			
 
				             else
			
 
				             {
			
@@ -48,11 +44,11 @@ namespace ChocolArm64.Instructions
 
				 
			
 
				                     EmitVectorInsert(context, op.Rd, index, op.Size);
			
 
				                 }
			
 
				+            }
			
 
				 
			
 
				-                if (op.RegisterSize == RegisterSize.Simd64)
			
 
				-                {
			
 
				-                    EmitVectorZeroUpper(context, op.Rd);
			
 
				-                }
			
 
				+            if (op.RegisterSize == RegisterSize.Simd64)
			
 
				+            {
			
 
				+                EmitVectorZeroUpper(context, op.Rd);
			
 
				             }
			
 
				         }
			
 
				 
			
@@ -69,14 +65,34 @@ namespace ChocolArm64.Instructions
 
				         {
			
 
				             OpCodeSimdIns64 op = (OpCodeSimdIns64)context.CurrOp;
			
 
				 
			
 
				-            int bytes = op.GetBitsCount() >> 3;
			
 
				-            int elems = bytes >> op.Size;
			
 
				-
			
 
				-            for (int index = 0; index < elems; index++)
			
 
				+            if (Optimizations.UseSse2)
			
 
				             {
			
 
				+                Type[] typesSav = new Type[] { UIntTypesPerSizeLog2[op.Size] };
			
 
				+
			
 
				                 EmitVectorExtractZx(context, op.Rn, op.DstIndex, op.Size);
			
 
				 
			
 
				-                EmitVectorInsert(context, op.Rd, index, op.Size);
			
 
				+                switch (op.Size)
			
 
				+                {
			
 
				+                    case 0: context.Emit(OpCodes.Conv_U1); break;
			
 
				+                    case 1: context.Emit(OpCodes.Conv_U2); break;
			
 
				+                    case 2: context.Emit(OpCodes.Conv_U4); break;
			
 
				+                }
			
 
				+
			
 
				+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
			
 
				+
			
 
				+                EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                int bytes = op.GetBitsCount() >> 3;
			
 
				+                int elems = bytes >> op.Size;
			
 
				+
			
 
				+                for (int index = 0; index < elems; index++)
			
 
				+                {
			
 
				+                    EmitVectorExtractZx(context, op.Rn, op.DstIndex, op.Size);
			
 
				+
			
 
				+                    EmitVectorInsert(context, op.Rd, index, op.Size);
			
 
				+                }
			
 
				             }
			
 
				 
			
 
				             if (op.RegisterSize == RegisterSize.Simd64)
			
@@ -89,32 +105,65 @@ namespace ChocolArm64.Instructions
 
				         {
			
 
				             OpCodeSimdExt64 op = (OpCodeSimdExt64)context.CurrOp;
			
 
				 
			
 
				-            context.EmitLdvec(op.Rd);
			
 
				-            context.EmitStvectmp();
			
 
				+            if (Optimizations.UseSse2)
			
 
				+            {
			
 
				+                Type[] typesShs = new Type[] { typeof(Vector128<byte>), typeof(byte) };
			
 
				+                Type[] typesOr  = new Type[] { typeof(Vector128<byte>), typeof(Vector128<byte>) };
			
 
				 
			
 
				-            int bytes = op.GetBitsCount() >> 3;
			
 
				+                EmitLdvecWithUnsignedCast(context, op.Rn, 0);
			
 
				 
			
 
				-            int position = op.Imm4;
			
 
				+                if (op.RegisterSize == RegisterSize.Simd64)
			
 
				+                {
			
 
				+                    VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
			
 
				 
			
 
				-            for (int index = 0; index < bytes; index++)
			
 
				-            {
			
 
				-                int reg = op.Imm4 + index < bytes ? op.Rn : op.Rm;
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh)));
			
 
				+                }
			
 
				+
			
 
				+                context.EmitLdc_I4(op.Imm4);
			
 
				+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesShs));
			
 
				 
			
 
				-                if (position == bytes)
			
 
				+                EmitLdvecWithUnsignedCast(context, op.Rm, 0);
			
 
				+
			
 
				+                context.EmitLdc_I4((op.RegisterSize == RegisterSize.Simd64 ? 8 : 16) - op.Imm4);
			
 
				+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical128BitLane), typesShs));
			
 
				+
			
 
				+                if (op.RegisterSize == RegisterSize.Simd64)
			
 
				                 {
			
 
				-                    position = 0;
			
 
				+                    VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
			
 
				+
			
 
				+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh)));
			
 
				                 }
			
 
				 
			
 
				-                EmitVectorExtractZx(context, reg, position++, 0);
			
 
				-                EmitVectorInsertTmp(context, index, 0);
			
 
				+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Or), typesOr));
			
 
				+
			
 
				+                EmitStvecWithUnsignedCast(context, op.Rd, 0);
			
 
				             }
			
 
				+            else
			
 
				+            {
			
 
				+                int bytes = op.GetBitsCount() >> 3;
			
 
				 
			
 
				-            context.EmitLdvectmp();
			
 
				-            context.EmitStvec(op.Rd);
			
 
				+                int position = op.Imm4;
			
 
				 
			
 
				-            if (op.RegisterSize == RegisterSize.Simd64)
			
 
				-            {
			
 
				-                EmitVectorZeroUpper(context, op.Rd);
			
 
				+                for (int index = 0; index < bytes; index++)
			
 
				+                {
			
 
				+                    int reg = op.Imm4 + index < bytes ? op.Rn : op.Rm;
			
 
				+
			
 
				+                    if (position == bytes)
			
 
				+                    {
			
 
				+                        position = 0;
			
 
				+                    }
			
 
				+
			
 
				+                    EmitVectorExtractZx(context, reg, position++, 0);
			
 
				+                    EmitVectorInsertTmp(context, index, 0);
			
 
				+                }
			
 
				+
			
 
				+                context.EmitLdvectmp();
			
 
				+                context.EmitStvec(op.Rd);
			
 
				+
			
 
				+                if (op.RegisterSize == RegisterSize.Simd64)
			
 
				+                {
			
 
				+                    EmitVectorZeroUpper(context, op.Rd);
			
 
				+                }
			
 
				             }
			
 
				         }
			
 
				 
			
--- a/ChocolArm64/Instructions/SoftFloat.cs
+++ b/ChocolArm64/Instructions/SoftFloat.cs
@@ -789,6 +789,43 @@ namespace ChocolArm64.Instructions
 
				             return result;
			
 
				         }
			
 
				 
			
 
				+        public static int FPCompare(float value1, float value2, bool signalNaNs, CpuThreadState state)
			
 
				+        {
			
 
				+            Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat32.FPCompare: state.Fpcr = 0x{state.Fpcr:X8}");
			
 
				+
			
 
				+            value1 = value1.FPUnpack(out FpType type1, out bool sign1, out _, state);
			
 
				+            value2 = value2.FPUnpack(out FpType type2, out bool sign2, out _, state);
			
 
				+
			
 
				+            int result;
			
 
				+
			
 
				+            if (type1 == FpType.SNaN || type1 == FpType.QNaN || type2 == FpType.SNaN || type2 == FpType.QNaN)
			
 
				+            {
			
 
				+                result = 0b0011;
			
 
				+
			
 
				+                if (type1 == FpType.SNaN || type2 == FpType.SNaN || signalNaNs)
			
 
				+                {
			
 
				+                    FPProcessException(FpExc.InvalidOp, state);
			
 
				+                }
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                if (value1 == value2)
			
 
				+                {
			
 
				+                    result = 0b0110;
			
 
				+                }
			
 
				+                else if (value1 < value2)
			
 
				+                {
			
 
				+                    result = 0b1000;
			
 
				+                }
			
 
				+                else
			
 
				+                {
			
 
				+                    result = 0b0010;
			
 
				+                }
			
 
				+            }
			
 
				+
			
 
				+            return result;
			
 
				+        }
			
 
				+
			
 
				         public static float FPDiv(float value1, float value2, CpuThreadState state)
			
 
				         {
			
 
				             Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat32.FPDiv: state.Fpcr = 0x{state.Fpcr:X8}");
			
@@ -1584,6 +1621,43 @@ namespace ChocolArm64.Instructions
 
				             return result;
			
 
				         }
			
 
				 
			
 
				+        public static int FPCompare(double value1, double value2, bool signalNaNs, CpuThreadState state)
			
 
				+        {
			
 
				+            Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat64.FPCompare: state.Fpcr = 0x{state.Fpcr:X8}");
			
 
				+
			
 
				+            value1 = value1.FPUnpack(out FpType type1, out bool sign1, out _, state);
			
 
				+            value2 = value2.FPUnpack(out FpType type2, out bool sign2, out _, state);
			
 
				+
			
 
				+            int result;
			
 
				+
			
 
				+            if (type1 == FpType.SNaN || type1 == FpType.QNaN || type2 == FpType.SNaN || type2 == FpType.QNaN)
			
 
				+            {
			
 
				+                result = 0b0011;
			
 
				+
			
 
				+                if (type1 == FpType.SNaN || type2 == FpType.SNaN || signalNaNs)
			
 
				+                {
			
 
				+                    FPProcessException(FpExc.InvalidOp, state);
			
 
				+                }
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                if (value1 == value2)
			
 
				+                {
			
 
				+                    result = 0b0110;
			
 
				+                }
			
 
				+                else if (value1 < value2)
			
 
				+                {
			
 
				+                    result = 0b1000;
			
 
				+                }
			
 
				+                else
			
 
				+                {
			
 
				+                    result = 0b0010;
			
 
				+                }
			
 
				+            }
			
 
				+
			
 
				+            return result;
			
 
				+        }
			
 
				+
			
 
				         public static double FPDiv(double value1, double value2, CpuThreadState state)
			
 
				         {
			
 
				             Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat64.FPDiv: state.Fpcr = 0x{state.Fpcr:X8}");
			
--- a/ChocolArm64/Instructions/VectorHelper.cs
+++ b/ChocolArm64/Instructions/VectorHelper.cs
@@ -9,18 +9,6 @@ namespace ChocolArm64.Instructions
 
				 {
			
 
				     static class VectorHelper
			
 
				     {
			
 
				-        private static readonly Vector128<float> Zero32128Mask;
			
 
				-
			
 
				-        static VectorHelper()
			
 
				-        {
			
 
				-            if (!Sse2.IsSupported)
			
 
				-            {
			
 
				-                throw new PlatformNotSupportedException();
			
 
				-            }
			
 
				-
			
 
				-            Zero32128Mask = Sse.StaticCast<uint, float>(Sse2.SetVector128(0, 0, 0, 0xffffffff));
			
 
				-        }
			
 
				-
			
 
				         public static void EmitCall(ILEmitterCtx context, string name64, string name128)
			
 
				         {
			
 
				             bool isSimd64 = context.CurrOp.RegisterSize == RegisterSize.Simd64;
			
@@ -491,7 +479,7 @@ namespace ChocolArm64.Instructions
 
				             {
			
 
				                 int intValue = BitConverter.SingleToInt32Bits(value);
			
 
				 
			
 
				-                ushort low  = (ushort)(intValue >> 0);
			
 
				+                ushort low  = (ushort)(intValue >>  0);
			
 
				                 ushort high = (ushort)(intValue >> 16);
			
 
				 
			
 
				                 Vector128<ushort> shortVector = Sse.StaticCast<float, ushort>(vector);
			
@@ -578,17 +566,6 @@ namespace ChocolArm64.Instructions
 
				             throw new PlatformNotSupportedException();
			
 
				         }
			
 
				 
			
 
				-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
			
 
				-        public static Vector128<float> VectorZero32_128(Vector128<float> vector)
			
 
				-        {
			
 
				-            if (Sse.IsSupported)
			
 
				-            {
			
 
				-                return Sse.And(vector, Zero32128Mask);
			
 
				-            }
			
 
				-
			
 
				-            throw new PlatformNotSupportedException();
			
 
				-        }
			
 
				-
			
 
				         [MethodImpl(MethodImplOptions.AggressiveInlining)]
			
 
				         public static Vector128<sbyte> VectorSingleToSByte(Vector128<float> vector)
			
 
				         {
			
--- a/ChocolArm64/OpCodeTable.cs
+++ b/ChocolArm64/OpCodeTable.cs
@@ -216,9 +216,9 @@ namespace ChocolArm64
 
				             SetA64("01011110111xxxxx100011xxxxxxxxxx", InstEmit.Cmtst_S,       typeof(OpCodeSimdReg64));
			
 
				             SetA64("0>001110<<1xxxxx100011xxxxxxxxxx", InstEmit.Cmtst_V,       typeof(OpCodeSimdReg64));
			
 
				             SetA64("0x00111000100000010110xxxxxxxxxx", InstEmit.Cnt_V,         typeof(OpCodeSimd64));
			
 
				-            SetA64("0x001110000xxxxx000011xxxxxxxxxx", InstEmit.Dup_Gp,        typeof(OpCodeSimdIns64));
			
 
				+            SetA64("0>001110000x<>>>000011xxxxxxxxxx", InstEmit.Dup_Gp,        typeof(OpCodeSimdIns64));
			
 
				             SetA64("01011110000xxxxx000001xxxxxxxxxx", InstEmit.Dup_S,         typeof(OpCodeSimdIns64));
			
 
				-            SetA64("0x001110000xxxxx000001xxxxxxxxxx", InstEmit.Dup_V,         typeof(OpCodeSimdIns64));
			
 
				+            SetA64("0>001110000x<>>>000001xxxxxxxxxx", InstEmit.Dup_V,         typeof(OpCodeSimdIns64));
			
 
				             SetA64("0x101110001xxxxx000111xxxxxxxxxx", InstEmit.Eor_V,         typeof(OpCodeSimdReg64));
			
 
				             SetA64("0>101110000xxxxx0<xxx0xxxxxxxxxx", InstEmit.Ext_V,         typeof(OpCodeSimdExt64));
			
 
				             SetA64("011111101x1xxxxx110101xxxxxxxxxx", InstEmit.Fabd_S,        typeof(OpCodeSimdReg64));
			
@@ -384,9 +384,9 @@ namespace ChocolArm64
 
				             SetA64("0x001110<<1xxxxx000000xxxxxxxxxx", InstEmit.Saddl_V,       typeof(OpCodeSimdReg64));
			
 
				             SetA64("0x001110<<100000001010xxxxxxxxxx", InstEmit.Saddlp_V,      typeof(OpCodeSimd64));
			
 
				             SetA64("0x001110<<1xxxxx000100xxxxxxxxxx", InstEmit.Saddw_V,       typeof(OpCodeSimdReg64));
			
 
				-            SetA64("x0011110xx100010000000xxxxxxxxxx", InstEmit.Scvtf_Gp,      typeof(OpCodeSimdCvt64));
			
 
				+            SetA64("x00111100x100010000000xxxxxxxxxx", InstEmit.Scvtf_Gp,      typeof(OpCodeSimdCvt64));
			
 
				             SetA64("010111100x100001110110xxxxxxxxxx", InstEmit.Scvtf_S,       typeof(OpCodeSimd64));
			
 
				-            SetA64("0x0011100x100001110110xxxxxxxxxx", InstEmit.Scvtf_V,       typeof(OpCodeSimd64));
			
 
				+            SetA64("0>0011100<100001110110xxxxxxxxxx", InstEmit.Scvtf_V,       typeof(OpCodeSimd64));
			
 
				             SetA64("01011110000xxxxx000000xxxxxxxxxx", InstEmit.Sha1c_V,       typeof(OpCodeSimdReg64));
			
 
				             SetA64("0101111000101000000010xxxxxxxxxx", InstEmit.Sha1h_V,       typeof(OpCodeSimd64));
			
 
				             SetA64("01011110000xxxxx001000xxxxxxxxxx", InstEmit.Sha1m_V,       typeof(OpCodeSimdReg64));
			
@@ -486,9 +486,9 @@ namespace ChocolArm64
 
				             SetA64("001011100x110000001110xxxxxxxxxx", InstEmit.Uaddlv_V,      typeof(OpCodeSimd64));
			
 
				             SetA64("01101110<<110000001110xxxxxxxxxx", InstEmit.Uaddlv_V,      typeof(OpCodeSimd64));
			
 
				             SetA64("0x101110<<1xxxxx000100xxxxxxxxxx", InstEmit.Uaddw_V,       typeof(OpCodeSimdReg64));
			
 
				-            SetA64("x0011110xx100011000000xxxxxxxxxx", InstEmit.Ucvtf_Gp,      typeof(OpCodeSimdCvt64));
			
 
				+            SetA64("x00111100x100011000000xxxxxxxxxx", InstEmit.Ucvtf_Gp,      typeof(OpCodeSimdCvt64));
			
 
				             SetA64("011111100x100001110110xxxxxxxxxx", InstEmit.Ucvtf_S,       typeof(OpCodeSimd64));
			
 
				-            SetA64("0x1011100x100001110110xxxxxxxxxx", InstEmit.Ucvtf_V,       typeof(OpCodeSimd64));
			
 
				+            SetA64("0>1011100<100001110110xxxxxxxxxx", InstEmit.Ucvtf_V,       typeof(OpCodeSimd64));
			
 
				             SetA64("0x101110<<1xxxxx000001xxxxxxxxxx", InstEmit.Uhadd_V,       typeof(OpCodeSimdReg64));
			
 
				             SetA64("0x101110<<1xxxxx001001xxxxxxxxxx", InstEmit.Uhsub_V,       typeof(OpCodeSimdReg64));
			
 
				             SetA64("0x101110<<1xxxxx011001xxxxxxxxxx", InstEmit.Umax_V,        typeof(OpCodeSimdReg64));
			
--- a/Ryujinx.Tests/Cpu/CpuTest.cs
+++ b/Ryujinx.Tests/Cpu/CpuTest.cs
@@ -333,7 +333,6 @@ namespace Ryujinx.Tests.Cpu
 
				             Assert.That(_thread.ThreadState.V29, Is.EqualTo(_unicornEmu.Q[29]));
			
 
				             Assert.That(_thread.ThreadState.V30, Is.EqualTo(_unicornEmu.Q[30]));
			
 
				             Assert.That(_thread.ThreadState.V31, Is.EqualTo(_unicornEmu.Q[31]));
			
 
				-            Assert.That(_thread.ThreadState.V31, Is.EqualTo(_unicornEmu.Q[31]));
			
 
				 
			
 
				             Assert.That(_thread.ThreadState.Fpcr,                 Is.EqualTo(_unicornEmu.Fpcr));
			
 
				             Assert.That(_thread.ThreadState.Fpsr & (int)fpsrMask, Is.EqualTo(_unicornEmu.Fpsr & (int)fpsrMask));
			
--- a/Ryujinx.Tests/Cpu/CpuTestSimd.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimd.cs
@@ -39,6 +39,18 @@ namespace Ryujinx.Tests.Cpu
 
				                                  0x8000000000000000ul, 0xFFFFFFFFFFFFFFFFul };
			
 
				         }
			
 
				 
			
 
				+        private static ulong[] _1S_()
			
 
				+        {
			
 
				+            return new ulong[] { 0x0000000000000000ul, 0x000000007FFFFFFFul,
			
 
				+                                 0x0000000080000000ul, 0x00000000FFFFFFFFul };
			
 
				+        }
			
 
				+
			
 
				+        private static ulong[] _2S_()
			
 
				+        {
			
 
				+            return new ulong[] { 0x0000000000000000ul, 0x7FFFFFFF7FFFFFFFul,
			
 
				+                                 0x8000000080000000ul, 0xFFFFFFFFFFFFFFFFul };
			
 
				+        }
			
 
				+
			
 
				         private static ulong[] _4H2S1D_()
			
 
				         {
			
 
				             return new ulong[] { 0x0000000000000000ul, 0x7FFF7FFF7FFF7FFFul,
			
@@ -244,6 +256,24 @@ namespace Ryujinx.Tests.Cpu
 
				 #endregion
			
 
				 
			
 
				 #region "ValueSource (Opcodes)"
			
 
				+        private static uint[] _F_Cmp_Cmpe_S_S_()
			
 
				+        {
			
 
				+            return new uint[]
			
 
				+            {
			
 
				+                0x1E202028u, // FCMP  S1, #0.0
			
 
				+                0x1E202038u  // FCMPE S1, #0.0
			
 
				+            };
			
 
				+        }
			
 
				+
			
 
				+        private static uint[] _F_Cmp_Cmpe_S_D_()
			
 
				+        {
			
 
				+            return new uint[]
			
 
				+            {
			
 
				+                0x1E602028u, // FCMP  D1, #0.0
			
 
				+                0x1E602038u  // FCMPE D1, #0.0
			
 
				+            };
			
 
				+        }
			
 
				+
			
 
				         private static uint[] _F_Cvt_S_SD_()
			
 
				         {
			
 
				             return new uint[]
			
@@ -336,37 +366,81 @@ namespace Ryujinx.Tests.Cpu
 
				             };
			
 
				         }
			
 
				 
			
 
				-        private static uint[] _F_Recpx_Sqrt_S_S_()
			
 
				+        private static uint[] _F_Abs_Neg_Recpx_Sqrt_S_S_()
			
 
				         {
			
 
				             return new uint[]
			
 
				             {
			
 
				+                0x1E20C020u, // FABS   S0, S1
			
 
				+                0x1E214020u, // FNEG   S0, S1
			
 
				                 0x5EA1F820u, // FRECPX S0, S1
			
 
				                 0x1E21C020u  // FSQRT  S0, S1
			
 
				             };
			
 
				         }
			
 
				 
			
 
				-        private static uint[] _F_Recpx_Sqrt_S_D_()
			
 
				+        private static uint[] _F_Abs_Neg_Recpx_Sqrt_S_D_()
			
 
				         {
			
 
				             return new uint[]
			
 
				             {
			
 
				+                0x1E60C020u, // FABS   D0, D1
			
 
				+                0x1E614020u, // FNEG   D0, D1
			
 
				                 0x5EE1F820u, // FRECPX D0, D1
			
 
				                 0x1E61C020u  // FSQRT  D0, D1
			
 
				             };
			
 
				         }
			
 
				 
			
 
				-        private static uint[] _F_Sqrt_V_2S_4S_()
			
 
				+        private static uint[] _F_Abs_Neg_Sqrt_V_2S_4S_()
			
 
				         {
			
 
				             return new uint[]
			
 
				             {
			
 
				-                0x2EA1F800u // FSQRT V0.2S, V0.2S
			
 
				+                0x0EA0F800u, // FABS  V0.2S, V0.2S
			
 
				+                0x2EA0F800u, // FNEG  V0.2S, V0.2S
			
 
				+                0x2EA1F800u  // FSQRT V0.2S, V0.2S
			
 
				             };
			
 
				         }
			
 
				 
			
 
				-        private static uint[] _F_Sqrt_V_2D_()
			
 
				+        private static uint[] _F_Abs_Neg_Sqrt_V_2D_()
			
 
				         {
			
 
				             return new uint[]
			
 
				             {
			
 
				-                0x6EE1F800u // FSQRT V0.2D, V0.2D
			
 
				+                0x4EE0F800u, // FABS  V0.2D, V0.2D
			
 
				+                0x6EE0F800u, // FNEG  V0.2D, V0.2D
			
 
				+                0x6EE1F800u  // FSQRT V0.2D, V0.2D
			
 
				+            };
			
 
				+        }
			
 
				+
			
 
				+        private static uint[] _SU_Cvt_F_S_S_()
			
 
				+        {
			
 
				+            return new uint[]
			
 
				+            {
			
 
				+                0x5E21D820u, // SCVTF S0, S1
			
 
				+                0x7E21D820u  // UCVTF S0, S1
			
 
				+            };
			
 
				+        }
			
 
				+
			
 
				+        private static uint[] _SU_Cvt_F_S_D_()
			
 
				+        {
			
 
				+            return new uint[]
			
 
				+            {
			
 
				+                0x5E61D820u, // SCVTF D0, D1
			
 
				+                0x7E61D820u  // UCVTF D0, D1
			
 
				+            };
			
 
				+        }
			
 
				+
			
 
				+        private static uint[] _SU_Cvt_F_V_2S_4S_()
			
 
				+        {
			
 
				+            return new uint[]
			
 
				+            {
			
 
				+                0x0E21D800u, // SCVTF V0.2S, V0.2S
			
 
				+                0x2E21D800u  // UCVTF V0.2S, V0.2S
			
 
				+            };
			
 
				+        }
			
 
				+
			
 
				+        private static uint[] _SU_Cvt_F_V_2D_()
			
 
				+        {
			
 
				+            return new uint[]
			
 
				+            {
			
 
				+                0x4E61D800u, // SCVTF V0.2D, V0.2D
			
 
				+                0x6E61D800u  // UCVTF V0.2D, V0.2D
			
 
				             };
			
 
				         }
			
 
				 
			
@@ -889,6 +963,38 @@ namespace Ryujinx.Tests.Cpu
 
				             CompareAgainstUnicorn();
			
 
				         }
			
 
				 
			
 
				+        [Test, Pairwise] [Explicit]
			
 
				+        public void F_Cmp_Cmpe_S_S([ValueSource("_F_Cmp_Cmpe_S_S_")] uint opcodes,
			
 
				+                                   [ValueSource("_1S_F_")] ulong a)
			
 
				+        {
			
 
				+            Vector128<float> v1 = MakeVectorE0(a);
			
 
				+
			
 
				+            bool v = TestContext.CurrentContext.Random.NextBool();
			
 
				+            bool c = TestContext.CurrentContext.Random.NextBool();
			
 
				+            bool z = TestContext.CurrentContext.Random.NextBool();
			
 
				+            bool n = TestContext.CurrentContext.Random.NextBool();
			
 
				+
			
 
				+            SingleOpcode(opcodes, v1: v1, overflow: v, carry: c, zero: z, negative: n);
			
 
				+
			
 
				+            CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc);
			
 
				+        }
			
 
				+
			
 
				+        [Test, Pairwise] [Explicit]
			
 
				+        public void F_Cmp_Cmpe_S_D([ValueSource("_F_Cmp_Cmpe_S_D_")] uint opcodes,
			
 
				+                                   [ValueSource("_1D_F_")] ulong a)
			
 
				+        {
			
 
				+            Vector128<float> v1 = MakeVectorE0(a);
			
 
				+
			
 
				+            bool v = TestContext.CurrentContext.Random.NextBool();
			
 
				+            bool c = TestContext.CurrentContext.Random.NextBool();
			
 
				+            bool z = TestContext.CurrentContext.Random.NextBool();
			
 
				+            bool n = TestContext.CurrentContext.Random.NextBool();
			
 
				+
			
 
				+            SingleOpcode(opcodes, v1: v1, overflow: v, carry: c, zero: z, negative: n);
			
 
				+
			
 
				+            CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc);
			
 
				+        }
			
 
				+
			
 
				         [Test, Pairwise] [Explicit]
			
 
				         public void F_Cvt_S_SD([ValueSource("_F_Cvt_S_SD_")] uint opcodes,
			
 
				                                [ValueSource("_1S_F_")] ulong a)
			
@@ -1070,12 +1176,12 @@ namespace Ryujinx.Tests.Cpu
 
				         }
			
 
				 
			
 
				         [Test, Pairwise] [Explicit]
			
 
				-        public void F_Recpx_Sqrt_S_S([ValueSource("_F_Recpx_Sqrt_S_S_")] uint opcodes,
			
 
				-                                     [ValueSource("_1S_F_")] ulong a)
			
 
				+        public void F_Abs_Neg_Recpx_Sqrt_S_S([ValueSource("_F_Abs_Neg_Recpx_Sqrt_S_S_")] uint opcodes,
			
 
				+                                             [ValueSource("_1S_F_")] ulong a)
			
 
				         {
			
 
				             ulong z = TestContext.CurrentContext.Random.NextULong();
			
 
				             Vector128<float> v0 = MakeVectorE0E1(z, z);
			
 
				-            Vector128<float> v1 = MakeVectorE0(a);
			
 
				+            Vector128<float> v1 = MakeVectorE0E1(a, z);
			
 
				 
			
 
				             int rnd = (int)TestContext.CurrentContext.Random.NextUInt();
			
 
				 
			
@@ -1088,12 +1194,12 @@ namespace Ryujinx.Tests.Cpu
 
				         }
			
 
				 
			
 
				         [Test, Pairwise] [Explicit]
			
 
				-        public void F_Recpx_Sqrt_S_D([ValueSource("_F_Recpx_Sqrt_S_D_")] uint opcodes,
			
 
				-                                     [ValueSource("_1D_F_")] ulong a)
			
 
				+        public void F_Abs_Neg_Recpx_Sqrt_S_D([ValueSource("_F_Abs_Neg_Recpx_Sqrt_S_D_")] uint opcodes,
			
 
				+                                             [ValueSource("_1D_F_")] ulong a)
			
 
				         {
			
 
				             ulong z = TestContext.CurrentContext.Random.NextULong();
			
 
				             Vector128<float> v0 = MakeVectorE1(z);
			
 
				-            Vector128<float> v1 = MakeVectorE0(a);
			
 
				+            Vector128<float> v1 = MakeVectorE0E1(a, z);
			
 
				 
			
 
				             int rnd = (int)TestContext.CurrentContext.Random.NextUInt();
			
 
				 
			
@@ -1106,12 +1212,12 @@ namespace Ryujinx.Tests.Cpu
 
				         }
			
 
				 
			
 
				         [Test, Pairwise] [Explicit]
			
 
				-        public void F_Sqrt_V_2S_4S([ValueSource("_F_Sqrt_V_2S_4S_")] uint opcodes,
			
 
				-                                   [Values(0u)]     uint rd,
			
 
				-                                   [Values(1u, 0u)] uint rn,
			
 
				-                                   [ValueSource("_2S_F_")] ulong z,
			
 
				-                                   [ValueSource("_2S_F_")] ulong a,
			
 
				-                                   [Values(0b0u, 0b1u)] uint q) // <2S, 4S>
			
 
				+        public void F_Abs_Neg_Sqrt_V_2S_4S([ValueSource("_F_Abs_Neg_Sqrt_V_2S_4S_")] uint opcodes,
			
 
				+                                           [Values(0u)]     uint rd,
			
 
				+                                           [Values(1u, 0u)] uint rn,
			
 
				+                                           [ValueSource("_2S_F_")] ulong z,
			
 
				+                                           [ValueSource("_2S_F_")] ulong a,
			
 
				+                                           [Values(0b0u, 0b1u)] uint q) // <2S, 4S>
			
 
				         {
			
 
				             opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0);
			
 
				             opcodes |= ((q & 1) << 30);
			
@@ -1130,11 +1236,11 @@ namespace Ryujinx.Tests.Cpu
 
				         }
			
 
				 
			
 
				         [Test, Pairwise] [Explicit]
			
 
				-        public void F_Sqrt_V_2D([ValueSource("_F_Sqrt_V_2D_")] uint opcodes,
			
 
				-                                [Values(0u)]     uint rd,
			
 
				-                                [Values(1u, 0u)] uint rn,
			
 
				-                                [ValueSource("_1D_F_")] ulong z,
			
 
				-                                [ValueSource("_1D_F_")] ulong a)
			
 
				+        public void F_Abs_Neg_Sqrt_V_2D([ValueSource("_F_Abs_Neg_Sqrt_V_2D_")] uint opcodes,
			
 
				+                                        [Values(0u)]     uint rd,
			
 
				+                                        [Values(1u, 0u)] uint rn,
			
 
				+                                        [ValueSource("_1D_F_")] ulong z,
			
 
				+                                        [ValueSource("_1D_F_")] ulong a)
			
 
				         {
			
 
				             opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0);
			
 
				 
			
@@ -1460,6 +1566,68 @@ namespace Ryujinx.Tests.Cpu
 
				             CompareAgainstUnicorn();
			
 
				         }
			
 
				 
			
 
				+        [Test, Pairwise] [Explicit] // Runs each opcode from _SU_Cvt_F_S_S_ on input `a`; [Explicit] => only runs when selected.
			
 
				+        public void SU_Cvt_F_S_S([ValueSource("_SU_Cvt_F_S_S_")] uint opcodes,
			
 
				+                                 [ValueSource("_1S_")] [Random(RndCnt)] ulong a)
			
 
				+        {
			
 
				+            ulong z = TestContext.CurrentContext.Random.NextULong(); // Random fill value for v0.
			
 
				+            Vector128<float> v0 = MakeVectorE0E1(z, z); // Same random value passed for both arguments.
			
 
				+            Vector128<float> v1 = MakeVectorE0(a); // Test input goes into v1.
			
 
				+
			
 
				+            SingleOpcode(opcodes, v0: v0, v1: v1); // Hands the opcode and the initial v0/v1 values to the SingleOpcode helper.
			
 
				+
			
 
				+            CompareAgainstUnicorn(); // Helper defined outside this hunk; NOTE(review): presumably checks state against Unicorn - confirm.
			
 
				+        }
			
 
				+
			
 
				+        [Test, Pairwise] [Explicit] // Runs each opcode from _SU_Cvt_F_S_D_ on input `a`; [Explicit] => only runs when selected.
			
 
				+        public void SU_Cvt_F_S_D([ValueSource("_SU_Cvt_F_S_D_")] uint opcodes,
			
 
				+                                 [ValueSource("_1D_")] [Random(RndCnt)] ulong a)
			
 
				+        {
			
 
				+            ulong z = TestContext.CurrentContext.Random.NextULong(); // Random fill value for v0.
			
 
				+            Vector128<float> v0 = MakeVectorE1(z); // Unlike the _S_S variant, only MakeVectorE1 is used for v0 here.
			
 
				+            Vector128<float> v1 = MakeVectorE0(a); // Test input goes into v1.
			
 
				+
			
 
				+            SingleOpcode(opcodes, v0: v0, v1: v1); // Hands the opcode and the initial v0/v1 values to the SingleOpcode helper.
			
 
				+
			
 
				+            CompareAgainstUnicorn(); // Helper defined outside this hunk; NOTE(review): presumably checks state against Unicorn - confirm.
			
 
				+        }
			
 
				+
			
 
				+        [Test, Pairwise] [Explicit] // Vector test over the _SU_Cvt_F_V_2S_4S_ opcodes; [Explicit] => only runs when selected.
			
 
				+        public void SU_Cvt_F_V_2S_4S([ValueSource("_SU_Cvt_F_V_2S_4S_")] uint opcodes,
			
 
				+                                     [Values(0u)]     uint rd,
			
 
				+                                     [Values(1u, 0u)] uint rn,
			
 
				+                                     [ValueSource("_2S_")] [Random(RndCnt)] ulong z,
			
 
				+                                     [ValueSource("_2S_")] [Random(RndCnt)] ulong a,
			
 
				+                                     [Values(0b0u, 0b1u)] uint q) // <2S, 4S>
			
 
				+        {
			
 
				+            opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); // 5-bit rn at bits 9:5, 5-bit rd at bits 4:0.
			
 
				+            opcodes |= ((q & 1) << 30); // q is placed at bit 30.
			
 
				+
			
 
				+            Vector128<float> v0 = MakeVectorE0E1(z, z);
			
 
				+            Vector128<float> v1 = MakeVectorE0E1(a, a * q); // Second argument is 0 when q == 0 and `a` when q == 1.
			
 
				+
			
 
				+            SingleOpcode(opcodes, v0: v0, v1: v1); // Hands the patched opcode and initial v0/v1 values to the SingleOpcode helper.
			
 
				+
			
 
				+            CompareAgainstUnicorn(); // Helper defined outside this hunk; NOTE(review): presumably checks state against Unicorn - confirm.
			
 
				+        }
			
 
				+
			
 
				+        [Test, Pairwise] [Explicit] // Vector test over the _SU_Cvt_F_V_2D_ opcodes; [Explicit] => only runs when selected.
			
 
				+        public void SU_Cvt_F_V_2D([ValueSource("_SU_Cvt_F_V_2D_")] uint opcodes,
			
 
				+                                  [Values(0u)]     uint rd,
			
 
				+                                  [Values(1u, 0u)] uint rn,
			
 
				+                                  [ValueSource("_1D_")] [Random(RndCnt)] ulong z,
			
 
				+                                  [ValueSource("_1D_")] [Random(RndCnt)] ulong a)
			
 
				+        {
			
 
				+            opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); // 5-bit rn at bits 9:5, 5-bit rd at bits 4:0.
			
 
				+
			
 
				+            Vector128<float> v0 = MakeVectorE0E1(z, z);
			
 
				+            Vector128<float> v1 = MakeVectorE0E1(a, a); // Same input value passed for both arguments.
			
 
				+
			
 
				+            SingleOpcode(opcodes, v0: v0, v1: v1); // Hands the patched opcode and initial v0/v1 values to the SingleOpcode helper.
			
 
				+
			
 
				+            CompareAgainstUnicorn(); // Helper defined outside this hunk; NOTE(review): presumably checks state against Unicorn - confirm.
			
 
				+        }
			
 
				+
			
 
				         [Test, Pairwise]
			
 
				         public void Sha1h_Sha1su1_V([ValueSource("_Sha1h_Sha1su1_V_")] uint opcodes,
			
 
				                                     [Values(0u)]     uint rd,
			
--- a/Ryujinx.Tests/Cpu/CpuTestSimdExt.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdExt.cs
@@ -0,0 +1,73 @@
 
				+#define SimdExt
			
 
				+
			
 
				+using NUnit.Framework;
			
 
				+
			
 
				+using System.Runtime.Intrinsics;
			
 
				+
			
 
				+namespace Ryujinx.Tests.Cpu
			
 
				+{
			
 
				+    [Category("SimdExt")] // NUnit fixture holding the EXT (8B and 16B) instruction tests.
			
 
				+    public sealed class CpuTestSimdExt : CpuTest
			
 
				+    {
			
 
				+#if SimdExt // Symbol is #define'd at the top of this file.
			
 
				+
			
 
				+#region "ValueSource"
			
 
				+        private static ulong[] _8B_() // Fixed 64-bit patterns: every byte 0x00, 0x7F, 0x80 or 0xFF.
			
 
				+        {
			
 
				+            return new ulong[] { 0x0000000000000000ul, 0x7F7F7F7F7F7F7F7Ful,
			
 
				+                                 0x8080808080808080ul, 0xFFFFFFFFFFFFFFFFul };
			
 
				+        }
			
 
				+#endregion
			
 
				+
			
 
				+        private const int RndCnt = 2; // Count passed to the [Random(...)] attributes below.
			
 
				+
			
 
				+        [Test, Pairwise, Description("EXT <Vd>.8B, <Vn>.8B, <Vm>.8B, #<index>")]
			
 
				+        public void Ext_V_8B([Values(0u)]     uint rd,
			
 
				+                             [Values(1u, 0u)] uint rn,
			
 
				+                             [Values(2u, 0u)] uint rm,
			
 
				+                             [ValueSource("_8B_")] [Random(RndCnt)] ulong z,
			
 
				+                             [ValueSource("_8B_")] [Random(RndCnt)] ulong a,
			
 
				+                             [ValueSource("_8B_")] [Random(RndCnt)] ulong b,
			
 
				+                             [Range(0u, 7u)] uint index)
			
 
				+        {
			
 
				+            uint imm4 = index & 0x7u; // Index limited to 3 bits (0..7) for the 8B form.
			
 
				+
			
 
				+            uint opcode = 0x2E000000; // EXT V0.8B, V0.8B, V0.8B, #0
			
 
				+            opcode |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); // rm at bits 20:16, rn at bits 9:5, rd at bits 4:0.
			
 
				+            opcode |= (imm4 << 11); // Index field starts at bit 11.
			
 
				+
			
 
				+            Vector128<float> v0 = MakeVectorE0E1(z, z);
			
 
				+            Vector128<float> v1 = MakeVectorE0(a); // 8B form: only MakeVectorE0 is used for the inputs.
			
 
				+            Vector128<float> v2 = MakeVectorE0(b);
			
 
				+
			
 
				+            SingleOpcode(opcode, v0: v0, v1: v1, v2: v2); // Hands the opcode and initial v0..v2 values to the SingleOpcode helper.
			
 
				+
			
 
				+            CompareAgainstUnicorn(); // Helper not defined in this file; NOTE(review): presumably checks state against Unicorn - confirm.
			
 
				+        }
			
 
				+
			
 
				+        [Test, Pairwise, Description("EXT <Vd>.16B, <Vn>.16B, <Vm>.16B, #<index>")]
			
 
				+        public void Ext_V_16B([Values(0u)]     uint rd,
			
 
				+                              [Values(1u, 0u)] uint rn,
			
 
				+                              [Values(2u, 0u)] uint rm,
			
 
				+                              [ValueSource("_8B_")] [Random(RndCnt)] ulong z,
			
 
				+                              [ValueSource("_8B_")] [Random(RndCnt)] ulong a,
			
 
				+                              [ValueSource("_8B_")] [Random(RndCnt)] ulong b,
			
 
				+                              [Range(0u, 15u)] uint index)
			
 
				+        {
			
 
				+            uint imm4 = index & 0xFu; // Index limited to 4 bits (0..15) for the 16B form.
			
 
				+
			
 
				+            uint opcode = 0x6E000000; // EXT V0.16B, V0.16B, V0.16B, #0
			
 
				+            opcode |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); // rm at bits 20:16, rn at bits 9:5, rd at bits 4:0.
			
 
				+            opcode |= (imm4 << 11); // Index field starts at bit 11.
			
 
				+
			
 
				+            Vector128<float> v0 = MakeVectorE0E1(z, z);
			
 
				+            Vector128<float> v1 = MakeVectorE0E1(a, a); // 16B form: the input value is passed for both arguments.
			
 
				+            Vector128<float> v2 = MakeVectorE0E1(b, b);
			
 
				+
			
 
				+            SingleOpcode(opcode, v0: v0, v1: v1, v2: v2); // Hands the opcode and initial v0..v2 values to the SingleOpcode helper.
			
 
				+
			
 
				+            CompareAgainstUnicorn(); // Helper not defined in this file; NOTE(review): presumably checks state against Unicorn - confirm.
			
 
				+        }
			
 
				+#endif
			
 
				+    }
			
 
				+}
			
--- a/Ryujinx.Tests/Cpu/CpuTestSimdFcond.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdFcond.cs
@@ -0,0 +1,178 @@
 
				+#define SimdFcond
			
 
				+
			
 
				+using NUnit.Framework;
			
 
				+
			
 
				+using System.Collections.Generic;
			
 
				+using System.Runtime.Intrinsics;
			
 
				+
			
 
				+namespace Ryujinx.Tests.Cpu
			
 
				+{
			
 
				+    [Category("SimdFcond")] // NUnit fixture holding the FCCMP/FCCMPE (single and double precision) tests.
			
 
				+    public sealed class CpuTestSimdFcond : CpuTest
			
 
				+    {
			
 
				+#if SimdFcond // Symbol is #define'd at the top of this file.
			
 
				+
			
 
				+#region "ValueSource (Types)"
			
 
				+        private static IEnumerable<ulong> _1S_F_() // Yields 32-bit float bit patterns (low half of each ulong) used as test inputs.
			
 
				+        {
			
 
				+            yield return 0x00000000FF7FFFFFul; // -Max Normal    (float.MinValue)
			
 
				+            yield return 0x0000000080800000ul; // -Min Normal
			
 
				+            yield return 0x00000000807FFFFFul; // -Max Subnormal
			
 
				+            yield return 0x0000000080000001ul; // -Min Subnormal (-float.Epsilon)
			
 
				+            yield return 0x000000007F7FFFFFul; // +Max Normal    (float.MaxValue)
			
 
				+            yield return 0x0000000000800000ul; // +Min Normal
			
 
				+            yield return 0x00000000007FFFFFul; // +Max Subnormal
			
 
				+            yield return 0x0000000000000001ul; // +Min Subnormal (float.Epsilon)
			
 
				+
			
 
				+            if (!NoZeros) // Zeros are included unless the NoZeros switch is set.
			
 
				+            {
			
 
				+                yield return 0x0000000080000000ul; // -Zero
			
 
				+                yield return 0x0000000000000000ul; // +Zero
			
 
				+            }
			
 
				+
			
 
				+            if (!NoInfs) // Infinities are included unless the NoInfs switch is set.
			
 
				+            {
			
 
				+                yield return 0x00000000FF800000ul; // -Infinity
			
 
				+                yield return 0x000000007F800000ul; // +Infinity
			
 
				+            }
			
 
				+
			
 
				+            if (!NoNaNs) // NaNs are included unless the NoNaNs switch is set.
			
 
				+            {
			
 
				+                yield return 0x00000000FFC00000ul; // -QNaN (all zeros payload) (float.NaN)
			
 
				+                yield return 0x00000000FFBFFFFFul; // -SNaN (all ones  payload)
			
 
				+                yield return 0x000000007FC00000ul; // +QNaN (all zeros payload) (-float.NaN) (DefaultNaN)
			
 
				+                yield return 0x000000007FBFFFFFul; // +SNaN (all ones  payload)
			
 
				+            }
			
 
				+
			
 
				+            for (int cnt = 1; cnt <= RndCnt; cnt++) // Adds RndCnt pairs of generated values.
			
 
				+            {
			
 
				+                ulong grbg = TestContext.CurrentContext.Random.NextUInt(); // Random bits that end up in the upper 32 bits.
			
 
				+                ulong rnd1 = GenNormalS(); // Helper defined elsewhere; NOTE(review): presumably a random normal float pattern - confirm.
			
 
				+                ulong rnd2 = GenSubnormalS(); // Helper defined elsewhere; NOTE(review): presumably a random subnormal float pattern - confirm.
			
 
				+
			
 
				+                yield return (grbg << 32) | rnd1;
			
 
				+                yield return (grbg << 32) | rnd2;
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				+        private static IEnumerable<ulong> _1D_F_() // Yields 64-bit double bit patterns used as test inputs.
			
 
				+        {
			
 
				+            yield return 0xFFEFFFFFFFFFFFFFul; // -Max Normal    (double.MinValue)
			
 
				+            yield return 0x8010000000000000ul; // -Min Normal
			
 
				+            yield return 0x800FFFFFFFFFFFFFul; // -Max Subnormal
			
 
				+            yield return 0x8000000000000001ul; // -Min Subnormal (-double.Epsilon)
			
 
				+            yield return 0x7FEFFFFFFFFFFFFFul; // +Max Normal    (double.MaxValue)
			
 
				+            yield return 0x0010000000000000ul; // +Min Normal
			
 
				+            yield return 0x000FFFFFFFFFFFFFul; // +Max Subnormal
			
 
				+            yield return 0x0000000000000001ul; // +Min Subnormal (double.Epsilon)
			
 
				+
			
 
				+            if (!NoZeros) // Zeros are included unless the NoZeros switch is set.
			
 
				+            {
			
 
				+                yield return 0x8000000000000000ul; // -Zero
			
 
				+                yield return 0x0000000000000000ul; // +Zero
			
 
				+            }
			
 
				+
			
 
				+            if (!NoInfs) // Infinities are included unless the NoInfs switch is set.
			
 
				+            {
			
 
				+                yield return 0xFFF0000000000000ul; // -Infinity
			
 
				+                yield return 0x7FF0000000000000ul; // +Infinity
			
 
				+            }
			
 
				+
			
 
				+            if (!NoNaNs) // NaNs are included unless the NoNaNs switch is set.
			
 
				+            {
			
 
				+                yield return 0xFFF8000000000000ul; // -QNaN (all zeros payload) (double.NaN)
			
 
				+                yield return 0xFFF7FFFFFFFFFFFFul; // -SNaN (all ones  payload)
			
 
				+                yield return 0x7FF8000000000000ul; // +QNaN (all zeros payload) (-double.NaN) (DefaultNaN)
			
 
				+                yield return 0x7FF7FFFFFFFFFFFFul; // +SNaN (all ones  payload)
			
 
				+            }
			
 
				+
			
 
				+            for (int cnt = 1; cnt <= RndCnt; cnt++) // Adds RndCnt pairs of generated values.
			
 
				+            {
			
 
				+                ulong rnd1 = GenNormalD(); // Helper defined elsewhere; NOTE(review): presumably a random normal double pattern - confirm.
			
 
				+                ulong rnd2 = GenSubnormalD(); // Helper defined elsewhere; NOTE(review): presumably a random subnormal double pattern - confirm.
			
 
				+
			
 
				+                yield return rnd1;
			
 
				+                yield return rnd2;
			
 
				+            }
			
 
				+        }
			
 
				+#endregion
			
 
				+
			
 
				+#region "ValueSource (Opcodes)"
			
 
				+        private static uint[] _F_Ccmp_Ccmpe_S_S_() // Base opcodes; the tests OR the cond and nzcv fields into them.
			
 
				+        {
			
 
				+            return new uint[]
			
 
				+            {
			
 
				+                0x1E220420u, // FCCMP  S1, S2, #0, EQ
			
 
				+                0x1E220430u  // FCCMPE S1, S2, #0, EQ
			
 
				+            };
			
 
				+        }
			
 
				+
			
 
				+        private static uint[] _F_Ccmp_Ccmpe_S_D_() // Base opcodes; the tests OR the cond and nzcv fields into them.
			
 
				+        {
			
 
				+            return new uint[]
			
 
				+            {
			
 
				+                0x1E620420u, // FCCMP  D1, D2, #0, EQ
			
 
				+                0x1E620430u  // FCCMPE D1, D2, #0, EQ
			
 
				+            };
			
 
				+        }
			
 
				+#endregion
			
 
				+
			
 
				+        private const int RndCnt     = 2; // Number of random value pairs appended by _1S_F_ / _1D_F_.
			
 
				+        private const int RndCntNzcv = 2; // Number of random nzcv immediates per test ([Random(0u, 15u, RndCntNzcv)]).
			
 
				+
			
 
				+        private static readonly bool NoZeros = false; // Set to true to drop +/-Zero from the value sources.
			
 
				+        private static readonly bool NoInfs  = false; // Set to true to drop +/-Infinity from the value sources.
			
 
				+        private static readonly bool NoNaNs  = false; // Set to true to drop the NaN patterns from the value sources.
			
 
				+
			
 
				+        [Test, Pairwise] [Explicit] // Single-precision FCCMP/FCCMPE over all 16 condition codes; [Explicit] => only runs when selected.
			
 
				+        public void F_Ccmp_Ccmpe_S_S([ValueSource("_F_Ccmp_Ccmpe_S_S_")] uint opcodes,
			
 
				+                                     [ValueSource("_1S_F_")] ulong a,
			
 
				+                                     [ValueSource("_1S_F_")] ulong b,
			
 
				+                                     [Random(0u, 15u, RndCntNzcv)] uint nzcv,
			
 
				+                                     [Values(0b0000u, 0b0001u, 0b0010u, 0b0011u,             // <EQ, NE, CS/HS, CC/LO,
			
 
				+                                             0b0100u, 0b0101u, 0b0110u, 0b0111u,             //  MI, PL, VS, VC,
			
 
				+                                             0b1000u, 0b1001u, 0b1010u, 0b1011u,             //  HI, LS, GE, LT,
			
 
				+                                             0b1100u, 0b1101u, 0b1110u, 0b1111u)] uint cond) //  GT, LE, AL, NV>
			
 
				+        {
			
 
				+            opcodes |= ((cond & 15) << 12) | ((nzcv & 15) << 0); // 4-bit cond at bits 15:12, 4-bit nzcv immediate at bits 3:0.
			
 
				+
			
 
				+            Vector128<float> v1 = MakeVectorE0(a); // First operand (S1 in the base opcode).
			
 
				+            Vector128<float> v2 = MakeVectorE0(b); // Second operand (S2 in the base opcode).
			
 
				+
			
 
				+            bool v = TestContext.CurrentContext.Random.NextBool(); // Random initial flag values, passed to SingleOpcode below.
			
 
				+            bool c = TestContext.CurrentContext.Random.NextBool();
			
 
				+            bool z = TestContext.CurrentContext.Random.NextBool();
			
 
				+            bool n = TestContext.CurrentContext.Random.NextBool();
			
 
				+
			
 
				+            SingleOpcode(opcodes, v1: v1, v2: v2, overflow: v, carry: c, zero: z, negative: n);
			
 
				+
			
 
				+            CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc); // Only Fpsr.Ioc is passed as the FPSR mask.
			
 
				+        }
			
 
				+
			
 
				+        [Test, Pairwise] [Explicit] // Double-precision FCCMP/FCCMPE over all 16 condition codes; [Explicit] => only runs when selected.
			
 
				+        public void F_Ccmp_Ccmpe_S_D([ValueSource("_F_Ccmp_Ccmpe_S_D_")] uint opcodes,
			
 
				+                                     [ValueSource("_1D_F_")] ulong a,
			
 
				+                                     [ValueSource("_1D_F_")] ulong b,
			
 
				+                                     [Random(0u, 15u, RndCntNzcv)] uint nzcv,
			
 
				+                                     [Values(0b0000u, 0b0001u, 0b0010u, 0b0011u,             // <EQ, NE, CS/HS, CC/LO,
			
 
				+                                             0b0100u, 0b0101u, 0b0110u, 0b0111u,             //  MI, PL, VS, VC,
			
 
				+                                             0b1000u, 0b1001u, 0b1010u, 0b1011u,             //  HI, LS, GE, LT,
			
 
				+                                             0b1100u, 0b1101u, 0b1110u, 0b1111u)] uint cond) //  GT, LE, AL, NV>
			
 
				+        {
			
 
				+            opcodes |= ((cond & 15) << 12) | ((nzcv & 15) << 0); // 4-bit cond at bits 15:12, 4-bit nzcv immediate at bits 3:0.
			
 
				+
			
 
				+            Vector128<float> v1 = MakeVectorE0(a); // First operand (D1 in the base opcode).
			
 
				+            Vector128<float> v2 = MakeVectorE0(b); // Second operand (D2 in the base opcode).
			
 
				+
			
 
				+            bool v = TestContext.CurrentContext.Random.NextBool(); // Random initial flag values, passed to SingleOpcode below.
			
 
				+            bool c = TestContext.CurrentContext.Random.NextBool();
			
 
				+            bool z = TestContext.CurrentContext.Random.NextBool();
			
 
				+            bool n = TestContext.CurrentContext.Random.NextBool();
			
 
				+
			
 
				+            SingleOpcode(opcodes, v1: v1, v2: v2, overflow: v, carry: c, zero: z, negative: n);
			
 
				+
			
 
				+            CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc); // Only Fpsr.Ioc is passed as the FPSR mask.
			
 
				+        }
			
 
				+#endif
			
 
				+    }
			
 
				+}
			
--- a/Ryujinx.Tests/Cpu/CpuTestSimdIns.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdIns.cs
@@ -18,6 +18,24 @@ namespace Ryujinx.Tests.Cpu
 
				                                  0x8000000000000000ul, 0xFFFFFFFFFFFFFFFFul };
			
 
				         }
			
 
				 
			
 
				+        private static ulong[] _2S_() // Fixed patterns: each 32-bit half is 0x00000000, 0x7FFFFFFF, 0x80000000 or 0xFFFFFFFF.
			
 
				+        {
			
 
				+            return new ulong[] { 0x0000000000000000ul, 0x7FFFFFFF7FFFFFFFul,
			
 
				+                                 0x8000000080000000ul, 0xFFFFFFFFFFFFFFFFul };
			
 
				+        }
			
 
				+
			
 
				+        private static ulong[] _4H_() // Fixed patterns: each 16-bit lane is 0x0000, 0x7FFF, 0x8000 or 0xFFFF.
			
 
				+        {
			
 
				+            return new ulong[] { 0x0000000000000000ul, 0x7FFF7FFF7FFF7FFFul,
			
 
				+                                 0x8000800080008000ul, 0xFFFFFFFFFFFFFFFFul };
			
 
				+        }
			
 
				+
			
 
				+        private static ulong[] _8B_() // Fixed patterns: every byte is 0x00, 0x7F, 0x80 or 0xFF.
			
 
				+        {
			
 
				+            return new ulong[] { 0x0000000000000000ul, 0x7F7F7F7F7F7F7F7Ful,
			
 
				+                                 0x8080808080808080ul, 0xFFFFFFFFFFFFFFFFul };
			
 
				+        }
			
 
				+
			
 
				         private static ulong[] _8B4H_()
			
 
				         {
			
 
				             return new ulong[] { 0x0000000000000000ul, 0x7F7F7F7F7F7F7F7Ful,
			
@@ -89,6 +107,186 @@ namespace Ryujinx.Tests.Cpu
 
				             CompareAgainstUnicorn();
			
 
				         }
			
 
				 
			
 
				+        [Test, Pairwise, Description("DUP B0, V1.B[<index>]")]
			
 
				+        public void Dup_S_B([ValueSource("_8B_")] [Random(RndCnt)] ulong a,
			
 
				+                            [Range(0u, 15u)] uint index)
			
 
				+        {
			
 
				+            const int size = 0; // Byte element: the size marker is bit 0 of imm5.
			
 
				+
			
 
				+            uint imm5 = (index << (size + 1) | 1u << size) & 0x1Fu; // Index above the single size-marker bit, kept to 5 bits.
			
 
				+
			
 
				+            uint opcode = 0x5E000420; // RESERVED
			
 
				+            opcode |= (imm5 << 16); // imm5 field starts at bit 16; a non-zero imm5 replaces the reserved base encoding.
			
 
				+
			
 
				+            ulong z = TestContext.CurrentContext.Random.NextULong(); // Random fill value for v0.
			
 
				+            Vector128<float> v0 = MakeVectorE0E1(z, z);
			
 
				+            Vector128<float> v1 = MakeVectorE0E1(a, a); // Same input value passed for both arguments.
			
 
				+
			
 
				+            SingleOpcode(opcode, v0: v0, v1: v1); // Hands the opcode and initial v0/v1 values to the SingleOpcode helper.
			
 
				+
			
 
				+            CompareAgainstUnicorn(); // Helper defined outside this hunk; NOTE(review): presumably checks state against Unicorn - confirm.
			
 
				+        }
			
 
				+
			
 
				+        [Test, Pairwise, Description("DUP H0, V1.H[<index>]")]
			
 
				+        public void Dup_S_H([ValueSource("_4H_")] [Random(RndCnt)] ulong a,
			
 
				+                            [Range(0u, 7u)] uint index)
			
 
				+        {
			
 
				+            const int size = 1; // Halfword element: the size marker is bit 1 of imm5.
			
 
				+
			
 
				+            uint imm5 = (index << (size + 1) | 1u << size) & 0x1Fu; // Index above the single size-marker bit, kept to 5 bits.
			
 
				+
			
 
				+            uint opcode = 0x5E000420; // RESERVED
			
 
				+            opcode |= (imm5 << 16); // imm5 field starts at bit 16; a non-zero imm5 replaces the reserved base encoding.
			
 
				+
			
 
				+            ulong z = TestContext.CurrentContext.Random.NextULong(); // Random fill value for v0.
			
 
				+            Vector128<float> v0 = MakeVectorE0E1(z, z);
			
 
				+            Vector128<float> v1 = MakeVectorE0E1(a, a); // Same input value passed for both arguments.
			
 
				+
			
 
				+            SingleOpcode(opcode, v0: v0, v1: v1); // Hands the opcode and initial v0/v1 values to the SingleOpcode helper.
			
 
				+
			
 
				+            CompareAgainstUnicorn(); // Helper defined outside this hunk; NOTE(review): presumably checks state against Unicorn - confirm.
			
 
				+        }
			
 
				+
			
 
				+        [Test, Pairwise, Description("DUP S0, V1.S[<index>]")]
			
 
				+        public void Dup_S_S([ValueSource("_2S_")] [Random(RndCnt)] ulong a,
			
 
				+                            [Range(0u, 3u)] uint index)
			
 
				+        {
			
 
				+            const int size = 2; // Word element: the size marker is bit 2 of imm5.
			
 
				+
			
 
				+            uint imm5 = (index << (size + 1) | 1u << size) & 0x1Fu; // Index above the single size-marker bit, kept to 5 bits.
			
 
				+
			
 
				+            uint opcode = 0x5E000420; // RESERVED
			
 
				+            opcode |= (imm5 << 16); // imm5 field starts at bit 16; a non-zero imm5 replaces the reserved base encoding.
			
 
				+
			
 
				+            ulong z = TestContext.CurrentContext.Random.NextULong(); // Random fill value for v0.
			
 
				+            Vector128<float> v0 = MakeVectorE0E1(z, z);
			
 
				+            Vector128<float> v1 = MakeVectorE0E1(a, a); // Same input value passed for both arguments.
			
 
				+
			
 
				+            SingleOpcode(opcode, v0: v0, v1: v1); // Hands the opcode and initial v0/v1 values to the SingleOpcode helper.
			
 
				+
			
 
				+            CompareAgainstUnicorn(); // Helper defined outside this hunk; NOTE(review): presumably checks state against Unicorn - confirm.
			
 
				+        }
			
 
				+
			
 
				+        [Test, Pairwise, Description("DUP D0, V1.D[<index>]")]
			
 
				+        public void Dup_S_D([ValueSource("_1D_")] [Random(RndCnt)] ulong a,
			
 
				+                            [Range(0u, 1u)] uint index)
			
 
				+        {
			
 
				+            const int size = 3; // Doubleword element: the size marker is bit 3 of imm5.
			
 
				+
			
 
				+            uint imm5 = (index << (size + 1) | 1u << size) & 0x1Fu; // Index above the single size-marker bit, kept to 5 bits.
			
 
				+
			
 
				+            uint opcode = 0x5E000420; // RESERVED
			
 
				+            opcode |= (imm5 << 16); // imm5 field starts at bit 16; a non-zero imm5 replaces the reserved base encoding.
			
 
				+
			
 
				+            ulong z = TestContext.CurrentContext.Random.NextULong(); // Random fill value for v0.
			
 
				+            Vector128<float> v0 = MakeVectorE0E1(z, z);
			
 
				+            Vector128<float> v1 = MakeVectorE0E1(a, a); // Same input value passed for both arguments.
			
 
				+
			
 
				+            SingleOpcode(opcode, v0: v0, v1: v1); // Hands the opcode and initial v0/v1 values to the SingleOpcode helper.
			
 
				+
			
 
				+            CompareAgainstUnicorn(); // Helper defined outside this hunk; NOTE(review): presumably checks state against Unicorn - confirm.
			
 
				+        }
			
 
				+
			
 
				+        [Test, Pairwise, Description("DUP <Vd>.<T>, <Vn>.B[<index>]")]
			
 
				+        public void Dup_V_8B_16B([Values(0u)]     uint rd,
			
 
				+                                 [Values(1u, 0u)] uint rn,
			
 
				+                                 [ValueSource("_8B_")] [Random(RndCnt)] ulong z,
			
 
				+                                 [ValueSource("_8B_")] [Random(RndCnt)] ulong a,
			
 
				+                                 [Range(0u, 15u)] uint index,
			
 
				+                                 [Values(0b0u, 0b1u)] uint q) // <8B, 16B>
			
 
				+        {
			
 
				+            const int size = 0; // Byte element: the size marker is bit 0 of imm5.
			
 
				+
			
 
				+            uint imm5 = (index << (size + 1) | 1u << size) & 0x1Fu; // Index above the single size-marker bit, kept to 5 bits.
			
 
				+
			
 
				+            uint opcode = 0x0E000400; // RESERVED
			
 
				+            opcode |= ((rn & 31) << 5) | ((rd & 31) << 0); // 5-bit rn at bits 9:5, 5-bit rd at bits 4:0.
			
 
				+            opcode |= (imm5 << 16); // imm5 field starts at bit 16.
			
 
				+            opcode |= ((q & 1) << 30); // q is placed at bit 30.
			
 
				+
			
 
				+            Vector128<float> v0 = MakeVectorE0E1(z, z);
			
 
				+            Vector128<float> v1 = MakeVectorE0E1(a, a); // Same input value passed for both arguments.
			
 
				+
			
 
				+            SingleOpcode(opcode, v0: v0, v1: v1); // Hands the opcode and initial v0/v1 values to the SingleOpcode helper.
			
 
				+
			
 
				+            CompareAgainstUnicorn(); // Helper defined outside this hunk; NOTE(review): presumably checks state against Unicorn - confirm.
			
 
				+        }
			
 
				+
			
 
				+        [Test, Pairwise, Description("DUP <Vd>.<T>, <Vn>.H[<index>]")]
			
 
				+        public void Dup_V_4H_8H([Values(0u)]     uint rd,
			
 
				+                                [Values(1u, 0u)] uint rn,
			
 
				+                                [ValueSource("_4H_")] [Random(RndCnt)] ulong z,
			
 
				+                                [ValueSource("_4H_")] [Random(RndCnt)] ulong a,
			
 
				+                                [Range(0u, 7u)] uint index,
			
 
				+                                [Values(0b0u, 0b1u)] uint q) // <4H, 8H>
			
 
				+        {
			
 
				+            const int size = 1; // Halfword element: the size marker is bit 1 of imm5.
			
 
				+
			
 
				+            uint imm5 = (index << (size + 1) | 1u << size) & 0x1Fu; // Index above the single size-marker bit, kept to 5 bits.
			
 
				+
			
 
				+            uint opcode = 0x0E000400; // RESERVED
			
 
				+            opcode |= ((rn & 31) << 5) | ((rd & 31) << 0); // 5-bit rn at bits 9:5, 5-bit rd at bits 4:0.
			
 
				+            opcode |= (imm5 << 16); // imm5 field starts at bit 16.
			
 
				+            opcode |= ((q & 1) << 30); // q is placed at bit 30.
			
 
				+
			
 
				+            Vector128<float> v0 = MakeVectorE0E1(z, z);
			
 
				+            Vector128<float> v1 = MakeVectorE0E1(a, a); // Same input value passed for both arguments.
			
 
				+
			
 
				+            SingleOpcode(opcode, v0: v0, v1: v1); // Hands the opcode and initial v0/v1 values to the SingleOpcode helper.
			
 
				+
			
 
				+            CompareAgainstUnicorn(); // Helper defined outside this hunk; NOTE(review): presumably checks state against Unicorn - confirm.
			
 
				+        }
			
 
				+
			
 
				+        [Test, Pairwise, Description("DUP <Vd>.<T>, <Vn>.S[<index>]")]
			
 
				+        public void Dup_V_2S_4S([Values(0u)]     uint rd,
			
 
				+                                [Values(1u, 0u)] uint rn,
			
 
				+                                [ValueSource("_2S_")] [Random(RndCnt)] ulong z,
			
 
				+                                [ValueSource("_2S_")] [Random(RndCnt)] ulong a,
			
 
				+                                [Range(0u, 3u)] uint index,
			
 
				+                                [Values(0b0u, 0b1u)] uint q) // <2S, 4S>
			
 
				+        {
			
 
				+            const int size = 2; // Word element: the size marker is bit 2 of imm5.
			
 
				+
			
 
				+            uint imm5 = (index << (size + 1) | 1u << size) & 0x1Fu; // Index above the single size-marker bit, kept to 5 bits.
			
 
				+
			
 
				+            uint opcode = 0x0E000400; // RESERVED
			
 
				+            opcode |= ((rn & 31) << 5) | ((rd & 31) << 0); // 5-bit rn at bits 9:5, 5-bit rd at bits 4:0.
			
 
				+            opcode |= (imm5 << 16); // imm5 field starts at bit 16.
			
 
				+            opcode |= ((q & 1) << 30); // q is placed at bit 30.
			
 
				+
			
 
				+            Vector128<float> v0 = MakeVectorE0E1(z, z);
			
 
				+            Vector128<float> v1 = MakeVectorE0E1(a, a); // Same input value passed for both arguments.
			
 
				+
			
 
				+            SingleOpcode(opcode, v0: v0, v1: v1); // Hands the opcode and initial v0/v1 values to the SingleOpcode helper.
			
 
				+
			
 
				+            CompareAgainstUnicorn(); // Helper defined outside this hunk; NOTE(review): presumably checks state against Unicorn - confirm.
			
 
				+        }
			
 
				+
			
 
				+        [Test, Pairwise, Description("DUP <Vd>.<T>, <Vn>.D[<index>]")]
			
 
				+        public void Dup_V_2D([Values(0u)]     uint rd,
			
 
				+                             [Values(1u, 0u)] uint rn,
			
 
				+                             [ValueSource("_1D_")] [Random(RndCnt)] ulong z,
			
 
				+                             [ValueSource("_1D_")] [Random(RndCnt)] ulong a,
			
 
				+                             [Range(0u, 1u)] uint index,
			
 
				+                             [Values(0b1u)] uint q) // <2D>
			
 
				+        {
			
 
				+            const int size = 3; // Doubleword element: the size marker is bit 3 of imm5.
			
 
				+
			
 
				+            uint imm5 = (index << (size + 1) | 1u << size) & 0x1Fu; // Index above the single size-marker bit, kept to 5 bits.
			
 
				+
			
 
				+            uint opcode = 0x0E000400; // RESERVED
			
 
				+            opcode |= ((rn & 31) << 5) | ((rd & 31) << 0); // 5-bit rn at bits 9:5, 5-bit rd at bits 4:0.
			
 
				+            opcode |= (imm5 << 16); // imm5 field starts at bit 16.
			
 
				+            opcode |= ((q & 1) << 30); // q is placed at bit 30; only q == 1 is supplied for this test.
			
 
				+
			
 
				+            Vector128<float> v0 = MakeVectorE0E1(z, z);
			
 
				+            Vector128<float> v1 = MakeVectorE0E1(a, a); // Same input value passed for both arguments.
			
 
				+
			
 
				+            SingleOpcode(opcode, v0: v0, v1: v1); // Hands the opcode and initial v0/v1 values to the SingleOpcode helper.
			
 
				+
			
 
				+            CompareAgainstUnicorn(); // Helper defined outside this hunk; NOTE(review): presumably checks state against Unicorn - confirm.
			
 
				+        }
			
 
				+
			
 
				         [Test, Pairwise, Description("SMOV <Wd>, <Vn>.<Ts>[<index>]")]
			
 
				         public void Smov_S_W([Values(0u, 31u)] uint rd,
			
 
				                              [Values(1u)]      uint rn,
			
--- a/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs
@@ -250,6 +250,24 @@ namespace Ryujinx.Tests.Cpu
 
				             };
			
 
				         }
			
 
				 
			
 
				+        private static uint[] _F_Cmp_Cmpe_S_S_() // Opcode value source for the single-precision FCMP/FCMPE test.
			
 
				+        {
			
 
				+            return new uint[]
			
 
				+            {
			
 
				+                0x1E222020u, // FCMP  S1, S2
			
 
				+                0x1E222030u  // FCMPE S1, S2
			
 
				+            };
			
 
				+        }
			
 
				+
			
 
				+        private static uint[] _F_Cmp_Cmpe_S_D_() // Opcode value source for the double-precision FCMP/FCMPE test.
			
 
				+        {
			
 
				+            return new uint[]
			
 
				+            {
			
 
				+                0x1E622020u, // FCMP  D1, D2
			
 
				+                0x1E622030u  // FCMPE D1, D2
			
 
				+            };
			
 
				+        }
			
 
				+
			
 
				         private static uint[] _F_Madd_Msub_S_S_()
			
 
				         {
			
 
				             return new uint[]
			
@@ -316,6 +334,24 @@ namespace Ryujinx.Tests.Cpu
 
				             };
			
 
				         }
			
 
				 
			
 
				+        private static uint[] _F_Mla_Mls_V_2S_4S_() // Base opcodes; the test ORs in the rd/rn/rm fields and the q bit.
			
 
				+        {
			
 
				+            return new uint[]
			
 
				+            {
			
 
				+                0x0E20CC00u, // FMLA V0.2S, V0.2S, V0.2S
			
 
				+                0x0EA0CC00u  // FMLS V0.2S, V0.2S, V0.2S
			
 
				+            };
			
 
				+        }
			
 
				+
			
 
				+        private static uint[] _F_Mla_Mls_V_2D_() // Base opcodes for the 2D arrangement of FMLA/FMLS.
			
 
				+        {
			
 
				+            return new uint[]
			
 
				+            {
			
 
				+                0x4E60CC00u, // FMLA V0.2D, V0.2D, V0.2D
			
 
				+                0x4EE0CC00u  // FMLS V0.2D, V0.2D, V0.2D
			
 
				+            };
			
 
				+        }
			
 
				+
			
 
				         private static uint[] _F_Recps_Rsqrts_S_S_()
			
 
				         {
			
 
				             return new uint[]
			
@@ -372,6 +408,28 @@ namespace Ryujinx.Tests.Cpu
 
				                 0x5E006000u  // SHA256SU1 V0.4S, V0.4S, V0.4S
			
 
				             };
			
 
				         }
			
 
				+
			
 
				+        private static uint[] _S_Max_Min_P_V_() // Base opcodes for the signed max/min instructions and their pairwise forms.
			
 
				+        {
			
 
				+            return new uint[]
			
 
				+            {
			
 
				+                0x0E206400u, // SMAX  V0.8B, V0.8B, V0.8B
			
 
				+                0x0E20A400u, // SMAXP V0.8B, V0.8B, V0.8B
			
 
				+                0x0E206C00u, // SMIN  V0.8B, V0.8B, V0.8B
			
 
				+                0x0E20AC00u  // SMINP V0.8B, V0.8B, V0.8B
			
 
				+            };
			
 
				+        }
			
 
				+
			
 
				+        private static uint[] _U_Max_Min_P_V_() // Base opcodes for the unsigned max/min instructions and their pairwise forms.
			
 
				+        {
			
 
				+            return new uint[]
			
 
				+            {
			
 
				+                0x2E206400u, // UMAX  V0.8B, V0.8B, V0.8B
			
 
				+                0x2E20A400u, // UMAXP V0.8B, V0.8B, V0.8B
			
 
				+                0x2E206C00u, // UMIN  V0.8B, V0.8B, V0.8B
			
 
				+                0x2E20AC00u  // UMINP V0.8B, V0.8B, V0.8B
			
 
				+            };
			
 
				+        }
			
 
				 #endregion
			
 
				 
			
 
				         private const int RndCnt = 2;
			
@@ -1248,6 +1306,42 @@ namespace Ryujinx.Tests.Cpu
 
				             CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Dzc | Fpsr.Idc);
			
 
				         }
			
 
				 
			
 
				+        [Test, Pairwise] [Explicit] // Single-precision FCMP/FCMPE on operands a and b; [Explicit] => only runs when selected.
			
 
				+        public void F_Cmp_Cmpe_S_S([ValueSource("_F_Cmp_Cmpe_S_S_")] uint opcodes,
			
 
				+                                   [ValueSource("_1S_F_")] ulong a,
			
 
				+                                   [ValueSource("_1S_F_")] ulong b)
			
 
				+        {
			
 
				+            Vector128<float> v1 = MakeVectorE0(a); // First operand (S1 in the opcode).
			
 
				+            Vector128<float> v2 = MakeVectorE0(b); // Second operand (S2 in the opcode).
			
 
				+
			
 
				+            bool v = TestContext.CurrentContext.Random.NextBool(); // Random initial flag values, passed to SingleOpcode below.
			
 
				+            bool c = TestContext.CurrentContext.Random.NextBool();
			
 
				+            bool z = TestContext.CurrentContext.Random.NextBool();
			
 
				+            bool n = TestContext.CurrentContext.Random.NextBool();
			
 
				+
			
 
				+            SingleOpcode(opcodes, v1: v1, v2: v2, overflow: v, carry: c, zero: z, negative: n);
			
 
				+
			
 
				+            CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc); // Only Fpsr.Ioc is passed as the FPSR mask.
			
 
				+        }
			
 
				+
			
 
				+        [Test, Pairwise] [Explicit] // Double-precision FCMP/FCMPE on operands a and b; [Explicit] => only runs when selected.
			
 
				+        public void F_Cmp_Cmpe_S_D([ValueSource("_F_Cmp_Cmpe_S_D_")] uint opcodes,
			
 
				+                                   [ValueSource("_1D_F_")] ulong a,
			
 
				+                                   [ValueSource("_1D_F_")] ulong b)
			
 
				+        {
			
 
				+            Vector128<float> v1 = MakeVectorE0(a); // First operand (D1 in the opcode).
			
 
				+            Vector128<float> v2 = MakeVectorE0(b); // Second operand (D2 in the opcode).
			
 
				+
			
 
				+            bool v = TestContext.CurrentContext.Random.NextBool(); // Random initial flag values, passed to SingleOpcode below.
			
 
				+            bool c = TestContext.CurrentContext.Random.NextBool();
			
 
				+            bool z = TestContext.CurrentContext.Random.NextBool();
			
 
				+            bool n = TestContext.CurrentContext.Random.NextBool();
			
 
				+
			
 
				+            SingleOpcode(opcodes, v1: v1, v2: v2, overflow: v, carry: c, zero: z, negative: n);
			
 
				+
			
 
				+            CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc); // Only Fpsr.Ioc is passed as the FPSR mask.
			
 
				+        }
			
 
				+
			
 
				         [Test, Pairwise] [Explicit] // Fused.
			
 
				         public void F_Madd_Msub_S_S([ValueSource("_F_Madd_Msub_S_S_")] uint opcodes,
			
 
				                                     [ValueSource("_1S_F_")] ulong a,
			
@@ -1384,6 +1478,58 @@ namespace Ryujinx.Tests.Cpu
 
				             CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Idc);
			
 
				         }
			
 
				 
			
 
				+        [Test, Pairwise] [Explicit] // Fused. Explicit: NUnit runs this only when it is explicitly selected.
			
 
				+        public void F_Mla_Mls_V_2S_4S([ValueSource("_F_Mla_Mls_V_2S_4S_")] uint opcodes,
			
 
				+                                      [Values(0u)]     uint rd,
			
 
				+                                      [Values(1u, 0u)] uint rn,
			
 
				+                                      [Values(2u, 0u)] uint rm,
			
 
				+                                      [ValueSource("_2S_F_")] ulong z,
			
 
				+                                      [ValueSource("_2S_F_")] ulong a,
			
 
				+                                      [ValueSource("_2S_F_")] ulong b,
			
 
				+                                      [Values(0b0u, 0b1u)] uint q) // <2S, 4S>
			
 
				+        {
			
 
				+            opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); // Insert the 5-bit register numbers: rm at bit 16, rn at bit 5, rd at bit 0.
			
 
				+            opcodes |= ((q & 1) << 30); // Insert q at bit 30.
			
 
				+
			
 
				+            Vector128<float> v0 = MakeVectorE0E1(z, z); // Initial content of v0: z in both arguments.
			
 
				+            Vector128<float> v1 = MakeVectorE0E1(a, a * q); // q is 0 or 1, so the second argument is 0 when q == 0 and `a` when q == 1.
			
 
				+            Vector128<float> v2 = MakeVectorE0E1(b, b * q); // Same scheme as v1, using b.
			
 
				+
			
 
				+            int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); // Random bits used to pick the FPCR flags below.
			
 
				+
			
 
				+            int fpcr = rnd & (1 << (int)Fpcr.Fz); // Keep only the random bit at position Fpcr.Fz.
			
 
				+            fpcr |= rnd & (1 << (int)Fpcr.Dn); // Add the random bit at position Fpcr.Dn.
			
 
				+
			
 
				+            SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2, fpcr: fpcr);
			
 
				+
			
 
				+            CompareAgainstUnicorn(Fpsr.Ioc | Fpsr.Idc, FpSkips.IfUnderflow, FpTolerances.UpToOneUlpsS); // NOTE(review): mask, skip condition and tolerance are interpreted by the helper (not visible here) - confirm their exact effect there.
			
 
				+        }
			
 
				+
			
 
				+        [Test, Pairwise] [Explicit] // Fused. Explicit: NUnit runs this only when it is explicitly selected.
			
 
				+        public void F_Mla_Mls_V_2D([ValueSource("_F_Mla_Mls_V_2D_")] uint opcodes,
			
 
				+                                   [Values(0u)]     uint rd,
			
 
				+                                   [Values(1u, 0u)] uint rn,
			
 
				+                                   [Values(2u, 0u)] uint rm,
			
 
				+                                   [ValueSource("_1D_F_")] ulong z,
			
 
				+                                   [ValueSource("_1D_F_")] ulong a,
			
 
				+                                   [ValueSource("_1D_F_")] ulong b)
			
 
				+        {
			
 
				+            opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); // Insert the 5-bit register numbers: rm at bit 16, rn at bit 5, rd at bit 0.
			
 
				+
			
 
				+            Vector128<float> v0 = MakeVectorE0E1(z, z); // Initial content of v0: z in both arguments.
			
 
				+            Vector128<float> v1 = MakeVectorE0E1(a, a); // First source operand: a in both arguments.
			
 
				+            Vector128<float> v2 = MakeVectorE0E1(b, b); // Second source operand: b in both arguments.
			
 
				+
			
 
				+            int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); // Random bits used to pick the FPCR flags below.
			
 
				+
			
 
				+            int fpcr = rnd & (1 << (int)Fpcr.Fz); // Keep only the random bit at position Fpcr.Fz.
			
 
				+            fpcr |= rnd & (1 << (int)Fpcr.Dn); // Add the random bit at position Fpcr.Dn.
			
 
				+
			
 
				+            SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2, fpcr: fpcr);
			
 
				+
			
 
				+            CompareAgainstUnicorn(Fpsr.Ioc | Fpsr.Idc, FpSkips.IfUnderflow, FpTolerances.UpToOneUlpsD); // NOTE(review): mask, skip condition and tolerance are interpreted by the helper (not visible here) - confirm their exact effect there.
			
 
				+        }
			
 
				+
			
 
				         [Test, Pairwise] [Explicit] // Fused.
			
 
				         public void F_Recps_Rsqrts_S_S([ValueSource("_F_Recps_Rsqrts_S_S_")] uint opcodes,
			
 
				                                        [ValueSource("_1S_F_")] ulong a,
			
@@ -2036,6 +2182,30 @@ namespace Ryujinx.Tests.Cpu
 
				             CompareAgainstUnicorn();
			
 
				         }
			
 
				 
			
 
				+        [Test, Pairwise] // Builds one opcode from the value source plus register/size/q fields, executes it and compares the result against Unicorn.
			
 
				+        public void S_Max_Min_P_V([ValueSource("_S_Max_Min_P_V_")] uint opcodes,
			
 
				+                                  [Values(0u)]     uint rd,
			
 
				+                                  [Values(1u, 0u)] uint rn,
			
 
				+                                  [Values(2u, 0u)] uint rm,
			
 
				+                                  [ValueSource("_8B4H2S_")] [Random(RndCnt)] ulong z,
			
 
				+                                  [ValueSource("_8B4H2S_")] [Random(RndCnt)] ulong a,
			
 
				+                                  [ValueSource("_8B4H2S_")] [Random(RndCnt)] ulong b,
			
 
				+                                  [Values(0b00u, 0b01u, 0b10u)] uint size, // Q0: <8B,  4H, 2S>
			
 
				+                                  [Values(0b0u, 0b1u)] uint q)             // Q1: <16B, 8H, 4S>
			
 
				+        {
			
 
				+            opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); // Insert the 5-bit register numbers: rm at bit 16, rn at bit 5, rd at bit 0.
			
 
				+            opcodes |= ((size & 3) << 22); // Insert the 2-bit size field at bit 22.
			
 
				+            opcodes |= ((q & 1) << 30); // Insert q at bit 30.
			
 
				+
			
 
				+            Vector128<float> v0 = MakeVectorE0E1(z, z); // Initial content of v0: z in both arguments.
			
 
				+            Vector128<float> v1 = MakeVectorE0E1(a, a * q); // q is 0 or 1, so the second argument is 0 when q == 0 and `a` when q == 1.
			
 
				+            Vector128<float> v2 = MakeVectorE0E1(b, b * q); // Same scheme as v1, using b.
			
 
				+
			
 
				+            SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2);
			
 
				+
			
 
				+            CompareAgainstUnicorn(); // Called with no arguments, so the helper's defaults apply.
			
 
				+        }
			
 
				+
			
 
				         [Test, Pairwise, Description("SMLAL{2} <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.<Tb>")]
			
 
				         public void Smlal_V_8B8H_4H4S_2S2D([Values(0u)]     uint rd,
			
 
				                                            [Values(1u, 0u)] uint rn,
			
@@ -3068,6 +3238,30 @@ namespace Ryujinx.Tests.Cpu
 
				             CompareAgainstUnicorn();
			
 
				         }
			
 
				 
			
 
				+        [Test, Pairwise] // Builds one opcode from the value source plus register/size/q fields, executes it and compares the result against Unicorn.
			
 
				+        public void U_Max_Min_P_V([ValueSource("_U_Max_Min_P_V_")] uint opcodes,
			
 
				+                                  [Values(0u)]     uint rd,
			
 
				+                                  [Values(1u, 0u)] uint rn,
			
 
				+                                  [Values(2u, 0u)] uint rm,
			
 
				+                                  [ValueSource("_8B4H2S_")] [Random(RndCnt)] ulong z,
			
 
				+                                  [ValueSource("_8B4H2S_")] [Random(RndCnt)] ulong a,
			
 
				+                                  [ValueSource("_8B4H2S_")] [Random(RndCnt)] ulong b,
			
 
				+                                  [Values(0b00u, 0b01u, 0b10u)] uint size, // Q0: <8B,  4H, 2S>
			
 
				+                                  [Values(0b0u, 0b1u)] uint q)             // Q1: <16B, 8H, 4S>
			
 
				+        {
			
 
				+            opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); // Insert the 5-bit register numbers: rm at bit 16, rn at bit 5, rd at bit 0.
			
 
				+            opcodes |= ((size & 3) << 22); // Insert the 2-bit size field at bit 22.
			
 
				+            opcodes |= ((q & 1) << 30); // Insert q at bit 30.
			
 
				+
			
 
				+            Vector128<float> v0 = MakeVectorE0E1(z, z); // Initial content of v0: z in both arguments.
			
 
				+            Vector128<float> v1 = MakeVectorE0E1(a, a * q); // q is 0 or 1, so the second argument is 0 when q == 0 and `a` when q == 1.
			
 
				+            Vector128<float> v2 = MakeVectorE0E1(b, b * q); // Same scheme as v1, using b.
			
 
				+
			
 
				+            SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2);
			
 
				+
			
 
				+            CompareAgainstUnicorn(); // Called with no arguments, so the helper's defaults apply.
			
 
				+        }
			
 
				+
			
 
				         [Test, Pairwise, Description("UMLAL{2} <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.<Tb>")]
			
 
				         public void Umlal_V_8B8H_4H4S_2S2D([Values(0u)]     uint rd,
			
 
				                                            [Values(1u, 0u)] uint rn,
			
--- a/Ryujinx.Tests/Ryujinx.Tests.csproj
+++ b/Ryujinx.Tests/Ryujinx.Tests.csproj
@@ -18,7 +18,7 @@
 
				   <ItemGroup>
			
 
				     <PackageReference Include="Microsoft.NET.Test.Sdk" Version="15.9.0" />
			
 
				     <PackageReference Include="NUnit" Version="3.11.0" />
			
 
				-    <PackageReference Include="NUnit3TestAdapter" Version="3.10.0" />
			
 
				+    <PackageReference Include="NUnit3TestAdapter" Version="3.11.0" />
			
 
				     <PackageReference Include="System.Runtime.Intrinsics.Experimental" Version="4.5.0-rc1" />
			
 
				   </ItemGroup>