Forráskód Böngészése

Add Fmax/minv_V & S/Ushl_S Inst.s with Tests. Fix Maxps/d & Minps/d double zero sign handling. Allows better handling of NaNs. (#1335)

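* Add the Fmaxv_V/Fminv_V and Sshl_S/Ushl_S instructions, with tests. Fix the sign handling of Maxps/Maxpd and Minps/Minpd when both operands are zeros (the sign bit of the result is now computed separately from the operands' sign bits). Allows better handling of NaNs.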

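* Optimized EmitSse2VectorIsNaNOpF() for multiple uses per opF: a single call can now return both the QNaN mask and the SNaN mask of an operand, or only one of them when the other is not needed.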
LDj3SNuD 5 éve
szülő
commit
a804db6eed

+ 6 - 0
ARMeilleure/Decoders/OpCodeTable.cs

@@ -332,14 +332,18 @@ namespace ARMeilleure.Decoders
             SetA64("0>0011100<1xxxxx111101xxxxxxxxxx", InstName.Fmax_V,          InstEmit.Fmax_V,          typeof(OpCodeSimdReg));
             SetA64("000111100x1xxxxx011010xxxxxxxxxx", InstName.Fmaxnm_S,        InstEmit.Fmaxnm_S,        typeof(OpCodeSimdReg));
             SetA64("0>0011100<1xxxxx110001xxxxxxxxxx", InstName.Fmaxnm_V,        InstEmit.Fmaxnm_V,        typeof(OpCodeSimdReg));
+            SetA64("0>1011100<1xxxxx110001xxxxxxxxxx", InstName.Fmaxnmp_V,       InstEmit.Fmaxnmp_V,       typeof(OpCodeSimdReg));
             SetA64("0110111000110000110010xxxxxxxxxx", InstName.Fmaxnmv_V,       InstEmit.Fmaxnmv_V,       typeof(OpCodeSimd));
             SetA64("0>1011100<1xxxxx111101xxxxxxxxxx", InstName.Fmaxp_V,         InstEmit.Fmaxp_V,         typeof(OpCodeSimdReg));
+            SetA64("0110111000110000111110xxxxxxxxxx", InstName.Fmaxv_V,         InstEmit.Fmaxv_V,         typeof(OpCodeSimd));
             SetA64("000111100x1xxxxx010110xxxxxxxxxx", InstName.Fmin_S,          InstEmit.Fmin_S,          typeof(OpCodeSimdReg));
             SetA64("0>0011101<1xxxxx111101xxxxxxxxxx", InstName.Fmin_V,          InstEmit.Fmin_V,          typeof(OpCodeSimdReg));
             SetA64("000111100x1xxxxx011110xxxxxxxxxx", InstName.Fminnm_S,        InstEmit.Fminnm_S,        typeof(OpCodeSimdReg));
             SetA64("0>0011101<1xxxxx110001xxxxxxxxxx", InstName.Fminnm_V,        InstEmit.Fminnm_V,        typeof(OpCodeSimdReg));
+            SetA64("0>1011101<1xxxxx110001xxxxxxxxxx", InstName.Fminnmp_V,       InstEmit.Fminnmp_V,       typeof(OpCodeSimdReg));
             SetA64("0110111010110000110010xxxxxxxxxx", InstName.Fminnmv_V,       InstEmit.Fminnmv_V,       typeof(OpCodeSimd));
             SetA64("0>1011101<1xxxxx111101xxxxxxxxxx", InstName.Fminp_V,         InstEmit.Fminp_V,         typeof(OpCodeSimdReg));
+            SetA64("0110111010110000111110xxxxxxxxxx", InstName.Fminv_V,         InstEmit.Fminv_V,         typeof(OpCodeSimd));
             SetA64("010111111xxxxxxx0001x0xxxxxxxxxx", InstName.Fmla_Se,         InstEmit.Fmla_Se,         typeof(OpCodeSimdRegElemF));
             SetA64("0>0011100<1xxxxx110011xxxxxxxxxx", InstName.Fmla_V,          InstEmit.Fmla_V,          typeof(OpCodeSimdReg));
             SetA64("0>0011111<xxxxxx0001x0xxxxxxxxxx", InstName.Fmla_Ve,         InstEmit.Fmla_Ve,         typeof(OpCodeSimdRegElemF));
@@ -529,6 +533,7 @@ namespace ARMeilleure.Decoders
             SetA64("0101111101xxxxxx001101xxxxxxxxxx", InstName.Srsra_S,         InstEmit.Srsra_S,         typeof(OpCodeSimdShImm));
             SetA64("0x00111100>>>xxx001101xxxxxxxxxx", InstName.Srsra_V,         InstEmit.Srsra_V,         typeof(OpCodeSimdShImm));
             SetA64("0100111101xxxxxx001101xxxxxxxxxx", InstName.Srsra_V,         InstEmit.Srsra_V,         typeof(OpCodeSimdShImm));
+            SetA64("01011110111xxxxx010001xxxxxxxxxx", InstName.Sshl_S,          InstEmit.Sshl_S,          typeof(OpCodeSimdReg));
             SetA64("0>001110<<1xxxxx010001xxxxxxxxxx", InstName.Sshl_V,          InstEmit.Sshl_V,          typeof(OpCodeSimdReg));
             SetA64("0x00111100>>>xxx101001xxxxxxxxxx", InstName.Sshll_V,         InstEmit.Sshll_V,         typeof(OpCodeSimdShImm));
             SetA64("0101111101xxxxxx000001xxxxxxxxxx", InstName.Sshr_S,          InstEmit.Sshr_S,          typeof(OpCodeSimdShImm));
@@ -611,6 +616,7 @@ namespace ARMeilleure.Decoders
             SetA64("0111111101xxxxxx001101xxxxxxxxxx", InstName.Ursra_S,         InstEmit.Ursra_S,         typeof(OpCodeSimdShImm));
             SetA64("0x10111100>>>xxx001101xxxxxxxxxx", InstName.Ursra_V,         InstEmit.Ursra_V,         typeof(OpCodeSimdShImm));
             SetA64("0110111101xxxxxx001101xxxxxxxxxx", InstName.Ursra_V,         InstEmit.Ursra_V,         typeof(OpCodeSimdShImm));
+            SetA64("01111110111xxxxx010001xxxxxxxxxx", InstName.Ushl_S,          InstEmit.Ushl_S,          typeof(OpCodeSimdReg));
             SetA64("0>101110<<1xxxxx010001xxxxxxxxxx", InstName.Ushl_V,          InstEmit.Ushl_V,          typeof(OpCodeSimdReg));
             SetA64("0x10111100>>>xxx101001xxxxxxxxxx", InstName.Ushll_V,         InstEmit.Ushll_V,         typeof(OpCodeSimdShImm));
             SetA64("0111111101xxxxxx000001xxxxxxxxxx", InstName.Ushr_S,          InstEmit.Ushr_S,          typeof(OpCodeSimdShImm));

+ 313 - 43
ARMeilleure/Instructions/InstEmitSimdArithmetic.cs

@@ -382,7 +382,14 @@ namespace ARMeilleure.Instructions
         {
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                EmitSse2VectorPairwiseOpF(context, Intrinsic.X86Addps, Intrinsic.X86Addpd);
+                EmitSse2VectorPairwiseOpF(context, (op1, op2) =>
+                {
+                    IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;
+
+                    Intrinsic addInst = (op.Size & 1) == 0 ? Intrinsic.X86Addps : Intrinsic.X86Addpd;
+
+                    return context.AddIntrinsic(addInst, op1, op2);
+                });
             }
             else
             {
@@ -468,9 +475,12 @@ namespace ARMeilleure.Instructions
 
         public static void Fmax_S(ArmEmitterContext context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse41)
             {
-                EmitScalarBinaryOpF(context, Intrinsic.X86Maxss, Intrinsic.X86Maxsd);
+                EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
+                {
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
+                }, scalar: true);
             }
             else
             {
@@ -483,9 +493,12 @@ namespace ARMeilleure.Instructions
 
         public static void Fmax_V(ArmEmitterContext context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse41)
             {
-                EmitVectorBinaryOpF(context, Intrinsic.X86Maxps, Intrinsic.X86Maxpd);
+                EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
+                {
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
+                }, scalar: false);
             }
             else
             {
@@ -526,19 +539,53 @@ namespace ARMeilleure.Instructions
             }
         }
 
+        public static void Fmaxnmp_V(ArmEmitterContext context)
+        {
+            if (Optimizations.FastFP && Optimizations.UseSse41)
+            {
+                EmitSse2VectorPairwiseOpF(context, (op1, op2) =>
+                {
+                    return EmitSse41MaxMinNumOpF(context, isMaxNum: true, scalar: false, op1, op2);
+                });
+            }
+            else
+            {
+                EmitVectorPairwiseOpF(context, (op1, op2) =>
+                {
+                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMaxNum), op1, op2);
+                });
+            }
+        }
+
         public static void Fmaxnmv_V(ArmEmitterContext context)
         {
-            EmitVectorAcrossVectorOpF(context, (op1, op2) =>
+            if (Optimizations.FastFP && Optimizations.UseSse41)
             {
-                return context.Call(typeof(SoftFloat32).GetMethod(nameof(SoftFloat32.FPMaxNum)), op1, op2);
-            });
+                EmitSse2VectorAcrossVectorOpF(context, (op1, op2) =>
+                {
+                    return EmitSse41MaxMinNumOpF(context, isMaxNum: true, scalar: false, op1, op2);
+                });
+            }
+            else
+            {
+                EmitVectorAcrossVectorOpF(context, (op1, op2) =>
+                {
+                    return context.Call(typeof(SoftFloat32).GetMethod(nameof(SoftFloat32.FPMaxNum)), op1, op2);
+                });
+            }
         }
 
         public static void Fmaxp_V(ArmEmitterContext context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse41)
             {
-                EmitSse2VectorPairwiseOpF(context, Intrinsic.X86Maxps, Intrinsic.X86Maxpd);
+                EmitSse2VectorPairwiseOpF(context, (op1, op2) =>
+                {
+                    return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
+                    {
+                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
+                    }, scalar: false, op1, op2);
+                });
             }
             else
             {
@@ -549,11 +596,35 @@ namespace ARMeilleure.Instructions
             }
         }
 
+        public static void Fmaxv_V(ArmEmitterContext context)
+        {
+            if (Optimizations.FastFP && Optimizations.UseSse41)
+            {
+                EmitSse2VectorAcrossVectorOpF(context, (op1, op2) =>
+                {
+                    return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
+                    {
+                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
+                    }, scalar: false, op1, op2);
+                });
+            }
+            else
+            {
+                EmitVectorAcrossVectorOpF(context, (op1, op2) =>
+                {
+                    return context.Call(typeof(SoftFloat32).GetMethod(nameof(SoftFloat32.FPMax)), op1, op2);
+                });
+            }
+        }
+
         public static void Fmin_S(ArmEmitterContext context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse41)
             {
-                EmitScalarBinaryOpF(context, Intrinsic.X86Minss, Intrinsic.X86Minsd);
+                EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
+                {
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
+                }, scalar: true);
             }
             else
             {
@@ -566,9 +637,12 @@ namespace ARMeilleure.Instructions
 
         public static void Fmin_V(ArmEmitterContext context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse41)
             {
-                EmitVectorBinaryOpF(context, Intrinsic.X86Minps, Intrinsic.X86Minpd);
+                EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
+                {
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
+                }, scalar: false);
             }
             else
             {
@@ -609,19 +683,53 @@ namespace ARMeilleure.Instructions
             }
         }
 
+        public static void Fminnmp_V(ArmEmitterContext context)
+        {
+            if (Optimizations.FastFP && Optimizations.UseSse41)
+            {
+                EmitSse2VectorPairwiseOpF(context, (op1, op2) =>
+                {
+                    return EmitSse41MaxMinNumOpF(context, isMaxNum: false, scalar: false, op1, op2);
+                });
+            }
+            else
+            {
+                EmitVectorPairwiseOpF(context, (op1, op2) =>
+                {
+                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMinNum), op1, op2);
+                });
+            }
+        }
+
         public static void Fminnmv_V(ArmEmitterContext context)
         {
-            EmitVectorAcrossVectorOpF(context, (op1, op2) =>
+            if (Optimizations.FastFP && Optimizations.UseSse41)
             {
-                return context.Call(typeof(SoftFloat32).GetMethod(nameof(SoftFloat32.FPMinNum)), op1, op2);
-            });
+                EmitSse2VectorAcrossVectorOpF(context, (op1, op2) =>
+                {
+                    return EmitSse41MaxMinNumOpF(context, isMaxNum: false, scalar: false, op1, op2);
+                });
+            }
+            else
+            {
+                EmitVectorAcrossVectorOpF(context, (op1, op2) =>
+                {
+                    return context.Call(typeof(SoftFloat32).GetMethod(nameof(SoftFloat32.FPMinNum)), op1, op2);
+                });
+            }
         }
 
         public static void Fminp_V(ArmEmitterContext context)
         {
-            if (Optimizations.FastFP && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseSse41)
             {
-                EmitSse2VectorPairwiseOpF(context, Intrinsic.X86Minps, Intrinsic.X86Minpd);
+                EmitSse2VectorPairwiseOpF(context, (op1, op2) =>
+                {
+                    return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
+                    {
+                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
+                    }, scalar: false, op1, op2);
+                });
             }
             else
             {
@@ -632,6 +740,27 @@ namespace ARMeilleure.Instructions
             }
         }
 
+        public static void Fminv_V(ArmEmitterContext context)
+        {
+            if (Optimizations.FastFP && Optimizations.UseSse41)
+            {
+                EmitSse2VectorAcrossVectorOpF(context, (op1, op2) =>
+                {
+                    return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
+                    {
+                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
+                    }, scalar: false, op1, op2);
+                });
+            }
+            else
+            {
+                EmitVectorAcrossVectorOpF(context, (op1, op2) =>
+                {
+                    return context.Call(typeof(SoftFloat32).GetMethod(nameof(SoftFloat32.FPMin)), op1, op2);
+                });
+            }
+        }
+
         public static void Fmla_Se(ArmEmitterContext context) // Fused.
         {
             EmitScalarTernaryOpByElemF(context, (op1, op2, op3) =>
@@ -3111,7 +3240,12 @@ namespace ARMeilleure.Instructions
             context.Copy(GetVec(op.Rd), res);
         }
 
-        public static Operand EmitSse2VectorIsQNaNOpF(ArmEmitterContext context, Operand opF)
+        public static void EmitSse2VectorIsNaNOpF(
+            ArmEmitterContext context,
+            Operand opF,
+            out Operand qNaNMask,
+            out Operand sNaNMask,
+            bool? isQNaN = null)
         {
             IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;
 
@@ -3126,7 +3260,8 @@ namespace ARMeilleure.Instructions
                 Operand mask2 = context.AddIntrinsic(Intrinsic.X86Pand,  opF,   qMask);
                         mask2 = context.AddIntrinsic(Intrinsic.X86Cmpps, mask2, qMask, Const((int)CmpCondition.Equal));
 
-                return context.AddIntrinsic(Intrinsic.X86Andps, mask1, mask2);
+                qNaNMask = isQNaN == null ||  (bool)isQNaN ? context.AddIntrinsic(Intrinsic.X86Andps,  mask2, mask1) : null;
+                sNaNMask = isQNaN == null || !(bool)isQNaN ? context.AddIntrinsic(Intrinsic.X86Andnps, mask2, mask1) : null;
             }
             else /* if ((op.Size & 1) == 1) */
             {
@@ -3139,67 +3274,202 @@ namespace ARMeilleure.Instructions
                 Operand mask2 = context.AddIntrinsic(Intrinsic.X86Pand,  opF,   qMask);
                         mask2 = context.AddIntrinsic(Intrinsic.X86Cmppd, mask2, qMask, Const((int)CmpCondition.Equal));
 
-                return context.AddIntrinsic(Intrinsic.X86Andpd, mask1, mask2);
+                qNaNMask = isQNaN == null ||  (bool)isQNaN ? context.AddIntrinsic(Intrinsic.X86Andpd,  mask2, mask1) : null;
+                sNaNMask = isQNaN == null || !(bool)isQNaN ? context.AddIntrinsic(Intrinsic.X86Andnpd, mask2, mask1) : null;
             }
         }
 
-        private static void EmitSse41MaxMinNumOpF(ArmEmitterContext context, bool isMaxNum, bool scalar)
+        public static Operand EmitSse41ProcessNaNsOpF(
+            ArmEmitterContext context,
+            Func2I emit,
+            bool scalar,
+            Operand n = null,
+            Operand m = null)
         {
-            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
+            Operand nCopy = n ?? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rn));
+            Operand mCopy = m ?? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rm));
 
-            Operand d = GetVec(op.Rd);
-            Operand n = GetVec(op.Rn);
-            Operand m = GetVec(op.Rm);
+            EmitSse2VectorIsNaNOpF(context, nCopy, out Operand nQNaNMask, out Operand nSNaNMask);
+            EmitSse2VectorIsNaNOpF(context, mCopy, out _, out Operand mSNaNMask, isQNaN: false);
 
-            Operand nNum = context.Copy(n);
-            Operand mNum = context.Copy(m);
+            int sizeF = ((IOpCodeSimd)context.CurrOp).Size & 1;
 
-            Operand nQNaNMask = EmitSse2VectorIsQNaNOpF(context, nNum);
-            Operand mQNaNMask = EmitSse2VectorIsQNaNOpF(context, mNum);
+            if (sizeF == 0)
+            {
+                const int QBit = 22;
 
-            int sizeF = op.Size & 1;
+                Operand qMask = scalar ? X86GetScalar(context, 1 << QBit) : X86GetAllElements(context, 1 << QBit);
+
+                Operand resNaNMask = context.AddIntrinsic(Intrinsic.X86Pandn, mSNaNMask,  nQNaNMask);
+                        resNaNMask = context.AddIntrinsic(Intrinsic.X86Por,   resNaNMask, nSNaNMask);
+
+                Operand resNaN = context.AddIntrinsic(Intrinsic.X86Blendvps, mCopy, nCopy, resNaNMask);
+                        resNaN = context.AddIntrinsic(Intrinsic.X86Por, resNaN, qMask);
+
+                Operand resMask = context.AddIntrinsic(Intrinsic.X86Cmpps, nCopy, mCopy, Const((int)CmpCondition.OrderedQ));
+
+                Operand res = context.AddIntrinsic(Intrinsic.X86Blendvps, resNaN, emit(nCopy, mCopy), resMask);
+
+                if (n != null || m != null)
+                {
+                    return res;
+                }
+
+                if (scalar)
+                {
+                    res = context.VectorZeroUpper96(res);
+                }
+                else if (((OpCodeSimdReg)context.CurrOp).RegisterSize == RegisterSize.Simd64)
+                {
+                    res = context.VectorZeroUpper64(res);
+                }
+
+                context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rd), res);
+
+                return null;
+            }
+            else /* if (sizeF == 1) */
+            {
+                const int QBit = 51;
+
+                Operand qMask = scalar ? X86GetScalar(context, 1L << QBit) : X86GetAllElements(context, 1L << QBit);
+
+                Operand resNaNMask = context.AddIntrinsic(Intrinsic.X86Pandn, mSNaNMask,  nQNaNMask);
+                        resNaNMask = context.AddIntrinsic(Intrinsic.X86Por,   resNaNMask, nSNaNMask);
+
+                Operand resNaN = context.AddIntrinsic(Intrinsic.X86Blendvpd, mCopy, nCopy, resNaNMask);
+                        resNaN = context.AddIntrinsic(Intrinsic.X86Por, resNaN, qMask);
+
+                Operand resMask = context.AddIntrinsic(Intrinsic.X86Cmppd, nCopy, mCopy, Const((int)CmpCondition.OrderedQ));
+
+                Operand res = context.AddIntrinsic(Intrinsic.X86Blendvpd, resNaN, emit(nCopy, mCopy), resMask);
+
+                if (n != null || m != null)
+                {
+                    return res;
+                }
+
+                if (scalar)
+                {
+                    res = context.VectorZeroUpper64(res);
+                }
+
+                context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rd), res);
+
+                return null;
+            }
+        }
+
+        private static Operand EmitSse2VectorMaxMinOpF(ArmEmitterContext context, Operand n, Operand m, bool isMax)
+        {
+            IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;
+
+            if ((op.Size & 1) == 0)
+            {
+                Operand mask = X86GetAllElements(context, -0f);
+
+                Operand res = context.AddIntrinsic(isMax ? Intrinsic.X86Maxps : Intrinsic.X86Minps, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Andnps, mask, res);
+
+                Operand resSign = context.AddIntrinsic(isMax ? Intrinsic.X86Pand : Intrinsic.X86Por, n, m);
+                        resSign = context.AddIntrinsic(Intrinsic.X86Andps, mask, resSign);
+
+                return context.AddIntrinsic(Intrinsic.X86Por, res, resSign);
+            }
+            else /* if ((op.Size & 1) == 1) */
+            {
+                Operand mask = X86GetAllElements(context, -0d);
+
+                Operand res = context.AddIntrinsic(isMax ? Intrinsic.X86Maxpd : Intrinsic.X86Minpd, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Andnpd, mask, res);
+
+                Operand resSign = context.AddIntrinsic(isMax ? Intrinsic.X86Pand : Intrinsic.X86Por, n, m);
+                        resSign = context.AddIntrinsic(Intrinsic.X86Andpd, mask, resSign);
+
+                return context.AddIntrinsic(Intrinsic.X86Por, res, resSign);
+            }
+        }
+
+        private static Operand EmitSse41MaxMinNumOpF(
+            ArmEmitterContext context,
+            bool isMaxNum,
+            bool scalar,
+            Operand n = null,
+            Operand m = null)
+        {
+            Operand nCopy = n ?? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rn));
+            Operand mCopy = m ?? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rm));
+
+            EmitSse2VectorIsNaNOpF(context, nCopy, out Operand nQNaNMask, out _, isQNaN: true);
+            EmitSse2VectorIsNaNOpF(context, mCopy, out Operand mQNaNMask, out _, isQNaN: true);
+
+            int sizeF = ((IOpCodeSimd)context.CurrOp).Size & 1;
 
             if (sizeF == 0)
             {
-                Operand negInfMask = X86GetAllElements(context, isMaxNum ? float.NegativeInfinity : float.PositiveInfinity);
+                Operand negInfMask = scalar
+                    ? X86GetScalar     (context, isMaxNum ? float.NegativeInfinity : float.PositiveInfinity)
+                    : X86GetAllElements(context, isMaxNum ? float.NegativeInfinity : float.PositiveInfinity);
 
                 Operand nMask = context.AddIntrinsic(Intrinsic.X86Andnps, mQNaNMask, nQNaNMask);
                 Operand mMask = context.AddIntrinsic(Intrinsic.X86Andnps, nQNaNMask, mQNaNMask);
 
-                nNum = context.AddIntrinsic(Intrinsic.X86Blendvps, nNum, negInfMask, nMask);
-                mNum = context.AddIntrinsic(Intrinsic.X86Blendvps, mNum, negInfMask, mMask);
+                nCopy = context.AddIntrinsic(Intrinsic.X86Blendvps, nCopy, negInfMask, nMask);
+                mCopy = context.AddIntrinsic(Intrinsic.X86Blendvps, mCopy, negInfMask, mMask);
+
+                Operand res = EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
+                {
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum);
+                }, scalar: scalar, nCopy, mCopy);
 
-                Operand res = context.AddIntrinsic(isMaxNum ? Intrinsic.X86Maxps : Intrinsic.X86Minps, nNum, mNum);
+                if (n != null || m != null)
+                {
+                    return res;
+                }
 
                 if (scalar)
                 {
                     res = context.VectorZeroUpper96(res);
                 }
-                else if (op.RegisterSize == RegisterSize.Simd64)
+                else if (((OpCodeSimdReg)context.CurrOp).RegisterSize == RegisterSize.Simd64)
                 {
                     res = context.VectorZeroUpper64(res);
                 }
 
-                context.Copy(d, res);
+                context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rd), res);
+
+                return null;
             }
             else /* if (sizeF == 1) */
             {
-                Operand negInfMask = X86GetAllElements(context, isMaxNum ? double.NegativeInfinity : double.PositiveInfinity);
+                Operand negInfMask = scalar
+                    ? X86GetScalar     (context, isMaxNum ? double.NegativeInfinity : double.PositiveInfinity)
+                    : X86GetAllElements(context, isMaxNum ? double.NegativeInfinity : double.PositiveInfinity);
 
                 Operand nMask = context.AddIntrinsic(Intrinsic.X86Andnpd, mQNaNMask, nQNaNMask);
                 Operand mMask = context.AddIntrinsic(Intrinsic.X86Andnpd, nQNaNMask, mQNaNMask);
 
-                nNum = context.AddIntrinsic(Intrinsic.X86Blendvpd, nNum, negInfMask, nMask);
-                mNum = context.AddIntrinsic(Intrinsic.X86Blendvpd, mNum, negInfMask, mMask);
+                nCopy = context.AddIntrinsic(Intrinsic.X86Blendvpd, nCopy, negInfMask, nMask);
+                mCopy = context.AddIntrinsic(Intrinsic.X86Blendvpd, mCopy, negInfMask, mMask);
 
-                Operand res = context.AddIntrinsic(isMaxNum ? Intrinsic.X86Maxpd : Intrinsic.X86Minpd, nNum, mNum);
+                Operand res = EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
+                {
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum);
+                }, scalar: scalar, nCopy, mCopy);
+
+                if (n != null || m != null)
+                {
+                    return res;
+                }
 
                 if (scalar)
                 {
                     res = context.VectorZeroUpper64(res);
                 }
 
-                context.Copy(d, res);
+                context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rd), res);
+
+                return null;
             }
         }
 

+ 2 - 2
ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs

@@ -1200,8 +1200,8 @@ namespace ARMeilleure.Instructions
                 Operand nNum = context.Copy(n);
                 Operand mNum = context.Copy(m);
 
-                Operand nQNaNMask = InstEmit.EmitSse2VectorIsQNaNOpF(context, nNum);
-                Operand mQNaNMask = InstEmit.EmitSse2VectorIsQNaNOpF(context, mNum);
+                InstEmit.EmitSse2VectorIsNaNOpF(context, nNum, out Operand nQNaNMask, out _, isQNaN: true);
+                InstEmit.EmitSse2VectorIsNaNOpF(context, mNum, out Operand mQNaNMask, out _, isQNaN: true);
 
                 int sizeF = op.Size & 1;
 

+ 34 - 11
ARMeilleure/Instructions/InstEmitSimdHelper.cs

@@ -1095,6 +1095,29 @@ namespace ARMeilleure.Instructions
             context.Copy(GetVec(op.Rd), d);
         }
 
+        public static void EmitSse2VectorAcrossVectorOpF(ArmEmitterContext context, Func2I emit)
+        {
+            OpCodeSimd op = (OpCodeSimd)context.CurrOp;
+
+            Debug.Assert((op.Size & 1) == 0 && op.RegisterSize == RegisterSize.Simd128);
+
+            const int sm0 = 0 << 6 | 0 << 4 | 0 << 2 | 0 << 0;
+            const int sm1 = 1 << 6 | 1 << 4 | 1 << 2 | 1 << 0;
+            const int sm2 = 2 << 6 | 2 << 4 | 2 << 2 | 2 << 0;
+            const int sm3 = 3 << 6 | 3 << 4 | 3 << 2 | 3 << 0;
+
+            Operand nCopy = context.Copy(GetVec(op.Rn));
+
+            Operand part0 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, nCopy, Const(sm0));
+            Operand part1 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, nCopy, Const(sm1));
+            Operand part2 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, nCopy, Const(sm2));
+            Operand part3 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, nCopy, Const(sm3));
+
+            Operand res = emit(emit(part0, part1), emit(part2, part3));
+
+            context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
+        }
+
         public static void EmitVectorPairwiseOpF(ArmEmitterContext context, Func2I emit)
         {
             OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
@@ -1124,12 +1147,12 @@ namespace ARMeilleure.Instructions
             context.Copy(GetVec(op.Rd), res);
         }
 
-        public static void EmitSse2VectorPairwiseOpF(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
+        public static void EmitSse2VectorPairwiseOpF(ArmEmitterContext context, Func2I emit)
         {
             OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
 
-            Operand n = GetVec(op.Rn);
-            Operand m = GetVec(op.Rm);
+            Operand nCopy = context.Copy(GetVec(op.Rn));
+            Operand mCopy = context.Copy(GetVec(op.Rm));
 
             int sizeF = op.Size & 1;
 
@@ -1137,32 +1160,32 @@ namespace ARMeilleure.Instructions
             {
                 if (op.RegisterSize == RegisterSize.Simd64)
                 {
-                    Operand unpck = context.AddIntrinsic(Intrinsic.X86Unpcklps, n, m);
+                    Operand unpck = context.AddIntrinsic(Intrinsic.X86Unpcklps, nCopy, mCopy);
 
                     Operand zero = context.VectorZero();
 
                     Operand part0 = context.AddIntrinsic(Intrinsic.X86Movlhps, unpck, zero);
                     Operand part1 = context.AddIntrinsic(Intrinsic.X86Movhlps, zero, unpck);
 
-                    context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst32, part0, part1));
+                    context.Copy(GetVec(op.Rd), emit(part0, part1));
                 }
                 else /* if (op.RegisterSize == RegisterSize.Simd128) */
                 {
                     const int sm0 = 2 << 6 | 0 << 4 | 2 << 2 | 0 << 0;
                     const int sm1 = 3 << 6 | 1 << 4 | 3 << 2 | 1 << 0;
 
-                    Operand part0 = context.AddIntrinsic(Intrinsic.X86Shufps, n, m, Const(sm0));
-                    Operand part1 = context.AddIntrinsic(Intrinsic.X86Shufps, n, m, Const(sm1));
+                    Operand part0 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, mCopy, Const(sm0));
+                    Operand part1 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, mCopy, Const(sm1));
 
-                    context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst32, part0, part1));
+                    context.Copy(GetVec(op.Rd), emit(part0, part1));
                 }
             }
             else /* if (sizeF == 1) */
             {
-                Operand part0 = context.AddIntrinsic(Intrinsic.X86Unpcklpd, n, m);
-                Operand part1 = context.AddIntrinsic(Intrinsic.X86Unpckhpd, n, m);
+                Operand part0 = context.AddIntrinsic(Intrinsic.X86Unpcklpd, nCopy, mCopy);
+                Operand part1 = context.AddIntrinsic(Intrinsic.X86Unpckhpd, nCopy, mCopy);
 
-                context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst64, part0, part1));
+                context.Copy(GetVec(op.Rd), emit(part0, part1));
             }
         }
 

+ 56 - 42
ARMeilleure/Instructions/InstEmitSimdShift.cs

@@ -391,25 +391,14 @@ namespace ARMeilleure.Instructions
             }
         }
 
-        public static void Sshl_V(ArmEmitterContext context)
+        public static void Sshl_S(ArmEmitterContext context)
         {
-            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
-
-            Operand res = context.VectorZero();
-
-            int elems = op.GetBytesCount() >> op.Size;
-
-            for (int index = 0; index < elems; index++)
-            {
-                Operand ne = EmitVectorExtractSx(context, op.Rn, index, op.Size);
-                Operand me = EmitVectorExtractSx(context, op.Rm, index, op.Size);
-
-                Operand e = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.SignedShlReg)), ne, me, Const(0), Const(op.Size));
-
-                res = EmitVectorInsert(context, res, e, index, op.Size);
-            }
+            EmitSshlOrUshl(context, signed: true, scalar: true);
+        }
 
-            context.Copy(GetVec(op.Rd), res);
+        public static void Sshl_V(ArmEmitterContext context)
+        {
+            EmitSshlOrUshl(context, signed: true, scalar: false);
         }
 
         public static void Sshll_V(ArmEmitterContext context)
@@ -686,25 +675,14 @@ namespace ARMeilleure.Instructions
             }
         }
 
-        public static void Ushl_V(ArmEmitterContext context)
+        public static void Ushl_S(ArmEmitterContext context) // Scalar USHL: forwards to the shared emitter with signed: false, scalar: true.
         {
-            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
-
-            Operand res = context.VectorZero();
-
-            int elems = op.GetBytesCount() >> op.Size;
-
-            for (int index = 0; index < elems; index++)
-            {
-                Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
-                Operand me = EmitVectorExtractSx(context, op.Rm, index << op.Size, 0);
-
-                Operand e = EmitUnsignedShlRegOp(context, ne, context.ConvertI64ToI32(me), op.Size);
-
-                res = EmitVectorInsert(context, res, e, index, op.Size);
-            }
+            EmitSshlOrUshl(context, signed: false, scalar: true);
+        }
 
-            context.Copy(GetVec(op.Rd), res);
+        public static void Ushl_V(ArmEmitterContext context) // Vector USHL: same shared emitter with scalar: false; the removed inline loop now lives in EmitSshlOrUshl.
+        {
+            EmitSshlOrUshl(context, signed: false, scalar: false);
         }
 
         public static void Ushll_V(ArmEmitterContext context)
@@ -894,7 +872,7 @@ namespace ARMeilleure.Instructions
             context.Copy(GetVec(op.Rd), res);
         }
 
-        private static Operand EmitUnsignedShlRegOp(ArmEmitterContext context, Operand op, Operand shiftLsB, int size)
+        private static Operand EmitShlRegOp(ArmEmitterContext context, Operand op, Operand shiftLsB, int size, bool signed) // Shifts op left for shiftLsB >= 0, right by -shiftLsB otherwise; 'signed' selects arithmetic vs. logical right shift.
         {
             Debug.Assert(op.Type       == OperandType.I64);
             Debug.Assert(shiftLsB.Type == OperandType.I32);
@@ -902,18 +880,33 @@ namespace ARMeilleure.Instructions
 
             Operand negShiftLsB = context.Negate(shiftLsB);
 
+            Operand isInRange = context.BitwiseAnd( // Set only when both shiftLsB and -shiftLsB compare less than the element width in bits (8 << size).
+                context.ICompareLess(shiftLsB,    Const(8 << size)),
+                context.ICompareLess(negShiftLsB, Const(8 << size)));
+
             Operand isPositive = context.ICompareGreaterOrEqual(shiftLsB, Const(0));
 
-            Operand shl = context.ShiftLeft   (op, shiftLsB);
-            Operand shr = context.ShiftRightUI(op, negShiftLsB);
+            Operand shl = context.ShiftLeft(op, shiftLsB);
+
+            Operand sarOrShr = signed // Right-shift candidate: ShiftRightSI when signed, ShiftRightUI otherwise.
+                ? context.ShiftRightSI(op, negShiftLsB)
+                : context.ShiftRightUI(op, negShiftLsB);
 
-            Operand res = context.ConditionalSelect(isPositive, shl, shr);
+            Operand res = context.ConditionalSelect(isPositive, shl, sarOrShr); // In-range result: left shift for non-negative amounts, right shift for negative ones.
 
-            Operand isOutOfRange = context.BitwiseOr(
-                context.ICompareGreaterOrEqual(shiftLsB,    Const(8 << size)),
-                context.ICompareGreaterOrEqual(negShiftLsB, Const(8 << size)));
+            if (signed) // Signed out-of-range results depend on the shift direction and on the sign of op.
+            {
+                Operand isPositive2 = context.ICompareGreaterOrEqual(op, Const(0L)); // Sign test on the value being shifted.
+
+                Operand res2 = context.ConditionalSelect(isPositive2, Const(0L), Const(-1L)); // Fill value: 0 for op >= 0, -1 (all ones) for op < 0.
+                        res2 = context.ConditionalSelect(isPositive,  Const(0L), res2); // A non-negative (left) shift amount always yields 0 when out of range.
 
-            return context.ConditionalSelect(isOutOfRange, Const(0UL), res);
+                return context.ConditionalSelect(isInRange, res, res2);
+            }
+            else
+            {
+                return context.ConditionalSelect(isInRange, res, Const(0UL)); // Unsigned: any out-of-range shift yields 0.
+            }
         }
 
         private static void EmitVectorShrImmNarrowOpZx(ArmEmitterContext context, bool round)
@@ -1174,5 +1167,26 @@ namespace ARMeilleure.Instructions
                 context.Copy(GetVec(op.Rd), res);
             }
         }
+
+        private static void EmitSshlOrUshl(ArmEmitterContext context, bool signed, bool scalar) // Shared emitter behind Sshl_S/Sshl_V/Ushl_S/Ushl_V: per-element register-controlled shift written to Rd.
+        {
+            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
+
+            Operand res = context.VectorZero(); // Result starts as a zero vector; only the processed elements are inserted into it.
+
+            int elems = !scalar ? op.GetBytesCount() >> op.Size : 1; // Scalar form processes a single element (index 0).
+
+            for (int index = 0; index < elems; index++)
+            {
+                Operand ne = EmitVectorExtract  (context, op.Rn, index, op.Size, signed); // Value to shift; the 'signed' flag is passed through to the extract helper.
+                Operand me = EmitVectorExtractSx(context, op.Rm, index << op.Size, 0); // Shift amount: size-0 element at index << op.Size via the Sx helper - presumably the sign-extended low byte of Rm's element; confirm against the helper.
+
+                Operand e = EmitShlRegOp(context, ne, context.ConvertI64ToI32(me), op.Size, signed); // EmitShlRegOp asserts an I32 shift operand, hence the conversion.
+
+                res = EmitVectorInsert(context, res, e, index, op.Size);
+            }
+
+            context.Copy(GetVec(op.Rd), res);
+        }
     }
 }

+ 6 - 0
ARMeilleure/Instructions/InstName.cs

@@ -212,14 +212,18 @@ namespace ARMeilleure.Instructions
         Fmax_V,
         Fmaxnm_S,
         Fmaxnm_V,
+        Fmaxnmp_V,
         Fmaxnmv_V,
         Fmaxp_V,
+        Fmaxv_V,
         Fmin_S,
         Fmin_V,
         Fminnm_S,
         Fminnm_V,
+        Fminnmp_V,
         Fminnmv_V,
         Fminp_V,
+        Fminv_V,
         Fmla_Se,
         Fmla_V,
         Fmla_Ve,
@@ -378,6 +382,7 @@ namespace ARMeilleure.Instructions
         Srshr_V,
         Srsra_S,
         Srsra_V,
+        Sshl_S,
         Sshl_V,
         Sshll_V,
         Sshr_S,
@@ -444,6 +449,7 @@ namespace ARMeilleure.Instructions
         Urshr_V,
         Ursra_S,
         Ursra_V,
+        Ushl_S,
         Ushl_V,
         Ushll_V,
         Ushr_S,

+ 2 - 2
ARMeilleure/Translation/PTC/Ptc.cs

@@ -19,8 +19,8 @@ namespace ARMeilleure.Translation.PTC
     public static class Ptc
     {
         private const string HeaderMagic = "PTChd";
-
-        private const int InternalVersion = 9; //! To be incremented manually for each change to the ARMeilleure project.
+      
+        private const int InternalVersion = 10; //! To be incremented manually for each change to the ARMeilleure project.
 
         private const string BaseDir = "Ryujinx";
 

+ 102 - 45
Ryujinx.Tests/Cpu/CpuTest.cs

@@ -12,10 +12,14 @@ namespace Ryujinx.Tests.Cpu
     [TestFixture]
     public class CpuTest
     {
-        private ulong _currAddress;
-        private ulong _size;
+        protected const ulong Size = 0x1000;
+        protected const ulong CodeBaseAddress = 0x1000;
+        protected const ulong DataBaseAddress = CodeBaseAddress + Size;
+
+        private const bool Ignore_FpcrFz_FpcrDn = false;
+        private const bool IgnoreAllExcept_FpsrQc = false;
 
-        private ulong _entryPoint;
+        private ulong _currAddress;
 
         private MemoryBlock _ram;
 
@@ -28,6 +32,8 @@ namespace Ryujinx.Tests.Cpu
         private static bool _unicornAvailable;
         private UnicornAArch64 _unicornEmu;
 
+        private bool _usingMemory;
+
         static CpuTest()
         {
             _unicornAvailable = UnicornAArch64.IsAvailable();
@@ -41,14 +47,11 @@ namespace Ryujinx.Tests.Cpu
         [SetUp]
         public void Setup()
         {
-            _currAddress = 0x1000;
-            _size        = 0x1000;
-
-            _entryPoint = _currAddress;
+            _currAddress = CodeBaseAddress;
 
-            _ram = new MemoryBlock(_size);
-            _memory = new MemoryManager(_ram, 1UL << 16);
-            _memory.Map(_currAddress, 0, _size);
+            _ram = new MemoryBlock(Size * 2);
+            _memory = new MemoryManager(_ram, 1ul << 16);
+            _memory.Map(CodeBaseAddress, 0, Size * 2);
 
             _context = CpuContext.CreateExecutionContext();
 
@@ -57,8 +60,9 @@ namespace Ryujinx.Tests.Cpu
             if (_unicornAvailable)
             {
                 _unicornEmu = new UnicornAArch64();
-                _unicornEmu.MemoryMap(_currAddress, _size, MemoryPermission.READ | MemoryPermission.EXEC);
-                _unicornEmu.PC = _entryPoint;
+                _unicornEmu.MemoryMap(CodeBaseAddress, Size, MemoryPermission.READ | MemoryPermission.EXEC);
+                _unicornEmu.MemoryMap(DataBaseAddress, Size, MemoryPermission.READ | MemoryPermission.WRITE);
+                _unicornEmu.PC = CodeBaseAddress;
             }
         }
 
@@ -73,6 +77,8 @@ namespace Ryujinx.Tests.Cpu
             _context    = null;
             _cpuContext = null;
             _unicornEmu = null;
+
+            _usingMemory = false;
         }
 
         protected void Reset()
@@ -169,11 +175,11 @@ namespace Ryujinx.Tests.Cpu
 
         protected void ExecuteOpcodes(bool runUnicorn = true)
         {
-            _cpuContext.Execute(_context, _entryPoint);
+            _cpuContext.Execute(_context, CodeBaseAddress);
 
             if (_unicornAvailable && runUnicorn)
             {
-                _unicornEmu.RunForCount((_currAddress - _entryPoint - 4) / 4);
+                _unicornEmu.RunForCount((_currAddress - CodeBaseAddress - 4) / 4);
             }
         }
 
@@ -199,6 +205,11 @@ namespace Ryujinx.Tests.Cpu
                                                 int   fpsr       = 0,
                                                 bool  runUnicorn = true)
         {
+            if (Ignore_FpcrFz_FpcrDn)
+            {
+                fpcr &= ~((int)FPCR.Fz | (int)FPCR.Dn);
+            }
+
             Opcode(opcode);
             Opcode(0xD65F03C0); // RET
             SetContext(x0, x1, x2, x3, x31, v0, v1, v2, v3, v4, v5, v30, v31, overflow, carry, zero, negative, fpcr, fpsr);
@@ -207,6 +218,30 @@ namespace Ryujinx.Tests.Cpu
             return GetContext();
         }
 
+        protected void SetWorkingMemory(ulong offset, byte[] data) // Writes a byte array at DataBaseAddress + offset in the test's working memory.
+        {
+            _memory.Write(DataBaseAddress + offset, data);
+
+            if (_unicornAvailable) // Mirror the same write into the Unicorn reference emulator when it is available.
+            {
+                _unicornEmu.MemoryWrite(DataBaseAddress + offset, data);
+            }
+
+            _usingMemory = true; // When true, CompareAgainstUnicorn checks the working memory for equality too.
+        }
+
+        protected void SetWorkingMemory(ulong offset, byte data) // Single-byte overload: writes one byte at DataBaseAddress + offset.
+        {
+            _memory.Write(DataBaseAddress + offset, data);
+
+            if (_unicornAvailable) // Mirror the same write into the Unicorn reference emulator when it is available.
+            {
+                _unicornEmu.MemoryWrite8(DataBaseAddress + offset, data);
+            }
+
+            _usingMemory = true; // When true, CompareAgainstUnicorn checks the working memory for equality too.
+        }
+
         /// <summary>Rounding Mode control field.</summary>
         public enum RMode
         {
@@ -284,15 +319,20 @@ namespace Ryujinx.Tests.Cpu
                 return;
             }
 
+            if (IgnoreAllExcept_FpsrQc)
+            {
+                fpsrMask &= Fpsr.Qc;
+            }
+
             if (fpSkips != FpSkips.None)
             {
                 ManageFpSkips(fpSkips);
             }
 
-            Assert.That(_context.GetX(0),  Is.EqualTo(_unicornEmu.X[0]));
-            Assert.That(_context.GetX(1),  Is.EqualTo(_unicornEmu.X[1]));
-            Assert.That(_context.GetX(2),  Is.EqualTo(_unicornEmu.X[2]));
-            Assert.That(_context.GetX(3),  Is.EqualTo(_unicornEmu.X[3]));
+            Assert.That(_context.GetX(0),  Is.EqualTo(_unicornEmu.X[0]), "X0");
+            Assert.That(_context.GetX(1),  Is.EqualTo(_unicornEmu.X[1]), "X1");
+            Assert.That(_context.GetX(2),  Is.EqualTo(_unicornEmu.X[2]), "X2");
+            Assert.That(_context.GetX(3),  Is.EqualTo(_unicornEmu.X[3]), "X3");
             Assert.That(_context.GetX(4),  Is.EqualTo(_unicornEmu.X[4]));
             Assert.That(_context.GetX(5),  Is.EqualTo(_unicornEmu.X[5]));
             Assert.That(_context.GetX(6),  Is.EqualTo(_unicornEmu.X[6]));
@@ -321,21 +361,21 @@ namespace Ryujinx.Tests.Cpu
             Assert.That(_context.GetX(29), Is.EqualTo(_unicornEmu.X[29]));
             Assert.That(_context.GetX(30), Is.EqualTo(_unicornEmu.X[30]));
 
-            Assert.That(_context.GetX(31), Is.EqualTo(_unicornEmu.SP));
+            Assert.That(_context.GetX(31), Is.EqualTo(_unicornEmu.SP), "X31");
 
             if (fpTolerances == FpTolerances.None)
             {
-                Assert.That(V128ToSimdValue(_context.GetV(0)), Is.EqualTo(_unicornEmu.Q[0]));
+                Assert.That(V128ToSimdValue(_context.GetV(0)), Is.EqualTo(_unicornEmu.Q[0]), "V0");
             }
             else
             {
                 ManageFpTolerances(fpTolerances);
             }
-            Assert.That(V128ToSimdValue(_context.GetV(1)),  Is.EqualTo(_unicornEmu.Q[1]));
-            Assert.That(V128ToSimdValue(_context.GetV(2)),  Is.EqualTo(_unicornEmu.Q[2]));
-            Assert.That(V128ToSimdValue(_context.GetV(3)),  Is.EqualTo(_unicornEmu.Q[3]));
-            Assert.That(V128ToSimdValue(_context.GetV(4)),  Is.EqualTo(_unicornEmu.Q[4]));
-            Assert.That(V128ToSimdValue(_context.GetV(5)),  Is.EqualTo(_unicornEmu.Q[5]));
+            Assert.That(V128ToSimdValue(_context.GetV(1)),  Is.EqualTo(_unicornEmu.Q[1]), "V1");
+            Assert.That(V128ToSimdValue(_context.GetV(2)),  Is.EqualTo(_unicornEmu.Q[2]), "V2");
+            Assert.That(V128ToSimdValue(_context.GetV(3)),  Is.EqualTo(_unicornEmu.Q[3]), "V3");
+            Assert.That(V128ToSimdValue(_context.GetV(4)),  Is.EqualTo(_unicornEmu.Q[4]), "V4");
+            Assert.That(V128ToSimdValue(_context.GetV(5)),  Is.EqualTo(_unicornEmu.Q[5]), "V5");
             Assert.That(V128ToSimdValue(_context.GetV(6)),  Is.EqualTo(_unicornEmu.Q[6]));
             Assert.That(V128ToSimdValue(_context.GetV(7)),  Is.EqualTo(_unicornEmu.Q[7]));
             Assert.That(V128ToSimdValue(_context.GetV(8)),  Is.EqualTo(_unicornEmu.Q[8]));
@@ -360,16 +400,27 @@ namespace Ryujinx.Tests.Cpu
             Assert.That(V128ToSimdValue(_context.GetV(27)), Is.EqualTo(_unicornEmu.Q[27]));
             Assert.That(V128ToSimdValue(_context.GetV(28)), Is.EqualTo(_unicornEmu.Q[28]));
             Assert.That(V128ToSimdValue(_context.GetV(29)), Is.EqualTo(_unicornEmu.Q[29]));
-            Assert.That(V128ToSimdValue(_context.GetV(30)), Is.EqualTo(_unicornEmu.Q[30]));
-            Assert.That(V128ToSimdValue(_context.GetV(31)), Is.EqualTo(_unicornEmu.Q[31]));
+            Assert.That(V128ToSimdValue(_context.GetV(30)), Is.EqualTo(_unicornEmu.Q[30]), "V30");
+            Assert.That(V128ToSimdValue(_context.GetV(31)), Is.EqualTo(_unicornEmu.Q[31]), "V31");
+
+            Assert.That((int)_context.Fpcr,                 Is.EqualTo(_unicornEmu.Fpcr),                 "Fpcr");
+            Assert.That((int)_context.Fpsr & (int)fpsrMask, Is.EqualTo(_unicornEmu.Fpsr & (int)fpsrMask), "Fpsr");
 
-            Assert.That((int)_context.Fpcr,                 Is.EqualTo(_unicornEmu.Fpcr));
-            Assert.That((int)_context.Fpsr & (int)fpsrMask, Is.EqualTo(_unicornEmu.Fpsr & (int)fpsrMask));
+            Assert.Multiple(() =>
+            {
+                Assert.That(_context.GetPstateFlag(PState.VFlag), Is.EqualTo(_unicornEmu.OverflowFlag), "VFlag");
+                Assert.That(_context.GetPstateFlag(PState.CFlag), Is.EqualTo(_unicornEmu.CarryFlag),    "CFlag");
+                Assert.That(_context.GetPstateFlag(PState.ZFlag), Is.EqualTo(_unicornEmu.ZeroFlag),     "ZFlag");
+                Assert.That(_context.GetPstateFlag(PState.NFlag), Is.EqualTo(_unicornEmu.NegativeFlag), "NFlag");
+            });
+
+            if (_usingMemory)
+            {
+                byte[] mem = _memory.GetSpan(DataBaseAddress, (int)Size).ToArray();
+                byte[] unicornMem = _unicornEmu.MemoryRead(DataBaseAddress, Size);
 
-            Assert.That(_context.GetPstateFlag(PState.VFlag), Is.EqualTo(_unicornEmu.OverflowFlag));
-            Assert.That(_context.GetPstateFlag(PState.CFlag), Is.EqualTo(_unicornEmu.CarryFlag));
-            Assert.That(_context.GetPstateFlag(PState.ZFlag), Is.EqualTo(_unicornEmu.ZeroFlag));
-            Assert.That(_context.GetPstateFlag(PState.NFlag), Is.EqualTo(_unicornEmu.NegativeFlag));
+                Assert.That(mem, Is.EqualTo(unicornMem), "Data");
+            }
         }
 
         private void ManageFpSkips(FpSkips fpSkips)
@@ -418,14 +469,17 @@ namespace Ryujinx.Tests.Cpu
                     if (IsNormalOrSubnormalS(_unicornEmu.Q[0].AsFloat()) &&
                         IsNormalOrSubnormalS(_context.GetV(0).As<float>()))
                     {
-                        Assert.That   (_context.GetV(0).Extract<float>(0),
-                            Is.EqualTo(_unicornEmu.Q[0].GetFloat(0)).Within(1).Ulps);
-                        Assert.That   (_context.GetV(0).Extract<float>(1),
-                            Is.EqualTo(_unicornEmu.Q[0].GetFloat(1)).Within(1).Ulps);
-                        Assert.That   (_context.GetV(0).Extract<float>(2),
-                            Is.EqualTo(_unicornEmu.Q[0].GetFloat(2)).Within(1).Ulps);
-                        Assert.That   (_context.GetV(0).Extract<float>(3),
-                            Is.EqualTo(_unicornEmu.Q[0].GetFloat(3)).Within(1).Ulps);
+                        Assert.Multiple(() =>
+                        {
+                            Assert.That   (_context.GetV(0).Extract<float>(0),
+                                Is.EqualTo(_unicornEmu.Q[0].GetFloat(0)).Within(1).Ulps, "V0[0]");
+                            Assert.That   (_context.GetV(0).Extract<float>(1),
+                                Is.EqualTo(_unicornEmu.Q[0].GetFloat(1)).Within(1).Ulps, "V0[1]");
+                            Assert.That   (_context.GetV(0).Extract<float>(2),
+                                Is.EqualTo(_unicornEmu.Q[0].GetFloat(2)).Within(1).Ulps, "V0[2]");
+                            Assert.That   (_context.GetV(0).Extract<float>(3),
+                                Is.EqualTo(_unicornEmu.Q[0].GetFloat(3)).Within(1).Ulps, "V0[3]");
+                        });
 
                         Console.WriteLine(fpTolerances);
                     }
@@ -440,10 +494,13 @@ namespace Ryujinx.Tests.Cpu
                     if (IsNormalOrSubnormalD(_unicornEmu.Q[0].AsDouble()) &&
                         IsNormalOrSubnormalD(_context.GetV(0).As<double>()))
                     {
-                        Assert.That   (_context.GetV(0).Extract<double>(0),
-                            Is.EqualTo(_unicornEmu.Q[0].GetDouble(0)).Within(1).Ulps);
-                        Assert.That   (_context.GetV(0).Extract<double>(1),
-                            Is.EqualTo(_unicornEmu.Q[0].GetDouble(1)).Within(1).Ulps);
+                        Assert.Multiple(() =>
+                        {
+                            Assert.That   (_context.GetV(0).Extract<double>(0),
+                                Is.EqualTo(_unicornEmu.Q[0].GetDouble(0)).Within(1).Ulps, "V0[0]");
+                            Assert.That   (_context.GetV(0).Extract<double>(1),
+                                Is.EqualTo(_unicornEmu.Q[0].GetDouble(1)).Within(1).Ulps, "V0[1]");
+                        });
 
                         Console.WriteLine(fpTolerances);
                     }

+ 123 - 0
Ryujinx.Tests/Cpu/CpuTestMisc.cs

@@ -4,15 +4,67 @@ using ARMeilleure.State;
 
 using NUnit.Framework;
 
+using System;
+using System.Collections.Generic;
+
 namespace Ryujinx.Tests.Cpu
 {
     [Category("Misc")]
     public sealed class CpuTestMisc : CpuTest
     {
 #if Misc
+
+#region "ValueSource (Types)"
+        private static IEnumerable<ulong> _1S_F_() // Value source: single-precision bit patterns carried in the low 32 bits of a ulong.
+        {
+            yield return 0x00000000FF7FFFFFul; // -Max Normal    (float.MinValue)
+            yield return 0x0000000080800000ul; // -Min Normal
+            yield return 0x00000000807FFFFFul; // -Max Subnormal
+            yield return 0x0000000080000001ul; // -Min Subnormal (-float.Epsilon)
+            yield return 0x000000007F7FFFFFul; // +Max Normal    (float.MaxValue)
+            yield return 0x0000000000800000ul; // +Min Normal
+            yield return 0x00000000007FFFFFul; // +Max Subnormal
+            yield return 0x0000000000000001ul; // +Min Subnormal (float.Epsilon)
+
+            if (!NoZeros) // Zeros are emitted unless the NoZeros switch is set.
+            {
+                yield return 0x0000000080000000ul; // -Zero
+                yield return 0x0000000000000000ul; // +Zero
+            }
+
+            if (!NoInfs) // Infinities are emitted unless the NoInfs switch is set.
+            {
+                yield return 0x00000000FF800000ul; // -Infinity
+                yield return 0x000000007F800000ul; // +Infinity
+            }
+
+            if (!NoNaNs) // NaNs are emitted unless the NoNaNs switch is set.
+            {
+                yield return 0x00000000FFC00000ul; // -QNaN (all zeros payload) (float.NaN)
+                yield return 0x00000000FFBFFFFFul; // -SNaN (all ones  payload)
+                yield return 0x000000007FC00000ul; // +QNaN (all zeros payload) (-float.NaN) (DefaultNaN)
+                yield return 0x000000007FBFFFFFul; // +SNaN (all ones  payload)
+            }
+
+            for (int cnt = 1; cnt <= RndCnt; cnt++) // RndCnt rounds, two random values per round.
+            {
+                ulong grbg = TestContext.CurrentContext.Random.NextUInt(); // Random filler for the upper 32 bits.
+                ulong rnd1 = GenNormalS(); // Presumably a random normal single in the low 32 bits - confirm against GenNormalS.
+                ulong rnd2 = GenSubnormalS(); // Presumably a random subnormal single in the low 32 bits - confirm against GenSubnormalS.
+
+                yield return (grbg << 32) | rnd1;
+                yield return (grbg << 32) | rnd2;
+            }
+        }
+#endregion
+
         private const int RndCnt    = 2;
         private const int RndCntImm = 2;
 
+        private static readonly bool NoZeros = false;
+        private static readonly bool NoInfs  = false;
+        private static readonly bool NoNaNs  = false;
+
 #region "AluImm & Csel"
         [Test, Pairwise]
         public void Adds_Csinc_64bit([Values(0x0000000000000000ul, 0x7FFFFFFFFFFFFFFFul,
@@ -357,6 +409,77 @@ namespace Ryujinx.Tests.Cpu
 
             Assert.That(context.GetX(0), Is.EqualTo(a));
         }
+
+        [Explicit]
+        [Test, Pairwise]
+        public void Misc4([ValueSource("_1S_F_")] ulong a, // a seeds V0; b and c are stored to working memory and loaded by the two LDRs below.
+                          [ValueSource("_1S_F_")] ulong b,
+                          [ValueSource("_1S_F_")] ulong c,
+                          [Values(0ul, 1ul, 2ul, 3ul)] ulong displacement) // Byte offset (0-3) added to the data pointer placed in X0.
+        {
+            if (!BitConverter.IsLittleEndian) // BitConverter.GetBytes output is host-endian, so the test is ignored on big-endian hosts.
+            {
+                Assert.Ignore();
+            }
+
+            for (ulong gapOffset = 0; gapOffset < displacement; gapOffset++) // Fill the bytes below the displaced data with random values.
+            {
+                SetWorkingMemory(gapOffset, TestContext.CurrentContext.Random.NextByte());
+            }
+
+            SetWorkingMemory(0x0 + displacement, BitConverter.GetBytes((uint)b)); // Low 32 bits of b at [X0 + 0].
+
+            SetWorkingMemory(0x4 + displacement, BitConverter.GetBytes((uint)c)); // Low 32 bits of c at [X0 + 4].
+
+            SetWorkingMemory(0x8 + displacement, TestContext.CurrentContext.Random.NextByte()); // Random bytes at [X0 + 8..11], the slot the STR below overwrites.
+            SetWorkingMemory(0x9 + displacement, TestContext.CurrentContext.Random.NextByte());
+            SetWorkingMemory(0xA + displacement, TestContext.CurrentContext.Random.NextByte());
+            SetWorkingMemory(0xB + displacement, TestContext.CurrentContext.Random.NextByte());
+
+            SetContext(
+                x0: DataBaseAddress + displacement,
+                v0: MakeVectorE0E1(a, TestContext.CurrentContext.Random.NextULong()),
+                v1: MakeVectorE0E1(TestContext.CurrentContext.Random.NextULong(), TestContext.CurrentContext.Random.NextULong()),
+                v2: MakeVectorE0E1(TestContext.CurrentContext.Random.NextULong(), TestContext.CurrentContext.Random.NextULong()),
+                overflow: TestContext.CurrentContext.Random.NextBool(),
+                carry:    TestContext.CurrentContext.Random.NextBool(),
+                zero:     TestContext.CurrentContext.Random.NextBool(),
+                negative: TestContext.CurrentContext.Random.NextBool());
+
+            Opcode(0xBD400001); // LDR   S1, [X0,#0]
+            Opcode(0xBD400402); // LDR   S2, [X0,#4]
+            Opcode(0x1E215801); // FMIN  S1, S0, S1
+            Opcode(0x1E222000); // FCMP  S0, S2
+            Opcode(0x1E214C40); // FCSEL S0, S2, S1, MI
+            Opcode(0xBD000800); // STR   S0, [X0,#8]
+            Opcode(0xD65F03C0); // RET
+            ExecuteOpcodes();
+
+            CompareAgainstUnicorn(); // SetWorkingMemory set _usingMemory, so the data region is compared as well.
+        }
+
+        [Explicit]
+        [Test]
+        public void Misc5([ValueSource("_1S_F_")] ulong a) // Runs an FCMP/FMOV/FMIN/FCSEL sequence on S0 seeded from a and compares the result with Unicorn.
+        {
+            SetContext(
+                v0: MakeVectorE0E1(a, TestContext.CurrentContext.Random.NextULong()),
+                v1: MakeVectorE0E1(TestContext.CurrentContext.Random.NextULong(), TestContext.CurrentContext.Random.NextULong()),
+                overflow: TestContext.CurrentContext.Random.NextBool(),
+                carry:    TestContext.CurrentContext.Random.NextBool(),
+                zero:     TestContext.CurrentContext.Random.NextBool(),
+                negative: TestContext.CurrentContext.Random.NextBool());
+
+            Opcode(0x1E202008); // FCMP  S0, #0.0
+            Opcode(0x1E2E1001); // FMOV  S1, #1.0
+            Opcode(0x1E215800); // FMIN  S0, S0, S1
+            Opcode(0x1E2703E1); // FMOV  S1, WZR
+            Opcode(0x1E204C20); // FCSEL S0, S1, S0, MI
+            Opcode(0xD65F03C0); // RET
+            ExecuteOpcodes();
+
+            CompareAgainstUnicorn(); // No working memory is written here, so only registers, flags and FP status are compared.
+        }
 #endif
     }
 }

+ 3 - 1
Ryujinx.Tests/Cpu/CpuTestSimd.cs

@@ -918,7 +918,9 @@ namespace Ryujinx.Tests.Cpu
             return new uint[]
             {
                 0x6E30C800u, // FMAXNMV S0, V0.4S
-                0x6EB0C800u  // FMINNMV S0, V0.4S
+                0x6E30F800u, // FMAXV   S0, V0.4S
+                0x6EB0C800u, // FMINNMV S0, V0.4S
+                0x6EB0F800u  // FMINV   S0, V0.4S
             };
         }
 

+ 45 - 12
Ryujinx.Tests/Cpu/CpuTestSimdReg.cs

@@ -373,12 +373,14 @@ namespace Ryujinx.Tests.Cpu
         {
             return new uint[]
             {
-                0x0E20F400u, // FMAX   V0.2S, V0.2S, V0.2S
-                0x0E20C400u, // FMAXNM V0.2S, V0.2S, V0.2S
-                0x2E20F400u, // FMAXP  V0.2S, V0.2S, V0.2S
-                0x0EA0F400u, // FMIN   V0.2S, V0.2S, V0.2S
-                0x0EA0C400u, // FMINNM V0.2S, V0.2S, V0.2S
-                0x2EA0F400u  // FMINP  V0.2S, V0.2S, V0.2S
+                0x0E20F400u, // FMAX    V0.2S, V0.2S, V0.2S
+                0x0E20C400u, // FMAXNM  V0.2S, V0.2S, V0.2S
+                0x2E20C400u, // FMAXNMP V0.2S, V0.2S, V0.2S
+                0x2E20F400u, // FMAXP   V0.2S, V0.2S, V0.2S
+                0x0EA0F400u, // FMIN    V0.2S, V0.2S, V0.2S
+                0x0EA0C400u, // FMINNM  V0.2S, V0.2S, V0.2S
+                0x2EA0C400u, // FMINNMP V0.2S, V0.2S, V0.2S
+                0x2EA0F400u  // FMINP   V0.2S, V0.2S, V0.2S
             };
         }
 
@@ -386,12 +388,14 @@ namespace Ryujinx.Tests.Cpu
         {
             return new uint[]
             {
-                0x4E60F400u, // FMAX   V0.2D, V0.2D, V0.2D
-                0x4E60C400u, // FMAXNM V0.2D, V0.2D, V0.2D
-                0x6E60F400u, // FMAXP  V0.2D, V0.2D, V0.2D
-                0x4EE0F400u, // FMIN   V0.2D, V0.2D, V0.2D
-                0x4EE0C400u, // FMINNM V0.2D, V0.2D, V0.2D
-                0x6EE0F400u  // FMINP  V0.2D, V0.2D, V0.2D
+                0x4E60F400u, // FMAX    V0.2D, V0.2D, V0.2D
+                0x4E60C400u, // FMAXNM  V0.2D, V0.2D, V0.2D
+                0x6E60C400u, // FMAXNMP V0.2D, V0.2D, V0.2D
+                0x6E60F400u, // FMAXP   V0.2D, V0.2D, V0.2D
+                0x4EE0F400u, // FMIN    V0.2D, V0.2D, V0.2D
+                0x4EE0C400u, // FMINNM  V0.2D, V0.2D, V0.2D
+                0x6EE0C400u, // FMINNMP V0.2D, V0.2D, V0.2D
+                0x6EE0F400u  // FMINP   V0.2D, V0.2D, V0.2D
             };
         }
 
@@ -531,6 +535,15 @@ namespace Ryujinx.Tests.Cpu
             };
         }
 
+        private static uint[] _ShlReg_S_D_() // Base encodings (register fields all zero) for scalar SSHL/USHL on D registers; the ShlReg_S_D test ORs in Rd/Rn/Rm.
+        {
+            return new uint[]
+            {
+                0x5EE04400u, // SSHL D0, D0, D0
+                0x7EE04400u  // USHL D0, D0, D0
+            };
+        }
+
         private static uint[] _ShlReg_V_8B_4H_2S_()
         {
             return new uint[]
@@ -2820,6 +2833,26 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn();
         }
 
+        [Test, Pairwise]
+        public void ShlReg_S_D([ValueSource("_ShlReg_S_D_")] uint opcodes, // Checks scalar SSHL/USHL (D form) against Unicorn.
+                               [Values(0u)]     uint rd,
+                               [Values(1u, 0u)] uint rn,
+                               [Values(2u, 0u)] uint rm,
+                               [ValueSource("_1D_")] [Random(RndCnt)] ulong z,
+                               [ValueSource("_1D_")] [Random(RndCnt)] ulong a,
+                               [ValueSource("_1D_")] [Random(0ul, 255ul, RndCnt)] ulong b)
+        {
+            opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); // Splice the 5-bit register numbers in: Rm at bit 16, Rn at bit 5, Rd at bit 0.
+
+            V128 v0 = MakeVectorE0E1(z, z); // V0 (rd is always 0) is preloaded with z; presumably both 64-bit elements - confirm against MakeVectorE0E1.
+            V128 v1 = MakeVectorE0(a);
+            V128 v2 = MakeVectorE0(b);
+
+            SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2);
+
+            CompareAgainstUnicorn(fpsrMask: Fpsr.Qc); // FPSR is compared only under the Qc mask.
+        }
+
         [Test, Pairwise]
         public void ShlReg_V_8B_4H_2S([ValueSource("_ShlReg_V_8B_4H_2S_")] uint opcodes,
                                       [Values(0u)]     uint rd,