Просмотр исходного кода

CPU (A64): Add FP16/FP32 fast paths (F16C Intrinsics) for Fcvt_S, Fcvtl_V & Fcvtn_V Instructions. Now HardwareCapabilities uses CpuId. (#1650)

* net5.0

* CPU (A64): Add FP16/FP32 fast paths (F16C Intrinsics) for Fcvt_S, Fcvtl_V & Fcvtn_V Instructions. Switch to .NET 5.0.

Nits.

Tests performed successfully in both debug and release mode (for all instructions involved).

* Address comment.

* Update appveyor.yml

* Revert "Update appveyor.yml"

This reverts commit 27cdd59e8b90e227e6924d9c162af26c00a89013.

* Remove Assembler CpuId.

* Update appveyor.yml

* Address comment.
LDj3SNuD 5 лет назад
Родитель
Commit
0679084f11

+ 2 - 6
ARMeilleure/CodeGen/X86/Assembler.cs

@@ -104,7 +104,6 @@ namespace ARMeilleure.CodeGen.X86
             Add(X86Instruction.Cmpxchg8,   new InstructionInfo(0x00000fb0, BadOp,      BadOp,      BadOp,      BadOp,      InstructionFlags.Reg8Src));
             Add(X86Instruction.Comisd,     new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000f2f, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Comiss,     new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000f2f, InstructionFlags.Vex));
-            Add(X86Instruction.Cpuid,      new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000fa2, InstructionFlags.RegOnly));
             Add(X86Instruction.Crc32,      new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38f1, InstructionFlags.PrefixF2));
             Add(X86Instruction.Crc32_16,   new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38f1, InstructionFlags.PrefixF2 | InstructionFlags.Prefix66));
             Add(X86Instruction.Crc32_8,    new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f38f0, InstructionFlags.PrefixF2 | InstructionFlags.Reg8Src));
@@ -270,6 +269,8 @@ namespace ARMeilleure.CodeGen.X86
             Add(X86Instruction.Unpcklps,   new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000f14, InstructionFlags.Vex));
             Add(X86Instruction.Vblendvpd,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f3a4b, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vblendvps,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f3a4a, InstructionFlags.Vex | InstructionFlags.Prefix66));
+            Add(X86Instruction.Vcvtph2ps,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f3813, InstructionFlags.Vex | InstructionFlags.Prefix66));
+            Add(X86Instruction.Vcvtps2ph,  new InstructionInfo(0x000f3a1d, BadOp,      BadOp,      BadOp,      BadOp,      InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vpblendvb,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f3a4c, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Xor,        new InstructionInfo(0x00000031, 0x06000083, 0x06000081, BadOp,      0x00000033, InstructionFlags.None));
             Add(X86Instruction.Xorpd,      new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000f57, InstructionFlags.Vex | InstructionFlags.Prefix66));
@@ -386,11 +387,6 @@ namespace ARMeilleure.CodeGen.X86
             WriteInstruction(src1, null, src2, X86Instruction.Comiss);
         }
 
-        public void Cpuid()
-        {
-            WriteInstruction(null, null, OperandType.None, X86Instruction.Cpuid);
-        }
-
         public void Cvtsd2ss(Operand dest, Operand src1, Operand src2)
         {
             WriteInstruction(dest, src1, src2, X86Instruction.Cvtsd2ss);

+ 51 - 11
ARMeilleure/CodeGen/X86/HardwareCapabilities.cs

@@ -1,20 +1,60 @@
+using System;
 using System.Runtime.Intrinsics.X86;
 
 namespace ARMeilleure.CodeGen.X86
 {
     static class HardwareCapabilities
     {
-        public static bool SupportsSse => Sse.IsSupported;
-        public static bool SupportsSse2 => Sse2.IsSupported;
-        public static bool SupportsSse3 => Sse3.IsSupported;
-        public static bool SupportsSsse3 => Ssse3.IsSupported;
-        public static bool SupportsSse41 => Sse41.IsSupported;
-        public static bool SupportsSse42 => Sse42.IsSupported;
-        public static bool SupportsPclmulqdq => Pclmulqdq.IsSupported;
-        public static bool SupportsFma => Fma.IsSupported;
-        public static bool SupportsPopcnt => Popcnt.IsSupported;
-        public static bool SupportsAesni => Aes.IsSupported;
-        public static bool SupportsAvx => Avx.IsSupported;
+        static HardwareCapabilities()
+        {
+            if (!X86Base.IsSupported)
+            {
+                return;
+            }
+
+            (_, _, int ecx, int edx) = X86Base.CpuId(0x00000001, 0x00000000);
+
+            FeatureInfoEdx = (FeatureFlagsEdx)edx;
+            FeatureInfoEcx = (FeatureFlagsEcx)ecx;
+        }
+
+        [Flags]
+        public enum FeatureFlagsEdx
+        {
+            Sse = 1 << 25,
+            Sse2 = 1 << 26
+        }
+
+        [Flags]
+        public enum FeatureFlagsEcx
+        {
+            Sse3 = 1 << 0,
+            Pclmulqdq = 1 << 1,
+            Ssse3 = 1 << 9,
+            Fma = 1 << 12,
+            Sse41 = 1 << 19,
+            Sse42 = 1 << 20,
+            Popcnt = 1 << 23,
+            Aes = 1 << 25,
+            Avx = 1 << 28,
+            F16c = 1 << 29
+        }
+
+        public static FeatureFlagsEdx FeatureInfoEdx { get; }
+        public static FeatureFlagsEcx FeatureInfoEcx { get; }
+
+        public static bool SupportsSse => FeatureInfoEdx.HasFlag(FeatureFlagsEdx.Sse);
+        public static bool SupportsSse2 => FeatureInfoEdx.HasFlag(FeatureFlagsEdx.Sse2);
+        public static bool SupportsSse3 => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Sse3);
+        public static bool SupportsPclmulqdq => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Pclmulqdq);
+        public static bool SupportsSsse3 => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Ssse3);
+        public static bool SupportsFma => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Fma);
+        public static bool SupportsSse41 => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Sse41);
+        public static bool SupportsSse42 => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Sse42);
+        public static bool SupportsPopcnt => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Popcnt);
+        public static bool SupportsAesni => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Aes);
+        public static bool SupportsAvx => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Avx);
+        public static bool SupportsF16c => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.F16c);
 
         public static bool ForceLegacySse { get; set; }
 

+ 2 - 0
ARMeilleure/CodeGen/X86/IntrinsicTable.cs

@@ -162,6 +162,8 @@ namespace ARMeilleure.CodeGen.X86
             Add(Intrinsic.X86Unpckhps,   new IntrinsicInfo(X86Instruction.Unpckhps,   IntrinsicType.Binary));
             Add(Intrinsic.X86Unpcklpd,   new IntrinsicInfo(X86Instruction.Unpcklpd,   IntrinsicType.Binary));
             Add(Intrinsic.X86Unpcklps,   new IntrinsicInfo(X86Instruction.Unpcklps,   IntrinsicType.Binary));
+            Add(Intrinsic.X86Vcvtph2ps,  new IntrinsicInfo(X86Instruction.Vcvtph2ps,  IntrinsicType.Unary));
+            Add(Intrinsic.X86Vcvtps2ph,  new IntrinsicInfo(X86Instruction.Vcvtps2ph,  IntrinsicType.BinaryImm));
             Add(Intrinsic.X86Xorpd,      new IntrinsicInfo(X86Instruction.Xorpd,      IntrinsicType.Binary));
             Add(Intrinsic.X86Xorps,      new IntrinsicInfo(X86Instruction.Xorps,      IntrinsicType.Binary));
         }

+ 2 - 1
ARMeilleure/CodeGen/X86/X86Instruction.cs

@@ -33,7 +33,6 @@ namespace ARMeilleure.CodeGen.X86
         Cmpxchg8,
         Comisd,
         Comiss,
-        Cpuid,
         Crc32,
         Crc32_16,
         Crc32_8,
@@ -199,6 +198,8 @@ namespace ARMeilleure.CodeGen.X86
         Unpcklps,
         Vblendvpd,
         Vblendvps,
+        Vcvtph2ps,
+        Vcvtps2ph,
         Vpblendvb,
         Xor,
         Xorpd,

+ 65 - 23
ARMeilleure/Instructions/InstEmitSimdCvt.cs

@@ -60,21 +60,48 @@ namespace ARMeilleure.Instructions
             }
             else if (op.Size == 0 && op.Opc == 3) // Single -> Half.
             {
-                Operand ne = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), 0);
+                if (Optimizations.UseF16c)
+                {
+                    Debug.Assert(!Optimizations.ForceLegacySse);
 
-                Operand res = context.Call(typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert)), ne);
+                    Operand n = GetVec(op.Rn);
 
-                res = context.ZeroExtend16(OperandType.I64, res);
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, n, Const(X86GetRoundControl(FPRoundingMode.ToNearest)));
+                            res = context.AddIntrinsic(Intrinsic.X86Pslldq, res, Const(14)); // VectorZeroUpper112()
+                            res = context.AddIntrinsic(Intrinsic.X86Psrldq, res, Const(14));
 
-                context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), res, 0, 1));
+                    context.Copy(GetVec(op.Rd), res);
+                }
+                else
+                {
+                    Operand ne = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), 0);
+
+                    Operand res = context.Call(typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert)), ne);
+
+                    res = context.ZeroExtend16(OperandType.I64, res);
+
+                    context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), res, 0, 1));
+                }
             }
             else if (op.Size == 3 && op.Opc == 0) // Half -> Single.
             {
-                Operand ne = EmitVectorExtractZx(context, op.Rn, 0, 1);
+                if (Optimizations.UseF16c)
+                {
+                    Debug.Assert(!Optimizations.ForceLegacySse);
+
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, GetVec(op.Rn));
+                            res = context.VectorZeroUpper96(res);
+
+                    context.Copy(GetVec(op.Rd), res);
+                }
+                else
+                {
+                    Operand ne = EmitVectorExtractZx(context, op.Rn, 0, 1);
 
-                Operand res = context.Call(typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert)), ne);
+                    Operand res = context.Call(typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert)), ne);
 
-                context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0));
+                    context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0));
+                }
             }
             else if (op.Size == 1 && op.Opc == 3) // Double -> Half.
             {
@@ -129,18 +156,20 @@ namespace ARMeilleure.Instructions
             if (Optimizations.UseSse2 && sizeF == 1)
             {
                 Operand n = GetVec(op.Rn);
-                Operand res;
 
-                if (op.RegisterSize == RegisterSize.Simd128)
-                {
-                    res = context.AddIntrinsic(Intrinsic.X86Movhlps, n, n);
-                }
-                else
-                {
-                    res = n;
-                }
+                Operand res = op.RegisterSize == RegisterSize.Simd128 ? context.AddIntrinsic(Intrinsic.X86Movhlps, n, n) : n;
+                        res = context.AddIntrinsic(Intrinsic.X86Cvtps2pd, res);
 
-                res = context.AddIntrinsic(Intrinsic.X86Cvtps2pd, res);
+                context.Copy(GetVec(op.Rd), res);
+            }
+            else if (Optimizations.UseF16c && sizeF == 0)
+            {
+                Debug.Assert(!Optimizations.ForceLegacySse);
+
+                Operand n = GetVec(op.Rn);
+
+                Operand res = op.RegisterSize == RegisterSize.Simd128 ? context.AddIntrinsic(Intrinsic.X86Movhlps, n, n) : n;
+                        res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, res);
 
                 context.Copy(GetVec(op.Rd), res);
             }
@@ -210,17 +239,30 @@ namespace ARMeilleure.Instructions
             {
                 Operand d = GetVec(op.Rd);
 
-                Operand res = context.VectorZeroUpper64(d);
+                Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 ? Intrinsic.X86Movlhps : Intrinsic.X86Movhlps;
 
                 Operand nInt = context.AddIntrinsic(Intrinsic.X86Cvtpd2ps, GetVec(op.Rn));
+                        nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt);
 
-                nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt);
+                Operand res = context.VectorZeroUpper64(d);
+                        res = context.AddIntrinsic(movInst, res, nInt);
+
+                context.Copy(d, res);
+            }
+            else if (Optimizations.UseF16c && sizeF == 0)
+            {
+                Debug.Assert(!Optimizations.ForceLegacySse);
 
-                Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128
-                    ? Intrinsic.X86Movlhps
-                    : Intrinsic.X86Movhlps;
+                Operand d = GetVec(op.Rd);
+                Operand n = GetVec(op.Rn);
+
+                Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 ? Intrinsic.X86Movlhps : Intrinsic.X86Movhlps;
 
-                res = context.AddIntrinsic(movInst, res, nInt);
+                Operand nInt = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, n, Const(X86GetRoundControl(FPRoundingMode.ToNearest)));
+                        nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt);
+
+                Operand res = context.VectorZeroUpper64(d);
+                        res = context.AddIntrinsic(movInst, res, nInt);
 
                 context.Copy(d, res);
             }

+ 2 - 0
ARMeilleure/IntermediateRepresentation/Intrinsic.cs

@@ -151,6 +151,8 @@ namespace ARMeilleure.IntermediateRepresentation
         X86Unpckhps,
         X86Unpcklpd,
         X86Unpcklps,
+        X86Vcvtph2ps,
+        X86Vcvtps2ph,
         X86Xorpd,
         X86Xorps
     }

+ 2 - 0
ARMeilleure/Optimizations.cs

@@ -14,6 +14,7 @@ namespace ARMeilleure
         public static bool UseSse42IfAvailable     { get; set; } = true;
         public static bool UsePopCntIfAvailable    { get; set; } = true;
         public static bool UseAvxIfAvailable       { get; set; } = true;
+        public static bool UseF16cIfAvailable      { get; set; } = true;
         public static bool UseAesniIfAvailable     { get; set; } = true;
         public static bool UsePclmulqdqIfAvailable { get; set; } = true;
 
@@ -31,6 +32,7 @@ namespace ARMeilleure
         internal static bool UseSse42     => UseSse42IfAvailable     && HardwareCapabilities.SupportsSse42;
         internal static bool UsePopCnt    => UsePopCntIfAvailable    && HardwareCapabilities.SupportsPopcnt;
         internal static bool UseAvx       => UseAvxIfAvailable       && HardwareCapabilities.SupportsAvx && !ForceLegacySse;
+        internal static bool UseF16c      => UseF16cIfAvailable      && HardwareCapabilities.SupportsF16c;
         internal static bool UseAesni     => UseAesniIfAvailable     && HardwareCapabilities.SupportsAesni;
         internal static bool UsePclmulqdq => UsePclmulqdqIfAvailable && HardwareCapabilities.SupportsPclmulqdq;
     }

+ 3 - 17
ARMeilleure/Translation/PTC/Ptc.cs

@@ -1,5 +1,6 @@
 using ARMeilleure.CodeGen;
 using ARMeilleure.CodeGen.Unwinding;
+using ARMeilleure.CodeGen.X86;
 using ARMeilleure.Memory;
 using Ryujinx.Common.Configuration;
 using Ryujinx.Common.Logging;
@@ -10,7 +11,6 @@ using System.Diagnostics;
 using System.IO;
 using System.IO.Compression;
 using System.Runtime.InteropServices;
-using System.Runtime.Intrinsics.X86;
 using System.Runtime.Serialization.Formatters.Binary;
 using System.Threading;
 using System.Threading.Tasks;
@@ -21,7 +21,7 @@ namespace ARMeilleure.Translation.PTC
     {
         private const string HeaderMagic = "PTChd";
 
-        private const int InternalVersion = 1273; //! To be incremented manually for each change to the ARMeilleure project.
+        private const int InternalVersion = 1650; //! To be incremented manually for each change to the ARMeilleure project.
 
         private const string ActualDir = "0";
         private const string BackupDir = "1";
@@ -646,21 +646,7 @@ namespace ARMeilleure.Translation.PTC
 
         private static ulong GetFeatureInfo()
         {
-            ulong featureInfo = 0ul;
-
-            featureInfo |= (Sse3.IsSupported      ? 1ul : 0ul) << 0;
-            featureInfo |= (Pclmulqdq.IsSupported ? 1ul : 0ul) << 1;
-            featureInfo |= (Ssse3.IsSupported     ? 1ul : 0ul) << 9;
-            featureInfo |= (Fma.IsSupported       ? 1ul : 0ul) << 12;
-            featureInfo |= (Sse41.IsSupported     ? 1ul : 0ul) << 19;
-            featureInfo |= (Sse42.IsSupported     ? 1ul : 0ul) << 20;
-            featureInfo |= (Popcnt.IsSupported    ? 1ul : 0ul) << 23;
-            featureInfo |= (Aes.IsSupported       ? 1ul : 0ul) << 25;
-            featureInfo |= (Avx.IsSupported       ? 1ul : 0ul) << 28;
-            featureInfo |= (Sse.IsSupported       ? 1ul : 0ul) << 57;
-            featureInfo |= (Sse2.IsSupported      ? 1ul : 0ul) << 58;
-
-            return featureInfo;
+            return (ulong)HardwareCapabilities.FeatureInfoEdx << 32 | (uint)HardwareCapabilities.FeatureInfoEcx;
         }
 
         private struct Header

+ 7 - 4
Ryujinx.Tests/Cpu/CpuTestSimd.cs

@@ -1973,15 +1973,18 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn();
         }
 
-        [Test, Pairwise] [Explicit]
+        [Test, Pairwise] [Explicit] // Unicorn seems to default all rounding modes to RMode.Rn.
         public void F_Cvt_S_SH([ValueSource("_F_Cvt_S_SH_")] uint opcodes,
-                               [ValueSource("_1S_F_")] ulong a)
+                               [ValueSource("_1S_F_")] ulong a,
+                               [Values(RMode.Rn)] RMode rMode)
         {
             ulong z = TestContext.CurrentContext.Random.NextULong();
             V128 v0 = MakeVectorE0E1(z, z);
             V128 v1 = MakeVectorE0(a);
 
-            SingleOpcode(opcodes, v0: v0, v1: v1);
+            int fpcr = (int)rMode << (int)Fpcr.RMode;
+
+            SingleOpcode(opcodes, v0: v0, v1: v1, fpcr: fpcr);
 
             CompareAgainstUnicorn();
         }
@@ -2134,7 +2137,7 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Ofc | Fpsr.Ufc | Fpsr.Ixc | Fpsr.Idc);
         }
 
-        [Test, Pairwise] [Explicit] // Unicorn seems to default all rounding modes to RMode.Rn.
+        [Test, Pairwise] [Explicit]
         public void F_Cvtn_V_2D2S_2D4S([ValueSource("_F_Cvtn_V_2D2S_2D4S_")] uint opcodes,
                                        [Values(0u)]     uint rd,
                                        [Values(1u, 0u)] uint rn,