Browse Source

Implement VCNT instruction (#1963)

* Implement VCNT based on AArch64 CNT

Add tests

* Update PTC version

* Address LDj's comments

* Explicit size in encoding
* Tighter tests
* Replace SoftFallback with IR helper

Co-authored-by: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>

* Reduce one BitwiseAnd from IR fallback

Based on popcount64b from https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation

* Rename parameter and add assert

Co-authored-by: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>

Co-authored-by: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>
mageven 5 years ago
parent
commit
9bda7b4699

+ 1 - 0
ARMeilleure/Decoders/OpCodeTable.cs

@@ -814,6 +814,7 @@ namespace ARMeilleure.Decoders
             SetA32("111100111x11xx01xxxx0x100xx0xxxx", InstName.Vclt,     InstEmit32.Vclt_Z,   OpCode32SimdCmpZ.Create);
             SetA32("111100111x11xx01xxxx0x100xx0xxxx", InstName.Vclt,     InstEmit32.Vclt_Z,   OpCode32SimdCmpZ.Create);
             SetA32("<<<<11101x11010xxxxx101x01x0xxxx", InstName.Vcmp,     InstEmit32.Vcmp,     OpCode32SimdS.Create);
             SetA32("<<<<11101x11010xxxxx101x01x0xxxx", InstName.Vcmp,     InstEmit32.Vcmp,     OpCode32SimdS.Create);
             SetA32("<<<<11101x11010xxxxx101x11x0xxxx", InstName.Vcmpe,    InstEmit32.Vcmpe,    OpCode32SimdS.Create);
             SetA32("<<<<11101x11010xxxxx101x11x0xxxx", InstName.Vcmpe,    InstEmit32.Vcmpe,    OpCode32SimdS.Create);
+            SetA32("111100111x110000xxxx01010xx0xxxx", InstName.Vcnt,     InstEmit32.Vcnt,     OpCode32SimdCmpZ.Create);
             SetA32("<<<<11101x110111xxxx101x11x0xxxx", InstName.Vcvt,     InstEmit32.Vcvt_FD,  OpCode32SimdS.Create); // FP 32 and 64, scalar.
             SetA32("<<<<11101x110111xxxx101x11x0xxxx", InstName.Vcvt,     InstEmit32.Vcvt_FD,  OpCode32SimdS.Create); // FP 32 and 64, scalar.
             SetA32("<<<<11101x11110xxxxx101x11x0xxxx", InstName.Vcvt,     InstEmit32.Vcvt_FI,  OpCode32SimdCvtFI.Create); // FP32 to int.
             SetA32("<<<<11101x11110xxxxx101x11x0xxxx", InstName.Vcvt,     InstEmit32.Vcvt_FI,  OpCode32SimdCvtFI.Create); // FP32 to int.
             SetA32("<<<<11101x111000xxxx101xx1x0xxxx", InstName.Vcvt,     InstEmit32.Vcvt_FI,  OpCode32SimdCvtFI.Create); // Int to FP32.
             SetA32("<<<<11101x111000xxxx101xx1x0xxxx", InstName.Vcvt,     InstEmit32.Vcvt_FI,  OpCode32SimdCvtFI.Create); // Int to FP32.

+ 1 - 1
ARMeilleure/Instructions/InstEmitSimdArithmetic.cs

@@ -289,7 +289,7 @@ namespace ARMeilleure.Instructions
                 }
                 }
                 else
                 else
                 {
                 {
-                    de = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountSetBits8)), ne);
+                    de = EmitCountSetBits8(context, ne);
                 }
                 }
 
 
                 res = EmitVectorInsert(context, res, de, index, 0);
                 res = EmitVectorInsert(context, res, de, index, 0);

+ 28 - 0
ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs

@@ -135,6 +135,34 @@ namespace ARMeilleure.Instructions
             EmitVectorBinaryWideOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U);
             EmitVectorBinaryWideOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U);
         }
         }
 
 
+        public static void Vcnt(ArmEmitterContext context) // A32 VCNT: per-element set-bit count of the source vector into the destination vector.
+        {
+            OpCode32SimdCmpZ op = (OpCode32SimdCmpZ)context.CurrOp;
+
+            Operand res = GetVecA32(op.Qd); // Start from the current destination value; elements are replaced one at a time below.
+
+            int elems = op.GetBytesCount(); // One loop iteration per element reported by the opcode.
+
+            for (int index = 0; index < elems; index++)
+            {
+                Operand de;
+                Operand me = EmitVectorExtractZx32(context, op.Qm, op.Im + index, op.Size); // Source element; helper name suggests zero-extension (confirm against helper).
+
+                if (Optimizations.UsePopCnt)
+                {
+                    de = context.AddIntrinsicInt(Intrinsic.X86Popcnt, me); // Host POPCNT intrinsic when the optimization flag is set.
+                }
+                else
+                {
+                    de = EmitCountSetBits8(context, me); // Otherwise emit the bit-trick population count as IR.
+                }
+
+                res = EmitVectorInsert(context, res, de, op.Id + index, op.Size); // Write the count into the matching destination element.
+            }
+
+            context.Copy(GetVecA32(op.Qd), res); // Commit the accumulated result to the destination register.
+        }
+
         public static void Vdup(ArmEmitterContext context)
         public static void Vdup(ArmEmitterContext context)
         {
         {
             OpCode32SimdDupGP op = (OpCode32SimdDupGP)context.CurrOp;
             OpCode32SimdDupGP op = (OpCode32SimdDupGP)context.CurrOp;

+ 12 - 0
ARMeilleure/Instructions/InstEmitSimdHelper.cs

@@ -234,6 +234,18 @@ namespace ARMeilleure.Instructions
             throw new ArgumentException($"Invalid rounding mode \"{roundMode}\".");
             throw new ArgumentException($"Invalid rounding mode \"{roundMode}\".");
         }
         }
 
 
+        public static Operand EmitCountSetBits8(ArmEmitterContext context, Operand op) // "size" is 8 (SIMD&FP Inst.).
+        {
+            Debug.Assert(op.Type == OperandType.I32 || op.Type == OperandType.I64); // Only I32/I64 operands are accepted.
+
+            Operand op0 = context.Subtract(op, context.BitwiseAnd(context.ShiftRightUI(op, Const(1)), Const(op.Type, 0x55L))); // Step 1: x - ((x >> 1) & 0x55) leaves each 2-bit field holding the count of its two bits.
+
+            Operand c1 = Const(op.Type, 0x33L);
+            Operand op1 = context.Add(context.BitwiseAnd(context.ShiftRightUI(op0, Const(2)), c1), context.BitwiseAnd(op0, c1)); // Step 2: add adjacent 2-bit counts into 4-bit fields.
+
+            return context.BitwiseAnd(context.Add(op1, context.ShiftRightUI(op1, Const(4))), Const(op.Type, 0x0fL)); // Step 3: add the two nibble counts; the 0x0f mask keeps the final total (0..8).
+        }
+
         public static void EmitScalarUnaryOpF(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
         public static void EmitScalarUnaryOpF(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
         {
         {
             OpCodeSimd op = (OpCodeSimd)context.CurrOp;
             OpCodeSimd op = (OpCodeSimd)context.CurrOp;

+ 1 - 0
ARMeilleure/Instructions/InstName.cs

@@ -567,6 +567,7 @@ namespace ARMeilleure.Instructions
         Vclt,
         Vclt,
         Vcmp,
         Vcmp,
         Vcmpe,
         Vcmpe,
+        Vcnt,
         Vcvt,
         Vcvt,
         Vdiv,
         Vdiv,
         Vdup,
         Vdup,

+ 0 - 8
ARMeilleure/Instructions/SoftFallback.cs

@@ -846,14 +846,6 @@ namespace ARMeilleure.Instructions
 
 
             return (ulong)count;
             return (ulong)count;
         }
         }
-
-        public static ulong CountSetBits8(ulong value) // "size" is 8 (SIMD&FP Inst.).
-        {
-            value = ((value >> 1) & 0x55ul) + (value & 0x55ul);
-            value = ((value >> 2) & 0x33ul) + (value & 0x33ul);
-
-            return (value >> 4) + (value & 0x0ful);
-        }
 #endregion
 #endregion
 
 
 #region "Table"
 #region "Table"

+ 0 - 1
ARMeilleure/Translation/Delegates.cs

@@ -148,7 +148,6 @@ namespace ARMeilleure.Translation
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.BinaryUnsignedSatQSub)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.BinaryUnsignedSatQSub)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingSigns)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingSigns)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingZeros)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingZeros)));
-            SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountSetBits8)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Crc32b)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Crc32b)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Crc32cb)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Crc32cb)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Crc32ch)));
             SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Crc32ch)));

+ 1 - 1
ARMeilleure/Translation/PTC/Ptc.cs

@@ -26,7 +26,7 @@ namespace ARMeilleure.Translation.PTC
     {
     {
         private const string HeaderMagicString = "PTChd\0\0\0";
         private const string HeaderMagicString = "PTChd\0\0\0";
 
 
-        private const uint InternalVersion = 1968; //! To be incremented manually for each change to the ARMeilleure project.
+        private const uint InternalVersion = 1963; //! To be incremented manually for each change to the ARMeilleure project.
 
 
         private const string ActualDir = "0";
         private const string ActualDir = "0";
         private const string BackupDir = "1";
         private const string BackupDir = "1";

+ 37 - 0
Ryujinx.Tests/Cpu/CpuTestSimd32.cs

@@ -154,6 +154,15 @@ namespace Ryujinx.Tests.Cpu
                 yield return rnd2;
                 yield return rnd2;
             }
             }
         }
         }
+
+        private static IEnumerable<ulong> _GenPopCnt8B_() // Yields 256 values: each byte value 0..255 replicated into all eight byte lanes of a ulong.
+        {
+            for (ulong cnt = 0ul; cnt <= 255ul; cnt++)
+            {
+                yield return (cnt << 56) | (cnt << 48) | (cnt << 40) | (cnt << 32) |
+                             (cnt << 24) | (cnt << 16) | (cnt << 08) | cnt;
+            }
+        }
 #endregion
 #endregion
 
 
         private const int RndCnt = 2;
         private const int RndCnt = 2;
@@ -217,6 +226,34 @@ namespace Ryujinx.Tests.Cpu
 
 
             CompareAgainstUnicorn();
             CompareAgainstUnicorn();
         }
         }
+
+        [Test, Pairwise, Description("VCNT.8 D0, D0 | VCNT.8 Q0, Q0")]
+        public void Vcnt([Values(0u, 1u)] uint rd,
+                         [Values(0u, 1u)] uint rm,
+                         [ValueSource(nameof(_GenPopCnt8B_))] [Random(RndCnt)] ulong d0,
+                         [Values] bool q)
+        {
+            ulong d1 = ~d0; // It's expensive to have a second generator.
+
+            uint opcode = 0xf3b00500u; // VCNT.8 D0, D0
+
+            if (q)
+            {
+                opcode |= 1u << 6; // Set bit 6 to select the Q (quadword) form.
+
+                rd &= ~1u; // Clear bit 0 so the register index is even for the Q form.
+                rm &= ~1u;
+            }
+
+            opcode |= ((rd & 0xf) << 12) | ((rd & 0x10) << 18); // Low 4 bits of rd go to bits 15:12; bit 4 of rd goes to bit 22.
+            opcode |= ((rm & 0xf) << 0)  | ((rm & 0x10) << 1); // Low 4 bits of rm go to bits 3:0; bit 4 of rm goes to bit 5.
+
+            V128 v0 = MakeVectorE0E1(d0, d1);
+
+            SingleOpcode(opcode, v0: v0);
+
+            CompareAgainstUnicorn(); // The emulated result is checked against the Unicorn reference.
+        }
 #endif
 #endif
     }
     }
 }
 }