|
|
@@ -10,6 +10,7 @@ using System.Diagnostics;
|
|
|
|
|
|
using static ARMeilleure.Instructions.InstEmitHelper;
|
|
|
using static ARMeilleure.Instructions.InstEmitSimdHelper;
|
|
|
+using static ARMeilleure.Instructions.InstEmitSimdHelper32;
|
|
|
using static ARMeilleure.IntermediateRepresentation.OperandHelper;
|
|
|
|
|
|
namespace ARMeilleure.Instructions
|
|
|
@@ -1928,6 +1929,112 @@ namespace ARMeilleure.Instructions
|
|
|
}
|
|
|
}
|
|
|
|
|
|
// Emits the translation of the vector PMULL instruction (polynomial, i.e. carry-less, multiply long).
//
// One of three strategies is selected at emit time:
//   1. PCLMULQDQ, when Optimizations.UsePclmulqdq is set and op.Size == 3;
//   2. an SSE4.1 shift/mask/xor sequence, when Optimizations.UseSse41 is set;
//   3. a software path built on EmitPolynomialMultiply / SoftFallback.PolynomialMult64_128.
//
// In every strategy RegisterSize.Simd64 selects the lower 64 bits of Rn and Rm as inputs,
// any other register size selects the upper 64 bits. Only op.Size 0 and 3 are distinguished;
// every value other than 0 is handled by the "Size == 3" branches.
// The result is always written to the vector register Rd through context.Copy.
public static void Pmull_V(ArmEmitterContext context)
{
    OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

    if (Optimizations.UsePclmulqdq && op.Size == 3)
    {
        Operand n = GetVec(op.Rn);
        Operand m = GetVec(op.Rm);

        // PCLMULQDQ imm8: bit 0 picks the qword of the first source, bit 4 the qword of the second.
        // 0x00 multiplies the two low qwords, 0x11 the two high qwords.
        int imm8 = op.RegisterSize == RegisterSize.Simd64 ? 0b0000_0000 : 0b0001_0001;

        Operand res = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, n, m, Const(imm8));

        context.Copy(GetVec(op.Rd), res);
    }
    else if (Optimizations.UseSse41)
    {
        Operand n = GetVec(op.Rn);
        Operand m = GetVec(op.Rm);

        // Bring the selected 64 bits of each source into the low half of the vector.
        // EmitSse2Sll_128 below relies on the upper half of m being zero.
        if (op.RegisterSize == RegisterSize.Simd64)
        {
            n = context.VectorZeroUpper64(n);
            m = context.VectorZeroUpper64(m);
        }
        else /* if (op.RegisterSize == RegisterSize.Simd128) */
        {
            // Byte shift right by 8: the upper qword moves down, zeros are shifted in.
            n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8));
            m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
        }

        // XOR accumulator for the partial products.
        Operand res = context.VectorZero();

        if (op.Size == 0)
        {
            // Widen the eight byte elements to eight 16-bit lanes.
            n = context.AddIntrinsic(Intrinsic.X86Pmovzxbw, n);
            m = context.AddIntrinsic(Intrinsic.X86Pmovzxbw, m);

            for (int i = 0; i < 8; i++)
            {
                // Move bit i of each n lane to bit 15, then arithmetic-shift it across the whole lane:
                // the lane becomes all ones when bit i is set, all zeros otherwise.
                Operand mask = context.AddIntrinsic(Intrinsic.X86Psllw, n, Const(15 - i));
                mask = context.AddIntrinsic(Intrinsic.X86Psraw, mask, Const(15));

                // Partial product for bit i: (m << i), kept only where the mask is set.
                Operand tmp = context.AddIntrinsic(Intrinsic.X86Psllw, m, Const(i));
                tmp = context.AddIntrinsic(Intrinsic.X86Pand, tmp, mask);

                res = context.AddIntrinsic(Intrinsic.X86Pxor, res, tmp);
            }
        }
        else /* if (op.Size == 3) */
        {
            Operand zero = context.VectorZero();

            // Loop-invariant: copy the low qword of n into both qwords once, instead of
            // re-emitting the same MOVLHPS on each of the 64 iterations.
            Operand nBoth = context.AddIntrinsic(Intrinsic.X86Movlhps, n, n);

            for (int i = 0; i < 64; i++)
            {
                // Isolate bit i of n in bit 0 of each qword (0 or 1), then negate it (0 - x)
                // to obtain an all-zeros / all-ones 128-bit mask.
                Operand mask = context.AddIntrinsic(Intrinsic.X86Psllq, nBoth, Const(63 - i));
                mask = context.AddIntrinsic(Intrinsic.X86Psrlq, mask, Const(63));
                mask = context.AddIntrinsic(Intrinsic.X86Psubq, zero, mask);

                // Partial product for bit i: the 128-bit value (m << i), kept only where the mask is set.
                Operand tmp = EmitSse2Sll_128(context, m, i);
                tmp = context.AddIntrinsic(Intrinsic.X86Pand, tmp, mask);

                res = context.AddIntrinsic(Intrinsic.X86Pxor, res, tmp);
            }
        }

        context.Copy(GetVec(op.Rd), res);
    }
    else
    {
        Operand n = GetVec(op.Rn);
        Operand m = GetVec(op.Rm);

        Operand res;

        if (op.Size == 0)
        {
            res = context.VectorZero();

            // First source byte index: 0 for the lower half, 8 for the upper half.
            int part = op.RegisterSize == RegisterSize.Simd64 ? 0 : 8;

            for (int index = 0; index < 8; index++)
            {
                Operand ne = context.VectorExtract8(n, part + index);
                Operand me = context.VectorExtract8(m, part + index);

                Operand de = EmitPolynomialMultiply(context, ne, me, 8);

                // The last argument (1) is presumably the 16-bit element size of the widened result -- TODO confirm.
                res = EmitVectorInsert(context, res, de, index, 1);
            }
        }
        else /* if (op.Size == 3) */
        {
            // Source qword index: 0 for the lower half, 1 for the upper half.
            int part = op.RegisterSize == RegisterSize.Simd64 ? 0 : 1;

            Operand ne = context.VectorExtract(OperandType.I64, n, part);
            Operand me = context.VectorExtract(OperandType.I64, m, part);

            // No inline sequence here: the multiplication is delegated to a managed helper call.
            res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.PolynomialMult64_128)), ne, me);
        }

        context.Copy(GetVec(op.Rd), res);
    }
}
|
|
|
+
|
|
|
public static void Raddhn_V(ArmEmitterContext context)
|
|
|
{
|
|
|
EmitHighNarrow(context, (op1, op2) => context.Add(op1, op2), round: true);
|
|
|
@@ -3690,5 +3797,23 @@ namespace ARMeilleure.Instructions
|
|
|
|
|
|
context.Copy(GetVec(op.Rd), res);
|
|
|
}
|
|
|
+
|
|
|
// Emits a logical left shift of a 128-bit value by a bit count in the range [0, 63].
// Precondition (not checked): the upper 64 bits of op are zero, so the whole value lives in the low qword.
// Returns op itself, without emitting anything, when shift is 0.
private static Operand EmitSse2Sll_128(ArmEmitterContext context, Operand op, int shift)
{
    Debug.Assert(0 <= shift && shift <= 63);

    if (shift != 0)
    {
        // Bits that cross into the upper qword: move the low qword up by 8 bytes,
        // then keep only its top `shift` bits, right-aligned.
        Operand carried = context.AddIntrinsic(Intrinsic.X86Pslldq, op, Const(8));
        carried = context.AddIntrinsic(Intrinsic.X86Psrlq, carried, Const(64 - shift));

        // Bits that stay in the lower qword.
        Operand shifted = context.AddIntrinsic(Intrinsic.X86Psllq, op, Const(shift));

        return context.AddIntrinsic(Intrinsic.X86Por, carried, shifted);
    }

    return op;
}
|
|
|
}
|
|
|
}
|