From 8a33e884f8f482e93e2b90380b158c1417cc50f8 Mon Sep 17 00:00:00 2001 From: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com> Date: Thu, 17 Dec 2020 20:43:41 +0100 Subject: [PATCH] Fix Vnmls_S fast path (F64: losing input d value). Fix Vnmla_S & Vnmls_S slow paths (using fused inst.s). Fix Vfma_V slow path not using StandardFPSCRValue(). (#1775) * Fix Vnmls_S fast path (F64: losing input d value). Fix Vnmla_S & Vnmls_S slow paths (using fused inst.s). Add Vfma_S & Vfms_S Fma fast paths. Add Vfnma_S inst. with Fma/Sse fast paths and slow path. Add Vfnms_S Sse fast path. Add Tests for affected inst.s. Nits. * InternalVersion = 1775 * Nits. * Fix Vfma_V slow path not using StandardFPSCRValue(). * Nit: Fix Vfma_V order. * Add Vfms_V Sse fast path and slow path. * Add Vfma_V and Vfms_V Test. --- ARMeilleure/CodeGen/X86/Assembler.cs | 14 +- ARMeilleure/CodeGen/X86/CodeGenerator.cs | 14 +- ARMeilleure/CodeGen/X86/IntrinsicTable.cs | 8 +- ARMeilleure/CodeGen/X86/X86Instruction.cs | 12 +- ARMeilleure/Decoders/OpCodeTable.cs | 1 + ARMeilleure/Instructions/InstEmitAlu32.cs | 2 +- ARMeilleure/Instructions/InstEmitMul32.cs | 2 +- .../Instructions/InstEmitSimdArithmetic32.cs | 114 ++++---- .../Instructions/InstEmitSimdHelper32.cs | 26 +- .../IntermediateRepresentation/Intrinsic.cs | 8 +- ARMeilleure/Translation/PTC/Ptc.cs | 2 +- Ryujinx.Tests/Cpu/CpuTestSimdCvt32.cs | 59 ++-- Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs | 251 +++++++++++------- 13 files changed, 292 insertions(+), 221 deletions(-) diff --git a/ARMeilleure/CodeGen/X86/Assembler.cs b/ARMeilleure/CodeGen/X86/Assembler.cs index 7f19c3c4..2484e251 100644 --- a/ARMeilleure/CodeGen/X86/Assembler.cs +++ b/ARMeilleure/CodeGen/X86/Assembler.cs @@ -274,17 +274,15 @@ namespace ARMeilleure.CodeGen.X86 Add(X86Instruction.Vcvtph2ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3813, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Vcvtps2ph, new InstructionInfo(0x000f3a1d, BadOp, BadOp, BadOp, BadOp, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Vfmadd231ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b8, InstructionFlags.Vex | InstructionFlags.Prefix66)); - Add(X86Instruction.Vfmadd231pd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b8, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW)); - Add(X86Instruction.Vfmadd231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b9, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Vfmadd231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b9, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW)); - Add(X86Instruction.Vfmsub231ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38ba, InstructionFlags.Vex | InstructionFlags.Prefix66)); - Add(X86Instruction.Vfmsub231pd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38ba, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW)); - Add(X86Instruction.Vfmsub231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66)); + Add(X86Instruction.Vfmadd231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b9, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Vfmsub231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW)); - Add(X86Instruction.Vfnmsub231ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38be, InstructionFlags.Vex | InstructionFlags.Prefix66)); - Add(X86Instruction.Vfnmsub231pd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38be, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW)); - Add(X86Instruction.Vfnmsub231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bf, InstructionFlags.Vex | InstructionFlags.Prefix66)); + Add(X86Instruction.Vfmsub231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66)); + Add(X86Instruction.Vfnmadd231ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bc, InstructionFlags.Vex | InstructionFlags.Prefix66)); + Add(X86Instruction.Vfnmadd231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bd, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW)); + Add(X86Instruction.Vfnmadd231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bd, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Vfnmsub231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bf, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW)); + Add(X86Instruction.Vfnmsub231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bf, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Vpblendvb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4c, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Xor, new InstructionInfo(0x00000031, 0x06000083, 0x06000081, BadOp, 0x00000033, InstructionFlags.None)); Add(X86Instruction.Xorpd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f57, InstructionFlags.Vex | InstructionFlags.Prefix66)); diff --git a/ARMeilleure/CodeGen/X86/CodeGenerator.cs b/ARMeilleure/CodeGen/X86/CodeGenerator.cs index 29a4cd78..5f41ff79 100644 --- a/ARMeilleure/CodeGen/X86/CodeGenerator.cs +++ b/ARMeilleure/CodeGen/X86/CodeGenerator.cs @@ -440,9 +440,12 @@ namespace ARMeilleure.CodeGen.X86 else { EnsureSameReg(dest, src1); + Debug.Assert(src3.GetRegister().Index == 0); + context.Assembler.WriteInstruction(info.Inst, dest, src1, src2); } + break; } @@ -474,11 +477,16 @@ namespace ARMeilleure.CodeGen.X86 Operand src2 = operation.GetSource(1); Operand src3 = operation.GetSource(2); - EnsureSameType(dest, src1, src2, src3); - EnsureSameReg(dest, src1); - Debug.Assert(!dest.Type.IsInteger()); Debug.Assert(HardwareCapabilities.SupportsVexEncoding); + Debug.Assert(dest.Kind == OperandKind.Register && src1.Kind == OperandKind.Register && src2.Kind == OperandKind.Register); + Debug.Assert(src3.Kind == OperandKind.Register || src3.Kind == OperandKind.Memory); + + EnsureSameType(dest, src1, src2, src3); + Debug.Assert(dest.Type == OperandType.V128); + + Debug.Assert(dest.Value == src1.Value); + context.Assembler.WriteInstruction(info.Inst, dest, src2, src3); break; diff --git a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs index 195ce91d..9030be3c 100644 --- a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs +++ b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs @@ -166,16 +166,14 @@ namespace ARMeilleure.CodeGen.X86 Add(Intrinsic.X86Unpcklps, new IntrinsicInfo(X86Instruction.Unpcklps, IntrinsicType.Binary)); Add(Intrinsic.X86Vcvtph2ps, new IntrinsicInfo(X86Instruction.Vcvtph2ps, IntrinsicType.Unary)); Add(Intrinsic.X86Vcvtps2ph, new IntrinsicInfo(X86Instruction.Vcvtps2ph, IntrinsicType.BinaryImm)); - Add(Intrinsic.X86Vfmadd231pd, new IntrinsicInfo(X86Instruction.Vfmadd231pd, IntrinsicType.Fma)); Add(Intrinsic.X86Vfmadd231ps, new IntrinsicInfo(X86Instruction.Vfmadd231ps, IntrinsicType.Fma)); Add(Intrinsic.X86Vfmadd231sd, new IntrinsicInfo(X86Instruction.Vfmadd231sd, IntrinsicType.Fma)); Add(Intrinsic.X86Vfmadd231ss, new IntrinsicInfo(X86Instruction.Vfmadd231ss, IntrinsicType.Fma)); - Add(Intrinsic.X86Vfmsub231pd, new IntrinsicInfo(X86Instruction.Vfmsub231pd, IntrinsicType.Fma)); - Add(Intrinsic.X86Vfmsub231ps, new IntrinsicInfo(X86Instruction.Vfmsub231ps, IntrinsicType.Fma)); Add(Intrinsic.X86Vfmsub231sd, new IntrinsicInfo(X86Instruction.Vfmsub231sd, IntrinsicType.Fma)); Add(Intrinsic.X86Vfmsub231ss, new IntrinsicInfo(X86Instruction.Vfmsub231ss, IntrinsicType.Fma)); - Add(Intrinsic.X86Vfnmsub231pd, new IntrinsicInfo(X86Instruction.Vfnmsub231pd, IntrinsicType.Fma)); - Add(Intrinsic.X86Vfnmsub231ps, new IntrinsicInfo(X86Instruction.Vfnmsub231ps, IntrinsicType.Fma)); + Add(Intrinsic.X86Vfnmadd231ps, new IntrinsicInfo(X86Instruction.Vfnmadd231ps, IntrinsicType.Fma)); + Add(Intrinsic.X86Vfnmadd231sd, new IntrinsicInfo(X86Instruction.Vfnmadd231sd, IntrinsicType.Fma)); + Add(Intrinsic.X86Vfnmadd231ss, new IntrinsicInfo(X86Instruction.Vfnmadd231ss, IntrinsicType.Fma)); Add(Intrinsic.X86Vfnmsub231sd, new IntrinsicInfo(X86Instruction.Vfnmsub231sd, IntrinsicType.Fma)); Add(Intrinsic.X86Vfnmsub231ss, new IntrinsicInfo(X86Instruction.Vfnmsub231ss, IntrinsicType.Fma)); Add(Intrinsic.X86Xorpd, new IntrinsicInfo(X86Instruction.Xorpd, IntrinsicType.Binary)); diff --git a/ARMeilleure/CodeGen/X86/X86Instruction.cs b/ARMeilleure/CodeGen/X86/X86Instruction.cs index 7ed4841c..ed5b50c5 100644 --- a/ARMeilleure/CodeGen/X86/X86Instruction.cs +++ b/ARMeilleure/CodeGen/X86/X86Instruction.cs @@ -203,18 +203,16 @@ namespace ARMeilleure.CodeGen.X86 Vblendvps, Vcvtph2ps, Vcvtps2ph, - Vfmadd231pd, Vfmadd231ps, Vfmadd231sd, Vfmadd231ss, - Vfmsub231ps, - Vfmsub231pd, - Vfmsub231ss, Vfmsub231sd, - Vfnmsub231ps, - Vfnmsub231pd, - Vfnmsub231ss, + Vfmsub231ss, + Vfnmadd231ps, + Vfnmadd231sd, + Vfnmadd231ss, Vfnmsub231sd, + Vfnmsub231ss, Vpblendvb, Xor, Xorpd, diff --git a/ARMeilleure/Decoders/OpCodeTable.cs b/ARMeilleure/Decoders/OpCodeTable.cs index 88c68644..665e7129 100644 --- a/ARMeilleure/Decoders/OpCodeTable.cs +++ b/ARMeilleure/Decoders/OpCodeTable.cs @@ -822,6 +822,7 @@ namespace ARMeilleure.Decoders SetA32("<<<<11101x10xxxxxxxx101xx0x0xxxx", InstName.Vfma, InstEmit32.Vfma_S, OpCode32SimdRegS.Create); SetA32("111100100x00xxxxxxxx1100xxx1xxxx", InstName.Vfma, InstEmit32.Vfma_V, OpCode32SimdReg.Create); SetA32("<<<<11101x10xxxxxxxx101xx1x0xxxx", InstName.Vfms, InstEmit32.Vfms_S, OpCode32SimdRegS.Create); + SetA32("111100100x10xxxxxxxx1100xxx1xxxx", InstName.Vfms, InstEmit32.Vfms_V, OpCode32SimdReg.Create); SetA32("<<<<11101x01xxxxxxxx101xx1x0xxxx", InstName.Vfnma, InstEmit32.Vfnma_S, OpCode32SimdRegS.Create); SetA32("<<<<11101x01xxxxxxxx101xx0x0xxxx", InstName.Vfnms, InstEmit32.Vfnms_S, OpCode32SimdRegS.Create); SetA32("1111001x0x< - { - return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), op1, op2, op3); - }); - } - } - - public static void Vfma_S(ArmEmitterContext context) // Fused. - { - if (Optimizations.FastFP && Optimizations.UseSse2) - { - // TODO: Use FMA instruction set. EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd); } else @@ -285,11 +271,29 @@ namespace ARMeilleure.Instructions } } + public static void Vfma_V(ArmEmitterContext context) // Fused. + { + if (Optimizations.FastFP && Optimizations.UseFma) + { + EmitVectorTernaryOpF32(context, Intrinsic.X86Vfmadd231ps); + } + else + { + EmitVectorTernaryOpF32(context, (op1, op2, op3) => + { + return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulAddFpscr), op1, op2, op3); + }); + } + } + public static void Vfms_S(ArmEmitterContext context) // Fused. { - if (Optimizations.FastFP && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseFma) + { + EmitScalarTernaryOpF32(context, Intrinsic.X86Vfnmadd231ss, Intrinsic.X86Vfnmadd231sd); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) { - // TODO: Use FMA instruction set. EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd); } else @@ -301,17 +305,36 @@ namespace ARMeilleure.Instructions } } + public static void Vfms_V(ArmEmitterContext context) // Fused. + { + if (Optimizations.FastFP && Optimizations.UseFma) + { + EmitVectorTernaryOpF32(context, Intrinsic.X86Vfnmadd231ps); + } + else + { + EmitVectorTernaryOpF32(context, (op1, op2, op3) => + { + return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulSubFpscr), op1, op2, op3); + }); + } + } + public static void Vfnma_S(ArmEmitterContext context) // Fused. { if (Optimizations.FastFP && Optimizations.UseFma) { EmitScalarTernaryOpF32(context, Intrinsic.X86Vfnmsub231ss, Intrinsic.X86Vfnmsub231sd); } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd, isNegD: true); + } else { EmitScalarTernaryOpF32(context, (op1, op2, op3) => { - return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), context.Negate(op1), context.Negate(op2), op3); + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulAdd), op1, op2, op3); }); } } @@ -322,11 +345,15 @@ namespace ARMeilleure.Instructions { EmitScalarTernaryOpF32(context, Intrinsic.X86Vfmsub231ss, Intrinsic.X86Vfmsub231sd); } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd, isNegD: true); + } else { EmitScalarTernaryOpF32(context, (op1, op2, op3) => { - return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), context.Negate(op1), op2, op3); + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulSub), op1, op2, op3); }); } } @@ -422,36 +449,21 @@ namespace ARMeilleure.Instructions if (Optimizations.FastFP && Optimizations.UseSse2) { - EmitScalarTernaryOpSimd32(context, (d, n, m) => - { - if ((op.Size & 1) == 0) - { - Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); - res = context.AddIntrinsic(Intrinsic.X86Addss, d, res); - Operand mask = X86GetScalar(context, -0f); - return context.AddIntrinsic(Intrinsic.X86Xorps, mask, res); - } - else - { - Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); - res = context.AddIntrinsic(Intrinsic.X86Addsd, d, res); - Operand mask = X86GetScalar(context, -0d); - return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, res); - } - }); + EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd, isNegD: true); } else if (Optimizations.FastFP) { EmitScalarTernaryOpF32(context, (op1, op2, op3) => { - return context.Negate(context.Add(op1, context.Multiply(op2, op3))); + return context.Subtract(context.Negate(op1), context.Multiply(op2, op3)); }); } else { EmitScalarTernaryOpF32(context, (op1, op2, op3) => { - return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulAdd), op1, op2, op3); + Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op2, op3); + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPSub), context.Negate(op1), res); }); } } @@ -462,24 +474,7 @@ namespace ARMeilleure.Instructions if (Optimizations.FastFP && Optimizations.UseSse2) { - EmitScalarTernaryOpSimd32(context, (d, n, m) => - { - if ((op.Size & 1) == 0) - { - Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); - Operand mask = X86GetScalar(context, -0f); - d = context.AddIntrinsic(Intrinsic.X86Xorps, mask, d); - return context.AddIntrinsic(Intrinsic.X86Addss, d, res); - - } - else - { - Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); - Operand mask = X86GetScalar(context, -0d); - d = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, res); - return context.AddIntrinsic(Intrinsic.X86Addsd, d, res); - } - }); + EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd, isNegD: true); } else if (Optimizations.FastFP) { @@ -492,7 +487,8 @@ namespace ARMeilleure.Instructions { EmitScalarTernaryOpF32(context, (op1, op2, op3) => { - return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulSub), op1, op2, op3); + Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op2, op3); + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), context.Negate(op1), res); }); } } diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs index 2d5d4ba9..39195057 100644 --- a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs +++ b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs @@ -820,15 +820,15 @@ namespace ARMeilleure.Instructions }); } - public static void EmitVectorTernaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64) + public static void EmitVectorTernaryOpF32(ArmEmitterContext context, Intrinsic inst32) { OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; - Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32; + Debug.Assert((op.Size & 1) == 0); EmitVectorTernaryOpSimd32(context, (d, n, m) => { - return context.AddIntrinsic(inst, d, n, m); + return context.AddIntrinsic(inst32, d, n, m); }); } @@ -927,7 +927,13 @@ namespace ARMeilleure.Instructions }); } - public static void EmitScalarTernaryOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2) + public static void EmitScalarTernaryOpF32( + ArmEmitterContext context, + Intrinsic inst32pt1, + Intrinsic inst64pt1, + Intrinsic inst32pt2, + Intrinsic inst64pt2, + bool isNegD = false) { OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; @@ -939,6 +945,18 @@ namespace ARMeilleure.Instructions EmitScalarTernaryOpSimd32(context, (d, n, m) => { Operand res = context.AddIntrinsic(inst1, n, m); + + if (isNegD) + { + Operand mask = doubleSize + ? X86GetScalar(context, -0d) + : X86GetScalar(context, -0f); + + d = doubleSize + ? context.AddIntrinsic(Intrinsic.X86Xorpd, mask, d) + : context.AddIntrinsic(Intrinsic.X86Xorps, mask, d); + } + return context.AddIntrinsic(inst2, d, res); }); } diff --git a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs index 515f1143..e2989863 100644 --- a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs +++ b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs @@ -155,16 +155,14 @@ namespace ARMeilleure.IntermediateRepresentation X86Unpcklps, X86Vcvtph2ps, X86Vcvtps2ph, - X86Vfmadd231pd, X86Vfmadd231ps, X86Vfmadd231sd, X86Vfmadd231ss, - X86Vfmsub231pd, - X86Vfmsub231ps, X86Vfmsub231sd, X86Vfmsub231ss, - X86Vfnmsub231pd, - X86Vfnmsub231ps, + X86Vfnmadd231ps, + X86Vfnmadd231sd, + X86Vfnmadd231ss, X86Vfnmsub231sd, X86Vfnmsub231ss, X86Xorpd, diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs index 3150c97c..b5a92b97 100644 --- a/ARMeilleure/Translation/PTC/Ptc.cs +++ b/ARMeilleure/Translation/PTC/Ptc.cs @@ -22,7 +22,7 @@ namespace ARMeilleure.Translation.PTC { private const string HeaderMagic = "PTChd"; - private const uint InternalVersion = 1713; //! To be incremented manually for each change to the ARMeilleure project. + private const int InternalVersion = 1775; //! To be incremented manually for each change to the ARMeilleure project. private const string ActualDir = "0"; private const string BackupDir = "1"; diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdCvt32.cs b/Ryujinx.Tests/Cpu/CpuTestSimdCvt32.cs index 565d231a..395f2464 100644 --- a/Ryujinx.Tests/Cpu/CpuTestSimdCvt32.cs +++ b/Ryujinx.Tests/Cpu/CpuTestSimdCvt32.cs @@ -22,41 +22,45 @@ namespace Ryujinx.Tests.Cpu 0x80000000u, 0xFFFFFFFFu }; } - private static IEnumerable _1S_F_() + private static IEnumerable _1S_F_() { - yield return 0xFF7FFFFFu; // -Max Normal (float.MinValue) - yield return 0x80800000u; // -Min Normal - yield return 0x807FFFFFu; // -Max Subnormal - yield return 0x80000001u; // -Min Subnormal (-float.Epsilon) - yield return 0x7F7FFFFFu; // +Max Normal (float.MaxValue) - yield return 0x00800000u; // +Min Normal - yield return 0x007FFFFFu; // +Max Subnormal - yield return 0x00000001u; // +Min Subnormal (float.Epsilon) + yield return 0x00000000FF7FFFFFul; // -Max Normal (float.MinValue) + yield return 0x0000000080800000ul; // -Min Normal + yield return 0x00000000807FFFFFul; // -Max Subnormal + yield return 0x0000000080000001ul; // -Min Subnormal (-float.Epsilon) + yield return 0x000000007F7FFFFFul; // +Max Normal (float.MaxValue) + yield return 0x0000000000800000ul; // +Min Normal + yield return 0x00000000007FFFFFul; // +Max Subnormal + yield return 0x0000000000000001ul; // +Min Subnormal (float.Epsilon) if (!NoZeros) { - yield return 0x80000000u; // -Zero - yield return 0x00000000u; // +Zero + yield return 0x0000000080000000ul; // -Zero + yield return 0x0000000000000000ul; // +Zero } if (!NoInfs) { - yield return 0xFF800000u; // -Infinity - yield return 0x7F800000u; // +Infinity + yield return 0x00000000FF800000ul; // -Infinity + yield return 0x000000007F800000ul; // +Infinity } if (!NoNaNs) { - yield return 0xFFC00000u; // -QNaN (all zeros payload) (float.NaN) - yield return 0xFFBFFFFFu; // -SNaN (all ones payload) - yield return 0x7FC00000u; // +QNaN (all zeros payload) (-float.NaN) (DefaultNaN) - yield return 0x7FBFFFFFu; // +SNaN (all ones payload) + yield return 0x00000000FFC00000ul; // -QNaN (all zeros payload) (float.NaN) + yield return 0x00000000FFBFFFFFul; // -SNaN (all ones payload) + yield return 0x000000007FC00000ul; // +QNaN (all zeros payload) (-float.NaN) (DefaultNaN) + yield return 0x000000007FBFFFFFul; // +SNaN (all ones payload) } for (int cnt = 1; cnt <= RndCnt; cnt++) { - yield return GenNormalS(); - yield return GenSubnormalS(); + ulong grbg = TestContext.CurrentContext.Random.NextUInt(); + ulong rnd1 = GenNormalS(); + ulong rnd2 = GenSubnormalS(); + + yield return (grbg << 32) | rnd1; + yield return (grbg << 32) | rnd2; } } @@ -93,8 +97,11 @@ namespace Ryujinx.Tests.Cpu for (int cnt = 1; cnt <= RndCnt; cnt++) { - yield return GenNormalD(); - yield return GenSubnormalD(); + ulong rnd1 = GenNormalD(); + ulong rnd2 = GenSubnormalD(); + + yield return rnd1; + yield return rnd2; } } #endregion @@ -109,10 +116,10 @@ namespace Ryujinx.Tests.Cpu [Test, Pairwise, Description("VCVT.
.F32 , ")] public void Vcvt_F32_I32([Values(0u, 1u, 2u, 3u)] uint rd, [Values(0u, 1u, 2u, 3u)] uint rm, - [ValueSource(nameof(_1S_F_))] uint s0, - [ValueSource(nameof(_1S_F_))] uint s1, - [ValueSource(nameof(_1S_F_))] uint s2, - [ValueSource(nameof(_1S_F_))] uint s3, + [ValueSource(nameof(_1S_F_))] ulong s0, + [ValueSource(nameof(_1S_F_))] ulong s1, + [ValueSource(nameof(_1S_F_))] ulong s2, + [ValueSource(nameof(_1S_F_))] ulong s3, [Values] bool unsigned) // { uint opcode = 0xeebc0ac0u; // VCVT.U32.F32 S0, S0 @@ -125,7 +132,7 @@ namespace Ryujinx.Tests.Cpu opcode |= ((rd & 0x1e) << 11) | ((rd & 0x1) << 22); opcode |= ((rm & 0x1e) >> 1) | ((rm & 0x1) << 5); - V128 v0 = MakeVectorE0E1E2E3(s0, s1, s2, s3); + V128 v0 = MakeVectorE0E1E2E3((uint)s0, (uint)s1, (uint)s2, (uint)s3); SingleOpcode(opcode, v0: v0); diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs b/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs index 4298bd1f..e8298521 100644 --- a/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs +++ b/Ryujinx.Tests/Cpu/CpuTestSimdReg32.cs @@ -22,6 +22,59 @@ namespace Ryujinx.Tests.Cpu }; } + private static uint[] _Vfma_Vfms_Vfnma_Vfnms_S_F32_() + { + return new uint[] + { + 0xEEA00A00u, // VFMA. F32 S0, S0, S0 + 0xEEA00A40u, // VFMS. F32 S0, S0, S0 + 0xEE900A40u, // VFNMA.F32 S0, S0, S0 + 0xEE900A00u // VFNMS.F32 S0, S0, S0 + }; + } + + private static uint[] _Vfma_Vfms_Vfnma_Vfnms_S_F64_() + { + return new uint[] + { + 0xEEA00B00u, // VFMA. F64 D0, D0, D0 + 0xEEA00B40u, // VFMS. F64 D0, D0, D0 + 0xEE900B40u, // VFNMA.F64 D0, D0, D0 + 0xEE900B00u // VFNMS.F64 D0, D0, D0 + }; + } + + private static uint[] _Vfma_Vfms_V_F32_() + { + return new uint[] + { + 0xF2000C10u, // VFMA.F32 D0, D0, D0 + 0xF2200C10u // VFMS.F32 D0, D0, D0 + }; + } + + private static uint[] _Vmla_Vmls_Vnmla_Vnmls_S_F32_() + { + return new uint[] + { + 0xEE000A00u, // VMLA. F32 S0, S0, S0 + 0xEE000A40u, // VMLS. F32 S0, S0, S0 + 0xEE100A40u, // VNMLA.F32 S0, S0, S0 + 0xEE100A00u // VNMLS.F32 S0, S0, S0 + }; + } + + private static uint[] _Vmla_Vmls_Vnmla_Vnmls_S_F64_() + { + return new uint[] + { + 0xEE000B00u, // VMLA. F64 D0, D0, D0 + 0xEE000B40u, // VMLS. F64 D0, D0, D0 + 0xEE100B40u, // VNMLA.F64 D0, D0, D0 + 0xEE100B00u // VNMLS.F64 D0, D0, D0 + }; + } + private static uint[] _Vp_Add_Max_Min_F_() { return new uint[] @@ -184,8 +237,8 @@ namespace Ryujinx.Tests.Cpu private const int RndCnt = 2; private static readonly bool NoZeros = false; - private static readonly bool NoInfs = true; - private static readonly bool NoNaNs = true; + private static readonly bool NoInfs = false; + private static readonly bool NoNaNs = false; [Explicit] [Test, Pairwise, Description("VADD.f32 V0, V0, V0")] @@ -293,119 +346,115 @@ namespace Ryujinx.Tests.Cpu CompareAgainstUnicorn(fpsrMask: Fpsr.Nzcv); } - [Test, Pairwise, Description("VFMA.F , , ")] - public void Vfma([Values(0u, 1u)] uint rd, - [Values(0u, 1u)] uint rn, - [Values(0u, 1u)] uint rm, - [Values(0u, 1u)] uint Q, - [ValueSource("_2S_F_")] ulong z, - [ValueSource("_2S_F_")] ulong a, - [ValueSource("_2S_F_")] ulong b ) + [Test, Pairwise] [Explicit] // Fused. + public void Vfma_Vfms_Vfnma_Vfnms_S_F32([ValueSource(nameof(_Vfma_Vfms_Vfnma_Vfnms_S_F32_))] uint opcode, + [Values(0u, 1u, 2u, 3u)] uint rd, + [Values(0u, 1u, 2u, 3u)] uint rn, + [Values(0u, 1u, 2u, 3u)] uint rm, + [ValueSource(nameof(_1S_F_))] ulong s0, + [ValueSource(nameof(_1S_F_))] ulong s1, + [ValueSource(nameof(_1S_F_))] ulong s2, + [ValueSource(nameof(_1S_F_))] ulong s3) { - uint opcode = 0xf2000c10; - - V128 v0; - V128 v1; - V128 v2; + opcode |= (((rd & 0x1) << 22) | (rd & 0x1e) << 11); + opcode |= (((rn & 0x1) << 7) | (rn & 0x1e) << 15); + opcode |= (((rm & 0x1) << 5) | (rm & 0x1e) >> 1); - uint c = (uint) BitConverter.SingleToInt32Bits(z); - uint d = (uint) BitConverter.SingleToInt32Bits(a); - uint e = (uint) BitConverter.SingleToInt32Bits(b); - if (Q == 0) - { - opcode |= (((rm & 0x1) << 5) | (rm & 0x1e) >> 1); - opcode |= (((rd & 0x1) << 22) | (rd & 0x1e) << 11); - opcode |= (((rn & 0x1) << 7) | (rn & 0x1e) >> 15); + V128 v0 = MakeVectorE0E1E2E3((uint)s0, (uint)s1, (uint)s2, (uint)s3); - v0 = MakeVectorE0E1(c, c); - v1 = MakeVectorE0E1(d, c); - v2 = MakeVectorE0E1(e, c); - } - else - { - rd = rn = rm = 0; // Needed, as these values cannot be odd values if Q == 1. - opcode |= (((rm & 0x10) << 1) | (rm & 0xf) << 0); - opcode |= (((rd & 0x10) << 18) | (rd & 0xf) << 12); - opcode |= (((rn & 0x10) << 3) | (rn & 0xf) << 16); + SingleOpcode(opcode, v0: v0); - v0 = MakeVectorE0E1E2E3(c, c, d, e); - v1 = MakeVectorE0E1E2E3(d, c, e, c); - v2 = MakeVectorE0E1E2E3(e, c, d, c); - } - - opcode |= ((Q & 1) << 6); - - SingleOpcode(opcode, v0: v0, v1: v1, v2: v2); - CompareAgainstUnicorn(); - } - - [Test, Pairwise, Description("VFNMA.F , , ")] - public void Vfnma([Values(0u, 1u)] uint rd, - [Values(0u, 1u)] uint rn, - [Values(0u, 1u)] uint rm, - [Values(2u, 3u)] uint size, - [ValueSource("_2S_F_")] ulong z, - [ValueSource("_2S_F_")] ulong a, - [ValueSource("_2S_F_")] ulong b) - { - uint opcode = 0xe900840; - - if (size == 2) - { - opcode |= (((rm & 0x1) << 5) | (rm & 0x1e) >> 1); - opcode |= (((rd & 0x1) << 22) | (rd & 0x1e) << 11); - opcode |= (((rn & 0x1) << 7) | (rn & 0x1e) >> 15); - - } - else - { - opcode |= (((rm & 0x10) << 1) | (rm & 0xf) << 0); - opcode |= (((rd & 0x10) << 18) | (rd & 0xf) << 12); - opcode |= (((rn & 0x10) << 3) | (rn & 0xf) << 16); - } - - opcode |= ((size & 3) << 8); - - V128 v0 = MakeVectorE0E1(z, z); - V128 v1 = MakeVectorE0E1(a, z); - V128 v2 = MakeVectorE0E1(b, z); - - SingleOpcode(opcode, v0: v0, v1: v1, v2: v2); CompareAgainstUnicorn(); } - [Test, Pairwise, Description("VFNMS.F , , ")] - public void Vfnms([Values(0u, 1u)] uint rd, - [Values(0u, 1u)] uint rn, - [Values(0u, 1u)] uint rm, - [Values(2u, 3u)] uint size, - [ValueSource("_2S_F_")] ulong z, - [ValueSource("_2S_F_")] ulong a, - [ValueSource("_2S_F_")] ulong b) + [Test, Pairwise] [Explicit] // Fused. + public void Vfma_Vfms_Vfnma_Vfnms_S_F64([ValueSource(nameof(_Vfma_Vfms_Vfnma_Vfnms_S_F64_))] uint opcode, + [Values(0u, 1u)] uint rd, + [Values(0u, 1u)] uint rn, + [Values(0u, 1u)] uint rm, + [ValueSource(nameof(_1D_F_))] ulong d0, + [ValueSource(nameof(_1D_F_))] ulong d1) { - uint opcode = 0xee900a00; + opcode |= (((rd & 0x10) << 18) | (rd & 0xf) << 12); + opcode |= (((rn & 0x10) << 3) | (rn & 0xf) << 16); + opcode |= (((rm & 0x10) << 1) | (rm & 0xf) << 0); - if (size == 2) + V128 v0 = MakeVectorE0E1(d0, d1); + + SingleOpcode(opcode, v0: v0); + + CompareAgainstUnicorn(); + } + + [Test, Pairwise] [Explicit] // Fused. + public void Vfma_Vfms_V_F32([ValueSource(nameof(_Vfma_Vfms_V_F32_))] uint opcode, + [Values(0u, 1u, 2u, 3u)] uint rd, + [Values(0u, 1u, 2u, 3u)] uint rn, + [Values(0u, 1u, 2u, 3u)] uint rm, + [ValueSource(nameof(_2S_F_))] ulong d0, + [ValueSource(nameof(_2S_F_))] ulong d1, + [ValueSource(nameof(_2S_F_))] ulong d2, + [ValueSource(nameof(_2S_F_))] ulong d3, + [Values] bool q) + { + if (q) { - opcode |= (((rm & 0x1) << 5) | (rm & 0x1e) >> 1); - opcode |= (((rd & 0x1) << 22) | (rd & 0x1e) << 11); - opcode |= (((rn & 0x1) << 7) | (rn & 0x1e) >> 15); - - } - else - { - opcode |= (((rm & 0x10) << 1) | (rm & 0xf) << 0); - opcode |= (((rd & 0x10) << 18) | (rd & 0xf) << 12); - opcode |= (((rn & 0x10) << 3) | (rn & 0xf) << 16); + opcode |= 1 << 6; + + rd >>= 1; rd <<= 1; + rn >>= 1; rn <<= 1; + rm >>= 1; rm <<= 1; } - opcode |= ((size & 3) << 8); + opcode |= ((rd & 0xf) << 12) | ((rd & 0x10) << 18); + opcode |= ((rn & 0xf) << 16) | ((rn & 0x10) << 3); + opcode |= ((rm & 0xf) << 0) | ((rm & 0x10) << 1); - V128 v0 = MakeVectorE0E1(z, z); - V128 v1 = MakeVectorE0E1(a, z); - V128 v2 = MakeVectorE0E1(b, z); + V128 v0 = MakeVectorE0E1(d0, d1); + V128 v1 = MakeVectorE0E1(d2, d3); + + SingleOpcode(opcode, v0: v0, v1: v1); + + CompareAgainstUnicorn(); + } + + [Test, Pairwise] [Explicit] + public void Vmla_Vmls_Vnmla_Vnmls_S_F32([ValueSource(nameof(_Vmla_Vmls_Vnmla_Vnmls_S_F32_))] uint opcode, + [Values(0u, 1u, 2u, 3u)] uint rd, + [Values(0u, 1u, 2u, 3u)] uint rn, + [Values(0u, 1u, 2u, 3u)] uint rm, + [ValueSource(nameof(_1S_F_))] ulong s0, + [ValueSource(nameof(_1S_F_))] ulong s1, + [ValueSource(nameof(_1S_F_))] ulong s2, + [ValueSource(nameof(_1S_F_))] ulong s3) + { + opcode |= (((rd & 0x1) << 22) | (rd & 0x1e) << 11); + opcode |= (((rn & 0x1) << 7) | (rn & 0x1e) << 15); + opcode |= (((rm & 0x1) << 5) | (rm & 0x1e) >> 1); + + V128 v0 = MakeVectorE0E1E2E3((uint)s0, (uint)s1, (uint)s2, (uint)s3); + + SingleOpcode(opcode, v0: v0); + + CompareAgainstUnicorn(); + } + + [Test, Pairwise] [Explicit] + public void Vmla_Vmls_Vnmla_Vnmls_S_F64([ValueSource(nameof(_Vmla_Vmls_Vnmla_Vnmls_S_F64_))] uint opcode, + [Values(0u, 1u)] uint rd, + [Values(0u, 1u)] uint rn, + [Values(0u, 1u)] uint rm, + [ValueSource(nameof(_1D_F_))] ulong d0, + [ValueSource(nameof(_1D_F_))] ulong d1) + { + opcode |= (((rd & 0x10) << 18) | (rd & 0xf) << 12); + opcode |= (((rn & 0x10) << 3) | (rn & 0xf) << 16); + opcode |= (((rm & 0x10) << 1) | (rm & 0xf) << 0); + + V128 v0 = MakeVectorE0E1(d0, d1); + + SingleOpcode(opcode, v0: v0); - SingleOpcode(opcode, v0: v0, v1: v1, v2: v2); CompareAgainstUnicorn(); }