Add SSE4.2 Path for CRC32, add A32 variant, add tests for non-castagnoli variants. (#1328)

* Add CRC32 A32 instructions.

* Fix CRC32 instructions.

* Add CRC intrinsic and fast path.

Loop is currently unrolled, will look into adding temp vars after tests are added.

* Begin work on Crc tests

* Fix SSE4.2 path for CRC32C, finalize tests.

* Remove unused IR path.

* Fix spacing between prefix checks.

* This should be Src.

* PTC Version

* OpCodeTable Order

* Integer check improvement. Value and Crc can be either 32 or 64 size.

* This wasn't necessary...

* If size is 3, value type must be I64.

* Fix same src+dest handling for non crc intrinsics.

* Pre-fix (ha) issue with vex encodings
This commit is contained in:
riperiperi 2020-07-13 11:48:14 +01:00 committed by GitHub
parent 30d4f752f4
commit d7044b10a2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 448 additions and 161 deletions

View File

@ -28,10 +28,10 @@ namespace ARMeilleure.CodeGen.X86
Vex = 1 << 4, Vex = 1 << 4,
PrefixBit = 16, PrefixBit = 16,
PrefixMask = 3 << PrefixBit, PrefixMask = 7 << PrefixBit,
Prefix66 = 1 << PrefixBit, Prefix66 = 1 << PrefixBit,
PrefixF3 = 2 << PrefixBit, PrefixF3 = 2 << PrefixBit,
PrefixF2 = 3 << PrefixBit PrefixF2 = 4 << PrefixBit
} }
private struct InstructionInfo private struct InstructionInfo
@ -104,6 +104,9 @@ namespace ARMeilleure.CodeGen.X86
Add(X86Instruction.Comisd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f2f, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Comisd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f2f, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Comiss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f2f, InstructionFlags.Vex)); Add(X86Instruction.Comiss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f2f, InstructionFlags.Vex));
Add(X86Instruction.Cpuid, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fa2, InstructionFlags.RegOnly)); Add(X86Instruction.Cpuid, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fa2, InstructionFlags.RegOnly));
Add(X86Instruction.Crc32, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38f1, InstructionFlags.PrefixF2));
Add(X86Instruction.Crc32_16, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38f1, InstructionFlags.PrefixF2 | InstructionFlags.Prefix66));
Add(X86Instruction.Crc32_8, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38f0, InstructionFlags.PrefixF2 | InstructionFlags.Reg8Src));
Add(X86Instruction.Cvtdq2pd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fe6, InstructionFlags.Vex | InstructionFlags.PrefixF3)); Add(X86Instruction.Cvtdq2pd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fe6, InstructionFlags.Vex | InstructionFlags.PrefixF3));
Add(X86Instruction.Cvtdq2ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f5b, InstructionFlags.Vex)); Add(X86Instruction.Cvtdq2ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f5b, InstructionFlags.Vex));
Add(X86Instruction.Cvtpd2dq, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fe6, InstructionFlags.Vex | InstructionFlags.PrefixF2)); Add(X86Instruction.Cvtpd2dq, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fe6, InstructionFlags.Vex | InstructionFlags.PrefixF2));
@ -1172,7 +1175,15 @@ namespace ARMeilleure.CodeGen.X86
if ((flags & InstructionFlags.Vex) != 0 && HardwareCapabilities.SupportsVexEncoding) if ((flags & InstructionFlags.Vex) != 0 && HardwareCapabilities.SupportsVexEncoding)
{ {
int vexByte2 = (int)(flags & InstructionFlags.PrefixMask) >> (int)InstructionFlags.PrefixBit; // In a vex encoding, only one prefix can be active at a time. The active prefix is encoded in the second byte using two bits.
int vexByte2 = (flags & InstructionFlags.PrefixMask) switch
{
InstructionFlags.Prefix66 => 1,
InstructionFlags.PrefixF3 => 2,
InstructionFlags.PrefixF2 => 3,
_ => 0
};
if (src1 != null) if (src1 != null)
{ {
@ -1220,11 +1231,19 @@ namespace ARMeilleure.CodeGen.X86
} }
else else
{ {
switch (flags & InstructionFlags.PrefixMask) if (flags.HasFlag(InstructionFlags.Prefix66))
{ {
case InstructionFlags.Prefix66: WriteByte(0x66); break; WriteByte(0x66);
case InstructionFlags.PrefixF2: WriteByte(0xf2); break; }
case InstructionFlags.PrefixF3: WriteByte(0xf3); break;
if (flags.HasFlag(InstructionFlags.PrefixF2))
{
WriteByte(0xf2);
}
if (flags.HasFlag(InstructionFlags.PrefixF3))
{
WriteByte(0xf3);
} }
if (rexPrefix != 0) if (rexPrefix != 0)

View File

@ -333,6 +333,21 @@ namespace ARMeilleure.CodeGen.X86
break; break;
} }
case IntrinsicType.Crc32:
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0);
Operand src2 = operation.GetSource(1);
EnsureSameReg(dest, src1);
Debug.Assert(dest.Type.IsInteger() && src1.Type.IsInteger() && src2.Type.IsInteger());
context.Assembler.WriteInstruction(info.Inst, dest, src2, dest.Type);
break;
}
case IntrinsicType.BinaryImm: case IntrinsicType.BinaryImm:
{ {
Operand dest = operation.Destination; Operand dest = operation.Destination;

View File

@ -38,6 +38,9 @@ namespace ARMeilleure.CodeGen.X86
Add(Intrinsic.X86Comisseq, new IntrinsicInfo(X86Instruction.Comiss, IntrinsicType.Comis_)); Add(Intrinsic.X86Comisseq, new IntrinsicInfo(X86Instruction.Comiss, IntrinsicType.Comis_));
Add(Intrinsic.X86Comissge, new IntrinsicInfo(X86Instruction.Comiss, IntrinsicType.Comis_)); Add(Intrinsic.X86Comissge, new IntrinsicInfo(X86Instruction.Comiss, IntrinsicType.Comis_));
Add(Intrinsic.X86Comisslt, new IntrinsicInfo(X86Instruction.Comiss, IntrinsicType.Comis_)); Add(Intrinsic.X86Comisslt, new IntrinsicInfo(X86Instruction.Comiss, IntrinsicType.Comis_));
Add(Intrinsic.X86Crc32, new IntrinsicInfo(X86Instruction.Crc32, IntrinsicType.Crc32));
Add(Intrinsic.X86Crc32_16, new IntrinsicInfo(X86Instruction.Crc32_16, IntrinsicType.Crc32));
Add(Intrinsic.X86Crc32_8, new IntrinsicInfo(X86Instruction.Crc32_8, IntrinsicType.Crc32));
Add(Intrinsic.X86Cvtdq2pd, new IntrinsicInfo(X86Instruction.Cvtdq2pd, IntrinsicType.Unary)); Add(Intrinsic.X86Cvtdq2pd, new IntrinsicInfo(X86Instruction.Cvtdq2pd, IntrinsicType.Unary));
Add(Intrinsic.X86Cvtdq2ps, new IntrinsicInfo(X86Instruction.Cvtdq2ps, IntrinsicType.Unary)); Add(Intrinsic.X86Cvtdq2ps, new IntrinsicInfo(X86Instruction.Cvtdq2ps, IntrinsicType.Unary));
Add(Intrinsic.X86Cvtpd2dq, new IntrinsicInfo(X86Instruction.Cvtpd2dq, IntrinsicType.Unary)); Add(Intrinsic.X86Cvtpd2dq, new IntrinsicInfo(X86Instruction.Cvtpd2dq, IntrinsicType.Unary));

View File

@ -9,6 +9,7 @@ namespace ARMeilleure.CodeGen.X86
Binary, Binary,
BinaryGpr, BinaryGpr,
BinaryImm, BinaryImm,
Crc32,
Ternary, Ternary,
TernaryImm TernaryImm
} }

View File

@ -1294,11 +1294,22 @@ namespace ARMeilleure.CodeGen.X86
case Instruction.VectorInsert16: case Instruction.VectorInsert16:
case Instruction.VectorInsert8: case Instruction.VectorInsert8:
return !HardwareCapabilities.SupportsVexEncoding; return !HardwareCapabilities.SupportsVexEncoding;
case Instruction.Extended:
return IsIntrinsicSameOperandDestSrc1(operation);
} }
return IsVexSameOperandDestSrc1(operation); return IsVexSameOperandDestSrc1(operation);
} }
// Same-operand (dest/src1) check used for Instruction.Extended operations.
// Intrinsics whose table entry is of type Crc32 always answer true; every other
// intrinsic defers to the generic IsVexSameOperandDestSrc1 check.
private static bool IsIntrinsicSameOperandDestSrc1(Operation operation)
{
    IntrinsicOperation extended = (IntrinsicOperation)operation;

    if (IntrinsicTable.GetInfo(extended.Intrinsic).Type == IntrinsicType.Crc32)
    {
        return true;
    }

    return IsVexSameOperandDestSrc1(operation);
}
private static bool IsVexSameOperandDestSrc1(Operation operation) private static bool IsVexSameOperandDestSrc1(Operation operation)
{ {
if (IsIntrinsic(operation.Instruction)) if (IsIntrinsic(operation.Instruction))

View File

@ -33,6 +33,9 @@ namespace ARMeilleure.CodeGen.X86
Comisd, Comisd,
Comiss, Comiss,
Cpuid, Cpuid,
Crc32,
Crc32_16,
Crc32_8,
Cvtdq2pd, Cvtdq2pd,
Cvtdq2ps, Cvtdq2ps,
Cvtpd2dq, Cvtpd2dq,

View File

@ -659,6 +659,12 @@ namespace ARMeilleure.Decoders
SetA32("<<<<00110101xxxx0000xxxxxxxxxxxx", InstName.Cmp, InstEmit32.Cmp, typeof(OpCode32AluImm)); SetA32("<<<<00110101xxxx0000xxxxxxxxxxxx", InstName.Cmp, InstEmit32.Cmp, typeof(OpCode32AluImm));
SetA32("<<<<00010101xxxx0000xxxxxxx0xxxx", InstName.Cmp, InstEmit32.Cmp, typeof(OpCode32AluRsImm)); SetA32("<<<<00010101xxxx0000xxxxxxx0xxxx", InstName.Cmp, InstEmit32.Cmp, typeof(OpCode32AluRsImm));
SetA32("<<<<00010101xxxx0000xxxx0xx1xxxx", InstName.Cmp, InstEmit32.Cmp, typeof(OpCode32AluRsReg)); SetA32("<<<<00010101xxxx0000xxxx0xx1xxxx", InstName.Cmp, InstEmit32.Cmp, typeof(OpCode32AluRsReg));
SetA32("<<<<00010000xxxxxxxx00000100xxxx", InstName.Crc32b, InstEmit32.Crc32b, typeof(OpCode32AluReg));
SetA32("<<<<00010000xxxxxxxx00100100xxxx", InstName.Crc32cb, InstEmit32.Crc32cb, typeof(OpCode32AluReg));
SetA32("<<<<00010010xxxxxxxx00100100xxxx", InstName.Crc32ch, InstEmit32.Crc32ch, typeof(OpCode32AluReg));
SetA32("<<<<00010100xxxxxxxx00100100xxxx", InstName.Crc32cw, InstEmit32.Crc32cw, typeof(OpCode32AluReg));
SetA32("<<<<00010010xxxxxxxx00000100xxxx", InstName.Crc32h, InstEmit32.Crc32h, typeof(OpCode32AluReg));
SetA32("<<<<00010100xxxxxxxx00000100xxxx", InstName.Crc32w, InstEmit32.Crc32w, typeof(OpCode32AluReg));
SetA32("1111010101111111111100000101xxxx", InstName.Dmb, InstEmit32.Dmb, typeof(OpCode32)); SetA32("1111010101111111111100000101xxxx", InstName.Dmb, InstEmit32.Dmb, typeof(OpCode32));
SetA32("1111010101111111111100000100xxxx", InstName.Dsb, InstEmit32.Dsb, typeof(OpCode32)); SetA32("1111010101111111111100000100xxxx", InstName.Dsb, InstEmit32.Dsb, typeof(OpCode32));
SetA32("<<<<0010001xxxxxxxxxxxxxxxxxxxxx", InstName.Eor, InstEmit32.Eor, typeof(OpCode32AluImm)); SetA32("<<<<0010001xxxxxxxxxxxxxxxxxxxxx", InstName.Eor, InstEmit32.Eor, typeof(OpCode32AluImm));

View File

@ -1,182 +1,67 @@
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
using ARMeilleure.Decoders; using ARMeilleure.Decoders;
using ARMeilleure.IntermediateRepresentation; using ARMeilleure.IntermediateRepresentation;
using ARMeilleure.Translation; using ARMeilleure.Translation;
using static ARMeilleure.Instructions.InstEmitHashHelper;
using static ARMeilleure.Instructions.InstEmitHelper; using static ARMeilleure.Instructions.InstEmitHelper;
using static ARMeilleure.Instructions.InstEmitSimdHelper;
using static ARMeilleure.IntermediateRepresentation.OperandHelper;
namespace ARMeilleure.Instructions namespace ARMeilleure.Instructions
{ {
static partial class InstEmit static partial class InstEmit
{ {
private const int ByteSizeLog2 = 0;
private const int HWordSizeLog2 = 1;
private const int WordSizeLog2 = 2;
private const int DWordSizeLog2 = 3;
public static void Crc32b(ArmEmitterContext context) public static void Crc32b(ArmEmitterContext context)
{ {
if (Optimizations.UsePclmulqdq) EmitCrc32Call(context, ByteSizeLog2, false);
{
EmitCrc32Optimized(context, false, 8);
}
else
{
EmitCrc32Call(context, nameof(SoftFallback.Crc32b));
}
} }
public static void Crc32h(ArmEmitterContext context) public static void Crc32h(ArmEmitterContext context)
{ {
if (Optimizations.UsePclmulqdq) EmitCrc32Call(context, HWordSizeLog2, false);
{
EmitCrc32Optimized(context, false, 16);
}
else
{
EmitCrc32Call(context, nameof(SoftFallback.Crc32h));
}
} }
public static void Crc32w(ArmEmitterContext context) public static void Crc32w(ArmEmitterContext context)
{ {
if (Optimizations.UsePclmulqdq) EmitCrc32Call(context, WordSizeLog2, false);
{
EmitCrc32Optimized(context, false, 32);
}
else
{
EmitCrc32Call(context, nameof(SoftFallback.Crc32w));
}
} }
public static void Crc32x(ArmEmitterContext context) public static void Crc32x(ArmEmitterContext context)
{ {
if (Optimizations.UsePclmulqdq) EmitCrc32Call(context, DWordSizeLog2, false);
{
EmitCrc32Optimized64(context, false);
}
else
{
EmitCrc32Call(context, nameof(SoftFallback.Crc32x));
}
} }
public static void Crc32cb(ArmEmitterContext context) public static void Crc32cb(ArmEmitterContext context)
{ {
if (Optimizations.UsePclmulqdq) EmitCrc32Call(context, ByteSizeLog2, true);
{
EmitCrc32Optimized(context, true, 8);
}
else
{
EmitCrc32Call(context, nameof(SoftFallback.Crc32cb));
}
} }
public static void Crc32ch(ArmEmitterContext context) public static void Crc32ch(ArmEmitterContext context)
{ {
if (Optimizations.UsePclmulqdq) EmitCrc32Call(context, HWordSizeLog2, true);
{
EmitCrc32Optimized(context, true, 16);
}
else
{
EmitCrc32Call(context, nameof(SoftFallback.Crc32ch));
}
} }
public static void Crc32cw(ArmEmitterContext context) public static void Crc32cw(ArmEmitterContext context)
{ {
if (Optimizations.UsePclmulqdq) EmitCrc32Call(context, WordSizeLog2, true);
{
EmitCrc32Optimized(context, true, 32);
}
else
{
EmitCrc32Call(context, nameof(SoftFallback.Crc32cw));
}
} }
public static void Crc32cx(ArmEmitterContext context) public static void Crc32cx(ArmEmitterContext context)
{ {
if (Optimizations.UsePclmulqdq) EmitCrc32Call(context, DWordSizeLog2, true);
{
EmitCrc32Optimized64(context, true);
}
else
{
EmitCrc32Call(context, nameof(SoftFallback.Crc32cx));
}
} }
private static void EmitCrc32Optimized(ArmEmitterContext context, bool castagnoli, int bitsize) private static void EmitCrc32Call(ArmEmitterContext context, int size, bool c)
{
OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp;
long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1
Operand crc = GetIntOrZR(context, op.Rn);
Operand data = GetIntOrZR(context, op.Rm);
crc = context.VectorInsert(context.VectorZero(), crc, 0);
switch (bitsize)
{
case 8: data = context.VectorInsert8(context.VectorZero(), data, 0); break;
case 16: data = context.VectorInsert16(context.VectorZero(), data, 0); break;
case 32: data = context.VectorInsert(context.VectorZero(), data, 0); break;
}
Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data);
tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(64 - bitsize));
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(0));
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
if (bitsize < 32)
{
crc = context.AddIntrinsic(Intrinsic.X86Pslldq, crc, Const((64 - bitsize) / 8));
tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, crc);
}
SetIntOrZR(context, op.Rd, context.VectorExtract(OperandType.I32, tmp, 2));
}
private static void EmitCrc32Optimized64(ArmEmitterContext context, bool castagnoli)
{
OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp;
long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1
Operand crc = GetIntOrZR(context, op.Rn);
Operand data = GetIntOrZR(context, op.Rm);
crc = context.VectorInsert(context.VectorZero(), crc, 0);
data = context.VectorInsert(context.VectorZero(), data, 0);
Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data);
Operand res = context.AddIntrinsic(Intrinsic.X86Pslldq, tmp, Const(4));
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, res, X86GetScalar(context, mu), Const(0));
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, res);
tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(32));
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(1));
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
SetIntOrZR(context, op.Rd, context.VectorExtract(OperandType.I32, tmp, 2));
}
private static void EmitCrc32Call(ArmEmitterContext context, string name)
{ {
OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp; OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp;
Operand n = GetIntOrZR(context, op.Rn); Operand n = GetIntOrZR(context, op.Rn);
Operand m = GetIntOrZR(context, op.Rm); Operand m = GetIntOrZR(context, op.Rm);
Operand d = context.Call(typeof(SoftFallback).GetMethod(name), n, m); Operand d = EmitCrc32(context, n, m, size, c);
SetIntOrZR(context, op.Rd, d); SetIntOrZR(context, op.Rd, d);
} }

View File

@ -0,0 +1,54 @@
using ARMeilleure.Decoders;
using ARMeilleure.IntermediateRepresentation;
using ARMeilleure.Translation;
using static ARMeilleure.Instructions.InstEmitHelper;
using static ARMeilleure.Instructions.InstEmitHashHelper;
namespace ARMeilleure.Instructions
{
    static partial class InstEmit32
    {
        // Each public emitter forwards to EmitCrc32Call with the log2 byte size of the
        // data operand and a flag selecting the Castagnoli (CRC32C) variant.

        public static void Crc32b(ArmEmitterContext context) => EmitCrc32Call(context, ByteSizeLog2, false);

        public static void Crc32h(ArmEmitterContext context) => EmitCrc32Call(context, HWordSizeLog2, false);

        public static void Crc32w(ArmEmitterContext context) => EmitCrc32Call(context, WordSizeLog2, false);

        public static void Crc32cb(ArmEmitterContext context) => EmitCrc32Call(context, ByteSizeLog2, true);

        public static void Crc32ch(ArmEmitterContext context) => EmitCrc32Call(context, HWordSizeLog2, true);

        public static void Crc32cw(ArmEmitterContext context) => EmitCrc32Call(context, WordSizeLog2, true);

        // Reads Rn (passed to EmitCrc32 as the CRC) and Rm (passed as the value) of the
        // current A32 opcode, emits the CRC computation and stores the result through EmitAluStore.
        private static void EmitCrc32Call(ArmEmitterContext context, int size, bool c)
        {
            IOpCode32AluReg opCode = (IOpCode32AluReg)context.CurrOp;

            Operand accumulator = GetIntA32(context, opCode.Rn);
            Operand data        = GetIntA32(context, opCode.Rm);

            EmitAluStore(context, EmitCrc32(context, accumulator, data, size, c));
        }
    }
}

View File

@ -0,0 +1,119 @@
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
using ARMeilleure.IntermediateRepresentation;
using ARMeilleure.Translation;
using System;
using System.Diagnostics;
using static ARMeilleure.IntermediateRepresentation.OperandHelper;
using static ARMeilleure.Instructions.InstEmitSimdHelper;
namespace ARMeilleure.Instructions
{
    /// <summary>
    /// IR emission shared by the CRC32 / CRC32C instruction emitters.
    /// </summary>
    static class InstEmitHashHelper
    {
        public const uint Crc32RevPoly  = 0xedb88320;
        public const uint Crc32cRevPoly = 0x82f63b78;

        /// <summary>
        /// Emits the IR for one CRC32 step and returns the operand holding the result.
        /// </summary>
        /// <param name="context">Emitter context the operations are appended to.</param>
        /// <param name="crc">Integer operand holding the incoming CRC.</param>
        /// <param name="value">Integer operand holding the data; asserted to be I64 when <paramref name="size"/> is 3.</param>
        /// <param name="size">Log2 of the data width in bytes (0 to 3).</param>
        /// <param name="castagnoli">True for the CRC32C variant, false for plain CRC32.</param>
        /// <returns>
        /// The result of, in order of preference: the x86 CRC32 intrinsic (CRC32C only, when
        /// <c>Optimizations.UseSse42</c> is set), the PCLMULQDQ sequence (when
        /// <c>Optimizations.UsePclmulqdq</c> is set), or a call into <c>SoftFallback</c>.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// The software fallback is selected and <paramref name="size"/> is outside 0..3.
        /// </exception>
        public static Operand EmitCrc32(ArmEmitterContext context, Operand crc, Operand value, int size, bool castagnoli)
        {
            Debug.Assert(crc.Type.IsInteger() && value.Type.IsInteger());
            Debug.Assert(size >= 0 && size < 4);
            Debug.Assert((size < 3) || (value.Type == OperandType.I64));

            if (castagnoli && Optimizations.UseSse42)
            {
                // The CRC32 instruction does not have an immediate variant, so ensure both inputs are in registers.
                if (value.Kind == OperandKind.Constant)
                {
                    value = context.Copy(value);
                }

                if (crc.Kind == OperandKind.Constant)
                {
                    crc = context.Copy(crc);
                }

                Intrinsic op;

                switch (size)
                {
                    case 0:  op = Intrinsic.X86Crc32_8;  break;
                    case 1:  op = Intrinsic.X86Crc32_16; break;
                    default: op = Intrinsic.X86Crc32;    break;
                }

                if (size != 3)
                {
                    return context.AddIntrinsicInt(op, crc, value);
                }

                // 64-bit data: the intrinsic is emitted with a long result, then narrowed to 32 bits.
                Operand wide = context.AddIntrinsicLong(op, crc, value);

                return context.ConvertI64ToI32(wide);
            }

            if (Optimizations.UsePclmulqdq)
            {
                if (size == 3)
                {
                    return EmitCrc32Optimized64(context, crc, value, castagnoli);
                }

                return EmitCrc32Optimized(context, crc, value, castagnoli, size);
            }

            // Software fallback: pick the SoftFallback method matching the size and polynomial.
            string name = size switch
            {
                0 => castagnoli ? nameof(SoftFallback.Crc32cb) : nameof(SoftFallback.Crc32b),
                1 => castagnoli ? nameof(SoftFallback.Crc32ch) : nameof(SoftFallback.Crc32h),
                2 => castagnoli ? nameof(SoftFallback.Crc32cw) : nameof(SoftFallback.Crc32w),
                3 => castagnoli ? nameof(SoftFallback.Crc32cx) : nameof(SoftFallback.Crc32x),
                _ => throw new ArgumentOutOfRangeException(nameof(size))
            };

            return context.Call(typeof(SoftFallback).GetMethod(name), crc, value);
        }

        // PCLMULQDQ-based CRC for 8/16/32-bit data (size 0..2).
        private static Operand EmitCrc32Optimized(ArmEmitterContext context, Operand crc, Operand data, bool castagnoli, int size)
        {
            long mu         = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
            long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1

            crc = context.VectorInsert(context.VectorZero(), crc, 0);

            // Place the data in element 0 of a zeroed vector, using the insert matching its width.
            if (size == 0)
            {
                data = context.VectorInsert8(context.VectorZero(), data, 0);
            }
            else if (size == 1)
            {
                data = context.VectorInsert16(context.VectorZero(), data, 0);
            }
            else if (size == 2)
            {
                data = context.VectorInsert(context.VectorZero(), data, 0);
            }

            int bitsize   = 8 << size;
            int shiftBits = 64 - bitsize;

            Operand acc = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data);
            acc = context.AddIntrinsic(Intrinsic.X86Psllq, acc, Const(shiftBits));

            Operand muVec = X86GetScalar(context, mu);
            acc = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, acc, muVec, Const(0));

            Operand polyVec = X86GetScalar(context, polynomial);
            acc = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, acc, polyVec, Const(0));

            if (bitsize < 32)
            {
                crc = context.AddIntrinsic(Intrinsic.X86Pslldq, crc, Const(shiftBits / 8));
                acc = context.AddIntrinsic(Intrinsic.X86Pxor, acc, crc);
            }

            return context.VectorExtract(OperandType.I32, acc, 2);
        }

        // PCLMULQDQ-based CRC for 64-bit data (size 3).
        private static Operand EmitCrc32Optimized64(ArmEmitterContext context, Operand crc, Operand data, bool castagnoli)
        {
            long mu         = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
            long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1

            crc  = context.VectorInsert(context.VectorZero(), crc, 0);
            data = context.VectorInsert(context.VectorZero(), data, 0);

            Operand mixed   = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data);
            Operand shifted = context.AddIntrinsic(Intrinsic.X86Pslldq, mixed, Const(4));

            Operand acc = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, shifted, X86GetScalar(context, mu), Const(0));
            acc = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, acc, X86GetScalar(context, polynomial), Const(0));

            acc = context.AddIntrinsic(Intrinsic.X86Pxor, acc, shifted);
            acc = context.AddIntrinsic(Intrinsic.X86Psllq, acc, Const(32));

            acc = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, acc, X86GetScalar(context, mu), Const(1));
            acc = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, acc, X86GetScalar(context, polynomial), Const(0));

            return context.VectorExtract(OperandType.I32, acc, 2);
        }
    }
}

View File

@ -27,6 +27,9 @@ namespace ARMeilleure.IntermediateRepresentation
X86Comisseq, X86Comisseq,
X86Comissge, X86Comissge,
X86Comisslt, X86Comisslt,
X86Crc32,
X86Crc32_16,
X86Crc32_8,
X86Cvtdq2pd, X86Cvtdq2pd,
X86Cvtdq2ps, X86Cvtdq2ps,
X86Cvtpd2dq, X86Cvtpd2dq,

View File

@ -20,7 +20,7 @@ namespace ARMeilleure.Translation.PTC
{ {
private const string HeaderMagic = "PTChd"; private const string HeaderMagic = "PTChd";
private const int InternalVersion = 8; //! To be incremented manually for each change to the ARMeilleure project. private const int InternalVersion = 9; //! To be incremented manually for each change to the ARMeilleure project.
private const string BaseDir = "Ryujinx"; private const string BaseDir = "Ryujinx";

View File

@ -167,11 +167,11 @@ namespace Ryujinx.Tests.Cpu
} }
} }
protected void ExecuteOpcodes() protected void ExecuteOpcodes(bool runUnicorn = true)
{ {
_cpuContext.Execute(_context, _entryPoint); _cpuContext.Execute(_context, _entryPoint);
if (_unicornAvailable) if (_unicornAvailable && runUnicorn)
{ {
_unicornEmu.RunForCount((_currAddress - _entryPoint - 4) / 4); _unicornEmu.RunForCount((_currAddress - _entryPoint - 4) / 4);
} }
@ -196,12 +196,13 @@ namespace Ryujinx.Tests.Cpu
bool zero = false, bool zero = false,
bool negative = false, bool negative = false,
int fpcr = 0, int fpcr = 0,
int fpsr = 0) int fpsr = 0,
bool runUnicorn = true)
{ {
Opcode(opcode); Opcode(opcode);
Opcode(0xD65F03C0); // RET Opcode(0xD65F03C0); // RET
SetContext(x0, x1, x2, x3, x31, v0, v1, v2, v3, v4, v5, v30, v31, overflow, carry, zero, negative, fpcr, fpsr); SetContext(x0, x1, x2, x3, x31, v0, v1, v2, v3, v4, v5, v30, v31, overflow, carry, zero, negative, fpcr, fpsr);
ExecuteOpcodes(); ExecuteOpcodes(runUnicorn);
return GetContext(); return GetContext();
} }

View File

@ -1,5 +1,6 @@
#define AluBinary #define AluBinary
using ARMeilleure.State;
using NUnit.Framework; using NUnit.Framework;
namespace Ryujinx.Tests.Cpu namespace Ryujinx.Tests.Cpu
@ -8,8 +9,78 @@ namespace Ryujinx.Tests.Cpu
public sealed class CpuTestAluBinary : CpuTest public sealed class CpuTestAluBinary : CpuTest
{ {
#if AluBinary #if AluBinary
/// <summary>
/// One CRC32 test vector: the initial CRC, the 64-bit input value, the polynomial
/// selector flag and the expected result of each size variant.
/// </summary>
public struct CrcTest
{
    public uint Crc;
    public ulong Value;
    public bool C;

    public uint[] Results; // One result for each CRC variant (8, 16, 32, 64), indexed by size

    public CrcTest(uint crc, ulong value, bool c, params uint[] results)
        => (Crc, Value, C, Results) = (crc, value, c, results);
}
#region "ValueSource (CRC32)"
// Known-answer vectors for the CRC32 (non-C) instructions.
// Created with http://www.sunshine2k.de/coding/javascript/crc/crc_js.html, with:
// - non-reflected polynomials
// - input reflected, result reflected
// - bytes in order of increasing significance
// - xor 0
// Only includes non-C variant, as the other can be tested with unicorn.
private static CrcTest[] _CRC32_Test_Values_() => new[]
{
    new CrcTest(0x00000000u, 0x00_00_00_00_00_00_00_00u, false, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
    new CrcTest(0x00000000u, 0x7f_ff_ff_ff_ff_ff_ff_ffu, false, 0x2d02ef8d, 0xbe2612ff, 0xdebb20e3, 0xa9de8355),
    new CrcTest(0x00000000u, 0x80_00_00_00_00_00_00_00u, false, 0x00000000, 0x00000000, 0x00000000, 0xedb88320),
    new CrcTest(0x00000000u, 0xff_ff_ff_ff_ff_ff_ff_ffu, false, 0x2d02ef8d, 0xbe2612ff, 0xdebb20e3, 0x44660075),
    new CrcTest(0x00000000u, 0xa0_02_f1_ca_52_78_8c_1cu, false, 0x14015c4f, 0x02799256, 0x9063c9e5, 0x8816610a),

    new CrcTest(0xffffffffu, 0x00_00_00_00_00_00_00_00u, false, 0x2dfd1072, 0xbe26ed00, 0xdebb20e3, 0x9add2096),
    new CrcTest(0xffffffffu, 0x7f_ff_ff_ff_ff_ff_ff_ffu, false, 0x00ffffff, 0x0000ffff, 0x00000000, 0x3303a3c3),
    new CrcTest(0xffffffffu, 0x80_00_00_00_00_00_00_00u, false, 0x2dfd1072, 0xbe26ed00, 0xdebb20e3, 0x7765a3b6),
    new CrcTest(0xffffffffu, 0xff_ff_ff_ff_ff_ff_ff_ffu, false, 0x00ffffff, 0x0000ffff, 0x00000000, 0xdebb20e3),
    new CrcTest(0xffffffffu, 0xa0_02_f1_ca_52_78_8c_1cu, false, 0x39fc4c3d, 0xbc5f7f56, 0x4ed8e906, 0x12cb419c)
};
#endregion
private const int RndCnt = 2; private const int RndCnt = 2;
// Checks CRC32B/H/W/X against the known-answer table instead of Unicorn (runUnicorn: false).
// `size` selects the variant (0 = byte ... 3 = doubleword) and indexes test.Results.
[Test, Combinatorial]
public void Crc32_b_h_w_x([Values(0u)] uint rd,
                          [Values(1u)] uint rn,
                          [Values(2u)] uint rm,
                          [Range(0u, 3u)] uint size,
                          [ValueSource("_CRC32_Test_Values_")] CrcTest test)
{
    uint opcode = 0x1AC04000; // CRC32B W0, W0, W0

    opcode |= size << 10;
    opcode |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0);

    if (size == 3)
    {
        // The 64-bit data variant additionally sets the top bit of the opcode.
        opcode |= 0x80000000;
    }

    uint w31 = TestContext.CurrentContext.Random.NextUInt();

    SingleOpcode(opcode, x1: test.Crc, x2: test.Value, x31: w31, runUnicorn: false);

    ExecutionContext context = GetContext();
    ulong result = context.GetX((int)rd);

    // Constraint form so a failure reports expected vs. actual instead of "Expected: True".
    Assert.That(result, Is.EqualTo((ulong)test.Results[size]),
        $"CRC32 size={size} crc=0x{test.Crc:x8} value=0x{test.Value:x16}");
}
[Test, Pairwise, Description("CRC32X <Wd>, <Wn>, <Xm>"), Ignore("Unicorn fails.")] [Test, Pairwise, Description("CRC32X <Wd>, <Wn>, <Xm>"), Ignore("Unicorn fails.")]
public void Crc32x([Values(0u, 31u)] uint rd, public void Crc32x([Values(0u, 31u)] uint rd,
[Values(1u, 31u)] uint rn, [Values(1u, 31u)] uint rn,

View File

@ -0,0 +1,96 @@
#define AluBinary32
using ARMeilleure.State;
using NUnit.Framework;
using System;
namespace Ryujinx.Tests.Cpu
{
    /// <summary>
    /// A32 binary ALU tests; currently the CRC32/CRC32C byte, halfword and word instructions,
    /// checked against a table of known results.
    /// </summary>
    [Category("AluBinary32")]
    public sealed class CpuTestAluBinary32 : CpuTest32
    {
#if AluBinary32
        /// <summary>
        /// One CRC test vector: the initial CRC, the 32-bit input value, whether the
        /// Castagnoli (CRC32C) variant is used, and the expected result per size.
        /// </summary>
        public struct CrcTest32
        {
            public uint Crc;
            public uint Value;
            public bool C;

            public uint[] Results; // One result for each CRC variant (8, 16, 32)

            public CrcTest32(uint crc, uint value, bool c, params uint[] results)
            {
                Crc = crc;
                Value = value;
                C = c;
                Results = results;
            }
        }

#region "ValueSource (CRC32/CRC32C)"
        private static CrcTest32[] _CRC32_Test_Values_()
        {
            // Created with http://www.sunshine2k.de/coding/javascript/crc/crc_js.html, with:
            //  - non-reflected polynomials
            //  - input reflected, result reflected
            //  - bytes in order of increasing significance
            //  - xor 0

            return new CrcTest32[]
            {
                new CrcTest32(0x00000000u, 0x00_00_00_00u, false, 0x00000000, 0x00000000, 0x00000000),
                new CrcTest32(0x00000000u, 0x7f_ff_ff_ffu, false, 0x2d02ef8d, 0xbe2612ff, 0x3303a3c3),
                new CrcTest32(0x00000000u, 0x80_00_00_00u, false, 0x00000000, 0x00000000, 0xedb88320),
                new CrcTest32(0x00000000u, 0xff_ff_ff_ffu, false, 0x2d02ef8d, 0xbe2612ff, 0xdebb20e3),
                new CrcTest32(0x00000000u, 0x9d_cb_12_f0u, false, 0xbdbdf21c, 0xe70590f5, 0x3f7480c5),

                new CrcTest32(0xffffffffu, 0x00_00_00_00u, false, 0x2dfd1072, 0xbe26ed00, 0xdebb20e3),
                new CrcTest32(0xffffffffu, 0x7f_ff_ff_ffu, false, 0x00ffffff, 0x0000ffff, 0xedb88320),
                new CrcTest32(0xffffffffu, 0x80_00_00_00u, false, 0x2dfd1072, 0xbe26ed00, 0x3303a3c3),
                new CrcTest32(0xffffffffu, 0xff_ff_ff_ffu, false, 0x00ffffff, 0x0000ffff, 0x00000000),
                new CrcTest32(0xffffffffu, 0x9d_cb_12_f0u, false, 0x9040e26e, 0x59237df5, 0xe1cfa026),

                new CrcTest32(0x00000000u, 0x00_00_00_00u, true, 0x00000000, 0x00000000, 0x00000000),
                new CrcTest32(0x00000000u, 0x7f_ff_ff_ffu, true, 0xad7d5351, 0x0e9e77d2, 0x356e8f40),
                new CrcTest32(0x00000000u, 0x80_00_00_00u, true, 0x00000000, 0x00000000, 0x82f63b78),
                new CrcTest32(0x00000000u, 0xff_ff_ff_ffu, true, 0xad7d5351, 0x0e9e77d2, 0xb798b438),
                new CrcTest32(0x00000000u, 0x9d_cb_12_f0u, true, 0xf36e6f75, 0xb5ff99e6, 0x782dfbf1),

                new CrcTest32(0xffffffffu, 0x00_00_00_00u, true, 0xad82acae, 0x0e9e882d, 0xb798b438),
                new CrcTest32(0xffffffffu, 0x7f_ff_ff_ffu, true, 0x00ffffff, 0x0000ffff, 0x82f63b78),
                new CrcTest32(0xffffffffu, 0x80_00_00_00u, true, 0xad82acae, 0x0e9e882d, 0x356e8f40),
                new CrcTest32(0xffffffffu, 0xff_ff_ff_ffu, true, 0x00ffffff, 0x0000ffff, 0x00000000),
                new CrcTest32(0xffffffffu, 0x9d_cb_12_f0u, true, 0x5eecc3db, 0xbb6111cb, 0xcfb54fc9)
            };
        }
#endregion

        // `size` selects the variant (0 = byte, 1 = halfword, 2 = word) and indexes test.Results;
        // test.C switches the opcode to the CRC32C encoding.
        [Test, Combinatorial]
        public void Crc32_Crc32c_b_h_w([Values(0u)] uint rd,
                                       [Values(1u)] uint rn,
                                       [Values(2u)] uint rm,
                                       [Range(0u, 2u)] uint size,
                                       [ValueSource("_CRC32_Test_Values_")] CrcTest32 test)
        {
            // Unicorn does not yet support 32bit crc instructions, so test against a known table of results/values.

            uint opcode = 0xe1000040; // CRC32B R0, R0, R0
            opcode |= ((rm & 15) << 0) | ((rd & 15) << 12) | ((rn & 15) << 16);
            opcode |= size << 21;

            if (test.C)
            {
                opcode |= 1 << 9;
            }

            uint sp = TestContext.CurrentContext.Random.NextUInt();

            SingleOpcode(opcode, r1: test.Crc, r2: test.Value, sp: sp, runUnicorn: false);

            ExecutionContext context = GetContext();
            ulong result = context.GetX((int)rd);

            // Constraint form so a failure reports expected vs. actual instead of "Expected: True".
            Assert.That(result, Is.EqualTo((ulong)test.Results[size]),
                $"CRC32{(test.C ? "C" : "")} size={size} crc=0x{test.Crc:x8} value=0x{test.Value:x8}");
        }
#endif
    }
}