ryujinx/ARMeilleure/Instructions/InstEmitSimdHelper32.cs

using ARMeilleure.Decoders;
using ARMeilleure.IntermediateRepresentation;
using ARMeilleure.Translation;
using System;
using System.Diagnostics;
using System.Reflection;

using static ARMeilleure.Instructions.InstEmitHelper;
using static ARMeilleure.Instructions.InstEmitSimdHelper;
using static ARMeilleure.IntermediateRepresentation.OperandHelper;

namespace ARMeilleure.Instructions
{
    using Func1I = Func<Operand, Operand>;
    using Func2I = Func<Operand, Operand, Operand>;
    using Func3I = Func<Operand, Operand, Operand, Operand>;

    static class InstEmitSimdHelper32
    {
        public static (int, int) GetQuadwordAndSubindex(int index, RegisterSize size)
        {
            switch (size)
            {
                case RegisterSize.Simd128:
                    return (index >> 1, 0);
                case RegisterSize.Simd64:
                case RegisterSize.Int64:
                    return (index >> 1, index & 1);
                case RegisterSize.Int32:
                    return (index >> 2, index & 3);
            }

            throw new ArgumentException("Unrecognized Vector Register Size.");
        }

        public static Operand ExtractScalar(ArmEmitterContext context, OperandType type, int reg)
        {
            Debug.Assert(type != OperandType.V128);

            if (type == OperandType.FP64 || type == OperandType.I64)
            {
                // From dreg.
                return context.VectorExtract(type, GetVecA32(reg >> 1), reg & 1);
            }
            else
            {
                // From sreg.
                return context.VectorExtract(type, GetVecA32(reg >> 2), reg & 3);
            }
        }

        public static void InsertScalar(ArmEmitterContext context, int reg, Operand value)
        {
            Debug.Assert(value.Type != OperandType.V128);

            Operand vec, insert;
            if (value.Type == OperandType.FP64 || value.Type == OperandType.I64)
            {
                // From dreg.
                vec = GetVecA32(reg >> 1);
                insert = context.VectorInsert(vec, value, reg & 1);
            }
            else
            {
                // From sreg.
                vec = GetVecA32(reg >> 2);
                insert = context.VectorInsert(vec, value, reg & 3);
            }

            context.Copy(vec, insert);
        }

        public static Operand ExtractElement(ArmEmitterContext context, int reg, int size, bool signed)
        {
            return EmitVectorExtract32(context, reg >> (4 - size), reg & ((16 >> size) - 1), size, signed);
        }

        public static void EmitVectorImmUnaryOp32(ArmEmitterContext context, Func1I emit)
        {
            IOpCode32SimdImm op = (IOpCode32SimdImm)context.CurrOp;

            Operand imm = Const(op.Immediate);

            int elems = op.Elems;
            (int index, int subIndex) = GetQuadwordAndSubindex(op.Vd, op.RegisterSize);

            Operand vec = GetVecA32(index);
            Operand res = vec;

            for (int item = 0; item < elems; item++)
            {
                res = EmitVectorInsert(context, res, emit(imm), item + subIndex * elems, op.Size);
            }

            context.Copy(vec, res);
        }

        public static void EmitScalarUnaryOpF32(ArmEmitterContext context, Func1I emit)
        {
            OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;

            OperandType type = (op.Size & 1) != 0 ? OperandType.FP64 : OperandType.FP32;

            Operand m = ExtractScalar(context, type, op.Vm);

            InsertScalar(context, op.Vd, emit(m));
        }

        public static void EmitScalarBinaryOpF32(ArmEmitterContext context, Func2I emit)
        {
            OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;

            OperandType type = (op.Size & 1) != 0 ? OperandType.FP64 : OperandType.FP32;

            Operand n = ExtractScalar(context, type, op.Vn);
            Operand m = ExtractScalar(context, type, op.Vm);

            InsertScalar(context, op.Vd, emit(n, m));
        }

        public static void EmitScalarBinaryOpI32(ArmEmitterContext context, Func2I emit)
        {
            OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;

            OperandType type = (op.Size & 1) != 0 ? OperandType.I64 : OperandType.I32;

            if (op.Size < 2)
            {
                throw new NotSupportedException("Cannot perform a scalar SIMD operation on integers smaller than 32 bits.");
            }

            Operand n = ExtractScalar(context, type, op.Vn);
            Operand m = ExtractScalar(context, type, op.Vm);

            InsertScalar(context, op.Vd, emit(n, m));
        }

        public static void EmitScalarTernaryOpF32(ArmEmitterContext context, Func3I emit)
        {
            OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;

            OperandType type = (op.Size & 1) != 0 ? OperandType.FP64 : OperandType.FP32;

            Operand a = ExtractScalar(context, type, op.Vd);
            Operand n = ExtractScalar(context, type, op.Vn);
            Operand m = ExtractScalar(context, type, op.Vm);

            InsertScalar(context, op.Vd, emit(a, n, m));
        }

        public static void EmitVectorUnaryOpF32(ArmEmitterContext context, Func1I emit)
        {
            OpCode32Simd op = (OpCode32Simd)context.CurrOp;

            int sizeF = op.Size & 1;

            OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;

            int elems = op.GetBytesCount() >> sizeF + 2;

            Operand res = GetVecA32(op.Qd);

            for (int index = 0; index < elems; index++)
            {
                Operand me = context.VectorExtract(type, GetVecA32(op.Qm), op.Fm + index);

                res = context.VectorInsert(res, emit(me), op.Fd + index);
            }

            context.Copy(GetVecA32(op.Qd), res);
        }

        public static void EmitVectorBinaryOpF32(ArmEmitterContext context, Func2I emit)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            int sizeF = op.Size & 1;

            OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;

            int elems = op.GetBytesCount() >> (sizeF + 2);

            Operand res = GetVecA32(op.Qd);

            for (int index = 0; index < elems; index++)
            {
                Operand ne = context.VectorExtract(type, GetVecA32(op.Qn), op.Fn + index);
                Operand me = context.VectorExtract(type, GetVecA32(op.Qm), op.Fm + index);

                res = context.VectorInsert(res, emit(ne, me), op.Fd + index);
            }

            context.Copy(GetVecA32(op.Qd), res);
        }

        public static void EmitVectorTernaryOpF32(ArmEmitterContext context, Func3I emit)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            int sizeF = op.Size & 1;

            OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;

            int elems = op.GetBytesCount() >> sizeF + 2;

            Operand res = GetVecA32(op.Qd);

            for (int index = 0; index < elems; index++)
            {
                Operand de = context.VectorExtract(type, GetVecA32(op.Qd), op.Fd + index);
                Operand ne = context.VectorExtract(type, GetVecA32(op.Qn), op.Fn + index);
                Operand me = context.VectorExtract(type, GetVecA32(op.Qm), op.Fm + index);

                res = context.VectorInsert(res, emit(de, ne, me), op.Fd + index);
            }

            context.Copy(GetVecA32(op.Qd), res);
        }

        // Integer

        public static void EmitVectorUnaryOpI32(ArmEmitterContext context, Func1I emit, bool signed)
        {
            OpCode32Simd op = (OpCode32Simd)context.CurrOp;

            Operand res = GetVecA32(op.Qd);

            int elems = op.GetBytesCount() >> op.Size;

            for (int index = 0; index < elems; index++)
            {
                Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, signed);

                res = EmitVectorInsert(context, res, emit(me), op.Id + index, op.Size);
            }

            context.Copy(GetVecA32(op.Qd), res);
        }

        public static void EmitVectorBinaryOpI32(ArmEmitterContext context, Func2I emit, bool signed)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            Operand res = GetVecA32(op.Qd);

            int elems = op.GetBytesCount() >> op.Size;

            for (int index = 0; index < elems; index++)
            {
                Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed);
                Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, signed);

                res = EmitVectorInsert(context, res, emit(ne, me), op.Id + index, op.Size);
            }

            context.Copy(GetVecA32(op.Qd), res);
        }

        public static void EmitVectorBinaryLongOpI32(ArmEmitterContext context, Func2I emit, bool signed)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            Operand res = context.VectorZero();

            int elems = op.GetBytesCount() >> op.Size;

            for (int index = 0; index < elems; index++)
            {
                Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed);
                Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, signed);

                if (op.Size == 2)
                {
                    ne = signed ? context.SignExtend32(OperandType.I64, ne) : context.ZeroExtend32(OperandType.I64, ne);
                    me = signed ? context.SignExtend32(OperandType.I64, me) : context.ZeroExtend32(OperandType.I64, me);
                }

                res = EmitVectorInsert(context, res, emit(ne, me), index, op.Size + 1);
            }

            context.Copy(GetVecA32(op.Qd), res);
        }

        public static void EmitVectorTernaryLongOpI32(ArmEmitterContext context, Func3I emit, bool signed)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            Operand res = context.VectorZero();

            int elems = op.GetBytesCount() >> op.Size;

            for (int index = 0; index < elems; index++)
            {
                Operand de = EmitVectorExtract32(context, op.Qd, op.Id + index, op.Size + 1, signed);
                Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size,     signed);
                Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size,     signed);

                if (op.Size == 2)
                {
                    ne = signed ? context.SignExtend32(OperandType.I64, ne) : context.ZeroExtend32(OperandType.I64, ne);
                    me = signed ? context.SignExtend32(OperandType.I64, me) : context.ZeroExtend32(OperandType.I64, me);
                }

                res = EmitVectorInsert(context, res, emit(de, ne, me), index, op.Size + 1);
            }

            context.Copy(GetVecA32(op.Qd), res);
        }

        public static void EmitVectorTernaryOpI32(ArmEmitterContext context, Func3I emit, bool signed)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            Operand res = GetVecA32(op.Qd);

            int elems = op.GetBytesCount() >> op.Size;

            for (int index = 0; index < elems; index++)
            {
                Operand de = EmitVectorExtract32(context, op.Qd, op.Id + index, op.Size, signed);
                Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed);
                Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, signed);

                res = EmitVectorInsert(context, res, emit(de, ne, me), op.Id + index, op.Size);
            }

            context.Copy(GetVecA32(op.Qd), res);
        }

        public static void EmitVectorUnaryOpSx32(ArmEmitterContext context, Func1I emit)
        {
            EmitVectorUnaryOpI32(context, emit, true);
        }

        public static void EmitVectorBinaryOpSx32(ArmEmitterContext context, Func2I emit)
        {
            EmitVectorBinaryOpI32(context, emit, true);
        }

        public static void EmitVectorTernaryOpSx32(ArmEmitterContext context, Func3I emit)
        {
            EmitVectorTernaryOpI32(context, emit, true);
        }

        public static void EmitVectorUnaryOpZx32(ArmEmitterContext context, Func1I emit)
        {
            EmitVectorUnaryOpI32(context, emit, false);
        }

        public static void EmitVectorBinaryOpZx32(ArmEmitterContext context, Func2I emit)
        {
            EmitVectorBinaryOpI32(context, emit, false);
        }

        public static void EmitVectorTernaryOpZx32(ArmEmitterContext context, Func3I emit)
        {
            EmitVectorTernaryOpI32(context, emit, false);
        }

        // Vector by scalar

        public static void EmitVectorByScalarOpF32(ArmEmitterContext context, Func2I emit)
        {
            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;

            int sizeF = op.Size & 1;

            OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;

            int elems = op.GetBytesCount() >> sizeF + 2;

            Operand m = ExtractScalar(context, type, op.Vm);

            Operand res = GetVecA32(op.Qd);

            for (int index = 0; index < elems; index++)
            {
                Operand ne = context.VectorExtract(type, GetVecA32(op.Qn), op.Fn + index);

                res = context.VectorInsert(res, emit(ne, m), op.Fd + index);
            }

            context.Copy(GetVecA32(op.Qd), res);
        }

        public static void EmitVectorByScalarOpI32(ArmEmitterContext context, Func2I emit, bool signed)
        {
            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;

            Operand m = ExtractElement(context, op.Vm, op.Size, signed);

            Operand res = GetVecA32(op.Qd);

            int elems = op.GetBytesCount() >> op.Size;

            for (int index = 0; index < elems; index++)
            {
                Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed);

                res = EmitVectorInsert(context, res, emit(ne, m), op.Id + index, op.Size);
            }

            context.Copy(GetVecA32(op.Qd), res);
        }

        public static void EmitVectorByScalarLongOpI32(ArmEmitterContext context, Func2I emit, bool signed)
        {
            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;

            Operand m = ExtractElement(context, op.Vm, op.Size, signed);

            if (op.Size == 2)
            {
                m = signed ? context.SignExtend32(OperandType.I64, m) : context.ZeroExtend32(OperandType.I64, m);
            }

            Operand res = context.VectorZero();

            int elems = op.GetBytesCount() >> op.Size;

            for (int index = 0; index < elems; index++)
            {
                Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed);

                if (op.Size == 2)
                {
                    ne = signed ? context.SignExtend32(OperandType.I64, ne) : context.ZeroExtend32(OperandType.I64, ne);
                }

                res = EmitVectorInsert(context, res, emit(ne, m), index, op.Size + 1);
            }

            context.Copy(GetVecA32(op.Qd), res);
        }

        public static void EmitVectorsByScalarOpF32(ArmEmitterContext context, Func3I emit)
        {
            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;

            int sizeF = op.Size & 1;

            OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;

            int elems = op.GetBytesCount() >> sizeF + 2;

            Operand m = ExtractScalar(context, type, op.Vm);

            Operand res = GetVecA32(op.Qd);

            for (int index = 0; index < elems; index++)
            {
                Operand de = context.VectorExtract(type, GetVecA32(op.Qd), op.Fd + index);
                Operand ne = context.VectorExtract(type, GetVecA32(op.Qn), op.Fn + index);

                res = context.VectorInsert(res, emit(de, ne, m), op.Fd + index);
            }

            context.Copy(GetVecA32(op.Qd), res);
        }

        public static void EmitVectorsByScalarOpI32(ArmEmitterContext context, Func3I emit, bool signed)
        {
            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;

            Operand m = EmitVectorExtract32(context, op.Vm >> (4 - op.Size), op.Vm & ((1 << (4 - op.Size)) - 1), op.Size, signed);

            Operand res = GetVecA32(op.Qd);

            int elems = op.GetBytesCount() >> op.Size;

            for (int index = 0; index < elems; index++)
            {
                Operand de = EmitVectorExtract32(context, op.Qd, op.Id + index, op.Size, signed);
                Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed);

                res = EmitVectorInsert(context, res, emit(de, ne, m), op.Id + index, op.Size);
            }

            context.Copy(GetVecA32(op.Qd), res);
        }

        // Pairwise

        public static void EmitVectorPairwiseOpF32(ArmEmitterContext context, Func2I emit)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            int sizeF = op.Size & 1;

            OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;

            int elems = op.GetBytesCount() >> (sizeF + 2);
            int pairs = elems >> 1;

            Operand res = GetVecA32(op.Qd);
            Operand mvec = GetVecA32(op.Qm);
            Operand nvec = GetVecA32(op.Qn);

            for (int index = 0; index < pairs; index++)
            {
                int pairIndex = index << 1;

                Operand n1 = context.VectorExtract(type, nvec, op.Fn + pairIndex);
                Operand n2 = context.VectorExtract(type, nvec, op.Fn + pairIndex + 1);

                res = context.VectorInsert(res, emit(n1, n2), op.Fd + index);

                Operand m1 = context.VectorExtract(type, mvec, op.Fm + pairIndex);
                Operand m2 = context.VectorExtract(type, mvec, op.Fm + pairIndex + 1);

                res = context.VectorInsert(res, emit(m1, m2), op.Fd + index + pairs);
            }

            context.Copy(GetVecA32(op.Qd), res);
        }

        public static void EmitVectorPairwiseOpI32(ArmEmitterContext context, Func2I emit, bool signed)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            int elems = op.GetBytesCount() >> op.Size;
            int pairs = elems >> 1;

            Operand res = GetVecA32(op.Qd);

            for (int index = 0; index < pairs; index++)
            {
                int pairIndex = index << 1;
                Operand n1 = EmitVectorExtract32(context, op.Qn, op.In + pairIndex, op.Size, signed);
                Operand n2 = EmitVectorExtract32(context, op.Qn, op.In + pairIndex + 1, op.Size, signed);

                Operand m1 = EmitVectorExtract32(context, op.Qm, op.Im + pairIndex, op.Size, signed);
                Operand m2 = EmitVectorExtract32(context, op.Qm, op.Im + pairIndex + 1, op.Size, signed);

                res = EmitVectorInsert(context, res, emit(n1, n2), op.Id + index, op.Size);
                res = EmitVectorInsert(context, res, emit(m1, m2), op.Id + index + pairs, op.Size);
            }

            context.Copy(GetVecA32(op.Qd), res);
        }

        // Narrow

        public static void EmitVectorUnaryNarrowOp32(ArmEmitterContext context, Func1I emit, bool signed = false)
        {
            OpCode32Simd op = (OpCode32Simd)context.CurrOp;

            int elems = 8 >> op.Size; // Size contains the target element size. (for when it becomes a doubleword)

            Operand res = GetVecA32(op.Qd);
            int id = (op.Vd & 1) << (3 - op.Size); // Target doubleword base.

            for (int index = 0; index < elems; index++)
            {
                Operand m = EmitVectorExtract32(context, op.Qm, index, op.Size + 1, signed);

                res = EmitVectorInsert(context, res, emit(m), id + index, op.Size);
            }

            context.Copy(GetVecA32(op.Qd), res);
        }

        // Intrinsic Helpers

        public static Operand EmitMoveDoubleWordToSide(ArmEmitterContext context, Operand input, int originalV, int targetV)
        {
            Debug.Assert(input.Type == OperandType.V128);

            int originalSide = originalV & 1;
            int targetSide = targetV & 1;

            if (originalSide == targetSide)
            {
                return input;
            }

            if (targetSide == 1)
            {
                return context.AddIntrinsic(Intrinsic.X86Movlhps, input, input); // Low to high.
            }
            else
            {
                return context.AddIntrinsic(Intrinsic.X86Movhlps, input, input); // High to low.
            }
        }

        public static Operand EmitDoubleWordInsert(ArmEmitterContext context, Operand target, Operand value, int targetV)
        {
            Debug.Assert(target.Type == OperandType.V128 && value.Type == OperandType.V128);

            int targetSide = targetV & 1;
            int shuffleMask = 2;

            if (targetSide == 1)
            {
                return context.AddIntrinsic(Intrinsic.X86Shufpd, target, value, Const(shuffleMask));
            }
            else
            {
                return context.AddIntrinsic(Intrinsic.X86Shufpd, value, target, Const(shuffleMask));
            }
        }

        public static Operand EmitScalarInsert(ArmEmitterContext context, Operand target, Operand value, int reg, bool doubleWidth)
        {
            Debug.Assert(target.Type == OperandType.V128 && value.Type == OperandType.V128);

            // Insert from index 0 in value to index in target.
            int index = reg & (doubleWidth ? 1 : 3);

            if (doubleWidth)
            {
                if (index == 1)
                {
                    return context.AddIntrinsic(Intrinsic.X86Movlhps, target, value); // Low to high.
                }
                else
                {
                    return context.AddIntrinsic(Intrinsic.X86Shufpd, value, target, Const(2)); // Low to low, keep high from original.
                }
            }
            else
            {
                if (Optimizations.UseSse41)
                {
                    return context.AddIntrinsic(Intrinsic.X86Insertps, target, value, Const(index << 4));
                }
                else
                {
                    target = EmitSwapScalar(context, target, index, doubleWidth); // Swap value to replace into element 0.
                    target = context.AddIntrinsic(Intrinsic.X86Movss, target, value); // Move the value into element 0 of the vector.
                    return EmitSwapScalar(context, target, index, doubleWidth); // Swap new value back to the correct index.
                }
            }
        }

        public static Operand EmitSwapScalar(ArmEmitterContext context, Operand target, int reg, bool doubleWidth)
        {
            // Index into 0, 0 into index. This swap happens at the start of an A32 scalar op if required.
            int index = reg & (doubleWidth ? 1 : 3);
            if (index == 0) return target;

            if (doubleWidth)
            {
                int shuffleMask = 1; // Swap top and bottom. (b0 = 1, b1 = 0)
                return context.AddIntrinsic(Intrinsic.X86Shufpd, target, target, Const(shuffleMask));
            }
            else
            {
                int shuffleMask = (3 << 6) | (2 << 4) | (1 << 2) | index; // Swap index and 0. (others remain)
                shuffleMask &= ~(3 << (index * 2));

                return context.AddIntrinsic(Intrinsic.X86Shufps, target, target, Const(shuffleMask));
            }
        }

        // Vector Operand Templates

        public static void EmitVectorUnaryOpSimd32(ArmEmitterContext context, Func1I vectorFunc)
        {
            OpCode32Simd op = (OpCode32Simd)context.CurrOp;

            Operand m = GetVecA32(op.Qm);
            Operand d = GetVecA32(op.Qd);

            if (!op.Q) // Register swap: move relevant doubleword to destination side.
            {
                m = EmitMoveDoubleWordToSide(context, m, op.Vm, op.Vd);
            }

            Operand res = vectorFunc(m);

            if (!op.Q) // Register insert.
            {
                res = EmitDoubleWordInsert(context, d, res, op.Vd);
            }

            context.Copy(d, res);
        }

        public static void EmitVectorUnaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
        {
            OpCode32Simd op = (OpCode32Simd)context.CurrOp;

            Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;

            EmitVectorUnaryOpSimd32(context, (m) => context.AddIntrinsic(inst, m));
        }

        public static void EmitVectorBinaryOpSimd32(ArmEmitterContext context, Func2I vectorFunc, int side = -1)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            Operand n = GetVecA32(op.Qn);
            Operand m = GetVecA32(op.Qm);
            Operand d = GetVecA32(op.Qd);

            if (side == -1)
            {
                side = op.Vd;
            }

            if (!op.Q) // Register swap: move relevant doubleword to destination side.
            {
                n = EmitMoveDoubleWordToSide(context, n, op.Vn, side);
                m = EmitMoveDoubleWordToSide(context, m, op.Vm, side);
            }

            Operand res = vectorFunc(n, m);

            if (!op.Q) // Register insert.
            {
                if (side != op.Vd)
                {
                    res = EmitMoveDoubleWordToSide(context, res, side, op.Vd);
                }
                res = EmitDoubleWordInsert(context, d, res, op.Vd);
            }

            context.Copy(d, res);
        }

        public static void EmitVectorBinaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;
            EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(inst, n, m));
        }

        public static void EmitVectorTernaryOpSimd32(ArmEmitterContext context, Func3I vectorFunc)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            Operand n = GetVecA32(op.Qn);
            Operand m = GetVecA32(op.Qm);
            Operand d = GetVecA32(op.Qd);
            Operand initialD = d;

            if (!op.Q) // Register swap: move relevant doubleword to destination side.
            {
                n = EmitMoveDoubleWordToSide(context, n, op.Vn, op.Vd);
                m = EmitMoveDoubleWordToSide(context, m, op.Vm, op.Vd);
            }

            Operand res = vectorFunc(d, n, m);

            if (!op.Q) // Register insert.
            {
                res = EmitDoubleWordInsert(context, initialD, res, op.Vd);
            }

            context.Copy(initialD, res);
        }

        public static void EmitVectorTernaryOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            Intrinsic inst1 = (op.Size & 1) != 0 ? inst64pt1 : inst32pt1;
            Intrinsic inst2 = (op.Size & 1) != 0 ? inst64pt2 : inst32pt2;

            EmitVectorTernaryOpSimd32(context, (d, n, m) =>
            {
                Operand res = context.AddIntrinsic(inst1, n, m);
                return res = context.AddIntrinsic(inst2, d, res);
            });
        }

        public static void EmitScalarUnaryOpSimd32(ArmEmitterContext context, Func1I scalarFunc)
        {
            OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;

            bool doubleSize = (op.Size & 1) != 0;
            int shift = doubleSize ? 1 : 2;
            Operand m = GetVecA32(op.Vm >> shift);
            Operand d = GetVecA32(op.Vd >> shift);

            m = EmitSwapScalar(context, m, op.Vm, doubleSize);

            Operand res = scalarFunc(m);

            // Insert scalar into vector.
            res = EmitScalarInsert(context, d, res, op.Vd, doubleSize);

            context.Copy(d, res);
        }

        public static void EmitScalarUnaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
        {
            OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;

            Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;

            EmitScalarUnaryOpSimd32(context, (m) => (inst == 0) ? m : context.AddIntrinsic(inst, m));
        }

        public static void EmitScalarBinaryOpSimd32(ArmEmitterContext context, Func2I scalarFunc)
        {
            OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;

            bool doubleSize = (op.Size & 1) != 0;
            int shift = doubleSize ? 1 : 2;
            Operand n = GetVecA32(op.Vn >> shift);
            Operand m = GetVecA32(op.Vm >> shift);
            Operand d = GetVecA32(op.Vd >> shift);

            n = EmitSwapScalar(context, n, op.Vn, doubleSize);
            m = EmitSwapScalar(context, m, op.Vm, doubleSize);

            Operand res = scalarFunc(n, m);

            // Insert scalar into vector.
            res = EmitScalarInsert(context, d, res, op.Vd, doubleSize);

            context.Copy(d, res);
        }

        public static void EmitScalarBinaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
        {
            OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;

            Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;

            EmitScalarBinaryOpSimd32(context, (n, m) =>  context.AddIntrinsic(inst, n, m));
        }

        public static void EmitScalarTernaryOpSimd32(ArmEmitterContext context, Func3I scalarFunc)
        {
            OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;

            bool doubleSize = (op.Size & 1) != 0;
            int shift = doubleSize ? 1 : 2;
            Operand n = GetVecA32(op.Vn >> shift);
            Operand m = GetVecA32(op.Vm >> shift);
            Operand d = GetVecA32(op.Vd >> shift);
            Operand initialD = d;

            n = EmitSwapScalar(context, n, op.Vn, doubleSize);
            m = EmitSwapScalar(context, m, op.Vm, doubleSize);
            d = EmitSwapScalar(context, d, op.Vd, doubleSize);

            Operand res = scalarFunc(d, n, m);

            // Insert scalar into vector.
            res = EmitScalarInsert(context, initialD, res, op.Vd, doubleSize);

            context.Copy(initialD, res);
        }

        public static void EmitScalarTernaryOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2)
        {
            OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;

            bool doubleSize = (op.Size & 1) != 0;
            int shift = doubleSize ? 1 : 2;
            Intrinsic inst1 = doubleSize ? inst64pt1 : inst32pt1;
            Intrinsic inst2 = doubleSize ? inst64pt2 : inst32pt2;

            EmitScalarTernaryOpSimd32(context, (d, n, m) =>
            {
                Operand res = context.AddIntrinsic(inst1, n, m);
                return context.AddIntrinsic(inst2, d, res);
            });
        }

        // By Scalar

        public static void EmitVectorByScalarOpSimd32(ArmEmitterContext context, Func2I vectorFunc)
        {
            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;

            Operand n = GetVecA32(op.Qn);
            Operand d = GetVecA32(op.Qd);

            int index = op.Vm & 3;
            int dupeMask = (index << 6) | (index << 4) | (index << 2) | index;
            Operand m = GetVecA32(op.Vm >> 2);
            m = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(dupeMask));

            if (!op.Q) // Register swap: move relevant doubleword to destination side.
            {
                n = EmitMoveDoubleWordToSide(context, n, op.Vn, op.Vd);
            }

            Operand res = vectorFunc(n, m);

            if (!op.Q) // Register insert.
            {
                res = EmitDoubleWordInsert(context, d, res, op.Vd);
            }

            context.Copy(d, res);
        }

        public static void EmitVectorByScalarOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
        {
            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;

            Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;
            EmitVectorByScalarOpSimd32(context, (n, m) => context.AddIntrinsic(inst, n, m));
        }

        public static void EmitVectorsByScalarOpSimd32(ArmEmitterContext context, Func3I vectorFunc)
        {
            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;

            Operand n = GetVecA32(op.Qn);
            Operand d = GetVecA32(op.Qd);
            Operand initialD = d;

            int index = op.Vm & 3;
            int dupeMask = (index << 6) | (index << 4) | (index << 2) | index;
            Operand m = GetVecA32(op.Vm >> 2);
            m = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(dupeMask));

            if (!op.Q) // Register swap: move relevant doubleword to destination side.
            {
                n = EmitMoveDoubleWordToSide(context, n, op.Vn, op.Vd);
            }

            Operand res = vectorFunc(d, n, m);

            if (!op.Q) // Register insert.
            {
                res = EmitDoubleWordInsert(context, initialD, res, op.Vd);
            }

            context.Copy(initialD, res);
        }

        public static void EmitVectorsByScalarOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2)
        {
            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;

            Intrinsic inst1 = (op.Size & 1) != 0 ? inst64pt1 : inst32pt1;
            Intrinsic inst2 = (op.Size & 1) != 0 ? inst64pt2 : inst32pt2;

            EmitVectorsByScalarOpSimd32(context, (d, n, m) =>
            {
                Operand res = context.AddIntrinsic(inst1, n, m);
                return res = context.AddIntrinsic(inst2, d, res);
            });
        }

        // Pairwise

        public static void EmitSse2VectorPairwiseOpF32(ArmEmitterContext context, Intrinsic inst32)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            EmitVectorBinaryOpSimd32(context, (n, m) =>
            {
                Operand unpck = context.AddIntrinsic(Intrinsic.X86Unpcklps, n, m);

                Operand part0 = unpck;
                Operand part1 = context.AddIntrinsic(Intrinsic.X86Movhlps, unpck, unpck);

                return context.AddIntrinsic(inst32, part0, part1);
            }, 0);
        }

        public static void EmitSsse3VectorPairwiseOp32(ArmEmitterContext context, Intrinsic[] inst)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            EmitVectorBinaryOpSimd32(context, (n, m) =>
            {
                if (op.RegisterSize == RegisterSize.Simd64)
                {
                    Operand zeroEvenMask = X86GetElements(context, ZeroMask, EvenMasks[op.Size]);
                    Operand zeroOddMask = X86GetElements(context, ZeroMask, OddMasks[op.Size]);

                    Operand mN = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m); // m:n

                    Operand left = context.AddIntrinsic(Intrinsic.X86Pshufb, mN, zeroEvenMask); // 0:even from m:n
                    Operand right = context.AddIntrinsic(Intrinsic.X86Pshufb, mN, zeroOddMask); // 0:odd  from m:n

                    return context.AddIntrinsic(inst[op.Size], left, right);
                }
                else if (op.Size < 3)
                {
                    Operand oddEvenMask = X86GetElements(context, OddMasks[op.Size], EvenMasks[op.Size]);

                    Operand oddEvenN = context.AddIntrinsic(Intrinsic.X86Pshufb, n, oddEvenMask); // odd:even from n
                    Operand oddEvenM = context.AddIntrinsic(Intrinsic.X86Pshufb, m, oddEvenMask); // odd:even from m

                    Operand left = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, oddEvenN, oddEvenM);
                    Operand right = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, oddEvenN, oddEvenM);

                    return context.AddIntrinsic(inst[op.Size], left, right);
                }
                else
                {
                    Operand left = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m);
                    Operand right = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, n, m);

                    return context.AddIntrinsic(inst[3], left, right);
                }
            }, 0);
        }

        // Generic Functions

        public static Operand EmitSoftFloatCallDefaultFpscr(ArmEmitterContext context, string name, params Operand[] callArgs)
        {
            IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;

            MethodInfo info = (op.Size & 1) == 0
                ? typeof(SoftFloat32).GetMethod(name)
                : typeof(SoftFloat64).GetMethod(name);

            Array.Resize(ref callArgs, callArgs.Length + 1);
            callArgs[callArgs.Length - 1] = Const(1);

            return context.Call(info, callArgs);
        }

        public static Operand EmitVectorExtractSx32(ArmEmitterContext context, int reg, int index, int size)
        {
            return EmitVectorExtract32(context, reg, index, size, true);
        }

        public static Operand EmitVectorExtractZx32(ArmEmitterContext context, int reg, int index, int size)
        {
            return EmitVectorExtract32(context, reg, index, size, false);
        }

        public static Operand EmitVectorExtract32(ArmEmitterContext context, int reg, int index, int size, bool signed)
        {
            ThrowIfInvalid(index, size);

            Operand res = null;

            switch (size)
            {
                case 0:
                    res = context.VectorExtract8(GetVec(reg), index);
                    break;

                case 1:
                    res = context.VectorExtract16(GetVec(reg), index);
                    break;

                case 2:
                    res = context.VectorExtract(OperandType.I32, GetVec(reg), index);
                    break;

                case 3:
                    res = context.VectorExtract(OperandType.I64, GetVec(reg), index);
                    break;
            }

            if (signed)
            {
                switch (size)
                {
                    case 0: res = context.SignExtend8(OperandType.I32, res); break;
                    case 1: res = context.SignExtend16(OperandType.I32, res); break;
                }
            }
            else
            {
                switch (size)
                {
                    case 0: res = context.ZeroExtend8(OperandType.I32, res); break;
                    case 1: res = context.ZeroExtend16(OperandType.I32, res); break;
                }
            }

            return res;
        }
    }
}
-												Add most of the A32 instruction set to ARMeilleure (#897)

* Implement TEQ and MOV (Imm16)

* Initial work on A32 instructions + SVC. No tests yet, hangs in rtld.

* Implement CLZ, fix BFI and BFC

Now stops on SIMD initialization.

* Exclusive access instructions, fix to mul, system instructions.

Now gets to a break after SignalProcessWideKey64.

* Better impl of UBFX, add UDIV and SDIV

Now boots way further - now stuck on VMOV instruction.

* Many more instructions, start on SIMD and testing framework.

* Fix build issues

* svc: Rework 32 bit codepath

Fixing once and for all argument ordering issues.

* Fix 32 bits stacktrace

* hle debug: Add 32 bits dynamic section parsing

* Fix highCq mode, add many tests, fix some instruction bugs

Still suffers from critical malloc failure :weary:

* Fix incorrect opcode decoders and a few more instructions.

* Add a few instructions and fix others. re-disable highCq for now.

Disabled the svc memory clear since i'm not sure about it.

* Fix build

* Fix typo in ordered/exclusive stores.

* Implement some more instructions, fix others.

Uxtab16/Sxtab16 are untested.

* Begin impl of pairwise, some other instructions.

* Add a few more instructions, a quick hack to fix svcs for now.

* Add tests and fix issues with VTRN, VZIP, VUZP

* Add a few more instructions, fix Vmul_1 encoding.

* Fix way too many instruction bugs, add tests for some of the more important ones.

* Fix HighCq, enable FastFP paths for some floating point instructions

(not entirely sure why these were disabled, so important to note this
commit exists)

Branching has been removed in A32 shifts until I figure out if it's
worth it

* Cleanup Part 1

There should be no functional change between these next few commits.
Should is the key word. (except for removing break handler)

* Implement 32 bits syscalls

Co-authored-by: riperiperi <rhy3756547@hotmail.com>

Implement all 32 bits counterparts of the 64 bits syscalls we currently
have.

* Refactor part 2: Move index/subindex logic to Operand

May have inadvertently fixed one (1) bug

* Add FlushProcessDataCache32

* Address jd's comments

* Remove 16 bit encodings from OpCodeTable

Still need to catch some edge cases (operands that use the "F" flag) and
make Q encodings with non-even indexes undefined.

* Correct Fpscr handling for FP vector slow paths

WIP

* Add StandardFPSCRValue behaviour for all Arithmetic instructions

* Add StandardFPSCRValue behaviour to compare instructions.

* Force passing of fpcr to FPProcessException and FPUnpack.

Reduces potential for code error significantly

* OpCode cleanup

* Remove urgency from DMB comment in MRRC

DMB is currently a no-op via the instruction, so it should likely still
be a no-op here.

* Test Cleanup

* Fix FPDefaultNaN on Ryzen CPUs

* Improve some tests, fix some shift instructions, add slow path for Vadd

* Fix Typo

* More test cleanup

* Flip order of Fx and index, to indicate that the operand's is the "base"

* Remove Simd32 register type, use Int32 and Int64 for scalars like A64 does.

* Reintroduce alignment to DecoderHelper (removed by accident)

* One more realign as reading diffs is hard

* Use I32 registers in A32 (part 2)

Swap default integer register type based on current execution mode.

* FPSCR flags as Registers (part 1)

Still need to change NativeContext and ExecutionContext to allow
getting/setting with the flag values.

* Use I32 registers in A32 (part 1)

* FPSCR flags as registers (part 2)

Only CMP flags are on the registers right now. It could be useful to use
more of the space in non-fast-float when implementing A32 flags
accurately in the fast path.

* Address Feedback

* Correct FP->Int behaviour (should saturate)

* Make branches made by writing to PC eligible for Rejit

Greatly improves performance in most games.

* Remove unused branching for Vtbl

* RejitRequest as a class rather than a tuple

Makes a lot more sense than storing tuples on a dictionary.

* Add VMOVN, VSHR (imm), VSHRN (imm) and related tests

* Re-order InstEmitSystem32

Alphabetical sorting.

* Address Feedback

Feedback from Ac_K, remove and sort usings.

* Address Feedback 2

* Address Feedback from LDj3SNuD

Opcode table reordered to have alphabetical sorting within groups,
Vmaxnm and Vminnm have split names to be less ambiguous, SoftFloat nits,
Test nits and Test simplification with ValueSource.

* Add Debug Asserts to A32 helpers

Mainly to prevent the shift ones from being used on I64 operands, as
they expect I32 input for most operations (eg. carry flag setting), and
expect I32 input for shift and boolean amounts. Most other helper
functions don't take Operands, throw on out of range values, and take
specific types of OpCode, so didn't need any asserts.

* Use ConstF rather than creating an operand.

(useful for pooling in future)

* Move exclusive load to helper, reference call flag rather than literal 1.

* Address LDj feedback (minus table flatten)

one final look before it's all gone. the world is so beautiful.

* Flatten OpCodeTable

oh no

* Address more table ordering

* Call Flag as int on A32

Co-authored-by: Natalie C. <cyuubiapps@gmail.com>
Co-authored-by: Thog <thog@protonmail.com>

											
										
										
											2020-02-23 21:20:40 +00:00
+								using ARMeilleure.Decoders;
 								using ARMeilleure.IntermediateRepresentation;
 								using ARMeilleure.Translation;
 								using System;
 								using System.Diagnostics;
-												Add Profiled Persistent Translation Cache. (#769)

* Delete DelegateTypes.cs

* Delete DelegateCache.cs

* Add files via upload

* Update Horizon.cs

* Update Program.cs

* Update MainWindow.cs

* Update Aot.cs

* Update RelocEntry.cs

* Update Translator.cs

* Update MemoryManager.cs

* Update InstEmitMemoryHelper.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nit.

* 10 fewer MSIL bytes for us

* Add comment. Nits.

* Update Translator.cs

* Update Aot.cs

* Nits.

* Opt..

* Opt..

* Opt..

* Opt..

* Allow to change compression level.

* Update MemoryManager.cs

* Update Translator.cs

* Manage corner cases during the save phase. Nits.

* Update Aot.cs

* Translator response tweak for Aot disabled. Nit.

* Nit.

* Nits.

* Create DelegateHelpers.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nits.

* Fix due to #784.

* Fixes due to #757 & #841.

* Fix due to #846.

* Fix due to #847.

* Use MethodInfo for managed method calls.

Use IR methods instead of managed methods about Max/Min (S/U).
Follow-ups & Nits.

* Add missing exception messages.

Reintroduce slow path for Fmov_Vi.
Implement slow path for Fmov_Si.

* Switch to the new folder structure.

Nits.

* Impl. index-based relocation information. Impl. cache file version field.

* Nit.

* Address gdkchan comments.

Mainly:
- fixed cache file corruption issue on exit; - exposed a way to disable AOT on the GUI.

* Address AcK77 comment.

* Address Thealexbarney, jduncanator & emmauss comments.

Header magic, CpuId (FI) & Aot -> Ptc.

* Adaptation to the new application reloading system.

Improvements to the call system of managed methods.
Follow-ups.
Nits.

* Get the same boot times as on master when PTC is disabled.

* Profiled Aot.

* A32 support (#897).

* #975 support (1 of 2).

* #975 support (2 of 2).

* Rebase fix & nits.

* Some fixes and nits (still one bug left).

* One fix & nits.

* Tests fix (by gdk) & nits.

* Support translations not only in high quality and rejit.

Nits.

* Added possibility to skip translations and continue execution, using `ESC` key.

* Update SettingsWindow.cs

* Update GLRenderer.cs

* Update Ptc.cs

* Disabled Profiled PTC by default as requested in the past by gdk.

* Fix rejit bug. Increased number of parallel translations. Add stack unwinding stuffs support (1 of 2).

Nits.

* Add stack unwinding stuffs support (2 of 2). Tuned number of parallel translations.

* Restored the ability to assemble jumps with 8-bit offset when Profiled PTC is disabled or during profiling.

Modifications due to rebase.
Nits.

* Limited profiling of the functions to be translated to the addresses belonging to the range of static objects only.

* Nits.

* Nits.

* Update Delegates.cs

* Nit.

* Update InstEmitSimdArithmetic.cs

* Address riperiperi comments.

* Fixed the issue of unjustifiably longer boot times at the second boot than at the first boot, measured at the same time or reference point and with the same number of translated functions.

* Implemented a simple redundant load/save mechanism.

Halved the value of Decoder.MaxInstsPerFunction more appropriate for the current performance of the Translator.
Replaced by Logger.PrintError to Logger.PrintDebug in TexturePool.cs about the supposed invalid texture format to avoid the spawn of the log.
Nits.

* Nit.

Improved Logger.PrintError in TexturePool.cs to avoid log spawn.
Added missing code for FZ handling (in output) for fp max/min instructions (slow paths).

* Add configuration migration for PTC

Co-authored-by: Thog <me@thog.eu>
											
										
										
											2020-06-16 18:28:02 +00:00
+								using System.Reflection;
-												Add most of the A32 instruction set to ARMeilleure (#897)

* Implement TEQ and MOV (Imm16)

* Initial work on A32 instructions + SVC. No tests yet, hangs in rtld.

* Implement CLZ, fix BFI and BFC

Now stops on SIMD initialization.

* Exclusive access instructions, fix to mul, system instructions.

Now gets to a break after SignalProcessWideKey64.

* Better impl of UBFX, add UDIV and SDIV

Now boots way further - now stuck on VMOV instruction.

* Many more instructions, start on SIMD and testing framework.

* Fix build issues

* svc: Rework 32 bit codepath

Fixing once and for all argument ordering issues.

* Fix 32 bits stacktrace

* hle debug: Add 32 bits dynamic section parsing

* Fix highCq mode, add many tests, fix some instruction bugs

Still suffers from critical malloc failure :weary:

* Fix incorrect opcode decoders and a few more instructions.

* Add a few instructions and fix others. re-disable highCq for now.

Disabled the svc memory clear since i'm not sure about it.

* Fix build

* Fix typo in ordered/exclusive stores.

* Implement some more instructions, fix others.

Uxtab16/Sxtab16 are untested.

* Begin impl of pairwise, some other instructions.

* Add a few more instructions, a quick hack to fix svcs for now.

* Add tests and fix issues with VTRN, VZIP, VUZP

* Add a few more instructions, fix Vmul_1 encoding.

* Fix way too many instruction bugs, add tests for some of the more important ones.

* Fix HighCq, enable FastFP paths for some floating point instructions

(not entirely sure why these were disabled, so important to note this
commit exists)

Branching has been removed in A32 shifts until I figure out if it's
worth it

* Cleanup Part 1

There should be no functional change between these next few commits.
Should is the key word. (except for removing break handler)

* Implement 32 bits syscalls

Co-authored-by: riperiperi <rhy3756547@hotmail.com>

Implement all 32 bits counterparts of the 64 bits syscalls we currently
have.

* Refactor part 2: Move index/subindex logic to Operand

May have inadvertently fixed one (1) bug

* Add FlushProcessDataCache32

* Address jd's comments

* Remove 16 bit encodings from OpCodeTable

Still need to catch some edge cases (operands that use the "F" flag) and
make Q encodings with non-even indexes undefined.

* Correct Fpscr handling for FP vector slow paths

WIP

* Add StandardFPSCRValue behaviour for all Arithmetic instructions

* Add StandardFPSCRValue behaviour to compare instructions.

* Force passing of fpcr to FPProcessException and FPUnpack.

Reduces potential for code error significantly

* OpCode cleanup

* Remove urgency from DMB comment in MRRC

DMB is currently a no-op via the instruction, so it should likely still
be a no-op here.

* Test Cleanup

* Fix FPDefaultNaN on Ryzen CPUs

* Improve some tests, fix some shift instructions, add slow path for Vadd

* Fix Typo

* More test cleanup

* Flip order of Fx and index, to indicate that the operand's is the "base"

* Remove Simd32 register type, use Int32 and Int64 for scalars like A64 does.

* Reintroduce alignment to DecoderHelper (removed by accident)

* One more realign as reading diffs is hard

* Use I32 registers in A32 (part 2)

Swap default integer register type based on current execution mode.

* FPSCR flags as Registers (part 1)

Still need to change NativeContext and ExecutionContext to allow
getting/setting with the flag values.

* Use I32 registers in A32 (part 1)

* FPSCR flags as registers (part 2)

Only CMP flags are on the registers right now. It could be useful to use
more of the space in non-fast-float when implementing A32 flags
accurately in the fast path.

* Address Feedback

* Correct FP->Int behaviour (should saturate)

* Make branches made by writing to PC eligible for Rejit

Greatly improves performance in most games.

* Remove unused branching for Vtbl

* RejitRequest as a class rather than a tuple

Makes a lot more sense than storing tuples on a dictionary.

* Add VMOVN, VSHR (imm), VSHRN (imm) and related tests

* Re-order InstEmitSystem32

Alphabetical sorting.

* Address Feedback

Feedback from Ac_K, remove and sort usings.

* Address Feedback 2

* Address Feedback from LDj3SNuD

Opcode table reordered to have alphabetical sorting within groups,
Vmaxnm and Vminnm have split names to be less ambiguous, SoftFloat nits,
Test nits and Test simplification with ValueSource.

* Add Debug Asserts to A32 helpers

Mainly to prevent the shift ones from being used on I64 operands, as
they expect I32 input for most operations (eg. carry flag setting), and
expect I32 input for shift and boolean amounts. Most other helper
functions don't take Operands, throw on out of range values, and take
specific types of OpCode, so didn't need any asserts.

* Use ConstF rather than creating an operand.

(useful for pooling in future)

* Move exclusive load to helper, reference call flag rather than literal 1.

* Address LDj feedback (minus table flatten)

one final look before it's all gone. the world is so beautiful.

* Flatten OpCodeTable

oh no

* Address more table ordering

* Call Flag as int on A32

Co-authored-by: Natalie C. <cyuubiapps@gmail.com>
Co-authored-by: Thog <thog@protonmail.com>

											
										
										
											2020-02-23 21:20:40 +00:00
+								using static ARMeilleure.Instructions.InstEmitHelper;
 								using static ARMeilleure.Instructions.InstEmitSimdHelper;
 								using static ARMeilleure.IntermediateRepresentation.OperandHelper;
 								namespace ARMeilleure.Instructions
 								{
 								    using Func1I = Func<Operand, Operand>;
 								    using Func2I = Func<Operand, Operand, Operand>;
 								    using Func3I = Func<Operand, Operand, Operand, Operand>;
 								    static class InstEmitSimdHelper32
 								    {
 								        public static (int, int) GetQuadwordAndSubindex(int index, RegisterSize size)
 								        {
 								            switch (size)
 								            {
 								                case RegisterSize.Simd128:
 								                    return (index >> 1, 0);
 								                case RegisterSize.Simd64:
 								                case RegisterSize.Int64:
 								                    return (index >> 1, index & 1);
 								                case RegisterSize.Int32:
 								                    return (index >> 2, index & 3);
 								            }
 								            throw new ArgumentException("Unrecognized Vector Register Size.");
 								        }
 								        public static Operand ExtractScalar(ArmEmitterContext context, OperandType type, int reg)
 								        {
 								            Debug.Assert(type != OperandType.V128);
 								            if (type == OperandType.FP64 || type == OperandType.I64)
 								            {
 								                // From dreg.
 								                return context.VectorExtract(type, GetVecA32(reg >> 1), reg & 1);
-												Add Profiled Persistent Translation Cache. (#769)

* Delete DelegateTypes.cs

* Delete DelegateCache.cs

* Add files via upload

* Update Horizon.cs

* Update Program.cs

* Update MainWindow.cs

* Update Aot.cs

* Update RelocEntry.cs

* Update Translator.cs

* Update MemoryManager.cs

* Update InstEmitMemoryHelper.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nit.

* 10 fewer MSIL bytes for us

* Add comment. Nits.

* Update Translator.cs

* Update Aot.cs

* Nits.

* Opt..

* Opt..

* Opt..

* Opt..

* Allow to change compression level.

* Update MemoryManager.cs

* Update Translator.cs

* Manage corner cases during the save phase. Nits.

* Update Aot.cs

* Translator response tweak for Aot disabled. Nit.

* Nit.

* Nits.

* Create DelegateHelpers.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nits.

* Fix due to #784.

* Fixes due to #757 & #841.

* Fix due to #846.

* Fix due to #847.

* Use MethodInfo for managed method calls.

Use IR methods instead of managed methods about Max/Min (S/U).
Follow-ups & Nits.

* Add missing exception messages.

Reintroduce slow path for Fmov_Vi.
Implement slow path for Fmov_Si.

* Switch to the new folder structure.

Nits.

* Impl. index-based relocation information. Impl. cache file version field.

* Nit.

* Address gdkchan comments.

Mainly:
- fixed cache file corruption issue on exit; - exposed a way to disable AOT on the GUI.

* Address AcK77 comment.

* Address Thealexbarney, jduncanator & emmauss comments.

Header magic, CpuId (FI) & Aot -> Ptc.

* Adaptation to the new application reloading system.

Improvements to the call system of managed methods.
Follow-ups.
Nits.

* Get the same boot times as on master when PTC is disabled.

* Profiled Aot.

* A32 support (#897).

* #975 support (1 of 2).

* #975 support (2 of 2).

* Rebase fix & nits.

* Some fixes and nits (still one bug left).

* One fix & nits.

* Tests fix (by gdk) & nits.

* Support translations not only in high quality and rejit.

Nits.

* Added possibility to skip translations and continue execution, using `ESC` key.

* Update SettingsWindow.cs

* Update GLRenderer.cs

* Update Ptc.cs

* Disabled Profiled PTC by default as requested in the past by gdk.

* Fix rejit bug. Increased number of parallel translations. Add stack unwinding stuffs support (1 of 2).

Nits.

* Add stack unwinding stuffs support (2 of 2). Tuned number of parallel translations.

* Restored the ability to assemble jumps with 8-bit offset when Profiled PTC is disabled or during profiling.

Modifications due to rebase.
Nits.

* Limited profiling of the functions to be translated to the addresses belonging to the range of static objects only.

* Nits.

* Nits.

* Update Delegates.cs

* Nit.

* Update InstEmitSimdArithmetic.cs

* Address riperiperi comments.

* Fixed the issue of unjustifiably longer boot times at the second boot than at the first boot, measured at the same time or reference point and with the same number of translated functions.

* Implemented a simple redundant load/save mechanism.

Halved the value of Decoder.MaxInstsPerFunction more appropriate for the current performance of the Translator.
Replaced by Logger.PrintError to Logger.PrintDebug in TexturePool.cs about the supposed invalid texture format to avoid the spawn of the log.
Nits.

* Nit.

Improved Logger.PrintError in TexturePool.cs to avoid log spawn.
Added missing code for FZ handling (in output) for fp max/min instructions (slow paths).

* Add configuration migration for PTC

Co-authored-by: Thog <me@thog.eu>
											
										
										
											2020-06-16 18:28:02 +00:00
+								            }
-												Add most of the A32 instruction set to ARMeilleure (#897)

* Implement TEQ and MOV (Imm16)

* Initial work on A32 instructions + SVC. No tests yet, hangs in rtld.

* Implement CLZ, fix BFI and BFC

Now stops on SIMD initialization.

* Exclusive access instructions, fix to mul, system instructions.

Now gets to a break after SignalProcessWideKey64.

* Better impl of UBFX, add UDIV and SDIV

Now boots way further - now stuck on VMOV instruction.

* Many more instructions, start on SIMD and testing framework.

* Fix build issues

* svc: Rework 32 bit codepath

Fixing once and for all argument ordering issues.

* Fix 32 bits stacktrace

* hle debug: Add 32 bits dynamic section parsing

* Fix highCq mode, add many tests, fix some instruction bugs

Still suffers from critical malloc failure :weary:

* Fix incorrect opcode decoders and a few more instructions.

* Add a few instructions and fix others. re-disable highCq for now.

Disabled the svc memory clear since i'm not sure about it.

* Fix build

* Fix typo in ordered/exclusive stores.

* Implement some more instructions, fix others.

Uxtab16/Sxtab16 are untested.

* Begin impl of pairwise, some other instructions.

* Add a few more instructions, a quick hack to fix svcs for now.

* Add tests and fix issues with VTRN, VZIP, VUZP

* Add a few more instructions, fix Vmul_1 encoding.

* Fix way too many instruction bugs, add tests for some of the more important ones.

* Fix HighCq, enable FastFP paths for some floating point instructions

(not entirely sure why these were disabled, so important to note this
commit exists)

Branching has been removed in A32 shifts until I figure out if it's
worth it

* Cleanup Part 1

There should be no functional change between these next few commits.
Should is the key word. (except for removing break handler)

* Implement 32 bits syscalls

Co-authored-by: riperiperi <rhy3756547@hotmail.com>

Implement all 32 bits counterparts of the 64 bits syscalls we currently
have.

* Refactor part 2: Move index/subindex logic to Operand

May have inadvertently fixed one (1) bug

* Add FlushProcessDataCache32

* Address jd's comments

* Remove 16 bit encodings from OpCodeTable

Still need to catch some edge cases (operands that use the "F" flag) and
make Q encodings with non-even indexes undefined.

* Correct Fpscr handling for FP vector slow paths

WIP

* Add StandardFPSCRValue behaviour for all Arithmetic instructions

* Add StandardFPSCRValue behaviour to compare instructions.

* Force passing of fpcr to FPProcessException and FPUnpack.

Reduces potential for code error significantly

* OpCode cleanup

* Remove urgency from DMB comment in MRRC

DMB is currently a no-op via the instruction, so it should likely still
be a no-op here.

* Test Cleanup

* Fix FPDefaultNaN on Ryzen CPUs

* Improve some tests, fix some shift instructions, add slow path for Vadd

* Fix Typo

* More test cleanup

* Flip order of Fx and index, to indicate that the operand's is the "base"

* Remove Simd32 register type, use Int32 and Int64 for scalars like A64 does.

* Reintroduce alignment to DecoderHelper (removed by accident)

* One more realign as reading diffs is hard

* Use I32 registers in A32 (part 2)

Swap default integer register type based on current execution mode.

* FPSCR flags as Registers (part 1)

Still need to change NativeContext and ExecutionContext to allow
getting/setting with the flag values.

* Use I32 registers in A32 (part 1)

* FPSCR flags as registers (part 2)

Only CMP flags are on the registers right now. It could be useful to use
more of the space in non-fast-float when implementing A32 flags
accurately in the fast path.

* Address Feedback

* Correct FP->Int behaviour (should saturate)

* Make branches made by writing to PC eligible for Rejit

Greatly improves performance in most games.

* Remove unused branching for Vtbl

* RejitRequest as a class rather than a tuple

Makes a lot more sense than storing tuples on a dictionary.

* Add VMOVN, VSHR (imm), VSHRN (imm) and related tests

* Re-order InstEmitSystem32

Alphabetical sorting.

* Address Feedback

Feedback from Ac_K, remove and sort usings.

* Address Feedback 2

* Address Feedback from LDj3SNuD

Opcode table reordered to have alphabetical sorting within groups,
Vmaxnm and Vminnm have split names to be less ambiguous, SoftFloat nits,
Test nits and Test simplification with ValueSource.

* Add Debug Asserts to A32 helpers

Mainly to prevent the shift ones from being used on I64 operands, as
they expect I32 input for most operations (eg. carry flag setting), and
expect I32 input for shift and boolean amounts. Most other helper
functions don't take Operands, throw on out of range values, and take
specific types of OpCode, so didn't need any asserts.

* Use ConstF rather than creating an operand.

(useful for pooling in future)

* Move exclusive load to helper, reference call flag rather than literal 1.

* Address LDj feedback (minus table flatten)

one final look before it's all gone. the world is so beautiful.

* Flatten OpCodeTable

oh no

* Address more table ordering

* Call Flag as int on A32

Co-authored-by: Natalie C. <cyuubiapps@gmail.com>
Co-authored-by: Thog <thog@protonmail.com>

											
										
										
											2020-02-23 21:20:40 +00:00
+								            else
 								            {
 								                // From sreg.
 								                return context.VectorExtract(type, GetVecA32(reg >> 2), reg & 3);
 								            }
 								        }
 								        public static void InsertScalar(ArmEmitterContext context, int reg, Operand value)
 								        {
 								            Debug.Assert(value.Type != OperandType.V128);
 								            Operand vec, insert;
 								            if (value.Type == OperandType.FP64 || value.Type == OperandType.I64)
 								            {
 								                // From dreg.
 								                vec = GetVecA32(reg >> 1);
 								                insert = context.VectorInsert(vec, value, reg & 1);
 								            }
 								            else
 								            {
 								                // From sreg.
 								                vec = GetVecA32(reg >> 2);
 								                insert = context.VectorInsert(vec, value, reg & 3);
 								            }
 								            context.Copy(vec, insert);
 								        }
-												Implement VMULL, VMLSL, VRSHR, VQRSHRN, VQRSHRUN AArch32 instructions + other fixes (#977)

* Implement VMULL, VMLSL, VQRSHRN, VQRSHRUN AArch32 instructions plus other fixes

* Re-align opcode table

* Re-enable undefined, use subclasses to fix checks

* Add test and fix VRSHR instruction

* PR feedback
											
										
										
											2020-03-11 00:49:27 +00:00
+								        public static Operand ExtractElement(ArmEmitterContext context, int reg, int size, bool signed)
 								        {
 								            return EmitVectorExtract32(context, reg >> (4 - size), reg & ((16 >> size) - 1), size, signed);
 								        }
-												Add most of the A32 instruction set to ARMeilleure (#897)

* Implement TEQ and MOV (Imm16)

* Initial work on A32 instructions + SVC. No tests yet, hangs in rtld.

* Implement CLZ, fix BFI and BFC

Now stops on SIMD initialization.

* Exclusive access instructions, fix to mul, system instructions.

Now gets to a break after SignalProcessWideKey64.

* Better impl of UBFX, add UDIV and SDIV

Now boots way further - now stuck on VMOV instruction.

* Many more instructions, start on SIMD and testing framework.

* Fix build issues

* svc: Rework 32 bit codepath

Fixing once and for all argument ordering issues.

* Fix 32 bits stacktrace

* hle debug: Add 32 bits dynamic section parsing

* Fix highCq mode, add many tests, fix some instruction bugs

Still suffers from critical malloc failure :weary:

* Fix incorrect opcode decoders and a few more instructions.

* Add a few instructions and fix others. re-disable highCq for now.

Disabled the svc memory clear since i'm not sure about it.

* Fix build

* Fix typo in ordered/exclusive stores.

* Implement some more instructions, fix others.

Uxtab16/Sxtab16 are untested.

* Begin impl of pairwise, some other instructions.

* Add a few more instructions, a quick hack to fix svcs for now.

* Add tests and fix issues with VTRN, VZIP, VUZP

* Add a few more instructions, fix Vmul_1 encoding.

* Fix way too many instruction bugs, add tests for some of the more important ones.

* Fix HighCq, enable FastFP paths for some floating point instructions

(not entirely sure why these were disabled, so important to note this
commit exists)

Branching has been removed in A32 shifts until I figure out if it's
worth it

* Cleanup Part 1

There should be no functional change between these next few commits.
Should is the key word. (except for removing break handler)

* Implement 32 bits syscalls

Co-authored-by: riperiperi <rhy3756547@hotmail.com>

Implement all 32 bits counterparts of the 64 bits syscalls we currently
have.

* Refactor part 2: Move index/subindex logic to Operand

May have inadvertently fixed one (1) bug

* Add FlushProcessDataCache32

* Address jd's comments

* Remove 16 bit encodings from OpCodeTable

Still need to catch some edge cases (operands that use the "F" flag) and
make Q encodings with non-even indexes undefined.

* Correct Fpscr handling for FP vector slow paths

WIP

* Add StandardFPSCRValue behaviour for all Arithmetic instructions

* Add StandardFPSCRValue behaviour to compare instructions.

* Force passing of fpcr to FPProcessException and FPUnpack.

Reduces potential for code error significantly

* OpCode cleanup

* Remove urgency from DMB comment in MRRC

DMB is currently a no-op via the instruction, so it should likely still
be a no-op here.

* Test Cleanup

* Fix FPDefaultNaN on Ryzen CPUs

* Improve some tests, fix some shift instructions, add slow path for Vadd

* Fix Typo

* More test cleanup

* Flip order of Fx and index, to indicate that the operand's is the "base"

* Remove Simd32 register type, use Int32 and Int64 for scalars like A64 does.

* Reintroduce alignment to DecoderHelper (removed by accident)

* One more realign as reading diffs is hard

* Use I32 registers in A32 (part 2)

Swap default integer register type based on current execution mode.

* FPSCR flags as Registers (part 1)

Still need to change NativeContext and ExecutionContext to allow
getting/setting with the flag values.

* Use I32 registers in A32 (part 1)

* FPSCR flags as registers (part 2)

Only CMP flags are on the registers right now. It could be useful to use
more of the space in non-fast-float when implementing A32 flags
accurately in the fast path.

* Address Feedback

* Correct FP->Int behaviour (should saturate)

* Make branches made by writing to PC eligible for Rejit

Greatly improves performance in most games.

* Remove unused branching for Vtbl

* RejitRequest as a class rather than a tuple

Makes a lot more sense than storing tuples on a dictionary.

* Add VMOVN, VSHR (imm), VSHRN (imm) and related tests

* Re-order InstEmitSystem32

Alphabetical sorting.

* Address Feedback

Feedback from Ac_K, remove and sort usings.

* Address Feedback 2

* Address Feedback from LDj3SNuD

Opcode table reordered to have alphabetical sorting within groups,
Vmaxnm and Vminnm have split names to be less ambiguous, SoftFloat nits,
Test nits and Test simplification with ValueSource.

* Add Debug Asserts to A32 helpers

Mainly to prevent the shift ones from being used on I64 operands, as
they expect I32 input for most operations (eg. carry flag setting), and
expect I32 input for shift and boolean amounts. Most other helper
functions don't take Operands, throw on out of range values, and take
specific types of OpCode, so didn't need any asserts.

* Use ConstF rather than creating an operand.

(useful for pooling in future)

* Move exclusive load to helper, reference call flag rather than literal 1.

* Address LDj feedback (minus table flatten)

one final look before it's all gone. the world is so beautiful.

* Flatten OpCodeTable

oh no

* Address more table ordering

* Call Flag as int on A32

Co-authored-by: Natalie C. <cyuubiapps@gmail.com>
Co-authored-by: Thog <thog@protonmail.com>

											
										
										
											2020-02-23 21:20:40 +00:00
+								        public static void EmitVectorImmUnaryOp32(ArmEmitterContext context, Func1I emit)
 								        {
 								            IOpCode32SimdImm op = (IOpCode32SimdImm)context.CurrOp;
 								            Operand imm = Const(op.Immediate);
 								            int elems = op.Elems;
 								            (int index, int subIndex) = GetQuadwordAndSubindex(op.Vd, op.RegisterSize);
 								            Operand vec = GetVecA32(index);
 								            Operand res = vec;
 								            for (int item = 0; item < elems; item++)
 								            {
 								                res = EmitVectorInsert(context, res, emit(imm), item + subIndex * elems, op.Size);
 								            }
 								            context.Copy(vec, res);
 								        }
 								        public static void EmitScalarUnaryOpF32(ArmEmitterContext context, Func1I emit)
 								        {
 								            OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;
 								            OperandType type = (op.Size & 1) != 0 ? OperandType.FP64 : OperandType.FP32;
 								            Operand m = ExtractScalar(context, type, op.Vm);
 								            InsertScalar(context, op.Vd, emit(m));
 								        }
 								        public static void EmitScalarBinaryOpF32(ArmEmitterContext context, Func2I emit)
 								        {
 								            OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
 								            OperandType type = (op.Size & 1) != 0 ? OperandType.FP64 : OperandType.FP32;
 								            Operand n = ExtractScalar(context, type, op.Vn);
 								            Operand m = ExtractScalar(context, type, op.Vm);
 								            InsertScalar(context, op.Vd, emit(n, m));
 								        }
 								        public static void EmitScalarBinaryOpI32(ArmEmitterContext context, Func2I emit)
 								        {
 								            OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
 								            OperandType type = (op.Size & 1) != 0 ? OperandType.I64 : OperandType.I32;
 								            if (op.Size < 2)
 								            {
 								                throw new NotSupportedException("Cannot perform a scalar SIMD operation on integers smaller than 32 bits.");
 								            }
 								            Operand n = ExtractScalar(context, type, op.Vn);
 								            Operand m = ExtractScalar(context, type, op.Vm);
 								            InsertScalar(context, op.Vd, emit(n, m));
 								        }
 								        public static void EmitScalarTernaryOpF32(ArmEmitterContext context, Func3I emit)
 								        {
 								            OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
 								            OperandType type = (op.Size & 1) != 0 ? OperandType.FP64 : OperandType.FP32;
 								            Operand a = ExtractScalar(context, type, op.Vd);
 								            Operand n = ExtractScalar(context, type, op.Vn);
 								            Operand m = ExtractScalar(context, type, op.Vm);
 								            InsertScalar(context, op.Vd, emit(a, n, m));
 								        }
 								        public static void EmitVectorUnaryOpF32(ArmEmitterContext context, Func1I emit)
 								        {
 								            OpCode32Simd op = (OpCode32Simd)context.CurrOp;
 								            int sizeF = op.Size & 1;
 								            OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;
 								            int elems = op.GetBytesCount() >> sizeF + 2;
 								            Operand res = GetVecA32(op.Qd);
 								            for (int index = 0; index < elems; index++)
 								            {
 								                Operand me = context.VectorExtract(type, GetVecA32(op.Qm), op.Fm + index);
 								                res = context.VectorInsert(res, emit(me), op.Fd + index);
 								            }
 								            context.Copy(GetVecA32(op.Qd), res);
 								        }
 								        public static void EmitVectorBinaryOpF32(ArmEmitterContext context, Func2I emit)
 								        {
 								            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
 								            int sizeF = op.Size & 1;
 								            OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;
 								            int elems = op.GetBytesCount() >> (sizeF + 2);
 								            Operand res = GetVecA32(op.Qd);
 								            for (int index = 0; index < elems; index++)
 								            {
 								                Operand ne = context.VectorExtract(type, GetVecA32(op.Qn), op.Fn + index);
 								                Operand me = context.VectorExtract(type, GetVecA32(op.Qm), op.Fm + index);
 								                res = context.VectorInsert(res, emit(ne, me), op.Fd + index);
 								            }
 								            context.Copy(GetVecA32(op.Qd), res);
 								        }
 								        public static void EmitVectorTernaryOpF32(ArmEmitterContext context, Func3I emit)
 								        {
 								            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
 								            int sizeF = op.Size & 1;
 								            OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;
 								            int elems = op.GetBytesCount() >> sizeF + 2;
 								            Operand res = GetVecA32(op.Qd);
 								            for (int index = 0; index < elems; index++)
 								            {
 								                Operand de = context.VectorExtract(type, GetVecA32(op.Qd), op.Fd + index);
 								                Operand ne = context.VectorExtract(type, GetVecA32(op.Qn), op.Fn + index);
 								                Operand me = context.VectorExtract(type, GetVecA32(op.Qm), op.Fm + index);
 								                res = context.VectorInsert(res, emit(de, ne, me), op.Fd + index);
 								            }
 								            context.Copy(GetVecA32(op.Qd), res);
 								        }
 								        // Integer
 								        public static void EmitVectorUnaryOpI32(ArmEmitterContext context, Func1I emit, bool signed)
 								        {
 								            OpCode32Simd op = (OpCode32Simd)context.CurrOp;
 								            Operand res = GetVecA32(op.Qd);
 								            int elems = op.GetBytesCount() >> op.Size;
 								            for (int index = 0; index < elems; index++)
 								            {
 								                Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, signed);
 								                res = EmitVectorInsert(context, res, emit(me), op.Id + index, op.Size);
 								            }
 								            context.Copy(GetVecA32(op.Qd), res);
 								        }
 								        public static void EmitVectorBinaryOpI32(ArmEmitterContext context, Func2I emit, bool signed)
 								        {
 								            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
 								            Operand res = GetVecA32(op.Qd);
 								            int elems = op.GetBytesCount() >> op.Size;
 								            for (int index = 0; index < elems; index++)
 								            {
 								                Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed);
 								                Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, signed);
 								                res = EmitVectorInsert(context, res, emit(ne, me), op.Id + index, op.Size);
 								            }
 								            context.Copy(GetVecA32(op.Qd), res);
 								        }
-												Implement VMULL, VMLSL, VRSHR, VQRSHRN, VQRSHRUN AArch32 instructions + other fixes (#977)

* Implement VMULL, VMLSL, VQRSHRN, VQRSHRUN AArch32 instructions plus other fixes

* Re-align opcode table

* Re-enable undefined, use subclasses to fix checks

* Add test and fix VRSHR instruction

* PR feedback
											
										
										
											2020-03-11 00:49:27 +00:00
+								        public static void EmitVectorBinaryLongOpI32(ArmEmitterContext context, Func2I emit, bool signed)
 								        {
 								            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
 								            Operand res = context.VectorZero();
 								            int elems = op.GetBytesCount() >> op.Size;
 								            for (int index = 0; index < elems; index++)
 								            {
 								                Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed);
 								                Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, signed);
 								                if (op.Size == 2)
 								                {
 								                    ne = signed ? context.SignExtend32(OperandType.I64, ne) : context.ZeroExtend32(OperandType.I64, ne);
 								                    me = signed ? context.SignExtend32(OperandType.I64, me) : context.ZeroExtend32(OperandType.I64, me);
 								                }
 								                res = EmitVectorInsert(context, res, emit(ne, me), index, op.Size + 1);
 								            }
 								            context.Copy(GetVecA32(op.Qd), res);
 								        }
 								        public static void EmitVectorTernaryLongOpI32(ArmEmitterContext context, Func3I emit, bool signed)
 								        {
 								            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
 								            Operand res = context.VectorZero();
 								            int elems = op.GetBytesCount() >> op.Size;
 								            for (int index = 0; index < elems; index++)
 								            {
 								                Operand de = EmitVectorExtract32(context, op.Qd, op.Id + index, op.Size + 1, signed);
 								                Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size,     signed);
 								                Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size,     signed);
 								                if (op.Size == 2)
 								                {
 								                    ne = signed ? context.SignExtend32(OperandType.I64, ne) : context.ZeroExtend32(OperandType.I64, ne);
 								                    me = signed ? context.SignExtend32(OperandType.I64, me) : context.ZeroExtend32(OperandType.I64, me);
 								                }
 								                res = EmitVectorInsert(context, res, emit(de, ne, me), index, op.Size + 1);
 								            }
 								            context.Copy(GetVecA32(op.Qd), res);
 								        }
-												Add most of the A32 instruction set to ARMeilleure (#897)

* Implement TEQ and MOV (Imm16)

* Initial work on A32 instructions + SVC. No tests yet, hangs in rtld.

* Implement CLZ, fix BFI and BFC

Now stops on SIMD initialization.

* Exclusive access instructions, fix to mul, system instructions.

Now gets to a break after SignalProcessWideKey64.

* Better impl of UBFX, add UDIV and SDIV

Now boots way further - now stuck on VMOV instruction.

* Many more instructions, start on SIMD and testing framework.

* Fix build issues

* svc: Rework 32 bit codepath

Fixing once and for all argument ordering issues.

* Fix 32 bits stacktrace

* hle debug: Add 32 bits dynamic section parsing

* Fix highCq mode, add many tests, fix some instruction bugs

Still suffers from critical malloc failure :weary:

* Fix incorrect opcode decoders and a few more instructions.

* Add a few instructions and fix others. re-disable highCq for now.

Disabled the svc memory clear since i'm not sure about it.

* Fix build

* Fix typo in ordered/exclusive stores.

* Implement some more instructions, fix others.

Uxtab16/Sxtab16 are untested.

* Begin impl of pairwise, some other instructions.

* Add a few more instructions, a quick hack to fix svcs for now.

* Add tests and fix issues with VTRN, VZIP, VUZP

* Add a few more instructions, fix Vmul_1 encoding.

* Fix way too many instruction bugs, add tests for some of the more important ones.

* Fix HighCq, enable FastFP paths for some floating point instructions

(not entirely sure why these were disabled, so important to note this
commit exists)

Branching has been removed in A32 shifts until I figure out if it's
worth it

* Cleanup Part 1

There should be no functional change between these next few commits.
Should is the key word. (except for removing break handler)

* Implement 32 bits syscalls

Co-authored-by: riperiperi <rhy3756547@hotmail.com>

Implement all 32 bits counterparts of the 64 bits syscalls we currently
have.

* Refactor part 2: Move index/subindex logic to Operand

May have inadvertently fixed one (1) bug

* Add FlushProcessDataCache32

* Address jd's comments

* Remove 16 bit encodings from OpCodeTable

Still need to catch some edge cases (operands that use the "F" flag) and
make Q encodings with non-even indexes undefined.

* Correct Fpscr handling for FP vector slow paths

WIP

* Add StandardFPSCRValue behaviour for all Arithmetic instructions

* Add StandardFPSCRValue behaviour to compare instructions.

* Force passing of fpcr to FPProcessException and FPUnpack.

Reduces potential for code error significantly

* OpCode cleanup

* Remove urgency from DMB comment in MRRC

DMB is currently a no-op via the instruction, so it should likely still
be a no-op here.

* Test Cleanup

* Fix FPDefaultNaN on Ryzen CPUs

* Improve some tests, fix some shift instructions, add slow path for Vadd

* Fix Typo

* More test cleanup

* Flip order of Fx and index, to indicate that the operand's is the "base"

* Remove Simd32 register type, use Int32 and Int64 for scalars like A64 does.

* Reintroduce alignment to DecoderHelper (removed by accident)

* One more realign as reading diffs is hard

* Use I32 registers in A32 (part 2)

Swap default integer register type based on current execution mode.

* FPSCR flags as Registers (part 1)

Still need to change NativeContext and ExecutionContext to allow
getting/setting with the flag values.

* Use I32 registers in A32 (part 1)

* FPSCR flags as registers (part 2)

Only CMP flags are on the registers right now. It could be useful to use
more of the space in non-fast-float when implementing A32 flags
accurately in the fast path.

* Address Feedback

* Correct FP->Int behaviour (should saturate)

* Make branches made by writing to PC eligible for Rejit

Greatly improves performance in most games.

* Remove unused branching for Vtbl

* RejitRequest as a class rather than a tuple

Makes a lot more sense than storing tuples on a dictionary.

* Add VMOVN, VSHR (imm), VSHRN (imm) and related tests

* Re-order InstEmitSystem32

Alphabetical sorting.

* Address Feedback

Feedback from Ac_K, remove and sort usings.

* Address Feedback 2

* Address Feedback from LDj3SNuD

Opcode table reordered to have alphabetical sorting within groups,
Vmaxnm and Vminnm have split names to be less ambiguous, SoftFloat nits,
Test nits and Test simplification with ValueSource.

* Add Debug Asserts to A32 helpers

Mainly to prevent the shift ones from being used on I64 operands, as
they expect I32 input for most operations (eg. carry flag setting), and
expect I32 input for shift and boolean amounts. Most other helper
functions don't take Operands, throw on out of range values, and take
specific types of OpCode, so didn't need any asserts.

* Use ConstF rather than creating an operand.

(useful for pooling in future)

* Move exclusive load to helper, reference call flag rather than literal 1.

* Address LDj feedback (minus table flatten)

one final look before it's all gone. the world is so beautiful.

* Flatten OpCodeTable

oh no

* Address more table ordering

* Call Flag as int on A32

Co-authored-by: Natalie C. <cyuubiapps@gmail.com>
Co-authored-by: Thog <thog@protonmail.com>

											
										
										
											2020-02-23 21:20:40 +00:00
+								        public static void EmitVectorTernaryOpI32(ArmEmitterContext context, Func3I emit, bool signed)
 								        {
 								            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
 								            Operand res = GetVecA32(op.Qd);
 								            int elems = op.GetBytesCount() >> op.Size;
 								            for (int index = 0; index < elems; index++)
 								            {
 								                Operand de = EmitVectorExtract32(context, op.Qd, op.Id + index, op.Size, signed);
 								                Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed);
 								                Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, signed);
 								                res = EmitVectorInsert(context, res, emit(de, ne, me), op.Id + index, op.Size);
 								            }
 								            context.Copy(GetVecA32(op.Qd), res);
 								        }
 								        public static void EmitVectorUnaryOpSx32(ArmEmitterContext context, Func1I emit)
 								        {
 								            EmitVectorUnaryOpI32(context, emit, true);
 								        }
 								        public static void EmitVectorBinaryOpSx32(ArmEmitterContext context, Func2I emit)
 								        {
 								            EmitVectorBinaryOpI32(context, emit, true);
 								        }
 								        public static void EmitVectorTernaryOpSx32(ArmEmitterContext context, Func3I emit)
 								        {
 								            EmitVectorTernaryOpI32(context, emit, true);
 								        }
 								        public static void EmitVectorUnaryOpZx32(ArmEmitterContext context, Func1I emit)
 								        {
 								            EmitVectorUnaryOpI32(context, emit, false);
 								        }
 								        public static void EmitVectorBinaryOpZx32(ArmEmitterContext context, Func2I emit)
 								        {
 								            EmitVectorBinaryOpI32(context, emit, false);
 								        }
 								        public static void EmitVectorTernaryOpZx32(ArmEmitterContext context, Func3I emit)
 								        {
 								            EmitVectorTernaryOpI32(context, emit, false);
 								        }
 								        // Vector by scalar
 								        public static void EmitVectorByScalarOpF32(ArmEmitterContext context, Func2I emit)
 								        {
 								            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
 								            int sizeF = op.Size & 1;
 								            OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;
 								            int elems = op.GetBytesCount() >> sizeF + 2;
 								            Operand m = ExtractScalar(context, type, op.Vm);
 								            Operand res = GetVecA32(op.Qd);
 								            for (int index = 0; index < elems; index++)
 								            {
 								                Operand ne = context.VectorExtract(type, GetVecA32(op.Qn), op.Fn + index);
 								                res = context.VectorInsert(res, emit(ne, m), op.Fd + index);
 								            }
 								            context.Copy(GetVecA32(op.Qd), res);
 								        }
 								        public static void EmitVectorByScalarOpI32(ArmEmitterContext context, Func2I emit, bool signed)
 								        {
 								            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
-												Implement VMULL, VMLSL, VRSHR, VQRSHRN, VQRSHRUN AArch32 instructions + other fixes (#977)

* Implement VMULL, VMLSL, VQRSHRN, VQRSHRUN AArch32 instructions plus other fixes

* Re-align opcode table

* Re-enable undefined, use subclasses to fix checks

* Add test and fix VRSHR instruction

* PR feedback
											
										
										
											2020-03-11 00:49:27 +00:00
+								            Operand m = ExtractElement(context, op.Vm, op.Size, signed);
-												Add most of the A32 instruction set to ARMeilleure (#897)

* Implement TEQ and MOV (Imm16)

* Initial work on A32 instructions + SVC. No tests yet, hangs in rtld.

* Implement CLZ, fix BFI and BFC

Now stops on SIMD initialization.

* Exclusive access instructions, fix to mul, system instructions.

Now gets to a break after SignalProcessWideKey64.

* Better impl of UBFX, add UDIV and SDIV

Now boots way further - now stuck on VMOV instruction.

* Many more instructions, start on SIMD and testing framework.

* Fix build issues

* svc: Rework 32 bit codepath

Fixing once and for all argument ordering issues.

* Fix 32 bits stacktrace

* hle debug: Add 32 bits dynamic section parsing

* Fix highCq mode, add many tests, fix some instruction bugs

Still suffers from critical malloc failure :weary:

* Fix incorrect opcode decoders and a few more instructions.

* Add a few instructions and fix others. re-disable highCq for now.

Disabled the svc memory clear since i'm not sure about it.

* Fix build

* Fix typo in ordered/exclusive stores.

* Implement some more instructions, fix others.

Uxtab16/Sxtab16 are untested.

* Begin impl of pairwise, some other instructions.

* Add a few more instructions, a quick hack to fix svcs for now.

* Add tests and fix issues with VTRN, VZIP, VUZP

* Add a few more instructions, fix Vmul_1 encoding.

* Fix way too many instruction bugs, add tests for some of the more important ones.

* Fix HighCq, enable FastFP paths for some floating point instructions

(not entirely sure why these were disabled, so important to note this
commit exists)

Branching has been removed in A32 shifts until I figure out if it's
worth it

* Cleanup Part 1

There should be no functional change between these next few commits.
Should is the key word. (except for removing break handler)

* Implement 32 bits syscalls

Co-authored-by: riperiperi <rhy3756547@hotmail.com>

Implement all 32 bits counterparts of the 64 bits syscalls we currently
have.

* Refactor part 2: Move index/subindex logic to Operand

May have inadvertently fixed one (1) bug

* Add FlushProcessDataCache32

* Address jd's comments

* Remove 16 bit encodings from OpCodeTable

Still need to catch some edge cases (operands that use the "F" flag) and
make Q encodings with non-even indexes undefined.

* Correct Fpscr handling for FP vector slow paths

WIP

* Add StandardFPSCRValue behaviour for all Arithmetic instructions

* Add StandardFPSCRValue behaviour to compare instructions.

* Force passing of fpcr to FPProcessException and FPUnpack.

Reduces potential for code error significantly

* OpCode cleanup

* Remove urgency from DMB comment in MRRC

DMB is currently a no-op via the instruction, so it should likely still
be a no-op here.

* Test Cleanup

* Fix FPDefaultNaN on Ryzen CPUs

* Improve some tests, fix some shift instructions, add slow path for Vadd

* Fix Typo

* More test cleanup

* Flip order of Fx and index, to indicate that the operand's is the "base"

* Remove Simd32 register type, use Int32 and Int64 for scalars like A64 does.

* Reintroduce alignment to DecoderHelper (removed by accident)

* One more realign as reading diffs is hard

* Use I32 registers in A32 (part 2)

Swap default integer register type based on current execution mode.

* FPSCR flags as Registers (part 1)

Still need to change NativeContext and ExecutionContext to allow
getting/setting with the flag values.

* Use I32 registers in A32 (part 1)

* FPSCR flags as registers (part 2)

Only CMP flags are on the registers right now. It could be useful to use
more of the space in non-fast-float when implementing A32 flags
accurately in the fast path.

* Address Feedback

* Correct FP->Int behaviour (should saturate)

* Make branches made by writing to PC eligible for Rejit

Greatly improves performance in most games.

* Remove unused branching for Vtbl

* RejitRequest as a class rather than a tuple

Makes a lot more sense than storing tuples on a dictionary.

* Add VMOVN, VSHR (imm), VSHRN (imm) and related tests

* Re-order InstEmitSystem32

Alphabetical sorting.

* Address Feedback

Feedback from Ac_K, remove and sort usings.

* Address Feedback 2

* Address Feedback from LDj3SNuD

Opcode table reordered to have alphabetical sorting within groups,
Vmaxnm and Vminnm have split names to be less ambiguous, SoftFloat nits,
Test nits and Test simplification with ValueSource.

* Add Debug Asserts to A32 helpers

Mainly to prevent the shift ones from being used on I64 operands, as
they expect I32 input for most operations (eg. carry flag setting), and
expect I32 input for shift and boolean amounts. Most other helper
functions don't take Operands, throw on out of range values, and take
specific types of OpCode, so didn't need any asserts.

* Use ConstF rather than creating an operand.

(useful for pooling in future)

* Move exclusive load to helper, reference call flag rather than literal 1.

* Address LDj feedback (minus table flatten)

one final look before it's all gone. the world is so beautiful.

* Flatten OpCodeTable

oh no

* Address more table ordering

* Call Flag as int on A32

Co-authored-by: Natalie C. <cyuubiapps@gmail.com>
Co-authored-by: Thog <thog@protonmail.com>

											
										
										
											2020-02-23 21:20:40 +00:00
 								            Operand res = GetVecA32(op.Qd);
 								            int elems = op.GetBytesCount() >> op.Size;
 								            for (int index = 0; index < elems; index++)
 								            {
 								                Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed);
-												Implement VMULL, VMLSL, VRSHR, VQRSHRN, VQRSHRUN AArch32 instructions + other fixes (#977)

* Implement VMULL, VMLSL, VQRSHRN, VQRSHRUN AArch32 instructions plus other fixes

* Re-align opcode table

* Re-enable undefined, use subclasses to fix checks

* Add test and fix VRSHR instruction

* PR feedback
											
										
										
											2020-03-11 00:49:27 +00:00
+								                res = EmitVectorInsert(context, res, emit(ne, m), op.Id + index, op.Size);
 								            }
 								            context.Copy(GetVecA32(op.Qd), res);
 								        }
 								        public static void EmitVectorByScalarLongOpI32(ArmEmitterContext context, Func2I emit, bool signed)
 								        {
 								            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
 								            Operand m = ExtractElement(context, op.Vm, op.Size, signed);
 								            if (op.Size == 2)
 								            {
 								                m = signed ? context.SignExtend32(OperandType.I64, m) : context.ZeroExtend32(OperandType.I64, m);
 								            }
 								            Operand res = context.VectorZero();
 								            int elems = op.GetBytesCount() >> op.Size;
 								            for (int index = 0; index < elems; index++)
 								            {
 								                Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed);
 								                if (op.Size == 2)
 								                {
 								                    ne = signed ? context.SignExtend32(OperandType.I64, ne) : context.ZeroExtend32(OperandType.I64, ne);
 								                }
 								                res = EmitVectorInsert(context, res, emit(ne, m), index, op.Size + 1);
-												Add most of the A32 instruction set to ARMeilleure (#897)

* Implement TEQ and MOV (Imm16)

* Initial work on A32 instructions + SVC. No tests yet, hangs in rtld.

* Implement CLZ, fix BFI and BFC

Now stops on SIMD initialization.

* Exclusive access instructions, fix to mul, system instructions.

Now gets to a break after SignalProcessWideKey64.

* Better impl of UBFX, add UDIV and SDIV

Now boots way further - now stuck on VMOV instruction.

* Many more instructions, start on SIMD and testing framework.

* Fix build issues

* svc: Rework 32 bit codepath

Fixing once and for all argument ordering issues.

* Fix 32 bits stacktrace

* hle debug: Add 32 bits dynamic section parsing

* Fix highCq mode, add many tests, fix some instruction bugs

Still suffers from critical malloc failure :weary:

* Fix incorrect opcode decoders and a few more instructions.

* Add a few instructions and fix others. re-disable highCq for now.

Disabled the svc memory clear since i'm not sure about it.

* Fix build

* Fix typo in ordered/exclusive stores.

* Implement some more instructions, fix others.

Uxtab16/Sxtab16 are untested.

* Begin impl of pairwise, some other instructions.

* Add a few more instructions, a quick hack to fix svcs for now.

* Add tests and fix issues with VTRN, VZIP, VUZP

* Add a few more instructions, fix Vmul_1 encoding.

* Fix way too many instruction bugs, add tests for some of the more important ones.

* Fix HighCq, enable FastFP paths for some floating point instructions

(not entirely sure why these were disabled, so important to note this
commit exists)

Branching has been removed in A32 shifts until I figure out if it's
worth it

* Cleanup Part 1

There should be no functional change between these next few commits.
Should is the key word. (except for removing break handler)

* Implement 32 bits syscalls

Co-authored-by: riperiperi <rhy3756547@hotmail.com>

Implement all 32 bits counterparts of the 64 bits syscalls we currently
have.

* Refactor part 2: Move index/subindex logic to Operand

May have inadvertently fixed one (1) bug

* Add FlushProcessDataCache32

* Address jd's comments

* Remove 16 bit encodings from OpCodeTable

Still need to catch some edge cases (operands that use the "F" flag) and
make Q encodings with non-even indexes undefined.

* Correct Fpscr handling for FP vector slow paths

WIP

* Add StandardFPSCRValue behaviour for all Arithmetic instructions

* Add StandardFPSCRValue behaviour to compare instructions.

* Force passing of fpcr to FPProcessException and FPUnpack.

Reduces potential for code error significantly

* OpCode cleanup

* Remove urgency from DMB comment in MRRC

DMB is currently a no-op via the instruction, so it should likely still
be a no-op here.

* Test Cleanup

* Fix FPDefaultNaN on Ryzen CPUs

* Improve some tests, fix some shift instructions, add slow path for Vadd

* Fix Typo

* More test cleanup

* Flip order of Fx and index, to indicate that the operand's is the "base"

* Remove Simd32 register type, use Int32 and Int64 for scalars like A64 does.

* Reintroduce alignment to DecoderHelper (removed by accident)

* One more realign as reading diffs is hard

* Use I32 registers in A32 (part 2)

Swap default integer register type based on current execution mode.

* FPSCR flags as Registers (part 1)

Still need to change NativeContext and ExecutionContext to allow
getting/setting with the flag values.

* Use I32 registers in A32 (part 1)

* FPSCR flags as registers (part 2)

Only CMP flags are on the registers right now. It could be useful to use
more of the space in non-fast-float when implementing A32 flags
accurately in the fast path.

* Address Feedback

* Correct FP->Int behaviour (should saturate)

* Make branches made by writing to PC eligible for Rejit

Greatly improves performance in most games.

* Remove unused branching for Vtbl

* RejitRequest as a class rather than a tuple

Makes a lot more sense than storing tuples on a dictionary.

* Add VMOVN, VSHR (imm), VSHRN (imm) and related tests

* Re-order InstEmitSystem32

Alphabetical sorting.

* Address Feedback

Feedback from Ac_K, remove and sort usings.

* Address Feedback 2

* Address Feedback from LDj3SNuD

Opcode table reordered to have alphabetical sorting within groups,
Vmaxnm and Vminnm have split names to be less ambiguous, SoftFloat nits,
Test nits and Test simplification with ValueSource.

* Add Debug Asserts to A32 helpers

Mainly to prevent the shift ones from being used on I64 operands, as
they expect I32 input for most operations (eg. carry flag setting), and
expect I32 input for shift and boolean amounts. Most other helper
functions don't take Operands, throw on out of range values, and take
specific types of OpCode, so didn't need any asserts.

* Use ConstF rather than creating an operand.

(useful for pooling in future)

* Move exclusive load to helper, reference call flag rather than literal 1.

* Address LDj feedback (minus table flatten)

one final look before it's all gone. the world is so beautiful.

* Flatten OpCodeTable

oh no

* Address more table ordering

* Call Flag as int on A32

Co-authored-by: Natalie C. <cyuubiapps@gmail.com>
Co-authored-by: Thog <thog@protonmail.com>

											
										
										
											2020-02-23 21:20:40 +00:00
+								            }
 								            context.Copy(GetVecA32(op.Qd), res);
 								        }
 								        public static void EmitVectorsByScalarOpF32(ArmEmitterContext context, Func3I emit)
 								        {
 								            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
 								            int sizeF = op.Size & 1;
 								            OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;
 								            int elems = op.GetBytesCount() >> sizeF + 2;
 								            Operand m = ExtractScalar(context, type, op.Vm);
 								            Operand res = GetVecA32(op.Qd);
 								            for (int index = 0; index < elems; index++)
 								            {
 								                Operand de = context.VectorExtract(type, GetVecA32(op.Qd), op.Fd + index);
 								                Operand ne = context.VectorExtract(type, GetVecA32(op.Qn), op.Fn + index);
 								                res = context.VectorInsert(res, emit(de, ne, m), op.Fd + index);
 								            }
 								            context.Copy(GetVecA32(op.Qd), res);
 								        }
 								        public static void EmitVectorsByScalarOpI32(ArmEmitterContext context, Func3I emit, bool signed)
 								        {
 								            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
 								            Operand m = EmitVectorExtract32(context, op.Vm >> (4 - op.Size), op.Vm & ((1 << (4 - op.Size)) - 1), op.Size, signed);
 								            Operand res = GetVecA32(op.Qd);
 								            int elems = op.GetBytesCount() >> op.Size;
 								            for (int index = 0; index < elems; index++)
 								            {
 								                Operand de = EmitVectorExtract32(context, op.Qd, op.Id + index, op.Size, signed);
 								                Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed);
 								                res = EmitVectorInsert(context, res, emit(de, ne, m), op.Id + index, op.Size);
 								            }
 								            context.Copy(GetVecA32(op.Qd), res);
 								        }
 								        // Pairwise
 								        public static void EmitVectorPairwiseOpF32(ArmEmitterContext context, Func2I emit)
 								        {
 								            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
 								            int sizeF = op.Size & 1;
 								            OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;
 								            int elems = op.GetBytesCount() >> (sizeF + 2);
 								            int pairs = elems >> 1;
 								            Operand res = GetVecA32(op.Qd);
 								            Operand mvec = GetVecA32(op.Qm);
 								            Operand nvec = GetVecA32(op.Qn);
 								            for (int index = 0; index < pairs; index++)
 								            {
 								                int pairIndex = index << 1;
 								                Operand n1 = context.VectorExtract(type, nvec, op.Fn + pairIndex);
 								                Operand n2 = context.VectorExtract(type, nvec, op.Fn + pairIndex + 1);
 								                res = context.VectorInsert(res, emit(n1, n2), op.Fd + index);
 								                Operand m1 = context.VectorExtract(type, mvec, op.Fm + pairIndex);
 								                Operand m2 = context.VectorExtract(type, mvec, op.Fm + pairIndex + 1);
 								                res = context.VectorInsert(res, emit(m1, m2), op.Fd + index + pairs);
 								            }
 								            context.Copy(GetVecA32(op.Qd), res);
 								        }
 								        public static void EmitVectorPairwiseOpI32(ArmEmitterContext context, Func2I emit, bool signed)
 								        {
 								            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
 								            int elems = op.GetBytesCount() >> op.Size;
 								            int pairs = elems >> 1;
 								            Operand res = GetVecA32(op.Qd);
 								            for (int index = 0; index < pairs; index++)
 								            {
 								                int pairIndex = index << 1;
 								                Operand n1 = EmitVectorExtract32(context, op.Qn, op.In + pairIndex, op.Size, signed);
 								                Operand n2 = EmitVectorExtract32(context, op.Qn, op.In + pairIndex + 1, op.Size, signed);
 								                Operand m1 = EmitVectorExtract32(context, op.Qm, op.Im + pairIndex, op.Size, signed);
 								                Operand m2 = EmitVectorExtract32(context, op.Qm, op.Im + pairIndex + 1, op.Size, signed);
 								                res = EmitVectorInsert(context, res, emit(n1, n2), op.Id + index, op.Size);
 								                res = EmitVectorInsert(context, res, emit(m1, m2), op.Id + index + pairs, op.Size);
 								            }
 								            context.Copy(GetVecA32(op.Qd), res);
 								        }
 								        // Narrow
-												Implement VMULL, VMLSL, VRSHR, VQRSHRN, VQRSHRUN AArch32 instructions + other fixes (#977)

* Implement VMULL, VMLSL, VQRSHRN, VQRSHRUN AArch32 instructions plus other fixes

* Re-align opcode table

* Re-enable undefined, use subclasses to fix checks

* Add test and fix VRSHR instruction

* PR feedback
											
										
										
											2020-03-11 00:49:27 +00:00
+								        public static void EmitVectorUnaryNarrowOp32(ArmEmitterContext context, Func1I emit, bool signed = false)
-												Add most of the A32 instruction set to ARMeilleure (#897)

* Implement TEQ and MOV (Imm16)

* Initial work on A32 instructions + SVC. No tests yet, hangs in rtld.

* Implement CLZ, fix BFI and BFC

Now stops on SIMD initialization.

* Exclusive access instructions, fix to mul, system instructions.

Now gets to a break after SignalProcessWideKey64.

* Better impl of UBFX, add UDIV and SDIV

Now boots way further - now stuck on VMOV instruction.

* Many more instructions, start on SIMD and testing framework.

* Fix build issues

* svc: Rework 32 bit codepath

Fixing once and for all argument ordering issues.

* Fix 32 bits stacktrace

* hle debug: Add 32 bits dynamic section parsing

* Fix highCq mode, add many tests, fix some instruction bugs

Still suffers from critical malloc failure :weary:

* Fix incorrect opcode decoders and a few more instructions.

* Add a few instructions and fix others. re-disable highCq for now.

Disabled the svc memory clear since i'm not sure about it.

* Fix build

* Fix typo in ordered/exclusive stores.

* Implement some more instructions, fix others.

Uxtab16/Sxtab16 are untested.

* Begin impl of pairwise, some other instructions.

* Add a few more instructions, a quick hack to fix svcs for now.

* Add tests and fix issues with VTRN, VZIP, VUZP

* Add a few more instructions, fix Vmul_1 encoding.

* Fix way too many instruction bugs, add tests for some of the more important ones.

* Fix HighCq, enable FastFP paths for some floating point instructions

(not entirely sure why these were disabled, so important to note this
commit exists)

Branching has been removed in A32 shifts until I figure out if it's
worth it

* Cleanup Part 1

There should be no functional change between these next few commits.
Should is the key word. (except for removing break handler)

* Implement 32 bits syscalls

Co-authored-by: riperiperi <rhy3756547@hotmail.com>

Implement all 32 bits counterparts of the 64 bits syscalls we currently
have.

* Refactor part 2: Move index/subindex logic to Operand

May have inadvertently fixed one (1) bug

* Add FlushProcessDataCache32

* Address jd's comments

* Remove 16 bit encodings from OpCodeTable

Still need to catch some edge cases (operands that use the "F" flag) and
make Q encodings with non-even indexes undefined.

* Correct Fpscr handling for FP vector slow paths

WIP

* Add StandardFPSCRValue behaviour for all Arithmetic instructions

* Add StandardFPSCRValue behaviour to compare instructions.

* Force passing of fpcr to FPProcessException and FPUnpack.

Reduces potential for code error significantly

* OpCode cleanup

* Remove urgency from DMB comment in MRRC

DMB is currently a no-op via the instruction, so it should likely still
be a no-op here.

* Test Cleanup

* Fix FPDefaultNaN on Ryzen CPUs

* Improve some tests, fix some shift instructions, add slow path for Vadd

* Fix Typo

* More test cleanup

* Flip order of Fx and index, to indicate that the operand's is the "base"

* Remove Simd32 register type, use Int32 and Int64 for scalars like A64 does.

* Reintroduce alignment to DecoderHelper (removed by accident)

* One more realign as reading diffs is hard

* Use I32 registers in A32 (part 2)

Swap default integer register type based on current execution mode.

* FPSCR flags as Registers (part 1)

Still need to change NativeContext and ExecutionContext to allow
getting/setting with the flag values.

* Use I32 registers in A32 (part 1)

* FPSCR flags as registers (part 2)

Only CMP flags are on the registers right now. It could be useful to use
more of the space in non-fast-float when implementing A32 flags
accurately in the fast path.

* Address Feedback

* Correct FP->Int behaviour (should saturate)

* Make branches made by writing to PC eligible for Rejit

Greatly improves performance in most games.

* Remove unused branching for Vtbl

* RejitRequest as a class rather than a tuple

Makes a lot more sense than storing tuples on a dictionary.

* Add VMOVN, VSHR (imm), VSHRN (imm) and related tests

* Re-order InstEmitSystem32

Alphabetical sorting.

* Address Feedback

Feedback from Ac_K, remove and sort usings.

* Address Feedback 2

* Address Feedback from LDj3SNuD

Opcode table reordered to have alphabetical sorting within groups,
Vmaxnm and Vminnm have split names to be less ambiguous, SoftFloat nits,
Test nits and Test simplification with ValueSource.

* Add Debug Asserts to A32 helpers

Mainly to prevent the shift ones from being used on I64 operands, as
they expect I32 input for most operations (eg. carry flag setting), and
expect I32 input for shift and boolean amounts. Most other helper
functions don't take Operands, throw on out of range values, and take
specific types of OpCode, so didn't need any asserts.

* Use ConstF rather than creating an operand.

(useful for pooling in future)

* Move exclusive load to helper, reference call flag rather than literal 1.

* Address LDj feedback (minus table flatten)

one final look before it's all gone. the world is so beautiful.

* Flatten OpCodeTable

oh no

* Address more table ordering

* Call Flag as int on A32

Co-authored-by: Natalie C. <cyuubiapps@gmail.com>
Co-authored-by: Thog <thog@protonmail.com>

											
										
										
											2020-02-23 21:20:40 +00:00
+								        {
 								            OpCode32Simd op = (OpCode32Simd)context.CurrOp;
 								            int elems = 8 >> op.Size; // Size contains the target element size. (for when it becomes a doubleword)
 								            Operand res = GetVecA32(op.Qd);
 								            int id = (op.Vd & 1) << (3 - op.Size); // Target doubleword base.
 								            for (int index = 0; index < elems; index++)
 								            {
-												Implement VMULL, VMLSL, VRSHR, VQRSHRN, VQRSHRUN AArch32 instructions + other fixes (#977)

* Implement VMULL, VMLSL, VQRSHRN, VQRSHRUN AArch32 instructions plus other fixes

* Re-align opcode table

* Re-enable undefined, use subclasses to fix checks

* Add test and fix VRSHR instruction

* PR feedback
											
										
										
											2020-03-11 00:49:27 +00:00
+								                Operand m = EmitVectorExtract32(context, op.Qm, index, op.Size + 1, signed);
-												Add most of the A32 instruction set to ARMeilleure (#897)

* Implement TEQ and MOV (Imm16)

* Initial work on A32 instructions + SVC. No tests yet, hangs in rtld.

* Implement CLZ, fix BFI and BFC

Now stops on SIMD initialization.

* Exclusive access instructions, fix to mul, system instructions.

Now gets to a break after SignalProcessWideKey64.

* Better impl of UBFX, add UDIV and SDIV

Now boots way further - now stuck on VMOV instruction.

* Many more instructions, start on SIMD and testing framework.

* Fix build issues

* svc: Rework 32 bit codepath

Fixing once and for all argument ordering issues.

* Fix 32 bits stacktrace

* hle debug: Add 32 bits dynamic section parsing

* Fix highCq mode, add many tests, fix some instruction bugs

Still suffers from critical malloc failure :weary:

* Fix incorrect opcode decoders and a few more instructions.

* Add a few instructions and fix others. re-disable highCq for now.

Disabled the svc memory clear since i'm not sure about it.

* Fix build

* Fix typo in ordered/exclusive stores.

* Implement some more instructions, fix others.

Uxtab16/Sxtab16 are untested.

* Begin impl of pairwise, some other instructions.

* Add a few more instructions, a quick hack to fix svcs for now.

* Add tests and fix issues with VTRN, VZIP, VUZP

* Add a few more instructions, fix Vmul_1 encoding.

* Fix way too many instruction bugs, add tests for some of the more important ones.

* Fix HighCq, enable FastFP paths for some floating point instructions

(not entirely sure why these were disabled, so important to note this
commit exists)

Branching has been removed in A32 shifts until I figure out if it's
worth it

* Cleanup Part 1

There should be no functional change between these next few commits.
Should is the key word. (except for removing break handler)

* Implement 32 bits syscalls

Co-authored-by: riperiperi <rhy3756547@hotmail.com>

Implement all 32 bits counterparts of the 64 bits syscalls we currently
have.

* Refactor part 2: Move index/subindex logic to Operand

May have inadvertently fixed one (1) bug

* Add FlushProcessDataCache32

* Address jd's comments

* Remove 16 bit encodings from OpCodeTable

Still need to catch some edge cases (operands that use the "F" flag) and
make Q encodings with non-even indexes undefined.

* Correct Fpscr handling for FP vector slow paths

WIP

* Add StandardFPSCRValue behaviour for all Arithmetic instructions

* Add StandardFPSCRValue behaviour to compare instructions.

* Force passing of fpcr to FPProcessException and FPUnpack.

Reduces potential for code error significantly

* OpCode cleanup

* Remove urgency from DMB comment in MRRC

DMB is currently a no-op via the instruction, so it should likely still
be a no-op here.

* Test Cleanup

* Fix FPDefaultNaN on Ryzen CPUs

* Improve some tests, fix some shift instructions, add slow path for Vadd

* Fix Typo

* More test cleanup

* Flip order of Fx and index, to indicate that the operand's is the "base"

* Remove Simd32 register type, use Int32 and Int64 for scalars like A64 does.

* Reintroduce alignment to DecoderHelper (removed by accident)

* One more realign as reading diffs is hard

* Use I32 registers in A32 (part 2)

Swap default integer register type based on current execution mode.

* FPSCR flags as Registers (part 1)

Still need to change NativeContext and ExecutionContext to allow
getting/setting with the flag values.

* Use I32 registers in A32 (part 1)

* FPSCR flags as registers (part 2)

Only CMP flags are on the registers right now. It could be useful to use
more of the space in non-fast-float when implementing A32 flags
accurately in the fast path.

* Address Feedback

* Correct FP->Int behaviour (should saturate)

* Make branches made by writing to PC eligible for Rejit

Greatly improves performance in most games.

* Remove unused branching for Vtbl

* RejitRequest as a class rather than a tuple

Makes a lot more sense than storing tuples on a dictionary.

* Add VMOVN, VSHR (imm), VSHRN (imm) and related tests

* Re-order InstEmitSystem32

Alphabetical sorting.

* Address Feedback

Feedback from Ac_K, remove and sort usings.

* Address Feedback 2

* Address Feedback from LDj3SNuD

Opcode table reordered to have alphabetical sorting within groups,
Vmaxnm and Vminnm have split names to be less ambiguous, SoftFloat nits,
Test nits and Test simplification with ValueSource.

* Add Debug Asserts to A32 helpers

Mainly to prevent the shift ones from being used on I64 operands, as
they expect I32 input for most operations (eg. carry flag setting), and
expect I32 input for shift and boolean amounts. Most other helper
functions don't take Operands, throw on out of range values, and take
specific types of OpCode, so didn't need any asserts.

* Use ConstF rather than creating an operand.

(useful for pooling in future)

* Move exclusive load to helper, reference call flag rather than literal 1.

* Address LDj feedback (minus table flatten)

one final look before it's all gone. the world is so beautiful.

* Flatten OpCodeTable

oh no

* Address more table ordering

* Call Flag as int on A32

Co-authored-by: Natalie C. <cyuubiapps@gmail.com>
Co-authored-by: Thog <thog@protonmail.com>

											
										
										
											2020-02-23 21:20:40 +00:00
 								                res = EmitVectorInsert(context, res, emit(m), id + index, op.Size);
 								            }
 								            context.Copy(GetVecA32(op.Qd), res);
 								        }
-												Implement Fast Paths for most A32 SIMD instructions (#952)

* Begin work on A32 SIMD Intrinsics

* More instructions, some cleanup.

* Intrinsics for Move instructions (zip etc)

These pass the existing tests.

* Intrinsics for some of Cvt

While doing this I noticed that the conversion for int/fp was incorrect
in the slow path. I'll fix this in the original repo.

* Intrinsics for more Arithmetic instructions.

* Intrinsics for Vext

* Fix VEXT Intrinsic for double words.

* Use InsertPs to move scalar values.

* Cleanup, fix VPADD.f32 and VMIN signed integer.

* Cleanup, add SSE2 support for scalar insert.

Works similarly to the IR scalar insert, but obviously this one works
directly on V128.

* Minor cleanup.

* Enable intrinsic for FP64 to integer conversion.

* Address feedback apart from splitting out intrinsic float abs

Also: bad VREV encodings as undefined rather than throwing in translation.

* Move float abs to helper, fix bug with cvt

* Rename opc2 & 3 to match A32 docs, use ArgumentOutOfRangeException appropriately.

* Get name of variable at compilation rather than string literal.

* Use correct double sign mask.

											
										
										
											2020-03-05 00:41:33 +00:00
+								        // Intrinsic Helpers
 								        public static Operand EmitMoveDoubleWordToSide(ArmEmitterContext context, Operand input, int originalV, int targetV)
 								        {
 								            Debug.Assert(input.Type == OperandType.V128);
 								            int originalSide = originalV & 1;
 								            int targetSide = targetV & 1;
 								            if (originalSide == targetSide)
 								            {
 								                return input;
 								            }
 								            if (targetSide == 1)
 								            {
 								                return context.AddIntrinsic(Intrinsic.X86Movlhps, input, input); // Low to high.
-												Add Profiled Persistent Translation Cache. (#769)

* Delete DelegateTypes.cs

* Delete DelegateCache.cs

* Add files via upload

* Update Horizon.cs

* Update Program.cs

* Update MainWindow.cs

* Update Aot.cs

* Update RelocEntry.cs

* Update Translator.cs

* Update MemoryManager.cs

* Update InstEmitMemoryHelper.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nit.

* 10 fewer MSIL bytes for us

* Add comment. Nits.

* Update Translator.cs

* Update Aot.cs

* Nits.

* Opt..

* Opt..

* Opt..

* Opt..

* Allow to change compression level.

* Update MemoryManager.cs

* Update Translator.cs

* Manage corner cases during the save phase. Nits.

* Update Aot.cs

* Translator response tweak for Aot disabled. Nit.

* Nit.

* Nits.

* Create DelegateHelpers.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nits.

* Fix due to #784.

* Fixes due to #757 & #841.

* Fix due to #846.

* Fix due to #847.

* Use MethodInfo for managed method calls.

Use IR methods instead of managed methods about Max/Min (S/U).
Follow-ups & Nits.

* Add missing exception messages.

Reintroduce slow path for Fmov_Vi.
Implement slow path for Fmov_Si.

* Switch to the new folder structure.

Nits.

* Impl. index-based relocation information. Impl. cache file version field.

* Nit.

* Address gdkchan comments.

Mainly:
- fixed cache file corruption issue on exit; - exposed a way to disable AOT on the GUI.

* Address AcK77 comment.

* Address Thealexbarney, jduncanator & emmauss comments.

Header magic, CpuId (FI) & Aot -> Ptc.

* Adaptation to the new application reloading system.

Improvements to the call system of managed methods.
Follow-ups.
Nits.

* Get the same boot times as on master when PTC is disabled.

* Profiled Aot.

* A32 support (#897).

* #975 support (1 of 2).

* #975 support (2 of 2).

* Rebase fix & nits.

* Some fixes and nits (still one bug left).

* One fix & nits.

* Tests fix (by gdk) & nits.

* Support translations not only in high quality and rejit.

Nits.

* Added possibility to skip translations and continue execution, using `ESC` key.

* Update SettingsWindow.cs

* Update GLRenderer.cs

* Update Ptc.cs

* Disabled Profiled PTC by default as requested in the past by gdk.

* Fix rejit bug. Increased number of parallel translations. Add stack unwinding stuffs support (1 of 2).

Nits.

* Add stack unwinding stuffs support (2 of 2). Tuned number of parallel translations.

* Restored the ability to assemble jumps with 8-bit offset when Profiled PTC is disabled or during profiling.

Modifications due to rebase.
Nits.

* Limited profiling of the functions to be translated to the addresses belonging to the range of static objects only.

* Nits.

* Nits.

* Update Delegates.cs

* Nit.

* Update InstEmitSimdArithmetic.cs

* Address riperiperi comments.

* Fixed the issue of unjustifiably longer boot times at the second boot than at the first boot, measured at the same time or reference point and with the same number of translated functions.

* Implemented a simple redundant load/save mechanism.

Halved the value of Decoder.MaxInstsPerFunction more appropriate for the current performance of the Translator.
Replaced by Logger.PrintError to Logger.PrintDebug in TexturePool.cs about the supposed invalid texture format to avoid the spawn of the log.
Nits.

* Nit.

Improved Logger.PrintError in TexturePool.cs to avoid log spawn.
Added missing code for FZ handling (in output) for fp max/min instructions (slow paths).

* Add configuration migration for PTC

Co-authored-by: Thog <me@thog.eu>
											
										
										
											2020-06-16 18:28:02 +00:00
+								            }
-												Implement Fast Paths for most A32 SIMD instructions (#952)

* Begin work on A32 SIMD Intrinsics

* More instructions, some cleanup.

* Intrinsics for Move instructions (zip etc)

These pass the existing tests.

* Intrinsics for some of Cvt

While doing this I noticed that the conversion for int/fp was incorrect
in the slow path. I'll fix this in the original repo.

* Intrinsics for more Arithmetic instructions.

* Intrinsics for Vext

* Fix VEXT Intrinsic for double words.

* Use InsertPs to move scalar values.

* Cleanup, fix VPADD.f32 and VMIN signed integer.

* Cleanup, add SSE2 support for scalar insert.

Works similarly to the IR scalar insert, but obviously this one works
directly on V128.

* Minor cleanup.

* Enable intrinsic for FP64 to integer conversion.

* Address feedback apart from splitting out intrinsic float abs

Also: bad VREV encodings as undefined rather than throwing in translation.

* Move float abs to helper, fix bug with cvt

* Rename opc2 & 3 to match A32 docs, use ArgumentOutOfRangeException appropriately.

* Get name of variable at compilation rather than string literal.

* Use correct double sign mask.

											
										
										
											2020-03-05 00:41:33 +00:00
+								            else
 								            {
 								                return context.AddIntrinsic(Intrinsic.X86Movhlps, input, input); // High to low.
 								            }
 								        }
 								        public static Operand EmitDoubleWordInsert(ArmEmitterContext context, Operand target, Operand value, int targetV)
 								        {
 								            Debug.Assert(target.Type == OperandType.V128 && value.Type == OperandType.V128);
 								            int targetSide = targetV & 1;
 								            int shuffleMask = 2;
 								            if (targetSide == 1)
 								            {
 								                return context.AddIntrinsic(Intrinsic.X86Shufpd, target, value, Const(shuffleMask));
-												Add Profiled Persistent Translation Cache. (#769)

* Delete DelegateTypes.cs

* Delete DelegateCache.cs

* Add files via upload

* Update Horizon.cs

* Update Program.cs

* Update MainWindow.cs

* Update Aot.cs

* Update RelocEntry.cs

* Update Translator.cs

* Update MemoryManager.cs

* Update InstEmitMemoryHelper.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nit.

* 10 fewer MSIL bytes for us

* Add comment. Nits.

* Update Translator.cs

* Update Aot.cs

* Nits.

* Opt..

* Opt..

* Opt..

* Opt..

* Allow to change compression level.

* Update MemoryManager.cs

* Update Translator.cs

* Manage corner cases during the save phase. Nits.

* Update Aot.cs

* Translator response tweak for Aot disabled. Nit.

* Nit.

* Nits.

* Create DelegateHelpers.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nits.

* Fix due to #784.

* Fixes due to #757 & #841.

* Fix due to #846.

* Fix due to #847.

* Use MethodInfo for managed method calls.

Use IR methods instead of managed methods about Max/Min (S/U).
Follow-ups & Nits.

* Add missing exception messages.

Reintroduce slow path for Fmov_Vi.
Implement slow path for Fmov_Si.

* Switch to the new folder structure.

Nits.

* Impl. index-based relocation information. Impl. cache file version field.

* Nit.

* Address gdkchan comments.

Mainly:
- fixed cache file corruption issue on exit; - exposed a way to disable AOT on the GUI.

* Address AcK77 comment.

* Address Thealexbarney, jduncanator & emmauss comments.

Header magic, CpuId (FI) & Aot -> Ptc.

* Adaptation to the new application reloading system.

Improvements to the call system of managed methods.
Follow-ups.
Nits.

* Get the same boot times as on master when PTC is disabled.

* Profiled Aot.

* A32 support (#897).

* #975 support (1 of 2).

* #975 support (2 of 2).

* Rebase fix & nits.

* Some fixes and nits (still one bug left).

* One fix & nits.

* Tests fix (by gdk) & nits.

* Support translations not only in high quality and rejit.

Nits.

* Added possibility to skip translations and continue execution, using `ESC` key.

* Update SettingsWindow.cs

* Update GLRenderer.cs

* Update Ptc.cs

* Disabled Profiled PTC by default as requested in the past by gdk.

* Fix rejit bug. Increased number of parallel translations. Add stack unwinding stuffs support (1 of 2).

Nits.

* Add stack unwinding stuffs support (2 of 2). Tuned number of parallel translations.

* Restored the ability to assemble jumps with 8-bit offset when Profiled PTC is disabled or during profiling.

Modifications due to rebase.
Nits.

* Limited profiling of the functions to be translated to the addresses belonging to the range of static objects only.

* Nits.

* Nits.

* Update Delegates.cs

* Nit.

* Update InstEmitSimdArithmetic.cs

* Address riperiperi comments.

* Fixed the issue of unjustifiably longer boot times at the second boot than at the first boot, measured at the same time or reference point and with the same number of translated functions.

* Implemented a simple redundant load/save mechanism.

Halved the value of Decoder.MaxInstsPerFunction more appropriate for the current performance of the Translator.
Replaced by Logger.PrintError to Logger.PrintDebug in TexturePool.cs about the supposed invalid texture format to avoid the spawn of the log.
Nits.

* Nit.

Improved Logger.PrintError in TexturePool.cs to avoid log spawn.
Added missing code for FZ handling (in output) for fp max/min instructions (slow paths).

* Add configuration migration for PTC

Co-authored-by: Thog <me@thog.eu>
											
										
										
											2020-06-16 18:28:02 +00:00
+								            }
-												Implement Fast Paths for most A32 SIMD instructions (#952)

* Begin work on A32 SIMD Intrinsics

* More instructions, some cleanup.

* Intrinsics for Move instructions (zip etc)

These pass the existing tests.

* Intrinsics for some of Cvt

While doing this I noticed that the conversion for int/fp was incorrect
in the slow path. I'll fix this in the original repo.

* Intrinsics for more Arithmetic instructions.

* Intrinsics for Vext

* Fix VEXT Intrinsic for double words.

* Use InsertPs to move scalar values.

* Cleanup, fix VPADD.f32 and VMIN signed integer.

* Cleanup, add SSE2 support for scalar insert.

Works similarly to the IR scalar insert, but obviously this one works
directly on V128.

* Minor cleanup.

* Enable intrinsic for FP64 to integer conversion.

* Address feedback apart from splitting out intrinsic float abs

Also: bad VREV encodings as undefined rather than throwing in translation.

* Move float abs to helper, fix bug with cvt

* Rename opc2 & 3 to match A32 docs, use ArgumentOutOfRangeException appropriately.

* Get name of variable at compilation rather than string literal.

* Use correct double sign mask.

											
										
										
											2020-03-05 00:41:33 +00:00
+								            else
 								            {
 								                return context.AddIntrinsic(Intrinsic.X86Shufpd, value, target, Const(shuffleMask));
 								            }
 								        }
 								        public static Operand EmitScalarInsert(ArmEmitterContext context, Operand target, Operand value, int reg, bool doubleWidth)
 								        {
 								            Debug.Assert(target.Type == OperandType.V128 && value.Type == OperandType.V128);
 								            // Insert from index 0 in value to index in target.
 								            int index = reg & (doubleWidth ? 1 : 3);
 								            if (doubleWidth)
 								            {
 								                if (index == 1)
 								                {
 								                    return context.AddIntrinsic(Intrinsic.X86Movlhps, target, value); // Low to high.
 								                }
 								                else
 								                {
 								                    return context.AddIntrinsic(Intrinsic.X86Shufpd, value, target, Const(2)); // Low to low, keep high from original.
 								                }
 								            }
 								            else
 								            {
 								                if (Optimizations.UseSse41)
 								                {
 								                    return context.AddIntrinsic(Intrinsic.X86Insertps, target, value, Const(index << 4));
-												Add Profiled Persistent Translation Cache. (#769)

* Delete DelegateTypes.cs

* Delete DelegateCache.cs

* Add files via upload

* Update Horizon.cs

* Update Program.cs

* Update MainWindow.cs

* Update Aot.cs

* Update RelocEntry.cs

* Update Translator.cs

* Update MemoryManager.cs

* Update InstEmitMemoryHelper.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nit.

* 10 fewer MSIL bytes for us

* Add comment. Nits.

* Update Translator.cs

* Update Aot.cs

* Nits.

* Opt..

* Opt..

* Opt..

* Opt..

* Allow to change compression level.

* Update MemoryManager.cs

* Update Translator.cs

* Manage corner cases during the save phase. Nits.

* Update Aot.cs

* Translator response tweak for Aot disabled. Nit.

* Nit.

* Nits.

* Create DelegateHelpers.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nits.

* Fix due to #784.

* Fixes due to #757 & #841.

* Fix due to #846.

* Fix due to #847.

* Use MethodInfo for managed method calls.

Use IR methods instead of managed methods about Max/Min (S/U).
Follow-ups & Nits.

* Add missing exception messages.

Reintroduce slow path for Fmov_Vi.
Implement slow path for Fmov_Si.

* Switch to the new folder structure.

Nits.

* Impl. index-based relocation information. Impl. cache file version field.

* Nit.

* Address gdkchan comments.

Mainly:
- fixed cache file corruption issue on exit; - exposed a way to disable AOT on the GUI.

* Address AcK77 comment.

* Address Thealexbarney, jduncanator & emmauss comments.

Header magic, CpuId (FI) & Aot -> Ptc.

* Adaptation to the new application reloading system.

Improvements to the call system of managed methods.
Follow-ups.
Nits.

* Get the same boot times as on master when PTC is disabled.

* Profiled Aot.

* A32 support (#897).

* #975 support (1 of 2).

* #975 support (2 of 2).

* Rebase fix & nits.

* Some fixes and nits (still one bug left).

* One fix & nits.

* Tests fix (by gdk) & nits.

* Support translations not only in high quality and rejit.

Nits.

* Added possibility to skip translations and continue execution, using `ESC` key.

* Update SettingsWindow.cs

* Update GLRenderer.cs

* Update Ptc.cs

* Disabled Profiled PTC by default as requested in the past by gdk.

* Fix rejit bug. Increased number of parallel translations. Add stack unwinding stuffs support (1 of 2).

Nits.

* Add stack unwinding stuffs support (2 of 2). Tuned number of parallel translations.

* Restored the ability to assemble jumps with 8-bit offset when Profiled PTC is disabled or during profiling.

Modifications due to rebase.
Nits.

* Limited profiling of the functions to be translated to the addresses belonging to the range of static objects only.

* Nits.

* Nits.

* Update Delegates.cs

* Nit.

* Update InstEmitSimdArithmetic.cs

* Address riperiperi comments.

* Fixed the issue of unjustifiably longer boot times at the second boot than at the first boot, measured at the same time or reference point and with the same number of translated functions.

* Implemented a simple redundant load/save mechanism.

Halved the value of Decoder.MaxInstsPerFunction more appropriate for the current performance of the Translator.
Replaced by Logger.PrintError to Logger.PrintDebug in TexturePool.cs about the supposed invalid texture format to avoid the spawn of the log.
Nits.

* Nit.

Improved Logger.PrintError in TexturePool.cs to avoid log spawn.
Added missing code for FZ handling (in output) for fp max/min instructions (slow paths).

* Add configuration migration for PTC

Co-authored-by: Thog <me@thog.eu>
											
										
										
											2020-06-16 18:28:02 +00:00
+								                }
-												Implement Fast Paths for most A32 SIMD instructions (#952)

* Begin work on A32 SIMD Intrinsics

* More instructions, some cleanup.

* Intrinsics for Move instructions (zip etc)

These pass the existing tests.

* Intrinsics for some of Cvt

While doing this I noticed that the conversion for int/fp was incorrect
in the slow path. I'll fix this in the original repo.

* Intrinsics for more Arithmetic instructions.

* Intrinsics for Vext

* Fix VEXT Intrinsic for double words.

* Use InsertPs to move scalar values.

* Cleanup, fix VPADD.f32 and VMIN signed integer.

* Cleanup, add SSE2 support for scalar insert.

Works similarly to the IR scalar insert, but obviously this one works
directly on V128.

* Minor cleanup.

* Enable intrinsic for FP64 to integer conversion.

* Address feedback apart from splitting out intrinsic float abs

Also: bad VREV encodings as undefined rather than throwing in translation.

* Move float abs to helper, fix bug with cvt

* Rename opc2 & 3 to match A32 docs, use ArgumentOutOfRangeException appropriately.

* Get name of variable at compilation rather than string literal.

* Use correct double sign mask.

											
										
										
											2020-03-05 00:41:33 +00:00
+								                else
 								                {
 								                    target = EmitSwapScalar(context, target, index, doubleWidth); // Swap value to replace into element 0.
 								                    target = context.AddIntrinsic(Intrinsic.X86Movss, target, value); // Move the value into element 0 of the vector.
 								                    return EmitSwapScalar(context, target, index, doubleWidth); // Swap new value back to the correct index.
 								                }
 								            }
 								        }
 								        public static Operand EmitSwapScalar(ArmEmitterContext context, Operand target, int reg, bool doubleWidth)
 								        {
 								            // Index into 0, 0 into index. This swap happens at the start of an A32 scalar op if required.
 								            int index = reg & (doubleWidth ? 1 : 3);
 								            if (index == 0) return target;
 								            if (doubleWidth)
 								            {
 								                int shuffleMask = 1; // Swap top and bottom. (b0 = 1, b1 = 0)
 								                return context.AddIntrinsic(Intrinsic.X86Shufpd, target, target, Const(shuffleMask));
-												Add Profiled Persistent Translation Cache. (#769)

* Delete DelegateTypes.cs

* Delete DelegateCache.cs

* Add files via upload

* Update Horizon.cs

* Update Program.cs

* Update MainWindow.cs

* Update Aot.cs

* Update RelocEntry.cs

* Update Translator.cs

* Update MemoryManager.cs

* Update InstEmitMemoryHelper.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nit.

* 10 fewer MSIL bytes for us

* Add comment. Nits.

* Update Translator.cs

* Update Aot.cs

* Nits.

* Opt..

* Opt..

* Opt..

* Opt..

* Allow to change compression level.

* Update MemoryManager.cs

* Update Translator.cs

* Manage corner cases during the save phase. Nits.

* Update Aot.cs

* Translator response tweak for Aot disabled. Nit.

* Nit.

* Nits.

* Create DelegateHelpers.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nits.

* Fix due to #784.

* Fixes due to #757 & #841.

* Fix due to #846.

* Fix due to #847.

* Use MethodInfo for managed method calls.

Use IR methods instead of managed methods about Max/Min (S/U).
Follow-ups & Nits.

* Add missing exception messages.

Reintroduce slow path for Fmov_Vi.
Implement slow path for Fmov_Si.

* Switch to the new folder structure.

Nits.

* Impl. index-based relocation information. Impl. cache file version field.

* Nit.

* Address gdkchan comments.

Mainly:
- fixed cache file corruption issue on exit; - exposed a way to disable AOT on the GUI.

* Address AcK77 comment.

* Address Thealexbarney, jduncanator & emmauss comments.

Header magic, CpuId (FI) & Aot -> Ptc.

* Adaptation to the new application reloading system.

Improvements to the call system of managed methods.
Follow-ups.
Nits.

* Get the same boot times as on master when PTC is disabled.

* Profiled Aot.

* A32 support (#897).

* #975 support (1 of 2).

* #975 support (2 of 2).

* Rebase fix & nits.

* Some fixes and nits (still one bug left).

* One fix & nits.

* Tests fix (by gdk) & nits.

* Support translations not only in high quality and rejit.

Nits.

* Added possibility to skip translations and continue execution, using `ESC` key.

* Update SettingsWindow.cs

* Update GLRenderer.cs

* Update Ptc.cs

* Disabled Profiled PTC by default as requested in the past by gdk.

* Fix rejit bug. Increased number of parallel translations. Add stack unwinding stuffs support (1 of 2).

Nits.

* Add stack unwinding stuffs support (2 of 2). Tuned number of parallel translations.

* Restored the ability to assemble jumps with 8-bit offset when Profiled PTC is disabled or during profiling.

Modifications due to rebase.
Nits.

* Limited profiling of the functions to be translated to the addresses belonging to the range of static objects only.

* Nits.

* Nits.

* Update Delegates.cs

* Nit.

* Update InstEmitSimdArithmetic.cs

* Address riperiperi comments.

* Fixed the issue of unjustifiably longer boot times at the second boot than at the first boot, measured at the same time or reference point and with the same number of translated functions.

* Implemented a simple redundant load/save mechanism.

Halved the value of Decoder.MaxInstsPerFunction more appropriate for the current performance of the Translator.
Replaced by Logger.PrintError to Logger.PrintDebug in TexturePool.cs about the supposed invalid texture format to avoid the spawn of the log.
Nits.

* Nit.

Improved Logger.PrintError in TexturePool.cs to avoid log spawn.
Added missing code for FZ handling (in output) for fp max/min instructions (slow paths).

* Add configuration migration for PTC

Co-authored-by: Thog <me@thog.eu>
											
										
										
											2020-06-16 18:28:02 +00:00
+								            }
-												Implement Fast Paths for most A32 SIMD instructions (#952)

* Begin work on A32 SIMD Intrinsics

* More instructions, some cleanup.

* Intrinsics for Move instructions (zip etc)

These pass the existing tests.

* Intrinsics for some of Cvt

While doing this I noticed that the conversion for int/fp was incorrect
in the slow path. I'll fix this in the original repo.

* Intrinsics for more Arithmetic instructions.

* Intrinsics for Vext

* Fix VEXT Intrinsic for double words.

* Use InsertPs to move scalar values.

* Cleanup, fix VPADD.f32 and VMIN signed integer.

* Cleanup, add SSE2 support for scalar insert.

Works similarly to the IR scalar insert, but obviously this one works
directly on V128.

* Minor cleanup.

* Enable intrinsic for FP64 to integer conversion.

* Address feedback apart from splitting out intrinsic float abs

Also: bad VREV encodings as undefined rather than throwing in translation.

* Move float abs to helper, fix bug with cvt

* Rename opc2 & 3 to match A32 docs, use ArgumentOutOfRangeException appropriately.

* Get name of variable at compilation rather than string literal.

* Use correct double sign mask.

											
										
										
											2020-03-05 00:41:33 +00:00
+								            else
 								            {
 								                int shuffleMask = (3 << 6) | (2 << 4) | (1 << 2) | index; // Swap index and 0. (others remain)
 								                shuffleMask &= ~(3 << (index * 2));
 								                return context.AddIntrinsic(Intrinsic.X86Shufps, target, target, Const(shuffleMask));
 								            }
 								        }
 								        // Vector Operand Templates
 								        public static void EmitVectorUnaryOpSimd32(ArmEmitterContext context, Func1I vectorFunc)
 								        {
 								            OpCode32Simd op = (OpCode32Simd)context.CurrOp;
 								            Operand m = GetVecA32(op.Qm);
 								            Operand d = GetVecA32(op.Qd);
 								            if (!op.Q) // Register swap: move relevant doubleword to destination side.
 								            {
 								                m = EmitMoveDoubleWordToSide(context, m, op.Vm, op.Vd);
 								            }
 								            Operand res = vectorFunc(m);
 								            if (!op.Q) // Register insert.
 								            {
 								                res = EmitDoubleWordInsert(context, d, res, op.Vd);
 								            }
 								            context.Copy(d, res);
 								        }
 								        public static void EmitVectorUnaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
 								        {
 								            OpCode32Simd op = (OpCode32Simd)context.CurrOp;
 								            Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;
 								            EmitVectorUnaryOpSimd32(context, (m) => context.AddIntrinsic(inst, m));
 								        }
 								        public static void EmitVectorBinaryOpSimd32(ArmEmitterContext context, Func2I vectorFunc, int side = -1)
 								        {
 								            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
 								            Operand n = GetVecA32(op.Qn);
 								            Operand m = GetVecA32(op.Qm);
 								            Operand d = GetVecA32(op.Qd);
 								            if (side == -1)
 								            {
 								                side = op.Vd;
 								            }
 								            if (!op.Q) // Register swap: move relevant doubleword to destination side.
 								            {
 								                n = EmitMoveDoubleWordToSide(context, n, op.Vn, side);
 								                m = EmitMoveDoubleWordToSide(context, m, op.Vm, side);
 								            }
 								            Operand res = vectorFunc(n, m);
 								            if (!op.Q) // Register insert.
 								            {
 								                if (side != op.Vd)
 								                {
 								                    res = EmitMoveDoubleWordToSide(context, res, side, op.Vd);
 								                }
 								                res = EmitDoubleWordInsert(context, d, res, op.Vd);
 								            }
 								            context.Copy(d, res);
 								        }
 								        public static void EmitVectorBinaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
 								        {
 								            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
 								            Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;
 								            EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(inst, n, m));
 								        }
 								        public static void EmitVectorTernaryOpSimd32(ArmEmitterContext context, Func3I vectorFunc)
 								        {
 								            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
 								            Operand n = GetVecA32(op.Qn);
 								            Operand m = GetVecA32(op.Qm);
 								            Operand d = GetVecA32(op.Qd);
 								            Operand initialD = d;
 								            if (!op.Q) // Register swap: move relevant doubleword to destination side.
 								            {
 								                n = EmitMoveDoubleWordToSide(context, n, op.Vn, op.Vd);
 								                m = EmitMoveDoubleWordToSide(context, m, op.Vm, op.Vd);
 								            }
 								            Operand res = vectorFunc(d, n, m);
 								            if (!op.Q) // Register insert.
 								            {
 								                res = EmitDoubleWordInsert(context, initialD, res, op.Vd);
 								            }
 								            context.Copy(initialD, res);
 								        }
 								        public static void EmitVectorTernaryOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2)
 								        {
 								            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
 								            Intrinsic inst1 = (op.Size & 1) != 0 ? inst64pt1 : inst32pt1;
 								            Intrinsic inst2 = (op.Size & 1) != 0 ? inst64pt2 : inst32pt2;
 								            EmitVectorTernaryOpSimd32(context, (d, n, m) =>
 								            {
 								                Operand res = context.AddIntrinsic(inst1, n, m);
 								                return res = context.AddIntrinsic(inst2, d, res);
 								            });
 								        }
 								        public static void EmitScalarUnaryOpSimd32(ArmEmitterContext context, Func1I scalarFunc)
 								        {
 								            OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;
 								            bool doubleSize = (op.Size & 1) != 0;
 								            int shift = doubleSize ? 1 : 2;
 								            Operand m = GetVecA32(op.Vm >> shift);
 								            Operand d = GetVecA32(op.Vd >> shift);
 								            m = EmitSwapScalar(context, m, op.Vm, doubleSize);
 								            Operand res = scalarFunc(m);
 								            // Insert scalar into vector.
 								            res = EmitScalarInsert(context, d, res, op.Vd, doubleSize);
 								            context.Copy(d, res);
 								        }
 								        public static void EmitScalarUnaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
 								        {
 								            OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;
 								            Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;
 								            EmitScalarUnaryOpSimd32(context, (m) => (inst == 0) ? m : context.AddIntrinsic(inst, m));
 								        }
 								        public static void EmitScalarBinaryOpSimd32(ArmEmitterContext context, Func2I scalarFunc)
 								        {
 								            OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
 								            bool doubleSize = (op.Size & 1) != 0;
 								            int shift = doubleSize ? 1 : 2;
 								            Operand n = GetVecA32(op.Vn >> shift);
 								            Operand m = GetVecA32(op.Vm >> shift);
 								            Operand d = GetVecA32(op.Vd >> shift);
 								            n = EmitSwapScalar(context, n, op.Vn, doubleSize);
 								            m = EmitSwapScalar(context, m, op.Vm, doubleSize);
 								            Operand res = scalarFunc(n, m);
 								            // Insert scalar into vector.
 								            res = EmitScalarInsert(context, d, res, op.Vd, doubleSize);
 								            context.Copy(d, res);
 								        }
 								        public static void EmitScalarBinaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
 								        {
 								            OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
 								            Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;
 								            EmitScalarBinaryOpSimd32(context, (n, m) =>  context.AddIntrinsic(inst, n, m));
 								        }
 								        public static void EmitScalarTernaryOpSimd32(ArmEmitterContext context, Func3I scalarFunc)
 								        {
 								            OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
 								            bool doubleSize = (op.Size & 1) != 0;
 								            int shift = doubleSize ? 1 : 2;
 								            Operand n = GetVecA32(op.Vn >> shift);
 								            Operand m = GetVecA32(op.Vm >> shift);
 								            Operand d = GetVecA32(op.Vd >> shift);
 								            Operand initialD = d;
 								            n = EmitSwapScalar(context, n, op.Vn, doubleSize);
 								            m = EmitSwapScalar(context, m, op.Vm, doubleSize);
 								            d = EmitSwapScalar(context, d, op.Vd, doubleSize);
 								            Operand res = scalarFunc(d, n, m);
 								            // Insert scalar into vector.
 								            res = EmitScalarInsert(context, initialD, res, op.Vd, doubleSize);
 								            context.Copy(initialD, res);
 								        }
 								        public static void EmitScalarTernaryOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2)
 								        {
 								            OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
 								            bool doubleSize = (op.Size & 1) != 0;
 								            int shift = doubleSize ? 1 : 2;
 								            Intrinsic inst1 = doubleSize ? inst64pt1 : inst32pt1;
 								            Intrinsic inst2 = doubleSize ? inst64pt2 : inst32pt2;
 								            EmitScalarTernaryOpSimd32(context, (d, n, m) =>
 								            {
 								                Operand res = context.AddIntrinsic(inst1, n, m);
 								                return context.AddIntrinsic(inst2, d, res);
 								            });
 								        }
 								        // By Scalar
 								        public static void EmitVectorByScalarOpSimd32(ArmEmitterContext context, Func2I vectorFunc)
 								        {
 								            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
 								            Operand n = GetVecA32(op.Qn);
 								            Operand d = GetVecA32(op.Qd);
 								            int index = op.Vm & 3;
 								            int dupeMask = (index << 6) | (index << 4) | (index << 2) | index;
 								            Operand m = GetVecA32(op.Vm >> 2);
 								            m = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(dupeMask));
 								            if (!op.Q) // Register swap: move relevant doubleword to destination side.
 								            {
 								                n = EmitMoveDoubleWordToSide(context, n, op.Vn, op.Vd);
 								            }
 								            Operand res = vectorFunc(n, m);
 								            if (!op.Q) // Register insert.
 								            {
 								                res = EmitDoubleWordInsert(context, d, res, op.Vd);
 								            }
 								            context.Copy(d, res);
 								        }
 								        public static void EmitVectorByScalarOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
 								        {
 								            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
 								            Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;
 								            EmitVectorByScalarOpSimd32(context, (n, m) => context.AddIntrinsic(inst, n, m));
 								        }
 								        public static void EmitVectorsByScalarOpSimd32(ArmEmitterContext context, Func3I vectorFunc)
 								        {
 								            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
 								            Operand n = GetVecA32(op.Qn);
 								            Operand d = GetVecA32(op.Qd);
 								            Operand initialD = d;
 								            int index = op.Vm & 3;
 								            int dupeMask = (index << 6) | (index << 4) | (index << 2) | index;
 								            Operand m = GetVecA32(op.Vm >> 2);
 								            m = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(dupeMask));
 								            if (!op.Q) // Register swap: move relevant doubleword to destination side.
 								            {
 								                n = EmitMoveDoubleWordToSide(context, n, op.Vn, op.Vd);
 								            }
 								            Operand res = vectorFunc(d, n, m);
 								            if (!op.Q) // Register insert.
 								            {
 								                res = EmitDoubleWordInsert(context, initialD, res, op.Vd);
 								            }
 								            context.Copy(initialD, res);
 								        }
 								        public static void EmitVectorsByScalarOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2)
 								        {
 								            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
 								            Intrinsic inst1 = (op.Size & 1) != 0 ? inst64pt1 : inst32pt1;
 								            Intrinsic inst2 = (op.Size & 1) != 0 ? inst64pt2 : inst32pt2;
 								            EmitVectorsByScalarOpSimd32(context, (d, n, m) =>
 								            {
 								                Operand res = context.AddIntrinsic(inst1, n, m);
 								                return res = context.AddIntrinsic(inst2, d, res);
 								            });
 								        }
 								        // Pairwise
 								        public static void EmitSse2VectorPairwiseOpF32(ArmEmitterContext context, Intrinsic inst32)
 								        {
 								            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
 								            EmitVectorBinaryOpSimd32(context, (n, m) =>
 								            {
 								                Operand unpck = context.AddIntrinsic(Intrinsic.X86Unpcklps, n, m);
 								                Operand part0 = unpck;
 								                Operand part1 = context.AddIntrinsic(Intrinsic.X86Movhlps, unpck, unpck);
 								                return context.AddIntrinsic(inst32, part0, part1);
 								            }, 0);
 								        }
 								        public static void EmitSsse3VectorPairwiseOp32(ArmEmitterContext context, Intrinsic[] inst)
 								        {
 								            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
 								            EmitVectorBinaryOpSimd32(context, (n, m) =>
 								            {
 								                if (op.RegisterSize == RegisterSize.Simd64)
 								                {
 								                    Operand zeroEvenMask = X86GetElements(context, ZeroMask, EvenMasks[op.Size]);
 								                    Operand zeroOddMask = X86GetElements(context, ZeroMask, OddMasks[op.Size]);
 								                    Operand mN = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m); // m:n
 								                    Operand left = context.AddIntrinsic(Intrinsic.X86Pshufb, mN, zeroEvenMask); // 0:even from m:n
 								                    Operand right = context.AddIntrinsic(Intrinsic.X86Pshufb, mN, zeroOddMask); // 0:odd  from m:n
 								                    return context.AddIntrinsic(inst[op.Size], left, right);
 								                }
 								                else if (op.Size < 3)
 								                {
 								                    Operand oddEvenMask = X86GetElements(context, OddMasks[op.Size], EvenMasks[op.Size]);
 								                    Operand oddEvenN = context.AddIntrinsic(Intrinsic.X86Pshufb, n, oddEvenMask); // odd:even from n
 								                    Operand oddEvenM = context.AddIntrinsic(Intrinsic.X86Pshufb, m, oddEvenMask); // odd:even from m
 								                    Operand left = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, oddEvenN, oddEvenM);
 								                    Operand right = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, oddEvenN, oddEvenM);
 								                    return context.AddIntrinsic(inst[op.Size], left, right);
 								                }
 								                else
 								                {
 								                    Operand left = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m);
 								                    Operand right = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, n, m);
 								                    return context.AddIntrinsic(inst[3], left, right);
 								                }
 								            }, 0);
 								        }
-												Add most of the A32 instruction set to ARMeilleure (#897)

* Implement TEQ and MOV (Imm16)

* Initial work on A32 instructions + SVC. No tests yet, hangs in rtld.

* Implement CLZ, fix BFI and BFC

Now stops on SIMD initialization.

* Exclusive access instructions, fix to mul, system instructions.

Now gets to a break after SignalProcessWideKey64.

* Better impl of UBFX, add UDIV and SDIV

Now boots way further - now stuck on VMOV instruction.

* Many more instructions, start on SIMD and testing framework.

* Fix build issues

* svc: Rework 32 bit codepath

Fixing once and for all argument ordering issues.

* Fix 32 bits stacktrace

* hle debug: Add 32 bits dynamic section parsing

* Fix highCq mode, add many tests, fix some instruction bugs

Still suffers from critical malloc failure :weary:

* Fix incorrect opcode decoders and a few more instructions.

* Add a few instructions and fix others. re-disable highCq for now.

Disabled the svc memory clear since i'm not sure about it.

* Fix build

* Fix typo in ordered/exclusive stores.

* Implement some more instructions, fix others.

Uxtab16/Sxtab16 are untested.

* Begin impl of pairwise, some other instructions.

* Add a few more instructions, a quick hack to fix svcs for now.

* Add tests and fix issues with VTRN, VZIP, VUZP

* Add a few more instructions, fix Vmul_1 encoding.

* Fix way too many instruction bugs, add tests for some of the more important ones.

* Fix HighCq, enable FastFP paths for some floating point instructions

(not entirely sure why these were disabled, so important to note this
commit exists)

Branching has been removed in A32 shifts until I figure out if it's
worth it

* Cleanup Part 1

There should be no functional change between these next few commits.
Should is the key word. (except for removing break handler)

* Implement 32 bits syscalls

Co-authored-by: riperiperi <rhy3756547@hotmail.com>

Implement all 32 bits counterparts of the 64 bits syscalls we currently
have.

* Refactor part 2: Move index/subindex logic to Operand

May have inadvertently fixed one (1) bug

* Add FlushProcessDataCache32

* Address jd's comments

* Remove 16 bit encodings from OpCodeTable

Still need to catch some edge cases (operands that use the "F" flag) and
make Q encodings with non-even indexes undefined.

* Correct Fpscr handling for FP vector slow paths

WIP

* Add StandardFPSCRValue behaviour for all Arithmetic instructions

* Add StandardFPSCRValue behaviour to compare instructions.

* Force passing of fpcr to FPProcessException and FPUnpack.

Reduces potential for code error significantly

* OpCode cleanup

* Remove urgency from DMB comment in MRRC

DMB is currently a no-op via the instruction, so it should likely still
be a no-op here.

* Test Cleanup

* Fix FPDefaultNaN on Ryzen CPUs

* Improve some tests, fix some shift instructions, add slow path for Vadd

* Fix Typo

* More test cleanup

* Flip order of Fx and index, to indicate that the operand's is the "base"

* Remove Simd32 register type, use Int32 and Int64 for scalars like A64 does.

* Reintroduce alignment to DecoderHelper (removed by accident)

* One more realign as reading diffs is hard

* Use I32 registers in A32 (part 2)

Swap default integer register type based on current execution mode.

* FPSCR flags as Registers (part 1)

Still need to change NativeContext and ExecutionContext to allow
getting/setting with the flag values.

* Use I32 registers in A32 (part 1)

* FPSCR flags as registers (part 2)

Only CMP flags are on the registers right now. It could be useful to use
more of the space in non-fast-float when implementing A32 flags
accurately in the fast path.

* Address Feedback

* Correct FP->Int behaviour (should saturate)

* Make branches made by writing to PC eligible for Rejit

Greatly improves performance in most games.

* Remove unused branching for Vtbl

* RejitRequest as a class rather than a tuple

Makes a lot more sense than storing tuples on a dictionary.

* Add VMOVN, VSHR (imm), VSHRN (imm) and related tests

* Re-order InstEmitSystem32

Alphabetical sorting.

* Address Feedback

Feedback from Ac_K, remove and sort usings.

* Address Feedback 2

* Address Feedback from LDj3SNuD

Opcode table reordered to have alphabetical sorting within groups,
Vmaxnm and Vminnm have split names to be less ambiguous, SoftFloat nits,
Test nits and Test simplification with ValueSource.

* Add Debug Asserts to A32 helpers

Mainly to prevent the shift ones from being used on I64 operands, as
they expect I32 input for most operations (eg. carry flag setting), and
expect I32 input for shift and boolean amounts. Most other helper
functions don't take Operands, throw on out of range values, and take
specific types of OpCode, so didn't need any asserts.

* Use ConstF rather than creating an operand.

(useful for pooling in future)

* Move exclusive load to helper, reference call flag rather than literal 1.

* Address LDj feedback (minus table flatten)

one final look before it's all gone. the world is so beautiful.

* Flatten OpCodeTable

oh no

* Address more table ordering

* Call Flag as int on A32

Co-authored-by: Natalie C. <cyuubiapps@gmail.com>
Co-authored-by: Thog <thog@protonmail.com>

											
										
										
											2020-02-23 21:20:40 +00:00
+								        // Generic Functions
-												Add Profiled Persistent Translation Cache. (#769)

* Delete DelegateTypes.cs

* Delete DelegateCache.cs

* Add files via upload

* Update Horizon.cs

* Update Program.cs

* Update MainWindow.cs

* Update Aot.cs

* Update RelocEntry.cs

* Update Translator.cs

* Update MemoryManager.cs

* Update InstEmitMemoryHelper.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nit.

* 10 fewer MSIL bytes for us

* Add comment. Nits.

* Update Translator.cs

* Update Aot.cs

* Nits.

* Opt..

* Opt..

* Opt..

* Opt..

* Allow to change compression level.

* Update MemoryManager.cs

* Update Translator.cs

* Manage corner cases during the save phase. Nits.

* Update Aot.cs

* Translator response tweak for Aot disabled. Nit.

* Nit.

* Nits.

* Create DelegateHelpers.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nits.

* Fix due to #784.

* Fixes due to #757 & #841.

* Fix due to #846.

* Fix due to #847.

* Use MethodInfo for managed method calls.

Use IR methods instead of managed methods about Max/Min (S/U).
Follow-ups & Nits.

* Add missing exception messages.

Reintroduce slow path for Fmov_Vi.
Implement slow path for Fmov_Si.

* Switch to the new folder structure.

Nits.

* Impl. index-based relocation information. Impl. cache file version field.

* Nit.

* Address gdkchan comments.

Mainly:
- fixed cache file corruption issue on exit; - exposed a way to disable AOT on the GUI.

* Address AcK77 comment.

* Address Thealexbarney, jduncanator & emmauss comments.

Header magic, CpuId (FI) & Aot -> Ptc.

* Adaptation to the new application reloading system.

Improvements to the call system of managed methods.
Follow-ups.
Nits.

* Get the same boot times as on master when PTC is disabled.

* Profiled Aot.

* A32 support (#897).

* #975 support (1 of 2).

* #975 support (2 of 2).

* Rebase fix & nits.

* Some fixes and nits (still one bug left).

* One fix & nits.

* Tests fix (by gdk) & nits.

* Support translations not only in high quality and rejit.

Nits.

* Added possibility to skip translations and continue execution, using `ESC` key.

* Update SettingsWindow.cs

* Update GLRenderer.cs

* Update Ptc.cs

* Disabled Profiled PTC by default as requested in the past by gdk.

* Fix rejit bug. Increased number of parallel translations. Add stack unwinding stuffs support (1 of 2).

Nits.

* Add stack unwinding stuffs support (2 of 2). Tuned number of parallel translations.

* Restored the ability to assemble jumps with 8-bit offset when Profiled PTC is disabled or during profiling.

Modifications due to rebase.
Nits.

* Limited profiling of the functions to be translated to the addresses belonging to the range of static objects only.

* Nits.

* Nits.

* Update Delegates.cs

* Nit.

* Update InstEmitSimdArithmetic.cs

* Address riperiperi comments.

* Fixed the issue of unjustifiably longer boot times at the second boot than at the first boot, measured at the same time or reference point and with the same number of translated functions.

* Implemented a simple redundant load/save mechanism.

Halved the value of Decoder.MaxInstsPerFunction more appropriate for the current performance of the Translator.
Replaced by Logger.PrintError to Logger.PrintDebug in TexturePool.cs about the supposed invalid texture format to avoid the spawn of the log.
Nits.

* Nit.

Improved Logger.PrintError in TexturePool.cs to avoid log spawn.
Added missing code for FZ handling (in output) for fp max/min instructions (slow paths).

* Add configuration migration for PTC

Co-authored-by: Thog <me@thog.eu>
											
										
										
											2020-06-16 18:28:02 +00:00
+								        public static Operand EmitSoftFloatCallDefaultFpscr(ArmEmitterContext context, string name, params Operand[] callArgs)
-												Add most of the A32 instruction set to ARMeilleure (#897)

* Implement TEQ and MOV (Imm16)

* Initial work on A32 instructions + SVC. No tests yet, hangs in rtld.

* Implement CLZ, fix BFI and BFC

Now stops on SIMD initialization.

* Exclusive access instructions, fix to mul, system instructions.

Now gets to a break after SignalProcessWideKey64.

* Better impl of UBFX, add UDIV and SDIV

Now boots way further - now stuck on VMOV instruction.

* Many more instructions, start on SIMD and testing framework.

* Fix build issues

* svc: Rework 32 bit codepath

Fixing once and for all argument ordering issues.

* Fix 32 bits stacktrace

* hle debug: Add 32 bits dynamic section parsing

* Fix highCq mode, add many tests, fix some instruction bugs

Still suffers from critical malloc failure :weary:

* Fix incorrect opcode decoders and a few more instructions.

* Add a few instructions and fix others. re-disable highCq for now.

Disabled the svc memory clear since i'm not sure about it.

* Fix build

* Fix typo in ordered/exclusive stores.

* Implement some more instructions, fix others.

Uxtab16/Sxtab16 are untested.

* Begin impl of pairwise, some other instructions.

* Add a few more instructions, a quick hack to fix svcs for now.

* Add tests and fix issues with VTRN, VZIP, VUZP

* Add a few more instructions, fix Vmul_1 encoding.

* Fix way too many instruction bugs, add tests for some of the more important ones.

* Fix HighCq, enable FastFP paths for some floating point instructions

(not entirely sure why these were disabled, so important to note this
commit exists)

Branching has been removed in A32 shifts until I figure out if it's
worth it

* Cleanup Part 1

There should be no functional change between these next few commits.
Should is the key word. (except for removing break handler)

* Implement 32 bits syscalls

Co-authored-by: riperiperi <rhy3756547@hotmail.com>

Implement all 32 bits counterparts of the 64 bits syscalls we currently
have.

* Refactor part 2: Move index/subindex logic to Operand

May have inadvertently fixed one (1) bug

* Add FlushProcessDataCache32

* Address jd's comments

* Remove 16 bit encodings from OpCodeTable

Still need to catch some edge cases (operands that use the "F" flag) and
make Q encodings with non-even indexes undefined.

* Correct Fpscr handling for FP vector slow paths

WIP

* Add StandardFPSCRValue behaviour for all Arithmetic instructions

* Add StandardFPSCRValue behaviour to compare instructions.

* Force passing of fpcr to FPProcessException and FPUnpack.

Reduces potential for code error significantly

* OpCode cleanup

* Remove urgency from DMB comment in MRRC

DMB is currently a no-op via the instruction, so it should likely still
be a no-op here.

* Test Cleanup

* Fix FPDefaultNaN on Ryzen CPUs

* Improve some tests, fix some shift instructions, add slow path for Vadd

* Fix Typo

* More test cleanup

* Flip order of Fx and index, to indicate that the operand's is the "base"

* Remove Simd32 register type, use Int32 and Int64 for scalars like A64 does.

* Reintroduce alignment to DecoderHelper (removed by accident)

* One more realign as reading diffs is hard

* Use I32 registers in A32 (part 2)

Swap default integer register type based on current execution mode.

* FPSCR flags as Registers (part 1)

Still need to change NativeContext and ExecutionContext to allow
getting/setting with the flag values.

* Use I32 registers in A32 (part 1)

* FPSCR flags as registers (part 2)

Only CMP flags are on the registers right now. It could be useful to use
more of the space in non-fast-float when implementing A32 flags
accurately in the fast path.

* Address Feedback

* Correct FP->Int behaviour (should saturate)

* Make branches made by writing to PC eligible for Rejit

Greatly improves performance in most games.

* Remove unused branching for Vtbl

* RejitRequest as a class rather than a tuple

Makes a lot more sense than storing tuples on a dictionary.

* Add VMOVN, VSHR (imm), VSHRN (imm) and related tests

* Re-order InstEmitSystem32

Alphabetical sorting.

* Address Feedback

Feedback from Ac_K, remove and sort usings.

* Address Feedback 2

* Address Feedback from LDj3SNuD

Opcode table reordered to have alphabetical sorting within groups,
Vmaxnm and Vminnm have split names to be less ambiguous, SoftFloat nits,
Test nits and Test simplification with ValueSource.

* Add Debug Asserts to A32 helpers

Mainly to prevent the shift ones from being used on I64 operands, as
they expect I32 input for most operations (eg. carry flag setting), and
expect I32 input for shift and boolean amounts. Most other helper
functions don't take Operands, throw on out of range values, and take
specific types of OpCode, so didn't need any asserts.

* Use ConstF rather than creating an operand.

(useful for pooling in future)

* Move exclusive load to helper, reference call flag rather than literal 1.

* Address LDj feedback (minus table flatten)

one final look before it's all gone. the world is so beautiful.

* Flatten OpCodeTable

oh no

* Address more table ordering

* Call Flag as int on A32

Co-authored-by: Natalie C. <cyuubiapps@gmail.com>
Co-authored-by: Thog <thog@protonmail.com>

											
										
										
											2020-02-23 21:20:40 +00:00
+								        {
 								            IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;
-												Add Profiled Persistent Translation Cache. (#769)

* Delete DelegateTypes.cs

* Delete DelegateCache.cs

* Add files via upload

* Update Horizon.cs

* Update Program.cs

* Update MainWindow.cs

* Update Aot.cs

* Update RelocEntry.cs

* Update Translator.cs

* Update MemoryManager.cs

* Update InstEmitMemoryHelper.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nit.

* 10 fewer MSIL bytes for us

* Add comment. Nits.

* Update Translator.cs

* Update Aot.cs

* Nits.

* Opt..

* Opt..

* Opt..

* Opt..

* Allow to change compression level.

* Update MemoryManager.cs

* Update Translator.cs

* Manage corner cases during the save phase. Nits.

* Update Aot.cs

* Translator response tweak for Aot disabled. Nit.

* Nit.

* Nits.

* Create DelegateHelpers.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nits.

* Fix due to #784.

* Fixes due to #757 & #841.

* Fix due to #846.

* Fix due to #847.

* Use MethodInfo for managed method calls.

Use IR methods instead of managed methods about Max/Min (S/U).
Follow-ups & Nits.

* Add missing exception messages.

Reintroduce slow path for Fmov_Vi.
Implement slow path for Fmov_Si.

* Switch to the new folder structure.

Nits.

* Impl. index-based relocation information. Impl. cache file version field.

* Nit.

* Address gdkchan comments.

Mainly:
- fixed cache file corruption issue on exit; - exposed a way to disable AOT on the GUI.

* Address AcK77 comment.

* Address Thealexbarney, jduncanator & emmauss comments.

Header magic, CpuId (FI) & Aot -> Ptc.

* Adaptation to the new application reloading system.

Improvements to the call system of managed methods.
Follow-ups.
Nits.

* Get the same boot times as on master when PTC is disabled.

* Profiled Aot.

* A32 support (#897).

* #975 support (1 of 2).

* #975 support (2 of 2).

* Rebase fix & nits.

* Some fixes and nits (still one bug left).

* One fix & nits.

* Tests fix (by gdk) & nits.

* Support translations not only in high quality and rejit.

Nits.

* Added possibility to skip translations and continue execution, using `ESC` key.

* Update SettingsWindow.cs

* Update GLRenderer.cs

* Update Ptc.cs

* Disabled Profiled PTC by default as requested in the past by gdk.

* Fix rejit bug. Increased number of parallel translations. Add stack unwinding stuffs support (1 of 2).

Nits.

* Add stack unwinding stuffs support (2 of 2). Tuned number of parallel translations.

* Restored the ability to assemble jumps with 8-bit offset when Profiled PTC is disabled or during profiling.

Modifications due to rebase.
Nits.

* Limited profiling of the functions to be translated to the addresses belonging to the range of static objects only.

* Nits.

* Nits.

* Update Delegates.cs

* Nit.

* Update InstEmitSimdArithmetic.cs

* Address riperiperi comments.

* Fixed the issue of unjustifiably longer boot times at the second boot than at the first boot, measured at the same time or reference point and with the same number of translated functions.

* Implemented a simple redundant load/save mechanism.

Halved the value of Decoder.MaxInstsPerFunction more appropriate for the current performance of the Translator.
Replaced by Logger.PrintError to Logger.PrintDebug in TexturePool.cs about the supposed invalid texture format to avoid the spawn of the log.
Nits.

* Nit.

Improved Logger.PrintError in TexturePool.cs to avoid log spawn.
Added missing code for FZ handling (in output) for fp max/min instructions (slow paths).

* Add configuration migration for PTC

Co-authored-by: Thog <me@thog.eu>
											
										
										
											2020-06-16 18:28:02 +00:00
+								            MethodInfo info = (op.Size & 1) == 0
 								                ? typeof(SoftFloat32).GetMethod(name)
 								                : typeof(SoftFloat64).GetMethod(name);
-												Add most of the A32 instruction set to ARMeilleure (#897)

* Implement TEQ and MOV (Imm16)

* Initial work on A32 instructions + SVC. No tests yet, hangs in rtld.

* Implement CLZ, fix BFI and BFC

Now stops on SIMD initialization.

* Exclusive access instructions, fix to mul, system instructions.

Now gets to a break after SignalProcessWideKey64.

* Better impl of UBFX, add UDIV and SDIV

Now boots way further - now stuck on VMOV instruction.

* Many more instructions, start on SIMD and testing framework.

* Fix build issues

* svc: Rework 32 bit codepath

Fixing once and for all argument ordering issues.

* Fix 32 bits stacktrace

* hle debug: Add 32 bits dynamic section parsing

* Fix highCq mode, add many tests, fix some instruction bugs

Still suffers from critical malloc failure :weary:

* Fix incorrect opcode decoders and a few more instructions.

* Add a few instructions and fix others. re-disable highCq for now.

Disabled the svc memory clear since i'm not sure about it.

* Fix build

* Fix typo in ordered/exclusive stores.

* Implement some more instructions, fix others.

Uxtab16/Sxtab16 are untested.

* Begin impl of pairwise, some other instructions.

* Add a few more instructions, a quick hack to fix svcs for now.

* Add tests and fix issues with VTRN, VZIP, VUZP

* Add a few more instructions, fix Vmul_1 encoding.

* Fix way too many instruction bugs, add tests for some of the more important ones.

* Fix HighCq, enable FastFP paths for some floating point instructions

(not entirely sure why these were disabled, so important to note this
commit exists)

Branching has been removed in A32 shifts until I figure out if it's
worth it

* Cleanup Part 1

There should be no functional change between these next few commits.
Should is the key word. (except for removing break handler)

* Implement 32 bits syscalls

Co-authored-by: riperiperi <rhy3756547@hotmail.com>

Implement all 32 bits counterparts of the 64 bits syscalls we currently
have.

* Refactor part 2: Move index/subindex logic to Operand

May have inadvertently fixed one (1) bug

* Add FlushProcessDataCache32

* Address jd's comments

* Remove 16 bit encodings from OpCodeTable

Still need to catch some edge cases (operands that use the "F" flag) and
make Q encodings with non-even indexes undefined.

* Correct Fpscr handling for FP vector slow paths

WIP

* Add StandardFPSCRValue behaviour for all Arithmetic instructions

* Add StandardFPSCRValue behaviour to compare instructions.

* Force passing of fpcr to FPProcessException and FPUnpack.

Reduces potential for code error significantly

* OpCode cleanup

* Remove urgency from DMB comment in MRRC

DMB is currently a no-op via the instruction, so it should likely still
be a no-op here.

* Test Cleanup

* Fix FPDefaultNaN on Ryzen CPUs

* Improve some tests, fix some shift instructions, add slow path for Vadd

* Fix Typo

* More test cleanup

* Flip order of Fx and index, to indicate that the operand's is the "base"

* Remove Simd32 register type, use Int32 and Int64 for scalars like A64 does.

* Reintroduce alignment to DecoderHelper (removed by accident)

* One more realign as reading diffs is hard

* Use I32 registers in A32 (part 2)

Swap default integer register type based on current execution mode.

* FPSCR flags as Registers (part 1)

Still need to change NativeContext and ExecutionContext to allow
getting/setting with the flag values.

* Use I32 registers in A32 (part 1)

* FPSCR flags as registers (part 2)

Only CMP flags are on the registers right now. It could be useful to use
more of the space in non-fast-float when implementing A32 flags
accurately in the fast path.

* Address Feedback

* Correct FP->Int behaviour (should saturate)

* Make branches made by writing to PC eligible for Rejit

Greatly improves performance in most games.

* Remove unused branching for Vtbl

* RejitRequest as a class rather than a tuple

Makes a lot more sense than storing tuples on a dictionary.

* Add VMOVN, VSHR (imm), VSHRN (imm) and related tests

* Re-order InstEmitSystem32

Alphabetical sorting.

* Address Feedback

Feedback from Ac_K, remove and sort usings.

* Address Feedback 2

* Address Feedback from LDj3SNuD

Opcode table reordered to have alphabetical sorting within groups,
Vmaxnm and Vminnm have split names to be less ambiguous, SoftFloat nits,
Test nits and Test simplification with ValueSource.

* Add Debug Asserts to A32 helpers

Mainly to prevent the shift ones from being used on I64 operands, as
they expect I32 input for most operations (eg. carry flag setting), and
expect I32 input for shift and boolean amounts. Most other helper
functions don't take Operands, throw on out of range values, and take
specific types of OpCode, so didn't need any asserts.

* Use ConstF rather than creating an operand.

(useful for pooling in future)

* Move exclusive load to helper, reference call flag rather than literal 1.

* Address LDj feedback (minus table flatten)

one final look before it's all gone. the world is so beautiful.

* Flatten OpCodeTable

oh no

* Address more table ordering

* Call Flag as int on A32

Co-authored-by: Natalie C. <cyuubiapps@gmail.com>
Co-authored-by: Thog <thog@protonmail.com>

											
										
										
											2020-02-23 21:20:40 +00:00
 								            Array.Resize(ref callArgs, callArgs.Length + 1);
 								            callArgs[callArgs.Length - 1] = Const(1);
-												Add Profiled Persistent Translation Cache. (#769)

* Delete DelegateTypes.cs

* Delete DelegateCache.cs

* Add files via upload

* Update Horizon.cs

* Update Program.cs

* Update MainWindow.cs

* Update Aot.cs

* Update RelocEntry.cs

* Update Translator.cs

* Update MemoryManager.cs

* Update InstEmitMemoryHelper.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nit.

* 10 fewer MSIL bytes for us

* Add comment. Nits.

* Update Translator.cs

* Update Aot.cs

* Nits.

* Opt..

* Opt..

* Opt..

* Opt..

* Allow to change compression level.

* Update MemoryManager.cs

* Update Translator.cs

* Manage corner cases during the save phase. Nits.

* Update Aot.cs

* Translator response tweak for Aot disabled. Nit.

* Nit.

* Nits.

* Create DelegateHelpers.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nits.

* Fix due to #784.

* Fixes due to #757 & #841.

* Fix due to #846.

* Fix due to #847.

* Use MethodInfo for managed method calls.

Use IR methods instead of managed methods about Max/Min (S/U).
Follow-ups & Nits.

* Add missing exception messages.

Reintroduce slow path for Fmov_Vi.
Implement slow path for Fmov_Si.

* Switch to the new folder structure.

Nits.

* Impl. index-based relocation information. Impl. cache file version field.

* Nit.

* Address gdkchan comments.

Mainly:
- fixed cache file corruption issue on exit; - exposed a way to disable AOT on the GUI.

* Address AcK77 comment.

* Address Thealexbarney, jduncanator & emmauss comments.

Header magic, CpuId (FI) & Aot -> Ptc.

* Adaptation to the new application reloading system.

Improvements to the call system of managed methods.
Follow-ups.
Nits.

* Get the same boot times as on master when PTC is disabled.

* Profiled Aot.

* A32 support (#897).

* #975 support (1 of 2).

* #975 support (2 of 2).

* Rebase fix & nits.

* Some fixes and nits (still one bug left).

* One fix & nits.

* Tests fix (by gdk) & nits.

* Support translations not only in high quality and rejit.

Nits.

* Added possibility to skip translations and continue execution, using `ESC` key.

* Update SettingsWindow.cs

* Update GLRenderer.cs

* Update Ptc.cs

* Disabled Profiled PTC by default as requested in the past by gdk.

* Fix rejit bug. Increased number of parallel translations. Add stack unwinding stuffs support (1 of 2).

Nits.

* Add stack unwinding stuffs support (2 of 2). Tuned number of parallel translations.

* Restored the ability to assemble jumps with 8-bit offset when Profiled PTC is disabled or during profiling.

Modifications due to rebase.
Nits.

* Limited profiling of the functions to be translated to the addresses belonging to the range of static objects only.

* Nits.

* Nits.

* Update Delegates.cs

* Nit.

* Update InstEmitSimdArithmetic.cs

* Address riperiperi comments.

* Fixed the issue of unjustifiably longer boot times at the second boot than at the first boot, measured at the same time or reference point and with the same number of translated functions.

* Implemented a simple redundant load/save mechanism.

Halved the value of Decoder.MaxInstsPerFunction more appropriate for the current performance of the Translator.
Replaced by Logger.PrintError to Logger.PrintDebug in TexturePool.cs about the supposed invalid texture format to avoid the spawn of the log.
Nits.

* Nit.

Improved Logger.PrintError in TexturePool.cs to avoid log spawn.
Added missing code for FZ handling (in output) for fp max/min instructions (slow paths).

* Add configuration migration for PTC

Co-authored-by: Thog <me@thog.eu>
											
										
										
											2020-06-16 18:28:02 +00:00
+								            return context.Call(info, callArgs);
-												Add most of the A32 instruction set to ARMeilleure (#897)

* Implement TEQ and MOV (Imm16)

* Initial work on A32 instructions + SVC. No tests yet, hangs in rtld.

* Implement CLZ, fix BFI and BFC

Now stops on SIMD initialization.

* Exclusive access instructions, fix to mul, system instructions.

Now gets to a break after SignalProcessWideKey64.

* Better impl of UBFX, add UDIV and SDIV

Now boots way further - now stuck on VMOV instruction.

* Many more instructions, start on SIMD and testing framework.

* Fix build issues

* svc: Rework 32 bit codepath

Fixing once and for all argument ordering issues.

* Fix 32 bits stacktrace

* hle debug: Add 32 bits dynamic section parsing

* Fix highCq mode, add many tests, fix some instruction bugs

Still suffers from critical malloc failure :weary:

* Fix incorrect opcode decoders and a few more instructions.

* Add a few instructions and fix others. re-disable highCq for now.

Disabled the svc memory clear since i'm not sure about it.

* Fix build

* Fix typo in ordered/exclusive stores.

* Implement some more instructions, fix others.

Uxtab16/Sxtab16 are untested.

* Begin impl of pairwise, some other instructions.

* Add a few more instructions, a quick hack to fix svcs for now.

* Add tests and fix issues with VTRN, VZIP, VUZP

* Add a few more instructions, fix Vmul_1 encoding.

* Fix way too many instruction bugs, add tests for some of the more important ones.

* Fix HighCq, enable FastFP paths for some floating point instructions

(not entirely sure why these were disabled, so important to note this
commit exists)

Branching has been removed in A32 shifts until I figure out if it's
worth it

* Cleanup Part 1

There should be no functional change between these next few commits.
Should is the key word. (except for removing break handler)

* Implement 32 bits syscalls

Co-authored-by: riperiperi <rhy3756547@hotmail.com>

Implement all 32 bits counterparts of the 64 bits syscalls we currently
have.

* Refactor part 2: Move index/subindex logic to Operand

May have inadvertently fixed one (1) bug

* Add FlushProcessDataCache32

* Address jd's comments

* Remove 16 bit encodings from OpCodeTable

Still need to catch some edge cases (operands that use the "F" flag) and
make Q encodings with non-even indexes undefined.

* Correct Fpscr handling for FP vector slow paths

WIP

* Add StandardFPSCRValue behaviour for all Arithmetic instructions

* Add StandardFPSCRValue behaviour to compare instructions.

* Force passing of fpcr to FPProcessException and FPUnpack.

Reduces potential for code error significantly

* OpCode cleanup

* Remove urgency from DMB comment in MRRC

DMB is currently a no-op via the instruction, so it should likely still
be a no-op here.

* Test Cleanup

* Fix FPDefaultNaN on Ryzen CPUs

* Improve some tests, fix some shift instructions, add slow path for Vadd

* Fix Typo

* More test cleanup

* Flip order of Fx and index, to indicate that the operand's is the "base"

* Remove Simd32 register type, use Int32 and Int64 for scalars like A64 does.

* Reintroduce alignment to DecoderHelper (removed by accident)

* One more realign as reading diffs is hard

* Use I32 registers in A32 (part 2)

Swap default integer register type based on current execution mode.

* FPSCR flags as Registers (part 1)

Still need to change NativeContext and ExecutionContext to allow
getting/setting with the flag values.

* Use I32 registers in A32 (part 1)

* FPSCR flags as registers (part 2)

Only CMP flags are on the registers right now. It could be useful to use
more of the space in non-fast-float when implementing A32 flags
accurately in the fast path.

* Address Feedback

* Correct FP->Int behaviour (should saturate)

* Make branches made by writing to PC eligible for Rejit

Greatly improves performance in most games.

* Remove unused branching for Vtbl

* RejitRequest as a class rather than a tuple

Makes a lot more sense than storing tuples on a dictionary.

* Add VMOVN, VSHR (imm), VSHRN (imm) and related tests

* Re-order InstEmitSystem32

Alphabetical sorting.

* Address Feedback

Feedback from Ac_K, remove and sort usings.

* Address Feedback 2

* Address Feedback from LDj3SNuD

Opcode table reordered to have alphabetical sorting within groups,
Vmaxnm and Vminnm have split names to be less ambiguous, SoftFloat nits,
Test nits and Test simplification with ValueSource.

* Add Debug Asserts to A32 helpers

Mainly to prevent the shift ones from being used on I64 operands, as
they expect I32 input for most operations (eg. carry flag setting), and
expect I32 input for shift and boolean amounts. Most other helper
functions don't take Operands, throw on out of range values, and take
specific types of OpCode, so didn't need any asserts.

* Use ConstF rather than creating an operand.

(useful for pooling in future)

* Move exclusive load to helper, reference call flag rather than literal 1.

* Address LDj feedback (minus table flatten)

one final look before it's all gone. the world is so beautiful.

* Flatten OpCodeTable

oh no

* Address more table ordering

* Call Flag as int on A32

Co-authored-by: Natalie C. <cyuubiapps@gmail.com>
Co-authored-by: Thog <thog@protonmail.com>

											
										
										
											2020-02-23 21:20:40 +00:00
+								        }
 								        public static Operand EmitVectorExtractSx32(ArmEmitterContext context, int reg, int index, int size)
 								        {
 								            return EmitVectorExtract32(context, reg, index, size, true);
 								        }
 								        public static Operand EmitVectorExtractZx32(ArmEmitterContext context, int reg, int index, int size)
 								        {
 								            return EmitVectorExtract32(context, reg, index, size, false);
 								        }
 								        public static Operand EmitVectorExtract32(ArmEmitterContext context, int reg, int index, int size, bool signed)
 								        {
 								            ThrowIfInvalid(index, size);
 								            Operand res = null;
 								            switch (size)
 								            {
 								                case 0:
 								                    res = context.VectorExtract8(GetVec(reg), index);
 								                    break;
 								                case 1:
 								                    res = context.VectorExtract16(GetVec(reg), index);
 								                    break;
 								                case 2:
 								                    res = context.VectorExtract(OperandType.I32, GetVec(reg), index);
 								                    break;
 								                case 3:
 								                    res = context.VectorExtract(OperandType.I64, GetVec(reg), index);
 								                    break;
 								            }
 								            if (signed)
 								            {
 								                switch (size)
 								                {
 								                    case 0: res = context.SignExtend8(OperandType.I32, res); break;
 								                    case 1: res = context.SignExtend16(OperandType.I32, res); break;
 								                }
 								            }
 								            else
 								            {
 								                switch (size)
 								                {
 								                    case 0: res = context.ZeroExtend8(OperandType.I32, res); break;
 								                    case 1: res = context.ZeroExtend16(OperandType.I32, res); break;
 								                }
 								            }
 								            return res;
 								        }
 								    }
 								}