ryujinx/ARMeilleure/CodeGen/X86/CodeGenContext.cs

using ARMeilleure.CodeGen.RegisterAllocators;
using ARMeilleure.Common;
using ARMeilleure.IntermediateRepresentation;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;

namespace ARMeilleure.CodeGen.X86
{
    /// <summary>
    /// Holds the state used while emitting x86 code for a function: the output stream,
    /// the assembler that writes to it, the register allocation result, the sizes of the
    /// stack regions, and the list of block jumps that are resolved by <see cref="GetCode"/>.
    /// </summary>
    class CodeGenContext
    {
        // Number of placeholder bytes written to the stream at the position of every
        // jump to a block. The real jump instruction replaces them on GetCode.
        private const int ReservedBytesForJump = 1;

        private readonly Stream _stream;

        /// <summary>Current length of the underlying stream, in bytes.</summary>
        public int StreamOffset => (int)_stream.Length;

        /// <summary>Register allocation result supplied on construction.</summary>
        public AllocationResult AllocResult { get; }

        /// <summary>Assembler that writes to the stream supplied on construction.</summary>
        public Assembler Assembler { get; }

        /// <summary>Block passed to the last <see cref="EnterBlock"/> call.</summary>
        public BasicBlock CurrBlock { get; private set; }

        /// <summary>Size in bytes of the call arguments region, including alignment padding.</summary>
        public int CallArgsRegionSize { get; }

        /// <summary>Size in bytes of the save area for the used callee saved vector registers.</summary>
        public int XmmSaveRegionSize { get; }

        // Stream position where each block starts, indexed by BasicBlock.Index.
        private readonly long[] _blockOffsets;

        /// <summary>
        /// A jump to a basic block whose final encoding is not known yet.
        /// </summary>
        private struct Jump
        {
            public bool IsConditional { get; }

            public X86Condition Condition { get; }

            public BasicBlock Target { get; }

            // Position of the placeholder bytes on the original stream.
            public long JumpPosition { get; }

            public long RelativeOffset { get; set; }

            public int InstSize { get; set; }

            public Jump(BasicBlock target, long jumpPosition)
                : this(false, 0, target, jumpPosition)
            {
            }

            public Jump(X86Condition condition, BasicBlock target, long jumpPosition)
                : this(true, condition, target, jumpPosition)
            {
            }

            private Jump(bool isConditional, X86Condition condition, BasicBlock target, long jumpPosition)
            {
                IsConditional = isConditional;
                Condition     = condition;
                Target        = target;
                JumpPosition  = jumpPosition;

                // Both are computed later, when the jumps are resolved.
                RelativeOffset = 0;

                InstSize = 0;
            }
        }

        private readonly List<Jump> _jumps;

        // State of the pending near jump started by JumpToNear and finished by JumpHere.
        // Only one near jump can be pending at a time, a new one overwrites this state.
        private X86Condition _jNearCondition;

        private long _jNearPosition;
        private int  _jNearLength;

        /// <summary>
        /// Creates a new code generation context.
        /// </summary>
        /// <param name="stream">Stream where the code is written</param>
        /// <param name="allocResult">Register allocation result for the function</param>
        /// <param name="maxCallArgs">Maximum number of call arguments, negative when the function has no calls</param>
        /// <param name="blocksCount">Number of entries reserved for block offsets (indexed by block index)</param>
        public CodeGenContext(Stream stream, AllocationResult allocResult, int maxCallArgs, int blocksCount)
        {
            _stream = stream;

            AllocResult = allocResult;

            Assembler = new Assembler(stream);

            CallArgsRegionSize = GetCallArgsRegionSize(allocResult, maxCallArgs, out int xmmSaveRegionSize);
            XmmSaveRegionSize  = xmmSaveRegionSize;

            _blockOffsets = new long[blocksCount];

            _jumps = new List<Jump>();
        }

        /// <summary>
        /// Calculates the size of the call arguments region, padded so that the whole
        /// frame (callee saved registers, spill region and call arguments) is a multiple of 16 bytes.
        /// </summary>
        /// <param name="allocResult">Register allocation result for the function</param>
        /// <param name="maxCallArgs">Maximum number of call arguments, negative when the function has no calls</param>
        /// <param name="xmmSaveRegionSize">Size in bytes of the save area for the used callee saved vector registers</param>
        /// <returns>Size in bytes of the call arguments region, including the alignment padding</returns>
        private static int GetCallArgsRegionSize(AllocationResult allocResult, int maxCallArgs, out int xmmSaveRegionSize)
        {
            int intMask = CallingConvention.GetIntCalleeSavedRegisters() & allocResult.IntUsedRegisters;
            int vecMask = CallingConvention.GetVecCalleeSavedRegisters() & allocResult.VecUsedRegisters;

            xmmSaveRegionSize = BitUtils.CountBits(vecMask) * 16;

            // We need to add 8 bytes to the total size, as the call to this
            // function already pushed 8 bytes (the return address).
            int calleeSaveRegionSize = BitUtils.CountBits(intMask) * 8 + xmmSaveRegionSize + 8;

            int argsCount = maxCallArgs;

            if (argsCount < 0)
            {
                // When the function has no calls, argsCount is -1.
                // In this case, we don't need to allocate the shadow space.
                argsCount = 0;
            }
            else if (argsCount < 4)
            {
                // The ABI mandates that the space for at least 4 arguments
                // is reserved on the stack (this is called shadow space).
                argsCount = 4;
            }

            int frameSize = calleeSaveRegionSize + allocResult.SpillRegionSize;

            // TODO: Instead of always multiplying by 16 (the largest possible size of a variable,
            // since a V128 has 16 bytes), we should calculate the exact size consumed by the
            // arguments passed to the called functions on the stack.
            int callArgsAndFrameSize = frameSize + argsCount * 16;

            // Ensure that the Stack Pointer will be aligned to 16 bytes.
            callArgsAndFrameSize = (callArgsAndFrameSize + 0xf) & ~0xf;

            return callArgsAndFrameSize - frameSize;
        }

        /// <summary>
        /// Records the current stream position as the start of the block, and makes it the current block.
        /// </summary>
        /// <param name="block">Block being entered</param>
        public void EnterBlock(BasicBlock block)
        {
            _blockOffsets[block.Index] = _stream.Position;

            CurrBlock = block;
        }

        /// <summary>
        /// Registers an unconditional jump to a block at the current stream position.
        /// Only placeholder bytes are written now, the instruction is emitted by <see cref="GetCode"/>.
        /// </summary>
        /// <param name="target">Block to jump to</param>
        public void JumpTo(BasicBlock target)
        {
            _jumps.Add(new Jump(target, _stream.Position));

            WritePadding(ReservedBytesForJump);
        }

        /// <summary>
        /// Registers a conditional jump to a block at the current stream position.
        /// Only placeholder bytes are written now, the instruction is emitted by <see cref="GetCode"/>.
        /// </summary>
        /// <param name="condition">Condition for the jump</param>
        /// <param name="target">Block to jump to</param>
        public void JumpTo(X86Condition condition, BasicBlock target)
        {
            _jumps.Add(new Jump(condition, target, _stream.Position));

            WritePadding(ReservedBytesForJump);
        }

        /// <summary>
        /// Starts a conditional jump whose target is the position of the next <see cref="JumpHere"/> call.
        /// The space for the instruction is skipped now and filled by <see cref="JumpHere"/>.
        /// </summary>
        /// <param name="condition">Condition for the jump</param>
        public void JumpToNear(X86Condition condition)
        {
            _jNearCondition = condition;
            _jNearPosition  = _stream.Position;
            _jNearLength    = Assembler.GetJccLength(0);

            _stream.Seek(_jNearLength, SeekOrigin.Current);
        }

        /// <summary>
        /// Writes the jump started by the last <see cref="JumpToNear"/> call, targeting the
        /// current stream position. The stream position is restored afterwards.
        /// </summary>
        public void JumpHere()
        {
            long currentPosition = _stream.Position;

            _stream.Seek(_jNearPosition, SeekOrigin.Begin);

            // The offset is relative to the end of the jump instruction.
            long offset = currentPosition - (_jNearPosition + _jNearLength);

            Debug.Assert(_jNearLength == Assembler.GetJccLength(offset), "Relative offset doesn't fit on near jump.");

            Assembler.Jcc(_jNearCondition, offset);

            _stream.Seek(currentPosition, SeekOrigin.Begin);
        }

        /// <summary>
        /// Writes the specified number of zero bytes to the stream.
        /// </summary>
        /// <param name="size">Number of bytes to write</param>
        private void WritePadding(int size)
        {
            while (size-- > 0)
            {
                _stream.WriteByte(0);
            }
        }

        /// <summary>
        /// Builds the final code, replacing the placeholder bytes of every registered
        /// block jump with the jump instruction itself.
        /// </summary>
        /// <returns>The final code</returns>
        /// <exception cref="EndOfStreamException">The stream ended before all the expected bytes were read</exception>
        public byte[] GetCode()
        {
            // Write jump relative offsets.
            ResolveJumpOffsets();

            // Write the code, ignoring the dummy bytes after jumps, into a new stream.
            _stream.Seek(0, SeekOrigin.Begin);

            using (MemoryStream codeStream = new MemoryStream())
            {
                Assembler assembler = new Assembler(codeStream);

                for (int index = 0; index < _jumps.Count; index++)
                {
                    Jump jump = _jumps[index];

                    // Copy everything up to the jump, then skip its placeholder bytes.
                    CopyFromStream(codeStream, jump.JumpPosition - _stream.Position);

                    _stream.Seek(ReservedBytesForJump, SeekOrigin.Current);

                    if (jump.IsConditional)
                    {
                        assembler.Jcc(jump.Condition, jump.RelativeOffset);
                    }
                    else
                    {
                        assembler.Jmp(jump.RelativeOffset);
                    }
                }

                // Copy the code that comes after the last jump.
                CopyFromStream(codeStream, _stream.Length - _stream.Position);

                return codeStream.ToArray();
            }
        }

        /// <summary>
        /// Calculates the relative offset and instruction size of all registered block jumps.
        /// The size of a jump changes the distance covered by the jumps that cross it,
        /// so the calculation is repeated until no relative offset changes.
        /// </summary>
        private void ResolveJumpOffsets()
        {
            bool modified;

            do
            {
                modified = false;

                for (int index = 0; index < _jumps.Count; index++)
                {
                    Jump jump = _jumps[index];

                    long jumpTarget = _blockOffsets[jump.Target.Index];

                    long offset = jumpTarget - jump.JumpPosition;

                    if (offset < 0)
                    {
                        // Backward jump, account for the real size of the
                        // jumps located between the target and this jump.
                        for (int index2 = index - 1; index2 >= 0; index2--)
                        {
                            Jump jump2 = _jumps[index2];

                            if (jump2.JumpPosition < jumpTarget)
                            {
                                break;
                            }

                            offset -= jump2.InstSize - ReservedBytesForJump;
                        }
                    }
                    else
                    {
                        // Forward jump, account for the real size of the
                        // jumps located between this jump and the target.
                        for (int index2 = index + 1; index2 < _jumps.Count; index2++)
                        {
                            Jump jump2 = _jumps[index2];

                            if (jump2.JumpPosition >= jumpTarget)
                            {
                                break;
                            }

                            offset += jump2.InstSize - ReservedBytesForJump;
                        }

                        offset -= ReservedBytesForJump;
                    }

                    if (jump.IsConditional)
                    {
                        jump.InstSize = Assembler.GetJccLength(offset);
                    }
                    else
                    {
                        jump.InstSize = Assembler.GetJmpLength(offset);
                    }

                    // The jump is relative to the next instruction, not the current one.
                    // Since we didn't know the next instruction address when calculating
                    // the offset (as the size of the current jump instruction was not known),
                    // we now need to compensate the offset with the jump instruction size.
                    // It's also worth to note that:
                    // - This is only needed for backward jumps.
                    // - The GetJmpLength and GetJccLength also compensates the offset
                    // internally when computing the jump instruction size.
                    if (offset < 0)
                    {
                        offset -= jump.InstSize;
                    }

                    if (jump.RelativeOffset != offset)
                    {
                        modified = true;
                    }

                    jump.RelativeOffset = offset;

                    // Jump is a struct, the modified copy needs to be stored back.
                    _jumps[index] = jump;
                }
            }
            while (modified);
        }

        /// <summary>
        /// Copies exactly <paramref name="count"/> bytes from the current position of the
        /// original stream into <paramref name="destination"/>.
        /// </summary>
        /// <param name="destination">Stream that receives the bytes</param>
        /// <param name="count">Number of bytes to copy</param>
        /// <exception cref="EndOfStreamException">The stream ended before all the bytes were read</exception>
        private void CopyFromStream(Stream destination, long count)
        {
            byte[] buffer = new byte[count];

            int totalRead = 0;

            // A single Read call is allowed to return less bytes than requested,
            // so keep reading until the buffer is full.
            while (totalRead < buffer.Length)
            {
                int read = _stream.Read(buffer, totalRead, buffer.Length - totalRead);

                if (read == 0)
                {
                    throw new EndOfStreamException("The code stream ended before all the expected bytes were read.");
                }

                totalRead += read;
            }

            destination.Write(buffer, 0, buffer.Length);
        }
    }
}
Add a new JIT compiler for CPU code (#693) * Start of the ARMeilleure project * Refactoring around the old IRAdapter, now renamed to PreAllocator * Optimize the LowestBitSet method * Add CLZ support and fix CLS implementation * Add missing Equals and GetHashCode overrides on some structs, misc small tweaks * Implement the ByteSwap IR instruction, and some refactoring on the assembler * Implement the DivideUI IR instruction and fix 64-bits IDIV * Correct constant operand type on CSINC * Move division instructions implementation to InstEmitDiv * Fix destination type for the ConditionalSelect IR instruction * Implement UMULH and SMULH, with new IR instructions * Fix some issues with shift instructions * Fix constant types for BFM instructions * Fix up new tests using the new V128 struct * Update tests * Move DIV tests to a separate file * Add support for calls, and some instructions that depends on them * Start adding support for SIMD & FP types, along with some of the related ARM instructions * Fix some typos and the divide instruction with FP operands * Fix wrong method call on Clz_V * Implement ARM FP & SIMD move instructions, Saddlv_V, and misc. fixes * Implement SIMD logical instructions and more misc. fixes * Fix PSRAD x86 instruction encoding, TRN, UABD and UABDL implementations * Implement float conversion instruction, merge in LDj3SNuD fixes, and some other misc. 
fixes * Implement SIMD shift instruction and fix Dup_V * Add SCVTF and UCVTF (vector, fixed-point) variants to the opcode table * Fix check with tolerance on tester * Implement FP & SIMD comparison instructions, and some fixes * Update FCVT (Scalar) encoding on the table to support the Half-float variants * Support passing V128 structs, some cleanup on the register allocator, merge LDj3SNuD fixes * Use old memory access methods, made a start on SIMD memory insts support, some fixes * Fix float constant passed to functions, save and restore non-volatile XMM registers, other fixes * Fix arguments count with struct return values, other fixes * More instructions * Misc. fixes and integrate LDj3SNuD fixes * Update tests * Add a faster linear scan allocator, unwinding support on windows, and other changes * Update Ryujinx.HLE * Update Ryujinx.Graphics * Fix V128 return pointer passing, RCX is clobbered * Update Ryujinx.Tests * Update ITimeZoneService * Stop using GetFunctionPointer as that can't be called from native code, misc. 
fixes and tweaks * Use generic GetFunctionPointerForDelegate method and other tweaks * Some refactoring on the code generator, assert on invalid operations and use a separate enum for intrinsics * Remove some unused code on the assembler * Fix REX.W prefix regression on float conversion instructions, add some sort of profiler * Add hardware capability detection * Fix regression on Sha1h and revert Fcm** changes * Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator * Fix silly mistake introduced on last commit on CpuId * Generate inline stack probes when the stack allocation is too large * Initial support for the System-V ABI * Support multiple destination operands * Fix SSE2 VectorInsert8 path, and other fixes * Change placement of XMM callee save and restore code to match other compilers * Rename Dest to Destination and Inst to Instruction * Fix a regression related to calls and the V128 type * Add an extra space on comments to match code style * Some refactoring * Fix vector insert FP32 SSE2 path * Port over the ARM32 instructions * Avoid memory protection races on JIT Cache * Another fix on VectorInsert FP32 (thanks to LDj3SNuD) * Float operands don't need to use the same register when VEX is supported * Add a new register allocator, higher quality code for hot code (tier up), and other tweaks * Some nits, small improvements on the pre allocator * CpuThreadState is gone * Allow changing CPU emulators with a config entry * Add runtime identifiers on the ARMeilleure project * Allow switching between CPUs through a config entry (pt. 
2) * Change win10-x64 to win-x64 on projects * Update the Ryujinx project to use ARMeilleure * Ensure that the selected register is valid on the hybrid allocator * Allow exiting on returns to 0 (should fix test regression) * Remove register assignments for most used variables on the hybrid allocator * Do not use fixed registers as spill temp * Add missing namespace and remove unneeded using * Address PR feedback * Fix types, etc * Enable AssumeStrictAbiCompliance by default * Ensure that Spill and Fill don't load or store any more than necessary 2019-08-08 18:56:22 +00:00			`using ARMeilleure.CodeGen.RegisterAllocators;`
			`using ARMeilleure.Common;`
			`using ARMeilleure.IntermediateRepresentation;`
			`using System.Collections.Generic;`
			`using System.Diagnostics;`
			`using System.IO;`

			`namespace ARMeilleure.CodeGen.X86`
			`{`
			`class CodeGenContext`
			`{`
			`private const int ReservedBytesForJump = 1;`

			`private Stream _stream;`

			`public int StreamOffset => (int)_stream.Length;`

			`public AllocationResult AllocResult { get; }`

			`public Assembler Assembler { get; }`

			`public BasicBlock CurrBlock { get; private set; }`

			`public int CallArgsRegionSize { get; }`
			`public int XmmSaveRegionSize { get; }`

			`private long[] _blockOffsets;`

			`private struct Jump`
			`{`
			`public bool IsConditional { get; }`

			`public X86Condition Condition { get; }`

			`public BasicBlock Target { get; }`

			`public long JumpPosition { get; }`

			`public long RelativeOffset { get; set; }`

			`public int InstSize { get; set; }`

			`public Jump(BasicBlock target, long jumpPosition)`
			`{`
			`IsConditional = false;`
			`Condition = 0;`
			`Target = target;`
			`JumpPosition = jumpPosition;`

			`RelativeOffset = 0;`

			`InstSize = 0;`
			`}`

			`public Jump(X86Condition condition, BasicBlock target, long jumpPosition)`
			`{`
			`IsConditional = true;`
			`Condition = condition;`
			`Target = target;`
			`JumpPosition = jumpPosition;`

			`RelativeOffset = 0;`

			`InstSize = 0;`
			`}`
			`}`

			`private List<Jump> _jumps;`

			`private X86Condition _jNearCondition;`

			`private long _jNearPosition;`
			`private int _jNearLength;`

			`public CodeGenContext(Stream stream, AllocationResult allocResult, int maxCallArgs, int blocksCount)`
			`{`
			`_stream = stream;`

			`AllocResult = allocResult;`

			`Assembler = new Assembler(stream);`

			`CallArgsRegionSize = GetCallArgsRegionSize(allocResult, maxCallArgs, out int xmmSaveRegionSize);`
			`XmmSaveRegionSize = xmmSaveRegionSize;`

			`_blockOffsets = new long[blocksCount];`

			`_jumps = new List<Jump>();`
			`}`

			`private int GetCallArgsRegionSize(AllocationResult allocResult, int maxCallArgs, out int xmmSaveRegionSize)`
			`{`
			`// We need to add 8 bytes to the total size, as the call to this`
			`// function already pushed 8 bytes (the return address).`
			`int intMask = CallingConvention.GetIntCalleeSavedRegisters() & allocResult.IntUsedRegisters;`
			`int vecMask = CallingConvention.GetVecCalleeSavedRegisters() & allocResult.VecUsedRegisters;`

			`xmmSaveRegionSize = BitUtils.CountBits(vecMask) * 16;`

			`int calleeSaveRegionSize = BitUtils.CountBits(intMask) * 8 + xmmSaveRegionSize + 8;`

			`int argsCount = maxCallArgs;`

			`if (argsCount < 0)`
			`{`
			`// When the function has no calls, argsCount is -1.`
			`// In this case, we don't need to allocate the shadow space.`
			`argsCount = 0;`
			`}`
			`else if (argsCount < 4)`
			`{`
			`// The ABI mandates that the space for at least 4 arguments`
			`// is reserved on the stack (this is called shadow space).`
			`argsCount = 4;`
			`}`

			`int frameSize = calleeSaveRegionSize + allocResult.SpillRegionSize;`

			`// TODO: Instead of always multiplying by 16 (the largest possible size of a variable,`
			`// since a V128 has 16 bytes), we should calculate the exact size consumed by the`
			`// arguments passed to the called functions on the stack.`
			`int callArgsAndFrameSize = frameSize + argsCount * 16;`

			`// Ensure that the Stack Pointer will be aligned to 16 bytes.`
			`callArgsAndFrameSize = (callArgsAndFrameSize + 0xf) & ~0xf;`

			`return callArgsAndFrameSize - frameSize;`
			`}`

			`public void EnterBlock(BasicBlock block)`
			`{`
			`_blockOffsets[block.Index] = _stream.Position;`

			`CurrBlock = block;`
			`}`

			`public void JumpTo(BasicBlock target)`
			`{`
			`_jumps.Add(new Jump(target, _stream.Position));`

			`WritePadding(ReservedBytesForJump);`
			`}`

			`public void JumpTo(X86Condition condition, BasicBlock target)`
			`{`
			`_jumps.Add(new Jump(condition, target, _stream.Position));`

			`WritePadding(ReservedBytesForJump);`
			`}`

			`public void JumpToNear(X86Condition condition)`
			`{`
			`_jNearCondition = condition;`
			`_jNearPosition = _stream.Position;`
			`_jNearLength = Assembler.GetJccLength(0);`

			`_stream.Seek(_jNearLength, SeekOrigin.Current);`
			`}`

			`public void JumpHere()`
			`{`
			`long currentPosition = _stream.Position;`

			`_stream.Seek(_jNearPosition, SeekOrigin.Begin);`

			`long offset = currentPosition - (_jNearPosition + _jNearLength);`

			`Debug.Assert(_jNearLength == Assembler.GetJccLength(offset), "Relative offset doesn't fit on near jump.");`

			`Assembler.Jcc(_jNearCondition, offset);`

			`_stream.Seek(currentPosition, SeekOrigin.Begin);`
			`}`

			`private void WritePadding(int size)`
			`{`
			`while (size-- > 0)`
			`{`
			`_stream.WriteByte(0);`
			`}`
			`}`

			`public byte[] GetCode()`
			`{`
			`// Write jump relative offsets.`
			`bool modified;`

			`do`
			`{`
			`modified = false;`

			`for (int index = 0; index < _jumps.Count; index++)`
			`{`
			`Jump jump = _jumps[index];`

			`long jumpTarget = _blockOffsets[jump.Target.Index];`

			`long offset = jumpTarget - jump.JumpPosition;`

			`if (offset < 0)`
			`{`
			`for (int index2 = index - 1; index2 >= 0; index2--)`
			`{`
			`Jump jump2 = _jumps[index2];`

			`if (jump2.JumpPosition < jumpTarget)`
			`{`
			`break;`
			`}`

			`offset -= jump2.InstSize - ReservedBytesForJump;`
			`}`
			`}`
			`else`
			`{`
			`for (int index2 = index + 1; index2 < _jumps.Count; index2++)`
			`{`
			`Jump jump2 = _jumps[index2];`

			`if (jump2.JumpPosition >= jumpTarget)`
			`{`
			`break;`
			`}`

			`offset += jump2.InstSize - ReservedBytesForJump;`
			`}`

			`offset -= ReservedBytesForJump;`
			`}`

			`if (jump.IsConditional)`
			`{`
			`jump.InstSize = Assembler.GetJccLength(offset);`
			`}`
			`else`
			`{`
			`jump.InstSize = Assembler.GetJmpLength(offset);`
			`}`

			`// The jump is relative to the next instruction, not the current one.`
			`// Since we didn't know the next instruction address when calculating`
			`// the offset (as the size of the current jump instruction was not know),`
			`// we now need to compensate the offset with the jump instruction size.`
			`// It's also worth to note that:`
			`// - This is only needed for backward jumps.`
			`// - The GetJmpLength and GetJccLength also compensates the offset`
			`// internally when computing the jump instruction size.`
			`if (offset < 0)`
			`{`
			`offset -= jump.InstSize;`
			`}`

			`if (jump.RelativeOffset != offset)`
			`{`
			`modified = true;`
			`}`

			`jump.RelativeOffset = offset;`

			`_jumps[index] = jump;`
			`}`
			`}`
			`while (modified);`

			`// Write the code, ignoring the dummy bytes after jumps, into a new stream.`
			`_stream.Seek(0, SeekOrigin.Begin);`

			`using (MemoryStream codeStream = new MemoryStream())`
			`{`
			`Assembler assembler = new Assembler(codeStream);`

			`byte[] buffer;`

			`for (int index = 0; index < _jumps.Count; index++)`
			`{`
			`Jump jump = _jumps[index];`

			`buffer = new byte[jump.JumpPosition - _stream.Position];`

			`_stream.Read(buffer, 0, buffer.Length);`
			`_stream.Seek(ReservedBytesForJump, SeekOrigin.Current);`

			`codeStream.Write(buffer);`

			`if (jump.IsConditional)`
			`{`
			`assembler.Jcc(jump.Condition, jump.RelativeOffset);`
			`}`
			`else`
			`{`
			`assembler.Jmp(jump.RelativeOffset);`
			`}`
			`}`

			`buffer = new byte[_stream.Length - _stream.Position];`

			`_stream.Read(buffer, 0, buffer.Length);`

			`codeStream.Write(buffer);`

			`return codeStream.ToArray();`
			`}`
			`}`
			`}`
			`}`