From 7e4d986a731e9cba05f24b2efd14e18ebc39e75d Mon Sep 17 00:00:00 2001 From: gdkchan Date: Mon, 10 Feb 2020 21:10:05 -0300 Subject: [PATCH] Support compute uniform buffers emulated with global memory (#924) --- Ryujinx.Graphics.Gpu/Constants.cs | 16 +++- Ryujinx.Graphics.Gpu/Engine/Compute.cs | 28 +++++++ .../Translation/GlobalMemory.cs | 24 +++--- .../Translation/Lowering.cs | 2 +- .../Optimizations/GlobalToStorage.cs | 79 ++++++++++++++++++- 5 files changed, 133 insertions(+), 16 deletions(-) diff --git a/Ryujinx.Graphics.Gpu/Constants.cs b/Ryujinx.Graphics.Gpu/Constants.cs index 65cd8846..cc476654 100644 --- a/Ryujinx.Graphics.Gpu/Constants.cs +++ b/Ryujinx.Graphics.Gpu/Constants.cs @@ -8,11 +8,18 @@ namespace Ryujinx.Graphics.Gpu /// /// Maximum number of compute uniform buffers. /// - public const int TotalCpUniformBuffers = 8; + /// + /// This does not reflect the hardware count, the API will emulate some constant buffers using + /// global memory to make up for the low amount of compute constant buffers supported by hardware (only 8). + /// + public const int TotalCpUniformBuffers = 17; // 8 hardware constant buffers + 9 emulated (14 available to the user). /// - /// Maximum number of compute storage buffers (this is an API limitation). + /// Maximum number of compute storage buffers. /// + /// + /// The maximum number of storage buffers is API limited, the hardware supports an unlimited amount. + /// public const int TotalCpStorageBuffers = 16; /// @@ -21,8 +28,11 @@ namespace Ryujinx.Graphics.Gpu public const int TotalGpUniformBuffers = 18; /// - /// Maximum number of graphics storage buffers (this is an API limitation). + /// Maximum number of graphics storage buffers. /// + /// + /// The maximum number of storage buffers is API limited, the hardware supports an unlimited amount. 
+ /// public const int TotalGpStorageBuffers = 16; /// diff --git a/Ryujinx.Graphics.Gpu/Engine/Compute.cs b/Ryujinx.Graphics.Gpu/Engine/Compute.cs index 9178cfb0..fc257f99 100644 --- a/Ryujinx.Graphics.Gpu/Engine/Compute.cs +++ b/Ryujinx.Graphics.Gpu/Engine/Compute.cs @@ -69,6 +69,34 @@ namespace Ryujinx.Graphics.Gpu.Engine BufferManager.SetComputeUniformBuffer(index, gpuVa, size); } + for (int index = 0; index < info.CBuffers.Count; index++) + { + BufferDescriptor cb = info.CBuffers[index]; + + // NVN uses the "hardware" constant buffer for anything that is less than 8, + // and those are already bound above. + // Anything greater than or equal to 8 uses the emulated constant buffers. + // They are emulated using global memory loads; their 0x10-byte descriptors start at byte 0x260 of constant buffer 0, with the first entry describing slot 8. + if (cb.Slot < 8) + { + continue; + } + + ubEnableMask |= 1u << cb.Slot; + + ulong cbDescAddress = BufferManager.GetComputeUniformBufferAddress(0); + + int cbDescOffset = 0x260 + (cb.Slot - 8) * 0x10; + + cbDescAddress += (ulong)cbDescOffset; + + ReadOnlySpan cbDescriptorData = _context.PhysicalMemory.GetSpan(cbDescAddress, 0x10); + + SbDescriptor cbDescriptor = MemoryMarshal.Cast(cbDescriptorData)[0]; + + BufferManager.SetComputeUniformBuffer(cb.Slot, cbDescriptor.PackAddress(), (uint)cbDescriptor.Size); + } + for (int index = 0; index < info.SBuffers.Count; index++) { BufferDescriptor sb = info.SBuffers[index]; diff --git a/Ryujinx.Graphics.Shader/Translation/GlobalMemory.cs b/Ryujinx.Graphics.Shader/Translation/GlobalMemory.cs index a442357d..75bd9ddf 100644 --- a/Ryujinx.Graphics.Shader/Translation/GlobalMemory.cs +++ b/Ryujinx.Graphics.Shader/Translation/GlobalMemory.cs @@ -11,6 +11,11 @@ namespace Ryujinx.Graphics.Shader.Translation public const int StorageDescsSize = StorageDescSize * StorageMaxCount; + public const int UbeBaseOffset = 0x98; // In words. 
+ public const int UbeMaxCount = 9; + public const int UbeDescsSize = StorageDescSize * UbeMaxCount; + public const int UbeFirstCbuf = 8; + public static bool UsesGlobalMemory(Instruction inst) { return (inst.IsAtomic() && IsGlobalMr(inst)) || @@ -30,17 +35,16 @@ namespace Ryujinx.Graphics.Shader.Translation public static int GetStorageBaseCbOffset(ShaderStage stage) { - switch (stage) + return stage switch { - case ShaderStage.Compute: return StorageDescsBaseOffset + 2 * StorageDescsSize; - case ShaderStage.Vertex: return StorageDescsBaseOffset; - case ShaderStage.TessellationControl: return StorageDescsBaseOffset + 1 * StorageDescsSize; - case ShaderStage.TessellationEvaluation: return StorageDescsBaseOffset + 2 * StorageDescsSize; - case ShaderStage.Geometry: return StorageDescsBaseOffset + 3 * StorageDescsSize; - case ShaderStage.Fragment: return StorageDescsBaseOffset + 4 * StorageDescsSize; - } - - return 0; + ShaderStage.Compute => StorageDescsBaseOffset + 2 * StorageDescsSize, + ShaderStage.Vertex => StorageDescsBaseOffset, + ShaderStage.TessellationControl => StorageDescsBaseOffset + 1 * StorageDescsSize, + ShaderStage.TessellationEvaluation => StorageDescsBaseOffset + 2 * StorageDescsSize, + ShaderStage.Geometry => StorageDescsBaseOffset + 3 * StorageDescsSize, + ShaderStage.Fragment => StorageDescsBaseOffset + 4 * StorageDescsSize, + _ => 0 + }; } } } \ No newline at end of file diff --git a/Ryujinx.Graphics.Shader/Translation/Lowering.cs b/Ryujinx.Graphics.Shader/Translation/Lowering.cs index 1ee21e0a..99aea26e 100644 --- a/Ryujinx.Graphics.Shader/Translation/Lowering.cs +++ b/Ryujinx.Graphics.Shader/Translation/Lowering.cs @@ -81,7 +81,7 @@ namespace Ryujinx.Graphics.Shader.Translation Operand alignMask = Const(-config.QueryInfo(QueryInfoName.StorageBufferOffsetAlignment)); - Operand baseAddrTrunc = PrependOperation(Instruction.BitwiseAnd, sbBaseAddrLow, Const(-64)); + Operand baseAddrTrunc = PrependOperation(Instruction.BitwiseAnd, sbBaseAddrLow, 
alignMask); Operand byteOffset = PrependOperation(Instruction.Subtract, addrLow, baseAddrTrunc); Operand wordOffset = PrependOperation(Instruction.ShiftRightU32, byteOffset, Const(2)); diff --git a/Ryujinx.Graphics.Shader/Translation/Optimizations/GlobalToStorage.cs b/Ryujinx.Graphics.Shader/Translation/Optimizations/GlobalToStorage.cs index 8efd2c52..7988ef6c 100644 --- a/Ryujinx.Graphics.Shader/Translation/Optimizations/GlobalToStorage.cs +++ b/Ryujinx.Graphics.Shader/Translation/Optimizations/GlobalToStorage.cs @@ -31,8 +31,27 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations if (storageIndex >= 0) { + // Storage buffers are implemented using global memory access. + // If we know from where the base address of the access is loaded, + // we can guess which storage buffer it is accessing. + // We can then replace the global memory access with a storage + // buffer access. node = ReplaceGlobalWithStorage(node, config, storageIndex); } + else if (config.Stage == ShaderStage.Compute && operation.Inst == Instruction.LoadGlobal) + { + // Here we effectively try to replace a LDG instruction with LDC. + // The hardware only supports a limited amount of constant buffers + // so NVN "emulates" more constant buffers using global memory access. + // Here we try to replace the global access back to a constant buffer + // load. 
+ storageIndex = SearchForStorageBase(asgOperation, UbeBaseOffset, UbeBaseOffset + UbeDescsSize); + + if (storageIndex >= 0) + { + node = ReplaceLdgWithLdc(node, config, storageIndex); + } + } } } } @@ -42,8 +61,6 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations { Operation operation = (Operation)node.Value; - Operation storageOp; - Operand GetStorageOffset() { Operand addrLow = operation.GetSource(0); @@ -80,6 +97,8 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations sources[index] = operation.GetSource(index); } + Operation storageOp; + if (operation.Inst.IsAtomic()) { Instruction inst = (operation.Inst & ~Instruction.MrMask) | Instruction.MrStorage; @@ -109,6 +128,62 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations return node; } + private static LinkedListNode ReplaceLdgWithLdc(LinkedListNode node, ShaderConfig config, int storageIndex) + { + Operation operation = (Operation)node.Value; + + Operand GetCbufOffset() + { + Operand addrLow = operation.GetSource(0); + + Operand baseAddrLow = Cbuf(0, UbeBaseOffset + storageIndex * StorageDescSize); + + Operand baseAddrTrunc = Local(); + + Operand alignMask = Const(-config.QueryInfo(QueryInfoName.StorageBufferOffsetAlignment)); + + Operation andOp = new Operation(Instruction.BitwiseAnd, baseAddrTrunc, baseAddrLow, alignMask); + + node.List.AddBefore(node, andOp); + + Operand byteOffset = Local(); + Operand wordOffset = Local(); + + Operation subOp = new Operation(Instruction.Subtract, byteOffset, addrLow, baseAddrTrunc); + Operation shrOp = new Operation(Instruction.ShiftRightU32, wordOffset, byteOffset, Const(2)); + + node.List.AddBefore(node, subOp); + node.List.AddBefore(node, shrOp); + + return wordOffset; + } + + Operand[] sources = new Operand[operation.SourcesCount]; + + sources[0] = Const(UbeFirstCbuf + storageIndex); + sources[1] = GetCbufOffset(); + + for (int index = 2; index < operation.SourcesCount; index++) + { + sources[index] = operation.GetSource(index); + } + + 
Operation ldcOp = new Operation(Instruction.LoadConstant, operation.Dest, sources); + + for (int index = 0; index < operation.SourcesCount; index++) + { + operation.SetSource(index, null); + } + + LinkedListNode oldNode = node; + + node = node.List.AddBefore(node, ldcOp); + + node.List.Remove(oldNode); + + return node; + } + private static int SearchForStorageBase(Operation operation, int sbStart, int sbEnd) { Queue assignments = new Queue();