ryujinx/Ryujinx.Graphics.Gpu/Engine/Threed/SemaphoreUpdater.cs

224 lines
7.9 KiB
C#
Raw Normal View History

using Ryujinx.Common;
using Ryujinx.Graphics.GAL;
Add a Multithreading layer for the GAL, multi-thread shader compilation at runtime (#2501) * Initial Implementation About as fast as nvidia GL multithreading, can be improved with faster command queuing. * Struct based command list Speeds up a bit. Still a lot of time lost to resource copy. * Do shader init while the render thread is active. * Introduce circular span pool V1 Ideally should be able to use structs instead of references for storing these spans on commands. Will try that next. * Refactor SpanRef some more Use a struct to represent SpanRef, rather than a reference. * Flush buffers on background thread * Use a span for UpdateRenderScale. Much faster than copying the array. * Calculate command size using reflection * WIP parallel shaders * Some minor optimisation * Only 2 max refs per command now. The command with 3 refs is gone. :relieved: * Don't cast on the GPU side * Remove redundant casts, force sync on window present * Fix Shader Cache * Fix host shader save. * Fixup to work with new renderer stuff * Make command Run static, use array of delegates as lookup Profile says this takes less time than the previous way. * Bring up to date * Add settings toggle. Fix Muiltithreading Off mode. * Fix warning. * Release tracking lock for flushes * Fix Conditional Render fast path with threaded gal * Make handle iteration safe when releasing the lock This is mostly temporary. * Attempt to set backend threading on driver Only really works on nvidia before launching a game. * Fix race condition with BufferModifiedRangeList, exceptions in tracking actions * Update buffer set commands * Some cleanup * Only use stutter workaround when using opengl renderer non-threaded * Add host-conditional reservation of counter events There has always been the possibility that conditional rendering could use a query object just as it is disposed by the counter queue. This change makes it so that when the host decides to use host conditional rendering, the query object is reserved so that it cannot be deleted. Counter events can optionally start reserved, as the threaded implementation can reserve them before the backend creates them, and there would otherwise be a short amount of time where the counter queue could dispose the event before a call to reserve it could be made. * Address Feedback * Make counter flush tracked again. Hopefully does not cause any issues this time. * Wait for FlushTo on the main queue thread. Currently assumes only one thread will want to FlushTo (in this case, the GPU thread) * Add SDL2 headless integration * Add HLE macro commands. Co-authored-by: Mary <mary@mary.zone>
2021-08-26 22:31:29 +00:00
using System.Runtime.InteropServices;
namespace Ryujinx.Graphics.Gpu.Engine.Threed
{
/// <summary>
/// Semaphore updater.
/// </summary>
class SemaphoreUpdater
{
private const int NsToTicksFractionNumerator = 384;
private const int NsToTicksFractionDenominator = 625;
/// <summary>
/// GPU semaphore operation.
/// </summary>
private enum SemaphoreOperation
{
Release = 0,
Acquire = 1,
Counter = 2
}
/// <summary>
/// Counter type for GPU counter reset.
/// </summary>
private enum ResetCounterType
{
SamplesPassed = 1,
ZcullStats = 2,
TransformFeedbackPrimitivesWritten = 0x10,
InputVertices = 0x12,
InputPrimitives = 0x13,
VertexShaderInvocations = 0x15,
TessControlShaderInvocations = 0x16,
TessEvaluationShaderInvocations = 0x17,
TessEvaluationShaderPrimitives = 0x18,
GeometryShaderInvocations = 0x1a,
GeometryShaderPrimitives = 0x1b,
ClipperInputPrimitives = 0x1c,
ClipperOutputPrimitives = 0x1d,
FragmentShaderInvocations = 0x1e,
PrimitivesGenerated = 0x1f
}
/// <summary>
/// Counter type for GPU counter reporting.
/// </summary>
private enum ReportCounterType
{
Zero = 0,
InputVertices = 1,
InputPrimitives = 3,
VertexShaderInvocations = 5,
GeometryShaderInvocations = 7,
GeometryShaderPrimitives = 9,
ZcullStats0 = 0xa,
TransformFeedbackPrimitivesWritten = 0xb,
ZcullStats1 = 0xc,
ZcullStats2 = 0xe,
ClipperInputPrimitives = 0xf,
ZcullStats3 = 0x10,
ClipperOutputPrimitives = 0x11,
PrimitivesGenerated = 0x12,
FragmentShaderInvocations = 0x13,
SamplesPassed = 0x15,
TransformFeedbackOffset = 0x1a,
TessControlShaderInvocations = 0x1b,
TessEvaluationShaderInvocations = 0x1d,
TessEvaluationShaderPrimitives = 0x1f
}
private readonly GpuContext _context;
private readonly GpuChannel _channel;
private readonly DeviceStateWithShadow<ThreedClassState> _state;
/// <summary>
/// Creates a new instance of the semaphore updater.
/// </summary>
/// <param name="context">GPU context</param>
/// <param name="channel">GPU channel</param>
/// <param name="state">Channel state</param>
public SemaphoreUpdater(GpuContext context, GpuChannel channel, DeviceStateWithShadow<ThreedClassState> state)
{
_context = context;
_channel = channel;
_state = state;
}
/// <summary>
/// Resets the value of an internal GPU counter back to zero.
/// </summary>
/// <param name="argument">Method call argument</param>
public void ResetCounter(int argument)
{
ResetCounterType type = (ResetCounterType)argument;
switch (type)
{
case ResetCounterType.SamplesPassed:
_context.Renderer.ResetCounter(CounterType.SamplesPassed);
break;
case ResetCounterType.PrimitivesGenerated:
_context.Renderer.ResetCounter(CounterType.PrimitivesGenerated);
break;
case ResetCounterType.TransformFeedbackPrimitivesWritten:
_context.Renderer.ResetCounter(CounterType.TransformFeedbackPrimitivesWritten);
break;
}
}
/// <summary>
/// Writes a GPU counter to guest memory.
/// </summary>
/// <param name="argument">Method call argument</param>
public void Report(int argument)
{
SemaphoreOperation op = (SemaphoreOperation)(argument & 3);
ReportCounterType type = (ReportCounterType)((argument >> 23) & 0x1f);
switch (op)
{
case SemaphoreOperation.Release: ReleaseSemaphore(); break;
case SemaphoreOperation.Counter: ReportCounter(type); break;
}
}
/// <summary>
/// Writes (or Releases) a GPU semaphore value to guest memory.
/// </summary>
private void ReleaseSemaphore()
{
_channel.MemoryManager.Write(_state.State.SemaphoreAddress.Pack(), _state.State.SemaphorePayload);
_context.AdvanceSequence();
}
/// <summary>
/// Packed GPU counter data (including GPU timestamp) in memory.
/// </summary>
private struct CounterData
{
public ulong Counter;
public ulong Timestamp;
}
/// <summary>
/// Writes a GPU counter to guest memory.
/// This also writes the current timestamp value.
/// </summary>
/// <param name="type">Counter to be written to memory</param>
private void ReportCounter(ReportCounterType type)
{
ulong gpuVa = _state.State.SemaphoreAddress.Pack();
ulong ticks = ConvertNanosecondsToTicks((ulong)PerformanceCounter.ElapsedNanoseconds);
if (GraphicsConfig.FastGpuTime)
{
// Divide by some amount to report time as if operations were performed faster than they really are.
// This can prevent some games from switching to a lower resolution because rendering is too slow.
ticks /= 256;
}
ICounterEvent counter = null;
void resultHandler(object evt, ulong result)
{
CounterData counterData = new CounterData
{
Counter = result,
Timestamp = ticks
};
if (counter?.Invalid != true)
{
_channel.MemoryManager.Write(gpuVa, counterData);
}
}
switch (type)
{
case ReportCounterType.Zero:
resultHandler(null, 0);
break;
case ReportCounterType.SamplesPassed:
Add a Multithreading layer for the GAL, multi-thread shader compilation at runtime (#2501) * Initial Implementation About as fast as nvidia GL multithreading, can be improved with faster command queuing. * Struct based command list Speeds up a bit. Still a lot of time lost to resource copy. * Do shader init while the render thread is active. * Introduce circular span pool V1 Ideally should be able to use structs instead of references for storing these spans on commands. Will try that next. * Refactor SpanRef some more Use a struct to represent SpanRef, rather than a reference. * Flush buffers on background thread * Use a span for UpdateRenderScale. Much faster than copying the array. * Calculate command size using reflection * WIP parallel shaders * Some minor optimisation * Only 2 max refs per command now. The command with 3 refs is gone. :relieved: * Don't cast on the GPU side * Remove redundant casts, force sync on window present * Fix Shader Cache * Fix host shader save. * Fixup to work with new renderer stuff * Make command Run static, use array of delegates as lookup Profile says this takes less time than the previous way. * Bring up to date * Add settings toggle. Fix Muiltithreading Off mode. * Fix warning. * Release tracking lock for flushes * Fix Conditional Render fast path with threaded gal * Make handle iteration safe when releasing the lock This is mostly temporary. * Attempt to set backend threading on driver Only really works on nvidia before launching a game. * Fix race condition with BufferModifiedRangeList, exceptions in tracking actions * Update buffer set commands * Some cleanup * Only use stutter workaround when using opengl renderer non-threaded * Add host-conditional reservation of counter events There has always been the possibility that conditional rendering could use a query object just as it is disposed by the counter queue. This change makes it so that when the host decides to use host conditional rendering, the query object is reserved so that it cannot be deleted. Counter events can optionally start reserved, as the threaded implementation can reserve them before the backend creates them, and there would otherwise be a short amount of time where the counter queue could dispose the event before a call to reserve it could be made. * Address Feedback * Make counter flush tracked again. Hopefully does not cause any issues this time. * Wait for FlushTo on the main queue thread. Currently assumes only one thread will want to FlushTo (in this case, the GPU thread) * Add SDL2 headless integration * Add HLE macro commands. Co-authored-by: Mary <mary@mary.zone>
2021-08-26 22:31:29 +00:00
counter = _context.Renderer.ReportCounter(CounterType.SamplesPassed, resultHandler, false);
break;
case ReportCounterType.PrimitivesGenerated:
Add a Multithreading layer for the GAL, multi-thread shader compilation at runtime (#2501) * Initial Implementation About as fast as nvidia GL multithreading, can be improved with faster command queuing. * Struct based command list Speeds up a bit. Still a lot of time lost to resource copy. * Do shader init while the render thread is active. * Introduce circular span pool V1 Ideally should be able to use structs instead of references for storing these spans on commands. Will try that next. * Refactor SpanRef some more Use a struct to represent SpanRef, rather than a reference. * Flush buffers on background thread * Use a span for UpdateRenderScale. Much faster than copying the array. * Calculate command size using reflection * WIP parallel shaders * Some minor optimisation * Only 2 max refs per command now. The command with 3 refs is gone. :relieved: * Don't cast on the GPU side * Remove redundant casts, force sync on window present * Fix Shader Cache * Fix host shader save. * Fixup to work with new renderer stuff * Make command Run static, use array of delegates as lookup Profile says this takes less time than the previous way. * Bring up to date * Add settings toggle. Fix Muiltithreading Off mode. * Fix warning. * Release tracking lock for flushes * Fix Conditional Render fast path with threaded gal * Make handle iteration safe when releasing the lock This is mostly temporary. * Attempt to set backend threading on driver Only really works on nvidia before launching a game. * Fix race condition with BufferModifiedRangeList, exceptions in tracking actions * Update buffer set commands * Some cleanup * Only use stutter workaround when using opengl renderer non-threaded * Add host-conditional reservation of counter events There has always been the possibility that conditional rendering could use a query object just as it is disposed by the counter queue. This change makes it so that when the host decides to use host conditional rendering, the query object is reserved so that it cannot be deleted. Counter events can optionally start reserved, as the threaded implementation can reserve them before the backend creates them, and there would otherwise be a short amount of time where the counter queue could dispose the event before a call to reserve it could be made. * Address Feedback * Make counter flush tracked again. Hopefully does not cause any issues this time. * Wait for FlushTo on the main queue thread. Currently assumes only one thread will want to FlushTo (in this case, the GPU thread) * Add SDL2 headless integration * Add HLE macro commands. Co-authored-by: Mary <mary@mary.zone>
2021-08-26 22:31:29 +00:00
counter = _context.Renderer.ReportCounter(CounterType.PrimitivesGenerated, resultHandler, false);
break;
case ReportCounterType.TransformFeedbackPrimitivesWritten:
Add a Multithreading layer for the GAL, multi-thread shader compilation at runtime (#2501) * Initial Implementation About as fast as nvidia GL multithreading, can be improved with faster command queuing. * Struct based command list Speeds up a bit. Still a lot of time lost to resource copy. * Do shader init while the render thread is active. * Introduce circular span pool V1 Ideally should be able to use structs instead of references for storing these spans on commands. Will try that next. * Refactor SpanRef some more Use a struct to represent SpanRef, rather than a reference. * Flush buffers on background thread * Use a span for UpdateRenderScale. Much faster than copying the array. * Calculate command size using reflection * WIP parallel shaders * Some minor optimisation * Only 2 max refs per command now. The command with 3 refs is gone. :relieved: * Don't cast on the GPU side * Remove redundant casts, force sync on window present * Fix Shader Cache * Fix host shader save. * Fixup to work with new renderer stuff * Make command Run static, use array of delegates as lookup Profile says this takes less time than the previous way. * Bring up to date * Add settings toggle. Fix Muiltithreading Off mode. * Fix warning. * Release tracking lock for flushes * Fix Conditional Render fast path with threaded gal * Make handle iteration safe when releasing the lock This is mostly temporary. * Attempt to set backend threading on driver Only really works on nvidia before launching a game. * Fix race condition with BufferModifiedRangeList, exceptions in tracking actions * Update buffer set commands * Some cleanup * Only use stutter workaround when using opengl renderer non-threaded * Add host-conditional reservation of counter events There has always been the possibility that conditional rendering could use a query object just as it is disposed by the counter queue. This change makes it so that when the host decides to use host conditional rendering, the query object is reserved so that it cannot be deleted. Counter events can optionally start reserved, as the threaded implementation can reserve them before the backend creates them, and there would otherwise be a short amount of time where the counter queue could dispose the event before a call to reserve it could be made. * Address Feedback * Make counter flush tracked again. Hopefully does not cause any issues this time. * Wait for FlushTo on the main queue thread. Currently assumes only one thread will want to FlushTo (in this case, the GPU thread) * Add SDL2 headless integration * Add HLE macro commands. Co-authored-by: Mary <mary@mary.zone>
2021-08-26 22:31:29 +00:00
counter = _context.Renderer.ReportCounter(CounterType.TransformFeedbackPrimitivesWritten, resultHandler, false);
break;
}
_channel.MemoryManager.CounterCache.AddOrUpdate(gpuVa, counter);
}
/// <summary>
/// Converts a nanoseconds timestamp value to Maxwell time ticks.
/// </summary>
/// <remarks>
/// The frequency is 614400000 Hz.
/// </remarks>
/// <param name="nanoseconds">Timestamp in nanoseconds</param>
/// <returns>Maxwell ticks</returns>
private static ulong ConvertNanosecondsToTicks(ulong nanoseconds)
{
// We need to divide first to avoid overflows.
// We fix up the result later by calculating the difference and adding
// that to the result.
ulong divided = nanoseconds / NsToTicksFractionDenominator;
ulong rounded = divided * NsToTicksFractionDenominator;
ulong errorBias = (nanoseconds - rounded) * NsToTicksFractionNumerator / NsToTicksFractionDenominator;
return divided * NsToTicksFractionNumerator + errorBias;
}
}
}