diff --git a/Ryujinx.Graphics.Nvdec.Vp9/DecodeFrame.cs b/Ryujinx.Graphics.Nvdec.Vp9/DecodeFrame.cs index 9e267376..2963f7cf 100644 --- a/Ryujinx.Graphics.Nvdec.Vp9/DecodeFrame.cs +++ b/Ryujinx.Graphics.Nvdec.Vp9/DecodeFrame.cs @@ -1,13 +1,14 @@ using Ryujinx.Common.Memory; +using Ryujinx.Graphics.Nvdec.Vp9.Common; +using Ryujinx.Graphics.Nvdec.Vp9.Dsp; +using Ryujinx.Graphics.Nvdec.Vp9.Types; +using Ryujinx.Graphics.Video; using System; using System.Buffers.Binary; using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using Ryujinx.Graphics.Nvdec.Vp9.Common; -using Ryujinx.Graphics.Nvdec.Vp9.Dsp; -using Ryujinx.Graphics.Nvdec.Vp9.Types; -using Ryujinx.Graphics.Video; +using System.Threading.Tasks; using Mv = Ryujinx.Graphics.Nvdec.Vp9.Types.Mv; namespace Ryujinx.Graphics.Nvdec.Vp9 @@ -1095,6 +1096,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 data = data.Slice(size); } + /* Single-row overload: fills tileBuffers[0 .. tileCols - 1] by storing each column index in Col and calling GetTileBuffer on the entry; only the final column (c == tileCols - 1) is flagged as the last tile. Errors are reported through cm.Error, which is handed to GetTileBuffer. */ private static void GetTileBuffers(ref Vp9Common cm, ArrayPtr data, int tileCols, ref Array64 tileBuffers) + { + int c; + + for (c = 0; c < tileCols; ++c) + { + bool isLast = c == tileCols - 1; + ref TileBuffer buf = ref tileBuffers[c]; + buf.Col = c; + GetTileBuffer(isLast, ref cm.Error, ref data, ref buf); + } + } + private static void GetTileBuffers( ref Vp9Common cm, ArrayPtr data, @@ -1181,5 +1195,163 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 // Get last tile data. 
return cm.TileWorkerData[tileCols * tileRows - 1].BitReader.FindEnd(); } + + /* Worker routine: decodes the tile buffers with indices tileData.BufStart through tileData.BufEnd in order, stopping after the first buffer that leaves tileData.Xd.Corrupted set. tileData.DataEnd receives the bit reader end position taken after the buffer whose Col equals the last tile column, or ArrayPtr.Null when this call did not decode that column. Returns true when Xd.Corrupted is still false on exit. */ private static bool DecodeTileCol(ref TileWorkerData tileData, ref Vp9Common cm, ref Array64 tileBuffers) + { + ref TileInfo tile = ref tileData.Xd.Tile; + int finalCol = (1 << cm.Log2TileCols) - 1; + ArrayPtr bitReaderEnd = ArrayPtr.Null; + + int n = tileData.BufStart; + + tileData.Xd.Corrupted = false; + + do + { + ref TileBuffer buf = ref tileBuffers[n]; + + Debug.Assert(cm.Log2TileRows == 0); + /* Per-buffer setup: clear the coefficient storage, bind the tile to row 0 / column buf.Col, and point the bit reader at this buffer's data. */ tileData.Dqcoeff = new Array32>(); + tile.Init(ref cm, 0, buf.Col); + SetupTokenDecoder(buf.Data, buf.Size, ref tileData.ErrorInfo, ref tileData.BitReader); + cm.InitMacroBlockD(ref tileData.Xd, new ArrayPtr(ref tileData.Dqcoeff[0][0], 32 * 32)); + tileData.Xd.ErrorInfo = new Ptr(ref tileData.ErrorInfo); + + for (int miRow = tile.MiRowStart; miRow < tile.MiRowEnd; miRow += Constants.MiBlockSize) + { + /* The left contexts are reset at the start of every row of 64x64 blocks. */ tileData.Xd.LeftContext = new Array3>(); + tileData.Xd.LeftSegContext = new Array8(); + for (int miCol = tile.MiColStart; miCol < tile.MiColEnd; miCol += Constants.MiBlockSize) + { + DecodePartition(ref tileData, ref cm, miRow, miCol, BlockSize.Block64x64, 4); + } + } + + /* Only the buffer for the last tile column records the reader end position. */ if (buf.Col == finalCol) + { + bitReaderEnd = tileData.BitReader.FindEnd(); + } + } while (!tileData.Xd.Corrupted && ++n <= tileData.BufEnd); + + tileData.DataEnd = bitReaderEnd; + return !tileData.Xd.Corrupted; + } + + /* Multi-threaded tile decode for frames with exactly one tile row (asserted below). Uses min(maxThreads, tileCols) workers whose TileWorkerData slots are located after the first totalTiles entries of cm.TileWorkerData. The tile buffers are sorted by size (largest first), rearranged, split into one contiguous range per worker, decoded via Parallel.For, and each worker's counts are then added into cm.Counts. cm.Mb.Corrupted is set when any worker reports failure. Returns the DataEnd of the highest-indexed worker that produced a non-null value, or ArrayPtr.Null if none did. NOTE(review): exceptions thrown inside a Parallel.For body are rethrown wrapped in AggregateException; the caller in Decoder.cs catches InternalErrorException only -- confirm that worker failures are still handled as intended. */ public static unsafe ArrayPtr DecodeTilesMt(ref Vp9Common cm, ArrayPtr data, int maxThreads) + { + ArrayPtr bitReaderEnd = ArrayPtr.Null; + + int tileCols = 1 << cm.Log2TileCols; + int tileRows = 1 << cm.Log2TileRows; + int totalTiles = tileCols * tileRows; + int numWorkers = Math.Min(maxThreads, tileCols); + int n; + + Debug.Assert(tileCols <= (1 << 6)); + Debug.Assert(tileRows == 1); + + cm.AboveContext.ToSpan().Fill(0); + cm.AboveSegContext.ToSpan().Fill(0); + + /* Start every worker's Xd from cm.Mb, then redirect its Counts pointer to the worker's own Counts field, which is reset. */ for (n = 0; n < numWorkers; ++n) + { + ref TileWorkerData tileData = ref cm.TileWorkerData[n + totalTiles]; + + tileData.Xd = cm.Mb; 
tileData.Xd.Counts = new Ptr(ref tileData.Counts); + tileData.Counts = new Vp9BackwardUpdates(); + } + + Array64 tileBuffers = new Array64(); + + GetTileBuffers(ref cm, data, tileCols, ref tileBuffers); + + /* Order the tileCols used entries by Size, largest first (CompareTileBuffers sorts descending). */ tileBuffers.ToSpan().Slice(0, tileCols).Sort(CompareTileBuffers); + + if (numWorkers == tileCols) + { + /* One buffer per worker: shift everything down by one slot and place the largest buffer at the end of the used range. */ TileBuffer largest = tileBuffers[0]; + Span buffers = tileBuffers.ToSpan(); + buffers.Slice(1).CopyTo(buffers.Slice(0, tileBuffers.Length - 1)); + tileBuffers[tileCols - 1] = largest; + } + else + { + int start = 0, end = tileCols - 2; + TileBuffer tmp; + + // Interleave the tiles to distribute the load between threads, assuming a + // larger tile implies it is more difficult to decode. + while (start < end) + { + tmp = tileBuffers[start]; + tileBuffers[start] = tileBuffers[end]; + tileBuffers[end] = tmp; + start += 2; + end -= 2; + } + } + + /* Split the buffers into contiguous [BufStart, BufEnd] ranges, one per worker; the (remain + n) / numWorkers term gives one extra buffer to each of the last `remain` workers. */ int baseVal = tileCols / numWorkers; + int remain = tileCols % numWorkers; + int bufStart = 0; + + for (n = 0; n < numWorkers; ++n) + { + int count = baseVal + (remain + n) / numWorkers; + ref TileWorkerData tileData = ref cm.TileWorkerData[n + totalTiles]; + + tileData.BufStart = bufStart; + tileData.BufEnd = bufStart + count - 1; + tileData.DataEnd = data.Slice(data.Length); + bufStart += count; + } + + /* A ref parameter cannot be captured by a lambda, so the workers reach cm through cmPtr instead. */ Ptr cmPtr = new Ptr(ref cm); + + Parallel.For(0, numWorkers, (n) => + { + ref TileWorkerData tileData = ref cmPtr.Value.TileWorkerData[n + totalTiles]; + + if (!DecodeTileCol(ref tileData, ref cmPtr.Value, ref tileBuffers)) + { + cmPtr.Value.Mb.Corrupted = true; + } + }); + + /* The outer n still equals numWorkers from the loop above (the lambda's n is a separate parameter). Walk the workers from last to first and keep the first non-null DataEnd. */ for (; n > 0; --n) + { + if (bitReaderEnd.IsNull) + { + ref TileWorkerData tileData = ref cm.TileWorkerData[n - 1 + totalTiles]; + bitReaderEnd = tileData.DataEnd; + } + } + + /* Add every worker's counts into cm.Counts. */ for (n = 0; n < numWorkers; ++n) + { + ref TileWorkerData tileData = ref cm.TileWorkerData[n + totalTiles]; + AccumulateFrameCounts(ref cm.Counts.Value, ref tileData.Counts); + } + + Debug.Assert(!bitReaderEnd.IsNull || cm.Mb.Corrupted); + return bitReaderEnd; + } + + /* Comparison used to sort tile buffers by Size in descending order: returns 1 when bufA is smaller than bufB, -1 when it is larger, and 0 when the sizes are equal. */ private 
static int CompareTileBuffers(TileBuffer bufA, TileBuffer bufB) + { + return (bufA.Size < bufB.Size ? 1 : 0) - (bufA.Size > bufB.Size ? 1 : 0); + } + + /* Adds counts into accum element by element, viewing both structures as flat spans obtained through MemoryMarshal.Cast; the loop bound is the length of the accum span. */ private static void AccumulateFrameCounts(ref Vp9BackwardUpdates accum, ref Vp9BackwardUpdates counts) + { + Span a = MemoryMarshal.Cast(MemoryMarshal.CreateSpan(ref accum, 1)); + Span c = MemoryMarshal.Cast(MemoryMarshal.CreateSpan(ref counts, 1)); + + for (int i = 0; i < a.Length; i++) + { + a[i] += c[i]; + } + } } } diff --git a/Ryujinx.Graphics.Nvdec.Vp9/Decoder.cs b/Ryujinx.Graphics.Nvdec.Vp9/Decoder.cs index ff4221ac..f82ca761 100644 --- a/Ryujinx.Graphics.Nvdec.Vp9/Decoder.cs +++ b/Ryujinx.Graphics.Nvdec.Vp9/Decoder.cs @@ -92,7 +92,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 cm.Mb.SetupBlockPlanes(1, 1); - cm.AllocTileWorkerData(_allocator, 1 << pictureInfo.Log2TileCols, 1 << pictureInfo.Log2TileRows); + int tileCols = 1 << pictureInfo.Log2TileCols; + int tileRows = 1 << pictureInfo.Log2TileRows; + + // Videos usually have only 4 columns, so more threads won't make a difference for those. + // Try not to take all CPU cores for video decoding. 
+ int maxThreads = Math.Min(4, Environment.ProcessorCount / 2); + + cm.AllocTileWorkerData(_allocator, tileCols, tileRows, maxThreads); cm.AllocContextBuffers(_allocator, output.Width, output.Height); cm.InitContextBuffers(); cm.SetupSegmentationDequant(); @@ -104,7 +111,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { try { - DecodeFrame.DecodeTiles(ref cm, new ArrayPtr(dataPtr, bitstream.Length)); + if (maxThreads > 1 && tileRows == 1 && tileCols > 1) + { + DecodeFrame.DecodeTilesMt(ref cm, new ArrayPtr(dataPtr, bitstream.Length), maxThreads); + } + else + { + DecodeFrame.DecodeTiles(ref cm, new ArrayPtr(dataPtr, bitstream.Length)); + } } catch (InternalErrorException) { diff --git a/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs b/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs index b4ad4344..e41a31ca 100644 --- a/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs +++ b/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs @@ -87,6 +87,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp return rv; } + [SkipLocalsInit] public static void Iwht4x416Add(ReadOnlySpan input, Span dest, int stride) { /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, @@ -142,6 +143,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } } + [SkipLocalsInit] public static void Iwht4x41Add(ReadOnlySpan input, Span dest, int stride) { int i; @@ -209,6 +211,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp output[3] = WrapLow(DctConstRoundShift(s0 + s1 - s3)); } + [SkipLocalsInit] public static void Idct4(ReadOnlySpan input, Span output) { Span step = stackalloc short[4]; @@ -231,6 +234,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp output[3] = WrapLow(step[0] - step[3]); } + [SkipLocalsInit] public static void Idct4x416Add(ReadOnlySpan input, Span dest, int stride) { int i, j; @@ -359,6 +363,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp output[7] = WrapLow(-x1); } + [SkipLocalsInit] public static void Idct8(ReadOnlySpan input, Span output) { Span step1 = stackalloc short[8]; @@ -416,6 +421,7 @@ namespace 
Ryujinx.Graphics.Nvdec.Vp9.Dsp output[7] = WrapLow(step1[0] - step1[7]); } + [SkipLocalsInit] public static void Idct8x864Add(ReadOnlySpan input, Span dest, int stride) { int i, j; @@ -449,6 +455,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } } + [SkipLocalsInit] public static void Idct8x812Add(ReadOnlySpan input, Span dest, int stride) { int i, j; @@ -457,6 +464,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Span tempIn = stackalloc int[8]; Span tempOut = stackalloc int[8]; + output.Fill(0); + // First transform rows // Only first 4 row has non-zero coefs for (i = 0; i < 4; ++i) @@ -671,6 +680,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp output[15] = WrapLow(-x1); } + [SkipLocalsInit] public static void Idct16(ReadOnlySpan input, Span output) { Span step1 = stackalloc short[16]; @@ -838,6 +848,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp output[15] = WrapLow(step2[0] - step2[15]); } + [SkipLocalsInit] public static void Idct16x16256Add(ReadOnlySpan input, Span dest, int stride) { int i, j; @@ -870,6 +881,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } } + [SkipLocalsInit] public static void Idct16x1638Add(ReadOnlySpan input, Span dest, int stride) { int i, j; @@ -878,6 +890,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Span tempIn = stackalloc int[16]; Span tempOut = stackalloc int[16]; + output.Fill(0); + // First transform rows. Since all non-zero dct coefficients are in // upper-left 8x8 area, we only need to calculate first 8 rows here. for (i = 0; i < 8; ++i) @@ -903,6 +917,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } } + [SkipLocalsInit] public static void Idct16x1610Add(ReadOnlySpan input, Span dest, int stride) { int i, j; @@ -911,6 +926,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Span tempIn = stackalloc int[16]; Span tempOut = stackalloc int[16]; + output.Fill(0); + // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. 
for (i = 0; i < 4; ++i) @@ -955,6 +972,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } } + [SkipLocalsInit] public static void Idct32(ReadOnlySpan input, Span output) { Span step1 = stackalloc short[32]; @@ -1324,6 +1342,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp output[31] = WrapLow(step1[0] - step1[31]); } + [SkipLocalsInit] public static void Idct32x321024Add(ReadOnlySpan input, Span dest, int stride) { int i, j; @@ -1370,6 +1389,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } } + [SkipLocalsInit] public static void Idct32x32135Add(ReadOnlySpan input, Span dest, int stride) { int i, j; @@ -1378,6 +1398,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Span tempIn = stackalloc int[32]; Span tempOut = stackalloc int[32]; + output.Fill(0); + // Rows // Only upper-left 16x16 has non-zero coeff for (i = 0; i < 16; ++i) @@ -1403,6 +1425,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } } + [SkipLocalsInit] public static void Idct32x3234Add(ReadOnlySpan input, Span dest, int stride) { int i, j; @@ -1411,6 +1434,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Span tempIn = stackalloc int[32]; Span tempOut = stackalloc int[32]; + output.Fill(0); + // Rows // Only upper-left 8x8 has non-zero coeff for (i = 0; i < 8; ++i) @@ -1456,6 +1481,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } } + [SkipLocalsInit] public static void HighbdIwht4x416Add(ReadOnlySpan input, Span dest, int stride, int bd) { /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, @@ -1511,6 +1537,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } } + [SkipLocalsInit] public static void HighbdIwht4x41Add(ReadOnlySpan input, Span dest, int stride, int bd) { int i; @@ -1584,6 +1611,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp output[3] = HighbdWrapLow(DctConstRoundShift(s0 + s1 - s3), bd); } + [SkipLocalsInit] public static void HighbdIdct4(ReadOnlySpan input, Span output, int bd) { Span step = stackalloc int[4]; @@ -1613,6 +1641,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp output[3] = 
HighbdWrapLow(step[0] - step[3], bd); } + [SkipLocalsInit] public static void HighbdIdct4x416Add(ReadOnlySpan input, Span dest, int stride, int bd) { int i, j; @@ -1748,6 +1777,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp output[7] = HighbdWrapLow(-x1, bd); } + [SkipLocalsInit] public static void HighbdIdct8(ReadOnlySpan input, Span output, int bd) { Span step1 = stackalloc int[8]; @@ -1803,6 +1833,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp output[7] = HighbdWrapLow(step1[0] - step1[7], bd); } + [SkipLocalsInit] public static void HighbdIdct8x864Add(ReadOnlySpan input, Span dest, int stride, int bd) { int i, j; @@ -1835,6 +1866,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } } + [SkipLocalsInit] public static void HighbdIdct8x812Add(ReadOnlySpan input, Span dest, int stride, int bd) { int i, j; @@ -1843,6 +1875,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Span tempIn = stackalloc int[8]; Span tempOut = stackalloc int[8]; + output.Fill(0); + // First transform rows // Only first 4 row has non-zero coefs for (i = 0; i < 4; ++i) @@ -2062,6 +2096,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp output[15] = HighbdWrapLow(-x1, bd); } + [SkipLocalsInit] public static void HighbdIdct16(ReadOnlySpan input, Span output, int bd) { Span step1 = stackalloc int[16]; @@ -2236,6 +2271,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp output[15] = HighbdWrapLow(step2[0] - step2[15], bd); } + [SkipLocalsInit] public static void HighbdIdct16x16256Add(ReadOnlySpan input, Span dest, int stride, int bd) { int i, j; @@ -2268,6 +2304,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } } + [SkipLocalsInit] public static void HighbdIdct16x1638Add(ReadOnlySpan input, Span dest, int stride, int bd) { int i, j; @@ -2276,6 +2313,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Span tempIn = stackalloc int[16]; Span tempOut = stackalloc int[16]; + output.Fill(0); + // First transform rows. Since all non-zero dct coefficients are in // upper-left 8x8 area, we only need to calculate first 8 rows here. 
for (i = 0; i < 8; ++i) @@ -2303,6 +2342,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } } + [SkipLocalsInit] public static void HighbdIdct16x1610Add(ReadOnlySpan input, Span dest, int stride, int bd) { int i, j; @@ -2311,6 +2351,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Span tempIn = stackalloc int[16]; Span tempOut = stackalloc int[16]; + output.Fill(0); + // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. for (i = 0; i < 4; ++i) @@ -2355,6 +2397,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } } + [SkipLocalsInit] public static void HighbdIdct32(ReadOnlySpan input, Span output, int bd) { Span step1 = stackalloc int[32]; @@ -2539,7 +2582,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step2[8] = step1[8]; step2[15] = step1[15]; temp1 = -step1[9] * (long)CosPi8_64 + step1[14] * (long)CosPi24_64; - temp2 = step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64; + temp2 = step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64; step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd); temp1 = -step1[10] * (long)CosPi24_64 - step1[13] * (long)CosPi8_64; @@ -2731,6 +2774,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp output[31] = HighbdWrapLow(step1[0] - step1[31], bd); } + [SkipLocalsInit] public static void HighbdIdct32x321024Add(ReadOnlySpan input, Span dest, int stride, int bd) { int i, j; @@ -2777,6 +2821,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } } + [SkipLocalsInit] public static void HighbdIdct32x32135Add(ReadOnlySpan input, Span dest, int stride, int bd) { int i, j; @@ -2785,6 +2830,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Span tempIn = stackalloc int[32]; Span tempOut = stackalloc int[32]; + output.Fill(0); + // Rows // Only upper-left 16x16 has non-zero coeff for (i = 0; i < 16; ++i) @@ -2812,6 +2859,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } } + [SkipLocalsInit] public static void 
HighbdIdct32x3234Add(ReadOnlySpan input, Span dest, int stride, int bd) { int i, j; @@ -2820,6 +2868,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Span tempIn = stackalloc int[32]; Span tempOut = stackalloc int[32]; + output.Fill(0); + // Rows // Only upper-left 8x8 has non-zero coeff for (i = 0; i < 8; ++i) diff --git a/Ryujinx.Graphics.Nvdec.Vp9/TileBuffer.cs b/Ryujinx.Graphics.Nvdec.Vp9/TileBuffer.cs index 3b60889b..c5a25e6b 100644 --- a/Ryujinx.Graphics.Nvdec.Vp9/TileBuffer.cs +++ b/Ryujinx.Graphics.Nvdec.Vp9/TileBuffer.cs @@ -4,6 +4,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { /* Describes one tile's compressed data: its tile column index (Col), where the data starts (Data) and its length (Size). */ internal struct TileBuffer { + /* Tile column index; kept with the buffer so the column is still known after the buffers are reordered. */ public int Col; public ArrayPtr Data; public int Size; } diff --git a/Ryujinx.Graphics.Nvdec.Vp9/TileWorkerData.cs b/Ryujinx.Graphics.Nvdec.Vp9/TileWorkerData.cs index 40557274..333a077a 100644 --- a/Ryujinx.Graphics.Nvdec.Vp9/TileWorkerData.cs +++ b/Ryujinx.Graphics.Nvdec.Vp9/TileWorkerData.cs @@ -1,14 +1,20 @@ using Ryujinx.Common.Memory; using Ryujinx.Graphics.Nvdec.Vp9.Dsp; using Ryujinx.Graphics.Nvdec.Vp9.Types; +using Ryujinx.Graphics.Video; namespace Ryujinx.Graphics.Nvdec.Vp9 { /* Per-tile / per-worker decoding state. */ internal struct TileWorkerData { + /* Bit reader end position reported by the worker; ArrayPtr.Null when the worker did not decode the last tile column. */ public ArrayPtr DataEnd; + /* First tile buffer index assigned to this worker (inclusive). */ public int BufStart; + /* Last tile buffer index assigned to this worker (inclusive). */ public int BufEnd; public Reader BitReader; + /* Worker-local counts, added into the frame-level counts after decoding. */ public Vp9BackwardUpdates Counts; public MacroBlockD Xd; /* dqcoeff is shared by all the planes, 
so planes must be decoded serially. */ public Array32> Dqcoeff; + /* Worker-local error info; Xd.ErrorInfo is pointed at it during multi-threaded decoding. */ public InternalErrorInfo ErrorInfo; } } diff --git a/Ryujinx.Graphics.Nvdec.Vp9/Types/Vp9Common.cs b/Ryujinx.Graphics.Nvdec.Vp9/Types/Vp9Common.cs index 4ca05954..faadd349 100644 --- a/Ryujinx.Graphics.Nvdec.Vp9/Types/Vp9Common.cs +++ b/Ryujinx.Graphics.Nvdec.Vp9/Types/Vp9Common.cs @@ -127,9 +127,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types MBs = MbRows * MbCols; } - public void AllocTileWorkerData(MemoryAllocator allocator, int tileCols, int tileRows) + /* Allocates tileCols * tileRows TileWorkerData entries, plus maxThreads additional entries when maxThreads > 1 (none otherwise). */ public void AllocTileWorkerData(MemoryAllocator allocator, int tileCols, int tileRows, int maxThreads) { - TileWorkerData = allocator.Allocate(tileCols * tileRows); + TileWorkerData = allocator.Allocate(tileCols * tileRows + (maxThreads > 1 ? maxThreads : 0)); } public void FreeTileWorkerData(MemoryAllocator allocator)