From 42c75dbb8f9472f434d0324a37a87e91ee7b50f3 Mon Sep 17 00:00:00 2001 From: gdkchan Date: Sat, 22 Jan 2022 15:23:00 -0300 Subject: [PATCH] Add support for BC1/2/3 decompression (for 3D textures) (#2987) * Add support for BC1/2/3 decompression (for 3D textures) * Optimize and clean up * Unsafe not needed here * Fix alpha value interpolation when a0 <= a1 --- Ryujinx.Graphics.GAL/Capabilities.cs | 37 +- Ryujinx.Graphics.GAL/Format.cs | 22 - Ryujinx.Graphics.Gpu/GpuContext.cs | 19 +- Ryujinx.Graphics.Gpu/Image/Texture.cs | 30 +- .../Image/TextureCompatibility.cs | 18 +- Ryujinx.Graphics.OpenGL/FormatTable.cs | 2 - Ryujinx.Graphics.OpenGL/Renderer.cs | 1 + Ryujinx.Graphics.Texture/BCnDecoder.cs | 734 ++++++++++++++++-- 8 files changed, 720 insertions(+), 143 deletions(-) diff --git a/Ryujinx.Graphics.GAL/Capabilities.cs b/Ryujinx.Graphics.GAL/Capabilities.cs index c7cedb34..4e5dff59 100644 --- a/Ryujinx.Graphics.GAL/Capabilities.cs +++ b/Ryujinx.Graphics.GAL/Capabilities.cs @@ -2,30 +2,32 @@ namespace Ryujinx.Graphics.GAL { public struct Capabilities { - public bool HasFrontFacingBug { get; } - public bool HasVectorIndexingBug { get; } + public readonly bool HasFrontFacingBug; + public readonly bool HasVectorIndexingBug; - public bool SupportsAstcCompression { get; } - public bool SupportsBgraFormat { get; } - public bool SupportsR4G4Format { get; } - public bool SupportsFragmentShaderInterlock { get; } - public bool SupportsFragmentShaderOrderingIntel { get; } - public bool SupportsImageLoadFormatted { get; } - public bool SupportsMismatchingViewFormat { get; } - public bool SupportsNonConstantTextureOffset { get; } - public bool SupportsShaderBallot { get; } - public bool SupportsTextureShadowLod { get; } - public bool SupportsViewportSwizzle { get; } - public bool SupportsIndirectParameters { get; } + public readonly bool SupportsAstcCompression; + public readonly bool Supports3DTextureCompression; + public readonly bool SupportsBgraFormat; + public readonly bool SupportsR4G4Format; + public readonly bool SupportsFragmentShaderInterlock; + public readonly bool SupportsFragmentShaderOrderingIntel; + public readonly bool SupportsImageLoadFormatted; + public readonly bool SupportsMismatchingViewFormat; + public readonly bool SupportsNonConstantTextureOffset; + public readonly bool SupportsShaderBallot; + public readonly bool SupportsTextureShadowLod; + public readonly bool SupportsViewportSwizzle; + public readonly bool SupportsIndirectParameters; - public int MaximumComputeSharedMemorySize { get; } - public float MaximumSupportedAnisotropy { get; } - public int StorageBufferOffsetAlignment { get; } + public readonly int MaximumComputeSharedMemorySize; + public readonly float MaximumSupportedAnisotropy; + public readonly int StorageBufferOffsetAlignment; public Capabilities( bool hasFrontFacingBug, bool hasVectorIndexingBug, bool supportsAstcCompression, + bool supports3DTextureCompression, bool supportsBgraFormat, bool supportsR4G4Format, bool supportsFragmentShaderInterlock, @@ -44,6 +46,7 @@ namespace Ryujinx.Graphics.GAL HasFrontFacingBug = hasFrontFacingBug; HasVectorIndexingBug = hasVectorIndexingBug; SupportsAstcCompression = supportsAstcCompression; + Supports3DTextureCompression = supports3DTextureCompression; SupportsBgraFormat = supportsBgraFormat; SupportsR4G4Format = supportsR4G4Format; SupportsFragmentShaderInterlock = supportsFragmentShaderInterlock; diff --git a/Ryujinx.Graphics.GAL/Format.cs b/Ryujinx.Graphics.GAL/Format.cs index d5e183ba..a454413b 100644 --- a/Ryujinx.Graphics.GAL/Format.cs +++ b/Ryujinx.Graphics.GAL/Format.cs @@ -67,11 +67,9 @@ namespace Ryujinx.Graphics.GAL R10G10B10A2Uint, R11G11B10Float, R9G9B9E5Float, - Bc1RgbUnorm, Bc1RgbaUnorm, Bc2Unorm, Bc3Unorm, - Bc1RgbSrgb, Bc1RgbaSrgb, Bc2Srgb, Bc3Srgb, @@ -349,25 +347,5 @@ namespace Ryujinx.Graphics.GAL { return format.IsUint() || format.IsSint(); } - - /// - /// Checks if the texture format is a BC4 compressed format. - /// - /// Texture format - /// True if the texture format is a BC4 compressed format, false otherwise - public static bool IsBc4(this Format format) - { - return format == Format.Bc4Unorm || format == Format.Bc4Snorm; - } - - /// - /// Checks if the texture format is a BC5 compressed format. - /// - /// Texture format - /// True if the texture format is a BC5 compressed format, false otherwise - public static bool IsBc5(this Format format) - { - return format == Format.Bc5Unorm || format == Format.Bc5Snorm; - } } } \ No newline at end of file diff --git a/Ryujinx.Graphics.Gpu/GpuContext.cs b/Ryujinx.Graphics.Gpu/GpuContext.cs index 5c9af383..ddc95b2c 100644 --- a/Ryujinx.Graphics.Gpu/GpuContext.cs +++ b/Ryujinx.Graphics.Gpu/GpuContext.cs @@ -78,14 +78,27 @@ namespace Ryujinx.Graphics.Gpu /// /// Host hardware capabilities. /// - internal Capabilities Capabilities => _caps.Value; + internal ref Capabilities Capabilities + { + get + { + if (!_capsLoaded) + { + _caps = Renderer.GetCapabilities(); + _capsLoaded = true; + } + + return ref _caps; + } + } /// /// Event for signalling shader cache loading progress. /// public event Action ShaderCacheStateChanged; - private readonly Lazy _caps; + private bool _capsLoaded; + private Capabilities _caps; private Thread _gpuThread; /// @@ -110,8 +123,6 @@ namespace Ryujinx.Graphics.Gpu DeferredActions = new Queue(); PhysicalMemoryRegistry = new ConcurrentDictionary(); - - _caps = new Lazy(Renderer.GetCapabilities); } /// diff --git a/Ryujinx.Graphics.Gpu/Image/Texture.cs b/Ryujinx.Graphics.Gpu/Image/Texture.cs index b2fa15a2..e1f00606 100644 --- a/Ryujinx.Graphics.Gpu/Image/Texture.cs +++ b/Ryujinx.Graphics.Gpu/Image/Texture.cs @@ -834,13 +834,31 @@ namespace Ryujinx.Graphics.Gpu.Image { data = PixelConverter.ConvertR4G4ToR4G4B4A4(data); } - else if (Target == Target.Texture3D && Format.IsBc4()) + else if (!_context.Capabilities.Supports3DTextureCompression && Target == Target.Texture3D) { - data = BCnDecoder.DecodeBC4(data, width, height, depth, levels, layers, Info.FormatInfo.Format == Format.Bc4Snorm); - } - else if (Target == Target.Texture3D && Format.IsBc5()) - { - data = BCnDecoder.DecodeBC5(data, width, height, depth, levels, layers, Info.FormatInfo.Format == Format.Bc5Snorm); + switch (Format) + { + case Format.Bc1RgbaSrgb: + case Format.Bc1RgbaUnorm: + data = BCnDecoder.DecodeBC1(data, width, height, depth, levels, layers); + break; + case Format.Bc2Srgb: + case Format.Bc2Unorm: + data = BCnDecoder.DecodeBC2(data, width, height, depth, levels, layers); + break; + case Format.Bc3Srgb: + case Format.Bc3Unorm: + data = BCnDecoder.DecodeBC3(data, width, height, depth, levels, layers); + break; + case Format.Bc4Snorm: + case Format.Bc4Unorm: + data = BCnDecoder.DecodeBC4(data, width, height, depth, levels, layers, Format == Format.Bc4Snorm); + break; + case Format.Bc5Snorm: + case Format.Bc5Unorm: + data = BCnDecoder.DecodeBC5(data, width, height, depth, levels, layers, Format == Format.Bc5Snorm); + break; + } } return data; diff --git a/Ryujinx.Graphics.Gpu/Image/TextureCompatibility.cs b/Ryujinx.Graphics.Gpu/Image/TextureCompatibility.cs index 0461a81f..188e1e09 100644 --- a/Ryujinx.Graphics.Gpu/Image/TextureCompatibility.cs +++ b/Ryujinx.Graphics.Gpu/Image/TextureCompatibility.cs @@ -14,9 +14,6 @@ namespace Ryujinx.Graphics.Gpu.Image private enum FormatClass { Unclassified, - BCn64, - BCn128, - Bc1Rgb, Bc1Rgba, Bc2, Bc3, @@ -88,13 +85,21 @@ namespace Ryujinx.Graphics.Gpu.Image return new FormatInfo(Format.R4G4B4A4Unorm, 1, 1, 2, 4); } - if (info.Target == Target.Texture3D) + if (!caps.Supports3DTextureCompression && info.Target == Target.Texture3D) { - // The host API does not support 3D BC4/BC5 compressed formats. + // The host API does not support 3D compressed formats. // We assume software decompression will be done for those textures, // and so we adjust the format here to match the decompressor output. switch (info.FormatInfo.Format) { + case Format.Bc1RgbaSrgb: + case Format.Bc2Srgb: + case Format.Bc3Srgb: + return new FormatInfo(Format.R8G8B8A8Srgb, 1, 1, 4, 4); + case Format.Bc1RgbaUnorm: + case Format.Bc2Unorm: + case Format.Bc3Unorm: + return new FormatInfo(Format.R8G8B8A8Unorm, 1, 1, 4, 4); case Format.Bc4Unorm: return new FormatInfo(Format.R8Unorm, 1, 1, 1, 1); case Format.Bc4Snorm: @@ -749,9 +754,6 @@ namespace Ryujinx.Graphics.Gpu.Image { switch (format) { - case Format.Bc1RgbSrgb: - case Format.Bc1RgbUnorm: - return FormatClass.Bc1Rgb; case Format.Bc1RgbaSrgb: case Format.Bc1RgbaUnorm: return FormatClass.Bc1Rgba; diff --git a/Ryujinx.Graphics.OpenGL/FormatTable.cs b/Ryujinx.Graphics.OpenGL/FormatTable.cs index e3249cd6..41fd9f37 100644 --- a/Ryujinx.Graphics.OpenGL/FormatTable.cs +++ b/Ryujinx.Graphics.OpenGL/FormatTable.cs @@ -80,11 +80,9 @@ namespace Ryujinx.Graphics.OpenGL Add(Format.R10G10B10A2Uint, new FormatInfo(4, false, false, All.Rgb10A2ui, PixelFormat.RgbaInteger, PixelType.UnsignedInt2101010Reversed)); Add(Format.R11G11B10Float, new FormatInfo(3, false, false, All.R11fG11fB10f, PixelFormat.Rgb, PixelType.UnsignedInt10F11F11FRev)); Add(Format.R9G9B9E5Float, new FormatInfo(3, false, false, All.Rgb9E5, PixelFormat.Rgb, PixelType.UnsignedInt5999Rev)); - Add(Format.Bc1RgbUnorm, new FormatInfo(3, true, false, All.CompressedRgbS3tcDxt1Ext)); Add(Format.Bc1RgbaUnorm, new FormatInfo(4, true, false, All.CompressedRgbaS3tcDxt1Ext)); Add(Format.Bc2Unorm, new FormatInfo(4, true, false, All.CompressedRgbaS3tcDxt3Ext)); Add(Format.Bc3Unorm, new FormatInfo(4, true, false, All.CompressedRgbaS3tcDxt5Ext)); - Add(Format.Bc1RgbSrgb, new FormatInfo(3, false, false, All.CompressedSrgbS3tcDxt1Ext)); Add(Format.Bc1RgbaSrgb, new FormatInfo(4, true, false, All.CompressedSrgbAlphaS3tcDxt1Ext)); Add(Format.Bc2Srgb, new FormatInfo(4, false, false, All.CompressedSrgbAlphaS3tcDxt3Ext)); Add(Format.Bc3Srgb, new FormatInfo(4, false, false, All.CompressedSrgbAlphaS3tcDxt5Ext)); diff --git a/Ryujinx.Graphics.OpenGL/Renderer.cs b/Ryujinx.Graphics.OpenGL/Renderer.cs index ceacbf29..8d44f2e4 100644 --- a/Ryujinx.Graphics.OpenGL/Renderer.cs +++ b/Ryujinx.Graphics.OpenGL/Renderer.cs @@ -104,6 +104,7 @@ namespace Ryujinx.Graphics.OpenGL hasFrontFacingBug: HwCapabilities.Vendor == HwCapabilities.GpuVendor.IntelWindows, hasVectorIndexingBug: HwCapabilities.Vendor == HwCapabilities.GpuVendor.AmdWindows, supportsAstcCompression: HwCapabilities.SupportsAstcCompression, + supports3DTextureCompression: false, supportsBgraFormat: false, supportsR4G4Format: false, supportsFragmentShaderInterlock: HwCapabilities.SupportsFragmentShaderInterlock, diff --git a/Ryujinx.Graphics.Texture/BCnDecoder.cs b/Ryujinx.Graphics.Texture/BCnDecoder.cs index b8b04bac..b840cac8 100644 --- a/Ryujinx.Graphics.Texture/BCnDecoder.cs +++ b/Ryujinx.Graphics.Texture/BCnDecoder.cs @@ -1,7 +1,9 @@ using Ryujinx.Common; using System; -using System.Runtime.CompilerServices; +using System.Buffers.Binary; using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; namespace Ryujinx.Graphics.Texture { @@ -10,22 +12,30 @@ namespace Ryujinx.Graphics.Texture private const int BlockWidth = 4; private const int BlockHeight = 4; - public static byte[] DecodeBC4(ReadOnlySpan data, int width, int height, int depth, int levels, int layers, bool signed) + public static byte[] DecodeBC1(ReadOnlySpan data, int width, int height, int depth, int levels, int layers) { int size = 0; for (int l = 0; l < levels; l++) { - size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers; + size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4; } byte[] output = new byte[size]; - ReadOnlySpan data64 = MemoryMarshal.Cast(data); + Span tile = stackalloc byte[BlockWidth * BlockHeight * 4]; - Span rPal = stackalloc byte[8]; + Span tileAsUint = MemoryMarshal.Cast(tile); + Span outputAsUint = MemoryMarshal.Cast(output); - int baseOOffs = 0; + Span> tileAsVector128 = MemoryMarshal.Cast>(tile); + + Span> outputLine0 = default; + Span> outputLine1 = default; + Span> outputLine2 = default; + Span> outputLine3 = default; + + int imageBaseOOffs = 0; for (int l = 0; l < levels; l++) { @@ -39,11 +49,302 @@ namespace Ryujinx.Graphics.Texture for (int y = 0; y < h; y++) { int baseY = y * BlockHeight; + int copyHeight = Math.Min(BlockHeight, height - baseY); + int lineBaseOOffs = imageBaseOOffs + baseY * width; + + if (copyHeight == 4) + { + outputLine0 = MemoryMarshal.Cast>(outputAsUint.Slice(lineBaseOOffs)); + outputLine1 = MemoryMarshal.Cast>(outputAsUint.Slice(lineBaseOOffs + width)); + outputLine2 = MemoryMarshal.Cast>(outputAsUint.Slice(lineBaseOOffs + width * 2)); + outputLine3 = MemoryMarshal.Cast>(outputAsUint.Slice(lineBaseOOffs + width * 3)); + } for (int x = 0; x < w; x++) { int baseX = x * BlockWidth; - int lineBaseOOffs = baseOOffs + baseX; + int copyWidth = Math.Min(BlockWidth, width - baseX); + + BC1DecodeTileRgb(tile, data); + + if ((copyWidth | copyHeight) == 4) + { + outputLine0[x] = tileAsVector128[0]; + outputLine1[x] = tileAsVector128[1]; + outputLine2[x] = tileAsVector128[2]; + outputLine3[x] = tileAsVector128[3]; + } + else + { + int pixelBaseOOffs = lineBaseOOffs + baseX; + + for (int tY = 0; tY < copyHeight; tY++) + { + tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth)); + } + } + + data = data.Slice(8); + } + } + + imageBaseOOffs += width * height; + } + } + + width = Math.Max(1, width >> 1); + height = Math.Max(1, height >> 1); + depth = Math.Max(1, depth >> 1); + } + + return output; + } + + public static byte[] DecodeBC2(ReadOnlySpan data, int width, int height, int depth, int levels, int layers) + { + int size = 0; + + for (int l = 0; l < levels; l++) + { + size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4; + } + + byte[] output = new byte[size]; + + Span tile = stackalloc byte[BlockWidth * BlockHeight * 4]; + + Span tileAsUint = MemoryMarshal.Cast(tile); + Span outputAsUint = MemoryMarshal.Cast(output); + + Span> tileAsVector128 = MemoryMarshal.Cast>(tile); + + Span> outputLine0 = default; + Span> outputLine1 = default; + Span> outputLine2 = default; + Span> outputLine3 = default; + + int imageBaseOOffs = 0; + + for (int l = 0; l < levels; l++) + { + int w = BitUtils.DivRoundUp(width, BlockWidth); + int h = BitUtils.DivRoundUp(height, BlockHeight); + + for (int l2 = 0; l2 < layers; l2++) + { + for (int z = 0; z < depth; z++) + { + for (int y = 0; y < h; y++) + { + int baseY = y * BlockHeight; + int copyHeight = Math.Min(BlockHeight, height - baseY); + int lineBaseOOffs = imageBaseOOffs + baseY * width; + + if (copyHeight == 4) + { + outputLine0 = MemoryMarshal.Cast>(outputAsUint.Slice(lineBaseOOffs)); + outputLine1 = MemoryMarshal.Cast>(outputAsUint.Slice(lineBaseOOffs + width)); + outputLine2 = MemoryMarshal.Cast>(outputAsUint.Slice(lineBaseOOffs + width * 2)); + outputLine3 = MemoryMarshal.Cast>(outputAsUint.Slice(lineBaseOOffs + width * 3)); + } + + for (int x = 0; x < w; x++) + { + int baseX = x * BlockWidth; + int copyWidth = Math.Min(BlockWidth, width - baseX); + + BC23DecodeTileRgb(tile, data.Slice(8)); + + ulong block = BinaryPrimitives.ReadUInt64LittleEndian(data); + + for (int i = 3; i < BlockWidth * BlockHeight * 4; i += 4, block >>= 4) + { + tile[i] = (byte)((block & 0xf) | (block << 4)); + } + + if ((copyWidth | copyHeight) == 4) + { + outputLine0[x] = tileAsVector128[0]; + outputLine1[x] = tileAsVector128[1]; + outputLine2[x] = tileAsVector128[2]; + outputLine3[x] = tileAsVector128[3]; + } + else + { + int pixelBaseOOffs = lineBaseOOffs + baseX; + + for (int tY = 0; tY < copyHeight; tY++) + { + tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth)); + } + } + + data = data.Slice(16); + } + } + + imageBaseOOffs += width * height; + } + } + + width = Math.Max(1, width >> 1); + height = Math.Max(1, height >> 1); + depth = Math.Max(1, depth >> 1); + } + + return output; + } + + public static byte[] DecodeBC3(ReadOnlySpan data, int width, int height, int depth, int levels, int layers) + { + int size = 0; + + for (int l = 0; l < levels; l++) + { + size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4; + } + + byte[] output = new byte[size]; + + Span tile = stackalloc byte[BlockWidth * BlockHeight * 4]; + Span rPal = stackalloc byte[8]; + + Span tileAsUint = MemoryMarshal.Cast(tile); + Span outputAsUint = MemoryMarshal.Cast(output); + + Span> tileAsVector128 = MemoryMarshal.Cast>(tile); + + Span> outputLine0 = default; + Span> outputLine1 = default; + Span> outputLine2 = default; + Span> outputLine3 = default; + + int imageBaseOOffs = 0; + + for (int l = 0; l < levels; l++) + { + int w = BitUtils.DivRoundUp(width, BlockWidth); + int h = BitUtils.DivRoundUp(height, BlockHeight); + + for (int l2 = 0; l2 < layers; l2++) + { + for (int z = 0; z < depth; z++) + { + for (int y = 0; y < h; y++) + { + int baseY = y * BlockHeight; + int copyHeight = Math.Min(BlockHeight, height - baseY); + int lineBaseOOffs = imageBaseOOffs + baseY * width; + + if (copyHeight == 4) + { + outputLine0 = MemoryMarshal.Cast>(outputAsUint.Slice(lineBaseOOffs)); + outputLine1 = MemoryMarshal.Cast>(outputAsUint.Slice(lineBaseOOffs + width)); + outputLine2 = MemoryMarshal.Cast>(outputAsUint.Slice(lineBaseOOffs + width * 2)); + outputLine3 = MemoryMarshal.Cast>(outputAsUint.Slice(lineBaseOOffs + width * 3)); + } + + for (int x = 0; x < w; x++) + { + int baseX = x * BlockWidth; + int copyWidth = Math.Min(BlockWidth, width - baseX); + + BC23DecodeTileRgb(tile, data.Slice(8)); + + ulong block = BinaryPrimitives.ReadUInt64LittleEndian(data); + + rPal[0] = (byte)block; + rPal[1] = (byte)(block >> 8); + + BCnLerpAlphaUnorm(rPal); + BCnDecodeTileAlphaRgba(tile, rPal, block >> 16); + + if ((copyWidth | copyHeight) == 4) + { + outputLine0[x] = tileAsVector128[0]; + outputLine1[x] = tileAsVector128[1]; + outputLine2[x] = tileAsVector128[2]; + outputLine3[x] = tileAsVector128[3]; + } + else + { + int pixelBaseOOffs = lineBaseOOffs + baseX; + + for (int tY = 0; tY < copyHeight; tY++) + { + tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth)); + } + } + + data = data.Slice(16); + } + } + + imageBaseOOffs += width * height; + } + } + + width = Math.Max(1, width >> 1); + height = Math.Max(1, height >> 1); + depth = Math.Max(1, depth >> 1); + } + + return output; + } + + public static byte[] DecodeBC4(ReadOnlySpan data, int width, int height, int depth, int levels, int layers, bool signed) + { + int size = 0; + + for (int l = 0; l < levels; l++) + { + size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers; + } + + byte[] output = new byte[size]; + Span outputSpan = new Span(output); + + ReadOnlySpan data64 = MemoryMarshal.Cast(data); + + Span tile = stackalloc byte[BlockWidth * BlockHeight]; + Span rPal = stackalloc byte[8]; + + Span tileAsUint = MemoryMarshal.Cast(tile); + + Span outputLine0 = default; + Span outputLine1 = default; + Span outputLine2 = default; + Span outputLine3 = default; + + int imageBaseOOffs = 0; + + for (int l = 0; l < levels; l++) + { + int w = BitUtils.DivRoundUp(width, BlockWidth); + int h = BitUtils.DivRoundUp(height, BlockHeight); + + for (int l2 = 0; l2 < layers; l2++) + { + for (int z = 0; z < depth; z++) + { + for (int y = 0; y < h; y++) + { + int baseY = y * BlockHeight; + int copyHeight = Math.Min(BlockHeight, height - baseY); + int lineBaseOOffs = imageBaseOOffs + baseY * width; + + if (copyHeight == 4) + { + outputLine0 = MemoryMarshal.Cast(outputSpan.Slice(lineBaseOOffs)); + outputLine1 = MemoryMarshal.Cast(outputSpan.Slice(lineBaseOOffs + width)); + outputLine2 = MemoryMarshal.Cast(outputSpan.Slice(lineBaseOOffs + width * 2)); + outputLine3 = MemoryMarshal.Cast(outputSpan.Slice(lineBaseOOffs + width * 3)); + } + + for (int x = 0; x < w; x++) + { + int baseX = x * BlockWidth; + int copyWidth = Math.Min(BlockWidth, width - baseX); ulong block = data64[0]; @@ -52,45 +353,43 @@ namespace Ryujinx.Graphics.Texture if (signed) { - CalculateBC3AlphaS(rPal); + BCnLerpAlphaSnorm(rPal); } else { - CalculateBC3Alpha(rPal); + BCnLerpAlphaUnorm(rPal); } - ulong rI = block >> 16; + BCnDecodeTileAlpha(tile, rPal, block >> 16); - for (int texel = 0; texel < BlockWidth * BlockHeight; texel++) + if ((copyWidth | copyHeight) == 4) { - int tX = texel & 3; - int tY = texel >> 2; + outputLine0[x] = tileAsUint[0]; + outputLine1[x] = tileAsUint[1]; + outputLine2[x] = tileAsUint[2]; + outputLine3[x] = tileAsUint[3]; + } + else + { + int pixelBaseOOffs = lineBaseOOffs + baseX; - if (baseX + tX >= width || baseY + tY >= height) + for (int tY = 0; tY < copyHeight; tY++) { - continue; + tile.Slice(tY * 4, copyWidth).CopyTo(outputSpan.Slice(pixelBaseOOffs + width * tY, copyWidth)); } - - int shift = texel * 3; - - byte r = rPal[(int)((rI >> shift) & 7)]; - - int oOffs = lineBaseOOffs + tY * width + tX; - - output[oOffs] = r; } data64 = data64.Slice(1); } - - baseOOffs += width * (baseY + BlockHeight > height ? (height & (BlockHeight - 1)) : BlockHeight); } + + imageBaseOOffs += width * height; } } - width = Math.Max(1, width >> 1); + width = Math.Max(1, width >> 1); height = Math.Max(1, height >> 1); - depth = Math.Max(1, depth >> 1); + depth = Math.Max(1, depth >> 1); } return output; @@ -109,10 +408,22 @@ namespace Ryujinx.Graphics.Texture ReadOnlySpan data64 = MemoryMarshal.Cast(data); + Span rTile = stackalloc byte[BlockWidth * BlockHeight * 2]; + Span gTile = stackalloc byte[BlockWidth * BlockHeight * 2]; Span rPal = stackalloc byte[8]; Span gPal = stackalloc byte[8]; - int baseOOffs = 0; + Span outputAsUshort = MemoryMarshal.Cast(output); + + Span rTileAsUint = MemoryMarshal.Cast(rTile); + Span gTileAsUint = MemoryMarshal.Cast(gTile); + + Span outputLine0 = default; + Span outputLine1 = default; + Span outputLine2 = default; + Span outputLine3 = default; + + int imageBaseOOffs = 0; for (int l = 0; l < levels; l++) { @@ -126,11 +437,21 @@ namespace Ryujinx.Graphics.Texture for (int y = 0; y < h; y++) { int baseY = y * BlockHeight; + int copyHeight = Math.Min(BlockHeight, height - baseY); + int lineBaseOOffs = imageBaseOOffs + baseY * width; + + if (copyHeight == 4) + { + outputLine0 = MemoryMarshal.Cast(outputAsUshort.Slice(lineBaseOOffs)); + outputLine1 = MemoryMarshal.Cast(outputAsUshort.Slice(lineBaseOOffs + width)); + outputLine2 = MemoryMarshal.Cast(outputAsUshort.Slice(lineBaseOOffs + width * 2)); + outputLine3 = MemoryMarshal.Cast(outputAsUshort.Slice(lineBaseOOffs + width * 3)); + } for (int x = 0; x < w; x++) { int baseX = x * BlockWidth; - int lineBaseOOffs = baseOOffs + baseX; + int copyWidth = Math.Min(BlockWidth, width - baseX); ulong blockL = data64[0]; ulong blockH = data64[1]; @@ -142,101 +463,346 @@ namespace Ryujinx.Graphics.Texture if (signed) { - CalculateBC3AlphaS(rPal); - CalculateBC3AlphaS(gPal); + BCnLerpAlphaSnorm(rPal); + BCnLerpAlphaSnorm(gPal); } else { - CalculateBC3Alpha(rPal); - CalculateBC3Alpha(gPal); + BCnLerpAlphaUnorm(rPal); + BCnLerpAlphaUnorm(gPal); } - ulong rI = blockL >> 16; - ulong gI = blockH >> 16; + BCnDecodeTileAlpha(rTile, rPal, blockL >> 16); + BCnDecodeTileAlpha(gTile, gPal, blockH >> 16); - for (int texel = 0; texel < BlockWidth * BlockHeight; texel++) + if ((copyWidth | copyHeight) == 4) { - int tX = texel & 3; - int tY = texel >> 2; + outputLine0[x] = InterleaveBytes(rTileAsUint[0], gTileAsUint[0]); + outputLine1[x] = InterleaveBytes(rTileAsUint[1], gTileAsUint[1]); + outputLine2[x] = InterleaveBytes(rTileAsUint[2], gTileAsUint[2]); + outputLine3[x] = InterleaveBytes(rTileAsUint[3], gTileAsUint[3]); + } + else + { + int pixelBaseOOffs = lineBaseOOffs + baseX; - if (baseX + tX >= width || baseY + tY >= height) + for (int tY = 0; tY < copyHeight; tY++) { - continue; + int line = pixelBaseOOffs + width * tY; + + for (int tX = 0; tX < copyWidth; tX++) + { + int texel = tY * BlockWidth + tX; + + outputAsUshort[line + tX] = (ushort)(rTile[texel] | (gTile[texel] << 8)); + } } - - int shift = texel * 3; - - byte r = rPal[(int)((rI >> shift) & 7)]; - byte g = gPal[(int)((gI >> shift) & 7)]; - - int oOffs = (lineBaseOOffs + tY * width + tX) * 2; - - output[oOffs + 0] = r; - output[oOffs + 1] = g; } data64 = data64.Slice(2); } - - baseOOffs += width * (baseY + BlockHeight > height ? (height & (BlockHeight - 1)) : BlockHeight); } + + imageBaseOOffs += width * height; } } - width = Math.Max(1, width >> 1); + width = Math.Max(1, width >> 1); height = Math.Max(1, height >> 1); - depth = Math.Max(1, depth >> 1); + depth = Math.Max(1, depth >> 1); } return output; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void CalculateBC3Alpha(Span alpha) + private static ulong InterleaveBytes(uint left, uint right) { - for (int i = 2; i < 8; i++) + return InterleaveBytesWithZeros(left) | (InterleaveBytesWithZeros(right) << 8); + } + + private static ulong InterleaveBytesWithZeros(uint value) + { + ulong output = value; + output = (output ^ (output << 16)) & 0xffff0000ffffUL; + output = (output ^ (output << 8)) & 0xff00ff00ff00ffUL; + return output; + } + + private static void BCnLerpAlphaUnorm(Span alpha) + { + byte a0 = alpha[0]; + byte a1 = alpha[1]; + + if (a0 > a1) { - if (alpha[0] > alpha[1]) + alpha[2] = (byte)((6 * a0 + 1 * a1) / 7); + alpha[3] = (byte)((5 * a0 + 2 * a1) / 7); + alpha[4] = (byte)((4 * a0 + 3 * a1) / 7); + alpha[5] = (byte)((3 * a0 + 4 * a1) / 7); + alpha[6] = (byte)((2 * a0 + 5 * a1) / 7); + alpha[7] = (byte)((1 * a0 + 6 * a1) / 7); + } + else + { + alpha[2] = (byte)((4 * a0 + 1 * a1) / 5); + alpha[3] = (byte)((3 * a0 + 2 * a1) / 5); + alpha[4] = (byte)((2 * a0 + 3 * a1) / 5); + alpha[5] = (byte)((1 * a0 + 4 * a1) / 5); + alpha[6] = 0; + alpha[7] = 0xff; + } + } + + private static void BCnLerpAlphaSnorm(Span alpha) + { + sbyte a0 = (sbyte)alpha[0]; + sbyte a1 = (sbyte)alpha[1]; + + if (a0 > a1) + { + alpha[2] = (byte)((6 * a0 + 1 * a1) / 7); + alpha[3] = (byte)((5 * a0 + 2 * a1) / 7); + alpha[4] = (byte)((4 * a0 + 3 * a1) / 7); + alpha[5] = (byte)((3 * a0 + 4 * a1) / 7); + alpha[6] = (byte)((2 * a0 + 5 * a1) / 7); + alpha[7] = (byte)((1 * a0 + 6 * a1) / 7); + } + else + { + alpha[2] = (byte)((4 * a0 + 1 * a1) / 5); + alpha[3] = (byte)((3 * a0 + 2 * a1) / 5); + alpha[4] = (byte)((2 * a0 + 3 * a1) / 5); + alpha[5] = (byte)((1 * a0 + 4 * a1) / 5); + alpha[6] = 0x80; + alpha[7] = 0x7f; + } + } + + private unsafe static void BCnDecodeTileAlpha(Span output, Span rPal, ulong rI) + { + if (Avx2.IsSupported) + { + Span> outputAsVector128 = MemoryMarshal.Cast>(output); + + Vector128 shifts = Vector128.Create(0u, 3u, 6u, 9u); + Vector128 masks = Vector128.Create(7u); + + Vector128 vClut; + + fixed (byte* pRPal = rPal) { - alpha[i] = (byte)(((8 - i) * alpha[0] + (i - 1) * alpha[1]) / 7); + vClut = Sse2.LoadScalarVector128((ulong*)pRPal).AsByte(); } - else if (i < 6) + + Vector128 indices0 = Vector128.Create((uint)rI); + Vector128 indices1 = Vector128.Create((uint)(rI >> 24)); + Vector128 indices00 = Avx2.ShiftRightLogicalVariable(indices0, shifts); + Vector128 indices10 = Avx2.ShiftRightLogicalVariable(indices1, shifts); + Vector128 indices01 = Sse2.ShiftRightLogical(indices00, 12); + Vector128 indices11 = Sse2.ShiftRightLogical(indices10, 12); + indices00 = Sse2.And(indices00, masks); + indices10 = Sse2.And(indices10, masks); + indices01 = Sse2.And(indices01, masks); + indices11 = Sse2.And(indices11, masks); + + Vector128 indicesW0 = Sse41.PackUnsignedSaturate(indices00.AsInt32(), indices01.AsInt32()); + Vector128 indicesW1 = Sse41.PackUnsignedSaturate(indices10.AsInt32(), indices11.AsInt32()); + + Vector128 indices = Sse2.PackUnsignedSaturate(indicesW0.AsInt16(), indicesW1.AsInt16()); + + outputAsVector128[0] = Ssse3.Shuffle(vClut, indices); + } + else + { + for (int i = 0; i < BlockWidth * BlockHeight; i++, rI >>= 3) { - alpha[i] = (byte)(((6 - i) * alpha[0] + (i - 1) * alpha[1]) / 7); - } - else if (i == 6) - { - alpha[i] = 0; - } - else /* i == 7 */ - { - alpha[i] = 0xff; + output[i] = rPal[(int)(rI & 7)]; } } } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void CalculateBC3AlphaS(Span alpha) + private unsafe static void BCnDecodeTileAlphaRgba(Span output, Span rPal, ulong rI) { - for (int i = 2; i < 8; i++) + if (Avx2.IsSupported) { - if ((sbyte)alpha[0] > (sbyte)alpha[1]) + Span> outputAsVector256 = MemoryMarshal.Cast>(output); + + Vector256 shifts = Vector256.Create(0u, 3u, 6u, 9u, 12u, 15u, 18u, 21u); + + Vector128 vClut128; + + fixed (byte* pRPal = rPal) { - alpha[i] = (byte)(((8 - i) * (sbyte)alpha[0] + (i - 1) * (sbyte)alpha[1]) / 7); + vClut128 = Sse2.LoadScalarVector128((ulong*)pRPal).AsUInt32(); } - else if (i < 6) + + Vector256 vClut = Avx2.ConvertToVector256Int32(vClut128.AsByte()).AsUInt32(); + vClut = Avx2.ShiftLeftLogical(vClut, 24); + + Vector256 indices0 = Vector256.Create((uint)rI); + Vector256 indices1 = Vector256.Create((uint)(rI >> 24)); + + indices0 = Avx2.ShiftRightLogicalVariable(indices0, shifts); + indices1 = Avx2.ShiftRightLogicalVariable(indices1, shifts); + + outputAsVector256[0] = Avx2.Or(outputAsVector256[0], Avx2.PermuteVar8x32(vClut, indices0)); + outputAsVector256[1] = Avx2.Or(outputAsVector256[1], Avx2.PermuteVar8x32(vClut, indices1)); + } + else + { + for (int i = 3; i < BlockWidth * BlockHeight * 4; i += 4, rI >>= 3) { - alpha[i] = (byte)(((6 - i) * (sbyte)alpha[0] + (i - 1) * (sbyte)alpha[1]) / 7); - } - else if (i == 6) - { - alpha[i] = 0x80; - } - else /* i == 7 */ - { - alpha[i] = 0x7f; + output[i] = rPal[(int)(rI & 7)]; } } } + + private unsafe static void BC1DecodeTileRgb(Span output, ReadOnlySpan input) + { + Span clut = stackalloc uint[4]; + + uint c0c1 = BinaryPrimitives.ReadUInt32LittleEndian(input); + uint c0 = (ushort)c0c1; + uint c1 = (ushort)(c0c1 >> 16); + + clut[0] = ConvertRgb565ToRgb888(c0) | 0xff000000; + clut[1] = ConvertRgb565ToRgb888(c1) | 0xff000000; + clut[2] = BC1LerpRgb2(clut[0], clut[1], c0, c1); + clut[3] = BC1LerpRgb3(clut[0], clut[1], c0, c1); + + BCnDecodeTileRgb(clut, output, input); + } + + private unsafe static void BC23DecodeTileRgb(Span output, ReadOnlySpan input) + { + Span clut = stackalloc uint[4]; + + uint c0c1 = BinaryPrimitives.ReadUInt32LittleEndian(input); + uint c0 = (ushort)c0c1; + uint c1 = (ushort)(c0c1 >> 16); + + clut[0] = ConvertRgb565ToRgb888(c0); + clut[1] = ConvertRgb565ToRgb888(c1); + clut[2] = BC23LerpRgb2(clut[0], clut[1]); + clut[3] = BC23LerpRgb3(clut[0], clut[1]); + + BCnDecodeTileRgb(clut, output, input); + } + + private unsafe static void BCnDecodeTileRgb(Span clut, Span output, ReadOnlySpan input) + { + if (Avx2.IsSupported) + { + Span> outputAsVector256 = MemoryMarshal.Cast>(output); + + Vector256 shifts0 = Vector256.Create(0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u); + Vector256 shifts1 = Vector256.Create(16u, 18u, 20u, 22u, 24u, 26u, 28u, 30u); + Vector256 masks = Vector256.Create(3u); + + Vector256 vClut; + + fixed (uint* pClut = &clut[0]) + { + vClut = Sse2.LoadVector128(pClut).ToVector256Unsafe(); + } + + Vector256 indices0; + + fixed (byte* pInput = input) + { + indices0 = Avx2.BroadcastScalarToVector256((uint*)(pInput + 4)); + } + + Vector256 indices1 = indices0; + + indices0 = Avx2.ShiftRightLogicalVariable(indices0, shifts0); + indices1 = Avx2.ShiftRightLogicalVariable(indices1, shifts1); + indices0 = Avx2.And(indices0, masks); + indices1 = Avx2.And(indices1, masks); + + outputAsVector256[0] = Avx2.PermuteVar8x32(vClut, indices0); + outputAsVector256[1] = Avx2.PermuteVar8x32(vClut, indices1); + } + else + { + Span outputAsUint = MemoryMarshal.Cast(output); + + uint indices = BinaryPrimitives.ReadUInt32LittleEndian(input.Slice(4)); + + for (int i = 0; i < BlockWidth * BlockHeight; i++, indices >>= 2) + { + outputAsUint[i] = clut[(int)(indices & 3)]; + } + } + } + + private static uint BC1LerpRgb2(uint color0, uint color1, uint c0, uint c1) + { + if (c0 > c1) + { + return BC23LerpRgb2(color0, color1) | 0xff000000; + } + + uint carry = color0 & color1; + uint addHalve = ((color0 ^ color1) >> 1) & 0x7f7f7f; + return (addHalve + carry) | 0xff000000; + } + + private static uint BC23LerpRgb2(uint color0, uint color1) + { + uint r0 = (byte)color0; + uint g0 = color0 & 0xff00; + uint b0 = color0 & 0xff0000; + + uint r1 = (byte)color1; + uint g1 = color1 & 0xff00; + uint b1 = color1 & 0xff0000; + + uint mixR = (2 * r0 + r1) / 3; + uint mixG = (2 * g0 + g1) / 3; + uint mixB = (2 * b0 + b1) / 3; + + return mixR | (mixG & 0xff00) | (mixB & 0xff0000); + } + + private static uint BC1LerpRgb3(uint color0, uint color1, uint c0, uint c1) + { + if (c0 > c1) + { + return BC23LerpRgb3(color0, color1) | 0xff000000; + } + + return 0; + } + + private static uint BC23LerpRgb3(uint color0, uint color1) + { + uint r0 = (byte)color0; + uint g0 = color0 & 0xff00; + uint b0 = color0 & 0xff0000; + + uint r1 = (byte)color1; + uint g1 = color1 & 0xff00; + uint b1 = color1 & 0xff0000; + + uint mixR = (2 * r1 + r0) / 3; + uint mixG = (2 * g1 + g0) / 3; + uint mixB = (2 * b1 + b0) / 3; + + return mixR | (mixG & 0xff00) | (mixB & 0xff0000); + } + + private static uint ConvertRgb565ToRgb888(uint value) + { + uint b = (value & 0x1f) << 19; + uint g = (value << 5) & 0xfc00; + uint r = (value >> 8) & 0xf8; + + b |= b >> 5; + g |= g >> 6; + r |= r >> 5; + + return r | (g & 0xff00) | (b & 0xff0000); + } } } \ No newline at end of file