Ver Fonte

GPU: Eliminate CB0 accesses when storage buffer accesses are resolved (#3847)

* Eliminate CB0 accesses

Still some work to do, decouple from hle?

* Forgot the important part somehow

* Fix and improve alignment test

* Address Feedback

* Remove some complexity when checking storage buffer alignment

* Update Ryujinx.Graphics.Shader/Translation/Optimizations/GlobalToStorage.cs

Co-authored-by: gdkchan <gab.dark.100@gmail.com>

Co-authored-by: gdkchan <gab.dark.100@gmail.com>
riperiperi há 3 anos atrás
pai
commit
33a4d7d1ba

+ 5 - 0
Ryujinx.Graphics.Gpu/Constants.cs

@@ -95,5 +95,10 @@ namespace Ryujinx.Graphics.Gpu
         /// Byte alignment for block linear textures
         /// </summary>
         public const int GobAlignment = 64;
+
+        /// <summary>
+        /// Expected byte alignment for storage buffers
+        /// </summary>
+        public const int StorageAlignment = 16;
     }
 }

+ 29 - 16
Ryujinx.Graphics.Gpu/Engine/Compute/ComputeClass.cs

@@ -138,7 +138,8 @@ namespace Ryujinx.Graphics.Gpu.Engine.Compute
                 qmd.CtaThreadDimension1,
                 qmd.CtaThreadDimension2,
                 localMemorySize,
-                sharedMemorySize);
+                sharedMemorySize,
+                _channel.BufferManager.HasUnalignedStorageBuffers);
 
             CachedShaderProgram cs = memoryManager.Physical.ShaderCache.GetComputeShader(_channel, poolState, computeState, shaderGpuVa);
 
@@ -150,6 +151,33 @@ namespace Ryujinx.Graphics.Gpu.Engine.Compute
 
             ShaderProgramInfo info = cs.Shaders[0].Info;
 
+            bool hasUnaligned = _channel.BufferManager.HasUnalignedStorageBuffers;
+
+            for (int index = 0; index < info.SBuffers.Count; index++)
+            {
+                BufferDescriptor sb = info.SBuffers[index];
+
+                ulong sbDescAddress = _channel.BufferManager.GetComputeUniformBufferAddress(0);
+
+                int sbDescOffset = 0x310 + sb.Slot * 0x10;
+
+                sbDescAddress += (ulong)sbDescOffset;
+
+                SbDescriptor sbDescriptor = _channel.MemoryManager.Physical.Read<SbDescriptor>(sbDescAddress);
+
+                _channel.BufferManager.SetComputeStorageBuffer(sb.Slot, sbDescriptor.PackAddress(), (uint)sbDescriptor.Size, sb.Flags);
+            }
+
+            if ((_channel.BufferManager.HasUnalignedStorageBuffers) != hasUnaligned)
+            {
+                // Refetch the shader, as assumptions about storage buffer alignment have changed.
+                cs = memoryManager.Physical.ShaderCache.GetComputeShader(_channel, poolState, computeState, shaderGpuVa);
+
+                _context.Renderer.Pipeline.SetProgram(cs.HostProgram);
+
+                info = cs.Shaders[0].Info;
+            }
+
             for (int index = 0; index < info.CBuffers.Count; index++)
             {
                 BufferDescriptor cb = info.CBuffers[index];
@@ -174,21 +202,6 @@ namespace Ryujinx.Graphics.Gpu.Engine.Compute
                 _channel.BufferManager.SetComputeUniformBuffer(cb.Slot, cbDescriptor.PackAddress(), (uint)cbDescriptor.Size);
             }
 
-            for (int index = 0; index < info.SBuffers.Count; index++)
-            {
-                BufferDescriptor sb = info.SBuffers[index];
-
-                ulong sbDescAddress = _channel.BufferManager.GetComputeUniformBufferAddress(0);
-
-                int sbDescOffset = 0x310 + sb.Slot * 0x10;
-
-                sbDescAddress += (ulong)sbDescOffset;
-
-                SbDescriptor sbDescriptor = _channel.MemoryManager.Physical.Read<SbDescriptor>(sbDescAddress);
-
-                _channel.BufferManager.SetComputeStorageBuffer(sb.Slot, sbDescriptor.PackAddress(), (uint)sbDescriptor.Size, sb.Flags);
-            }
-
             _channel.BufferManager.SetComputeStorageBufferBindings(info.SBuffers);
             _channel.BufferManager.SetComputeUniformBufferBindings(info.CBuffers);
 

+ 6 - 2
Ryujinx.Graphics.Gpu/Engine/Threed/StateUpdater.cs

@@ -293,9 +293,12 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed
         /// </summary>
         private void CommitBindings()
         {
+            var buffers = _channel.BufferManager;
+            var hasUnaligned = buffers.HasUnalignedStorageBuffers;
+
             UpdateStorageBuffers();
 
-            if (!_channel.TextureManager.CommitGraphicsBindings(_shaderSpecState))
+            if (!_channel.TextureManager.CommitGraphicsBindings(_shaderSpecState) || (buffers.HasUnalignedStorageBuffers != hasUnaligned))
             {
                 // Shader must be reloaded.
                 UpdateShaderState();
@@ -1361,7 +1364,8 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed
                 _state.State.AlphaTestFunc,
                 _state.State.AlphaTestRef,
                 ref attributeTypes,
-                _drawState.HasConstantBufferDrawParameters);
+                _drawState.HasConstantBufferDrawParameters,
+                _channel.BufferManager.HasUnalignedStorageBuffers);
         }
 
         /// <summary>

+ 43 - 3
Ryujinx.Graphics.Gpu/Memory/BufferManager.cs

@@ -17,6 +17,9 @@ namespace Ryujinx.Graphics.Gpu.Memory
         private readonly GpuContext _context;
         private readonly GpuChannel _channel;
 
+        private int _unalignedStorageBuffers;
+        public bool HasUnalignedStorageBuffers => _unalignedStorageBuffers > 0;
+
         private IndexBuffer _indexBuffer;
         private readonly VertexBuffer[] _vertexBuffers;
         private readonly BufferBounds[] _transformFeedbackBuffers;
@@ -38,6 +41,11 @@ namespace Ryujinx.Graphics.Gpu.Memory
             /// </summary>
             public BufferBounds[] Buffers { get; }
 
+            /// <summary>
+            /// Flags indicating, for each binding, whether its storage buffer address is unaligned.
+            /// </summary>
+            public bool[] Unaligned { get; }
+
             /// <summary>
             /// Total amount of buffers used on the shader.
             /// </summary>
@@ -51,6 +59,7 @@ namespace Ryujinx.Graphics.Gpu.Memory
             {
                 Bindings = new BufferDescriptor[count];
                 Buffers = new BufferBounds[count];
+                Unaligned = new bool[count];
             }
 
             /// <summary>
@@ -202,6 +211,31 @@ namespace Ryujinx.Graphics.Gpu.Memory
             _transformFeedbackBuffersDirty = true;
         }
 
+        /// <summary>
+        /// Tracks whether the storage buffer at the given binding index starts at an unaligned address,
+        /// keeping the count of unaligned storage buffer bindings up to date.
+        /// Unaligned storage buffers disable some optimizations on the shader.
+        /// </summary>
+        /// <param name="buffers">The binding list to modify</param>
+        /// <param name="index">Index of the storage buffer</param>
+        /// <param name="gpuVa">Start GPU virtual address of the buffer</param>
+        private void RecordStorageAlignment(BuffersPerStage buffers, int index, ulong gpuVa)
+        {
+            bool isUnaligned = (gpuVa & (Constants.StorageAlignment - 1)) != 0;
+
+            // Nothing to track while no binding is counted as unaligned and this one is aligned too.
+            if (!isUnaligned && !HasUnalignedStorageBuffers)
+            {
+                return;
+            }
+
+            bool wasUnaligned = buffers.Unaligned[index];
+
+            if (wasUnaligned == isUnaligned)
+            {
+                return;
+            }
+
+            // The alignment of this binding changed: store the new state and adjust the count.
+            buffers.Unaligned[index] = isUnaligned;
+
+            if (isUnaligned)
+            {
+                _unalignedStorageBuffers++;
+            }
+            else
+            {
+                _unalignedStorageBuffers--;
+            }
+        }
+
         /// <summary>
         /// Sets a storage buffer on the compute pipeline.
         /// Storage buffers can be read and written to on shaders.
@@ -214,6 +248,8 @@ namespace Ryujinx.Graphics.Gpu.Memory
         {
             size += gpuVa & ((ulong)_context.Capabilities.StorageBufferOffsetAlignment - 1);
 
+            RecordStorageAlignment(_cpStorageBuffers, index, gpuVa);
+
             gpuVa = BitUtils.AlignDown(gpuVa, _context.Capabilities.StorageBufferOffsetAlignment);
 
             ulong address = _channel.MemoryManager.Physical.BufferCache.TranslateAndCreateBuffer(_channel.MemoryManager, gpuVa, size);
@@ -234,17 +270,21 @@ namespace Ryujinx.Graphics.Gpu.Memory
         {
             size += gpuVa & ((ulong)_context.Capabilities.StorageBufferOffsetAlignment - 1);
 
+            BuffersPerStage buffers = _gpStorageBuffers[stage];
+
+            RecordStorageAlignment(buffers, index, gpuVa);
+
             gpuVa = BitUtils.AlignDown(gpuVa, _context.Capabilities.StorageBufferOffsetAlignment);
 
             ulong address = _channel.MemoryManager.Physical.BufferCache.TranslateAndCreateBuffer(_channel.MemoryManager, gpuVa, size);
 
-            if (_gpStorageBuffers[stage].Buffers[index].Address != address ||
-                _gpStorageBuffers[stage].Buffers[index].Size != size)
+            if (buffers.Buffers[index].Address != address ||
+                buffers.Buffers[index].Size != size)
             {
                 _gpStorageBuffersDirty = true;
             }
 
-            _gpStorageBuffers[stage].SetBounds(index, address, size, flags);
+            buffers.SetBounds(index, address, size, flags);
         }
 
         /// <summary>

+ 3 - 1
Ryujinx.Graphics.Gpu/Shader/ComputeShaderCacheHashTable.cs

@@ -36,6 +36,7 @@ namespace Ryujinx.Graphics.Gpu.Shader
         /// </summary>
         /// <param name="channel">GPU channel</param>
         /// <param name="poolState">Texture pool state</param>
+        /// <param name="computeState">Compute state</param>
         /// <param name="gpuVa">GPU virtual address of the compute shader</param>
         /// <param name="program">Cached host program for the given state, if found</param>
         /// <param name="cachedGuestCode">Cached guest code, if any found</param>
@@ -43,6 +44,7 @@ namespace Ryujinx.Graphics.Gpu.Shader
         public bool TryFind(
             GpuChannel channel,
             GpuChannelPoolState poolState,
+            GpuChannelComputeState computeState,
             ulong gpuVa,
             out CachedShaderProgram program,
             out byte[] cachedGuestCode)
@@ -50,7 +52,7 @@ namespace Ryujinx.Graphics.Gpu.Shader
             program = null;
             ShaderCodeAccessor codeAccessor = new ShaderCodeAccessor(channel.MemoryManager, gpuVa);
             bool hasSpecList = _cache.TryFindItem(codeAccessor, out var specList, out cachedGuestCode);
-            return hasSpecList && specList.TryFindForCompute(channel, poolState, out program);
+            return hasSpecList && specList.TryFindForCompute(channel, poolState, computeState, out program);
         }
 
         /// <summary>

+ 6 - 0
Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheGpuAccessor.cs

@@ -225,6 +225,12 @@ namespace Ryujinx.Graphics.Gpu.Shader.DiskCache
             return _oldSpecState.GraphicsState.EarlyZForce;
         }
 
+        /// <inheritdoc/>
+        public bool QueryHasUnalignedStorageBuffer()
+        {
+            // Either the graphics or the compute state flagging an unaligned storage buffer is enough.
+            if (_oldSpecState.GraphicsState.HasUnalignedStorageBuffer)
+            {
+                return true;
+            }
+
+            return _oldSpecState.ComputeState.HasUnalignedStorageBuffer;
+        }
+
         /// <inheritdoc/>
         public bool QueryViewportTransformDisable()
         {

+ 1 - 1
Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs

@@ -22,7 +22,7 @@ namespace Ryujinx.Graphics.Gpu.Shader.DiskCache
         private const ushort FileFormatVersionMajor = 1;
         private const ushort FileFormatVersionMinor = 2;
         private const uint FileFormatVersionPacked = ((uint)FileFormatVersionMajor << 16) | FileFormatVersionMinor;
-        private const uint CodeGenVersion = 3747;
+        private const uint CodeGenVersion = 3848;
 
         private const string SharedTocFileName = "shared.toc";
         private const string SharedDataFileName = "shared.data";

+ 6 - 0
Ryujinx.Graphics.Gpu/Shader/GpuAccessor.cs

@@ -145,6 +145,12 @@ namespace Ryujinx.Graphics.Gpu.Shader
             return _state.GraphicsState.HasConstantBufferDrawParameters;
         }
 
+        /// <inheritdoc/>
+        public bool QueryHasUnalignedStorageBuffer()
+        {
+            // Either the graphics or the compute state flagging an unaligned storage buffer is enough.
+            if (_state.GraphicsState.HasUnalignedStorageBuffer)
+            {
+                return true;
+            }
+
+            return _state.ComputeState.HasUnalignedStorageBuffer;
+        }
+
         /// <inheritdoc/>
         public InputTopology QueryPrimitiveTopology()
         {

+ 9 - 1
Ryujinx.Graphics.Gpu/Shader/GpuChannelComputeState.cs

@@ -32,6 +32,11 @@ namespace Ryujinx.Graphics.Gpu.Shader
         /// </summary>
         public readonly int SharedMemorySize;
 
+        /// <summary>
+        /// Indicates that any storage buffer use is unaligned.
+        /// </summary>
+        public readonly bool HasUnalignedStorageBuffer;
+
         /// <summary>
         /// Creates a new GPU compute state.
         /// </summary>
@@ -40,18 +45,21 @@ namespace Ryujinx.Graphics.Gpu.Shader
         /// <param name="localSizeZ">Local group size Z of the compute shader</param>
         /// <param name="localMemorySize">Local memory size of the compute shader</param>
         /// <param name="sharedMemorySize">Shared memory size of the compute shader</param>
+        /// <param name="hasUnalignedStorageBuffer">Indicates that any storage buffer use is unaligned</param>
         public GpuChannelComputeState(
             int localSizeX,
             int localSizeY,
             int localSizeZ,
             int localMemorySize,
-            int sharedMemorySize)
+            int sharedMemorySize,
+            bool hasUnalignedStorageBuffer)
         {
             LocalSizeX = localSizeX;
             LocalSizeY = localSizeY;
             LocalSizeZ = localSizeZ;
             LocalMemorySize = localMemorySize;
             SharedMemorySize = sharedMemorySize;
+            HasUnalignedStorageBuffer = hasUnalignedStorageBuffer;
         }
     }
 }

+ 9 - 1
Ryujinx.Graphics.Gpu/Shader/GpuChannelGraphicsState.cs

@@ -82,6 +82,11 @@ namespace Ryujinx.Graphics.Gpu.Shader
         /// </summary>
         public readonly bool HasConstantBufferDrawParameters;
 
+        /// <summary>
+        /// Indicates that any storage buffer use is unaligned.
+        /// </summary>
+        public readonly bool HasUnalignedStorageBuffer;
+
         /// <summary>
         /// Creates a new GPU graphics state.
         /// </summary>
@@ -99,6 +104,7 @@ namespace Ryujinx.Graphics.Gpu.Shader
         /// <param name="alphaTestReference">When alpha test is enabled, indicates the value to compare with the fragment output alpha</param>
         /// <param name="attributeTypes">Type of the vertex attributes consumed by the shader</param>
         /// <param name="hasConstantBufferDrawParameters">Indicates that the draw is writing the base vertex, base instance and draw index to Constant Buffer 0</param>
+        /// <param name="hasUnalignedStorageBuffer">Indicates that any storage buffer use is unaligned</param>
         public GpuChannelGraphicsState(
             bool earlyZForce,
             PrimitiveTopology topology,
@@ -113,7 +119,8 @@ namespace Ryujinx.Graphics.Gpu.Shader
             CompareOp alphaTestCompare,
             float alphaTestReference,
             ref Array32<AttributeType> attributeTypes,
-            bool hasConstantBufferDrawParameters)
+            bool hasConstantBufferDrawParameters,
+            bool hasUnalignedStorageBuffer)
         {
             EarlyZForce = earlyZForce;
             Topology = topology;
@@ -129,6 +136,7 @@ namespace Ryujinx.Graphics.Gpu.Shader
             AlphaTestReference = alphaTestReference;
             AttributeTypes = attributeTypes;
             HasConstantBufferDrawParameters = hasConstantBufferDrawParameters;
+            HasUnalignedStorageBuffer = hasUnalignedStorageBuffer;
         }
     }
 }

+ 5 - 3
Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs

@@ -203,12 +203,12 @@ namespace Ryujinx.Graphics.Gpu.Shader
             GpuChannelComputeState computeState,
             ulong gpuVa)
         {
-            if (_cpPrograms.TryGetValue(gpuVa, out var cpShader) && IsShaderEqual(channel, poolState, cpShader, gpuVa))
+            if (_cpPrograms.TryGetValue(gpuVa, out var cpShader) && IsShaderEqual(channel, poolState, computeState, cpShader, gpuVa))
             {
                 return cpShader;
             }
 
-            if (_computeShaderCache.TryFind(channel, poolState, gpuVa, out cpShader, out byte[] cachedGuestCode))
+            if (_computeShaderCache.TryFind(channel, poolState, computeState, gpuVa, out cpShader, out byte[] cachedGuestCode))
             {
                 _cpPrograms[gpuVa] = cpShader;
                 return cpShader;
@@ -473,18 +473,20 @@ namespace Ryujinx.Graphics.Gpu.Shader
         /// </summary>
         /// <param name="channel">GPU channel using the shader</param>
         /// <param name="poolState">GPU channel state to verify shader compatibility</param>
+        /// <param name="computeState">GPU channel compute state to verify shader compatibility</param>
         /// <param name="cpShader">Cached compute shader</param>
         /// <param name="gpuVa">GPU virtual address of the shader code in memory</param>
         /// <returns>True if the cached shader code is equal to the code in memory and its specialization state matches the current state, false otherwise</returns>
         private static bool IsShaderEqual(
             GpuChannel channel,
             GpuChannelPoolState poolState,
+            GpuChannelComputeState computeState,
             CachedShaderProgram cpShader,
             ulong gpuVa)
         {
             if (IsShaderEqual(channel.MemoryManager, cpShader.Shaders[0], gpuVa))
             {
-                return cpShader.SpecializationState.MatchesCompute(channel, poolState, true);
+                return cpShader.SpecializationState.MatchesCompute(channel, poolState, computeState, true);
             }
 
             return false;

+ 3 - 2
Ryujinx.Graphics.Gpu/Shader/ShaderSpecializationList.cs

@@ -53,13 +53,14 @@ namespace Ryujinx.Graphics.Gpu.Shader
         /// </summary>
         /// <param name="channel">GPU channel</param>
         /// <param name="poolState">Texture pool state</param>
+        /// <param name="computeState">Compute state</param>
         /// <param name="program">Cached program, if found</param>
         /// <returns>True if a compatible program is found, false otherwise</returns>
-        public bool TryFindForCompute(GpuChannel channel, GpuChannelPoolState poolState, out CachedShaderProgram program)
+        public bool TryFindForCompute(GpuChannel channel, GpuChannelPoolState poolState, GpuChannelComputeState computeState, out CachedShaderProgram program)
         {
             foreach (var entry in _entries)
             {
-                if (entry.SpecializationState.MatchesCompute(channel, poolState, true))
+                if (entry.SpecializationState.MatchesCompute(channel, poolState, computeState, true))
                 {
                     program = entry;
                     return true;

+ 12 - 1
Ryujinx.Graphics.Gpu/Shader/ShaderSpecializationState.cs

@@ -531,6 +531,11 @@ namespace Ryujinx.Graphics.Gpu.Shader
                 return false;
             }
 
+            if (graphicsState.HasUnalignedStorageBuffer != GraphicsState.HasUnalignedStorageBuffer)
+            {
+                return false;
+            }
+
             return Matches(channel, poolState, checkTextures, isCompute: false);
         }
 
@@ -539,10 +544,16 @@ namespace Ryujinx.Graphics.Gpu.Shader
         /// </summary>
         /// <param name="channel">GPU channel</param>
         /// <param name="poolState">Texture pool state</param>
+        /// <param name="computeState">Compute state</param>
         /// <param name="checkTextures">Indicates whether texture descriptors should be checked</param>
         /// <returns>True if the state matches, false otherwise</returns>
-        public bool MatchesCompute(GpuChannel channel, GpuChannelPoolState poolState, bool checkTextures)
+        public bool MatchesCompute(GpuChannel channel, GpuChannelPoolState poolState, GpuChannelComputeState computeState, bool checkTextures)
         {
+            if (computeState.HasUnalignedStorageBuffer != ComputeState.HasUnalignedStorageBuffer)
+            {
+                return false;
+            }
+
             return Matches(channel, poolState, checkTextures, isCompute: true);
         }
 

+ 2 - 0
Ryujinx.Graphics.Shader/Constants.cs

@@ -10,5 +10,7 @@ namespace Ryujinx.Graphics.Shader
         public const int NvnBaseVertexByteOffset = 0x640;
         public const int NvnBaseInstanceByteOffset = 0x644;
         public const int NvnDrawIndexByteOffset = 0x648;
+
+        public const int StorageAlignment = 16;
     }
 }

+ 9 - 0
Ryujinx.Graphics.Shader/IGpuAccessor.cs

@@ -177,6 +177,15 @@ namespace Ryujinx.Graphics.Shader
             return false;
         }
 
+        /// <summary>
+        /// Queries whether the current draw uses unaligned storage buffer addresses.
+        /// </summary>
+        /// <returns>True if any storage buffer address is not aligned to 16 bytes, false otherwise</returns>
+        bool QueryHasUnalignedStorageBuffer()
+        {
+            // Default for accessors that do not override this query: report every storage buffer as aligned.
+            return false;
+        }
+
         /// <summary>
         /// Queries host about the presence of the FrontFacing built-in variable bug.
         /// </summary>

+ 169 - 37
Ryujinx.Graphics.Shader/Translation/Optimizations/GlobalToStorage.cs

@@ -34,7 +34,7 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations
                         // we can guess which storage buffer it is accessing.
                         // We can then replace the global memory access with a storage
                         // buffer access.
-                        node = ReplaceGlobalWithStorage(node, config, storageIndex);
+                        node = ReplaceGlobalWithStorage(block, node, config, storageIndex);
                     }
                     else if (config.Stage == ShaderStage.Compute && operation.Inst == Instruction.LoadGlobal)
                     {
@@ -54,7 +54,7 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations
             }
         }
 
-        private static LinkedListNode<INode> ReplaceGlobalWithStorage(LinkedListNode<INode> node, ShaderConfig config, int storageIndex)
+        private static LinkedListNode<INode> ReplaceGlobalWithStorage(BasicBlock block, LinkedListNode<INode> node, ShaderConfig config, int storageIndex)
         {
             Operation operation = (Operation)node.Value;
 
@@ -64,42 +64,10 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations
 
             config.SetUsedStorageBuffer(storageIndex, isWrite);
 
-            Operand GetStorageOffset()
-            {
-                Operand addrLow = operation.GetSource(0);
-
-                Operand baseAddrLow = Cbuf(0, GetStorageCbOffset(config.Stage, storageIndex));
-
-                Operand baseAddrTrunc = Local();
-
-                Operand alignMask = Const(-config.GpuAccessor.QueryHostStorageBufferOffsetAlignment());
-
-                Operation andOp = new Operation(Instruction.BitwiseAnd, baseAddrTrunc, baseAddrLow, alignMask);
-
-                node.List.AddBefore(node, andOp);
-
-                Operand byteOffset = Local();
-                Operation subOp = new Operation(Instruction.Subtract, byteOffset, addrLow, baseAddrTrunc);
-
-                node.List.AddBefore(node, subOp);
-
-                if (isStg16Or8)
-                {
-                    return byteOffset;
-                }
-
-                Operand wordOffset = Local();
-                Operation shrOp = new Operation(Instruction.ShiftRightU32, wordOffset, byteOffset, Const(2));
-
-                node.List.AddBefore(node, shrOp);
-
-                return wordOffset;
-            }
-
             Operand[] sources = new Operand[operation.SourcesCount];
 
             sources[0] = Const(storageIndex);
-            sources[1] = GetStorageOffset();
+            sources[1] = GetStorageOffset(block, node, config, storageIndex, operation.GetSource(0), isStg16Or8);
 
             for (int index = 2; index < operation.SourcesCount; index++)
             {
@@ -144,6 +112,170 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations
             return node;
         }
 
+        /// <summary>
+        /// Emits, before the given node, the code that calculates the offset used to access
+        /// a storage buffer in place of a global memory address.
+        /// </summary>
+        /// <remarks>
+        /// When storage buffers are known to be aligned and the address is recognized as the storage buffer
+        /// base address (read from constant buffer 0) plus an offset, that offset is used directly and the
+        /// constant buffer access is not needed. Otherwise, the offset is calculated by subtracting the
+        /// base address, aligned down to the host storage buffer offset alignment, from the address.
+        /// </remarks>
+        /// <param name="block">Basic block used to resolve the operations that produce the address</param>
+        /// <param name="node">Node of the global memory operation; new operations are inserted before it</param>
+        /// <param name="config">Shader configuration</param>
+        /// <param name="storageIndex">Index of the storage buffer being accessed</param>
+        /// <param name="addrLow">Address operand of the global memory operation</param>
+        /// <param name="isStg16Or8">When true, the byte offset is returned rather than the word offset</param>
+        /// <returns>The byte offset if <paramref name="isStg16Or8"/> is true, otherwise the byte offset shifted right by 2</returns>
+        private static Operand GetStorageOffset(
+            BasicBlock block,
+            LinkedListNode<INode> node,
+            ShaderConfig config,
+            int storageIndex,
+            Operand addrLow,
+            bool isStg16Or8)
+        {
+            int baseAddressCbOffset = GetStorageCbOffset(config.Stage, storageIndex);
+
+            bool storageAligned = !(config.GpuAccessor.QueryHasUnalignedStorageBuffer() || config.GpuAccessor.QueryHostStorageBufferOffsetAlignment() > Constants.StorageAlignment);
+
+            (Operand byteOffset, int constantOffset) = storageAligned ?
+                GetStorageOffset(block, Utils.FindLastOperation(addrLow, block), baseAddressCbOffset) :
+                (null, 0);
+
+            if (byteOffset != null)
+            {
+                // The base address was matched. Patch the address alignment calculation first, while
+                // byteOffset still excludes the constant part: ReplaceAddressAlignment adds constantOffset
+                // itself, so doing this after the addition below would apply the constant twice.
+                ReplaceAddressAlignment(node.List, addrLow, byteOffset, constantOffset);
+
+                if (constantOffset != 0)
+                {
+                    Operand offset = Local();
+                    Operation addOp = new Operation(Instruction.Add, offset, byteOffset, Const(constantOffset));
+
+                    node.List.AddBefore(node, addOp);
+
+                    byteOffset = offset;
+                }
+            }
+            else
+            {
+                // Fallback: offset = address - (base address aligned down to the host offset alignment).
+                // The alignment calculation keeps using the original address in this case.
+                Operand baseAddrLow = Cbuf(0, baseAddressCbOffset);
+                Operand baseAddrTrunc = Local();
+
+                Operand alignMask = Const(-config.GpuAccessor.QueryHostStorageBufferOffsetAlignment());
+
+                Operation andOp = new Operation(Instruction.BitwiseAnd, baseAddrTrunc, baseAddrLow, alignMask);
+
+                node.List.AddBefore(node, andOp);
+
+                Operand offset = Local();
+                Operation subOp = new Operation(Instruction.Subtract, offset, addrLow, baseAddrTrunc);
+
+                node.List.AddBefore(node, subOp);
+
+                byteOffset = offset;
+            }
+
+            if (isStg16Or8)
+            {
+                return byteOffset;
+            }
+
+            Operand wordOffset = Local();
+            Operation shrOp = new Operation(Instruction.ShiftRightU32, wordOffset, byteOffset, Const(2));
+
+            node.List.AddBefore(node, shrOp);
+
+            return wordOffset;
+        }
+
+        /// <summary>
+        /// Checks if an operand is a constant buffer access on slot 0 at the given offset.
+        /// </summary>
+        /// <param name="operand">Operand to check</param>
+        /// <param name="offset">Expected constant buffer offset</param>
+        /// <returns>True if the operand reads constant buffer 0 at <paramref name="offset"/>, false otherwise</returns>
+        private static bool IsCb0Offset(Operand operand, int offset)
+        {
+            if (operand.Type != OperandType.ConstantBuffer)
+            {
+                return false;
+            }
+
+            return operand.GetCbufSlot() == 0 && operand.GetCbufOffset() == offset;
+        }
+
+        /// <summary>
+        /// Rewrites "address &amp; 3" operations so that they use the storage buffer byte offset
+        /// plus the constant offset instead of the address.
+        /// </summary>
+        /// <param name="list">List of nodes where the replacement operations can be inserted</param>
+        /// <param name="address">Address operand whose uses are inspected</param>
+        /// <param name="byteOffset">Byte offset into the storage buffer, without the constant offset</param>
+        /// <param name="constantOffset">Constant offset added to <paramref name="byteOffset"/></param>
+        private static void ReplaceAddressAlignment(LinkedList<INode> list, Operand address, Operand byteOffset, int constantOffset)
+        {
+            // When we emit 16/8-bit LDG, we add extra code to determine the address alignment.
+            // Eliminate the storage buffer base address from this too, leaving only the byte offset.
+
+            static bool IsAlignmentMask(Operand candidate)
+            {
+                return candidate.Type == OperandType.Constant && candidate.Value == 3;
+            }
+
+            foreach (INode useNode in address.UseOps)
+            {
+                if (useNode is not Operation andOp || andOp.Inst != Instruction.BitwiseAnd)
+                {
+                    continue;
+                }
+
+                Operand lhs = andOp.GetSource(0);
+                Operand rhs = andOp.GetSource(1);
+
+                int addressIndex;
+
+                if (lhs == address && IsAlignmentMask(rhs))
+                {
+                    addressIndex = 0;
+                }
+                else if (rhs == address && IsAlignmentMask(lhs))
+                {
+                    addressIndex = 1;
+                }
+                else
+                {
+                    continue;
+                }
+
+                // The offset calculation is added right before the use, so the use must be on this list.
+                LinkedListNode<INode> useListNode = list.Find(andOp);
+
+                if (useListNode == null)
+                {
+                    continue;
+                }
+
+                Operand replacement = Local();
+
+                list.AddBefore(useListNode, new Operation(Instruction.Add, replacement, byteOffset, Const(constantOffset)));
+
+                andOp.SetSource(addressIndex, replacement);
+            }
+        }
+
+        /// <summary>
+        /// Tries to split an address into the storage buffer base address, read from constant buffer 0,
+        /// plus a byte offset and an optional constant offset.
+        /// </summary>
+        /// <param name="block">Basic block used to resolve the operations that produce the address</param>
+        /// <param name="address">Address operand to analyze</param>
+        /// <param name="baseAddressCbOffset">Constant buffer 0 offset of the storage buffer base address</param>
+        /// <returns>The byte offset operand and constant offset, or (null, 0) if the address did not match</returns>
+        private static (Operand, int) GetStorageOffset(BasicBlock block, Operand address, int baseAddressCbOffset)
+        {
+            // The address is the base address itself, so both offsets are zero.
+            if (IsCb0Offset(address, baseAddressCbOffset))
+            {
+                return (Const(0), 0);
+            }
+
+            // Strip a trailing "+ constant" first, then resolve what remains.
+            (Operand stripped, int constantOffset) = GetStorageConstantOffset(block, address);
+
+            Operand resolved = Utils.FindLastOperation(stripped, block);
+
+            // Base address plus a constant: there is no variable offset.
+            if (IsCb0Offset(resolved, baseAddressCbOffset))
+            {
+                return (Const(0), constantOffset);
+            }
+
+            if (resolved.AsgOp is not Operation baseAdd || baseAdd.Inst != Instruction.Add)
+            {
+                return (null, 0);
+            }
+
+            Operand first = baseAdd.GetSource(0);
+            Operand second = Utils.FindLastOperation(baseAdd.GetSource(1), block);
+
+            // Whichever side is the base address, the other side is the byte offset.
+            if (IsCb0Offset(second, baseAddressCbOffset))
+            {
+                return (first, constantOffset);
+            }
+
+            if (IsCb0Offset(first, baseAddressCbOffset))
+            {
+                return (second, constantOffset);
+            }
+
+            return (null, 0);
+        }
+
+        /// <summary>
+        /// Splits an address of the form "value + constant" into its two parts.
+        /// </summary>
+        /// <param name="block">Basic block of the address (not used by this method)</param>
+        /// <param name="address">Address operand to analyze</param>
+        /// <returns>
+        /// The first source of the addition and the constant value if the address is assigned by an addition
+        /// whose second source is a constant, otherwise the unmodified address and 0
+        /// </returns>
+        private static (Operand, int) GetStorageConstantOffset(BasicBlock block, Operand address)
+        {
+            if (address.AsgOp is Operation constantAdd && constantAdd.Inst == Instruction.Add)
+            {
+                Operand value = constantAdd.GetSource(0);
+                Operand constant = constantAdd.GetSource(1);
+
+                if (constant.Type == OperandType.Constant)
+                {
+                    return (value, constant.Value);
+                }
+            }
+
+            return (address, 0);
+        }
+
         private static LinkedListNode<INode> ReplaceLdgWithLdc(LinkedListNode<INode> node, ShaderConfig config, int storageIndex)
         {
             Operation operation = (Operation)node.Value;
@@ -165,7 +297,7 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations
                 Operand byteOffset = Local();
                 Operand wordOffset = Local();
 
-                Operation subOp = new Operation(Instruction.Subtract,      byteOffset, addrLow, baseAddrTrunc);
+                Operation subOp = new Operation(Instruction.Subtract, byteOffset, addrLow, baseAddrTrunc);
                 Operation shrOp = new Operation(Instruction.ShiftRightU32, wordOffset, byteOffset, Const(2));
 
                 node.List.AddBefore(node, subOp);
@@ -260,7 +392,7 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations
         {
             if (operand.Type == OperandType.ConstantBuffer)
             {
-                int slot   = operand.GetCbufSlot();
+                int slot = operand.GetCbufSlot();
                 int offset = operand.GetCbufOffset();
 
                 if (slot == 0 && offset >= sbStart && offset < sbEnd)